summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/H5.c 19
-rw-r--r-- src/H5AC.c 73
-rw-r--r-- src/H5C.c 218
-rw-r--r-- src/H5CX.c 2
-rw-r--r-- src/H5Cepoch.c 31
-rw-r--r-- src/H5Cpkg.h 226
-rw-r--r-- src/H5Cprivate.h 10
-rw-r--r-- src/H5Dchunk.c 395
-rw-r--r-- src/H5Dint.c 55
-rw-r--r-- src/H5Dio.c 128
-rw-r--r-- src/H5Dlayout.c 2
-rw-r--r-- src/H5Dmpio.c 5356
-rw-r--r-- src/H5Dpkg.h 17
-rw-r--r-- src/H5Dselect.c 187
-rw-r--r-- src/H5ES.c 2
-rw-r--r-- src/H5FD.c 5
-rw-r--r-- src/H5FDmirror_priv.h 4
-rw-r--r-- src/H5FDmpio.c 94
-rw-r--r-- src/H5Fmpi.c 196
-rw-r--r-- src/H5Fprivate.h 3
-rw-r--r-- src/H5Ocache.c 8
-rw-r--r-- src/H5Ocopy.c 3
-rw-r--r-- src/H5Ocopy_ref.c 82
-rw-r--r-- src/H5Odtype.c 38
-rw-r--r-- src/H5Oint.c 12
-rw-r--r-- src/H5Opkg.h 2
-rw-r--r-- src/H5PLpath.c 4
-rw-r--r-- src/H5Pfapl.c 24
-rw-r--r-- src/H5Tcommit.c 2
-rw-r--r-- src/H5Tnative.c 6
-rw-r--r-- src/H5VLcallback.c 2
-rw-r--r-- src/H5VLnative.h 6
-rw-r--r-- src/H5Z.c 14
-rw-r--r-- src/H5Znbit.c 29
-rw-r--r-- src/H5Zscaleoffset.c 6
-rw-r--r-- src/H5mpi.c 233
-rw-r--r-- src/H5private.h 33
-rw-r--r-- src/H5public.h 11
-rw-r--r-- src/H5system.c 2
-rw-r--r-- src/H5timer.c 15
-rw-r--r-- src/H5trace.c 2
41 files changed, 5511 insertions, 2046 deletions
diff --git a/src/H5.c b/src/H5.c
index d4fca9a..1585dd3 100644
--- a/src/H5.c
+++ b/src/H5.c
@@ -70,9 +70,9 @@ static int H5__mpi_delete_cb(MPI_Comm comm, int keyval, void *attr_val, int *fla
/* Library Private Variables */
/*****************************/
-/* Library incompatible release versions */
-const unsigned VERS_RELEASE_EXCEPTIONS[] = {0};
-const unsigned VERS_RELEASE_EXCEPTIONS_SIZE = 0;
+/* Library incompatible release versions, develop releases are incompatible by design */
+const unsigned VERS_RELEASE_EXCEPTIONS[] = {0, 1, 2};
+const unsigned VERS_RELEASE_EXCEPTIONS_SIZE = 3;
/* statically initialize block for pthread_once call used in initializing */
/* the first global mutex */
@@ -954,6 +954,7 @@ H5check_version(unsigned majnum, unsigned minnum, unsigned relnum)
static int checked = 0; /* If we've already checked the version info */
static unsigned int disable_version_check = 0; /* Set if the version check should be disabled */
static const char * version_mismatch_warning = VERSION_MISMATCH_WARNING;
+ static const char * release_mismatch_warning = RELEASE_MISMATCH_WARNING;
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_API_NOINIT_NOERR_NOFS
@@ -974,10 +975,7 @@ H5check_version(unsigned majnum, unsigned minnum, unsigned relnum)
}
/* H5_VERS_MAJOR and H5_VERS_MINOR must match */
- /* Cast relnum to int to avoid warning for unsigned < 0 comparison
- * in first release versions */
- if (H5_VERS_MAJOR != majnum || H5_VERS_MINOR != minnum || H5_VERS_RELEASE > (int)relnum) {
-
+ if (H5_VERS_MAJOR != majnum || H5_VERS_MINOR != minnum) {
switch (disable_version_check) {
case 0:
HDfprintf(stderr, "%s%s", version_mismatch_warning,
@@ -1012,9 +1010,10 @@ H5check_version(unsigned majnum, unsigned minnum, unsigned relnum)
break;
} /* end switch */
- } /* end if (H5_VERS_MAJOR != majnum || H5_VERS_MINOR != minnum || H5_VERS_RELEASE > relnum) */
+ } /* end if (H5_VERS_MAJOR != majnum || H5_VERS_MINOR != minnum) */
/* H5_VERS_RELEASE should be compatible, we will only add checks for exceptions */
+ /* Library develop release versions are incompatible by design */
if (H5_VERS_RELEASE != relnum) {
for (unsigned i = 0; i < VERS_RELEASE_EXCEPTIONS_SIZE; i++) {
/* Check for incompatible headers or incompatible library */
@@ -1022,7 +1021,7 @@ H5check_version(unsigned majnum, unsigned minnum, unsigned relnum)
switch (disable_version_check) {
case 0:
HDfprintf(
- stderr, "%s%s", version_mismatch_warning,
+ stderr, "%s%s", release_mismatch_warning,
"You can, at your own risk, disable this warning by setting the environment\n"
"variable 'HDF5_DISABLE_VERSION_CHECK' to a value of '1'.\n"
"Setting it to 2 or higher will suppress the warning messages totally.\n");
@@ -1041,7 +1040,7 @@ H5check_version(unsigned majnum, unsigned minnum, unsigned relnum)
"%s'HDF5_DISABLE_VERSION_CHECK' "
"environment variable is set to %d, application will\n"
"continue at your own risk.\n",
- version_mismatch_warning, disable_version_check);
+ release_mismatch_warning, disable_version_check);
/* Mention the versions we are referring to */
HDfprintf(stderr, "Headers are %u.%u.%u, library is %u.%u.%u\n", majnum, minnum,
relnum, (unsigned)H5_VERS_MAJOR, (unsigned)H5_VERS_MINOR,
diff --git a/src/H5AC.c b/src/H5AC.c
index 47d3a65..ac28a8c 100644
--- a/src/H5AC.c
+++ b/src/H5AC.c
@@ -1440,21 +1440,82 @@ H5AC_resize_entry(void *thing, size_t new_size)
cache_ptr = entry_ptr->cache_ptr;
HDassert(cache_ptr);
- /* Resize the entry */
- if (H5C_resize_entry(thing, new_size) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTRESIZE, FAIL, "can't resize entry")
-
#ifdef H5_HAVE_PARALLEL
- {
+ /* Log the generation of dirty bytes of metadata iff:
+ *
+ * 1) The entry is clean on entry, and this resize will dirty it
+ * (i.e. the current and new sizes are different), and
+ *
+ * 2) This is a parallel computation -- which it is if the aux_ptr
+ * is non-null.
+ *
+ * A few points to note about this section of the code:
+ *
+ * 1) This call must occur before the call to H5C_resize_entry() since
+ * H5AC__log_dirtied_entry() expects the target entry to be clean
+ * on entry.
+ *
+ * 2) This code has some basic issues in terms of the number of bytes
+ * added to the dirty bytes count.
+ *
+ * First, it adds the initial entry size to aux_ptr->dirty_bytes,
+ * not the final size. Note that this code used to use the final
+ * size, but code to support this has been removed from
+ * H5AC__log_dirtied_entry() for reasons unknown since I wrote this
+ * code.
+ *
+ * As long as all ranks do the same thing here, this probably doesn't
+ * matter much, although it will delay initiation of sync points.
+ *
+ * A more interesting point is that this code will not increment
+ * aux_ptr->dirty_bytes if a dirty entry is resized. At first glance
+ * this seems major, as particularly with the older file formats,
+ * resizes can be quite large. However, this is probably not an
+ * issue either, since such resizes will be accompanied by large
+ * amounts of dirty metadata creation in other areas -- which will
+ * cause aux_ptr->dirty_bytes to be incremented.
+ *
+ * The bottom line is that this code is probably OK, but the above
+ * points should be kept in mind.
+ *
+ * One final observation: This comment is occasioned by a bug caused
+ * by moving the call to H5AC__log_dirtied_entry() after the call to
+ * H5C_resize_entry(), and then only calling H5AC__log_dirtied_entry()
+ * if entry_ptr->is_dirty was false.
+ *
+ * Since H5C_resize_entry() marks the target entry dirty unless there
+ * is no change in size, this had the effect of not calling
+ * H5AC__log_dirtied_entry() when it should be, and corrupting
+ * the cleaned and dirtied lists used by rank 0 in the parallel
+ * version of the metadata cache.
+ *
+ * The point here is that you should be very careful when working with
+ * this code, and not modify it unless you fully understand it.
+ *
+ * JRM -- 2/28/22
+ */
+
+ if ((!entry_ptr->is_dirty) && (entry_ptr->size != new_size)) {
+
+ /* the entry is clean, and will be marked dirty in the resize
+ * operation.
+ */
H5AC_aux_t *aux_ptr;
aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
- if ((!entry_ptr->is_dirty) && (NULL != aux_ptr))
+
+ if (NULL != aux_ptr) {
+
if (H5AC__log_dirtied_entry(entry_ptr) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTMARKDIRTY, FAIL, "can't log dirtied entry")
+ }
}
#endif /* H5_HAVE_PARALLEL */
+ /* Resize the entry */
+ if (H5C_resize_entry(thing, new_size) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTRESIZE, FAIL, "can't resize entry")
+
done:
/* If currently logging, generate a message */
if (cache_ptr != NULL && cache_ptr->log_info != NULL)
diff --git a/src/H5C.c b/src/H5C.c
index 889351d..fa46ff2 100644
--- a/src/H5C.c
+++ b/src/H5C.c
@@ -138,16 +138,6 @@ static herr_t H5C__generate_image(H5F_t *f, H5C_t *cache_ptr, H5C_cache_entry_t
static herr_t H5C__verify_len_eoa(H5F_t *f, const H5C_class_t *type, haddr_t addr, size_t *len,
hbool_t actual);
-#if H5C_DO_SLIST_SANITY_CHECKS
-static hbool_t H5C__entry_in_skip_list(H5C_t *cache_ptr, H5C_cache_entry_t *target_ptr);
-#endif /* H5C_DO_SLIST_SANITY_CHECKS */
-
-#if H5C_DO_EXTREME_SANITY_CHECKS
-static herr_t H5C__validate_lru_list(H5C_t *cache_ptr);
-static herr_t H5C__validate_pinned_entry_list(H5C_t *cache_ptr);
-static herr_t H5C__validate_protected_entry_list(H5C_t *cache_ptr);
-#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
-
#ifndef NDEBUG
static void H5C__assert_flush_dep_nocycle(const H5C_cache_entry_t *entry,
const H5C_cache_entry_t *base_entry);
@@ -996,7 +986,7 @@ H5C_expunge_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr, unsigned flag
HDassert(H5F_addr_defined(addr));
#if H5C_DO_EXTREME_SANITY_CHECKS
- if (H5C__validate_lru_list(cache_ptr) < 0)
+ if (H5C_validate_lru_list(cache_ptr) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "LRU extreme sanity check failed on entry")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -1031,7 +1021,7 @@ H5C_expunge_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr, unsigned flag
done:
#if H5C_DO_EXTREME_SANITY_CHECKS
- if (H5C__validate_lru_list(cache_ptr) < 0)
+ if (H5C_validate_lru_list(cache_ptr) < 0)
HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "LRU extreme sanity check failed on exit")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -1138,8 +1128,8 @@ H5C_flush_cache(H5F_t *f, unsigned flags)
#endif /* H5C_DO_SANITY_CHECKS */
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on entry")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -1314,8 +1304,8 @@ H5C_insert_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *thing, u
#if H5C_DO_EXTREME_SANITY_CHECKS
/* no need to verify that entry is not already in the index as */
/* we already make that check below. */
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on entry")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -1424,6 +1414,7 @@ H5C_insert_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *thing, u
entry_ptr->serialization_count = 0;
#endif /* NDEBUG */
+ /* initialize tag list fields */
entry_ptr->tl_next = NULL;
entry_ptr->tl_prev = NULL;
entry_ptr->tag_info = NULL;
@@ -1503,8 +1494,8 @@ H5C_insert_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *thing, u
H5C__UPDATE_RP_FOR_INSERTION(cache_ptr, entry_ptr, FAIL)
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed just before done")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -1518,23 +1509,32 @@ H5C_insert_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *thing, u
#ifdef H5_HAVE_PARALLEL
if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
- coll_access = H5CX_get_coll_metadata_read();
+ coll_access = H5F_get_coll_metadata_reads(f);
entry_ptr->coll_access = coll_access;
if (coll_access) {
H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, FAIL)
/* Make sure the size of the collective entries in the cache remain in check */
- if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
- if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
- } /* end if */
+ if (H5P_USER_TRUE == H5F_COLL_MD_READ(f)) {
+ if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) {
+ if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
+ } /* end if */
+ } /* end if */
+ else {
+ if (cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100) {
+ if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
+ } /* end if */
+ } /* end else */
+ } /* end if */
#endif
done:
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on exit")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -1858,8 +1858,8 @@ H5C_move_entry(H5C_t *cache_ptr, const H5C_class_t *type, haddr_t old_addr, hadd
HDassert(H5F_addr_ne(old_addr, new_addr));
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on entry")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -1964,8 +1964,8 @@ H5C_move_entry(H5C_t *cache_ptr, const H5C_class_t *type, haddr_t old_addr, hadd
done:
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on exit")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -2011,8 +2011,7 @@ H5C_resize_entry(void *thing, size_t new_size)
HGOTO_ERROR(H5E_CACHE, H5E_BADTYPE, FAIL, "Entry isn't pinned or protected??")
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) || (H5C_validate_pinned_entry_list(cache_ptr) < 0))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on entry")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -2108,8 +2107,7 @@ H5C_resize_entry(void *thing, size_t new_size)
done:
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) || (H5C_validate_pinned_entry_list(cache_ptr) < 0))
HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on exit")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -2149,8 +2147,8 @@ H5C_pin_protected_entry(void *thing)
HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on entry")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -2164,8 +2162,8 @@ H5C_pin_protected_entry(void *thing)
done:
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on exit")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -2228,8 +2226,8 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
HDassert(H5F_addr_defined(addr));
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, NULL, "an extreme sanity check failed on entry")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -2248,7 +2246,7 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
#ifdef H5_HAVE_PARALLEL
if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
- coll_access = H5CX_get_coll_metadata_read();
+ coll_access = H5F_get_coll_metadata_reads(f);
#endif /* H5_HAVE_PARALLEL */
/* first check to see if the target is in cache */
@@ -2600,16 +2598,24 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
#ifdef H5_HAVE_PARALLEL
/* Make sure the size of the collective entries in the cache remain in check */
- if (coll_access)
- if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
- if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
-#endif /* H5_HAVE_PARALLEL */
+ if (coll_access) {
+ if (H5P_USER_TRUE == H5F_COLL_MD_READ(f)) {
+ if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
+ if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
+ } /* end if */
+ else {
+ if (cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100)
+ if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
+ } /* end else */
+ } /* end if */
+#endif /* H5_HAVE_PARALLEL */
done:
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, NULL, "an extreme sanity check failed on exit")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -3082,8 +3088,8 @@ H5C_unpin_entry(void *_entry_ptr)
HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on entry")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -3093,8 +3099,8 @@ H5C_unpin_entry(void *_entry_ptr)
done:
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on exit")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -3261,8 +3267,8 @@ H5C_unprotect(H5F_t *f, haddr_t addr, void *thing, unsigned flags)
was_clean = !(entry_ptr->is_dirty);
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on entry")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -3528,8 +3534,8 @@ H5C_unprotect(H5F_t *f, haddr_t addr, void *thing, unsigned flags)
done:
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on exit")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -6063,8 +6069,8 @@ H5C__flush_ring(H5F_t *f, H5C_ring_t ring, unsigned flags)
HDassert(ring < H5C_RING_NTYPES);
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on entry")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
@@ -7411,6 +7417,7 @@ H5C__load_entry(H5F_t *f,
entry->serialization_count = 0;
#endif /* NDEBUG */
+ /* initialize tag list fields */
entry->tl_next = NULL;
entry->tl_prev = NULL;
entry->tag_info = NULL;
@@ -7739,7 +7746,7 @@ done:
/*-------------------------------------------------------------------------
*
- * Function: H5C__validate_lru_list
+ * Function: H5C_validate_lru_list
*
* Purpose: Debugging function that scans the LRU list for errors.
*
@@ -7754,15 +7761,15 @@ done:
*-------------------------------------------------------------------------
*/
#if H5C_DO_EXTREME_SANITY_CHECKS
-static herr_t
-H5C__validate_lru_list(H5C_t *cache_ptr)
+herr_t
+H5C_validate_lru_list(H5C_t *cache_ptr)
{
int32_t len = 0;
size_t size = 0;
H5C_cache_entry_t *entry_ptr = NULL;
herr_t ret_value = SUCCEED; /* Return value */
- FUNC_ENTER_STATIC
+ FUNC_ENTER_NOAPI(FAIL)
HDassert(cache_ptr);
HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
@@ -7771,51 +7778,48 @@ H5C__validate_lru_list(H5C_t *cache_ptr)
(cache_ptr->LRU_head_ptr != cache_ptr->LRU_tail_ptr))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 1 failed")
- if (cache_ptr->LRU_list_len < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 2 failed")
-
if ((cache_ptr->LRU_list_len == 1) &&
((cache_ptr->LRU_head_ptr != cache_ptr->LRU_tail_ptr) || (cache_ptr->LRU_head_ptr == NULL) ||
(cache_ptr->LRU_head_ptr->size != cache_ptr->LRU_list_size)))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 3 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 2 failed")
if ((cache_ptr->LRU_list_len >= 1) &&
((cache_ptr->LRU_head_ptr == NULL) || (cache_ptr->LRU_head_ptr->prev != NULL) ||
(cache_ptr->LRU_tail_ptr == NULL) || (cache_ptr->LRU_tail_ptr->next != NULL)))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 4 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 3 failed")
entry_ptr = cache_ptr->LRU_head_ptr;
while (entry_ptr != NULL) {
if ((entry_ptr != cache_ptr->LRU_head_ptr) &&
((entry_ptr->prev == NULL) || (entry_ptr->prev->next != entry_ptr)))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 5 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 4 failed")
if ((entry_ptr != cache_ptr->LRU_tail_ptr) &&
((entry_ptr->next == NULL) || (entry_ptr->next->prev != entry_ptr)))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 6 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 5 failed")
if ((entry_ptr->is_pinned) || (entry_ptr->pinned_from_client) || (entry_ptr->pinned_from_cache))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 7 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 6 failed")
len++;
size += entry_ptr->size;
entry_ptr = entry_ptr->next;
}
- if ((cache_ptr->LRU_list_len != len) || (cache_ptr->LRU_list_size != size))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 8 failed")
+ if ((cache_ptr->LRU_list_len != (uint32_t)len) || (cache_ptr->LRU_list_size != size))
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 7 failed")
done:
if (ret_value != SUCCEED)
HDassert(0);
FUNC_LEAVE_NOAPI(ret_value)
-} /* H5C__validate_lru_list() */
+} /* H5C_validate_lru_list() */
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
/*-------------------------------------------------------------------------
*
- * Function: H5C__validate_pinned_entry_list
+ * Function: H5C_validate_pinned_entry_list
*
* Purpose: Debugging function that scans the pinned entry list for
* errors.
@@ -7831,15 +7835,15 @@ done:
*-------------------------------------------------------------------------
*/
#if H5C_DO_EXTREME_SANITY_CHECKS
-static herr_t
-H5C__validate_pinned_entry_list(H5C_t *cache_ptr)
+herr_t
+H5C_validate_pinned_entry_list(H5C_t *cache_ptr)
{
int32_t len = 0;
size_t size = 0;
H5C_cache_entry_t *entry_ptr = NULL;
herr_t ret_value = SUCCEED; /* Return value */
- FUNC_ENTER_STATIC
+ FUNC_ENTER_NOAPI(FAIL)
HDassert(cache_ptr);
HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
@@ -7848,54 +7852,51 @@ H5C__validate_pinned_entry_list(H5C_t *cache_ptr)
(cache_ptr->pel_head_ptr != cache_ptr->pel_tail_ptr))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 1 failed")
- if (cache_ptr->pel_len < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 2 failed")
-
if ((cache_ptr->pel_len == 1) &&
((cache_ptr->pel_head_ptr != cache_ptr->pel_tail_ptr) || (cache_ptr->pel_head_ptr == NULL) ||
(cache_ptr->pel_head_ptr->size != cache_ptr->pel_size)))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 3 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 2 failed")
if ((cache_ptr->pel_len >= 1) &&
((cache_ptr->pel_head_ptr == NULL) || (cache_ptr->pel_head_ptr->prev != NULL) ||
(cache_ptr->pel_tail_ptr == NULL) || (cache_ptr->pel_tail_ptr->next != NULL)))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 4 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 3 failed")
entry_ptr = cache_ptr->pel_head_ptr;
while (entry_ptr != NULL) {
if ((entry_ptr != cache_ptr->pel_head_ptr) &&
((entry_ptr->prev == NULL) || (entry_ptr->prev->next != entry_ptr)))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 5 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 4 failed")
if ((entry_ptr != cache_ptr->pel_tail_ptr) &&
((entry_ptr->next == NULL) || (entry_ptr->next->prev != entry_ptr)))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 6 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 5 failed")
if (!entry_ptr->is_pinned)
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 7 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 6 failed")
if (!(entry_ptr->pinned_from_client || entry_ptr->pinned_from_cache))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 8 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 7 failed")
len++;
size += entry_ptr->size;
entry_ptr = entry_ptr->next;
}
- if ((cache_ptr->pel_len != len) || (cache_ptr->pel_size != size))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 9 failed")
+ if ((cache_ptr->pel_len != (uint32_t)len) || (cache_ptr->pel_size != size))
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 8 failed")
done:
if (ret_value != SUCCEED)
HDassert(0);
FUNC_LEAVE_NOAPI(ret_value)
-} /* H5C__validate_pinned_entry_list() */
+} /* H5C_validate_pinned_entry_list() */
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
/*-------------------------------------------------------------------------
*
- * Function: H5C__validate_protected_entry_list
+ * Function: H5C_validate_protected_entry_list
*
* Purpose: Debugging function that scans the protected entry list for
* errors.
@@ -7911,15 +7912,15 @@ done:
*-------------------------------------------------------------------------
*/
#if H5C_DO_EXTREME_SANITY_CHECKS
-static herr_t
-H5C__validate_protected_entry_list(H5C_t *cache_ptr)
+herr_t
+H5C_validate_protected_entry_list(H5C_t *cache_ptr)
{
int32_t len = 0;
size_t size = 0;
H5C_cache_entry_t *entry_ptr = NULL;
herr_t ret_value = SUCCEED; /* Return value */
- FUNC_ENTER_STATIC
+ FUNC_ENTER_NOAPI(FAIL)
HDassert(cache_ptr);
HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
@@ -7928,54 +7929,51 @@ H5C__validate_protected_entry_list(H5C_t *cache_ptr)
(cache_ptr->pl_head_ptr != cache_ptr->pl_tail_ptr))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 1 failed")
- if (cache_ptr->pl_len < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 2 failed")
-
if ((cache_ptr->pl_len == 1) &&
((cache_ptr->pl_head_ptr != cache_ptr->pl_tail_ptr) || (cache_ptr->pl_head_ptr == NULL) ||
(cache_ptr->pl_head_ptr->size != cache_ptr->pl_size)))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 3 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 2 failed")
if ((cache_ptr->pl_len >= 1) &&
((cache_ptr->pl_head_ptr == NULL) || (cache_ptr->pl_head_ptr->prev != NULL) ||
(cache_ptr->pl_tail_ptr == NULL) || (cache_ptr->pl_tail_ptr->next != NULL)))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 4 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 3 failed")
entry_ptr = cache_ptr->pl_head_ptr;
while (entry_ptr != NULL) {
if ((entry_ptr != cache_ptr->pl_head_ptr) &&
((entry_ptr->prev == NULL) || (entry_ptr->prev->next != entry_ptr)))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 5 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 4 failed")
if ((entry_ptr != cache_ptr->pl_tail_ptr) &&
((entry_ptr->next == NULL) || (entry_ptr->next->prev != entry_ptr)))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 6 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 5 failed")
if (!entry_ptr->is_protected)
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 7 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 6 failed")
if (entry_ptr->is_read_only && (entry_ptr->ro_ref_count <= 0))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 8 failed")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 7 failed")
len++;
size += entry_ptr->size;
entry_ptr = entry_ptr->next;
}
- if ((cache_ptr->pl_len != len) || (cache_ptr->pl_size != size))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 9 failed")
+ if ((cache_ptr->pl_len != (uint32_t)len) || (cache_ptr->pl_size != size))
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Check 8 failed")
done:
if (ret_value != SUCCEED)
HDassert(0);
FUNC_LEAVE_NOAPI(ret_value)
-} /* H5C__validate_protected_entry_list() */
+} /* H5C_validate_protected_entry_list() */
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
/*-------------------------------------------------------------------------
*
- * Function: H5C__entry_in_skip_list
+ * Function: H5C_entry_in_skip_list
*
* Purpose: Debugging function that scans skip list to see if it
* is present. We need this, as it is possible for
@@ -7989,8 +7987,8 @@ done:
*-------------------------------------------------------------------------
*/
#if H5C_DO_SLIST_SANITY_CHECKS
-static hbool_t
-H5C__entry_in_skip_list(H5C_t *cache_ptr, H5C_cache_entry_t *target_ptr)
+hbool_t
+H5C_entry_in_skip_list(H5C_t *cache_ptr, H5C_cache_entry_t *target_ptr)
{
H5SL_node_t *node_ptr;
hbool_t in_slist;
@@ -8018,7 +8016,7 @@ H5C__entry_in_skip_list(H5C_t *cache_ptr, H5C_cache_entry_t *target_ptr)
}
return (in_slist);
-} /* H5C__entry_in_skip_list() */
+} /* H5C_entry_in_skip_list() */
#endif /* H5C_DO_SLIST_SANITY_CHECKS */
/*-------------------------------------------------------------------------
@@ -8508,8 +8506,8 @@ H5C__serialize_cache(H5F_t *f)
#endif /* H5C_DO_SANITY_CHECKS */
#if H5C_DO_EXTREME_SANITY_CHECKS
- if ((H5C__validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C__validate_pinned_entry_list(cache_ptr) < 0) || (H5C__validate_lru_list(cache_ptr) < 0))
+ if ((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
+ (H5C_validate_pinned_entry_list(cache_ptr) < 0) || (H5C_validate_lru_list(cache_ptr) < 0))
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on entry")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
diff --git a/src/H5CX.c b/src/H5CX.c
index 89e4c91..c5bb8e4 100644
--- a/src/H5CX.c
+++ b/src/H5CX.c
@@ -1423,7 +1423,7 @@ done:
* Purpose: Sanity checks and sets up collective operations.
*
* Note: Should be called for all API routines that modify file
- * file metadata but don't pass in an access property list.
+ * metadata but don't pass in an access property list.
*
* Return: Non-negative on success / Negative on failure
*
diff --git a/src/H5Cepoch.c b/src/H5Cepoch.c
index 3434fed..8655881 100644
--- a/src/H5Cepoch.c
+++ b/src/H5Cepoch.c
@@ -78,22 +78,21 @@ static herr_t H5C__epoch_marker_fsf_size(const void H5_ATTR_UNUSED *thing,
/* Local Variables */
/*******************/
-const H5AC_class_t H5AC_EPOCH_MARKER[1] = {{
- /* id = */ H5AC_EPOCH_MARKER_ID,
- /* name = */ "epoch marker",
- /* mem_type = */ H5FD_MEM_DEFAULT, /* value doesn't matter */
- /* flags = */ H5AC__CLASS_NO_FLAGS_SET,
- /* get_initial_load_size = */ H5C__epoch_marker_get_initial_load_size,
- /* get_final_load_size = */ H5C__epoch_marker_get_final_load_size,
- /* verify_chksum = */ H5C__epoch_marker_verify_chksum,
- /* deserialize = */ H5C__epoch_marker_deserialize,
- /* image_len = */ H5C__epoch_marker_image_len,
- /* pre_serialize = */ H5C__epoch_marker_pre_serialize,
- /* serialize = */ H5C__epoch_marker_serialize,
- /* notify = */ H5C__epoch_marker_notify,
- /* free_icr = */ H5C__epoch_marker_free_icr,
- /* fsf_size = */ H5C__epoch_marker_fsf_size,
-}};
+const H5AC_class_t H5AC_EPOCH_MARKER[1] = {
+ {/* id = */ H5AC_EPOCH_MARKER_ID,
+ /* name = */ "epoch marker",
+ /* mem_type = */ H5FD_MEM_DEFAULT, /* value doesn't matter */
+ /* flags = */ H5AC__CLASS_NO_FLAGS_SET,
+ /* get_initial_load_size = */ H5C__epoch_marker_get_initial_load_size,
+ /* get_final_load_size = */ H5C__epoch_marker_get_final_load_size,
+ /* verify_chksum = */ H5C__epoch_marker_verify_chksum,
+ /* deserialize = */ H5C__epoch_marker_deserialize,
+ /* image_len = */ H5C__epoch_marker_image_len,
+ /* pre_serialize = */ H5C__epoch_marker_pre_serialize,
+ /* serialize = */ H5C__epoch_marker_serialize,
+ /* notify = */ H5C__epoch_marker_notify,
+ /* free_icr = */ H5C__epoch_marker_free_icr,
+ /* fsf_size = */ H5C__epoch_marker_fsf_size}};
/***************************************************************************
* Class functions for H5C__EPOCH_MAKER_TYPE:
diff --git a/src/H5Cpkg.h b/src/H5Cpkg.h
index 30b86b9..61c3afc 100644
--- a/src/H5Cpkg.h
+++ b/src/H5Cpkg.h
@@ -1011,7 +1011,7 @@ if ( ( (cache_ptr) == NULL ) || \
( H5C__HASH_FCN((entry_ptr)->addr) >= H5C__HASH_TABLE_LEN ) || \
( (cache_ptr)->index_size != \
((cache_ptr)->clean_index_size + \
- (cache_ptr)->dirty_index_size) ) || \
+ (cache_ptr)->dirty_index_size) ) || \
( (cache_ptr)->index_size < ((cache_ptr)->clean_index_size) ) || \
( (cache_ptr)->index_size < ((cache_ptr)->dirty_index_size) ) || \
( (entry_ptr)->ring <= H5C_RING_UNDEFINED ) || \
@@ -1034,7 +1034,7 @@ if ( ( (cache_ptr) == NULL ) || \
( (cache_ptr)->magic != H5C__H5C_T_MAGIC ) || \
( (cache_ptr)->index_size != \
((cache_ptr)->clean_index_size + \
- (cache_ptr)->dirty_index_size) ) || \
+ (cache_ptr)->dirty_index_size) ) || \
( (cache_ptr)->index_size < ((cache_ptr)->clean_index_size) ) || \
( (cache_ptr)->index_size < ((cache_ptr)->dirty_index_size) ) || \
( (cache_ptr)->index_ring_len[(entry_ptr)->ring] == 0 ) || \
@@ -1071,7 +1071,7 @@ if ( ( (cache_ptr) == NULL ) || \
( (entry_ptr)->ht_prev != NULL ) ) || \
( (cache_ptr)->index_size != \
((cache_ptr)->clean_index_size + \
- (cache_ptr)->dirty_index_size) ) || \
+ (cache_ptr)->dirty_index_size) ) || \
( (cache_ptr)->index_size < ((cache_ptr)->clean_index_size) ) || \
( (cache_ptr)->index_size < ((cache_ptr)->dirty_index_size) ) || \
( (entry_ptr)->ring <= H5C_RING_UNDEFINED ) || \
@@ -1102,7 +1102,7 @@ if ( ( (cache_ptr) == NULL ) || \
( (entry_ptr)->ht_prev != NULL ) || \
( (cache_ptr)->index_size != \
((cache_ptr)->clean_index_size + \
- (cache_ptr)->dirty_index_size) ) || \
+ (cache_ptr)->dirty_index_size) ) || \
( (cache_ptr)->index_size < ((cache_ptr)->clean_index_size) ) || \
( (cache_ptr)->index_size < ((cache_ptr)->dirty_index_size) ) || \
( (cache_ptr)->index_ring_len[(entry_ptr)->ring] > \
@@ -1161,7 +1161,7 @@ if ( ( (cache_ptr) == NULL ) || \
}
#define H5C__PRE_HT_ENTRY_SIZE_CHANGE_SC(cache_ptr, old_size, new_size, \
- entry_ptr, was_clean) \
+ entry_ptr, was_clean) \
if ( ( (cache_ptr) == NULL ) || \
( (cache_ptr)->index_len <= 0 ) || \
( (cache_ptr)->index_size <= 0 ) || \
@@ -1175,9 +1175,9 @@ if ( ( (cache_ptr) == NULL ) || \
( (cache_ptr)->index_size < ((cache_ptr)->clean_index_size) ) || \
( (cache_ptr)->index_size < ((cache_ptr)->dirty_index_size) ) || \
( ( !( was_clean ) || \
- ( (cache_ptr)->clean_index_size < (old_size) ) ) && \
- ( ( (was_clean) ) || \
- ( (cache_ptr)->dirty_index_size < (old_size) ) ) ) || \
+ ( (cache_ptr)->clean_index_size < (old_size) ) ) && \
+ ( ( (was_clean) ) || \
+ ( (cache_ptr)->dirty_index_size < (old_size) ) ) ) || \
( (entry_ptr) == NULL ) || \
( (entry_ptr)->ring <= H5C_RING_UNDEFINED ) || \
( (entry_ptr)->ring >= H5C_RING_NTYPES ) || \
@@ -1196,20 +1196,20 @@ if ( ( (cache_ptr) == NULL ) || \
}
#define H5C__POST_HT_ENTRY_SIZE_CHANGE_SC(cache_ptr, old_size, new_size, \
- entry_ptr) \
+ entry_ptr) \
if ( ( (cache_ptr) == NULL ) || \
( (cache_ptr)->index_len <= 0 ) || \
( (cache_ptr)->index_size <= 0 ) || \
( (new_size) > (cache_ptr)->index_size ) || \
( (cache_ptr)->index_size != \
- ((cache_ptr)->clean_index_size + \
+ ((cache_ptr)->clean_index_size + \
(cache_ptr)->dirty_index_size) ) || \
( (cache_ptr)->index_size < ((cache_ptr)->clean_index_size) ) || \
( (cache_ptr)->index_size < ((cache_ptr)->dirty_index_size) ) || \
( ( !((entry_ptr)->is_dirty ) || \
- ( (cache_ptr)->dirty_index_size < (new_size) ) ) && \
- ( ( ((entry_ptr)->is_dirty) ) || \
- ( (cache_ptr)->clean_index_size < (new_size) ) ) ) || \
+ ( (cache_ptr)->dirty_index_size < (new_size) ) ) && \
+ ( ( ((entry_ptr)->is_dirty) ) || \
+ ( (cache_ptr)->clean_index_size < (new_size) ) ) ) || \
( ( (cache_ptr)->index_len == 1 ) && \
( (cache_ptr)->index_size != (new_size) ) ) || \
( (cache_ptr)->index_ring_len[(entry_ptr)->ring] > \
@@ -1465,10 +1465,10 @@ if ( ( (cache_ptr)->index_size != \
H5C__PRE_HT_UPDATE_FOR_ENTRY_CLEAN_SC(cache_ptr, entry_ptr); \
(cache_ptr)->dirty_index_size -= (entry_ptr)->size; \
((cache_ptr)->dirty_index_ring_size[entry_ptr->ring]) \
- -= (entry_ptr)->size; \
+ -= (entry_ptr)->size; \
(cache_ptr)->clean_index_size += (entry_ptr)->size; \
((cache_ptr)->clean_index_ring_size[entry_ptr->ring]) \
- += (entry_ptr)->size; \
+ += (entry_ptr)->size; \
H5C__POST_HT_UPDATE_FOR_ENTRY_CLEAN_SC(cache_ptr, entry_ptr); \
}
@@ -1477,18 +1477,18 @@ if ( ( (cache_ptr)->index_size != \
H5C__PRE_HT_UPDATE_FOR_ENTRY_DIRTY_SC(cache_ptr, entry_ptr); \
(cache_ptr)->clean_index_size -= (entry_ptr)->size; \
((cache_ptr)->clean_index_ring_size[entry_ptr->ring]) \
- -= (entry_ptr)->size; \
+ -= (entry_ptr)->size; \
(cache_ptr)->dirty_index_size += (entry_ptr)->size; \
((cache_ptr)->dirty_index_ring_size[entry_ptr->ring]) \
- += (entry_ptr)->size; \
+ += (entry_ptr)->size; \
H5C__POST_HT_UPDATE_FOR_ENTRY_DIRTY_SC(cache_ptr, entry_ptr); \
}
#define H5C__UPDATE_INDEX_FOR_SIZE_CHANGE(cache_ptr, old_size, new_size, \
- entry_ptr, was_clean) \
+ entry_ptr, was_clean) \
{ \
H5C__PRE_HT_ENTRY_SIZE_CHANGE_SC(cache_ptr, old_size, new_size, \
- entry_ptr, was_clean) \
+ entry_ptr, was_clean) \
(cache_ptr)->index_size -= (old_size); \
(cache_ptr)->index_size += (new_size); \
((cache_ptr)->index_ring_size[entry_ptr->ring]) -= (old_size); \
@@ -1497,14 +1497,14 @@ if ( ( (cache_ptr)->index_size != \
(cache_ptr)->clean_index_size -= (old_size); \
((cache_ptr)->clean_index_ring_size[entry_ptr->ring])-= (old_size); \
} else { \
- (cache_ptr)->dirty_index_size -= (old_size); \
+ (cache_ptr)->dirty_index_size -= (old_size); \
((cache_ptr)->dirty_index_ring_size[entry_ptr->ring])-= (old_size); \
} \
if((entry_ptr)->is_dirty) { \
(cache_ptr)->dirty_index_size += (new_size); \
((cache_ptr)->dirty_index_ring_size[entry_ptr->ring])+= (new_size); \
} else { \
- (cache_ptr)->clean_index_size += (new_size); \
+ (cache_ptr)->clean_index_size += (new_size); \
((cache_ptr)->clean_index_ring_size[entry_ptr->ring])+= (new_size); \
} \
H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->il_len, \
@@ -1791,7 +1791,7 @@ if ( ( (cache_ptr)->index_size != \
} else { /* slist disabled */ \
\
HDassert( (cache_ptr)->slist_len == 0 ); \
- HDassert( (cache_ptr)->slist_size == 0 ); \
+ HDassert( (cache_ptr)->slist_size == 0 ); \
} \
} /* H5C__REMOVE_ENTRY_FROM_SLIST */
@@ -2033,16 +2033,16 @@ if ( ( (cache_ptr)->index_size != \
/* modified LRU specific code */ \
\
/* remove the entry from the LRU list, and re-insert it at the head.\
- */ \
+ */ \
\
H5C__DLL_REMOVE((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
H5C__DLL_PREPEND((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
/* Use the dirty flag to infer whether the entry is on the clean or \
@@ -2096,16 +2096,16 @@ if ( ( (cache_ptr)->index_size != \
/* modified LRU specific code */ \
\
/* remove the entry from the LRU list, and re-insert it at the head \
- */ \
+ */ \
\
H5C__DLL_REMOVE((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
H5C__DLL_PREPEND((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
/* End modified LRU specific code. */ \
@@ -2288,28 +2288,28 @@ if ( ( (cache_ptr)->index_size != \
/* modified LRU specific code */ \
\
/* remove the entry from the LRU list, and re-insert it at the \
- * head. \
- */ \
+ * head. \
+ */ \
\
H5C__DLL_REMOVE((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
H5C__DLL_PREPEND((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
/* since the entry is being flushed or cleared, one would think \
- * that it must be dirty -- but that need not be the case. Use the \
- * dirty flag to infer whether the entry is on the clean or dirty \
- * LRU list, and remove it. Then insert it at the head of the \
- * clean LRU list. \
+ * that it must be dirty -- but that need not be the case. Use the \
+ * dirty flag to infer whether the entry is on the clean or dirty \
+ * LRU list, and remove it. Then insert it at the head of the \
+ * clean LRU list. \
* \
* The function presumes that a dirty entry will be either cleared \
- * or flushed shortly, so it is OK if we put a dirty entry on the \
- * clean LRU list. \
+ * or flushed shortly, so it is OK if we put a dirty entry on the \
+ * clean LRU list. \
*/ \
\
if ( (entry_ptr)->is_dirty ) { \
@@ -2350,17 +2350,17 @@ if ( ( (cache_ptr)->index_size != \
/* modified LRU specific code */ \
\
/* remove the entry from the LRU list, and re-insert it at the \
- * head. \
- */ \
+ * head. \
+ */ \
\
H5C__DLL_REMOVE((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
H5C__DLL_PREPEND((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
/* End modified LRU specific code. */ \
@@ -2424,7 +2424,7 @@ if ( ( (cache_ptr)->index_size != \
\
H5C__DLL_APPEND((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
/* insert the entry at the tail of the clean or dirty LRU list as \
@@ -2465,7 +2465,7 @@ if ( ( (cache_ptr)->index_size != \
(cache_ptr)->pel_tail_ptr, \
(cache_ptr)->pel_len, \
(cache_ptr)->pel_size, (fail_val)) \
- \
+ \
} else { \
\
/* modified LRU specific code */ \
@@ -2474,7 +2474,7 @@ if ( ( (cache_ptr)->index_size != \
\
H5C__DLL_APPEND((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
/* End modified LRU specific code. */ \
@@ -2558,7 +2558,7 @@ if ( ( (cache_ptr)->index_size != \
\
H5C__DLL_PREPEND((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
/* insert the entry at the head of the clean or dirty LRU list as \
@@ -2599,7 +2599,7 @@ if ( ( (cache_ptr)->index_size != \
(cache_ptr)->pel_tail_ptr, \
(cache_ptr)->pel_len, \
(cache_ptr)->pel_size, (fail_val)) \
- \
+ \
} else { \
\
/* modified LRU specific code */ \
@@ -2608,7 +2608,7 @@ if ( ( (cache_ptr)->index_size != \
\
H5C__DLL_PREPEND((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
/* End modified LRU specific code. */ \
@@ -2677,12 +2677,12 @@ if ( ( (cache_ptr)->index_size != \
HDassert( !((entry_ptr)->is_read_only) ); \
HDassert( ((entry_ptr)->ro_ref_count) == 0 ); \
HDassert( (entry_ptr)->size > 0 ); \
- \
+ \
if ( (entry_ptr)->is_pinned ) { \
\
H5C__DLL_REMOVE((entry_ptr), (cache_ptr)->pel_head_ptr, \
- (cache_ptr)->pel_tail_ptr, \
- (cache_ptr)->pel_len, \
+ (cache_ptr)->pel_tail_ptr, \
+ (cache_ptr)->pel_len, \
(cache_ptr)->pel_size, (fail_val)) \
\
} else { \
@@ -2693,7 +2693,7 @@ if ( ( (cache_ptr)->index_size != \
\
H5C__DLL_REMOVE((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
/* Similarly, remove the entry from the clean or dirty LRU list \
@@ -2739,12 +2739,12 @@ if ( ( (cache_ptr)->index_size != \
HDassert( !((entry_ptr)->is_read_only) ); \
HDassert( ((entry_ptr)->ro_ref_count) == 0 ); \
HDassert( (entry_ptr)->size > 0 ); \
- \
+ \
if ( (entry_ptr)->is_pinned ) { \
\
H5C__DLL_REMOVE((entry_ptr), (cache_ptr)->pel_head_ptr, \
- (cache_ptr)->pel_tail_ptr, \
- (cache_ptr)->pel_len, \
+ (cache_ptr)->pel_tail_ptr, \
+ (cache_ptr)->pel_len, \
(cache_ptr)->pel_size, (fail_val)) \
\
} else { \
@@ -2755,7 +2755,7 @@ if ( ( (cache_ptr)->index_size != \
\
H5C__DLL_REMOVE((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
/* End modified LRU specific code. */ \
@@ -2804,21 +2804,21 @@ if ( ( (cache_ptr)->index_size != \
HDassert( ((entry_ptr)->ro_ref_count) == 0 ); \
HDassert( (entry_ptr)->size > 0 ); \
\
- if ( ! ( (entry_ptr)->is_pinned ) && ! ( (entry_ptr->is_protected ) ) ) { \
- \
+ if ( ! ( (entry_ptr)->is_pinned ) && ! ( (entry_ptr->is_protected ) ) ) {\
+ \
/* modified LRU specific code */ \
\
/* remove the entry from the LRU list, and re-insert it at the head. \
- */ \
+ */ \
\
H5C__DLL_REMOVE((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
H5C__DLL_PREPEND((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
/* remove the entry from either the clean or dirty LUR list as \
@@ -2827,7 +2827,7 @@ if ( ( (cache_ptr)->index_size != \
if ( was_dirty ) { \
\
H5C__AUX_DLL_REMOVE((entry_ptr), \
- (cache_ptr)->dLRU_head_ptr, \
+ (cache_ptr)->dLRU_head_ptr, \
(cache_ptr)->dLRU_tail_ptr, \
(cache_ptr)->dLRU_list_len, \
(cache_ptr)->dLRU_list_size, \
@@ -2836,34 +2836,34 @@ if ( ( (cache_ptr)->index_size != \
} else { \
\
H5C__AUX_DLL_REMOVE((entry_ptr), \
- (cache_ptr)->cLRU_head_ptr, \
+ (cache_ptr)->cLRU_head_ptr, \
(cache_ptr)->cLRU_tail_ptr, \
(cache_ptr)->cLRU_list_len, \
(cache_ptr)->cLRU_list_size, \
- (fail_val)) \
+ (fail_val)) \
} \
\
/* insert the entry at the head of either the clean or dirty \
- * LRU list as appropriate. \
+ * LRU list as appropriate. \
*/ \
\
if ( (entry_ptr)->is_dirty ) { \
\
H5C__AUX_DLL_PREPEND((entry_ptr), \
- (cache_ptr)->dLRU_head_ptr, \
+ (cache_ptr)->dLRU_head_ptr, \
(cache_ptr)->dLRU_tail_ptr, \
(cache_ptr)->dLRU_list_len, \
(cache_ptr)->dLRU_list_size, \
- (fail_val)) \
+ (fail_val)) \
\
} else { \
\
H5C__AUX_DLL_PREPEND((entry_ptr), \
- (cache_ptr)->cLRU_head_ptr, \
+ (cache_ptr)->cLRU_head_ptr, \
(cache_ptr)->cLRU_tail_ptr, \
(cache_ptr)->cLRU_list_len, \
(cache_ptr)->cLRU_list_size, \
- (fail_val)) \
+ (fail_val)) \
} \
\
/* End modified LRU specific code. */ \
@@ -2872,7 +2872,7 @@ if ( ( (cache_ptr)->index_size != \
#else /* H5C_MAINTAIN_CLEAN_AND_DIRTY_LRU_LISTS */
-#define H5C__UPDATE_RP_FOR_MOVE(cache_ptr, entry_ptr, was_dirty, fail_val) \
+#define H5C__UPDATE_RP_FOR_MOVE(cache_ptr, entry_ptr, was_dirty, fail_val) \
{ \
HDassert( (cache_ptr) ); \
HDassert( (cache_ptr)->magic == H5C__H5C_T_MAGIC ); \
@@ -2881,21 +2881,21 @@ if ( ( (cache_ptr)->index_size != \
HDassert( ((entry_ptr)->ro_ref_count) == 0 ); \
HDassert( (entry_ptr)->size > 0 ); \
\
- if ( ! ( (entry_ptr)->is_pinned ) && ! ( (entry_ptr->is_protected ) ) ) { \
- \
+ if ( ! ( (entry_ptr)->is_pinned ) && ! ( (entry_ptr->is_protected ) ) ) {\
+ \
/* modified LRU specific code */ \
\
/* remove the entry from the LRU list, and re-insert it at the head. \
- */ \
+ */ \
\
H5C__DLL_REMOVE((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
H5C__DLL_PREPEND((entry_ptr), (cache_ptr)->LRU_head_ptr, \
(cache_ptr)->LRU_tail_ptr, \
- (cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_len, \
(cache_ptr)->LRU_list_size, (fail_val)) \
\
/* End modified LRU specific code. */ \
@@ -2952,49 +2952,49 @@ if ( ( (cache_ptr)->index_size != \
\
if ( (entry_ptr)->coll_access ) { \
\
- H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->coll_list_len, \
- (cache_ptr)->coll_list_size, \
- (entry_ptr)->size, \
- (new_size)); \
- \
+ H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->coll_list_len, \
+ (cache_ptr)->coll_list_size, \
+ (entry_ptr)->size, \
+ (new_size)); \
+ \
} \
\
if ( (entry_ptr)->is_pinned ) { \
\
- H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->pel_len, \
- (cache_ptr)->pel_size, \
- (entry_ptr)->size, \
- (new_size)); \
- \
+ H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->pel_len, \
+ (cache_ptr)->pel_size, \
+ (entry_ptr)->size, \
+ (new_size)); \
+ \
} else { \
\
/* modified LRU specific code */ \
\
- /* Update the size of the LRU list */ \
+ /* Update the size of the LRU list */ \
\
- H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->LRU_list_len, \
- (cache_ptr)->LRU_list_size, \
- (entry_ptr)->size, \
- (new_size)); \
+ H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_size, \
+ (entry_ptr)->size, \
+ (new_size)); \
\
/* Similarly, update the size of the clean or dirty LRU list as \
- * appropriate. At present, the entry must be clean, but that \
- * could change. \
+ * appropriate. At present, the entry must be clean, but that \
+ * could change. \
*/ \
\
if ( (entry_ptr)->is_dirty ) { \
\
- H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->dLRU_list_len, \
- (cache_ptr)->dLRU_list_size, \
- (entry_ptr)->size, \
- (new_size)); \
+ H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->dLRU_list_len, \
+ (cache_ptr)->dLRU_list_size, \
+ (entry_ptr)->size, \
+ (new_size)); \
\
} else { \
\
- H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->cLRU_list_len, \
- (cache_ptr)->cLRU_list_size, \
- (entry_ptr)->size, \
- (new_size)); \
+ H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->cLRU_list_len, \
+ (cache_ptr)->cLRU_list_size, \
+ (entry_ptr)->size, \
+ (new_size)); \
} \
\
/* End modified LRU specific code. */ \
@@ -3017,21 +3017,21 @@ if ( ( (cache_ptr)->index_size != \
\
if ( (entry_ptr)->is_pinned ) { \
\
- H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->pel_len, \
- (cache_ptr)->pel_size, \
- (entry_ptr)->size, \
- (new_size)); \
+ H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->pel_len, \
+ (cache_ptr)->pel_size, \
+ (entry_ptr)->size, \
+ (new_size)); \
\
} else { \
\
/* modified LRU specific code */ \
\
- /* Update the size of the LRU list */ \
+ /* Update the size of the LRU list */ \
\
- H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->LRU_list_len, \
- (cache_ptr)->LRU_list_size, \
- (entry_ptr)->size, \
- (new_size)); \
+ H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->LRU_list_len, \
+ (cache_ptr)->LRU_list_size, \
+ (entry_ptr)->size, \
+ (new_size)); \
\
/* End modified LRU specific code. */ \
} \
@@ -3318,7 +3318,7 @@ if ( ( (hd_ptr) == NULL ) || \
( (Size) < (entry_ptr)->size ) || \
( ( (Size) == (entry_ptr)->size ) && ( ! ( (len) == 1 ) ) ) || \
( ( (entry_ptr)->coll_prev == NULL ) && ( (hd_ptr) != (entry_ptr) ) ) || \
- ( ( (entry_ptr)->coll_next == NULL ) && ( (tail_ptr) != (entry_ptr) ) ) || \
+ ( ( (entry_ptr)->coll_next == NULL ) && ( (tail_ptr) != (entry_ptr) ) ) ||\
( ( (len) == 1 ) && \
( ! ( ( (hd_ptr) == (entry_ptr) ) && ( (tail_ptr) == (entry_ptr) ) && \
( (entry_ptr)->coll_next == NULL ) && \
@@ -3350,10 +3350,10 @@ if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \
) \
) { \
HDassert(0 && "COLL DLL sanity check failed"); \
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "COLL DLL sanity check failed") \
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "COLL DLL sanity check failed")\
}
-#define H5C__COLL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \
+#define H5C__COLL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv)\
if ( ( (entry_ptr) == NULL ) || \
( (entry_ptr)->coll_next != NULL ) || \
( (entry_ptr)->coll_prev != NULL ) || \
@@ -5074,7 +5074,7 @@ H5_DLL herr_t H5C__generate_cache_image(H5F_t *f, H5C_t *cache_ptr);
H5_DLL herr_t H5C__load_cache_image(H5F_t *f);
H5_DLL herr_t H5C__mark_flush_dep_serialized(H5C_cache_entry_t * entry_ptr);
H5_DLL herr_t H5C__mark_flush_dep_unserialized(H5C_cache_entry_t * entry_ptr);
-H5_DLL herr_t H5C__make_space_in_cache(H5F_t * f, size_t space_needed,
+H5_DLL herr_t H5C__make_space_in_cache(H5F_t * f, size_t space_needed,
hbool_t write_permitted);
H5_DLL herr_t H5C__flush_marked_entries(H5F_t * f);
H5_DLL herr_t H5C__serialize_cache(H5F_t *f);
diff --git a/src/H5Cprivate.h b/src/H5Cprivate.h
index 8a1043e..9514443 100644
--- a/src/H5Cprivate.h
+++ b/src/H5Cprivate.h
@@ -2292,6 +2292,16 @@ H5_DLL herr_t H5C_cache_image_status(H5F_t *f, hbool_t *load_ci_ptr, hbool_t *
H5_DLL hbool_t H5C_cache_image_pending(const H5C_t *cache_ptr);
H5_DLL herr_t H5C_get_mdc_image_info(const H5C_t *cache_ptr, haddr_t *image_addr, hsize_t *image_len);
+#if H5C_DO_SLIST_SANITY_CHECKS
+H5_DLL hbool_t H5C_entry_in_skip_list(H5C_t *cache_ptr, H5C_cache_entry_t *target_ptr);
+#endif
+
+#if H5C_DO_EXTREME_SANITY_CHECKS
+H5_DLL herr_t H5C_validate_lru_list(H5C_t *cache_ptr);
+H5_DLL herr_t H5C_validate_pinned_entry_list(H5C_t *cache_ptr);
+H5_DLL herr_t H5C_validate_protected_entry_list(H5C_t *cache_ptr);
+#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
+
/* Logging functions */
H5_DLL herr_t H5C_start_logging(H5C_t *cache);
H5_DLL herr_t H5C_stop_logging(H5C_t *cache);
diff --git a/src/H5Dchunk.c b/src/H5Dchunk.c
index 5d7c1b2..e4d8706 100644
--- a/src/H5Dchunk.c
+++ b/src/H5Dchunk.c
@@ -239,10 +239,14 @@ typedef struct H5D_chunk_file_iter_ud_t {
#ifdef H5_HAVE_PARALLEL
/* information to construct a collective I/O operation for filling chunks */
-typedef struct H5D_chunk_coll_info_t {
- size_t num_io; /* Number of write operations */
- haddr_t *addr; /* array of the file addresses of the write operation */
-} H5D_chunk_coll_info_t;
+typedef struct H5D_chunk_coll_fill_info_t {
+ size_t num_chunks; /* Number of chunks in the write operation */
+ struct chunk_coll_fill_info {
+ haddr_t addr; /* File address of the chunk */
+ size_t chunk_size; /* Size of the chunk in the file */
+ hbool_t unfiltered_partial_chunk;
+ } * chunk_info;
+} H5D_chunk_coll_fill_info_t;
#endif /* H5_HAVE_PARALLEL */
typedef struct H5D_chunk_iter_ud_t {
@@ -287,9 +291,6 @@ static int H5D__chunk_format_convert_cb(const H5D_chunk_rec_t *chunk_rec, void *
/* Helper routines */
static herr_t H5D__chunk_set_info_real(H5O_layout_chunk_t *layout, unsigned ndims, const hsize_t *curr_dims,
const hsize_t *max_dims);
-static void * H5D__chunk_mem_alloc(size_t size, const H5O_pline_t *pline);
-static void * H5D__chunk_mem_xfree(void *chk, const void *pline);
-static void * H5D__chunk_mem_realloc(void *chk, size_t size, const H5O_pline_t *pline);
static herr_t H5D__chunk_cinfo_cache_reset(H5D_chunk_cached_t *last);
static herr_t H5D__chunk_cinfo_cache_update(H5D_chunk_cached_t *last, const H5D_chunk_ud_t *udata);
static hbool_t H5D__chunk_cinfo_cache_found(const H5D_chunk_cached_t *last, H5D_chunk_ud_t *udata);
@@ -306,8 +307,6 @@ static herr_t H5D__chunk_mem_cb(void *elem, const H5T_t *type, unsigned ndims,
static unsigned H5D__chunk_hash_val(const H5D_shared_t *shared, const hsize_t *scaled);
static herr_t H5D__chunk_flush_entry(const H5D_t *dset, H5D_rdcc_ent_t *ent, hbool_t reset);
static herr_t H5D__chunk_cache_evict(const H5D_t *dset, H5D_rdcc_ent_t *ent, hbool_t flush);
-static hbool_t H5D__chunk_is_partial_edge_chunk(unsigned dset_ndims, const uint32_t *chunk_dims,
- const hsize_t *chunk_scaled, const hsize_t *dset_dims);
static void * H5D__chunk_lock(const H5D_io_info_t *io_info, H5D_chunk_ud_t *udata, hbool_t relax,
hbool_t prev_unfilt_chunk);
static herr_t H5D__chunk_unlock(const H5D_io_info_t *io_info, const H5D_chunk_ud_t *udata, hbool_t dirty,
@@ -315,9 +314,9 @@ static herr_t H5D__chunk_unlock(const H5D_io_info_t *io_info, const H5D_chunk_
static herr_t H5D__chunk_cache_prune(const H5D_t *dset, size_t size);
static herr_t H5D__chunk_prune_fill(H5D_chunk_it_ud1_t *udata, hbool_t new_unfilt_chunk);
#ifdef H5_HAVE_PARALLEL
-static herr_t H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_info_t *chunk_info,
- size_t chunk_size, const void *fill_buf);
-static int H5D__chunk_cmp_addr(const void *addr1, const void *addr2);
+static herr_t H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_fill_info_t *chunk_fill_info,
+ const void *fill_buf, const void *partial_chunk_fill_buf);
+static int H5D__chunk_cmp_coll_fill_info(const void *_entry1, const void *_entry2);
#endif /* H5_HAVE_PARALLEL */
/* Debugging helper routine callback */
@@ -1362,7 +1361,7 @@ done:
*
*-------------------------------------------------------------------------
*/
-static void *
+void *
H5D__chunk_mem_alloc(size_t size, const H5O_pline_t *pline)
{
void *ret_value = NULL; /* Return value */
@@ -1393,7 +1392,7 @@ H5D__chunk_mem_alloc(size_t size, const H5O_pline_t *pline)
*
*-------------------------------------------------------------------------
*/
-static void *
+void *
H5D__chunk_mem_xfree(void *chk, const void *_pline)
{
const H5O_pline_t *pline = (const H5O_pline_t *)_pline;
@@ -1417,7 +1416,7 @@ H5D__chunk_mem_xfree(void *chk, const void *_pline)
* calls H5D__chunk_mem_xfree and discards the return value.
*-------------------------------------------------------------------------
*/
-static void
+void
H5D__chunk_mem_free(void *chk, const void *_pline)
{
(void)H5D__chunk_mem_xfree(chk, _pline);
@@ -1437,7 +1436,7 @@ H5D__chunk_mem_free(void *chk, const void *_pline)
*
*-------------------------------------------------------------------------
*/
-static void *
+void *
H5D__chunk_mem_realloc(void *chk, size_t size, const H5O_pline_t *pline)
{
void *ret_value = NULL; /* Return value */
@@ -3178,7 +3177,9 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat
unsigned idx = 0; /* Index of chunk in cache, if present */
hbool_t found = FALSE; /* In cache? */
#ifdef H5_HAVE_PARALLEL
- hbool_t reenable_coll_md_reads = FALSE;
+ H5P_coll_md_read_flag_t md_reads_file_flag;
+ hbool_t md_reads_context_flag;
+ hbool_t restore_md_reads_state = FALSE;
#endif
herr_t ret_value = SUCCEED; /* Return value */
@@ -3252,11 +3253,10 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat
* processes.
*/
if (H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI)) {
- hbool_t do_coll_md_reads = H5CX_get_coll_metadata_read();
- if (do_coll_md_reads) {
- H5CX_set_coll_metadata_read(FALSE);
- reenable_coll_md_reads = TRUE;
- }
+ md_reads_file_flag = H5P_FORCE_FALSE;
+ md_reads_context_flag = FALSE;
+ H5F_set_coll_metadata_reads(idx_info.f, &md_reads_file_flag, &md_reads_context_flag);
+ restore_md_reads_state = TRUE;
}
#endif /* H5_HAVE_PARALLEL */
@@ -3302,8 +3302,8 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat
done:
#ifdef H5_HAVE_PARALLEL
/* Re-enable collective metadata reads if we disabled them */
- if (reenable_coll_md_reads)
- H5CX_set_coll_metadata_read(TRUE);
+ if (restore_md_reads_state)
+ H5F_set_coll_metadata_reads(dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag);
#endif /* H5_HAVE_PARALLEL */
FUNC_LEAVE_NOAPI(ret_value)
@@ -4319,8 +4319,8 @@ H5D__chunk_allocate(const H5D_io_info_t *io_info, hbool_t full_overwrite, const
hbool_t blocks_written = FALSE; /* Flag to indicate that chunk was actually written */
hbool_t using_mpi =
FALSE; /* Flag to indicate that the file is being accessed with an MPI-capable file driver */
- H5D_chunk_coll_info_t chunk_info; /* chunk address information for doing I/O */
-#endif /* H5_HAVE_PARALLEL */
+ H5D_chunk_coll_fill_info_t chunk_fill_info; /* chunk address information for doing I/O */
+#endif /* H5_HAVE_PARALLEL */
hbool_t carry; /* Flag to indicate that chunk increment carrys to higher dimension (sorta) */
unsigned space_ndims; /* Dataset's space rank */
const hsize_t * space_dim; /* Dataset's dataspace dimensions */
@@ -4367,8 +4367,8 @@ H5D__chunk_allocate(const H5D_io_info_t *io_info, hbool_t full_overwrite, const
using_mpi = TRUE;
/* init chunk info stuff for collective I/O */
- chunk_info.num_io = 0;
- chunk_info.addr = NULL;
+ chunk_fill_info.num_chunks = 0;
+ chunk_fill_info.chunk_info = NULL;
} /* end if */
#endif /* H5_HAVE_PARALLEL */
@@ -4640,19 +4640,26 @@ H5D__chunk_allocate(const H5D_io_info_t *io_info, hbool_t full_overwrite, const
if (using_mpi) {
/* collect all chunk addresses to be written to
write collectively at the end */
- /* allocate/resize address array if no more space left */
- /* Note that if we add support for parallel filters we must
- * also store an array of chunk sizes and pass it to the
- * apporpriate collective write function */
- if (0 == chunk_info.num_io % 1024)
- if (NULL == (chunk_info.addr = (haddr_t *)H5MM_realloc(
- chunk_info.addr, (chunk_info.num_io + 1024) * sizeof(haddr_t))))
+
+ /* allocate/resize chunk info array if no more space left */
+ if (0 == chunk_fill_info.num_chunks % 1024) {
+ void *tmp_realloc;
+
+ if (NULL == (tmp_realloc = H5MM_realloc(chunk_fill_info.chunk_info,
+ (chunk_fill_info.num_chunks + 1024) *
+ sizeof(struct chunk_coll_fill_info))))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
- "memory allocation failed for chunk addresses")
+ "memory allocation failed for chunk fill info")
- /* Store the chunk's address for later */
- chunk_info.addr[chunk_info.num_io] = udata.chunk_block.offset;
- chunk_info.num_io++;
+ chunk_fill_info.chunk_info = tmp_realloc;
+ }
+
+ /* Store info about the chunk for later */
+ chunk_fill_info.chunk_info[chunk_fill_info.num_chunks].addr = udata.chunk_block.offset;
+ chunk_fill_info.chunk_info[chunk_fill_info.num_chunks].chunk_size = chunk_size;
+ chunk_fill_info.chunk_info[chunk_fill_info.num_chunks].unfiltered_partial_chunk =
+ (*fill_buf == unfilt_fill_buf);
+ chunk_fill_info.num_chunks++;
/* Indicate that blocks will be written */
blocks_written = TRUE;
@@ -4725,7 +4732,7 @@ H5D__chunk_allocate(const H5D_io_info_t *io_info, hbool_t full_overwrite, const
#ifdef H5_HAVE_PARALLEL
/* do final collective I/O */
if (using_mpi && blocks_written)
- if (H5D__chunk_collective_fill(dset, &chunk_info, chunk_size, fb_info.fill_buf) < 0)
+ if (H5D__chunk_collective_fill(dset, &chunk_fill_info, fb_info.fill_buf, unfilt_fill_buf) < 0)
HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to write raw data to file")
#endif /* H5_HAVE_PARALLEL */
@@ -4741,8 +4748,8 @@ done:
unfilt_fill_buf = H5D__chunk_mem_xfree(unfilt_fill_buf, &def_pline);
#ifdef H5_HAVE_PARALLEL
- if (using_mpi && chunk_info.addr)
- H5MM_free(chunk_info.addr);
+ if (using_mpi && chunk_fill_info.chunk_info)
+ H5MM_free(chunk_fill_info.chunk_info);
#endif
FUNC_LEAVE_NOAPI(ret_value)
@@ -4936,27 +4943,35 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_info_t *chunk_info, size_t chunk_size,
- const void *fill_buf)
+H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_fill_info_t *chunk_fill_info,
+ const void *fill_buf, const void *partial_chunk_fill_buf)
{
- MPI_Comm mpi_comm = MPI_COMM_NULL; /* MPI communicator for file */
- int mpi_rank = (-1); /* This process's rank */
- int mpi_size = (-1); /* MPI Comm size */
- int mpi_code; /* MPI return code */
- size_t num_blocks; /* Number of blocks between processes. */
- size_t leftover_blocks; /* Number of leftover blocks to handle */
- int blocks, leftover, block_len; /* converted to int for MPI */
+ MPI_Comm mpi_comm = MPI_COMM_NULL; /* MPI communicator for file */
+ int mpi_rank = (-1); /* This process's rank */
+ int mpi_size = (-1); /* MPI Comm size */
+ int mpi_code; /* MPI return code */
+ size_t num_blocks; /* Number of blocks between processes. */
+ size_t leftover_blocks; /* Number of leftover blocks to handle */
+ int blocks, leftover; /* converted to int for MPI */
MPI_Aint * chunk_disp_array = NULL;
+ MPI_Aint * block_disps = NULL;
int * block_lens = NULL;
MPI_Datatype mem_type = MPI_BYTE, file_type = MPI_BYTE;
H5FD_mpio_xfer_t prev_xfer_mode; /* Previous data xfer mode */
hbool_t have_xfer_mode = FALSE; /* Whether the previous xffer mode has been retrieved */
- hbool_t need_addr_sort = FALSE;
- int i; /* Local index variable */
+ hbool_t need_sort = FALSE;
+ size_t i; /* Local index variable */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_STATIC
+ /*
+ * If a separate fill buffer is provided for partial chunks, ensure
+ * that the "don't filter partial edge chunks" flag is set.
+ */
+ if (partial_chunk_fill_buf)
+ HDassert(dset->shared->layout.u.chunk.flags & H5O_LAYOUT_CHUNK_DONT_FILTER_PARTIAL_BOUND_CHUNKS);
+
/* Get the MPI communicator */
if (MPI_COMM_NULL == (mpi_comm = H5F_mpi_get_comm(dset->oloc.file)))
HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI communicator")
@@ -4972,39 +4987,89 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_info_t *chunk_info,
/* Distribute evenly the number of blocks between processes. */
if (mpi_size == 0)
HGOTO_ERROR(H5E_DATASET, H5E_BADVALUE, FAIL, "Resulted in division by zero")
- num_blocks = (size_t)(chunk_info->num_io / (size_t)mpi_size); /* value should be the same on all procs */
+ num_blocks =
+ (size_t)(chunk_fill_info->num_chunks / (size_t)mpi_size); /* value should be the same on all procs */
/* After evenly distributing the blocks between processes, are there any
* leftover blocks for each individual process (round-robin)?
*/
- leftover_blocks = (size_t)(chunk_info->num_io % (size_t)mpi_size);
+ leftover_blocks = (size_t)(chunk_fill_info->num_chunks % (size_t)mpi_size);
/* Cast values to types needed by MPI */
H5_CHECKED_ASSIGN(blocks, int, num_blocks, size_t);
H5_CHECKED_ASSIGN(leftover, int, leftover_blocks, size_t);
- H5_CHECKED_ASSIGN(block_len, int, chunk_size, size_t);
/* Check if we have any chunks to write on this rank */
if (num_blocks > 0 || (leftover && leftover > mpi_rank)) {
+ MPI_Aint partial_fill_buf_disp = 0;
+ hbool_t all_same_block_len = TRUE;
+
/* Allocate buffers */
- /* (MSC - should not need block_lens if MPI_type_create_hindexed_block is working) */
- if (NULL == (block_lens = (int *)H5MM_malloc((size_t)(blocks + 1) * sizeof(int))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk lengths buffer")
if (NULL == (chunk_disp_array = (MPI_Aint *)H5MM_malloc((size_t)(blocks + 1) * sizeof(MPI_Aint))))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk file displacement buffer")
- for (i = 0; i < blocks; i++) {
- /* store the chunk address as an MPI_Aint */
- chunk_disp_array[i] = (MPI_Aint)(chunk_info->addr[i + (mpi_rank * blocks)]);
+ if (partial_chunk_fill_buf) {
+ MPI_Aint fill_buf_addr;
+ MPI_Aint partial_fill_buf_addr;
- /* MSC - should not need this if MPI_type_create_hindexed_block is working */
- block_lens[i] = block_len;
+ /* Calculate the displacement between the fill buffer and partial chunk fill buffer */
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_address(fill_buf, &fill_buf_addr)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_address failed", mpi_code)
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_address(partial_chunk_fill_buf, &partial_fill_buf_addr)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_address failed", mpi_code)
+
+#if (MPI_VERSION > 3) || (MPI_VERSION == 3 && MPI_SUBVERSION >= 1) /* MPI_Aint_diff needs MPI >= 3.1 */
+ partial_fill_buf_disp = MPI_Aint_diff(partial_fill_buf_addr, fill_buf_addr);
+#else /* Older MPI: fall back to plain MPI_Aint subtraction */
+ partial_fill_buf_disp = partial_fill_buf_addr - fill_buf_addr;
+#endif
- /* Make sure that the addresses in the datatype are
- * monotonically non-decreasing
+ /*
+ * Allocate all-zero block displacements array. If a block's displacement
+ * is left as zero, that block will be written to from the regular fill
+ * buffer. If a block represents an unfiltered partial edge chunk, its
+ * displacement will be set so that the block is written to from the
+ * unfiltered fill buffer.
*/
- if (i && (chunk_disp_array[i] < chunk_disp_array[i - 1]))
- need_addr_sort = TRUE;
+ if (NULL == (block_disps = (MPI_Aint *)H5MM_calloc((size_t)(blocks + 1) * sizeof(MPI_Aint))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate block displacements buffer")
+ }
+
+ /*
+ * Perform initial scan of chunk info list to:
+ * - make sure that chunk addresses are monotonically non-decreasing
+ * - check if all blocks have the same length
+ */
+ for (i = 1; i < chunk_fill_info->num_chunks; i++) {
+ if (chunk_fill_info->chunk_info[i].addr < chunk_fill_info->chunk_info[i - 1].addr)
+ need_sort = TRUE;
+
+ if (chunk_fill_info->chunk_info[i].chunk_size != chunk_fill_info->chunk_info[i - 1].chunk_size)
+ all_same_block_len = FALSE;
+ }
+
+ if (need_sort)
+ HDqsort(chunk_fill_info->chunk_info, chunk_fill_info->num_chunks,
+ sizeof(struct chunk_coll_fill_info), H5D__chunk_cmp_coll_fill_info);
+
+ /* Allocate buffer for block lengths if necessary */
+ if (!all_same_block_len)
+ if (NULL == (block_lens = (int *)H5MM_malloc((size_t)(blocks + 1) * sizeof(int))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk lengths buffer")
+
+ for (i = 0; i < (size_t)blocks; i++) {
+ size_t idx = i + (size_t)(mpi_rank * blocks);
+
+ /* store the chunk address as an MPI_Aint */
+ chunk_disp_array[i] = (MPI_Aint)(chunk_fill_info->chunk_info[idx].addr);
+
+ if (!all_same_block_len)
+ H5_CHECKED_ASSIGN(block_lens[i], int, chunk_fill_info->chunk_info[idx].chunk_size, size_t);
+
+ if (chunk_fill_info->chunk_info[idx].unfiltered_partial_chunk) {
+ HDassert(partial_chunk_fill_buf);
+ block_disps[i] = partial_fill_buf_disp;
+ }
} /* end for */
/* Calculate if there are any leftover blocks after evenly
@@ -5012,32 +5077,71 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_info_t *chunk_info,
* to processes 0 -> leftover.
*/
if (leftover && leftover > mpi_rank) {
- chunk_disp_array[blocks] = (MPI_Aint)chunk_info->addr[(blocks * mpi_size) + mpi_rank];
- if (blocks && (chunk_disp_array[blocks] < chunk_disp_array[blocks - 1]))
- need_addr_sort = TRUE;
- block_lens[blocks] = block_len;
+ chunk_disp_array[blocks] =
+ (MPI_Aint)chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].addr;
+
+ if (!all_same_block_len)
+ H5_CHECKED_ASSIGN(block_lens[blocks], int,
+ chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].chunk_size,
+ size_t);
+
+ if (chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].unfiltered_partial_chunk) {
+ HDassert(partial_chunk_fill_buf);
+ block_disps[blocks] = partial_fill_buf_disp;
+ }
+
blocks++;
}
- /* Ensure that the blocks are sorted in monotonically non-decreasing
- * order of offset in the file.
- */
- if (need_addr_sort)
- HDqsort(chunk_disp_array, (size_t)blocks, sizeof(MPI_Aint), H5D__chunk_cmp_addr);
+ /* Create file and memory types for the write operation */
+ if (all_same_block_len) {
+ int block_len;
+
+ H5_CHECKED_ASSIGN(block_len, int, chunk_fill_info->chunk_info[0].chunk_size, size_t);
+
+ mpi_code =
+ MPI_Type_create_hindexed_block(blocks, block_len, chunk_disp_array, MPI_BYTE, &file_type);
+ if (mpi_code != MPI_SUCCESS)
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed_block failed", mpi_code)
+
+ if (partial_chunk_fill_buf) {
+ /*
+ * If filters are disabled for partial edge chunks, those chunks could
+ * potentially have the same block length as the other chunks, but still
+ * need to be written to using the unfiltered fill buffer. Use an hindexed
+ * block type rather than an hvector.
+ */
+ mpi_code =
+ MPI_Type_create_hindexed_block(blocks, block_len, block_disps, MPI_BYTE, &mem_type);
+ if (mpi_code != MPI_SUCCESS)
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed_block failed", mpi_code)
+ }
+ else {
+ mpi_code = MPI_Type_create_hvector(blocks, block_len, 0, MPI_BYTE, &mem_type);
+ if (mpi_code != MPI_SUCCESS)
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hvector failed", mpi_code)
+ }
+ }
+ else {
+ /*
+ * Currently, different block lengths imply that there are partial
+ * edge chunks and the "don't filter partial edge chunks" flag is set.
+ */
+ HDassert(partial_chunk_fill_buf);
+ HDassert(block_lens);
+ HDassert(block_disps);
+
+ mpi_code = MPI_Type_create_hindexed(blocks, block_lens, chunk_disp_array, MPI_BYTE, &file_type);
+ if (mpi_code != MPI_SUCCESS)
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
+
+ mpi_code = MPI_Type_create_hindexed(blocks, block_lens, block_disps, MPI_BYTE, &mem_type);
+ if (mpi_code != MPI_SUCCESS)
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
+ }
- /* MSC - should use this if MPI_type_create_hindexed block is working:
- * mpi_code = MPI_Type_create_hindexed_block(blocks, block_len, chunk_disp_array, MPI_BYTE,
- * &file_type);
- */
- mpi_code = MPI_Type_create_hindexed(blocks, block_lens, chunk_disp_array, MPI_BYTE, &file_type);
- if (mpi_code != MPI_SUCCESS)
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&file_type)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
-
- mpi_code = MPI_Type_create_hvector(blocks, block_len, 0, MPI_BYTE, &mem_type);
- if (mpi_code != MPI_SUCCESS)
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hvector failed", mpi_code)
if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&mem_type)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
} /* end if */
@@ -5080,39 +5184,25 @@ done:
if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type)))
HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
H5MM_xfree(chunk_disp_array);
+ H5MM_xfree(block_disps);
H5MM_xfree(block_lens);
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D__chunk_collective_fill() */
static int
-H5D__chunk_cmp_addr(const void *addr1, const void *addr2)
+H5D__chunk_cmp_coll_fill_info(const void *_entry1, const void *_entry2)
{
- MPI_Aint _addr1 = (MPI_Aint)0, _addr2 = (MPI_Aint)0;
- int ret_value = 0;
+ const struct chunk_coll_fill_info *entry1;
+ const struct chunk_coll_fill_info *entry2;
FUNC_ENTER_STATIC_NOERR
- _addr1 = *((const MPI_Aint *)addr1);
- _addr2 = *((const MPI_Aint *)addr2);
+ entry1 = (const struct chunk_coll_fill_info *)_entry1;
+ entry2 = (const struct chunk_coll_fill_info *)_entry2;
-#if MPI_VERSION >= 3 && MPI_SUBVERSION >= 1
- {
- MPI_Aint diff = MPI_Aint_diff(_addr1, _addr2);
-
- if (diff < (MPI_Aint)0)
- ret_value = -1;
- else if (diff > (MPI_Aint)0)
- ret_value = 1;
- else
- ret_value = 0;
- }
-#else
- ret_value = (_addr1 > _addr2) - (_addr1 < _addr2);
-#endif
-
- FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D__chunk_cmp_addr() */
+ FUNC_LEAVE_NOAPI(H5F_addr_cmp(entry1->addr, entry2->addr))
+} /* end H5D__chunk_cmp_coll_fill_info() */
#endif /* H5_HAVE_PARALLEL */
/*-------------------------------------------------------------------------
@@ -6826,7 +6916,7 @@ done:
*
*-------------------------------------------------------------------------
*/
-static hbool_t
+hbool_t
H5D__chunk_is_partial_edge_chunk(unsigned dset_ndims, const uint32_t *chunk_dims, const hsize_t scaled[],
const hsize_t *dset_dims)
{
@@ -7121,6 +7211,89 @@ done:
} /* end H5D__chunk_format_convert() */
/*-------------------------------------------------------------------------
+ * Function: H5D__chunk_index_empty_cb
+ *
+ * Purpose: Callback function that simply stops iteration and sets the
+ * `empty` parameter to FALSE if called. If this callback is
+ * entered, it means that the chunk index contains at least
+ * one chunk, so it is not empty.
+ *
+ * Return: H5_ITER_STOP
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+H5D__chunk_index_empty_cb(const H5D_chunk_rec_t H5_ATTR_UNUSED *chunk_rec, void *_udata)
+{
+    hbool_t *index_is_empty = (hbool_t *)_udata; /* Caller's "index is empty" flag */
+
+    FUNC_ENTER_STATIC_NOERR
+
+    /* Reaching this callback at all means the index holds a chunk */
+    *index_is_empty = FALSE;
+
+    FUNC_LEAVE_NOAPI(H5_ITER_STOP)
+} /* end H5D__chunk_index_empty_cb() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D__chunk_index_empty
+ *
+ * Purpose: Determines whether a chunk index is empty (has no chunks
+ * inserted into it yet).
+ *
+ * Note: This routine is meant to be a little more performant than
+ * just counting the number of chunks in the index. In the
+ * future, this is probably a callback that the chunk index
+ * ops structure should provide.
+ *
+ * Return: Non-negative on Success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5D__chunk_index_empty(const H5D_t *dset, hbool_t *empty)
+{
+ H5D_chk_idx_info_t idx_info; /* Chunked index info */
+ H5D_rdcc_ent_t * ent; /* Cache entry */
+ const H5D_rdcc_t * rdcc = NULL; /* Raw data chunk cache */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE_TAG(dset->oloc.addr)
+
+ HDassert(dset);
+ HDassert(dset->shared);
+ HDassert(empty);
+
+ rdcc = &(dset->shared->cache.chunk); /* raw data chunk cache */
+ HDassert(rdcc);
+
+ /* Visit every entry currently held in the raw data chunk cache */
+ for (ent = rdcc->head; ent; ent = ent->next)
+ /* Flush the entry (without resetting it); presumably so cached chunks show up in the index scanned below -- TODO confirm */
+ if (H5D__chunk_flush_entry(dset, ent, FALSE) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "cannot flush indexed storage buffer")
+
+ /* Compose chunked index info struct from the dataset's file, pipeline, layout and storage */
+ idx_info.f = dset->oloc.file;
+ idx_info.pline = &dset->shared->dcpl_cache.pline;
+ idx_info.layout = &dset->shared->layout.u.chunk;
+ idx_info.storage = &dset->shared->layout.storage.u.chunk;
+
+ *empty = TRUE; /* Start from "empty"; only the iteration callback below clears this */
+
+ if (H5F_addr_defined(idx_info.storage->idx_addr)) { /* Undefined index address: skip iteration, *empty stays TRUE */
+ /* Iterate over the index; the callback sets *empty to FALSE and stops on the first chunk it is given */
+ if ((dset->shared->layout.storage.u.chunk.ops->iterate)(&idx_info, H5D__chunk_index_empty_cb, empty) <
+ 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL,
+ "unable to retrieve allocated chunk information from index")
+ }
+
+done:
+ FUNC_LEAVE_NOAPI_TAG(ret_value)
+} /* end H5D__chunk_index_empty() */
+
+/*-------------------------------------------------------------------------
* Function: H5D__get_num_chunks_cb
*
* Purpose: Callback function that increments the number of written
diff --git a/src/H5Dint.c b/src/H5Dint.c
index c9ea6bd..cc17265 100644
--- a/src/H5Dint.c
+++ b/src/H5Dint.c
@@ -378,40 +378,18 @@ H5D__get_space_status(const H5D_t *dset, H5D_space_status_t *allocation)
/* Check for chunked layout */
if (dset->shared->layout.type == H5D_CHUNKED) {
- hsize_t space_allocated; /* The number of bytes allocated for chunks */
- hssize_t snelmts; /* Temporary holder for number of elements in dataspace */
- hsize_t nelmts; /* Number of elements in dataspace */
- size_t dt_size; /* Size of datatype */
- hsize_t full_size; /* The number of bytes in the dataset when fully populated */
-
- /* For chunked layout set the space status by the storage size */
- /* Get the dataset's dataspace */
- HDassert(dset->shared->space);
-
- /* Get the total number of elements in dataset's dataspace */
- if ((snelmts = H5S_GET_EXTENT_NPOINTS(dset->shared->space)) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "unable to retrieve number of elements in dataspace")
- nelmts = (hsize_t)snelmts;
-
- /* Get the size of the dataset's datatype */
- if (0 == (dt_size = H5T_GET_SIZE(dset->shared->type)))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "unable to retrieve size of datatype")
-
- /* Compute the maximum size of the dataset in bytes */
- full_size = nelmts * dt_size;
-
- /* Check for overflow during multiplication */
- if (nelmts != (full_size / dt_size))
- HGOTO_ERROR(H5E_DATASET, H5E_OVERFLOW, FAIL, "size of dataset's storage overflowed")
-
- /* Difficult to error check, since the error value is 0 and 0 is a valid value... :-/ */
- if (H5D__get_storage_size(dset, &space_allocated) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get size of dataset's storage")
-
- /* Decide on how much of the space is allocated */
- if (space_allocated == 0)
+ hsize_t n_chunks_total = dset->shared->layout.u.chunk.nchunks;
+ hsize_t n_chunks_alloc = 0;
+
+ if (H5D__get_num_chunks(dset, dset->shared->space, &n_chunks_alloc) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL,
+ "unable to retrieve number of allocated chunks in dataset")
+
+ HDassert(n_chunks_alloc <= n_chunks_total);
+
+ if (n_chunks_alloc == 0)
*allocation = H5D_SPACE_STATUS_NOT_ALLOCATED;
- else if (space_allocated == full_size)
+ else if (n_chunks_alloc == n_chunks_total)
*allocation = H5D_SPACE_STATUS_ALLOCATED;
else
*allocation = H5D_SPACE_STATUS_PART_ALLOCATED;
@@ -1301,10 +1279,19 @@ H5D__create(H5F_t *file, hid_t type_id, const H5S_t *space, hid_t dcpl_id, hid_t
HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, NULL, "can't set latest indexing")
} /* end if */
- /* Check if this dataset is going into a parallel file and set space allocation time */
+ /* Check if the file driver would like to force early space allocation */
if (H5F_HAS_FEATURE(file, H5FD_FEAT_ALLOCATE_EARLY))
new_dset->shared->dcpl_cache.fill.alloc_time = H5D_ALLOC_TIME_EARLY;
+ /*
+ * Check if this dataset is going into a parallel file and set space allocation time.
+ * If the dataset has filters applied to it, writes to the dataset must be collective,
+ * so we don't need to force early space allocation. Otherwise, we force early space
+ * allocation to facilitate independent raw data operations.
+ */
+ if (H5F_HAS_FEATURE(file, H5FD_FEAT_HAS_MPI) && (new_dset->shared->dcpl_cache.pline.nused == 0))
+ new_dset->shared->dcpl_cache.fill.alloc_time = H5D_ALLOC_TIME_EARLY;
+
/* Set the dataset's I/O operations */
if (H5D__layout_set_io_ops(new_dset) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, NULL, "unable to initialize I/O operations")
diff --git a/src/H5Dio.c b/src/H5Dio.c
index 1ea3f07..e226a0a 100644
--- a/src/H5Dio.c
+++ b/src/H5Dio.c
@@ -300,6 +300,7 @@ H5D__write(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_spac
H5D_io_info_t io_info; /* Dataset I/O info */
H5D_type_info_t type_info; /* Datatype info for operation */
hbool_t type_info_init = FALSE; /* Whether the datatype info has been initialized */
+ hbool_t should_alloc_space = FALSE; /* Whether or not to initialize dataset's storage */
H5S_t * projected_mem_space = NULL; /* If not NULL, ptr to dataspace containing a */
/* projection of the supplied mem_space to a new */
/* dataspace with rank equal to that of */
@@ -432,8 +433,20 @@ H5D__write(H5D_t *dataset, hid_t mem_type_id, H5S_t *mem_space, H5S_t *file_spac
HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to set up I/O operation")
/* Allocate dataspace and initialize it if it hasn't been. */
- if (nelmts > 0 && dataset->shared->dcpl_cache.efl.nused == 0 &&
- !(*dataset->shared->layout.ops->is_space_alloc)(&dataset->shared->layout.storage)) {
+ should_alloc_space = dataset->shared->dcpl_cache.efl.nused == 0 &&
+ !(*dataset->shared->layout.ops->is_space_alloc)(&dataset->shared->layout.storage);
+
+ /*
+ * If not using an MPI-based VFD, we only need to allocate
+ * and initialize storage if there's a selection in the
+ * dataset's dataspace. Otherwise, we always need to participate
+ * in the storage allocation since this may use collective
+ * operations and we will hang if we don't participate.
+ */
+ if (!H5F_HAS_FEATURE(dataset->oloc.file, H5FD_FEAT_HAS_MPI))
+ should_alloc_space = should_alloc_space && (nelmts > 0);
+
+ if (should_alloc_space) {
hssize_t file_nelmts; /* Number of elements in file dataset's dataspace */
hbool_t full_overwrite; /* Whether we are over-writing all the elements */
@@ -808,98 +821,35 @@ H5D__ioinfo_adjust(H5D_io_info_t *io_info, const H5D_t *dset, const H5S_t *file_
io_info->io_ops.single_write = H5D__mpio_select_write;
} /* end if */
else {
- int comm_size = 0;
-
- /* Retrieve size of MPI communicator used for file */
- if ((comm_size = H5F_shared_mpi_get_size(io_info->f_sh)) < 0)
- HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get MPI communicator size")
-
/* Check if there are any filters in the pipeline. If there are,
* we cannot break to independent I/O if this is a write operation
* with multiple ranks involved; otherwise, there will be metadata
* inconsistencies in the file.
*/
- if (comm_size > 1 && io_info->op_type == H5D_IO_OP_WRITE &&
- io_info->dset->shared->dcpl_cache.pline.nused > 0) {
- H5D_mpio_no_collective_cause_t cause;
- uint32_t local_no_collective_cause;
- uint32_t global_no_collective_cause;
- hbool_t local_error_message_previously_written = FALSE;
- hbool_t global_error_message_previously_written = FALSE;
- size_t idx;
- size_t cause_strings_len;
- char local_no_collective_cause_string[512] = "";
- char global_no_collective_cause_string[512] = "";
- const char * cause_strings[] = {
- "independent I/O was requested",
- "datatype conversions were required",
- "data transforms needed to be applied",
- "optimized MPI types flag wasn't set",
- "one of the dataspaces was neither simple nor scalar",
- "dataset was not contiguous or chunked",
- "parallel writes to filtered datasets are disabled",
- "an error occurred while checking if collective I/O was possible"};
-
- cause_strings_len = sizeof(cause_strings) / sizeof(cause_strings[0]);
-
- if (H5CX_get_mpio_local_no_coll_cause(&local_no_collective_cause) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL,
- "unable to get local no collective cause value")
- if (H5CX_get_mpio_global_no_coll_cause(&global_no_collective_cause) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL,
- "unable to get global no collective cause value")
-
- /* Append each of the "reason for breaking collective I/O" error messages to the
- * local and global no collective cause strings */
- for (cause = 1, idx = 0;
- (cause < H5D_MPIO_NO_COLLECTIVE_MAX_CAUSE) && (idx < cause_strings_len);
- cause <<= 1, idx++) {
- if (cause & local_no_collective_cause) {
- size_t local_buffer_space = sizeof(local_no_collective_cause_string) -
- HDstrlen(local_no_collective_cause_string) - 1;
-
- /* Check if there were any previous error messages included. If so, prepend a
- * semicolon to separate the messages.
- */
- if (local_buffer_space && local_error_message_previously_written) {
- HDstrncat(local_no_collective_cause_string, "; ", local_buffer_space);
- local_buffer_space -= MIN(local_buffer_space, 2);
- }
-
- if (local_buffer_space)
- HDstrncat(local_no_collective_cause_string, cause_strings[idx],
- local_buffer_space);
-
- local_error_message_previously_written = TRUE;
- } /* end if */
-
- if (cause & global_no_collective_cause) {
- size_t global_buffer_space = sizeof(global_no_collective_cause_string) -
- HDstrlen(global_no_collective_cause_string) - 1;
-
- /* Check if there were any previous error messages included. If so, prepend a
- * semicolon to separate the messages.
- */
- if (global_buffer_space && global_error_message_previously_written) {
- HDstrncat(global_no_collective_cause_string, "; ", global_buffer_space);
- global_buffer_space -= MIN(global_buffer_space, 2);
- }
-
- if (global_buffer_space)
- HDstrncat(global_no_collective_cause_string, cause_strings[idx],
- global_buffer_space);
-
- global_error_message_previously_written = TRUE;
- } /* end if */
- } /* end for */
-
- HGOTO_ERROR(H5E_IO, H5E_NO_INDEPENDENT, FAIL,
- "Can't perform independent write with filters in pipeline.\n"
- " The following caused a break from collective I/O:\n"
- " Local causes: %s\n"
- " Global causes: %s",
- local_no_collective_cause_string, global_no_collective_cause_string);
- } /* end if */
+ if (io_info->op_type == H5D_IO_OP_WRITE && io_info->dset->shared->dcpl_cache.pline.nused > 0) {
+ int comm_size = 0;
+
+ /* Retrieve size of MPI communicator used for file */
+ if ((comm_size = H5F_shared_mpi_get_size(io_info->f_sh)) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get MPI communicator size")
+
+ if (comm_size > 1) {
+ char local_no_coll_cause_string[512];
+ char global_no_coll_cause_string[512];
+
+ if (H5D__mpio_get_no_coll_cause_strings(local_no_coll_cause_string, 512,
+ global_no_coll_cause_string, 512) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL,
+ "can't get reasons for breaking collective I/O")
+
+ HGOTO_ERROR(H5E_IO, H5E_NO_INDEPENDENT, FAIL,
+ "Can't perform independent write with filters in pipeline.\n"
+ " The following caused a break from collective I/O:\n"
+ " Local causes: %s\n"
+ " Global causes: %s",
+ local_no_coll_cause_string, global_no_coll_cause_string);
+ }
+ }
/* If we won't be doing collective I/O, but the user asked for
* collective I/O, change the request to use independent I/O
diff --git a/src/H5Dlayout.c b/src/H5Dlayout.c
index 6c4fc12..6fdec05 100644
--- a/src/H5Dlayout.c
+++ b/src/H5Dlayout.c
@@ -213,7 +213,7 @@ H5D__layout_meta_size(const H5F_t *f, const H5O_layout_t *layout, hbool_t includ
ret_value++;
/* Dimension sizes */
- ret_value += layout->u.chunk.ndims * layout->u.chunk.enc_bytes_per_dim;
+ ret_value += layout->u.chunk.ndims * (size_t)layout->u.chunk.enc_bytes_per_dim;
/* Type of chunk index */
ret_value++;
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index cf8a6ef..527fc7b 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -36,6 +36,7 @@
#include "H5Eprivate.h" /* Error handling */
#include "H5Fprivate.h" /* File access */
#include "H5FDprivate.h" /* File drivers */
+#include "H5FLprivate.h" /* Free Lists */
#include "H5Iprivate.h" /* IDs */
#include "H5MMprivate.h" /* Memory management */
#include "H5Oprivate.h" /* Object headers */
@@ -43,6 +44,15 @@
#include "H5Sprivate.h" /* Dataspaces */
#include "H5VMprivate.h" /* Vector */
+/* uthash is an external, header-only hash table implementation.
+ *
+ * We include the file directly in src/ and #define a few functions
+ * to use our internal memory calls.
+ */
+#define uthash_malloc(sz) H5MM_malloc(sz)
+#define uthash_free(ptr, sz) H5MM_free(ptr) /* Ignoring sz is intentional */
+#include "uthash.h"
+
#ifdef H5_HAVE_PARALLEL
/****************/
@@ -81,9 +91,54 @@
/* Macros to represent the regularity of the selection for multiple chunk IO case. */
#define H5D_CHUNK_SELECT_REG 1
+/*
+ * Threshold value for redistributing shared filtered chunks
+ * on all MPI ranks, or just MPI rank 0
+ */
+#define H5D_CHUNK_REDISTRIBUTE_THRES ((size_t)((25 * H5_MB) / sizeof(H5D_chunk_redistribute_info_t)))
+
+/*
+ * Initial allocation size for the arrays that hold
+ * buffers for chunk modification data that is sent
+ * to other ranks and the MPI_Request objects for
+ * those send operations
+ */
+#define H5D_CHUNK_NUM_SEND_MSGS_INIT 64
+
+/*
+ * Define a tag value for the MPI messages sent/received for
+ * chunk modification data
+ */
+#define H5D_CHUNK_MOD_DATA_TAG 64
+
+/*
+ * Macro to initialize a H5D_chk_idx_info_t structure (passed as
+ * an lvalue, not a pointer), given a pointer to a H5D_io_info_t
+ * structure. Both arguments are parenthesized in the expansion.
+ */
+#define H5D_MPIO_INIT_CHUNK_IDX_INFO(index_info, io_info_ptr)                                  \
+    do {                                                                                       \
+        (index_info).f       = (io_info_ptr)->dset->oloc.file;                                 \
+        (index_info).pline   = &((io_info_ptr)->dset->shared->dcpl_cache.pline);               \
+        (index_info).layout  = &((io_info_ptr)->dset->shared->layout.u.chunk);                 \
+        (index_info).storage = &((io_info_ptr)->dset->shared->layout.storage.u.chunk);         \
+    } while (0)
+
+/*
+ * Macro to zero-fill and initialize a H5D_chunk_ud_t structure (passed
+ * as an lvalue), given a pointer to a H5D_chk_idx_info_t structure
+ */
+#define H5D_MPIO_INIT_CHUNK_UD_INFO(chunk_ud, index_info_ptr)                                  \
+    do {                                                                                       \
+        HDmemset(&(chunk_ud), 0, sizeof(H5D_chunk_ud_t));                                      \
+        (chunk_ud).common.layout  = (index_info_ptr)->layout;                                  \
+        (chunk_ud).common.storage = (index_info_ptr)->storage;                                 \
+    } while (0)
+
/******************/
/* Local Typedefs */
/******************/
+
/* Combine chunk address and chunk info into a struct for better performance. */
typedef struct H5D_chunk_addr_info_t {
haddr_t chunk_addr;
@@ -100,115 +155,137 @@ typedef enum H5D_mpio_no_rank0_bcast_cause_t {
} H5D_mpio_no_rank0_bcast_cause_t;
/*
+ * Information necessary for re-allocating file space for a chunk
+ * during a parallel write of a chunked dataset with filters
+ * applied.
+ *
+ * NOTE(review): the field names mirror those documented for
+ * H5D_filtered_collective_io_info_t; the per-field notes below assume
+ * the same meaning here -- confirm against the code that fills this
+ * structure in.
+ */
+typedef struct H5D_chunk_alloc_info_t {
+ H5F_block_t chunk_current; /* presumably the chunk's file block before filtering */
+ H5F_block_t chunk_new; /* presumably the chunk's file block after filtering */
+ hsize_t chunk_idx; /* presumably the chunk's index in the dataset's chunk index */
+} H5D_chunk_alloc_info_t;
+
+/*
+ * Information for a chunk pertaining to the dataset's chunk
+ * index entry for the chunk
+ *
+ * This is the type of the `index_info` field of
+ * H5D_filtered_collective_io_info_t and of H5D_chunk_insert_info_t.
+ */
+typedef struct H5D_chunk_index_info_t {
+ hsize_t chunk_idx; /* Index of the chunk in the dataset's chunk index */
+ unsigned filter_mask; /* Bit-mask of filters for the chunk, saved alongside the chunk in the file */
+ hbool_t need_insert; /* Whether the chunk must be re-inserted into the chunk index after the write */
+} H5D_chunk_index_info_t;
+
+/*
* Information about a single chunk when performing collective filtered I/O. All
* of the fields of one of these structs are initialized at the start of collective
- * filtered I/O in the function H5D__construct_filtered_io_info_list().
+ * filtered I/O in the function H5D__mpio_collective_filtered_chunk_io_setup(). This
+ * struct's fields are as follows:
*
- * This struct's fields are as follows:
+ * index_info - A structure containing the information needed when collectively
+ * re-inserting the chunk into the dataset's chunk index. The structure
+ * is distributed to all ranks during the re-insertion operation. Its fields
+ * are as follows:
*
- * index - The "Index" of the chunk in the dataset. The index of a chunk is used during
- * the collective re-insertion of chunks into the chunk index after the collective
- * I/O has been performed.
+ * chunk_idx - The index of the chunk in the dataset's chunk index.
*
- * scaled - The scaled coordinates of the chunk in the dataset's file dataspace. The
- * coordinates are used in both the collective re-allocation of space in the file
- * and the collective re-insertion of chunks into the chunk index after the collective
- * I/O has been performed.
+ * filter_mask - A bit-mask that indicates which filters are to be applied to the
+ * chunk. Each filter in a chunk's filter pipeline has a bit position
+ * that can be masked to disable that particular filter for the chunk.
+ * This filter mask is saved alongside the chunk in the file.
*
- * full_overwrite - A flag which determines whether or not a chunk needs to be read from the
- * file when being updated. If a chunk is being fully overwritten (the entire
- * extent is selected in its file dataspace), then it is not necessary to
- * read the chunk from the file. However, if the chunk is not being fully
- * overwritten, it has to be read from the file in order to update the chunk
- * without trashing the parts of the chunk that are not selected.
+ * need_insert - A flag which determines whether or not a chunk needs to be re-inserted into
+ * the chunk index after the write operation.
*
- * num_writers - The total number of processors writing to this chunk. This field is used
- * when the new owner of a chunk is receiving messages, which contain selections in
- * the chunk and data to update the chunk with, from other processors which have this
- * chunk selected in the I/O operation. The new owner must know how many processors it
- * should expect messages from so that it can post an equal number of receive calls.
+ * chunk_info - A pointer to the chunk's H5D_chunk_info_t structure, which contains useful
+ * information like the dataspaces containing the selection in the chunk.
*
- * io_size - The total size of I/O to this chunk. This field is an accumulation of the size of
- * I/O to the chunk from each processor which has the chunk selected and is used to
- * determine the value for the previous full_overwrite flag.
+ * chunk_current - The address in the file and size of this chunk before the filtering
+ * operation. When reading a chunk from the file, this field is used to
+ * read the correct amount of bytes. It is also used when redistributing
+ * shared chunks among MPI ranks and as a parameter to the chunk file
+ * space reallocation function.
*
- * buf - A pointer which serves the dual purpose of holding either the chunk data which is to be
- * written to the file or the chunk data which has been read from the file.
+ * chunk_new - The address in the file and size of this chunk after the filtering
+ * operation. This field is relevant when collectively re-allocating space
+ * in the file for all of the chunks written to in the I/O operation, as
+ * their sizes may have changed after their data has been filtered.
*
- * chunk_states - In the case of dataset writes only, this struct is used to track a chunk's size and
- * address in the file before and after the filtering operation has occurred.
+ * need_read - A flag which determines whether or not a chunk needs to be read from the
+ * file. During writes, if a chunk is being fully overwritten (the entire extent
+ * is selected in its file dataspace), then it is not necessary to read the chunk
+ * from the file. However, if the chunk is not being fully overwritten, it has to
+ * be read from the file in order to update the chunk without trashing the parts
+ * of the chunk that are not selected. During reads, this field should generally
+ * be true, but may be false if the chunk isn't allocated, for example.
*
- * Its fields are as follows:
+ * skip_filter_pline - A flag which determines whether to skip calls to the filter pipeline
+ * for this chunk. This flag is mostly useful for correct handling of
+ * partial edge chunks when the "don't filter partial edge chunks" flag
+ * is set on the dataset's DCPL.
*
- * chunk_current - The address in the file and size of this chunk before the filtering
- * operation. When reading a chunk from the file, this field is used to
- * read the correct amount of bytes. It is also used when redistributing
- * shared chunks among processors and as a parameter to the chunk file
- * space reallocation function.
+ * io_size - The total size of I/O to this chunk. This field is an accumulation of the size of
+ * I/O to the chunk from each MPI rank which has the chunk selected and is used to
+ * determine the value for the `need_read` flag described above (the successor
+ * of the former `full_overwrite` flag).
*
- * new_chunk - The address in the file and size of this chunk after the filtering
- * operation. This field is relevant when collectively re-allocating space
- * in the file for all of the chunks written to in the I/O operation, as
- * their sizes may have changed after their data has been filtered.
+ * chunk_buf_size - The size in bytes of the data buffer allocated for the chunk
*
- * owners - In the case of dataset writes only, this struct is used to manage which single processor
- * will ultimately write data out to the chunk. It allows the other processors to act according
- * to the decision and send their selection in the chunk, as well as the data they wish
- * to update the chunk with, to the processor which is writing to the chunk.
+ * orig_owner - The MPI rank which originally had this chunk selected at the beginning of
+ * the collective filtered I/O operation. This field is currently used when
+ * redistributing shared chunks among MPI ranks.
*
- * Its fields are as follows:
+ * new_owner - The MPI rank which has been selected to perform the modifications to this chunk.
*
- * original_owner - The processor which originally had this chunk selected at the beginning of
- * the collective filtered I/O operation. This field is currently used when
- * redistributing shared chunks among processors.
+ * num_writers - The total number of MPI ranks writing to this chunk. This field is used when
+ * the new owner of a chunk is receiving messages from other MPI ranks that
+ * contain their selections in the chunk and the data to update the chunk with.
+ * The new owner must know how many MPI ranks it should expect messages from so
+ * that it can post an equal number of receive calls.
*
- * new_owner - The processor which has been selected to perform the write to this chunk.
+ * buf - A pointer which serves the dual purpose of holding either the chunk data which is to be
+ * written to the file or the chunk data which has been read from the file.
*
- * async_info - In the case of dataset writes only, this struct is used by the owning processor of the
- * chunk in order to manage the MPI send and receive calls made between it and all of
- * the other processors which have this chunk selected in the I/O operation.
+ * hh - A handle for hash tables provided by the uthash.h header
*
- * Its fields are as follows:
- *
- * receive_requests_array - An array containing one MPI_Request for each of the
- * asynchronous MPI receive calls the owning processor of this
- * chunk makes to another processor in order to receive that
- * processor's chunk modification data and selection in the chunk.
- *
- * receive_buffer_array - An array of buffers into which the owning processor of this chunk
- * will store chunk modification data and the selection in the chunk
- * received from another processor.
- *
- * num_receive_requests - The number of entries in the receive_request_array and
- * receive_buffer_array fields.
*/
typedef struct H5D_filtered_collective_io_info_t {
- hsize_t index;
- hsize_t scaled[H5O_LAYOUT_NDIMS];
- hbool_t full_overwrite;
- size_t num_writers;
- size_t io_size;
- void * buf;
-
- struct {
- H5F_block_t chunk_current;
- H5F_block_t new_chunk;
- } chunk_states;
-
- struct {
- int original_owner;
- int new_owner;
- } owners;
-
- struct {
- MPI_Request * receive_requests_array;
- unsigned char **receive_buffer_array;
- int num_receive_requests;
- } async_info;
+ H5D_chunk_index_info_t index_info;
+
+ H5D_chunk_info_t *chunk_info;
+ H5F_block_t chunk_current;
+ H5F_block_t chunk_new;
+ hbool_t need_read;
+ hbool_t skip_filter_pline;
+ size_t io_size;
+ size_t chunk_buf_size;
+ int orig_owner;
+ int new_owner;
+ int num_writers;
+ void * buf;
+
+ UT_hash_handle hh;
} H5D_filtered_collective_io_info_t;
-/* Function pointer typedef for sort function */
-typedef int (*H5D_mpio_sort_func_cb_t)(const void *, const void *);
+/*
+ * Information necessary for redistributing shared chunks during
+ * a parallel write of a chunked dataset with filters applied.
+ *
+ * The size of this structure also determines the value of
+ * H5D_CHUNK_REDISTRIBUTE_THRES.
+ *
+ * NOTE(review): the field names mirror those documented for
+ * H5D_filtered_collective_io_info_t; the per-field notes below assume
+ * the same meaning here -- confirm against the redistribution code.
+ */
+typedef struct H5D_chunk_redistribute_info_t {
+ H5F_block_t chunk_block; /* presumably the chunk's current block (address and size) in the file */
+ hsize_t chunk_idx; /* presumably the chunk's index in the dataset's chunk index */
+ int orig_owner; /* presumably the MPI rank that originally had the chunk selected */
+ int new_owner; /* presumably the MPI rank chosen to modify the chunk */
+ int num_writers; /* presumably the number of MPI ranks writing to the chunk */
+} H5D_chunk_redistribute_info_t;
+
+/*
+ * Information used when re-inserting a chunk into a dataset's
+ * chunk index during a parallel write of a chunked dataset with
+ * filters applied.
+ */
+typedef struct H5D_chunk_insert_info_t {
+ H5F_block_t chunk_block; /* presumably the chunk's block (address and size) in the file at insertion time -- confirm */
+ H5D_chunk_index_info_t index_info; /* Chunk index entry information (chunk index, filter mask, insert flag) */
+} H5D_chunk_insert_info_t;
/********************/
/* Local Prototypes */
@@ -216,53 +293,98 @@ typedef int (*H5D_mpio_sort_func_cb_t)(const void *, const void *);
static herr_t H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
H5D_chunk_map_t *fm);
static herr_t H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
- H5D_chunk_map_t *fm);
+ H5D_chunk_map_t *fm, int mpi_rank, int mpi_size);
static herr_t H5D__multi_chunk_filtered_collective_io(H5D_io_info_t * io_info,
- const H5D_type_info_t *type_info, H5D_chunk_map_t *fm);
+ const H5D_type_info_t *type_info, H5D_chunk_map_t *fm,
+ int mpi_rank, int mpi_size);
static herr_t H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
- H5D_chunk_map_t *fm, int sum_chunk);
+ H5D_chunk_map_t *fm, int sum_chunk, int mpi_rank, int mpi_size);
static herr_t H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
- H5D_chunk_map_t *fm);
+ H5D_chunk_map_t *fm, int mpi_rank, int mpi_size);
static herr_t H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
const H5S_t *file_space, const H5S_t *mem_space);
static herr_t H5D__final_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
hsize_t nelmts, MPI_Datatype mpi_file_type, MPI_Datatype mpi_buf_type);
static herr_t H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
- H5D_chunk_addr_info_t chunk_addr_info_array[], int many_chunk_opt);
+ H5D_chunk_addr_info_t chunk_addr_info_array[], int many_chunk_opt, int mpi_rank,
+ int mpi_size);
static herr_t H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assign_io_mode[],
- haddr_t chunk_addr[]);
+ haddr_t chunk_addr[], int mpi_rank, int mpi_size);
static herr_t H5D__mpio_get_sum_chunk(const H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
int *sum_chunkf);
-static herr_t H5D__construct_filtered_io_info_list(const H5D_io_info_t * io_info,
- const H5D_type_info_t * type_info,
- const H5D_chunk_map_t * fm,
- H5D_filtered_collective_io_info_t **chunk_list,
- size_t * num_entries);
-#if MPI_VERSION >= 3
-static herr_t H5D__chunk_redistribute_shared_chunks(const H5D_io_info_t * io_info,
- const H5D_type_info_t * type_info,
- const H5D_chunk_map_t * fm,
- H5D_filtered_collective_io_info_t *local_chunk_array,
- size_t *local_chunk_array_num_entries);
-#endif
-static herr_t H5D__mpio_array_gatherv(void *local_array, size_t local_array_num_entries,
- size_t array_entry_size, void **gathered_array,
- size_t *gathered_array_num_entries, hbool_t allgather, int root,
- MPI_Comm comm, int (*sort_func)(const void *, const void *));
-static herr_t H5D__mpio_filtered_collective_write_type(H5D_filtered_collective_io_info_t *chunk_list,
- size_t num_entries, MPI_Datatype *new_mem_type,
- hbool_t *mem_type_derived, MPI_Datatype *new_file_type,
- hbool_t *file_type_derived);
-static herr_t H5D__filtered_collective_chunk_entry_io(H5D_filtered_collective_io_info_t *chunk_entry,
- const H5D_io_info_t * io_info,
- const H5D_type_info_t * type_info,
- const H5D_chunk_map_t * fm);
+static herr_t H5D__mpio_collective_filtered_chunk_io_setup(const H5D_io_info_t * io_info,
+ const H5D_type_info_t * type_info,
+ const H5D_chunk_map_t * fm,
+ H5D_filtered_collective_io_info_t **chunk_list,
+ size_t *num_entries, int mpi_rank);
+static herr_t H5D__mpio_redistribute_shared_chunks(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries,
+ const H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
+ int mpi_rank, int mpi_size,
+ size_t **rank_chunks_assigned_map);
+static herr_t H5D__mpio_redistribute_shared_chunks_int(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t * num_chunks_assigned_map,
+ hbool_t all_ranks_involved,
+ const H5D_io_info_t * io_info,
+ const H5D_chunk_map_t *fm, int mpi_rank, int mpi_size);
+static herr_t H5D__mpio_share_chunk_modification_data(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t *chunk_list_num_entries, H5D_io_info_t *io_info,
+ const H5D_type_info_t *type_info, int mpi_rank,
+ int mpi_size,
+ H5D_filtered_collective_io_info_t **chunk_hash_table,
+ unsigned char *** chunk_msg_bufs,
+ int * chunk_msg_bufs_len);
+static herr_t H5D__mpio_collective_filtered_chunk_common_io(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries,
+ const H5D_io_info_t * io_info,
+ const H5D_type_info_t *type_info, int mpi_size);
+static herr_t H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries,
+ const H5D_io_info_t * io_info,
+ const H5D_type_info_t *type_info, int mpi_rank,
+ int mpi_size);
+static herr_t H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries,
+ H5D_filtered_collective_io_info_t *chunk_hash_table,
+ unsigned char ** chunk_msg_bufs,
+ int chunk_msg_bufs_len, const H5D_io_info_t *io_info,
+ const H5D_type_info_t *type_info, int mpi_rank,
+ int mpi_size);
+static herr_t H5D__mpio_collective_filtered_chunk_reallocate(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries,
+ size_t * num_chunks_assigned_map,
+ H5D_io_info_t * io_info,
+ H5D_chk_idx_info_t *idx_info, int mpi_rank,
+ int mpi_size);
+static herr_t H5D__mpio_collective_filtered_chunk_reinsert(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries,
+ size_t * num_chunks_assigned_map,
+ H5D_io_info_t * io_info,
+ H5D_chk_idx_info_t *idx_info, int mpi_rank,
+ int mpi_size);
+static herr_t H5D__mpio_get_chunk_redistribute_info_types(MPI_Datatype *contig_type,
+ hbool_t * contig_type_derived,
+ MPI_Datatype *resized_type,
+ hbool_t * resized_type_derived);
+static herr_t H5D__mpio_get_chunk_alloc_info_types(MPI_Datatype *contig_type, hbool_t *contig_type_derived,
+ MPI_Datatype *resized_type, hbool_t *resized_type_derived);
+static herr_t H5D__mpio_get_chunk_insert_info_types(MPI_Datatype *contig_type, hbool_t *contig_type_derived,
+ MPI_Datatype *resized_type,
+ hbool_t * resized_type_derived);
+static herr_t H5D__mpio_collective_filtered_io_type(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t num_entries, H5D_io_op_type_t op_type,
+ MPI_Datatype *new_mem_type, hbool_t *mem_type_derived,
+ MPI_Datatype *new_file_type, hbool_t *file_type_derived);
static int H5D__cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2);
static int H5D__cmp_filtered_collective_io_info_entry(const void *filtered_collective_io_info_entry1,
const void *filtered_collective_io_info_entry2);
-#if MPI_VERSION >= 3
-static int H5D__cmp_filtered_collective_io_info_entry_owner(const void *filtered_collective_io_info_entry1,
- const void *filtered_collective_io_info_entry2);
+static int H5D__cmp_chunk_redistribute_info(const void *entry1, const void *entry2);
+static int H5D__cmp_chunk_redistribute_info_orig_owner(const void *entry1, const void *entry2);
+
+#ifdef H5Dmpio_DEBUG
+static herr_t H5D__mpio_debug_init(void);
+static herr_t H5D__mpio_dump_collective_filtered_chunk_list(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries, int mpi_rank);
#endif
/*********************/
@@ -273,6 +395,188 @@ static int H5D__cmp_filtered_collective_io_info_entry_owner(const void *filtered
/* Local Variables */
/*******************/
+/* Declare extern free list to manage the H5S_sel_iter_t struct */
+H5FL_EXTERN(H5S_sel_iter_t);
+
+#ifdef H5Dmpio_DEBUG
+
+/* Flags to control debug actions in this file.
+ * (Meant to be indexed by characters)
+ *
+ * These flags can be set with either (or both) the environment variable
+ * "H5D_mpio_Debug" set to a string containing one or more characters
+ * (flags) or by setting them as a string value for the
+ * "H5D_mpio_debug_key" MPI Info key.
+ *
+ * Supported characters in 'H5D_mpio_Debug' string:
+ * 't' trace function entry and exit
+ * 'f' log to file rather than debugging stream
+ * 'm' show (rough) memory usage statistics
+ * 'c' show critical timing information
+ *
+ * To only show output from a particular MPI rank, specify its rank
+ * number as a character, e.g.:
+ *
+ * '0' only show output from rank 0
+ *
+ * To only show output from a particular range (up to 8 ranks supported
+ * between 0-9) of MPI ranks, specify the start and end ranks separated
+ * by a hyphen, e.g.:
+ *
+ * '0-7' only show output from ranks 0 through 7
+ *
+ */
+static int H5D_mpio_debug_flags_s[256]; /* Per-flag counters, indexed by the flag character's value */
+static int H5D_mpio_debug_rank_s[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; /* Ranks allowed to print; a negative first entry means every rank prints */
+static hbool_t H5D_mpio_debug_inited = FALSE; /* Set to TRUE by H5D__mpio_debug_init() */
+static const char *const trace_in_pre = "-> "; /* Prefix printed by H5D_MPIO_TRACE_ENTER */
+static const char *const trace_out_pre = "<- "; /* Prefix printed by H5D_MPIO_TRACE_EXIT */
+static int debug_indent = 0; /* Current indentation (in columns) of debug output lines */
+static FILE * debug_stream = NULL; /* Destination of debug output; nothing is printed while NULL */
+
+/*
+ * Determine if this rank should output debugging info. Every rank
+ * qualifies while the first entry of H5D_mpio_debug_rank_s is negative
+ * (no rank filter given); otherwise only the listed ranks do.
+ *
+ * NOTE: `rank` is evaluated several times; pass an expression without
+ * side effects.
+ */
+#define H5D_MPIO_DEBUG_THIS_RANK(rank)                                                                       \
+    (H5D_mpio_debug_rank_s[0] < 0 || (rank) == H5D_mpio_debug_rank_s[0] ||                                   \
+     (rank) == H5D_mpio_debug_rank_s[1] || (rank) == H5D_mpio_debug_rank_s[2] ||                             \
+     (rank) == H5D_mpio_debug_rank_s[3] || (rank) == H5D_mpio_debug_rank_s[4] ||                             \
+     (rank) == H5D_mpio_debug_rank_s[5] || (rank) == H5D_mpio_debug_rank_s[6] ||                             \
+     (rank) == H5D_mpio_debug_rank_s[7])
+
+/*
+ * Print some debugging string. `string` must be a string literal, as it
+ * is concatenated into the format string.
+ */
+#define H5D_MPIO_DEBUG(rank, string)                                                                         \
+    do {                                                                                                     \
+        if (debug_stream && H5D_MPIO_DEBUG_THIS_RANK(rank)) {                                                \
+            HDfprintf(debug_stream, "%*s(Rank %d) " string "\n", debug_indent, "", (rank));                  \
+            fflush(debug_stream);                                                                            \
+        }                                                                                                    \
+    } while (0)
+
+/*
+ * Print some debugging string with printf-style arguments. `string` must
+ * be a string literal and at least one variadic argument is required.
+ */
+#define H5D_MPIO_DEBUG_VA(rank, string, ...)                                                                 \
+    do {                                                                                                     \
+        if (debug_stream && H5D_MPIO_DEBUG_THIS_RANK(rank)) {                                                \
+            HDfprintf(debug_stream, "%*s(Rank %d) " string "\n", debug_indent, "", (rank), __VA_ARGS__);     \
+            fflush(debug_stream);                                                                            \
+        }                                                                                                    \
+    } while (0)
+
+/*
+ * When the 't' flag is set, log entry into the current function and
+ * increase the indentation of subsequent debug output.
+ */
+#define H5D_MPIO_TRACE_ENTER(rank)                                                                           \
+    do {                                                                                                     \
+        hbool_t trace_flag = H5D_mpio_debug_flags_s[(int)'t'];                                               \
+                                                                                                             \
+        if (trace_flag) {                                                                                    \
+            H5D_MPIO_DEBUG_VA(rank, "%s%s", trace_in_pre, __func__);                                         \
+            debug_indent += (int)strlen(trace_in_pre);                                                       \
+        }                                                                                                    \
+    } while (0)
+
+/*
+ * When the 't' flag is set, decrease the indentation of debug output
+ * again and log exit from the current function. The entry and exit
+ * prefixes have the same length, so the indentation stays balanced.
+ */
+#define H5D_MPIO_TRACE_EXIT(rank)                                                                            \
+    do {                                                                                                     \
+        hbool_t trace_flag = H5D_mpio_debug_flags_s[(int)'t'];                                               \
+                                                                                                             \
+        if (trace_flag) {                                                                                    \
+            debug_indent -= (int)strlen(trace_out_pre);                                                      \
+            H5D_MPIO_DEBUG_VA(rank, "%s%s", trace_out_pre, __func__);                                        \
+        }                                                                                                    \
+    } while (0)
+
+/*
+ * Start timing an operation when the 'c' flag is set.
+ *
+ * NOTE: this macro deliberately opens a brace scope that is only closed
+ * by H5D_MPIO_TIME_STOP. The two must always be used as a pair within
+ * the same function and nesting level, and variables declared between
+ * them are not visible after H5D_MPIO_TIME_STOP.
+ */
+#define H5D_MPIO_TIME_START(rank, op_name)                                                                   \
+    {                                                                                                        \
+        hbool_t           time_flag  = H5D_mpio_debug_flags_s[(int)'c'];                                     \
+        double            start_time = 0.0, end_time = 0.0;                                                  \
+        const char *const op         = (op_name);                                                            \
+                                                                                                             \
+        if (time_flag) {                                                                                     \
+            start_time = MPI_Wtime();                                                                        \
+        }
+
+/*
+ * Stop timing the operation started by the matching H5D_MPIO_TIME_START,
+ * report the elapsed time and close the scope that macro opened.
+ */
+#define H5D_MPIO_TIME_STOP(rank)                                                                             \
+        if (time_flag) {                                                                                     \
+            end_time = MPI_Wtime();                                                                          \
+            H5D_MPIO_DEBUG_VA(rank, "'%s' took %f seconds", op, (end_time - start_time));                    \
+        }                                                                                                    \
+    }
+
+/*---------------------------------------------------------------------------
+ * Function:    H5D__mpio_parse_debug_str
+ *
+ * Purpose:     Parse a string for H5Dmpio-related debugging flags
+ *
+ *              Each character of the string is handled as follows:
+ *
+ *                - A digit followed by '-' and another digit ("0-7")
+ *                  selects a range of ranks for debug output. At most 8
+ *                  ranks are recorded, starting at the first slot of
+ *                  H5D_mpio_debug_rank_s. A range whose end is below its
+ *                  start selects nothing.
+ *                - Any other digit selects that single rank and is
+ *                  stored in the first slot of H5D_mpio_debug_rank_s.
+ *                - Any other character increments the flag counter
+ *                  indexed by that character's value.
+ *
+ * Returns:     N/A
+ *
+ *---------------------------------------------------------------------------
+ */
+static void
+H5D__mpio_parse_debug_str(const char *s)
+{
+    FUNC_ENTER_STATIC_NOERR
+
+    HDassert(s);
+
+    while (*s) {
+        /*
+         * Convert through unsigned char so that characters with the high
+         * bit set can never yield a negative index into the 256-entry
+         * flag table on platforms where plain char is signed.
+         */
+        int c = (int)(unsigned char)(*s);
+
+        if (c >= (int)'0' && c <= (int)'9') {
+            hbool_t range = FALSE;
+
+            /* Only look ahead when both following characters exist */
+            if (*(s + 1) && *(s + 2))
+                range = (int)*(s + 1) == '-' && (int)*(s + 2) >= (int)'0' && (int)*(s + 2) <= (int)'9';
+
+            if (range) {
+                int start_rank = c - (int)'0';
+                int end_rank   = (int)*(s + 2) - '0';
+                int num_ranks  = end_rank - start_rank + 1;
+                int i;
+
+                /* Only 8 rank slots are available */
+                if (num_ranks > 8)
+                    num_ranks = 8;
+
+                for (i = 0; i < num_ranks; i++)
+                    H5D_mpio_debug_rank_s[i] = start_rank++;
+
+                /*
+                 * Skip the '-' and the end rank digit only; the common
+                 * increment at the bottom of the loop consumes the third
+                 * character of the range. Advancing by 3 here would skip
+                 * the character following the range, or step past the
+                 * string's terminator when the range ends the string.
+                 */
+                s += 2;
+            }
+            else
+                H5D_mpio_debug_rank_s[0] = c - (int)'0';
+        }
+        else
+            H5D_mpio_debug_flags_s[c]++;
+
+        s++;
+    }
+
+    FUNC_LEAVE_NOAPI_VOID
+}
+
+/*---------------------------------------------------------------------------
+ * Function:    H5D__mpio_debug_init
+ *
+ * Purpose:     Set up the H5Dmpio debugging state: clear the debug flag
+ *              table, apply any flags found in the "H5D_mpio_Debug"
+ *              environment variable and pick up the debug output stream.
+ *              Must not be called again once the state is initialized.
+ *
+ * Return:      SUCCEED (there is no failure path)
+ *
+ *---------------------------------------------------------------------------
+ */
+static herr_t
+H5D__mpio_debug_init(void)
+{
+    const char *env_flags = NULL;
+    herr_t      ret_value = SUCCEED;
+
+    FUNC_ENTER_STATIC_NOERR
+
+    /* Initialization is expected to happen exactly once */
+    HDassert(!H5D_mpio_debug_inited);
+
+    /* Start from an all-zero flag table */
+    HDmemset(H5D_mpio_debug_flags_s, 0, sizeof(H5D_mpio_debug_flags_s));
+
+    /* Apply the flags requested through the environment, if any */
+    env_flags = HDgetenv("H5D_mpio_Debug");
+    if (NULL != env_flags)
+        H5D__mpio_parse_debug_str(env_flags);
+
+    /* Adopt the stream reported by H5DEBUG(D) only when one is set */
+    if (NULL != H5DEBUG(D))
+        debug_stream = H5DEBUG(D);
+
+    H5D_mpio_debug_inited = TRUE;
+
+    FUNC_LEAVE_NOAPI(ret_value)
+}
+
+#endif
+
/*-------------------------------------------------------------------------
* Function: H5D__mpio_opt_possible
*
@@ -347,14 +651,9 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space, co
* use collective IO will defer until each chunk IO is reached.
*/
-#if MPI_VERSION < 3
- /*
- * Don't allow parallel writes to filtered datasets if the MPI version
- * is less than 3. The functions needed (MPI_Mprobe and MPI_Imrecv) will
- * not be available.
- */
- if (io_info->op_type == H5D_IO_OP_WRITE && io_info->dset->shared->layout.type == H5D_CHUNKED &&
- io_info->dset->shared->dcpl_cache.pline.nused > 0)
+#ifndef H5_HAVE_PARALLEL_FILTERED_WRITES
+ /* Don't allow writes to filtered datasets if the functionality is disabled */
+ if (io_info->op_type == H5D_IO_OP_WRITE && io_info->dset->shared->dcpl_cache.pline.nused > 0)
local_cause[0] |= H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED;
#endif
@@ -437,6 +736,150 @@ done:
} /* H5D__mpio_opt_possible() */
/*-------------------------------------------------------------------------
+ * Function:    H5D__mpio_append_no_coll_cause_str
+ *
+ * Purpose:     Helper for H5D__mpio_get_no_coll_cause_strings(). Appends
+ *              `cause_str` to the NUL-terminated string held in `buf`
+ *              (a buffer of `buf_len` bytes in total), preceded by "; "
+ *              when the buffer already holds a previous message. The
+ *              text is silently truncated when it does not fit, and
+ *              `*bytes_written` (the number of characters currently in
+ *              `buf`, excluding the terminator) is updated accordingly.
+ *
+ * Return:      N/A
+ *
+ *-------------------------------------------------------------------------
+ */
+static void
+H5D__mpio_append_no_coll_cause_str(char *buf, size_t buf_len, size_t *bytes_written, const char *cause_str)
+{
+    FUNC_ENTER_STATIC_NOERR
+
+    HDassert(buf);
+    HDassert(bytes_written);
+    HDassert(cause_str);
+
+    /*
+     * strncat() copies at most `n` characters and then ALWAYS appends a
+     * terminating NUL, so one byte of the buffer has to be held back for
+     * the terminator. Passing the full amount of remaining space would
+     * write one byte past the end of the buffer whenever the message is
+     * truncated.
+     */
+    if (buf_len > 0 && *bytes_written < buf_len - 1) {
+        size_t space_left = buf_len - 1 - *bytes_written;
+        size_t nchars;
+
+        /* Separate this message from any previous one */
+        if (*bytes_written > 0) {
+            nchars = MIN(space_left, (size_t)2);
+            HDstrncat(buf, "; ", nchars);
+            *bytes_written += nchars;
+            space_left -= nchars;
+        }
+
+        if (space_left > 0) {
+            nchars = MIN(space_left, HDstrlen(cause_str));
+            HDstrncat(buf, cause_str, nchars);
+            *bytes_written += nchars;
+        }
+    }
+
+    FUNC_LEAVE_NOAPI_VOID
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5D__mpio_get_no_coll_cause_strings
+ *
+ * Purpose: When collective I/O is broken internally, it can be useful
+ * for users to see a representative string for the reason(s)
+ * why it was broken. This routine inspects the current
+ * "cause" flags from the API context and prints strings into
+ * the caller's buffers for the local and global reasons that
+ * collective I/O was broken.
+ *
+ * Either buffer may be NULL, in which case the corresponding
+ * string is not produced. `local_cause_len` and
+ * `global_cause_len` are the total sizes of the buffers in
+ * bytes, including room for the NUL terminator. Multiple
+ * reasons are separated by "; " and the output is truncated
+ * (but always NUL-terminated) when a buffer is too small.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5D__mpio_get_no_coll_cause_strings(char *local_cause, size_t local_cause_len, char *global_cause,
+                                    size_t global_cause_len)
+{
+    uint32_t local_no_coll_cause;
+    uint32_t global_no_coll_cause;
+    size_t   local_cause_bytes_written  = 0;
+    size_t   global_cause_bytes_written = 0;
+    int      nbits;
+    herr_t   ret_value = SUCCEED;
+
+    FUNC_ENTER_PACKAGE
+
+    HDassert((local_cause && local_cause_len > 0) || (global_cause && global_cause_len > 0));
+
+    /*
+     * Use compile-time assertion so this routine is updated
+     * when any new "no collective cause" values are added
+     */
+    HDcompile_assert(H5D_MPIO_NO_COLLECTIVE_MAX_CAUSE == (H5D_mpio_no_collective_cause_t)256);
+
+    /* Initialize output buffers (never touch a zero-length buffer) */
+    if (local_cause && local_cause_len > 0)
+        *local_cause = '\0';
+    if (global_cause && global_cause_len > 0)
+        *global_cause = '\0';
+
+    /* Retrieve the local and global cause flags from the API context */
+    if (H5CX_get_mpio_local_no_coll_cause(&local_no_coll_cause) < 0)
+        HGOTO_ERROR(H5E_CONTEXT, H5E_CANTGET, FAIL, "unable to get local no collective cause value")
+    if (H5CX_get_mpio_global_no_coll_cause(&global_no_coll_cause) < 0)
+        HGOTO_ERROR(H5E_CONTEXT, H5E_CANTGET, FAIL, "unable to get global no collective cause value")
+
+    /*
+     * Append each of the "reason for breaking collective I/O"
+     * error messages to the local and global cause string buffers
+     */
+    nbits = (int)(8 * sizeof(local_no_coll_cause));
+    for (int bit_pos = 0; bit_pos < nbits; bit_pos++) {
+        H5D_mpio_no_collective_cause_t cur_cause;
+        const char *                   cause_str = NULL;
+
+        /* Stop once all of the known cause bits have been examined */
+        cur_cause = (H5D_mpio_no_collective_cause_t)(1 << bit_pos);
+        if (cur_cause == H5D_MPIO_NO_COLLECTIVE_MAX_CAUSE)
+            break;
+
+        switch (cur_cause) {
+            case H5D_MPIO_SET_INDEPENDENT:
+                cause_str = "independent I/O was requested";
+                break;
+            case H5D_MPIO_DATATYPE_CONVERSION:
+                cause_str = "datatype conversions were required";
+                break;
+            case H5D_MPIO_DATA_TRANSFORMS:
+                cause_str = "data transforms needed to be applied";
+                break;
+            case H5D_MPIO_MPI_OPT_TYPES_ENV_VAR_DISABLED:
+                cause_str = "optimized MPI types flag wasn't set";
+                break;
+            case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES:
+                cause_str = "one of the dataspaces was neither simple nor scalar";
+                break;
+            case H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET:
+                cause_str = "dataset was not contiguous or chunked";
+                break;
+            case H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED:
+                cause_str = "parallel writes to filtered datasets are disabled";
+                break;
+            case H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE:
+                cause_str = "an error occurred while checking if collective I/O was possible";
+                break;
+            case H5D_MPIO_COLLECTIVE:
+            case H5D_MPIO_NO_COLLECTIVE_MAX_CAUSE:
+            default:
+                HDassert(0 && "invalid no collective cause reason");
+                break;
+        }
+
+        /*
+         * Without assertions enabled an unknown cause would otherwise
+         * reach the string routines with no message; skip it instead.
+         */
+        if (NULL == cause_str)
+            continue;
+
+        /*
+         * Determine if the local reasons for breaking collective I/O
+         * included the current cause
+         */
+        if (local_cause && (cur_cause & local_no_coll_cause))
+            H5D__mpio_append_no_coll_cause_str(local_cause, local_cause_len, &local_cause_bytes_written,
+                                               cause_str);
+
+        /*
+         * Determine if the global reasons for breaking collective I/O
+         * included the current cause
+         */
+        if (global_cause && (cur_cause & global_no_coll_cause))
+            H5D__mpio_append_no_coll_cause_str(global_cause, global_cause_len, &global_cause_bytes_written,
+                                               cause_str);
+    }
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__mpio_get_no_coll_cause_strings() */
+
+/*-------------------------------------------------------------------------
* Function: H5D__mpio_select_read
*
* Purpose: MPI-IO function to read directly from app buffer to file.
@@ -500,145 +943,6 @@ done:
} /* end H5D__mpio_select_write() */
/*-------------------------------------------------------------------------
- * Function: H5D__mpio_array_gatherv
- *
- * Purpose: Given an array, specified in local_array, by each processor
- * calling this function, collects each array into a single
- * array which is then either gathered to the processor
- * specified by root, when allgather is false, or is
- * distributed back to all processors when allgather is true.
- *
- * The number of entries in the array contributed by an
- * individual processor and the size of each entry should be
- * specified in local_array_num_entries and array_entry_size,
- * respectively.
- *
- * The MPI communicator to use should be specified for comm.
- *
- * If the sort_func argument is supplied, the array is sorted
- * before the function returns.
- *
- * Note: if allgather is specified as true, root is ignored.
- *
- * Return: Non-negative on success/Negative on failure
- *
- * Programmer: Jordan Henderson
- * Sunday, April 9th, 2017
- *
- *-------------------------------------------------------------------------
- */
-static herr_t
-H5D__mpio_array_gatherv(void *local_array, size_t local_array_num_entries, size_t array_entry_size,
- void **_gathered_array, size_t *_gathered_array_num_entries, hbool_t allgather,
- int root, MPI_Comm comm, H5D_mpio_sort_func_cb_t sort_func)
-{
- size_t gathered_array_num_entries = 0; /* The size of the newly-constructed array */
- void * gathered_array = NULL; /* The newly-constructed array returned to the caller */
- int *receive_counts_array = NULL; /* Array containing number of entries each processor is contributing */
- int *displacements_array =
- NULL; /* Array of displacements where each processor places its data in the final array */
- int mpi_code, mpi_rank, mpi_size;
- int sendcount;
- herr_t ret_value = SUCCEED;
-
- FUNC_ENTER_STATIC
-
- HDassert(_gathered_array);
- HDassert(_gathered_array_num_entries);
-
- MPI_Comm_size(comm, &mpi_size);
- MPI_Comm_rank(comm, &mpi_rank);
-
- /* Determine the size of the end result array by collecting the number
- * of entries contributed by each processor into a single total.
- */
- if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_array_num_entries, &gathered_array_num_entries, 1,
- MPI_INT, MPI_SUM, comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
-
- /* If 0 entries resulted from the collective operation, no processor is contributing anything and there is
- * nothing to do */
- if (gathered_array_num_entries > 0) {
- /*
- * If gathering to all processors, all processors need to allocate space for the resulting array, as
- * well as the receive counts and displacements arrays for the collective MPI_Allgatherv call.
- * Otherwise, only the root processor needs to allocate the space for an MPI_Gatherv call.
- */
- if (allgather || (mpi_rank == root)) {
- if (NULL == (gathered_array = H5MM_malloc(gathered_array_num_entries * array_entry_size)))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate gathered array")
-
- if (NULL == (receive_counts_array = (int *)H5MM_malloc((size_t)mpi_size * sizeof(int))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate receive counts array")
-
- if (NULL == (displacements_array = (int *)H5MM_malloc((size_t)mpi_size * sizeof(int))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate receive displacements array")
- } /* end if */
-
- /*
- * If gathering to all processors, inform each processor of how many entries each other processor is
- * contributing to the resulting array by collecting the counts into each processor's "receive counts"
- * array. Otherwise, inform only the root processor of how many entries each other processor is
- * contributing.
- */
- if (allgather) {
- if (MPI_SUCCESS != (mpi_code = MPI_Allgather(&local_array_num_entries, 1, MPI_INT,
- receive_counts_array, 1, MPI_INT, comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code)
- } /* end if */
- else {
- if (MPI_SUCCESS != (mpi_code = MPI_Gather(&local_array_num_entries, 1, MPI_INT,
- receive_counts_array, 1, MPI_INT, root, comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code)
- } /* end else */
-
- if (allgather || (mpi_rank == root)) {
- size_t i;
-
- /* Multiply each receive count by the size of the array entry, since the data is sent as bytes. */
- for (i = 0; i < (size_t)mpi_size; i++)
- H5_CHECKED_ASSIGN(receive_counts_array[i], int,
- (size_t)receive_counts_array[i] * array_entry_size, size_t);
-
- /* Set receive buffer offsets for the collective MPI_Allgatherv/MPI_Gatherv call. */
- displacements_array[0] = 0;
- for (i = 1; i < (size_t)mpi_size; i++)
- displacements_array[i] = displacements_array[i - 1] + receive_counts_array[i - 1];
- } /* end if */
-
- /* As the data is sent as bytes, calculate the true sendcount for the data. */
- H5_CHECKED_ASSIGN(sendcount, int, local_array_num_entries *array_entry_size, size_t);
-
- if (allgather) {
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Allgatherv(local_array, sendcount, MPI_BYTE, gathered_array,
- receive_counts_array, displacements_array, MPI_BYTE, comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Allgatherv failed", mpi_code)
- } /* end if */
- else {
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Gatherv(local_array, sendcount, MPI_BYTE, gathered_array,
- receive_counts_array, displacements_array, MPI_BYTE, root, comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Gatherv failed", mpi_code)
- } /* end else */
-
- if (sort_func && (allgather || (mpi_rank == root)))
- HDqsort(gathered_array, gathered_array_num_entries, array_entry_size, sort_func);
- } /* end if */
-
- *_gathered_array = gathered_array;
- *_gathered_array_num_entries = gathered_array_num_entries;
-
-done:
- if (receive_counts_array)
- H5MM_free(receive_counts_array);
- if (displacements_array)
- H5MM_free(displacements_array);
-
- FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D__mpio_array_gatherv() */
-
-/*-------------------------------------------------------------------------
* Function: H5D__mpio_get_sum_chunk
*
* Purpose: Routine for obtaining total number of chunks to cover
@@ -793,11 +1097,17 @@ static herr_t
H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, H5D_chunk_map_t *fm)
{
H5FD_mpio_chunk_opt_t chunk_opt_mode;
- int io_option = H5D_MULTI_CHUNK_IO_MORE_OPT;
- int sum_chunk = -1;
+#ifdef H5Dmpio_DEBUG
+ hbool_t log_file_flag = FALSE;
+ FILE * debug_log_file = NULL;
+#endif
#ifdef H5_HAVE_INSTRUMENTED_LIBRARY
htri_t temp_not_link_io = FALSE;
#endif
+ int io_option = H5D_MULTI_CHUNK_IO_MORE_OPT;
+ int sum_chunk = -1;
+ int mpi_rank;
+ int mpi_size;
herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
@@ -808,9 +1118,35 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
HDassert(type_info);
HDassert(fm);
- /* Disable collective metadata reads for chunked dataset I/O operations
- * in order to prevent potential hangs */
- H5CX_set_coll_metadata_read(FALSE);
+ /* Obtain the current rank of the process and the number of ranks */
+ if ((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain MPI rank")
+ if ((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain MPI size")
+
+#ifdef H5Dmpio_DEBUG
+ /* Initialize file-level debugging if not initialized */
+ if (!H5D_mpio_debug_inited && H5D__mpio_debug_init() < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "can't initialize H5Dmpio debugging")
+
+ /* Open file for debugging if necessary */
+ log_file_flag = H5D_mpio_debug_flags_s[(int)'f'];
+ if (log_file_flag) {
+ char debug_log_filename[1024];
+ time_t time_now;
+
+ HDsnprintf(debug_log_filename, 1024, "H5Dmpio_debug.rank%d", mpi_rank);
+
+ if (NULL == (debug_log_file = HDfopen(debug_log_filename, "a")))
+ HGOTO_ERROR(H5E_IO, H5E_OPENERROR, FAIL, "couldn't open debugging log file")
+
+ /* Print a short header for this I/O operation */
+ time_now = time(NULL);
+ HDfprintf(debug_log_file, "##### %s", asctime(localtime(&time_now)));
+
+ debug_stream = debug_log_file;
+ }
+#endif
/* Check the optional property list for the collective chunk IO optimization option */
if (H5CX_get_mpio_chunk_opt_mode(&chunk_opt_mode) < 0)
@@ -824,13 +1160,10 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
/* via default path. branch by num threshold */
else {
unsigned one_link_chunk_io_threshold; /* Threshold to use single collective I/O for all chunks */
- int mpi_size; /* Number of processes in MPI job */
if (H5D__mpio_get_sum_chunk(io_info, fm, &sum_chunk) < 0)
HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL,
"unable to obtain the total chunk number of all processes");
- if ((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0)
- HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size")
/* Get the chunk optimization option threshold */
if (H5CX_get_mpio_chunk_opt_num(&one_link_chunk_io_threshold) < 0)
@@ -876,22 +1209,12 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
case H5D_ONE_LINK_CHUNK_IO_MORE_OPT:
/* Check if there are any filters in the pipeline */
if (io_info->dset->shared->dcpl_cache.pline.nused > 0) {
- /* For now, Multi-chunk IO must be forced for parallel filtered read,
- * so that data can be unfiltered as it is received. There is significant
- * complexity in unfiltering the data when it is read all at once into a
- * single buffer.
- */
- if (io_info->op_type == H5D_IO_OP_READ) {
- if (H5D__multi_chunk_filtered_collective_io(io_info, type_info, fm) < 0)
- HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,
- "couldn't finish optimized multiple filtered chunk MPI-IO")
- } /* end if */
- else if (H5D__link_chunk_filtered_collective_io(io_info, type_info, fm) < 0)
+ if (H5D__link_chunk_filtered_collective_io(io_info, type_info, fm, mpi_rank, mpi_size) < 0)
HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish filtered linked chunk MPI-IO")
} /* end if */
else
/* Perform unfiltered link chunk collective IO */
- if (H5D__link_chunk_collective_io(io_info, type_info, fm, sum_chunk) < 0)
+ if (H5D__link_chunk_collective_io(io_info, type_info, fm, sum_chunk, mpi_rank, mpi_size) < 0)
HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish linked chunk MPI-IO")
break;
@@ -899,18 +1222,28 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
default: /* multiple chunk IO via threshold */
/* Check if there are any filters in the pipeline */
if (io_info->dset->shared->dcpl_cache.pline.nused > 0) {
- if (H5D__multi_chunk_filtered_collective_io(io_info, type_info, fm) < 0)
+ if (H5D__multi_chunk_filtered_collective_io(io_info, type_info, fm, mpi_rank, mpi_size) < 0)
HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,
"couldn't finish optimized multiple filtered chunk MPI-IO")
} /* end if */
else
/* Perform unfiltered multi chunk collective IO */
- if (H5D__multi_chunk_collective_io(io_info, type_info, fm) < 0)
+ if (H5D__multi_chunk_collective_io(io_info, type_info, fm, mpi_rank, mpi_size) < 0)
HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish optimized multiple chunk MPI-IO")
break;
} /* end switch */
done:
+#ifdef H5Dmpio_DEBUG
+ /* Close debugging log file */
+ if (debug_log_file) {
+ HDfprintf(debug_log_file, "##############\n\n");
+ if (EOF == HDfclose(debug_log_file))
+ HDONE_ERROR(H5E_IO, H5E_CLOSEERROR, FAIL, "couldn't close debugging log file")
+ debug_stream = H5DEBUG(D);
+ }
+#endif
+
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D__chunk_collective_io */
@@ -993,7 +1326,7 @@ done:
*/
static herr_t
H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, H5D_chunk_map_t *fm,
- int sum_chunk)
+ int sum_chunk, int mpi_rank, int mpi_size)
{
H5D_chunk_addr_info_t *chunk_addr_info_array = NULL;
MPI_Datatype chunk_final_mtype; /* Final memory MPI datatype for all chunks with selection */
@@ -1074,9 +1407,8 @@ H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *typ
/* Set up the base storage address for this chunk */
io_info->store = &ctg_store;
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "before inter_collective_io for total chunk = 1 \n");
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG(mpi_rank, "before inter_collective_io for total chunk = 1");
#endif
/* Perform I/O */
@@ -1092,9 +1424,8 @@ H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *typ
num_chunk = H5SL_count(fm->sel_chunks);
H5_CHECK_OVERFLOW(num_chunk, size_t, int);
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "total_chunks = %zu, num_chunk = %zu\n", total_chunks, num_chunk);
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG_VA(mpi_rank, "total_chunks = %zu, num_chunk = %zu", total_chunks, num_chunk);
#endif
/* Set up MPI datatype for chunks selected */
@@ -1125,18 +1456,17 @@ H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *typ
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
"couldn't allocate chunk file is derived datatype flags buffer")
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "before sorting the chunk address \n");
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG(mpi_rank, "before sorting chunk addresses");
#endif
+
/* Sort the chunk address */
- if (H5D__sort_chunk(io_info, fm, chunk_addr_info_array, sum_chunk) < 0)
+ if (H5D__sort_chunk(io_info, fm, chunk_addr_info_array, sum_chunk, mpi_rank, mpi_size) < 0)
HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to sort chunk address")
ctg_store.contig.dset_addr = chunk_addr_info_array[0].chunk_addr;
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "after sorting the chunk address \n");
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG(mpi_rank, "after sorting chunk addresses");
#endif
/* Obtain MPI derived datatype from all individual chunks */
@@ -1241,9 +1571,9 @@ H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *typ
/* No chunks selected for this process */
mpi_buf_count = (hsize_t)0;
} /* end else */
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "before coming to final collective IO\n");
+
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG(mpi_rank, "before coming to final collective I/O");
#endif
/* Set up the base storage address for this chunk */
@@ -1256,11 +1586,11 @@ H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *typ
} /* end else */
done:
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "before freeing memory inside H5D_link_collective_io ret_value = %d\n",
- ret_value);
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG_VA(mpi_rank, "before freeing memory inside H5D_link_collective_io ret_value = %d",
+ ret_value);
#endif
+
/* Release resources */
if (chunk_addr_info_array)
H5MM_xfree(chunk_addr_info_array);
@@ -1293,68 +1623,89 @@ done:
/*-------------------------------------------------------------------------
* Function: H5D__link_chunk_filtered_collective_io
*
- * Purpose: Routine for one collective IO with one MPI derived datatype
- * to link with all filtered chunks
- *
- * 1. Construct a list of selected chunks in the collective IO
- * operation
- * A. If any chunk is being written to by more than 1
- * process, the process writing to the chunk which
- * currently has the least amount of chunks assigned
- * to it becomes the new owner (in the case of ties,
- * the lowest MPI rank becomes the new owner)
- * 2. If the operation is a write operation
- * A. Loop through each chunk in the operation
- * I. If this is not a full overwrite of the chunk
- * a) Read the chunk from file and pass the chunk
- * through the filter pipeline in reverse order
- * (Unfilter the chunk)
+ * Purpose: Performs collective I/O on filtered chunks by creating a
+ * single MPI derived datatype to link with all filtered
+ * chunks. The general algorithm is as follows:
+ *
+ * 1. Construct a list of selected chunks in the collective
+ * I/O operation
+ * 2. If the operation is a read operation
+ * A. Ensure that the list of chunks is sorted in
+ * monotonically non-decreasing order of chunk offset
+ * in the file
+ * B. Participate in a collective read of chunks from
+ * the file
+ * C. Loop through each selected chunk, unfiltering it and
+ * scattering the data to the application's read buffer
+ * 3. If the operation is a write operation
+ * A. Redistribute any chunks being written by more than 1
+ * MPI rank, such that each chunk is owned by only 1 MPI
+ * rank. Of the ranks writing to a chunk, the one that
+ * currently has the fewest chunks assigned to it becomes
+ * the new owner (in the case of ties, the lowest MPI
+ * rank becomes the new owner)
+ * B. Participate in a collective read of chunks from the
+ * file
+ * C. Loop through each chunk selected in the operation
+ * and for each chunk:
+ * I. If we actually read the chunk from the file (if
+ * a chunk is being fully overwritten, we skip
+ * reading it), pass the chunk through the filter
+ * pipeline in reverse order (unfilter the chunk)
* II. Update the chunk data with the modifications from
- * the owning process
+ * the owning MPI rank
* III. Receive any modification data from other
- * processes and update the chunk data with these
+ * ranks and update the chunk data with those
* modifications
* IV. Filter the chunk
- * B. Contribute the modified chunks to an array gathered
- * by all processes which contains the new sizes of
- * every chunk modified in the collective IO operation
- * C. All processes collectively re-allocate each chunk
- * from the gathered array with their new sizes after
- * the filter operation
- * D. If this process has any chunks selected in the IO
- * operation, create an MPI derived type for memory and
- * file to write out the process' selected chunks to the
- * file
- * E. Perform the collective write
- * F. All processes collectively re-insert each modified
+ * D. Contribute the modified chunks to an array gathered
+ * by all ranks which contains information for
+ * re-allocating space in the file for every chunk
+ * modified. Then, each rank collectively re-allocates
+ * each chunk from the gathered array with their new
+ * sizes after the filter operation
+ * E. Proceed with the collective write operation for all
+ * the modified chunks
+ * F. Contribute the modified chunks to an array gathered
+ * by all ranks which contains information for
+ * re-inserting every chunk modified into the chunk
+ * index. Then, each rank collectively re-inserts each
* chunk from the gathered array into the chunk index
*
+ * TODO: Note that steps D. and F. here are both collective
+ * operations that partially share data from the
+ * H5D_filtered_collective_io_info_t structure. To
+ * conserve memory, the distributed arrays these
+ * operations create are discarded after each
+ * operation is performed. If memory consumption
+ * here proves not to be an issue, the necessary data
+ * for both operations could be combined into a single
+ * structure so that only one collective MPI operation
+ * is needed to carry out both operations, rather than
+ * two.
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer: Jordan Henderson
- * Friday, Nov. 4th, 2016
- *
*-------------------------------------------------------------------------
*/
static herr_t
H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
- H5D_chunk_map_t *fm)
+ H5D_chunk_map_t *fm, int mpi_rank, int mpi_size)
{
- H5D_filtered_collective_io_info_t *chunk_list = NULL; /* The list of chunks being read/written */
- H5D_filtered_collective_io_info_t *collective_chunk_list =
- NULL; /* The list of chunks used during collective operations */
- H5D_storage_t ctg_store; /* Chunk storage information as contiguous dataset */
- MPI_Datatype mem_type = MPI_BYTE;
- MPI_Datatype file_type = MPI_BYTE;
- hbool_t mem_type_is_derived = FALSE;
- hbool_t file_type_is_derived = FALSE;
- size_t chunk_list_num_entries;
- size_t collective_chunk_list_num_entries;
- size_t * num_chunks_selected_array = NULL; /* Array of number of chunks selected on each process */
- size_t i; /* Local index variable */
- int mpi_rank, mpi_size, mpi_code;
- herr_t ret_value = SUCCEED;
+ H5D_filtered_collective_io_info_t *chunk_list = NULL; /* The list of chunks being read/written */
+ H5D_filtered_collective_io_info_t *chunk_hash_table = NULL;
+ unsigned char ** chunk_msg_bufs = NULL;
+ H5D_storage_t ctg_store; /* Chunk storage information as contiguous dataset */
+ MPI_Datatype mem_type = MPI_BYTE;
+ MPI_Datatype file_type = MPI_BYTE;
+ hbool_t mem_type_is_derived = FALSE;
+ hbool_t file_type_is_derived = FALSE;
+ size_t * rank_chunks_assigned_map = NULL;
+ size_t chunk_list_num_entries;
+ size_t i;
+ int chunk_msg_bufs_len = 0;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
@@ -1362,11 +1713,12 @@ H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_in
HDassert(type_info);
HDassert(fm);
- /* Obtain the current rank of the process and the number of processes */
- if ((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0)
- HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
- if ((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0)
- HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size")
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TRACE_ENTER(mpi_rank);
+ H5D_MPIO_DEBUG_VA(mpi_rank, "Performing Linked-chunk I/O (%s) with MPI Comm size of %d",
+ io_info->op_type == H5D_IO_OP_WRITE ? "write" : "read", mpi_size);
+ H5D_MPIO_TIME_START(mpi_rank, "Linked-chunk I/O");
+#endif
/* Set the actual-chunk-opt-mode property. */
H5CX_set_mpio_actual_chunk_opt(H5D_MPIO_LINK_CHUNK);
@@ -1377,123 +1729,127 @@ H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_in
H5CX_set_mpio_actual_io_mode(H5D_MPIO_CHUNK_COLLECTIVE);
/* Build a list of selected chunks in the collective io operation */
- if (H5D__construct_filtered_io_info_list(io_info, type_info, fm, &chunk_list, &chunk_list_num_entries) <
- 0)
+ if (H5D__mpio_collective_filtered_chunk_io_setup(io_info, type_info, fm, &chunk_list,
+ &chunk_list_num_entries, mpi_rank) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "couldn't construct filtered I/O info list")
- if (io_info->op_type == H5D_IO_OP_WRITE) { /* Filtered collective write */
+ if (io_info->op_type == H5D_IO_OP_READ) { /* Filtered collective read */
+ if (H5D__mpio_collective_filtered_chunk_read(chunk_list, chunk_list_num_entries, io_info, type_info,
+ mpi_rank, mpi_size) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "couldn't read filtered chunks")
+ }
+ else { /* Filtered collective write */
H5D_chk_idx_info_t index_info;
- H5D_chunk_ud_t udata;
hsize_t mpi_buf_count;
- /* Construct chunked index info */
- index_info.f = io_info->dset->oloc.file;
- index_info.pline = &(io_info->dset->shared->dcpl_cache.pline);
- index_info.layout = &(io_info->dset->shared->layout.u.chunk);
- index_info.storage = &(io_info->dset->shared->layout.storage.u.chunk);
-
- /* Set up chunk information for insertion to chunk index */
- udata.common.layout = index_info.layout;
- udata.common.storage = index_info.storage;
- udata.filter_mask = 0;
-
- /* Iterate through all the chunks in the collective write operation,
- * updating each chunk with the data modifications from other processes,
- * then re-filtering the chunk.
- */
- for (i = 0; i < chunk_list_num_entries; i++)
- if (mpi_rank == chunk_list[i].owners.new_owner)
- if (H5D__filtered_collective_chunk_entry_io(&chunk_list[i], io_info, type_info, fm) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't process chunk entry")
-
- /* Gather the new chunk sizes to all processes for a collective reallocation
- * of the chunks in the file.
+ H5D_MPIO_INIT_CHUNK_IDX_INFO(index_info, io_info);
+
+ if (mpi_size > 1) {
+ /* Redistribute shared chunks being written to */
+ if (H5D__mpio_redistribute_shared_chunks(chunk_list, chunk_list_num_entries, io_info, fm,
+ mpi_rank, mpi_size, &rank_chunks_assigned_map) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "unable to redistribute shared chunks")
+
+ /* Send any chunk modification messages for chunks this rank no longer owns */
+ if (H5D__mpio_share_chunk_modification_data(chunk_list, &chunk_list_num_entries, io_info,
+ type_info, mpi_rank, mpi_size, &chunk_hash_table,
+ &chunk_msg_bufs, &chunk_msg_bufs_len) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL,
+ "unable to send chunk modification data between MPI ranks")
+
+ /* Make sure the local chunk list was updated correctly */
+ HDassert(chunk_list_num_entries == rank_chunks_assigned_map[mpi_rank]);
+ }
+
+ /* Proceed to update all the chunks this rank owns with its own
+ * modification data and data from other ranks, before re-filtering
+ * the chunks. As chunk reads are done collectively here, all ranks
+ * must participate.
*/
- if (H5D__mpio_array_gatherv(chunk_list, chunk_list_num_entries,
- sizeof(H5D_filtered_collective_io_info_t),
- (void **)&collective_chunk_list, &collective_chunk_list_num_entries, true,
- 0, io_info->comm, NULL) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGATHER, FAIL, "couldn't gather new chunk sizes")
-
- /* Collectively re-allocate the modified chunks (from each process) in the file */
- for (i = 0; i < collective_chunk_list_num_entries; i++) {
- hbool_t insert;
-
- if (H5D__chunk_file_alloc(&index_info, &collective_chunk_list[i].chunk_states.chunk_current,
- &collective_chunk_list[i].chunk_states.new_chunk, &insert,
- collective_chunk_list[i].scaled) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "unable to allocate chunk")
- } /* end for */
-
- if (NULL == (num_chunks_selected_array = (size_t *)H5MM_malloc((size_t)mpi_size * sizeof(size_t))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate num chunks selected array")
-
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Allgather(&chunk_list_num_entries, 1, MPI_UNSIGNED_LONG_LONG,
- num_chunks_selected_array, 1, MPI_UNSIGNED_LONG_LONG, io_info->comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code)
-
- /* If this process has any chunks selected, create a MPI type for collectively
- * writing out the chunks to file. Otherwise, the process contributes to the
+ if (H5D__mpio_collective_filtered_chunk_update(chunk_list, chunk_list_num_entries, chunk_hash_table,
+ chunk_msg_bufs, chunk_msg_bufs_len, io_info, type_info,
+ mpi_rank, mpi_size) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't update modified chunks")
+
+ /* Free up resources used by chunk hash table now that we're done updating chunks */
+ HASH_CLEAR(hh, chunk_hash_table);
+
+ /* All ranks now collectively re-allocate file space for all chunks */
+ if (H5D__mpio_collective_filtered_chunk_reallocate(chunk_list, chunk_list_num_entries,
+ rank_chunks_assigned_map, io_info, &index_info,
+ mpi_rank, mpi_size) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL,
+ "couldn't collectively re-allocate file space for chunks")
+
+ /* If this rank has any chunks selected, create an MPI type for collectively
+ * writing out the chunks to file. Otherwise, the rank contributes to the
* collective write with a none type.
*/
- if (chunk_list_num_entries) {
- size_t offset;
-
- /* During the collective re-allocation of chunks in the file, the record for each
- * chunk is only updated in the collective array, not in the local copy of chunks on each
- * process. However, each process needs the updated chunk records so that they can create
- * a MPI type for the collective write that will write to the chunk's possible new locations
- * in the file instead of the old ones. This ugly hack seems to be the best solution to
- * copy the information back to the local array and avoid having to modify the collective
- * write type function in an ugly way so that it will accept the collective array instead
- * of the local array. This works correctly because the array gather function guarantees
- * that the chunk data in the collective array is ordered in blocks by rank.
- */
- for (i = 0, offset = 0; i < (size_t)mpi_rank; i++)
- offset += num_chunks_selected_array[i];
-
- H5MM_memcpy(chunk_list, &collective_chunk_list[offset],
- num_chunks_selected_array[mpi_rank] * sizeof(H5D_filtered_collective_io_info_t));
+ if (H5D__mpio_collective_filtered_io_type(chunk_list, chunk_list_num_entries, io_info->op_type,
+ &mem_type, &mem_type_is_derived, &file_type,
+ &file_type_is_derived) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL,
+ "couldn't create MPI type for writing filtered chunks")
- /* Create single MPI type encompassing each selection in the dataspace */
- if (H5D__mpio_filtered_collective_write_type(chunk_list, chunk_list_num_entries, &mem_type,
- &mem_type_is_derived, &file_type,
- &file_type_is_derived) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_BADTYPE, FAIL, "couldn't create MPI link chunk I/O type")
+ mpi_buf_count = (file_type_is_derived || mem_type_is_derived) ? 1 : 0;
- /* Override the write buffer to point to the address of the first
- * chunk data buffer
+ /* Setup contig storage info for I/O operation */
+ if (chunk_list_num_entries) {
+ /*
+ * Override the write buffer to point to the first
+ * chunk's data buffer
*/
io_info->u.wbuf = chunk_list[0].buf;
- } /* end if */
-
- /* We have a single, complicated MPI datatype for both memory & file */
- mpi_buf_count = (mem_type_is_derived && file_type_is_derived) ? (hsize_t)1 : (hsize_t)0;
- /* Set up the base storage address for this operation */
- ctg_store.contig.dset_addr = 0; /* Write address must be set to address 0 */
- io_info->store = &ctg_store;
+ /*
+ * Setup the base storage address for this operation
+ * to be the first chunk's file address
+ */
+ ctg_store.contig.dset_addr = chunk_list[0].chunk_new.offset;
+ }
+ else
+ ctg_store.contig.dset_addr = 0;
/* Perform I/O */
+ io_info->store = &ctg_store;
if (H5D__final_collective_io(io_info, type_info, mpi_buf_count, file_type, mem_type) < 0)
HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish MPI-IO")
+ /* Free up resources in anticipation of following collective operation */
+ for (i = 0; i < chunk_list_num_entries; i++) {
+ if (chunk_list[i].buf) {
+ H5MM_free(chunk_list[i].buf);
+ chunk_list[i].buf = NULL;
+ }
+ }
+
/* Participate in the collective re-insertion of all chunks modified
- * in this iteration into the chunk index
+ * into the chunk index
*/
- for (i = 0; i < collective_chunk_list_num_entries; i++) {
- udata.chunk_block = collective_chunk_list[i].chunk_states.new_chunk;
- udata.common.scaled = collective_chunk_list[i].scaled;
- udata.chunk_idx = collective_chunk_list[i].index;
-
- if ((index_info.storage->ops->insert)(&index_info, &udata, io_info->dset) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTINSERT, FAIL, "unable to insert chunk address into index")
- } /* end for */
- } /* end if */
+ if (H5D__mpio_collective_filtered_chunk_reinsert(chunk_list, chunk_list_num_entries,
+ rank_chunks_assigned_map, io_info, &index_info,
+ mpi_rank, mpi_size) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL,
+ "couldn't collectively re-insert modified chunks into chunk index")
+ }
done:
- /* Free resources used by a process which had some selection */
+ /* Free the MPI buf and file types, if they were derived */
+ if (mem_type_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ if (file_type_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+
+ if (chunk_msg_bufs) {
+ for (i = 0; i < (size_t)chunk_msg_bufs_len; i++)
+ H5MM_free(chunk_msg_bufs[i]);
+
+ H5MM_free(chunk_msg_bufs);
+ }
+
+ HASH_CLEAR(hh, chunk_hash_table);
+
+ /* Free resources used by a rank which had some selection */
if (chunk_list) {
for (i = 0; i < chunk_list_num_entries; i++)
if (chunk_list[i].buf)
@@ -1502,16 +1858,13 @@ done:
H5MM_free(chunk_list);
} /* end if */
- if (num_chunks_selected_array)
- H5MM_free(num_chunks_selected_array);
- if (collective_chunk_list)
- H5MM_free(collective_chunk_list);
+ if (rank_chunks_assigned_map)
+ H5MM_free(rank_chunks_assigned_map);
- /* Free the MPI buf and file types, if they were derived */
- if (mem_type_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type)))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
- if (file_type_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type)))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TIME_STOP(mpi_rank);
+ H5D_MPIO_TRACE_EXIT(mpi_rank);
+#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D__link_chunk_filtered_collective_io() */
@@ -1534,7 +1887,8 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, H5D_chunk_map_t *fm)
+H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, H5D_chunk_map_t *fm,
+ int mpi_rank, int mpi_size)
{
H5D_io_info_t ctg_io_info; /* Contiguous I/O info object */
H5D_storage_t ctg_store; /* Chunk storage information as contiguous dataset */
@@ -1547,11 +1901,8 @@ H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *ty
H5FD_mpio_collective_opt_t last_coll_opt_mode =
H5FD_MPIO_COLLECTIVE_IO; /* Last parallel transfer with independent IO or collective IO with this mode
*/
- size_t total_chunk; /* Total # of chunks in dataset */
-#ifdef H5Dmpio_DEBUG
- int mpi_rank;
-#endif
- size_t u; /* Local index variable */
+ size_t total_chunk; /* Total # of chunks in dataset */
+ size_t u; /* Local index variable */
H5D_mpio_actual_io_mode_t actual_io_mode =
H5D_MPIO_NO_COLLECTIVE; /* Local variable for tracking the I/O mode used. */
herr_t ret_value = SUCCEED;
@@ -1561,10 +1912,6 @@ H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *ty
/* Set the actual chunk opt mode property */
H5CX_set_mpio_actual_chunk_opt(H5D_MPIO_MULTI_CHUNK);
-#ifdef H5Dmpio_DEBUG
- mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file);
-#endif
-
/* Retrieve total # of chunks in dataset */
H5_CHECKED_ASSIGN(total_chunk, size_t, fm->layout->u.chunk.nchunks, hsize_t);
HDassert(total_chunk != 0);
@@ -1572,13 +1919,13 @@ H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *ty
/* Allocate memories */
chunk_io_option = (uint8_t *)H5MM_calloc(total_chunk);
chunk_addr = (haddr_t *)H5MM_calloc(total_chunk * sizeof(haddr_t));
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "total_chunk %zu\n", total_chunk);
+
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG_VA(mpi_rank, "total_chunk %zu", total_chunk);
#endif
/* Obtain IO option for each chunk */
- if (H5D__obtain_mpio_mode(io_info, fm, chunk_io_option, chunk_addr) < 0)
+ if (H5D__obtain_mpio_mode(io_info, fm, chunk_io_option, chunk_addr, mpi_rank, mpi_size) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTRECV, FAIL, "unable to obtain MPIO mode")
/* Set up contiguous I/O info object */
@@ -1606,9 +1953,8 @@ H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *ty
H5S_t * fspace; /* Dataspace describing chunk & selection in it */
H5S_t * mspace; /* Dataspace describing selection in memory corresponding to this chunk */
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "mpi_rank = %d, chunk index = %zu\n", mpi_rank, u);
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG_VA(mpi_rank, "mpi_rank = %d, chunk index = %zu", mpi_rank, u);
#endif
/* Get the chunk info for this chunk, if there are elements selected */
chunk_info = fm->select_chunk[u];
@@ -1626,10 +1972,9 @@ H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *ty
* needs to contribute MPI NONE TYPE.
*/
if (chunk_io_option[u] == H5D_CHUNK_IO_MODE_COL) {
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "inside collective chunk IO mpi_rank = %d, chunk index = %zu\n",
- mpi_rank, u);
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG_VA(mpi_rank, "inside collective chunk IO mpi_rank = %d, chunk index = %zu",
+ mpi_rank, u);
#endif
/* Set the file & memory dataspaces */
@@ -1665,10 +2010,9 @@ H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *ty
HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish shared collective MPI-IO")
} /* end if */
else { /* possible independent IO for this chunk */
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "inside independent IO mpi_rank = %d, chunk index = %zu\n", mpi_rank,
- u);
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG_VA(mpi_rank, "inside independent IO mpi_rank = %d, chunk index = %zu", mpi_rank,
+ u);
#endif
HDassert(chunk_io_option[u] == 0);
@@ -1698,9 +2042,8 @@ H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *ty
/* Perform the I/O */
if (H5D__inter_collective_io(&ctg_io_info, type_info, fspace, mspace) < 0)
HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish shared collective MPI-IO")
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "after inter collective IO\n");
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG(mpi_rank, "after inter collective IO");
#endif
} /* end else */
} /* end for */
@@ -1720,80 +2063,101 @@ done:
/*-------------------------------------------------------------------------
* Function: H5D__multi_chunk_filtered_collective_io
*
- * Purpose: To do filtered collective IO iteratively to save on memory.
- * While link_chunk_filtered_collective_io will construct and
- * work on a list of all of the chunks selected in the IO
- * operation at once, this function works iteratively on a set
- * of chunks at a time; at most one chunk per rank per
- * iteration.
- *
- * 1. Construct a list of selected chunks in the collective IO
- * operation
- * A. If any chunk is being written to by more than 1
- * process, the process writing to the chunk which
- * currently has the least amount of chunks assigned
- * to it becomes the new owner (in the case of ties,
- * the lowest MPI rank becomes the new owner)
- * 2. If the operation is a read operation
- * A. Loop through each chunk in the operation
- * I. Read the chunk from the file
- * II. Unfilter the chunk
- * III. Scatter the read chunk data to the user's buffer
- * 3. If the operation is a write operation
- * A. Loop through each chunk in the operation
- * I. If this is not a full overwrite of the chunk
- * a) Read the chunk from file and pass the chunk
- * through the filter pipeline in reverse order
- * (Unfilter the chunk)
- * II. Update the chunk data with the modifications from
- * the owning process
- * III. Receive any modification data from other
- * processes and update the chunk data with these
- * modifications
- * IV. Filter the chunk
- * V. Contribute the chunk to an array gathered by
- * all processes which contains every chunk
- * modified in this iteration (up to one chunk
- * per process, some processes may not have a
- * selection/may have less chunks to work on than
- * other processes)
- * VI. All processes collectively re-allocate each
- * chunk from the gathered array with their new
- * sizes after the filter operation
- * VII. Proceed with the collective write operation
- * for the chunks modified on this iteration
- * VIII. All processes collectively re-insert each
- * chunk from the gathered array into the chunk
- * index
+ * Purpose: Performs collective I/O on filtered chunks iteratively to
+ * save on memory and potentially get better performance
+ * depending on the average number of chunks per rank. While
+ * linked-chunk I/O will construct and work on a list of all
+ * of the chunks selected in the I/O operation at once, this
+ * function works iteratively on a set of chunks at a time; at
+ * most one chunk per rank per iteration. The general
+ * algorithm is as follows:
+ *
+ * 1. Construct a list of selected chunks in the collective
+ * I/O operation
+ * 2. If the operation is a read operation, loop a number of
+ * times equal to the maximum number of chunks selected on
+ * any particular rank and on each iteration:
+ * A. Participate in a collective read of chunks from
+ * the file (ranks that run out of chunks still need
+ * to participate)
+ * B. Unfilter the chunk that was read (if any)
+ * C. Scatter the read chunk's data to the application's
+ * read buffer
+ * 3. If the operation is a write operation, redistribute any
+ * chunks being written to by more than 1 MPI rank, such
+ * that the chunk is only owned by 1 MPI rank. The rank
+ * writing to the chunk which currently has the fewest
+ * chunks assigned to it becomes the new owner
+ * (in the case of ties, the lowest MPI rank becomes the
+ * new owner). Then, loop a number of times equal to the
+ * maximum number of chunks selected on any particular
+ * rank and on each iteration:
+ * A. Participate in a collective read of chunks from
+ * the file (ranks that run out of chunks still need
+ * to participate)
+ * I. If we actually read a chunk from the file (if
+ * a chunk is being fully overwritten, we skip
+ * reading it), pass the chunk through the filter
+ * pipeline in reverse order (unfilter the chunk)
+ * B. Update the chunk data with the modifications from
+ * the owning rank
+ * C. Receive any modification data from other ranks and
+ * update the chunk data with those modifications
+ * D. Filter the chunk
+ * E. Contribute the chunk to an array gathered by
+ * all ranks which contains information for
+ * re-allocating space in the file for every chunk
+ * modified in this iteration (up to one chunk per
+ * rank; some ranks may not have a selection/may have
+ * fewer chunks to work on than other ranks). Then,
+ * each rank collectively re-allocates each chunk
+ * from the gathered array with their new sizes
+ * after the filter operation
+ * F. Proceed with the collective write operation
+ * for the chunks modified on this iteration
+ * G. Contribute the chunk to an array gathered by
+ * all ranks which contains information for
+ * re-inserting every chunk modified on this
+ * iteration into the chunk index. Then, each rank
+ * collectively re-inserts each chunk from the
+ * gathered array into the chunk index
+ *
+ * TODO: Note that steps E. and G. here are both collective
+ * operations that partially share data from the
+ * H5D_filtered_collective_io_info_t structure. To
+ * try to conserve on memory a bit, the distributed
+ * arrays these operations create are discarded after
+ * each operation is performed. If memory consumption
+ * here proves to not be an issue, the necessary data
+ * for both operations could be combined into a single
+ * structure so that only one collective MPI operation
+ * is needed to carry out both operations, rather than
+ * two.
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer: Jordan Henderson
- * Friday, Dec. 2nd, 2016
- *
*-------------------------------------------------------------------------
*/
static herr_t
H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
- H5D_chunk_map_t *fm)
+ H5D_chunk_map_t *fm, int mpi_rank, int mpi_size)
{
- H5D_filtered_collective_io_info_t *chunk_list = NULL; /* The list of chunks being read/written */
- H5D_filtered_collective_io_info_t *collective_chunk_list =
- NULL; /* The list of chunks used during collective operations */
- H5D_storage_t store; /* union of EFL and chunk pointer in file space */
- H5D_io_info_t ctg_io_info; /* Contiguous I/O info object */
- H5D_storage_t ctg_store; /* Chunk storage information as contiguous dataset */
- MPI_Datatype *file_type_array = NULL;
- MPI_Datatype *mem_type_array = NULL;
- hbool_t * file_type_is_derived_array = NULL;
- hbool_t * mem_type_is_derived_array = NULL;
- hbool_t * has_chunk_selected_array =
- NULL; /* Array of whether or not each process is contributing a chunk to each iteration */
- size_t chunk_list_num_entries;
- size_t collective_chunk_list_num_entries;
- size_t i, j; /* Local index variable */
- int mpi_rank, mpi_size, mpi_code;
- herr_t ret_value = SUCCEED;
+ H5D_filtered_collective_io_info_t *chunk_list = NULL; /* The list of chunks being read/written */
+ H5D_filtered_collective_io_info_t *chunk_hash_table = NULL;
+ unsigned char ** chunk_msg_bufs = NULL;
+ H5D_io_info_t ctg_io_info; /* Contiguous I/O info object */
+ H5D_storage_t ctg_store; /* Chunk storage information as contiguous dataset */
+ MPI_Datatype mem_type = MPI_BYTE;
+ MPI_Datatype file_type = MPI_BYTE;
+ hbool_t mem_type_is_derived = FALSE;
+ hbool_t file_type_is_derived = FALSE;
+ hbool_t have_chunk_to_process;
+ size_t chunk_list_num_entries;
+ size_t i;
+ size_t max_num_chunks;
+ int chunk_msg_bufs_len = 0;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
@@ -1801,11 +2165,12 @@ H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_i
HDassert(type_info);
HDassert(fm);
- /* Obtain the current rank of the process and the number of processes */
- if ((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0)
- HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
- if ((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0)
- HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size")
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TRACE_ENTER(mpi_rank);
+ H5D_MPIO_DEBUG_VA(mpi_rank, "Performing Multi-chunk I/O (%s) with MPI Comm size of %d",
+ io_info->op_type == H5D_IO_OP_WRITE ? "write" : "read", mpi_size);
+ H5D_MPIO_TIME_START(mpi_rank, "Multi-chunk I/O");
+#endif
/* Set the actual chunk opt mode property */
H5CX_set_mpio_actual_chunk_opt(H5D_MPIO_MULTI_CHUNK);
@@ -1816,10 +2181,19 @@ H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_i
H5CX_set_mpio_actual_io_mode(H5D_MPIO_CHUNK_COLLECTIVE);
/* Build a list of selected chunks in the collective IO operation */
- if (H5D__construct_filtered_io_info_list(io_info, type_info, fm, &chunk_list, &chunk_list_num_entries) <
- 0)
+ if (H5D__mpio_collective_filtered_chunk_io_setup(io_info, type_info, fm, &chunk_list,
+ &chunk_list_num_entries, mpi_rank) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "couldn't construct filtered I/O info list")
+ /* Retrieve the maximum number of chunks selected for any rank */
+ if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&chunk_list_num_entries, &max_num_chunks, 1,
+ MPI_UNSIGNED_LONG_LONG, MPI_MAX, io_info->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
+
+ /* If no one has anything selected at all, end the operation */
+ if (0 == max_num_chunks)
+ HGOTO_DONE(SUCCEED);
+
/* Set up contiguous I/O info object */
H5MM_memcpy(&ctg_io_info, io_info, sizeof(ctg_io_info));
ctg_io_info.store = &ctg_store;
@@ -1827,190 +2201,147 @@ H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_i
/* Initialize temporary contiguous storage info */
ctg_store.contig.dset_size = (hsize_t)io_info->dset->shared->layout.u.chunk.size;
- ctg_store.contig.dset_addr = 0;
-
- /* Set dataset storage for I/O info */
- io_info->store = &store;
if (io_info->op_type == H5D_IO_OP_READ) { /* Filtered collective read */
- for (i = 0; i < chunk_list_num_entries; i++)
- if (H5D__filtered_collective_chunk_entry_io(&chunk_list[i], io_info, type_info, fm) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "couldn't process chunk entry")
- } /* end if */
+ for (i = 0; i < max_num_chunks; i++) {
+ /* Check if this rank has a chunk to work on for this iteration */
+ have_chunk_to_process = (i < chunk_list_num_entries);
+
+ if (H5D__mpio_collective_filtered_chunk_read(have_chunk_to_process ? &chunk_list[i] : NULL,
+ have_chunk_to_process ? 1 : 0, io_info, type_info,
+ mpi_rank, mpi_size) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "couldn't read filtered chunks")
+
+ if (have_chunk_to_process && chunk_list[i].buf) {
+ H5MM_free(chunk_list[i].buf);
+ chunk_list[i].buf = NULL;
+ }
+ }
+ }
else { /* Filtered collective write */
H5D_chk_idx_info_t index_info;
- H5D_chunk_ud_t udata;
- size_t max_num_chunks;
hsize_t mpi_buf_count;
/* Construct chunked index info */
- index_info.f = io_info->dset->oloc.file;
- index_info.pline = &(io_info->dset->shared->dcpl_cache.pline);
- index_info.layout = &(io_info->dset->shared->layout.u.chunk);
- index_info.storage = &(io_info->dset->shared->layout.storage.u.chunk);
-
- /* Set up chunk information for insertion to chunk index */
- udata.common.layout = index_info.layout;
- udata.common.storage = index_info.storage;
- udata.filter_mask = 0;
-
- /* Retrieve the maximum number of chunks being written among all processes */
- if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&chunk_list_num_entries, &max_num_chunks, 1,
- MPI_UNSIGNED_LONG_LONG, MPI_MAX, io_info->comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
-
- /* If no one is writing anything at all, end the operation */
- if (!(max_num_chunks > 0))
- HGOTO_DONE(SUCCEED);
-
- /* Allocate arrays for storing MPI file and mem types and whether or not the
- * types were derived.
- */
- if (NULL == (file_type_array = (MPI_Datatype *)H5MM_malloc(max_num_chunks * sizeof(MPI_Datatype))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate file type array")
-
- if (NULL == (file_type_is_derived_array = (hbool_t *)H5MM_calloc(max_num_chunks * sizeof(hbool_t))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate file type is derived array")
-
- if (NULL == (mem_type_array = (MPI_Datatype *)H5MM_malloc(max_num_chunks * sizeof(MPI_Datatype))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate mem type array")
-
- if (NULL == (mem_type_is_derived_array = (hbool_t *)H5MM_calloc(max_num_chunks * sizeof(hbool_t))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate mem type is derived array")
-
- /* Iterate over the max number of chunks among all processes, as this process could
- * have no chunks left to work on, but it still needs to participate in the collective
- * re-allocation and re-insertion of chunks modified by other processes.
+ H5D_MPIO_INIT_CHUNK_IDX_INFO(index_info, io_info);
+
+ if (mpi_size > 1) {
+ /* Redistribute shared chunks being written to */
+ if (H5D__mpio_redistribute_shared_chunks(chunk_list, chunk_list_num_entries, io_info, fm,
+ mpi_rank, mpi_size, NULL) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "unable to redistribute shared chunks")
+
+ /* Send any chunk modification messages for chunks this rank no longer owns */
+ if (H5D__mpio_share_chunk_modification_data(chunk_list, &chunk_list_num_entries, io_info,
+ type_info, mpi_rank, mpi_size, &chunk_hash_table,
+ &chunk_msg_bufs, &chunk_msg_bufs_len) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL,
+ "unable to send chunk modification data between MPI ranks")
+ }
+
+ /* Iterate over the max number of chunks among all ranks, as this rank could
+ * have no chunks left to work on, but it still needs to participate in the
+ * collective re-allocation and re-insertion of chunks modified by other ranks.
*/
for (i = 0; i < max_num_chunks; i++) {
- /* Check if this process has a chunk to work on for this iteration */
- hbool_t have_chunk_to_process =
- (i < chunk_list_num_entries) && (mpi_rank == chunk_list[i].owners.new_owner);
-
- if (have_chunk_to_process)
- if (H5D__filtered_collective_chunk_entry_io(&chunk_list[i], io_info, type_info, fm) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't process chunk entry")
+ /* Check if this rank has a chunk to work on for this iteration */
+ have_chunk_to_process = (i < chunk_list_num_entries) && (mpi_rank == chunk_list[i].new_owner);
- /* Gather the new chunk sizes to all processes for a collective re-allocation
- * of the chunks in the file
+ /* Proceed to update the chunk this rank owns (if any left) with its
+ * own modification data and data from other ranks, before re-filtering
+ * the chunks. As chunk reads are done collectively here, all ranks
+ * must participate.
*/
- if (H5D__mpio_array_gatherv(&chunk_list[i], have_chunk_to_process ? 1 : 0,
- sizeof(H5D_filtered_collective_io_info_t),
- (void **)&collective_chunk_list, &collective_chunk_list_num_entries,
- true, 0, io_info->comm, NULL) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGATHER, FAIL, "couldn't gather new chunk sizes")
-
- /* Participate in the collective re-allocation of all chunks modified
- * in this iteration.
+ if (H5D__mpio_collective_filtered_chunk_update(have_chunk_to_process ? &chunk_list[i] : NULL,
+ have_chunk_to_process ? 1 : 0, chunk_hash_table,
+ chunk_msg_bufs, chunk_msg_bufs_len, io_info,
+ type_info, mpi_rank, mpi_size) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't update modified chunks")
+
+ /* All ranks now collectively re-allocate file space for all chunks */
+ if (H5D__mpio_collective_filtered_chunk_reallocate(have_chunk_to_process ? &chunk_list[i] : NULL,
+ have_chunk_to_process ? 1 : 0, NULL, io_info,
+ &index_info, mpi_rank, mpi_size) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL,
+ "couldn't collectively re-allocate file space for chunks")
+
+ /*
+ * If this rank has a chunk to work on, create an MPI type
+ * for writing out the chunk. Otherwise, the rank will
+ * use MPI_BYTE for the file and memory type and specify
+ * a count of 0.
*/
- for (j = 0; j < collective_chunk_list_num_entries; j++) {
- hbool_t insert = FALSE;
-
- if (H5D__chunk_file_alloc(&index_info, &collective_chunk_list[j].chunk_states.chunk_current,
- &collective_chunk_list[j].chunk_states.new_chunk, &insert,
- chunk_list[j].scaled) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "unable to allocate chunk")
- } /* end for */
-
- if (NULL ==
- (has_chunk_selected_array = (hbool_t *)H5MM_malloc((size_t)mpi_size * sizeof(hbool_t))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate num chunks selected array")
+ if (H5D__mpio_collective_filtered_io_type(
+ have_chunk_to_process ? &chunk_list[i] : NULL, have_chunk_to_process ? 1 : 0,
+ io_info->op_type, &mem_type, &mem_type_is_derived, &file_type, &file_type_is_derived) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL,
+ "couldn't create MPI type for writing filtered chunks")
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Allgather(&have_chunk_to_process, 1, MPI_C_BOOL, has_chunk_selected_array, 1,
- MPI_C_BOOL, io_info->comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code)
+ mpi_buf_count = (file_type_is_derived || mem_type_is_derived) ? 1 : 0;
- /* If this process has a chunk to work on, create a MPI type for the
- * memory and file for writing out the chunk
- */
+ /* Set up the write buffer and base storage address for this iteration */
if (have_chunk_to_process) {
- size_t offset;
- int mpi_type_count;
-
- for (j = 0, offset = 0; j < (size_t)mpi_rank; j++)
- offset += has_chunk_selected_array[j];
-
- /* Collect the new chunk info back to the local copy, since only the record in the
- * collective array gets updated by the chunk re-allocation */
- H5MM_memcpy(&chunk_list[i].chunk_states.new_chunk,
- &collective_chunk_list[offset].chunk_states.new_chunk,
- sizeof(chunk_list[i].chunk_states.new_chunk));
-
- H5_CHECKED_ASSIGN(mpi_type_count, int, chunk_list[i].chunk_states.new_chunk.length, hsize_t);
-
- /* Create MPI memory type for writing to chunk */
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Type_contiguous(mpi_type_count, MPI_BYTE, &mem_type_array[i])))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code)
- if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&mem_type_array[i])))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
- mem_type_is_derived_array[i] = TRUE;
-
- /* Create MPI file type for writing to chunk */
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Type_contiguous(mpi_type_count, MPI_BYTE, &file_type_array[i])))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code)
- if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&file_type_array[i])))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
- file_type_is_derived_array[i] = TRUE;
-
- mpi_buf_count = 1;
-
- /* Set up the base storage address for this operation */
- ctg_store.contig.dset_addr = chunk_list[i].chunk_states.new_chunk.offset;
-
- /* Override the write buffer to point to the address of the
- * chunk data buffer
+ /*
+ * Override the write buffer to point to the
+ * chunk's data buffer
*/
ctg_io_info.u.wbuf = chunk_list[i].buf;
- } /* end if */
- else {
- mem_type_array[i] = file_type_array[i] = MPI_BYTE;
- mpi_buf_count = 0;
- } /* end else */
+
+ /*
+ * Set up the base storage address for this
+ * operation to be the chunk's file address
+ */
+ ctg_store.contig.dset_addr = chunk_list[i].chunk_new.offset;
+ }
+ else
+ ctg_store.contig.dset_addr = 0;
/* Perform the I/O */
- if (H5D__final_collective_io(&ctg_io_info, type_info, mpi_buf_count, file_type_array[i],
- mem_type_array[i]) < 0)
+ if (H5D__final_collective_io(&ctg_io_info, type_info, mpi_buf_count, file_type, mem_type) < 0)
HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish MPI-IO")
+ /* Free up resources in anticipation of following collective operation */
+ if (have_chunk_to_process && chunk_list[i].buf) {
+ H5MM_free(chunk_list[i].buf);
+ chunk_list[i].buf = NULL;
+ }
+
/* Participate in the collective re-insertion of all chunks modified
* in this iteration into the chunk index
*/
- for (j = 0; j < collective_chunk_list_num_entries; j++) {
- udata.chunk_block = collective_chunk_list[j].chunk_states.new_chunk;
- udata.common.scaled = collective_chunk_list[j].scaled;
- udata.chunk_idx = collective_chunk_list[j].index;
-
- if ((index_info.storage->ops->insert)(&index_info, &udata, io_info->dset) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTINSERT, FAIL,
- "unable to insert chunk address into index")
- } /* end for */
+ if (H5D__mpio_collective_filtered_chunk_reinsert(have_chunk_to_process ? &chunk_list[i] : NULL,
+ have_chunk_to_process ? 1 : 0, NULL, io_info,
+ &index_info, mpi_rank, mpi_size) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL,
+ "couldn't collectively re-insert modified chunks into chunk index")
+
+ /* Free the MPI types, if they were derived */
+ if (mem_type_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ mem_type_is_derived = FALSE;
+ if (file_type_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ file_type_is_derived = FALSE;
+ } /* end for */
+ }
- if (collective_chunk_list) {
- H5MM_free(collective_chunk_list);
- collective_chunk_list = NULL;
- } /* end if */
- if (has_chunk_selected_array) {
- H5MM_free(has_chunk_selected_array);
- has_chunk_selected_array = NULL;
- } /* end if */
- } /* end for */
+done:
+ /* Free the MPI buf and file types, if they were derived */
+ if (mem_type_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ if (file_type_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
- /* Free the MPI file and memory types, if they were derived */
- for (i = 0; i < max_num_chunks; i++) {
- if (file_type_is_derived_array[i])
- if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type_array[i])))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ if (chunk_msg_bufs) {
+ for (i = 0; i < (size_t)chunk_msg_bufs_len; i++)
+ H5MM_free(chunk_msg_bufs[i]);
- if (mem_type_is_derived_array[i])
- if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type_array[i])))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
- } /* end for */
- } /* end else */
+ H5MM_free(chunk_msg_bufs);
+ }
-done:
+ HASH_CLEAR(hh, chunk_hash_table);
+
+ /* Free resources used by a rank which had some selection */
if (chunk_list) {
for (i = 0; i < chunk_list_num_entries; i++)
if (chunk_list[i].buf)
@@ -2019,16 +2350,10 @@ done:
H5MM_free(chunk_list);
} /* end if */
- if (collective_chunk_list)
- H5MM_free(collective_chunk_list);
- if (file_type_array)
- H5MM_free(file_type_array);
- if (mem_type_array)
- H5MM_free(mem_type_array);
- if (file_type_is_derived_array)
- H5MM_free(file_type_is_derived_array);
- if (mem_type_is_derived_array)
- H5MM_free(mem_type_is_derived_array);
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TIME_STOP(mpi_rank);
+ H5D_MPIO_TRACE_EXIT(mpi_rank);
+#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D__multi_chunk_filtered_collective_io() */
@@ -2054,11 +2379,22 @@ H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
hbool_t mbt_is_derived = FALSE;
hbool_t mft_is_derived = FALSE;
MPI_Datatype mpi_file_type, mpi_buf_type;
- int mpi_code; /* MPI return code */
- herr_t ret_value = SUCCEED; /* return value */
+ int mpi_code; /* MPI return code */
+#ifdef H5Dmpio_DEBUG
+ int mpi_rank;
+#endif
+ herr_t ret_value = SUCCEED; /* return value */
FUNC_ENTER_STATIC
+#ifdef H5Dmpio_DEBUG
+ mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file);
+ H5D_MPIO_TRACE_ENTER(mpi_rank);
+ H5D_MPIO_TIME_START(mpi_rank, "Inter collective I/O");
+ if (mpi_rank < 0)
+ HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain MPI rank")
+#endif
+
if ((file_space != NULL) && (mem_space != NULL)) {
int mpi_file_count; /* Number of file "objects" to transfer */
hsize_t *permute_map = NULL; /* array that holds the mapping from the old,
@@ -2117,9 +2453,8 @@ H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
mft_is_derived = FALSE;
} /* end else */
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "before final collective IO \n");
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG(mpi_rank, "before final collective I/O");
#endif
/* Perform final collective I/O operation */
@@ -2133,9 +2468,10 @@ done:
if (mft_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&mpi_file_type)))
HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "before leaving inter_collective_io ret_value = %d\n", ret_value);
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TIME_STOP(mpi_rank);
+ H5D_MPIO_DEBUG_VA(mpi_rank, "before leaving inter_collective_io ret_value = %d", ret_value);
+ H5D_MPIO_TRACE_EXIT(mpi_rank);
#endif
FUNC_LEAVE_NOAPI(ret_value)
@@ -2157,10 +2493,21 @@ static herr_t
H5D__final_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_t mpi_buf_count,
MPI_Datatype mpi_file_type, MPI_Datatype mpi_buf_type)
{
+#ifdef H5Dmpio_DEBUG
+ int mpi_rank;
+#endif
herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
+#ifdef H5Dmpio_DEBUG
+ mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file);
+ H5D_MPIO_TRACE_ENTER(mpi_rank);
+ H5D_MPIO_TIME_START(mpi_rank, "Final collective I/O");
+ if (mpi_rank < 0)
+ HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain MPI rank")
+#endif
+
/* Pass buf type, file type to the file driver. */
if (H5CX_set_mpi_coll_datatypes(mpi_buf_type, mpi_file_type) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set MPI-I/O collective I/O datatypes")
@@ -2175,10 +2522,12 @@ H5D__final_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
} /* end else */
done:
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "ret_value before leaving final_collective_io=%d\n", ret_value);
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TIME_STOP(mpi_rank);
+ H5D_MPIO_DEBUG_VA(mpi_rank, "ret_value before leaving final_collective_io=%d", ret_value);
+ H5D_MPIO_TRACE_EXIT(mpi_rank);
#endif
+
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D__final_collective_io */
@@ -2220,62 +2569,149 @@ H5D__cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2)
*
* Return: -1, 0, 1
*
- * Programmer: Jordan Henderson
- * Wednesday, Nov. 30th, 2016
- *
*-------------------------------------------------------------------------
*/
static int
H5D__cmp_filtered_collective_io_info_entry(const void *filtered_collective_io_info_entry1,
const void *filtered_collective_io_info_entry2)
{
- haddr_t addr1 = HADDR_UNDEF, addr2 = HADDR_UNDEF;
+ const H5D_filtered_collective_io_info_t *entry1;
+ const H5D_filtered_collective_io_info_t *entry2;
+ haddr_t addr1 = HADDR_UNDEF;
+ haddr_t addr2 = HADDR_UNDEF;
+ int ret_value;
FUNC_ENTER_STATIC_NOERR
- addr1 = ((const H5D_filtered_collective_io_info_t *)filtered_collective_io_info_entry1)
- ->chunk_states.new_chunk.offset;
- addr2 = ((const H5D_filtered_collective_io_info_t *)filtered_collective_io_info_entry2)
- ->chunk_states.new_chunk.offset;
+ entry1 = (const H5D_filtered_collective_io_info_t *)filtered_collective_io_info_entry1;
+ entry2 = (const H5D_filtered_collective_io_info_t *)filtered_collective_io_info_entry2;
- FUNC_LEAVE_NOAPI(H5F_addr_cmp(addr1, addr2))
-} /* end H5D__cmp_filtered_collective_io_info_entry() */
+ addr1 = entry1->chunk_new.offset;
+ addr2 = entry2->chunk_new.offset;
-#if MPI_VERSION >= 3
+ /*
+ * If both chunk addresses are defined, H5F_addr_cmp is safe to use.
+ * Otherwise, if neither address is defined, compare chunk
+ * entries based on their chunk index. Finally, if only one chunk
+ * address is defined, return the appropriate value based on which
+ * is defined.
+ */
+ if (H5F_addr_defined(addr1) && H5F_addr_defined(addr2)) {
+ ret_value = H5F_addr_cmp(addr1, addr2);
+ }
+ else if (!H5F_addr_defined(addr1) && !H5F_addr_defined(addr2)) {
+ hsize_t chunk_idx1 = entry1->index_info.chunk_idx;
+ hsize_t chunk_idx2 = entry2->index_info.chunk_idx;
+
+ ret_value = (chunk_idx1 > chunk_idx2) - (chunk_idx1 < chunk_idx2);
+ }
+ else
+ ret_value = H5F_addr_defined(addr1) ? 1 : -1;
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__cmp_filtered_collective_io_info_entry() */
/*-------------------------------------------------------------------------
- * Function: H5D__cmp_filtered_collective_io_info_entry_owner
+ * Function: H5D__cmp_chunk_redistribute_info
*
- * Purpose: Routine to compare filtered collective chunk io info
- * entries's original owner fields
+ * Purpose: Routine to compare two H5D_chunk_redistribute_info_t
+ * structures
*
- * Description: Callback for qsort() to compare filtered collective chunk
- * io info entries's original owner fields
+ * Description: Callback for qsort() to compare two
+ * H5D_chunk_redistribute_info_t structures
*
- * Return: The difference between the two
- * H5D_filtered_collective_io_info_t's original owner fields
+ * Return: -1, 0, 1
*
- * Programmer: Jordan Henderson
- * Monday, Apr. 10th, 2017
+ *-------------------------------------------------------------------------
+ */
+static int
+H5D__cmp_chunk_redistribute_info(const void *_entry1, const void *_entry2)
+{
+ const H5D_chunk_redistribute_info_t *entry1;
+ const H5D_chunk_redistribute_info_t *entry2;
+ hsize_t chunk_index1;
+ hsize_t chunk_index2;
+ int ret_value;
+
+ FUNC_ENTER_STATIC_NOERR
+
+ entry1 = (const H5D_chunk_redistribute_info_t *)_entry1;
+ entry2 = (const H5D_chunk_redistribute_info_t *)_entry2;
+
+ chunk_index1 = entry1->chunk_idx;
+ chunk_index2 = entry2->chunk_idx;
+
+ if (chunk_index1 == chunk_index2) {
+ int orig_owner1 = entry1->orig_owner;
+ int orig_owner2 = entry2->orig_owner;
+
+ ret_value = (orig_owner1 > orig_owner2) - (orig_owner1 < orig_owner2);
+ }
+ else
+ ret_value = (chunk_index1 > chunk_index2) - (chunk_index1 < chunk_index2);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__cmp_chunk_redistribute_info() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D__cmp_chunk_redistribute_info_orig_owner
+ *
+ * Purpose: Routine to compare the original owning MPI rank for two
+ * H5D_chunk_redistribute_info_t structures
+ *
+ * Description: Callback for qsort() to compare the original owning MPI
+ * rank for two H5D_chunk_redistribute_info_t
+ * structures
+ *
+ * Return: -1, 0, 1
*
*-------------------------------------------------------------------------
*/
static int
-H5D__cmp_filtered_collective_io_info_entry_owner(const void *filtered_collective_io_info_entry1,
- const void *filtered_collective_io_info_entry2)
+H5D__cmp_chunk_redistribute_info_orig_owner(const void *_entry1, const void *_entry2)
{
- int owner1 = -1, owner2 = -1;
+ const H5D_chunk_redistribute_info_t *entry1;
+ const H5D_chunk_redistribute_info_t *entry2;
+ int owner1 = -1;
+ int owner2 = -1;
+ int ret_value;
FUNC_ENTER_STATIC_NOERR
- owner1 = ((const H5D_filtered_collective_io_info_t *)filtered_collective_io_info_entry1)
- ->owners.original_owner;
- owner2 = ((const H5D_filtered_collective_io_info_t *)filtered_collective_io_info_entry2)
- ->owners.original_owner;
+ entry1 = (const H5D_chunk_redistribute_info_t *)_entry1;
+ entry2 = (const H5D_chunk_redistribute_info_t *)_entry2;
- FUNC_LEAVE_NOAPI(owner1 - owner2)
-} /* end H5D__cmp_filtered_collective_io_info_entry_owner() */
-#endif
+ owner1 = entry1->orig_owner;
+ owner2 = entry2->orig_owner;
+
+ if (owner1 == owner2) {
+ haddr_t addr1 = entry1->chunk_block.offset;
+ haddr_t addr2 = entry2->chunk_block.offset;
+
+ /*
+ * If both chunk addresses are defined, H5F_addr_cmp is safe to use.
+ * Otherwise, if neither address is defined, compare chunk
+ * entries based on their chunk index. Finally, if only one chunk
+ * address is defined, return the appropriate value based on which
+ * is defined.
+ */
+ if (H5F_addr_defined(addr1) && H5F_addr_defined(addr2)) {
+ ret_value = H5F_addr_cmp(addr1, addr2);
+ }
+ else if (!H5F_addr_defined(addr1) && !H5F_addr_defined(addr2)) {
+ hsize_t chunk_idx1 = entry1->chunk_idx;
+ hsize_t chunk_idx2 = entry2->chunk_idx;
+
+ ret_value = (chunk_idx1 > chunk_idx2) - (chunk_idx1 < chunk_idx2);
+ }
+ else
+ ret_value = H5F_addr_defined(addr1) ? 1 : -1;
+ }
+ else
+ ret_value = (owner1 > owner2) - (owner1 < owner2);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__cmp_chunk_redistribute_info_orig_owner() */
/*-------------------------------------------------------------------------
* Function: H5D__sort_chunk
@@ -2304,26 +2740,24 @@ H5D__cmp_filtered_collective_io_info_entry_owner(const void *filtered_collective
*/
static herr_t
H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
- H5D_chunk_addr_info_t chunk_addr_info_array[], int sum_chunk)
+ H5D_chunk_addr_info_t chunk_addr_info_array[], int sum_chunk, int mpi_rank, int mpi_size)
{
- H5SL_node_t * chunk_node; /* Current node in chunk skip list */
- H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */
- haddr_t chunk_addr; /* Current chunking address of this node */
- haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */
- hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */
- int bsearch_coll_chunk_threshold;
- int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND;
- int mpi_size; /* Number of MPI processes */
- int mpi_code; /* MPI return code */
- int i; /* Local index variable */
- herr_t ret_value = SUCCEED; /* Return value */
+ H5SL_node_t * chunk_node; /* Current node in chunk skip list */
+ H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */
+ haddr_t chunk_addr; /* Current chunking address of this node */
+ haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */
+ H5P_coll_md_read_flag_t md_reads_file_flag;
+ hbool_t md_reads_context_flag;
+ hbool_t restore_md_reads_state = FALSE;
+ hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */
+ int bsearch_coll_chunk_threshold;
+ int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND;
+ int mpi_code; /* MPI return code */
+ int i; /* Local index variable */
+ herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_STATIC
- /* Retrieve # of MPI processes */
- if ((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0)
- HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size")
-
/* Calculate the actual threshold to obtain all chunk addresses collectively
* The bigger this number is, the more possible the use of obtaining chunk
* address collectively.
@@ -2337,30 +2771,47 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
((sum_chunk / mpi_size) >= H5D_ALL_CHUNK_ADDR_THRES_COL_NUM))
many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_COL;
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "many_chunk_opt= %d\n", many_chunk_opt);
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG_VA(mpi_rank, "many_chunk_opt = %d", many_chunk_opt);
#endif
/* If we need to optimize the way to obtain the chunk address */
if (many_chunk_opt != H5D_OBTAIN_ONE_CHUNK_ADDR_IND) {
- int mpi_rank;
-
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n");
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG(mpi_rank, "Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL");
#endif
/* Allocate array for chunk addresses */
if (NULL == (total_chunk_addr_array =
(haddr_t *)H5MM_malloc(sizeof(haddr_t) * (size_t)fm->layout->u.chunk.nchunks)))
HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate memory chunk address array")
- /* Retrieve all the chunk addresses with process 0 */
- if ((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0)
- HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
-
if (mpi_rank == 0) {
- if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0) {
+ herr_t result;
+
+ /*
+ * If enabled, disable collective metadata reads here.
+ * Since the chunk address mapping is done on rank 0
+ * only here, it will cause problems if collective
+ * metadata reads are enabled.
+ */
+ if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) {
+ md_reads_file_flag = H5P_FORCE_FALSE;
+ md_reads_context_flag = FALSE;
+ H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag,
+ &md_reads_context_flag);
+ restore_md_reads_state = TRUE;
+ }
+
+ result = H5D__chunk_addrmap(io_info, total_chunk_addr_array);
+
+ /* Ensure that we restore the old collective metadata reads state */
+ if (restore_md_reads_state) {
+ H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag,
+ &md_reads_context_flag);
+ restore_md_reads_state = FALSE;
+ }
+
+ if (result < 0) {
size_t u;
/* Clear total chunk address array */
@@ -2413,10 +2864,10 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
chunk_node = H5SL_next(chunk_node);
} /* end while */
-#ifdef H5D_DEBUG
- if (H5DEBUG(D))
- HDfprintf(H5DEBUG(D), "before Qsort\n");
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_DEBUG(mpi_rank, "before Qsort");
#endif
+
if (do_sort) {
size_t num_chunks = H5SL_count(fm->sel_chunks);
@@ -2424,6 +2875,10 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
} /* end if */
done:
+ /* Re-enable collective metadata reads if we disabled them */
+ if (restore_md_reads_state)
+ H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag);
+
if (total_chunk_addr_array)
H5MM_xfree(total_chunk_addr_array);
@@ -2469,22 +2924,24 @@ done:
*/
static herr_t
H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assign_io_mode[],
- haddr_t chunk_addr[])
+ haddr_t chunk_addr[], int mpi_rank, int mpi_size)
{
- size_t total_chunks;
- unsigned percent_nproc_per_chunk, threshold_nproc_per_chunk;
- uint8_t * io_mode_info = NULL;
- uint8_t * recv_io_mode_info = NULL;
- uint8_t * mergebuf = NULL;
- uint8_t * tempbuf;
- H5SL_node_t * chunk_node;
- H5D_chunk_info_t *chunk_info;
- int mpi_size, mpi_rank;
- MPI_Comm comm;
- int root;
- size_t ic;
- int mpi_code;
- herr_t ret_value = SUCCEED;
+ size_t total_chunks;
+ unsigned percent_nproc_per_chunk, threshold_nproc_per_chunk;
+ uint8_t * io_mode_info = NULL;
+ uint8_t * recv_io_mode_info = NULL;
+ uint8_t * mergebuf = NULL;
+ uint8_t * tempbuf;
+ H5SL_node_t * chunk_node;
+ H5D_chunk_info_t * chunk_info;
+ H5P_coll_md_read_flag_t md_reads_file_flag;
+ hbool_t md_reads_context_flag;
+ hbool_t restore_md_reads_state = FALSE;
+ MPI_Comm comm;
+ int root;
+ size_t ic;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
@@ -2492,12 +2949,6 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig
root = 0;
comm = io_info->comm;
- /* Obtain the number of process and the current rank of the process */
- if ((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0)
- HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
- if ((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0)
- HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size")
-
/* Setup parameters */
H5_CHECKED_ASSIGN(total_chunks, size_t, fm->layout->u.chunk.nchunks, hsize_t);
if (H5CX_get_mpio_chunk_opt_ratio(&percent_nproc_per_chunk) < 0)
@@ -2544,6 +2995,20 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig
size_t nproc;
unsigned *nproc_per_chunk;
+ /*
+ * If enabled, disable collective metadata reads here.
+ * Since the chunk address mapping is done on rank 0
+ * only here, it will cause problems if collective
+ * metadata reads are enabled.
+ */
+ if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) {
+ md_reads_file_flag = H5P_FORCE_FALSE;
+ md_reads_context_flag = FALSE;
+ H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag,
+ &md_reads_context_flag);
+ restore_md_reads_state = TRUE;
+ }
+
/* pre-computing: calculate number of processes and
regularity of the selection occupied in each chunk */
if (NULL == (nproc_per_chunk = (unsigned *)H5MM_calloc(total_chunks * sizeof(unsigned))))
@@ -2610,6 +3075,10 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig
#endif
done:
+ /* Re-enable collective metadata reads if we disabled them */
+ if (restore_md_reads_state)
+ H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag);
+
if (io_mode_info)
H5MM_free(io_mode_info);
if (mergebuf)
@@ -2623,34 +3092,32 @@ done:
} /* end H5D__obtain_mpio_mode() */
/*-------------------------------------------------------------------------
- * Function: H5D__construct_filtered_io_info_list
+ * Function: H5D__mpio_collective_filtered_chunk_io_setup
*
* Purpose: Constructs a list of entries which contain the necessary
* information for inter-process communication when performing
* collective io on filtered chunks. This list is used by
- * each process when performing I/O on locally selected chunks
- * and also in operations that must be collectively done
- * on every chunk, such as chunk re-allocation, insertion of
- * chunks into the chunk index, etc.
+ * each MPI rank when performing I/O on locally selected
+ * chunks and also in operations that must be collectively
+ * done on every chunk, such as chunk re-allocation, insertion
+ * of chunks into the chunk index, etc.
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer: Jordan Henderson
- * Tuesday, January 10th, 2017
- *
*-------------------------------------------------------------------------
*/
static herr_t
-H5D__construct_filtered_io_info_list(const H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
- const H5D_chunk_map_t * fm,
- H5D_filtered_collective_io_info_t **chunk_list, size_t *num_entries)
+H5D__mpio_collective_filtered_chunk_io_setup(const H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
+ const H5D_chunk_map_t * fm,
+ H5D_filtered_collective_io_info_t **chunk_list,
+ size_t *num_entries, int mpi_rank)
{
- H5D_filtered_collective_io_info_t *local_info_array =
- NULL; /* The list of initially selected chunks for this process */
- size_t num_chunks_selected;
- size_t i;
- int mpi_rank;
- herr_t ret_value = SUCCEED;
+ H5D_filtered_collective_io_info_t *local_info_array = NULL;
+ H5D_chunk_ud_t udata;
+ hbool_t filter_partial_edge_chunks;
+ size_t num_chunks_selected;
+ size_t i;
+ herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
@@ -2660,19 +3127,23 @@ H5D__construct_filtered_io_info_list(const H5D_io_info_t *io_info, const H5D_typ
HDassert(chunk_list);
HDassert(num_entries);
- if ((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0)
- HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TRACE_ENTER(mpi_rank);
+ H5D_MPIO_TIME_START(mpi_rank, "Filtered Collective I/O Setup");
+#endif
- /* Each process builds a local list of the chunks they have selected */
+ /* Each rank builds a local list of the chunks it has selected */
if ((num_chunks_selected = H5SL_count(fm->sel_chunks))) {
H5D_chunk_info_t *chunk_info;
- H5D_chunk_ud_t udata;
H5SL_node_t * chunk_node;
hsize_t select_npoints;
- hssize_t chunk_npoints;
+ hbool_t need_sort = FALSE;
- if (NULL == (local_info_array = (H5D_filtered_collective_io_info_t *)H5MM_malloc(
- num_chunks_selected * sizeof(H5D_filtered_collective_io_info_t))))
+ /* Determine whether partial edge chunks should be filtered */
+ filter_partial_edge_chunks = !(io_info->dset->shared->layout.u.chunk.flags &
+ H5O_LAYOUT_CHUNK_DONT_FILTER_PARTIAL_BOUND_CHUNKS);
+
+ if (NULL == (local_info_array = H5MM_malloc(num_chunks_selected * sizeof(*local_info_array))))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate local io info array buffer")
chunk_node = H5SL_first(fm->sel_chunks);
@@ -2683,275 +3154,787 @@ H5D__construct_filtered_io_info_list(const H5D_io_info_t *io_info, const H5D_typ
if (H5D__chunk_lookup(io_info->dset, chunk_info->scaled, &udata) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "error looking up chunk address")
- local_info_array[i].index = chunk_info->index;
- local_info_array[i].chunk_states.chunk_current = local_info_array[i].chunk_states.new_chunk =
- udata.chunk_block;
- local_info_array[i].num_writers = 0;
- local_info_array[i].owners.original_owner = local_info_array[i].owners.new_owner = mpi_rank;
- local_info_array[i].buf = NULL;
-
- local_info_array[i].async_info.num_receive_requests = 0;
- local_info_array[i].async_info.receive_buffer_array = NULL;
- local_info_array[i].async_info.receive_requests_array = NULL;
-
- H5MM_memcpy(local_info_array[i].scaled, chunk_info->scaled, sizeof(chunk_info->scaled));
-
- select_npoints = H5S_GET_SELECT_NPOINTS(chunk_info->mspace);
- local_info_array[i].io_size = (size_t)select_npoints * type_info->src_type_size;
-
- /* Currently the full overwrite status of a chunk is only obtained on a per-process
- * basis. This means that if the total selection in the chunk, as determined by the combination
- * of selections of all of the processes interested in the chunk, covers the entire chunk,
- * the performance optimization of not reading the chunk from the file is still valid, but
- * is not applied in the current implementation. Something like an appropriately placed
- * MPI_Allreduce or a running total of the number of chunk points selected during chunk
- * redistribution should suffice for implementing this case - JTH.
+ /* Initialize rank-local chunk info */
+ local_info_array[i].chunk_info = chunk_info;
+ local_info_array[i].chunk_buf_size = 0;
+ local_info_array[i].num_writers = 0;
+ local_info_array[i].orig_owner = mpi_rank;
+ local_info_array[i].new_owner = mpi_rank;
+ local_info_array[i].buf = NULL;
+
+ select_npoints = H5S_GET_SELECT_NPOINTS(chunk_info->fspace);
+ local_info_array[i].io_size = (size_t)select_npoints * type_info->dst_type_size;
+
+ /*
+ * Determine whether this chunk will need to be read from the file. If this is
+ * a read operation, the chunk will be read. If this is a write operation, we
+ * generally need to read a filtered chunk from the file before modifying it,
+ * unless the chunk is being fully overwritten.
+ *
+ * TODO: Currently the full overwrite status of a chunk is only obtained on a
+ * per-rank basis. This means that if the total selection in the chunk, as
+ * determined by the combination of selections of all of the ranks interested in
+ * the chunk, covers the entire chunk, the performance optimization of not reading
+ * the chunk from the file is still valid, but is not applied in the current
+ * implementation.
+ *
+ * To implement this case, a few approaches were considered:
+ *
+ * - Keep a running total (distributed to each rank) of the number of chunk
+ * elements selected during chunk redistribution and compare that to the total
+ * number of elements in the chunk once redistribution is finished
+ *
+ * - Process all incoming chunk messages before doing I/O (these are currently
+ * processed AFTER doing I/O), combine the owning rank's selection in a chunk
+ * with the selections received from other ranks and check to see whether that
+ * combined selection covers the entire chunk
+ *
+ * The first approach will be dangerous if the application performs an overlapping
+ * write to a chunk, as the number of selected elements can equal or exceed the
+ * number of elements in the chunk without the whole chunk selection being covered.
+ * While it might be considered erroneous for an application to do an overlapping
+ * write, we don't explicitly disallow it.
+ *
+ * The second approach contains a bit of complexity in that part of the chunk
+ * messages will be needed before doing I/O and part will be needed after doing I/O.
+ * Since modification data from chunk messages can't be applied until after any I/O
+ * is performed (otherwise, we'll overwrite any applied modification data), chunk
+ * messages are currently entirely processed after I/O. However, in order to determine
+ * if a chunk is being fully overwritten, we need the dataspace portion of the chunk
+ * messages before doing I/O. The naive way to do this is to process chunk messages
+ * twice, using just the relevant information from the message before and after I/O.
+ * The better way would be to avoid processing chunk messages twice by extracting (and
+ * keeping around) the dataspace portion of the message before I/O and processing the
+ * rest of the chunk message after I/O. Note that the dataspace portion of each chunk
+ * message is used to correctly apply chunk modification data from the message, so
+ * must be kept around both before and after I/O in this case.
+ */
+ if (io_info->op_type == H5D_IO_OP_READ)
+ local_info_array[i].need_read = TRUE;
+ else {
+ local_info_array[i].need_read =
+ local_info_array[i].io_size < (size_t)io_info->dset->shared->layout.u.chunk.size;
+ }
+
+ local_info_array[i].skip_filter_pline = FALSE;
+ if (!filter_partial_edge_chunks) {
+ /*
+ * If this is a partial edge chunk and the "don't filter partial edge
+ * chunks" flag is set, make sure not to apply filters to the chunk.
+ */
+ if (H5D__chunk_is_partial_edge_chunk(io_info->dset->shared->ndims,
+ io_info->dset->shared->layout.u.chunk.dim,
+ chunk_info->scaled, io_info->dset->shared->curr_dims))
+ local_info_array[i].skip_filter_pline = TRUE;
+ }
+
+ /* Initialize the chunk's shared info */
+ local_info_array[i].chunk_current = udata.chunk_block;
+ local_info_array[i].chunk_new = udata.chunk_block;
+
+ /*
+ * Check if the list is not in ascending order of offset in the file
+ * or has unallocated chunks. In either case, the list should get
+ * sorted.
+ */
+ if (i) {
+ haddr_t curr_chunk_offset = local_info_array[i].chunk_current.offset;
+ haddr_t prev_chunk_offset = local_info_array[i - 1].chunk_current.offset;
+
+ if (!H5F_addr_defined(prev_chunk_offset) || !H5F_addr_defined(curr_chunk_offset) ||
+ (curr_chunk_offset < prev_chunk_offset))
+ need_sort = TRUE;
+ }
+
+ /*
+ * Extensible arrays may calculate a chunk's index a little differently
+ * than normal when the dataset's unlimited dimension is not the
+ * slowest-changing dimension, so set the index here based on what the
+ * extensible array code calculated instead of what was calculated
+ * in the chunk file mapping.
*/
- if ((chunk_npoints = H5S_GET_EXTENT_NPOINTS(chunk_info->fspace)) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTCOUNT, FAIL, "dataspace is invalid")
- local_info_array[i].full_overwrite =
- (local_info_array[i].io_size >= (hsize_t)chunk_npoints * type_info->dst_type_size) ? TRUE
- : FALSE;
+ if (io_info->dset->shared->layout.u.chunk.idx_type == H5D_CHUNK_IDX_EARRAY)
+ local_info_array[i].index_info.chunk_idx = udata.chunk_idx;
+ else
+ local_info_array[i].index_info.chunk_idx = chunk_info->index;
+
+ local_info_array[i].index_info.filter_mask = udata.filter_mask;
+ local_info_array[i].index_info.need_insert = FALSE;
chunk_node = H5SL_next(chunk_node);
- } /* end for */
- } /* end if */
+ }
- /* Redistribute shared chunks to new owners as necessary */
- if (io_info->op_type == H5D_IO_OP_WRITE)
-#if MPI_VERSION >= 3
- if (H5D__chunk_redistribute_shared_chunks(io_info, type_info, fm, local_info_array,
- &num_chunks_selected) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "unable to redistribute shared chunks")
-#else
- HGOTO_ERROR(
- H5E_DATASET, H5E_WRITEERROR, FAIL,
- "unable to redistribute shared chunks - MPI version < 3 (MPI_Mprobe and MPI_Imrecv missing)")
+ /* Ensure the chunk list is sorted in ascending order of offset in the file */
+ if (need_sort)
+ HDqsort(local_info_array, num_chunks_selected, sizeof(H5D_filtered_collective_io_info_t),
+ H5D__cmp_filtered_collective_io_info_entry);
+
+#ifdef H5Dmpio_DEBUG
+ H5D__mpio_dump_collective_filtered_chunk_list(local_info_array, num_chunks_selected, mpi_rank);
#endif
+ }
+ else if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) {
+ hsize_t scaled[H5O_LAYOUT_NDIMS] = {0};
+
+ /*
+ * If this rank has no selection in the dataset and collective
+ * metadata reads are enabled, do a fake lookup of a chunk to
+ * ensure that this rank has the chunk index opened. Otherwise,
+ * only the ranks that had a selection will have opened the
+ * chunk index and they will have done so independently. Therefore,
+ * when ranks with no selection participate in later collective
+ * metadata reads, they will try to open the chunk index collectively
+ * and issues will occur since other ranks won't participate.
+ *
+ * In the future, we should consider having a chunk index "open"
+ * callback that can be used to ensure collectivity between ranks
+ * in a more natural way, but this hack should suffice for now.
+ */
+ if (H5D__chunk_lookup(io_info->dset, scaled, &udata) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "error looking up chunk address")
+ }
*chunk_list = local_info_array;
*num_entries = num_chunks_selected;
done:
- FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D__construct_filtered_io_info_list() */
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TIME_STOP(mpi_rank);
+ H5D_MPIO_TRACE_EXIT(mpi_rank);
+#endif
-#if MPI_VERSION >= 3
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__mpio_collective_filtered_chunk_io_setup() */
/*-------------------------------------------------------------------------
- * Function: H5D__chunk_redistribute_shared_chunks
- *
- * Purpose: When performing a collective write on a Dataset with
- * filters applied, this function is used to redistribute any
- * chunks which are selected by more than one process, so as
- * to preserve file integrity after the write by ensuring
- * that any shared chunks are only modified by one process.
- *
- * The current implementation follows this 3-phase process:
- *
- * - Collect everyone's list of chunks into one large list,
- * sort the list in increasing order of chunk offset in the
- * file and hand the list off to rank 0
- *
- * - Rank 0 scans the list looking for matching runs of chunk
- * offset in the file (corresponding to a shared chunk which
- * has been selected by more than one rank in the I/O
- * operation) and for each shared chunk, it redistributes
- * the chunk to the process writing to the chunk which
- * currently has the least amount of chunks assigned to it
- * by modifying the "new_owner" field in each of the list
- * entries corresponding to that chunk
- *
- * - After the chunks have been redistributed, rank 0 re-sorts
- * the list in order of previous owner so that each rank
- * will get back exactly the array that they contributed to
- * the redistribution operation, with the "new_owner" field
- * of each chunk they are modifying having possibly been
- * modified. Rank 0 then scatters each segment of the list
- * back to its corresponding rank
+ * Function: H5D__mpio_redistribute_shared_chunks
+ *
+ * Purpose: When performing a parallel write on a chunked Dataset with
+ * filters applied, we must ensure that any particular chunk
+ * is only written to by a single MPI rank in order to avoid
+ * potential data races on the chunk. This function is used to
+ * redistribute (by assigning ownership to a single rank) any
+ * chunks which are selected by more than one MPI rank.
+ *
+ * An initial Allgather is performed to determine how many
+ * chunks each rank has selected in the write operation and
+ * then that number is compared against a threshold value to
+ * determine whether chunk redistribution should be done on
+ * MPI rank 0 only, or on all MPI ranks.
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer: Jordan Henderson
- * Monday, May 1, 2017
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D__mpio_redistribute_shared_chunks(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries, const H5D_io_info_t *io_info,
+ const H5D_chunk_map_t *fm, int mpi_rank, int mpi_size,
+ size_t **rank_chunks_assigned_map)
+{
+ hbool_t redistribute_on_all_ranks;
+ size_t *num_chunks_map = NULL;
+ size_t coll_chunk_list_size = 0;
+ size_t i;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+ HDassert(chunk_list || 0 == chunk_list_num_entries);
+ HDassert(io_info);
+ HDassert(fm);
+ HDassert(mpi_size > 1); /* No chunk sharing is possible for MPI Comm size of 1 */
+
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TRACE_ENTER(mpi_rank);
+ H5D_MPIO_TIME_START(mpi_rank, "Redistribute shared chunks");
+#endif
+
+ /*
+ * Allocate an array for each rank to keep track of the number of
+ * chunks assigned to any other rank in order to cut down on future
+ * MPI communication.
+ */
+ if (NULL == (num_chunks_map = H5MM_malloc((size_t)mpi_size * sizeof(*num_chunks_map))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate assigned chunks array")
+
+ /* Perform initial Allgather to determine the collective chunk list size */
+ if (MPI_SUCCESS != (mpi_code = MPI_Allgather(&chunk_list_num_entries, 1, H5_SIZE_T_AS_MPI_TYPE,
+ num_chunks_map, 1, H5_SIZE_T_AS_MPI_TYPE, io_info->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code)
+
+ for (i = 0; i < (size_t)mpi_size; i++)
+ coll_chunk_list_size += num_chunks_map[i];
+
+ /*
+ * Determine whether we should perform chunk redistribution on all
+ * ranks or just rank 0. For a relatively small number of chunks,
+ * we redistribute on all ranks to cut down on MPI communication
+ * overhead. For a larger number of chunks, we redistribute on
+ * rank 0 only to cut down on memory usage.
+ */
+ redistribute_on_all_ranks = coll_chunk_list_size < H5D_CHUNK_REDISTRIBUTE_THRES;
+
+ if (H5D__mpio_redistribute_shared_chunks_int(chunk_list, num_chunks_map, redistribute_on_all_ranks,
+ io_info, fm, mpi_rank, mpi_size) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTREDISTRIBUTE, FAIL, "can't redistribute shared chunks")
+
+ /*
+ * If the caller provided a pointer for the mapping from
+ * rank value -> number of chunks assigned, return that
+ * mapping here.
+ */
+ if (rank_chunks_assigned_map) {
+ /*
+ * If we performed chunk redistribution on rank 0 only, distribute
+ * the rank value -> number of chunks assigned mapping back to all
+ * ranks.
+ */
+ if (!redistribute_on_all_ranks) {
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Bcast(num_chunks_map, mpi_size, H5_SIZE_T_AS_MPI_TYPE, 0, io_info->comm)))
+ HMPI_GOTO_ERROR(FAIL, "couldn't broadcast chunk mapping to other ranks", mpi_code)
+ }
+
+ *rank_chunks_assigned_map = num_chunks_map;
+ }
+
+done:
+ if (!rank_chunks_assigned_map || (ret_value < 0)) {
+ num_chunks_map = H5MM_xfree(num_chunks_map);
+ }
+
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TIME_STOP(mpi_rank);
+ H5D_MPIO_TRACE_EXIT(mpi_rank);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__mpio_redistribute_shared_chunks() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D__mpio_redistribute_shared_chunks_int
+ *
+ * Purpose: Routine to perform redistribution of shared chunks during
+ * parallel writes to datasets with filters applied.
+ *
+ * If `all_ranks_involved` is TRUE, chunk redistribution
+ * occurs on all MPI ranks. This is usually done when there
+ * is a relatively small number of chunks involved in order to
+ * cut down on MPI communication overhead while increasing
+ * total memory usage a bit.
+ *
+ * If `all_ranks_involved` is FALSE, only rank 0 will perform
+ * chunk redistribution. This is usually done when there is
+ * a relatively large number of chunks involved in order to
+ * cut down on total memory usage at the cost of increased
+ * overhead from MPI communication.
+ *
+ * This implementation is as follows:
+ *
+ * - All MPI ranks send their list of selected chunks to the
+ * ranks involved in chunk redistribution. Then, the
+ * involved ranks sort this new list in order of chunk
+ * index.
+ *
+ * - The involved ranks scan the list looking for matching
+ * runs of chunk index values (corresponding to a shared
+ * chunk which has been selected by more than one rank in
+ * the I/O operation) and for each shared chunk,
+ * redistribute the chunk to the MPI rank writing to the
+ * chunk which currently has the fewest chunks
+ * assigned to it. This is done by modifying the "new_owner"
+ * field in each of the list entries corresponding to that
+ * chunk. The involved ranks then re-sort the list in order
+ * of original chunk owner so that each rank's section of
+ * contributed chunks is contiguous in the collective chunk
+ * list.
+ *
+ * - If chunk redistribution occurred on all ranks, each rank
+ * scans through the collective chunk list to find their
+ * contributed section of chunks and uses that to update
+ * their local chunk list with the newly-updated "new_owner"
+ * and "num_writers" fields. If chunk redistribution
+ * occurred only on rank 0, an MPI_Scatterv operation will
+ * be used to scatter the segments of the collective chunk
+ * list from rank 0 back to the corresponding ranks.
+ *
+ * Return: Non-negative on success/Negative on failure
*
*-------------------------------------------------------------------------
*/
static herr_t
-H5D__chunk_redistribute_shared_chunks(const H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
- const H5D_chunk_map_t * fm,
- H5D_filtered_collective_io_info_t *local_chunk_array,
- size_t * local_chunk_array_num_entries)
+H5D__mpio_redistribute_shared_chunks_int(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t *num_chunks_assigned_map, hbool_t all_ranks_involved,
+ const H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
+ int mpi_rank, int mpi_size)
{
- H5D_filtered_collective_io_info_t *shared_chunks_info_array =
- NULL; /* The list of all chunks selected in the operation by all processes */
- H5S_sel_iter_t *mem_iter = NULL; /* Memory iterator for H5D__gather_mem */
- unsigned char **mod_data =
- NULL; /* Array of chunk modification data buffers sent by a process to new chunk owners */
- MPI_Request *send_requests = NULL; /* Array of MPI_Isend chunk modification data send requests */
- MPI_Status * send_statuses = NULL; /* Array of MPI_Isend chunk modification send statuses */
- hbool_t mem_iter_init = FALSE;
- size_t shared_chunks_info_array_num_entries = 0;
- size_t num_send_requests = 0;
- size_t * num_assigned_chunks_array = NULL;
- size_t i, last_assigned_idx;
- int * send_counts = NULL;
- int * send_displacements = NULL;
- int scatter_recvcount_int;
- int mpi_rank, mpi_size, mpi_code;
+ MPI_Datatype struct_type;
+ MPI_Datatype packed_type;
+ hbool_t struct_type_derived = FALSE;
+ hbool_t packed_type_derived = FALSE;
+ size_t i;
+ size_t coll_chunk_list_num_entries = 0;
+ void * coll_chunk_list = NULL;
+ int * counts_disps_array = NULL;
+ int * counts_ptr = NULL;
+ int * displacements_ptr = NULL;
+ int num_chunks_int;
+ int mpi_code;
herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
+ HDassert(num_chunks_assigned_map);
+ HDassert(chunk_list || 0 == num_chunks_assigned_map[mpi_rank]);
HDassert(io_info);
- HDassert(type_info);
HDassert(fm);
- HDassert(local_chunk_array_num_entries);
+ HDassert(mpi_size > 1);
- if ((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0)
- HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
- if ((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0)
- HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size")
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TRACE_ENTER(mpi_rank);
+ H5D_MPIO_TIME_START(mpi_rank, "Redistribute shared chunks (internal)");
+#endif
- /* Set to latest format for encoding dataspace */
- H5CX_set_libver_bounds(NULL);
+ /*
+ * Make sure it's safe to cast this rank's number
+ * of chunks to be sent into an int for MPI
+ */
+ H5_CHECKED_ASSIGN(num_chunks_int, int, num_chunks_assigned_map[mpi_rank], size_t);
+
+ /*
+ * Phase 1 - Participate in collective gathering of every rank's
+ * list of chunks to the ranks which are performing the redistribution
+ * operation.
+ */
- if (*local_chunk_array_num_entries)
- if (NULL == (send_requests =
- (MPI_Request *)H5MM_malloc(*local_chunk_array_num_entries * sizeof(MPI_Request))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate send requests buffer")
+ if (all_ranks_involved || (mpi_rank == 0)) {
+ /*
+ * Allocate array to store the receive counts of each rank, as well as
+ * the displacements into the final array where each rank will place
+ * their data. The first half of the array contains the receive counts
+ * (in rank order), while the latter half contains the displacements
+ * (also in rank order).
+ */
+ if (NULL == (counts_disps_array = H5MM_malloc(2 * (size_t)mpi_size * sizeof(*counts_disps_array)))) {
+ /* Push an error, but still participate in collective gather operation */
+ HDONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate receive counts and displacements array")
+ }
+ else {
+ /* Set the receive counts from the assigned chunks map */
+ counts_ptr = counts_disps_array;
- if (NULL == (mem_iter = (H5S_sel_iter_t *)H5MM_malloc(sizeof(H5S_sel_iter_t))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate memory iterator")
+ for (i = 0; i < (size_t)mpi_size; i++)
+ H5_CHECKED_ASSIGN(counts_ptr[i], int, num_chunks_assigned_map[i], size_t);
+
+ /* Set the displacements into the receive buffer for the gather operation */
+ displacements_ptr = &counts_disps_array[mpi_size];
+
+ *displacements_ptr = 0;
+ for (i = 1; i < (size_t)mpi_size; i++)
+ displacements_ptr[i] = displacements_ptr[i - 1] + counts_ptr[i - 1];
+ }
+ }
+
+ /*
+ * Construct MPI derived types for extracting information
+ * necessary for MPI communication
+ */
+ if (H5D__mpio_get_chunk_redistribute_info_types(&packed_type, &packed_type_derived, &struct_type,
+ &struct_type_derived) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL,
+ "can't create derived datatypes for chunk redistribution info")
+
+ /* Perform gather operation */
+ if (H5_mpio_gatherv_alloc(chunk_list, num_chunks_int, struct_type, counts_ptr, displacements_ptr,
+ packed_type, all_ranks_involved, 0, io_info->comm, mpi_rank, mpi_size,
+ &coll_chunk_list, &coll_chunk_list_num_entries) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGATHER, FAIL,
+ "can't gather chunk redistribution info to involved ranks")
+
+ /*
+ * If all ranks are redistributing shared chunks, we no
+ * longer need the receive counts and displacements array
+ */
+ if (all_ranks_involved) {
+ counts_disps_array = H5MM_xfree(counts_disps_array);
+ }
- /* Gather every rank's list of chunks to rank 0 to allow it to perform the redistribution operation. After
- * this call, the gathered list will initially be sorted in increasing order of chunk offset in the file.
+ /*
+ * Phase 2 - Involved ranks now redistribute any shared chunks to new
+ * owners as necessary.
*/
- if (H5D__mpio_array_gatherv(local_chunk_array, *local_chunk_array_num_entries,
- sizeof(H5D_filtered_collective_io_info_t), (void **)&shared_chunks_info_array,
- &shared_chunks_info_array_num_entries, false, 0, io_info->comm,
- H5D__cmp_filtered_collective_io_info_entry) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGATHER, FAIL, "couldn't gather array")
- /* Rank 0 redistributes any shared chunks to new owners as necessary */
- if (mpi_rank == 0) {
- if (NULL == (send_counts = (int *)H5MM_calloc((size_t)mpi_size * sizeof(int))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "unable to allocate send counts buffer")
+ if (all_ranks_involved || (mpi_rank == 0)) {
+ H5D_chunk_redistribute_info_t *chunk_entry;
+ hsize_t curr_chunk_idx;
+ size_t set_begin_index;
+ int num_writers;
+ int new_chunk_owner;
- if (NULL == (send_displacements = (int *)H5MM_malloc((size_t)mpi_size * sizeof(int))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "unable to allocate send displacements buffer")
+ /* Clear the mapping from rank value -> number of assigned chunks */
+ HDmemset(num_chunks_assigned_map, 0, (size_t)mpi_size * sizeof(*num_chunks_assigned_map));
- if (NULL == (num_assigned_chunks_array = (size_t *)H5MM_calloc((size_t)mpi_size * sizeof(size_t))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
- "unable to allocate number of assigned chunks array")
+ /* Sort collective chunk list according to chunk index */
+ HDqsort(coll_chunk_list, coll_chunk_list_num_entries, sizeof(H5D_chunk_redistribute_info_t),
+ H5D__cmp_chunk_redistribute_info);
- for (i = 0; i < shared_chunks_info_array_num_entries;) {
- H5D_filtered_collective_io_info_t *chunk_entry;
- haddr_t last_seen_addr = shared_chunks_info_array[i].chunk_states.chunk_current.offset;
- size_t set_begin_index = i;
- size_t num_writers = 0;
- int new_chunk_owner = shared_chunks_info_array[i].owners.original_owner;
+ /*
+ * Process all chunks in the collective chunk list.
+ * Note that the outer loop has no increment expression;
+ * the loop counter is advanced only by the inner
+ * do-while loop, once for every entry it processes
+ * (including the duplicate entries for shared
+ * chunks).
+ */
+ chunk_entry = &((H5D_chunk_redistribute_info_t *)coll_chunk_list)[0];
+ for (i = 0; i < coll_chunk_list_num_entries;) {
+ /* Set chunk's initial new owner to its original owner */
+ new_chunk_owner = chunk_entry->orig_owner;
+
+ /*
+ * Set the current chunk index so we know when we've processed
+ * all duplicate entries for a particular shared chunk
+ */
+ curr_chunk_idx = chunk_entry->chunk_idx;
- /* Process each set of duplicate entries caused by another process writing to the same chunk */
- do {
- chunk_entry = &shared_chunks_info_array[i];
+ /* Reset the initial number of writers to this chunk */
+ num_writers = 0;
- send_counts[chunk_entry->owners.original_owner] += (int)sizeof(*chunk_entry);
+ /* Set index for the beginning of this section of duplicate chunk entries */
+ set_begin_index = i;
- /* The new owner of the chunk is determined by the process
+ /*
+ * Process each chunk entry in the set for the current
+ * (possibly shared) chunk and increment the loop counter
+ * while doing so.
+ */
+ do {
+ /*
+ * The new owner of the chunk is determined by the rank
* writing to the chunk which currently has the least amount
* of chunks assigned to it
*/
- if (num_assigned_chunks_array[chunk_entry->owners.original_owner] <
- num_assigned_chunks_array[new_chunk_owner])
- new_chunk_owner = chunk_entry->owners.original_owner;
+ if (num_chunks_assigned_map[chunk_entry->orig_owner] <
+ num_chunks_assigned_map[new_chunk_owner])
+ new_chunk_owner = chunk_entry->orig_owner;
+ /* Update the number of writers to this particular chunk */
num_writers++;
- } while (++i < shared_chunks_info_array_num_entries &&
- shared_chunks_info_array[i].chunk_states.chunk_current.offset == last_seen_addr);
- /* Set all of the chunk entries' "new_owner" fields */
+ chunk_entry++;
+ } while (++i < coll_chunk_list_num_entries && chunk_entry->chunk_idx == curr_chunk_idx);
+
+ /* We should never have more writers to a chunk than the number of MPI ranks */
+ HDassert(num_writers <= mpi_size);
+
+ /* Set all processed chunk entries' "new_owner" and "num_writers" fields */
for (; set_begin_index < i; set_begin_index++) {
- shared_chunks_info_array[set_begin_index].owners.new_owner = new_chunk_owner;
- shared_chunks_info_array[set_begin_index].num_writers = num_writers;
- } /* end for */
+ H5D_chunk_redistribute_info_t *entry;
- num_assigned_chunks_array[new_chunk_owner]++;
- } /* end for */
+ entry = &((H5D_chunk_redistribute_info_t *)coll_chunk_list)[set_begin_index];
+
+ entry->new_owner = new_chunk_owner;
+ entry->num_writers = num_writers;
+ }
+
+ /* Update the number of chunks assigned to the MPI rank that now owns this chunk */
+ num_chunks_assigned_map[new_chunk_owner]++;
+ }
- /* Sort the new list in order of previous owner so that each original owner of a chunk
- * entry gets that entry back, with the possibly newly-modified "new_owner" field
+ /*
+ * Re-sort the collective chunk list in order of original chunk owner
+ * so that each rank's section of contributed chunks is contiguous in
+ * the collective chunk list.
+ *
+ * NOTE: this re-sort is fragile in that it needs to sort the collective
+ * chunk list so that each rank's section of contributed chunks
+ * is in the exact order it was contributed in, or things will
+ * be scrambled when each rank's local chunk list is updated.
+ * Therefore, the sorting algorithm here is tied to the one
+ * used during the I/O setup operation. Specifically, chunks
+ * are first sorted by ascending order of offset in the file and
+ * then by chunk index. In the future, a better redistribution
+ * algorithm may be devised that doesn't rely on fragile sorting,
+ * but the current implementation is a quick and naive approach.
*/
- if (shared_chunks_info_array_num_entries > 1)
- HDqsort(shared_chunks_info_array, shared_chunks_info_array_num_entries,
- sizeof(H5D_filtered_collective_io_info_t),
- H5D__cmp_filtered_collective_io_info_entry_owner);
-
- send_displacements[0] = 0;
- for (i = 1; i < (size_t)mpi_size; i++)
- send_displacements[i] = send_displacements[i - 1] + send_counts[i - 1];
- } /* end if */
+ HDqsort(coll_chunk_list, coll_chunk_list_num_entries, sizeof(H5D_chunk_redistribute_info_t),
+ H5D__cmp_chunk_redistribute_info_orig_owner);
+ }
- /* Scatter the segments of the list back to each process */
- H5_CHECKED_ASSIGN(scatter_recvcount_int, int,
- *local_chunk_array_num_entries * sizeof(H5D_filtered_collective_io_info_t), size_t);
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Scatterv(shared_chunks_info_array, send_counts, send_displacements, MPI_BYTE,
- local_chunk_array, scatter_recvcount_int, MPI_BYTE, 0, io_info->comm)))
- HMPI_GOTO_ERROR(FAIL, "unable to scatter shared chunks info buffer", mpi_code)
+ if (all_ranks_involved) {
+ /*
+ * If redistribution occurred on all ranks, search for the section
+ * in the collective chunk list corresponding to this rank's locally
+ * selected chunks and update the local list after redistribution.
+ */
+ for (i = 0; i < coll_chunk_list_num_entries; i++)
+ if (mpi_rank == ((H5D_chunk_redistribute_info_t *)coll_chunk_list)[i].orig_owner)
+ break;
- if (shared_chunks_info_array) {
- H5MM_free(shared_chunks_info_array);
- shared_chunks_info_array = NULL;
- } /* end if */
+ for (size_t j = 0; j < (size_t)num_chunks_int; j++) {
+ H5D_chunk_redistribute_info_t *coll_entry;
+
+ coll_entry = &((H5D_chunk_redistribute_info_t *)coll_chunk_list)[i++];
+
+ chunk_list[j].new_owner = coll_entry->new_owner;
+ chunk_list[j].num_writers = coll_entry->num_writers;
+ }
+ }
+ else {
+ /*
+ * If redistribution occurred only on rank 0, scatter the segments
+ * of the collective chunk list back to each rank so that their
+ * local chunk lists get updated
+ */
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Scatterv(coll_chunk_list, counts_ptr, displacements_ptr, packed_type, chunk_list,
+ num_chunks_int, struct_type, 0, io_info->comm)))
+ HMPI_GOTO_ERROR(FAIL, "unable to scatter shared chunks info buffer", mpi_code)
+ }
+
+#ifdef H5Dmpio_DEBUG
+ H5D__mpio_dump_collective_filtered_chunk_list(chunk_list, num_chunks_assigned_map[mpi_rank], mpi_rank);
+#endif
+
+done:
+ H5MM_free(coll_chunk_list);
+
+ if (struct_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&struct_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ }
+ if (packed_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&packed_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ }
+
+ H5MM_free(counts_disps_array);
+
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TIME_STOP(mpi_rank);
+ H5D_MPIO_TRACE_EXIT(mpi_rank);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__mpio_redistribute_shared_chunks_int() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D__mpio_share_chunk_modification_data
+ *
+ * Purpose: When performing a parallel write on a chunked dataset with
+ * filters applied, we must first ensure that any particular
+ * chunk is only written to by a single MPI rank in order to
+ * avoid potential data races on the chunk. Once dataset
+ * chunks have been redistributed in a suitable manner, each
+ * MPI rank must send its chunk data to other ranks for each
+ * chunk it no longer owns.
+ *
+ * The current implementation here follows the Nonblocking
+ * Consensus algorithm described in:
+ * http://unixer.de/publications/img/hoefler-dsde-protocols.pdf
+ *
+ * First, each MPI rank scans through its list of selected
+ * chunks and does the following for each chunk:
+ *
+ * * If a chunk in the MPI rank's chunk list is still owned
+ * by that rank, the rank checks how many messages are
+ * incoming for that chunk and adds that to its running
+ * total. Then, the rank updates its local chunk list so
+ * that any previous chunk entries for chunks that are no
+ * longer owned by the rank get overwritten by chunk
+ * entries for chunks the rank still owns. Since the data
+ * for the chunks no longer owned will have already been
+ * sent, those chunks can effectively be discarded.
+ * * If a chunk in the MPI rank's chunk list is no longer
+ * owned by that rank, the rank sends the data it wishes to
+ * update the chunk with to the MPI rank that now has
+ * ownership of that chunk. To do this, it encodes the
+ * chunk's index, its selection in the chunk and its
+ * modification data into a buffer and then posts a
+ * non-blocking MPI_Issend to the owning rank.
+ *
+ * Once this step is complete, all MPI ranks allocate arrays
+ * to hold chunk message receive buffers and MPI request
+ * objects for each non-blocking receive they will post for
+ * incoming chunk modification messages. Then, all MPI ranks
+ * enter a loop that alternates between non-blocking
+ * MPI_Iprobe calls to probe for incoming messages and
+ * MPI_Testall calls to see if all send requests have
+ * completed. As chunk modification messages arrive,
+ * non-blocking MPI_Irecv calls will be posted for each
+ * message.
+ *
+ * Once all send requests have completed, an MPI_Ibarrier is
+ * posted and the loop then alternates between MPI_Iprobe
+ * calls and MPI_Test calls to check if all ranks have reached
+ * the non-blocking barrier. Once all ranks have reached the
+ * barrier, processing can move on to updating the selected
+ * chunks that are owned in the operation.
+ *
+ * Any chunk messages that were received from other ranks
+ * will be returned through the `chunk_msg_bufs` array and
+ * `chunk_msg_bufs_len` will be set appropriately.
+ *
+ * NOTE: The use of non-blocking sends and receives of chunk
+ * data here may contribute to large amounts of memory
+ * usage/MPI request overhead if the number of shared
+ * chunks is high. If this becomes a problem, it may be
+ * useful to split the message receiving loop away so
+ * that chunk modification messages can be received and
+ * processed immediately (MPI_Recv) using a single chunk
+ * message buffer. However, it's possible this may
+ * degrade performance since the chunk message sends
+ * are synchronous (MPI_Issend) in the Nonblocking
+ * Consensus algorithm.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D__mpio_share_chunk_modification_data(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t *chunk_list_num_entries, H5D_io_info_t *io_info,
+ const H5D_type_info_t *type_info, int mpi_rank, int mpi_size,
+ H5D_filtered_collective_io_info_t **chunk_hash_table,
+ unsigned char ***chunk_msg_bufs, int *chunk_msg_bufs_len)
+{
+#if MPI_VERSION >= 3
+ H5D_filtered_collective_io_info_t *chunk_table = NULL;
+ H5S_sel_iter_t * mem_iter = NULL;
+ unsigned char ** msg_send_bufs = NULL;
+ unsigned char ** msg_recv_bufs = NULL;
+ MPI_Request * send_requests = NULL;
+ MPI_Request * recv_requests = NULL;
+ MPI_Request ibarrier = MPI_REQUEST_NULL;
+ hbool_t mem_iter_init = FALSE;
+ hbool_t ibarrier_posted = FALSE;
+ size_t send_bufs_nalloc = 0;
+ size_t num_send_requests = 0;
+ size_t num_recv_requests = 0;
+ size_t num_msgs_incoming = 0;
+ size_t last_assigned_idx;
+ size_t i;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+ HDassert(chunk_list_num_entries);
+ HDassert(chunk_list || 0 == *chunk_list_num_entries);
+ HDassert(io_info);
+ HDassert(type_info);
+ HDassert(mpi_size > 1);
+ HDassert(chunk_msg_bufs);
+ HDassert(chunk_msg_bufs_len);
+
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TRACE_ENTER(mpi_rank);
+ H5D_MPIO_TIME_START(mpi_rank, "Share chunk modification data");
+#endif
+
+ /* Set to latest format for encoding dataspace */
+ H5CX_set_libver_bounds(NULL);
+
+ if (*chunk_list_num_entries) {
+ /* Allocate a selection iterator for iterating over chunk dataspaces */
+ if (NULL == (mem_iter = H5FL_MALLOC(H5S_sel_iter_t)))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate dataspace selection iterator")
- /* Now that the chunks have been redistributed, each process must send its modification data
- * to the new owners of any of the chunks it previously possessed. Accordingly, each process
- * must also issue asynchronous receives for any messages it may receive for each of the
- * chunks it is assigned, in order to avoid potential deadlocking issues.
+ /*
+ * Allocate send buffer and MPI_Request arrays for non-blocking
+ * sends of outgoing chunk messages
+ */
+ send_bufs_nalloc = H5D_CHUNK_NUM_SEND_MSGS_INIT;
+ if (NULL == (msg_send_bufs = H5MM_malloc(send_bufs_nalloc * sizeof(*msg_send_bufs))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
+ "couldn't allocate chunk modification message buffer array")
+
+ if (NULL == (send_requests = H5MM_malloc(send_bufs_nalloc * sizeof(*send_requests))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate send requests array")
+ }
+
+ /*
+ * For each chunk this rank owns, add to the total number of
+ * incoming MPI messages, then update the local chunk list to
+ * overwrite any previous chunks no longer owned by this rank.
+ * Since the data for those chunks will have already been sent,
+ * this rank should no longer be interested in them and they
+ * can effectively be discarded. This bookkeeping also makes
+ * the code for the collective file space re-allocation and
+ * chunk re-insertion operations a bit simpler.
+ *
+ * For each chunk this rank doesn't own, use non-blocking
+ * synchronous sends to send the data this rank is writing to
+ * the rank that does own the chunk.
*/
- if (*local_chunk_array_num_entries)
- if (NULL == (mod_data = (unsigned char **)H5MM_malloc(*local_chunk_array_num_entries *
- sizeof(unsigned char *))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "unable to allocate modification data buffer array")
-
- /* Perform all the sends on the chunks that this rank doesn't own */
- /* (Sends and recvs must be two separate loops, to avoid deadlock) */
- for (i = 0, last_assigned_idx = 0; i < *local_chunk_array_num_entries; i++) {
- H5D_filtered_collective_io_info_t *chunk_entry = &local_chunk_array[i];
-
- if (mpi_rank != chunk_entry->owners.new_owner) {
- H5D_chunk_info_t *chunk_info = NULL;
+ for (i = 0, last_assigned_idx = 0; i < *chunk_list_num_entries; i++) {
+ H5D_filtered_collective_io_info_t *chunk_entry = &chunk_list[i];
+
+ if (mpi_rank == chunk_entry->new_owner) {
+ num_msgs_incoming += (size_t)(chunk_entry->num_writers - 1);
+
+ /*
+ * Overwrite chunk entries this rank doesn't own with entries that it
+ * does own, since it has sent the necessary data and is no longer
+ * interested in the chunks it doesn't own.
+ */
+ chunk_list[last_assigned_idx] = chunk_list[i];
+
+ /*
+ * Since, at large scale, a chunk's index value may be larger than
+ * the maximum value that can be stored in an int, we cannot rely
+ * on using a chunk's index value as the tag for the MPI messages
+ * sent/received for a chunk. Therefore, add this chunk to a hash
+ * table with the chunk's index as a key so that we can quickly find
+ * the chunk when processing chunk messages that were received. The
+ * message itself will contain the chunk's index so we can update
+ * the correct chunk with the received data.
+ */
+ HASH_ADD(hh, chunk_table, index_info.chunk_idx, sizeof(hsize_t), &chunk_list[last_assigned_idx]);
+
+ last_assigned_idx++;
+ }
+ else {
+ H5D_chunk_info_t *chunk_info = chunk_entry->chunk_info;
unsigned char * mod_data_p = NULL;
hsize_t iter_nelmts;
- size_t mod_data_size;
+ size_t mod_data_size = 0;
+ size_t space_size = 0;
- /* Look up the chunk and get its file and memory dataspaces */
- if (NULL == (chunk_info = (H5D_chunk_info_t *)H5SL_search(fm->sel_chunks, &chunk_entry->index)))
- HGOTO_ERROR(H5E_DATASPACE, H5E_NOTFOUND, FAIL, "can't locate chunk in skip list")
+ /* Add the size of the chunk index to the encoded size */
+ mod_data_size += sizeof(hsize_t);
- /* Determine size of serialized chunk file dataspace, plus the size of
- * the data being written
- */
- if (H5S_encode(chunk_info->fspace, &mod_data_p, &mod_data_size) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTENCODE, FAIL, "unable to get encoded dataspace size")
+ /* Determine size of serialized chunk file dataspace */
+ if (H5S_encode(chunk_info->fspace, &mod_data_p, &space_size) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "unable to get encoded dataspace size")
+ mod_data_size += space_size;
+ /* Determine size of data being written */
iter_nelmts = H5S_GET_SELECT_NPOINTS(chunk_info->mspace);
-
H5_CHECK_OVERFLOW(iter_nelmts, hsize_t, size_t);
+
mod_data_size += (size_t)iter_nelmts * type_info->src_type_size;
- if (NULL == (mod_data[num_send_requests] = (unsigned char *)H5MM_malloc(mod_data_size)))
+ if (NULL == (msg_send_bufs[num_send_requests] = H5MM_malloc(mod_data_size)))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
- "couldn't allocate chunk modification send buffer")
+ "couldn't allocate chunk modification message buffer")
+
+ mod_data_p = msg_send_bufs[num_send_requests];
+
+ /* Store the chunk's index into the buffer */
+ HDmemcpy(mod_data_p, &chunk_entry->index_info.chunk_idx, sizeof(hsize_t));
+ mod_data_p += sizeof(hsize_t);
/* Serialize the chunk's file dataspace into the buffer */
- mod_data_p = mod_data[num_send_requests];
if (H5S_encode(chunk_info->fspace, &mod_data_p, &mod_data_size) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTENCODE, FAIL, "unable to encode dataspace")
/* Initialize iterator for memory selection */
- if (H5S_select_iter_init(mem_iter, chunk_info->mspace, type_info->src_type_size, 0) < 0)
+ if (H5S_select_iter_init(mem_iter, chunk_info->mspace, type_info->src_type_size,
+ H5S_SEL_ITER_SHARE_WITH_DATASPACE) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL,
"unable to initialize memory selection information")
mem_iter_init = TRUE;
@@ -2960,466 +3943,2057 @@ H5D__chunk_redistribute_shared_chunks(const H5D_io_info_t *io_info, const H5D_ty
if (0 == H5D__gather_mem(io_info->u.wbuf, mem_iter, (size_t)iter_nelmts, mod_data_p))
HGOTO_ERROR(H5E_IO, H5E_CANTGATHER, FAIL, "couldn't gather from write buffer")
- /* Send modification data to new owner */
+ /*
+ * Ensure that the size of the chunk data being sent can be
+ * safely cast to an int for MPI. Note that this should
+ * generally be OK for now (unless a rank is sending a
+ * whole 32-bit-sized chunk of data + its encoded selection),
+ * but if we allow larger than 32-bit-sized chunks in the
+ * future, this may become a problem and derived datatypes
+ * will need to be used.
+ */
H5_CHECK_OVERFLOW(mod_data_size, size_t, int)
- H5_CHECK_OVERFLOW(chunk_entry->index, hsize_t, int)
+
+ /* Send modification data to new owner */
if (MPI_SUCCESS !=
- (mpi_code = MPI_Isend(mod_data[num_send_requests], (int)mod_data_size, MPI_BYTE,
- chunk_entry->owners.new_owner, (int)chunk_entry->index, io_info->comm,
- &send_requests[num_send_requests])))
- HMPI_GOTO_ERROR(FAIL, "MPI_Isend failed", mpi_code)
+ (mpi_code = MPI_Issend(msg_send_bufs[num_send_requests], (int)mod_data_size, MPI_BYTE,
+ chunk_entry->new_owner, H5D_CHUNK_MOD_DATA_TAG, io_info->comm,
+ &send_requests[num_send_requests])))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Issend failed", mpi_code)
+
+ num_send_requests++;
+
+ /* Resize send buffer and send request arrays if necessary */
+ if (num_send_requests == send_bufs_nalloc) {
+ void *tmp_alloc;
+
+ send_bufs_nalloc = (size_t)((double)send_bufs_nalloc * 1.5);
+
+ if (NULL ==
+ (tmp_alloc = H5MM_realloc(msg_send_bufs, send_bufs_nalloc * sizeof(*msg_send_bufs))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
+ "couldn't resize chunk modification message buffer array")
+ msg_send_bufs = tmp_alloc;
- if (mem_iter_init && H5S_SELECT_ITER_RELEASE(mem_iter) < 0)
+ if (NULL ==
+ (tmp_alloc = H5MM_realloc(send_requests, send_bufs_nalloc * sizeof(*send_requests))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't resize send requests array")
+ send_requests = tmp_alloc;
+ }
+
+ if (H5S_SELECT_ITER_RELEASE(mem_iter) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release memory selection iterator")
mem_iter_init = FALSE;
+ }
+ }
- num_send_requests++;
- } /* end if */
- } /* end for */
+ /* Check if the number of send or receive requests will overflow an int (MPI requirement) */
+ if (num_send_requests > INT_MAX || num_msgs_incoming > INT_MAX)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL,
+ "too many shared chunks in parallel filtered write operation")
- /* Perform all the recvs on the chunks this rank owns */
- for (i = 0, last_assigned_idx = 0; i < *local_chunk_array_num_entries; i++) {
- H5D_filtered_collective_io_info_t *chunk_entry = &local_chunk_array[i];
+ H5_CHECK_OVERFLOW(num_send_requests, size_t, int)
+ H5_CHECK_OVERFLOW(num_msgs_incoming, size_t, int)
- if (mpi_rank == chunk_entry->owners.new_owner) {
- /* Allocate all necessary buffers for an asynchronous receive operation */
- if (chunk_entry->num_writers > 1) {
- MPI_Message message;
- MPI_Status status;
- size_t j;
+ /*
+ * Allocate receive buffer and MPI_Request arrays for non-blocking
+ * receives of incoming chunk messages
+ */
+ if (num_msgs_incoming) {
+ if (NULL == (msg_recv_bufs = H5MM_malloc(num_msgs_incoming * sizeof(*msg_recv_bufs))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
+ "couldn't allocate chunk modification message buffer array")
- chunk_entry->async_info.num_receive_requests = (int)chunk_entry->num_writers - 1;
- if (NULL == (chunk_entry->async_info.receive_requests_array = (MPI_Request *)H5MM_malloc(
- (size_t)chunk_entry->async_info.num_receive_requests * sizeof(MPI_Request))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "unable to allocate async requests array")
+ if (NULL == (recv_requests = H5MM_malloc(num_msgs_incoming * sizeof(*recv_requests))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate receive requests array")
+ }
- if (NULL ==
- (chunk_entry->async_info.receive_buffer_array = (unsigned char **)H5MM_malloc(
- (size_t)chunk_entry->async_info.num_receive_requests * sizeof(unsigned char *))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "unable to allocate async receive buffers")
+ /* Process any incoming messages until everyone is done */
+ do {
+ MPI_Status status;
+ int msg_flag;
- for (j = 0; j < chunk_entry->num_writers - 1; j++) {
- int count = 0;
+ /* Probe for an incoming message from any rank */
+ if (MPI_SUCCESS != (mpi_code = MPI_Iprobe(MPI_ANY_SOURCE, H5D_CHUNK_MOD_DATA_TAG, io_info->comm,
+ &msg_flag, &status)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Iprobe failed", mpi_code)
- /* Probe for a particular message from any process, removing that message
- * from the receive queue in the process and allocating that much memory
- * for the asynchronous receive
- */
- if (MPI_SUCCESS != (mpi_code = MPI_Mprobe(MPI_ANY_SOURCE, (int)chunk_entry->index,
- io_info->comm, &message, &status)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Mprobe failed", mpi_code)
-
- if (MPI_SUCCESS != (mpi_code = MPI_Get_count(&status, MPI_BYTE, &count)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Get_count failed", mpi_code)
-
- HDassert(count >= 0);
- if (NULL == (chunk_entry->async_info.receive_buffer_array[j] =
- (unsigned char *)H5MM_malloc((size_t)count * sizeof(char *))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
- "unable to allocate modification data receive buffer")
-
- if (MPI_SUCCESS != (mpi_code = MPI_Imrecv(
- chunk_entry->async_info.receive_buffer_array[j], count, MPI_BYTE,
- &message, &chunk_entry->async_info.receive_requests_array[j])))
- HMPI_GOTO_ERROR(FAIL, "MPI_Imrecv failed", mpi_code)
- } /* end for */
- } /* end if */
-
- local_chunk_array[last_assigned_idx++] = local_chunk_array[i];
- } /* end else */
- } /* end for */
+ /*
+ * If a message was found, allocate a buffer for the message and
+ * post a non-blocking receive to receive it
+ */
+ if (msg_flag) {
+#if MPI_VERSION >= 3
+ MPI_Count msg_size = 0;
- *local_chunk_array_num_entries = last_assigned_idx;
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&status, MPI_BYTE, &msg_size)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements_x failed", mpi_code)
- /* Wait for all async send requests to complete before returning */
- if (num_send_requests) {
- if (NULL == (send_statuses = (MPI_Status *)H5MM_malloc(num_send_requests * sizeof(MPI_Status))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate send statuses buffer")
+ H5_CHECK_OVERFLOW(msg_size, MPI_Count, int)
+#else
+ int msg_size = 0;
- H5_CHECK_OVERFLOW(num_send_requests, size_t, int);
- if (MPI_SUCCESS != (mpi_code = MPI_Waitall((int)num_send_requests, send_requests, send_statuses)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Waitall failed", mpi_code)
- } /* end if */
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&status, MPI_BYTE, &msg_size)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+#endif
-done:
- /* Now that all async send requests have completed, free up the send
- * buffers used in the async operations
+ if (msg_size <= 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_BADVALUE, FAIL, "invalid chunk modification message size")
+
+ HDassert((num_recv_requests + 1) <= num_msgs_incoming);
+ if (NULL ==
+ (msg_recv_bufs[num_recv_requests] = H5MM_malloc((size_t)msg_size * sizeof(unsigned char))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
+ "couldn't allocate chunk modification message receive buffer")
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Irecv(msg_recv_bufs[num_recv_requests], (int)msg_size,
+ MPI_BYTE, status.MPI_SOURCE, H5D_CHUNK_MOD_DATA_TAG,
+ io_info->comm, &recv_requests[num_recv_requests])))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Irecv failed", mpi_code)
+
+ num_recv_requests++;
+ }
+
+ if (ibarrier_posted) {
+ int ibarrier_completed;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Test(&ibarrier, &ibarrier_completed, MPI_STATUS_IGNORE)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Test failed", mpi_code)
+
+ if (ibarrier_completed)
+ break;
+ }
+ else {
+ int all_sends_completed;
+
+ /* Determine if all send requests have completed */
+ if (MPI_SUCCESS != (mpi_code = MPI_Testall((int)num_send_requests, send_requests,
+ &all_sends_completed, MPI_STATUSES_IGNORE)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Testall failed", mpi_code)
+
+ if (all_sends_completed) {
+ /* Post non-blocking barrier */
+ if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(io_info->comm, &ibarrier)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Ibarrier failed", mpi_code)
+ ibarrier_posted = TRUE;
+
+ /*
+ * Now that all send requests have completed, free up the
+ * send buffers used in the non-blocking operations
+ */
+ if (msg_send_bufs) {
+ for (i = 0; i < num_send_requests; i++) {
+ if (msg_send_bufs[i])
+ H5MM_free(msg_send_bufs[i]);
+ }
+
+ msg_send_bufs = H5MM_xfree(msg_send_bufs);
+ }
+ }
+ }
+ } while (1);
+
+ /*
+ * Ensure all receive requests have completed before moving on.
+ * For linked-chunk I/O, more overlap with computation could
+ * theoretically be achieved by returning the receive requests
+ * array and postponing this wait until during chunk updating
+ * when the data is really needed. However, multi-chunk I/O
+ * only updates a chunk at a time and the messages may not come
+ * in the order that chunks are processed. So, the safest way to
+ * support both I/O modes is to simply make sure all messages
+ * are available.
*/
- for (i = 0; i < num_send_requests; i++) {
- if (mod_data[i])
- H5MM_free(mod_data[i]);
- } /* end for */
+ if (MPI_SUCCESS != (mpi_code = MPI_Waitall((int)num_recv_requests, recv_requests, MPI_STATUSES_IGNORE)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Waitall failed", mpi_code)
+
+ /* Set the new number of locally-selected chunks */
+ *chunk_list_num_entries = last_assigned_idx;
+
+ /* Return chunk message buffers if any were received */
+ *chunk_hash_table = chunk_table;
+ *chunk_msg_bufs = msg_recv_bufs;
+ *chunk_msg_bufs_len = (int)num_recv_requests;
+
+done:
+ if (ret_value < 0) {
+ /* If this rank failed, make sure to participate in collective barrier */
+ if (!ibarrier_posted) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(io_info->comm, &ibarrier)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Ibarrier failed", mpi_code)
+ }
+
+ if (num_send_requests) {
+ for (i = 0; i < num_send_requests; i++) {
+ MPI_Cancel(&send_requests[i]);
+ }
+ }
+
+ if (recv_requests) {
+ for (i = 0; i < num_recv_requests; i++) {
+ MPI_Cancel(&recv_requests[i]);
+ }
+ }
+
+ if (msg_recv_bufs) {
+ for (i = 0; i < num_recv_requests; i++) {
+ H5MM_free(msg_recv_bufs[i]);
+ }
+
+ H5MM_free(msg_recv_bufs);
+ }
+ HASH_CLEAR(hh, chunk_table);
+ }
+
+ if (recv_requests)
+ H5MM_free(recv_requests);
if (send_requests)
H5MM_free(send_requests);
- if (send_statuses)
- H5MM_free(send_statuses);
- if (send_counts)
- H5MM_free(send_counts);
- if (send_displacements)
- H5MM_free(send_displacements);
- if (mod_data)
- H5MM_free(mod_data);
- if (mem_iter_init && H5S_SELECT_ITER_RELEASE(mem_iter) < 0)
- HDONE_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release selection iterator")
- if (mem_iter)
- H5MM_free(mem_iter);
- if (num_assigned_chunks_array)
- H5MM_free(num_assigned_chunks_array);
- if (shared_chunks_info_array)
- H5MM_free(shared_chunks_info_array);
+
+ if (msg_send_bufs) {
+ for (i = 0; i < num_send_requests; i++) {
+ if (msg_send_bufs[i])
+ H5MM_free(msg_send_bufs[i]);
+ }
+
+ H5MM_free(msg_send_bufs);
+ }
+
+ if (mem_iter) {
+ if (mem_iter_init && H5S_SELECT_ITER_RELEASE(mem_iter) < 0)
+ HDONE_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release dataspace selection iterator")
+ mem_iter = H5FL_FREE(H5S_sel_iter_t, mem_iter);
+ }
+
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TIME_STOP(mpi_rank);
+ H5D_MPIO_TRACE_EXIT(mpi_rank);
+#endif
FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D__chunk_redistribute_shared_chunks() */
+#else
+ FUNC_ENTER_STATIC
+ HERROR(
+ H5E_DATASET, H5E_WRITEERROR,
+ "unable to send chunk modification data between MPI ranks - MPI version < 3 (MPI_Ibarrier missing)")
+ FUNC_LEAVE_NOAPI(FAIL)
#endif
+} /* end H5D__mpio_share_chunk_modification_data() */
/*-------------------------------------------------------------------------
- * Function: H5D__mpio_filtered_collective_write_type
+ * Function: H5D__mpio_collective_filtered_chunk_common_io
*
- * Purpose: Constructs a MPI derived datatype for both the memory and
- * the file for a collective write of filtered chunks. The
- * datatype contains the offsets in the file and the locations
- * of the filtered chunk data buffers.
+ * Purpose: This routine performs the common part of collective I/O
+ * when reading or writing filtered chunks collectively.
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer: Jordan Henderson
- * Tuesday, November 22, 2016
- *
*-------------------------------------------------------------------------
*/
static herr_t
-H5D__mpio_filtered_collective_write_type(H5D_filtered_collective_io_info_t *chunk_list, size_t num_entries,
- MPI_Datatype *new_mem_type, hbool_t *mem_type_derived,
- MPI_Datatype *new_file_type, hbool_t *file_type_derived)
+H5D__mpio_collective_filtered_chunk_common_io(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries, const H5D_io_info_t *io_info,
+ const H5D_type_info_t *type_info, int mpi_size)
{
- MPI_Aint *write_buf_array = NULL; /* Relative displacements of filtered chunk data buffers */
- MPI_Aint *file_offset_array = NULL; /* Chunk offsets in the file */
- int * length_array = NULL; /* Filtered Chunk lengths */
- herr_t ret_value = SUCCEED;
+ H5D_io_info_t coll_io_info;
+ H5D_storage_t ctg_store;
+ MPI_Datatype file_type = MPI_DATATYPE_NULL;
+ MPI_Datatype mem_type = MPI_DATATYPE_NULL;
+ hbool_t mem_type_is_derived = FALSE;
+ hbool_t file_type_is_derived = FALSE;
+ hsize_t mpi_buf_count;
+ haddr_t base_read_offset = HADDR_UNDEF;
+ size_t num_chunks;
+ size_t i;
+ char fake_buf; /* Used as a fake buffer for ranks with no chunks, thus a NULL buf pointer */
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
- HDassert(chunk_list);
- HDassert(new_mem_type);
- HDassert(mem_type_derived);
- HDassert(new_file_type);
- HDassert(file_type_derived);
+ HDassert(chunk_list || 0 == chunk_list_num_entries);
+ HDassert(io_info);
+ HDassert(type_info);
- if (num_entries > 0) {
- size_t i;
- int mpi_code;
- void * base_buf;
-
- H5_CHECK_OVERFLOW(num_entries, size_t, int);
-
- /* Allocate arrays */
- if (NULL == (length_array = (int *)H5MM_malloc((size_t)num_entries * sizeof(int))))
- HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "memory allocation failed for filtered collective write length array")
- if (NULL == (write_buf_array = (MPI_Aint *)H5MM_malloc((size_t)num_entries * sizeof(MPI_Aint))))
- HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "memory allocation failed for filtered collective write buf length array")
- if (NULL == (file_offset_array = (MPI_Aint *)H5MM_malloc((size_t)num_entries * sizeof(MPI_Aint))))
- HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "memory allocation failed for collective write offset array")
-
- /* Ensure the list is sorted in ascending order of offset in the file */
- HDqsort(chunk_list, num_entries, sizeof(H5D_filtered_collective_io_info_t),
- H5D__cmp_filtered_collective_io_info_entry);
+ /* Initialize temporary I/O info */
+ coll_io_info = *io_info;
- base_buf = chunk_list[0].buf;
- for (i = 0; i < num_entries; i++) {
- /* Set up the offset in the file, the length of the chunk data, and the relative
- * displacement of the chunk data write buffer
- */
- file_offset_array[i] = (MPI_Aint)chunk_list[i].chunk_states.new_chunk.offset;
- length_array[i] = (int)chunk_list[i].chunk_states.new_chunk.length;
- write_buf_array[i] = (MPI_Aint)chunk_list[i].buf - (MPI_Aint)base_buf;
- } /* end for */
+ /*
+ * Construct MPI derived datatype for collective I/O on chunks
+ */
+ if (H5D__mpio_collective_filtered_io_type(chunk_list, chunk_list_num_entries, io_info->op_type, &mem_type,
+ &mem_type_is_derived, &file_type, &file_type_is_derived) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_BADTYPE, FAIL, "couldn't create MPI I/O type for chunk I/O")
- /* Create memory MPI type */
- if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)num_entries, length_array,
- write_buf_array, MPI_BYTE, new_mem_type)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
- *mem_type_derived = TRUE;
- if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(new_mem_type)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
-
- /* Create file MPI type */
- if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)num_entries, length_array,
- file_offset_array, MPI_BYTE, new_file_type)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
- *file_type_derived = TRUE;
- if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(new_file_type)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
- } /* end if */
+ /*
+ * For reads, determine how many chunks are actually being read.
+ * Note that if this is a read during a write operation
+ * (read chunk -> unfilter -> modify -> write back), some
+ * chunks may not need to be read if they're being fully
+ * overwritten during a write operation.
+ */
+ if (io_info->op_type == H5D_IO_OP_READ) {
+ for (i = 0, num_chunks = 0; i < chunk_list_num_entries; i++) {
+ HDassert(chunk_list[i].buf);
+
+ if (chunk_list[i].need_read) {
+ if (!H5F_addr_defined(base_read_offset))
+ base_read_offset = chunk_list[i].chunk_current.offset;
+
+ num_chunks++;
+ }
+ }
+ }
+ else
+ num_chunks = chunk_list_num_entries;
+
+ /*
+ * If this rank doesn't have a selection, it can
+ * skip I/O if independent I/O was requested at
+ * the low level, or if the MPI communicator size
+ * is 1.
+ *
+ * Otherwise, this rank has to participate in
+ * collective I/O, but probably has a NULL buf
+ * pointer, so override to a fake buffer since our
+ * write/read function expects one.
+ */
+ if (num_chunks == 0) {
+ H5FD_mpio_collective_opt_t coll_opt_mode;
+
+ /* Get the collective_opt property to check whether the application wants to do IO individually. */
+ if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_opt property")
+
+ if ((mpi_size == 1) || (H5FD_MPIO_INDIVIDUAL_IO == coll_opt_mode)) {
+ HGOTO_DONE(SUCCEED)
+ }
+ else {
+ if (io_info->op_type == H5D_IO_OP_WRITE)
+ coll_io_info.u.wbuf = &fake_buf;
+ else
+ coll_io_info.u.rbuf = &fake_buf;
+ }
+ }
+
+ /*
+ * Setup for I/O operation
+ */
+
+ mpi_buf_count = (num_chunks) ? 1 : 0;
+
+ if (num_chunks) {
+ /*
+ * Setup the base storage address for this operation
+ * to be the first chunk's file address
+ */
+ if (io_info->op_type == H5D_IO_OP_WRITE)
+ ctg_store.contig.dset_addr = chunk_list[0].chunk_new.offset;
+ else
+ ctg_store.contig.dset_addr = base_read_offset;
+ }
+ else
+ ctg_store.contig.dset_addr = 0;
+
+ ctg_store.contig.dset_size = (hsize_t)io_info->dset->shared->layout.u.chunk.size;
+ coll_io_info.store = &ctg_store;
+
+ /* Perform I/O */
+ if (H5D__final_collective_io(&coll_io_info, type_info, mpi_buf_count, file_type, mem_type) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't finish MPI I/O")
done:
- if (write_buf_array)
- H5MM_free(write_buf_array);
- if (file_offset_array)
- H5MM_free(file_offset_array);
- if (length_array)
- H5MM_free(length_array);
+ /* Free the MPI buf and file types, if they were derived */
+ if (mem_type_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ if (file_type_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D__mpio_filtered_collective_write_type() */
+} /* end H5D__mpio_collective_filtered_chunk_common_io() */
/*-------------------------------------------------------------------------
- * Function: H5D__filtered_collective_chunk_entry_io
+ * Function: H5D__mpio_collective_filtered_chunk_read
*
- * Purpose: Given an entry for a filtered chunk, performs the necessary
- * steps for updating the chunk data during a collective
- * write, or for reading the chunk from file during a
- * collective read.
+ * Purpose: This routine coordinates a collective read across all ranks
+ * of the chunks they have selected. Each rank will then
+ * unfilter its chunks and scatter the data to the application's read buffer.
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer: Jordan Henderson
- * Wednesday, January 18, 2017
- *
*-------------------------------------------------------------------------
*/
static herr_t
-H5D__filtered_collective_chunk_entry_io(H5D_filtered_collective_io_info_t *chunk_entry,
- const H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
- const H5D_chunk_map_t *fm)
+H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries, const H5D_io_info_t *io_info,
+ const H5D_type_info_t *type_info, int mpi_rank, int mpi_size)
{
- H5D_chunk_info_t *chunk_info = NULL;
- H5S_sel_iter_t * mem_iter = NULL; /* Memory iterator for H5D__scatter_mem/H5D__gather_mem */
- H5S_sel_iter_t * file_iter = NULL;
- H5Z_EDC_t err_detect; /* Error detection info */
- H5Z_cb_t filter_cb; /* I/O filter callback function */
- unsigned filter_mask = 0;
- hsize_t iter_nelmts; /* Number of points to iterate over for the chunk IO operation */
- hssize_t extent_npoints;
- hsize_t true_chunk_size;
- hbool_t mem_iter_init = FALSE;
- hbool_t file_iter_init = FALSE;
- size_t buf_size;
- size_t i;
- H5S_t * dataspace = NULL; /* Other process' dataspace for the chunk */
- void * tmp_gath_buf = NULL; /* Temporary gather buffer to gather into from application buffer
- before scattering out to the chunk data buffer (when writing data),
- or vice versa (when reading data) */
- int mpi_code;
- herr_t ret_value = SUCCEED;
+ H5D_fill_buf_info_t fb_info;
+ H5D_chunk_info_t * chunk_info = NULL;
+ H5D_io_info_t coll_io_info;
+ H5Z_EDC_t err_detect; /* Error detection info */
+ H5Z_cb_t filter_cb; /* I/O filter callback function */
+ hsize_t file_chunk_size = 0;
+ hsize_t iter_nelmts; /* Number of points to iterate over for the chunk IO operation */
+ hbool_t should_fill = FALSE;
+ hbool_t fb_info_init = FALSE;
+ hbool_t index_empty = FALSE;
+ size_t i;
+ H5S_t * fill_space = NULL;
+ void * base_read_buf = NULL;
+ herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
- HDassert(chunk_entry);
+ HDassert(chunk_list || 0 == chunk_list_num_entries);
HDassert(io_info);
HDassert(type_info);
- HDassert(fm);
- /* Retrieve filter settings from API context */
- if (H5CX_get_err_detect(&err_detect) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get error detection info")
- if (H5CX_get_filter_cb(&filter_cb) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get I/O filter callback function")
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TRACE_ENTER(mpi_rank);
+ H5D_MPIO_TIME_START(mpi_rank, "Filtered collective chunk read");
+#else
+ (void)mpi_rank;
+#endif
+
+ /* Initialize temporary I/O info */
+ coll_io_info = *io_info;
+ coll_io_info.u.rbuf = NULL;
- /* Look up the chunk and get its file and memory dataspaces */
- if (NULL == (chunk_info = (H5D_chunk_info_t *)H5SL_search(fm->sel_chunks, &chunk_entry->index)))
- HGOTO_ERROR(H5E_DATASPACE, H5E_NOTFOUND, FAIL, "can't locate chunk in skip list")
+ if (chunk_list_num_entries) {
+ /* Retrieve filter settings from API context */
+ if (H5CX_get_err_detect(&err_detect) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get error detection info")
+ if (H5CX_get_filter_cb(&filter_cb) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get I/O filter callback function")
- if ((extent_npoints = H5S_GET_EXTENT_NPOINTS(chunk_info->fspace)) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTCOUNT, FAIL, "dataspace is invalid")
- true_chunk_size = (hsize_t)extent_npoints * type_info->src_type_size;
+ /* Set size of full chunks in dataset */
+ file_chunk_size = io_info->dset->shared->layout.u.chunk.size;
- /* If the size of the filtered chunk is larger than the number of points in the
- * chunk file space extent times the datatype size, allocate enough space to hold the
- * whole filtered chunk. Otherwise, allocate a buffer equal to the size of the
- * chunk so that the unfiltering operation doesn't have to grow the buffer.
+ /* Determine if fill values should be "read" for unallocated chunks */
+ should_fill = (io_info->dset->shared->dcpl_cache.fill.fill_time == H5D_FILL_TIME_ALLOC) ||
+ ((io_info->dset->shared->dcpl_cache.fill.fill_time == H5D_FILL_TIME_IFSET) &&
+ io_info->dset->shared->dcpl_cache.fill.fill_defined);
+ }
+
+ /*
+ * Allocate memory buffers for all chunks being read. Chunk data buffers are of
+ * the largest size between the chunk's current filtered size and the chunk's true
+ * size, as calculated by the number of elements in the chunk's file space extent
+ * multiplied by the datatype size. This tries to ensure that:
+ *
+ * * If we're reading the chunk and the filter normally reduces the chunk size,
+ * the unfiltering operation won't need to grow the buffer.
+ * * If we're reading the chunk and the filter normally grows the chunk size,
+ * we make sure to read into a buffer of size equal to the filtered chunk's
+ * size; reading into a (smaller) buffer of size equal to the unfiltered
+ * chunk size would of course be bad.
*/
- buf_size = MAX(chunk_entry->chunk_states.chunk_current.length, true_chunk_size);
+ for (i = 0; i < chunk_list_num_entries; i++) {
+ HDassert(chunk_list[i].need_read);
+
+ chunk_list[i].chunk_buf_size = MAX(chunk_list[i].chunk_current.length, file_chunk_size);
+
+ if (NULL == (chunk_list[i].buf = H5MM_malloc(chunk_list[i].chunk_buf_size))) {
+ /* Push an error, but participate in collective read */
+ HDONE_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk data buffer")
+ break;
+ }
+
+ /*
+ * Check if chunk is currently allocated. If not, don't try to
+ * read it from the file. Instead, just fill the chunk buffer
+ * with the fill value if necessary.
+ */
+ if (H5F_addr_defined(chunk_list[i].chunk_current.offset)) {
+ /* Set first read buffer */
+ if (!base_read_buf)
+ base_read_buf = chunk_list[i].buf;
+
+ /* Set chunk's new length for eventual filter pipeline calls */
+ if (chunk_list[i].skip_filter_pline)
+ chunk_list[i].chunk_new.length = file_chunk_size;
+ else
+ chunk_list[i].chunk_new.length = chunk_list[i].chunk_current.length;
+ }
+ else {
+ chunk_list[i].need_read = FALSE;
+
+ /* Set chunk's new length for eventual filter pipeline calls */
+ chunk_list[i].chunk_new.length = file_chunk_size;
+
+ if (should_fill) {
+ /* Initialize fill value buffer if not already initialized */
+ if (!fb_info_init) {
+ hsize_t chunk_dims[H5S_MAX_RANK];
+
+ HDassert(io_info->dset->shared->ndims == io_info->dset->shared->layout.u.chunk.ndims - 1);
+ for (size_t j = 0; j < io_info->dset->shared->layout.u.chunk.ndims - 1; j++)
+ chunk_dims[j] = (hsize_t)io_info->dset->shared->layout.u.chunk.dim[j];
+
+ /* Get a dataspace for filling chunk memory buffers */
+ if (NULL == (fill_space = H5S_create_simple(
+ io_info->dset->shared->layout.u.chunk.ndims - 1, chunk_dims, NULL)))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to create chunk fill dataspace")
+
+ /* Initialize fill value buffer */
+ if (H5D__fill_init(&fb_info, NULL, (H5MM_allocate_t)H5D__chunk_mem_alloc,
+ (void *)&io_info->dset->shared->dcpl_cache.pline,
+ (H5MM_free_t)H5D__chunk_mem_free,
+ (void *)&io_info->dset->shared->dcpl_cache.pline,
+ &io_info->dset->shared->dcpl_cache.fill, io_info->dset->shared->type,
+ io_info->dset->shared->type_id, 0, file_chunk_size) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "can't initialize fill value buffer")
+
+ fb_info_init = TRUE;
+ }
- if (NULL == (chunk_entry->buf = H5MM_malloc(buf_size)))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk data buffer")
+ /* Write fill value to memory buffer */
+ HDassert(fb_info.fill_buf);
+ if (H5D__fill(fb_info.fill_buf, io_info->dset->shared->type, chunk_list[i].buf,
+ type_info->mem_type, fill_space) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "couldn't fill chunk buffer with fill value")
+ }
+ }
+ }
- /* If this is not a full chunk overwrite or this is a read operation, the chunk must be
- * read from the file and unfiltered.
+ /*
+ * If dataset is incrementally allocated and hasn't been written to
+ * yet, the chunk index should be empty. In this case, a collective
+ * read of chunks is essentially a no-op, so avoid it here.
*/
- if (!chunk_entry->full_overwrite || io_info->op_type == H5D_IO_OP_READ) {
- H5FD_mpio_xfer_t xfer_mode; /* Parallel transfer for this request */
+ index_empty = FALSE;
+ if (io_info->dset->shared->dcpl_cache.fill.alloc_time == H5D_ALLOC_TIME_INCR)
+ if (H5D__chunk_index_empty(io_info->dset, &index_empty) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't determine if chunk index is empty")
- chunk_entry->chunk_states.new_chunk.length = chunk_entry->chunk_states.chunk_current.length;
+ if (!index_empty) {
+ /*
+ * Override the read buffer to point to the address of
+ * the first chunk data buffer being read into
+ */
+ if (base_read_buf)
+ coll_io_info.u.rbuf = base_read_buf;
- /* Currently, these chunk reads are done independently and will likely
- * cause issues with collective metadata reads enabled. In the future,
- * this should be refactored to use collective chunk reads - JTH */
+ /* Perform collective chunk read */
+ if (H5D__mpio_collective_filtered_chunk_common_io(chunk_list, chunk_list_num_entries, &coll_io_info,
+ type_info, mpi_size) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't finish collective filtered chunk read")
+ }
- /* Get the original state of parallel I/O transfer mode */
- if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
+ /*
+ * Iterate through all the read chunks, unfiltering them and scattering their
+ * data out to the application's read buffer.
+ */
+ for (i = 0; i < chunk_list_num_entries; i++) {
+ chunk_info = chunk_list[i].chunk_info;
+
+ /* Unfilter the chunk, unless we didn't read it from the file */
+ if (chunk_list[i].need_read && !chunk_list[i].skip_filter_pline) {
+ if (H5Z_pipeline(&io_info->dset->shared->dcpl_cache.pline, H5Z_FLAG_REVERSE,
+ &(chunk_list[i].index_info.filter_mask), err_detect, filter_cb,
+ (size_t *)&chunk_list[i].chunk_new.length, &chunk_list[i].chunk_buf_size,
+ &chunk_list[i].buf) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTFILTER, FAIL, "couldn't unfilter chunk for modifying")
+ }
+
+ /* Scatter the chunk data to the read buffer */
+ iter_nelmts = H5S_GET_SELECT_NPOINTS(chunk_info->fspace);
+
+ if (H5D_select_io_mem(io_info->u.rbuf, chunk_info->mspace, chunk_list[i].buf, chunk_info->fspace,
+ type_info->src_type_size, (size_t)iter_nelmts) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "couldn't copy chunk data to read buffer")
+ }
- /* Change the xfer_mode to independent for handling the I/O */
- if (H5CX_set_io_xfer_mode(H5FD_MPIO_INDEPENDENT) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode")
+done:
+ /* Free all resources used by entries in the chunk list */
+ for (i = 0; i < chunk_list_num_entries; i++) {
+ if (chunk_list[i].buf) {
+ H5MM_free(chunk_list[i].buf);
+ chunk_list[i].buf = NULL;
+ }
+ }
- if (H5F_shared_block_read(io_info->f_sh, H5FD_MEM_DRAW,
- chunk_entry->chunk_states.chunk_current.offset,
- chunk_entry->chunk_states.new_chunk.length, chunk_entry->buf) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "unable to read raw data chunk")
+ /* Release the fill buffer info, if it's been initialized */
+ if (fb_info_init && H5D__fill_term(&fb_info) < 0)
+ HDONE_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "Can't release fill buffer info")
+ if (fill_space && (H5S_close(fill_space) < 0))
+ HDONE_ERROR(H5E_DATASET, H5E_CLOSEERROR, FAIL, "can't close fill space")
- /* Return to the original I/O transfer mode setting */
- if (H5CX_set_io_xfer_mode(xfer_mode) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode")
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TIME_STOP(mpi_rank);
+ H5D_MPIO_TRACE_EXIT(mpi_rank);
+#endif
- if (H5Z_pipeline(&io_info->dset->shared->dcpl_cache.pline, H5Z_FLAG_REVERSE, &filter_mask, err_detect,
- filter_cb, (size_t *)&chunk_entry->chunk_states.new_chunk.length, &buf_size,
- &chunk_entry->buf) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTFILTER, FAIL, "couldn't unfilter chunk for modifying")
- } /* end if */
- else {
- chunk_entry->chunk_states.new_chunk.length = true_chunk_size;
- } /* end else */
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__mpio_collective_filtered_chunk_read() */
- /* Initialize iterator for memory selection */
- if (NULL == (mem_iter = (H5S_sel_iter_t *)H5MM_malloc(sizeof(H5S_sel_iter_t))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate memory iterator")
+/*-------------------------------------------------------------------------
+ * Function: H5D__mpio_collective_filtered_chunk_update
+ *
+ * Purpose: When performing a parallel write on a chunked dataset with
+ * filters applied, all ranks must update their owned chunks
+ * with their own modification data and data from other ranks.
+ * This routine is responsible for coordinating that process.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries,
+ H5D_filtered_collective_io_info_t *chunk_hash_table,
+ unsigned char **chunk_msg_bufs, int chunk_msg_bufs_len,
+ const H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
+ int mpi_rank, int mpi_size)
+{
+ H5D_fill_buf_info_t fb_info;
+ H5D_chunk_info_t * chunk_info = NULL;
+ H5S_sel_iter_t * sel_iter = NULL; /* Dataspace selection iterator for H5D__scatter_mem */
+ H5D_io_info_t coll_io_info;
+ H5Z_EDC_t err_detect; /* Error detection info */
+ H5Z_cb_t filter_cb; /* I/O filter callback function */
+ hsize_t file_chunk_size = 0;
+ hsize_t iter_nelmts; /* Number of points to iterate over for the chunk IO operation */
+ hbool_t should_fill = FALSE;
+ hbool_t fb_info_init = FALSE;
+ hbool_t sel_iter_init = FALSE;
+ hbool_t index_empty = FALSE;
+ size_t i;
+ H5S_t * dataspace = NULL;
+ H5S_t * fill_space = NULL;
+ void * base_read_buf = NULL;
+ herr_t ret_value = SUCCEED;
- if (H5S_select_iter_init(mem_iter, chunk_info->mspace, type_info->src_type_size, 0) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to initialize memory selection information")
- mem_iter_init = TRUE;
+ FUNC_ENTER_STATIC
- /* If this is a read operation, scatter the read chunk data to the user's buffer.
- *
- * If this is a write operation, update the chunk data buffer with the modifications
- * from the current process, then apply any modifications from other processes. Finally,
- * filter the newly-updated chunk.
- */
- switch (io_info->op_type) {
- case H5D_IO_OP_READ:
- if (NULL == (file_iter = (H5S_sel_iter_t *)H5MM_malloc(sizeof(H5S_sel_iter_t))))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate file iterator")
+ HDassert(chunk_list || 0 == chunk_list_num_entries);
+ HDassert((chunk_msg_bufs && chunk_hash_table) || 0 == chunk_msg_bufs_len);
+ HDassert(io_info);
+ HDassert(type_info);
- if (H5S_select_iter_init(file_iter, chunk_info->fspace, type_info->src_type_size, 0) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL,
- "unable to initialize memory selection information")
- file_iter_init = TRUE;
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TRACE_ENTER(mpi_rank);
+ H5D_MPIO_TIME_START(mpi_rank, "Filtered collective chunk update");
+#endif
- iter_nelmts = H5S_GET_SELECT_NPOINTS(chunk_info->fspace);
+ if (chunk_list_num_entries) {
+ /* Retrieve filter settings from API context */
+ if (H5CX_get_err_detect(&err_detect) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get error detection info")
+ if (H5CX_get_filter_cb(&filter_cb) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get I/O filter callback function")
- if (NULL == (tmp_gath_buf = H5MM_malloc(iter_nelmts * type_info->src_type_size)))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate temporary gather buffer")
+ /* Set size of full chunks in dataset */
+ file_chunk_size = io_info->dset->shared->layout.u.chunk.size;
- if (!H5D__gather_mem(chunk_entry->buf, file_iter, (size_t)iter_nelmts, tmp_gath_buf))
- HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't gather from chunk buffer")
+ /* Determine if fill values should be written to chunks */
+ should_fill = (io_info->dset->shared->dcpl_cache.fill.fill_time == H5D_FILL_TIME_ALLOC) ||
+ ((io_info->dset->shared->dcpl_cache.fill.fill_time == H5D_FILL_TIME_IFSET) &&
+ io_info->dset->shared->dcpl_cache.fill.fill_defined);
+ }
- iter_nelmts = H5S_GET_SELECT_NPOINTS(chunk_info->mspace);
+ /*
+ * Allocate memory buffers for all owned chunks. Each chunk data buffer is sized
+ * to the larger of the chunk's current filtered size and the chunk's true
+ * size, as calculated by the number of elements in the chunk's file space extent
+ * multiplied by the datatype size. This tries to ensure that:
+ *
+ * * If we're fully overwriting the chunk and the filter normally reduces the
+ * chunk size, we simply have the exact buffer size required to hold the
+ * unfiltered chunk data.
+ * * If we're fully overwriting the chunk and the filter normally grows the
+ * chunk size (e.g., fletcher32 filter), the final filtering operation
+ * (hopefully) won't need to grow the buffer.
+ * * If we're reading the chunk and the filter normally reduces the chunk size,
+ * the unfiltering operation won't need to grow the buffer.
+ * * If we're reading the chunk and the filter normally grows the chunk size,
+ * we make sure to read into a buffer of size equal to the filtered chunk's
+ * size; reading into a (smaller) buffer of size equal to the unfiltered
+ * chunk size would of course be bad.
+ */
+ for (i = 0; i < chunk_list_num_entries; i++) {
+ HDassert(mpi_rank == chunk_list[i].new_owner);
+
+ chunk_list[i].chunk_buf_size = MAX(chunk_list[i].chunk_current.length, file_chunk_size);
- if (H5D__scatter_mem(tmp_gath_buf, mem_iter, (size_t)iter_nelmts, io_info->u.rbuf) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "couldn't scatter to read buffer")
+ /*
+ * If this chunk hasn't been allocated yet and we aren't writing
+ * out fill values to it, make sure to 0-fill its memory buffer
+ * so we don't use uninitialized memory.
+ */
+ if (!H5F_addr_defined(chunk_list[i].chunk_current.offset) && !should_fill)
+ chunk_list[i].buf = H5MM_calloc(chunk_list[i].chunk_buf_size);
+ else
+ chunk_list[i].buf = H5MM_malloc(chunk_list[i].chunk_buf_size);
+ if (NULL == chunk_list[i].buf) {
+ /* Push an error, but still participate in the collective read below */
+ HDONE_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk data buffer")
break;
+ }
+
+ /* Set chunk's new length for eventual filter pipeline calls */
+ if (chunk_list[i].need_read) {
+ /*
+ * Check if chunk is currently allocated. If not, don't try to
+ * read it from the file. Instead, just fill the chunk buffer
+ * with the fill value if fill values are to be written.
+ */
+ if (H5F_addr_defined(chunk_list[i].chunk_current.offset)) {
+ /* Set first read buffer */
+ if (!base_read_buf)
+ base_read_buf = chunk_list[i].buf;
+
+ /* Set chunk's new length for eventual filter pipeline calls */
+ if (chunk_list[i].skip_filter_pline)
+ chunk_list[i].chunk_new.length = file_chunk_size;
+ else
+ chunk_list[i].chunk_new.length = chunk_list[i].chunk_current.length;
+ }
+ else {
+ chunk_list[i].need_read = FALSE;
+
+ /* Set chunk's new length for eventual filter pipeline calls */
+ chunk_list[i].chunk_new.length = file_chunk_size;
+
+ if (should_fill) {
+ /* Initialize fill value buffer if not already initialized */
+ if (!fb_info_init) {
+ hsize_t chunk_dims[H5S_MAX_RANK];
+
+ HDassert(io_info->dset->shared->ndims ==
+ io_info->dset->shared->layout.u.chunk.ndims - 1);
+ for (size_t j = 0; j < io_info->dset->shared->layout.u.chunk.ndims - 1; j++)
+ chunk_dims[j] = (hsize_t)io_info->dset->shared->layout.u.chunk.dim[j];
+
+ /* Get a dataspace for filling chunk memory buffers */
+ if (NULL == (fill_space = H5S_create_simple(
+ io_info->dset->shared->layout.u.chunk.ndims - 1, chunk_dims, NULL)))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL,
+ "unable to create chunk fill dataspace")
+
+ /* Initialize fill value buffer */
+ if (H5D__fill_init(&fb_info, NULL, (H5MM_allocate_t)H5D__chunk_mem_alloc,
+ (void *)&io_info->dset->shared->dcpl_cache.pline,
+ (H5MM_free_t)H5D__chunk_mem_free,
+ (void *)&io_info->dset->shared->dcpl_cache.pline,
+ &io_info->dset->shared->dcpl_cache.fill,
+ io_info->dset->shared->type, io_info->dset->shared->type_id, 0,
+ file_chunk_size) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "can't initialize fill value buffer")
+
+ fb_info_init = TRUE;
+ }
+
+ /* Write fill value to memory buffer */
+ HDassert(fb_info.fill_buf);
+ if (H5D__fill(fb_info.fill_buf, io_info->dset->shared->type, chunk_list[i].buf,
+ type_info->mem_type, fill_space) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL,
+ "couldn't fill chunk buffer with fill value")
+ }
+ }
+ }
+ else
+ chunk_list[i].chunk_new.length = file_chunk_size;
+ }
- case H5D_IO_OP_WRITE:
- iter_nelmts = H5S_GET_SELECT_NPOINTS(chunk_info->mspace);
+ /*
+ * If dataset is incrementally allocated and hasn't been written to
+ * yet, the chunk index should be empty. In this case, a collective
+ * read of chunks is essentially a no-op, so avoid it here.
+ */
+ index_empty = FALSE;
+ if (io_info->dset->shared->dcpl_cache.fill.alloc_time == H5D_ALLOC_TIME_INCR)
+ if (H5D__chunk_index_empty(io_info->dset, &index_empty) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't determine if chunk index is empty")
- if (NULL == (tmp_gath_buf = H5MM_malloc(iter_nelmts * type_info->src_type_size)))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate temporary gather buffer")
+ if (!index_empty) {
+ /*
+ * Setup for I/O operation
+ */
- /* Gather modification data from the application write buffer into a temporary buffer */
- if (0 == H5D__gather_mem(io_info->u.wbuf, mem_iter, (size_t)iter_nelmts, tmp_gath_buf))
- HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "couldn't gather from write buffer")
+ /* Initialize temporary I/O info */
+ coll_io_info = *io_info;
+ coll_io_info.op_type = H5D_IO_OP_READ;
- if (H5S_SELECT_ITER_RELEASE(mem_iter) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release selection iterator")
- mem_iter_init = FALSE;
+ /* Override the read buffer to point to the address of the first
+ * chunk data buffer being read into
+ */
+ if (base_read_buf)
+ coll_io_info.u.rbuf = base_read_buf;
- /* Initialize iterator for file selection */
- if (H5S_select_iter_init(mem_iter, chunk_info->fspace, type_info->dst_type_size, 0) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL,
- "unable to initialize file selection information")
- mem_iter_init = TRUE;
+ /* Read all chunks that need to be read from the file */
+ if (H5D__mpio_collective_filtered_chunk_common_io(chunk_list, chunk_list_num_entries, &coll_io_info,
+ type_info, mpi_size) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't finish collective filtered chunk read")
+ }
- iter_nelmts = H5S_GET_SELECT_NPOINTS(chunk_info->fspace);
+ /*
+ * Now that all owned chunks have been read, update the chunks
+ * with modification data from the owning rank and other ranks.
+ */
- /* Scatter the owner's modification data into the chunk data buffer according to
- * the file space.
- */
- if (H5D__scatter_mem(tmp_gath_buf, mem_iter, (size_t)iter_nelmts, chunk_entry->buf) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "couldn't scatter to chunk data buffer")
+ /* Process all chunks with data from the owning rank first */
+ for (i = 0; i < chunk_list_num_entries; i++) {
+ HDassert(mpi_rank == chunk_list[i].new_owner);
- if (H5S_SELECT_ITER_RELEASE(mem_iter) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release selection iterator")
- mem_iter_init = FALSE;
+ chunk_info = chunk_list[i].chunk_info;
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Waitall(chunk_entry->async_info.num_receive_requests,
- chunk_entry->async_info.receive_requests_array, MPI_STATUSES_IGNORE)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Waitall failed", mpi_code)
+ /*
+ * If this chunk wasn't being fully overwritten, we read it from
+ * the file, so we need to unfilter it
+ */
+ if (chunk_list[i].need_read && !chunk_list[i].skip_filter_pline) {
+ if (H5Z_pipeline(&io_info->dset->shared->dcpl_cache.pline, H5Z_FLAG_REVERSE,
+ &(chunk_list[i].index_info.filter_mask), err_detect, filter_cb,
+ (size_t *)&chunk_list[i].chunk_new.length, &chunk_list[i].chunk_buf_size,
+ &chunk_list[i].buf) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTFILTER, FAIL, "couldn't unfilter chunk for modifying")
+ }
+
+ iter_nelmts = H5S_GET_SELECT_NPOINTS(chunk_info->mspace);
+
+ if (H5D_select_io_mem(chunk_list[i].buf, chunk_info->fspace, io_info->u.wbuf, chunk_info->mspace,
+ type_info->dst_type_size, (size_t)iter_nelmts) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't copy chunk data to write buffer")
+ }
- /* For each asynchronous receive call previously posted, receive the chunk modification
- * buffer from another rank and update the chunk data
- */
- for (i = 0; i < (size_t)chunk_entry->async_info.num_receive_requests; i++) {
- const unsigned char *mod_data_p;
+ /* Allocate iterator for memory selection */
+ if (NULL == (sel_iter = H5FL_MALLOC(H5S_sel_iter_t)))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate memory iterator")
- /* Decode the process' chunk file dataspace */
- mod_data_p = chunk_entry->async_info.receive_buffer_array[i];
- if (NULL == (dataspace = H5S_decode(&mod_data_p)))
+ /* Now process all received chunk message buffers */
+ for (i = 0; i < (size_t)chunk_msg_bufs_len; i++) {
+ H5D_filtered_collective_io_info_t *chunk_entry = NULL;
+ const unsigned char * msg_ptr = chunk_msg_bufs[i];
+ hsize_t chunk_idx;
+
+ if (msg_ptr) {
+ /* Retrieve the chunk's index value */
+ HDmemcpy(&chunk_idx, msg_ptr, sizeof(hsize_t));
+ msg_ptr += sizeof(hsize_t);
+
+ /* Find the chunk entry according to its chunk index */
+ HASH_FIND(hh, chunk_hash_table, &chunk_idx, sizeof(hsize_t), chunk_entry);
+ HDassert(chunk_entry);
+ HDassert(mpi_rank == chunk_entry->new_owner);
+
+ /*
+ * Only process the chunk if its data buffer is allocated.
+ * In the case of multi-chunk I/O, we're only working on
+ * one chunk at a time, so we need to skip over messages
+ * that aren't for the chunk we're currently working on.
+ */
+ if (!chunk_entry->buf)
+ continue;
+ else {
+ /* Decode the chunk file dataspace from the message */
+ if (NULL == (dataspace = H5S_decode(&msg_ptr)))
HGOTO_ERROR(H5E_DATASET, H5E_CANTDECODE, FAIL, "unable to decode dataspace")
- if (H5S_select_iter_init(mem_iter, dataspace, type_info->dst_type_size, 0) < 0)
+ if (H5S_select_iter_init(sel_iter, dataspace, type_info->dst_type_size,
+ H5S_SEL_ITER_SHARE_WITH_DATASPACE) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL,
"unable to initialize memory selection information")
- mem_iter_init = TRUE;
+ sel_iter_init = TRUE;
iter_nelmts = H5S_GET_SELECT_NPOINTS(dataspace);
/* Update the chunk data with the received modification data */
- if (H5D__scatter_mem(mod_data_p, mem_iter, (size_t)iter_nelmts, chunk_entry->buf) < 0)
+ if (H5D__scatter_mem(msg_ptr, sel_iter, (size_t)iter_nelmts, chunk_entry->buf) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't scatter to write buffer")
- if (H5S_SELECT_ITER_RELEASE(mem_iter) < 0)
+ if (H5S_SELECT_ITER_RELEASE(sel_iter) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release selection iterator")
- mem_iter_init = FALSE;
+ sel_iter_init = FALSE;
+
if (dataspace) {
if (H5S_close(dataspace) < 0)
HGOTO_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "can't close dataspace")
dataspace = NULL;
}
- H5MM_free(chunk_entry->async_info.receive_buffer_array[i]);
- } /* end for */
- /* Filter the chunk */
- if (H5Z_pipeline(&io_info->dset->shared->dcpl_cache.pline, 0, &filter_mask, err_detect, filter_cb,
- (size_t *)&chunk_entry->chunk_states.new_chunk.length, &buf_size,
- &chunk_entry->buf) < 0)
+ H5MM_free(chunk_msg_bufs[i]);
+ chunk_msg_bufs[i] = NULL;
+ }
+ }
+ }
+
+ /* Finally, filter all the chunks */
+ for (i = 0; i < chunk_list_num_entries; i++) {
+ if (!chunk_list[i].skip_filter_pline) {
+ if (H5Z_pipeline(&io_info->dset->shared->dcpl_cache.pline, 0,
+ &(chunk_list[i].index_info.filter_mask), err_detect, filter_cb,
+ (size_t *)&chunk_list[i].chunk_new.length, &chunk_list[i].chunk_buf_size,
+ &chunk_list[i].buf) < 0)
HGOTO_ERROR(H5E_PLINE, H5E_CANTFILTER, FAIL, "output pipeline failed")
+ }
#if H5_SIZEOF_SIZE_T > 4
- /* Check for the chunk expanding too much to encode in a 32-bit value */
- if (chunk_entry->chunk_states.new_chunk.length > ((size_t)0xffffffff))
- HGOTO_ERROR(H5E_DATASET, H5E_BADRANGE, FAIL, "chunk too large for 32-bit length")
+ /* Check for the chunk expanding too much to encode in a 32-bit value */
+ if (chunk_list[i].chunk_new.length > ((size_t)0xffffffff))
+ HGOTO_ERROR(H5E_DATASET, H5E_BADRANGE, FAIL, "chunk too large for 32-bit length")
#endif
- break;
+ }
- default:
- HGOTO_ERROR(H5E_DATASET, H5E_BADVALUE, FAIL, "invalid I/O operation")
- } /* end switch */
+done:
+ if (sel_iter) {
+ if (sel_iter_init && H5S_SELECT_ITER_RELEASE(sel_iter) < 0)
+ HDONE_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release selection iterator")
+ sel_iter = H5FL_FREE(H5S_sel_iter_t, sel_iter);
+ }
+ if (dataspace && (H5S_close(dataspace) < 0))
+ HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "can't close dataspace")
+ if (fill_space && (H5S_close(fill_space) < 0))
+ HDONE_ERROR(H5E_DATASET, H5E_CLOSEERROR, FAIL, "can't close fill space")
+
+ /* Release the fill buffer info, if it's been initialized */
+ if (fb_info_init && H5D__fill_term(&fb_info) < 0)
+ HDONE_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "Can't release fill buffer info")
+
+ /* On failure, try to free all resources used by entries in the chunk list */
+ if (ret_value < 0) {
+ for (i = 0; i < chunk_list_num_entries; i++) {
+ if (chunk_list[i].buf) {
+ H5MM_free(chunk_list[i].buf);
+ chunk_list[i].buf = NULL;
+ }
+ }
+ }
+
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TIME_STOP(mpi_rank);
+ H5D_MPIO_TRACE_EXIT(mpi_rank);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__mpio_collective_filtered_chunk_update() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D__mpio_collective_filtered_chunk_reallocate
+ *
+ * Purpose: When performing a parallel write on a chunked dataset with
+ * filters applied, all ranks must eventually get together and
+ * perform a collective reallocation of space in the file for
+ * all chunks that were modified on all ranks. This routine is
+ * responsible for coordinating that process.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D__mpio_collective_filtered_chunk_reallocate(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries, size_t *num_chunks_assigned_map,
+ H5D_io_info_t *io_info, H5D_chk_idx_info_t *idx_info,
+ int mpi_rank, int mpi_size)
+{
+ H5D_chunk_alloc_info_t *collective_list = NULL;
+ MPI_Datatype send_type;
+ MPI_Datatype recv_type;
+ hbool_t send_type_derived = FALSE;
+ hbool_t recv_type_derived = FALSE;
+ hbool_t need_sort = FALSE;
+ size_t collective_num_entries = 0;
+ size_t num_local_chunks_processed = 0;
+ size_t i;
+ void * gathered_array = NULL;
+ int * counts_disps_array = NULL;
+ int * counts_ptr = NULL;
+ int * displacements_ptr = NULL;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+ HDassert(chunk_list || 0 == chunk_list_num_entries);
+ HDassert(io_info);
+ HDassert(idx_info);
+ HDassert(idx_info->storage->idx_type != H5D_CHUNK_IDX_NONE);
+
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TRACE_ENTER(mpi_rank);
+ H5D_MPIO_TIME_START(mpi_rank, "Reallocation of chunk file space");
+#endif
+
+ /*
+ * Make sure it's safe to cast this rank's number
+ * of chunks to be sent into an int for MPI
+ */
+ H5_CHECK_OVERFLOW(chunk_list_num_entries, size_t, int);
+
+ /* Create derived datatypes for the chunk file space info needed */
+ if (H5D__mpio_get_chunk_alloc_info_types(&recv_type, &recv_type_derived, &send_type, &send_type_derived) <
+ 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL,
+ "can't create derived datatypes for chunk file space info")
+
+ /*
+ * Gather the new chunk sizes to all ranks for a collective reallocation
+ * of the chunks in the file.
+ */
+ if (num_chunks_assigned_map) {
+ /*
+ * If a mapping between rank value -> number of assigned chunks has
+ * been provided (usually during linked-chunk I/O), we can use this
+ * to optimize MPI overhead a bit since MPI ranks won't need to
+ * first inform each other about how many chunks they're contributing.
+ */
+ if (NULL == (counts_disps_array = H5MM_malloc(2 * (size_t)mpi_size * sizeof(*counts_disps_array)))) {
+ /* Push an error, but still participate in collective gather operation */
+ HDONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate receive counts and displacements array")
+ }
+ else {
+ /* Set the receive counts from the assigned chunks map */
+ counts_ptr = counts_disps_array;
+
+ for (i = 0; i < (size_t)mpi_size; i++)
+ H5_CHECKED_ASSIGN(counts_ptr[i], int, num_chunks_assigned_map[i], size_t);
+
+ /* Set the displacements into the receive buffer for the gather operation */
+ displacements_ptr = &counts_disps_array[mpi_size];
+
+ *displacements_ptr = 0;
+ for (i = 1; i < (size_t)mpi_size; i++)
+ displacements_ptr[i] = displacements_ptr[i - 1] + counts_ptr[i - 1];
+ }
+
+ /* Perform gather operation */
+ if (H5_mpio_gatherv_alloc(chunk_list, (int)chunk_list_num_entries, send_type, counts_ptr,
+ displacements_ptr, recv_type, TRUE, 0, io_info->comm, mpi_rank, mpi_size,
+ &gathered_array, &collective_num_entries) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGATHER, FAIL, "can't gather chunk file space info to/from ranks")
+ }
+ else {
+ /*
+ * If no mapping between rank value -> number of assigned chunks has
+ * been provided (usually during multi-chunk I/O), all MPI ranks will
+ * need to first inform other ranks about how many chunks they're
+ * contributing before performing the actual gather operation. Use
+ * the 'simple' MPI_Allgatherv wrapper for this.
+ */
+ if (H5_mpio_gatherv_alloc_simple(chunk_list, (int)chunk_list_num_entries, send_type, recv_type, TRUE,
+ 0, io_info->comm, mpi_rank, mpi_size, &gathered_array,
+ &collective_num_entries) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGATHER, FAIL, "can't gather chunk file space info to/from ranks")
+ }
+
+ /* Collectively re-allocate the modified chunks (from each rank) in the file */
+ collective_list = (H5D_chunk_alloc_info_t *)gathered_array;
+ for (i = 0, num_local_chunks_processed = 0; i < collective_num_entries; i++) {
+ H5D_chunk_alloc_info_t *coll_entry = &collective_list[i];
+ hbool_t need_insert;
+ hbool_t update_local_chunk;
+
+ if (H5D__chunk_file_alloc(idx_info, &coll_entry->chunk_current, &coll_entry->chunk_new, &need_insert,
+ NULL) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "unable to allocate chunk")
+
+ /*
+ * If we just re-allocated a chunk that is local to this
+ * rank, make sure to update the chunk entry in the local
+ * chunk list
+ */
+ update_local_chunk =
+ (num_local_chunks_processed < chunk_list_num_entries) &&
+ (coll_entry->chunk_idx == chunk_list[num_local_chunks_processed].index_info.chunk_idx);
+
+ if (update_local_chunk) {
+ H5D_filtered_collective_io_info_t *local_chunk;
+
+ local_chunk = &chunk_list[num_local_chunks_processed];
+
+ /* Sanity check that this chunk is actually local */
+ HDassert(mpi_rank == local_chunk->orig_owner);
+ HDassert(mpi_rank == local_chunk->new_owner);
+
+ local_chunk->chunk_new = coll_entry->chunk_new;
+ local_chunk->index_info.need_insert = need_insert;
+
+ /*
+ * Since chunk reallocation can move chunks around, check if
+ * the local chunk list is still in ascending order of offset
+ * in the file
+ */
+ if (num_local_chunks_processed) {
+ haddr_t curr_chunk_offset = local_chunk->chunk_new.offset;
+ haddr_t prev_chunk_offset = chunk_list[num_local_chunks_processed - 1].chunk_new.offset;
+
+ HDassert(H5F_addr_defined(prev_chunk_offset) && H5F_addr_defined(curr_chunk_offset));
+ if (curr_chunk_offset < prev_chunk_offset)
+ need_sort = TRUE;
+ }
+
+ num_local_chunks_processed++;
+ }
+ }
+
+ HDassert(chunk_list_num_entries == num_local_chunks_processed);
+
+ /*
+ * Ensure this rank's local chunk list is sorted in
+ * ascending order of offset in the file
+ */
+ if (need_sort)
+ HDqsort(chunk_list, chunk_list_num_entries, sizeof(H5D_filtered_collective_io_info_t),
+ H5D__cmp_filtered_collective_io_info_entry);
+
+done:
+ H5MM_free(gathered_array);
+ H5MM_free(counts_disps_array);
+
+ if (send_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&send_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ }
+ if (recv_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&recv_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ }
+
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TIME_STOP(mpi_rank);
+ H5D_MPIO_TRACE_EXIT(mpi_rank);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__mpio_collective_filtered_chunk_reallocate() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D__mpio_collective_filtered_chunk_reinsert
+ *
+ * Purpose: When performing a parallel write on a chunked dataset with
+ * filters applied, all ranks must eventually get together and
+ * perform a collective reinsertion into the dataset's chunk
+ * index of chunks that were modified. This routine is
+ * responsible for coordinating that process.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D__mpio_collective_filtered_chunk_reinsert(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries, size_t *num_chunks_assigned_map,
+ H5D_io_info_t *io_info, H5D_chk_idx_info_t *idx_info,
+ int mpi_rank, int mpi_size)
+{
+ H5D_chunk_ud_t chunk_ud;
+ MPI_Datatype send_type;
+ MPI_Datatype recv_type;
+ hbool_t send_type_derived = FALSE;
+ hbool_t recv_type_derived = FALSE;
+ hsize_t scaled_coords[H5O_LAYOUT_NDIMS];
+ size_t collective_num_entries = 0;
+ size_t i;
+ void * gathered_array = NULL;
+ int * counts_disps_array = NULL;
+ int * counts_ptr = NULL;
+ int * displacements_ptr = NULL;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+ HDassert(chunk_list || 0 == chunk_list_num_entries);
+ HDassert(io_info);
+ HDassert(idx_info);
+
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TRACE_ENTER(mpi_rank);
+ H5D_MPIO_TIME_START(mpi_rank, "Reinsertion of modified chunks into chunk index");
+#endif
+
+ /* Only re-insert chunks if index has an insert method */
+ if (!idx_info->storage->ops->insert)
+ HGOTO_DONE(SUCCEED);
+
+ /*
+ * Make sure it's safe to cast this rank's number
+ * of chunks to be sent into an int for MPI
+ */
+ H5_CHECK_OVERFLOW(chunk_list_num_entries, size_t, int);
+
+ /* Create derived datatypes for the chunk re-insertion info needed */
+ if (H5D__mpio_get_chunk_insert_info_types(&recv_type, &recv_type_derived, &send_type,
+ &send_type_derived) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL,
+ "can't create derived datatypes for chunk re-insertion info")
+
+ /*
+ * Gather information to all ranks for a collective re-insertion
+ * of the modified chunks into the chunk index
+ */
+ if (num_chunks_assigned_map) {
+ /*
+ * If a mapping between rank value -> number of assigned chunks has
+ * been provided (usually during linked-chunk I/O), we can use this
+ * to optimize MPI overhead a bit since MPI ranks won't need to
+ * first inform each other about how many chunks they're contributing.
+ */
+ if (NULL == (counts_disps_array = H5MM_malloc(2 * (size_t)mpi_size * sizeof(*counts_disps_array)))) {
+ /* Push an error, but still participate in collective gather operation */
+ HDONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate receive counts and displacements array")
+ }
+ else {
+ /* Set the receive counts from the assigned chunks map */
+ counts_ptr = counts_disps_array;
+
+ for (i = 0; i < (size_t)mpi_size; i++)
+ H5_CHECKED_ASSIGN(counts_ptr[i], int, num_chunks_assigned_map[i], size_t);
+
+ /* Set the displacements into the receive buffer for the gather operation */
+ displacements_ptr = &counts_disps_array[mpi_size];
+
+ *displacements_ptr = 0;
+ for (i = 1; i < (size_t)mpi_size; i++)
+ displacements_ptr[i] = displacements_ptr[i - 1] + counts_ptr[i - 1];
+ }
+
+ /* Perform gather operation */
+ if (H5_mpio_gatherv_alloc(chunk_list, (int)chunk_list_num_entries, send_type, counts_ptr,
+ displacements_ptr, recv_type, TRUE, 0, io_info->comm, mpi_rank, mpi_size,
+ &gathered_array, &collective_num_entries) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGATHER, FAIL,
+ "can't gather chunk index re-insertion info to/from ranks")
+ }
+ else {
+ /*
+ * If no mapping between rank value -> number of assigned chunks has
+ * been provided (usually during multi-chunk I/O), all MPI ranks will
+ * need to first inform other ranks about how many chunks they're
+ * contributing before performing the actual gather operation. Use
+ * the 'simple' MPI_Allgatherv wrapper for this.
+ */
+ if (H5_mpio_gatherv_alloc_simple(chunk_list, (int)chunk_list_num_entries, send_type, recv_type, TRUE,
+ 0, io_info->comm, mpi_rank, mpi_size, &gathered_array,
+ &collective_num_entries) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGATHER, FAIL,
+ "can't gather chunk index re-insertion info to/from ranks")
+ }
+
+ /* Initialize static chunk udata fields from chunk index info */
+ H5D_MPIO_INIT_CHUNK_UD_INFO(chunk_ud, idx_info);
+
+ for (i = 0; i < collective_num_entries; i++) {
+ H5D_chunk_insert_info_t *coll_entry = &((H5D_chunk_insert_info_t *)gathered_array)[i];
+
+ /*
+ * We only need to reinsert this chunk if we had to actually
+ * allocate or reallocate space in the file for it
+ */
+ if (!coll_entry->index_info.need_insert)
+ continue;
+
+ chunk_ud.chunk_block = coll_entry->chunk_block;
+ chunk_ud.chunk_idx = coll_entry->index_info.chunk_idx;
+ chunk_ud.filter_mask = coll_entry->index_info.filter_mask;
+ chunk_ud.common.scaled = scaled_coords;
+
+ /* Calculate scaled coordinates for the chunk */
+ if (idx_info->layout->idx_type == H5D_CHUNK_IDX_EARRAY && idx_info->layout->u.earray.unlim_dim > 0) {
+ /*
+ * Extensible arrays where the unlimited dimension is not
+ * the slowest-changing dimension "swizzle" the coordinates
+ * to move the unlimited dimension value to offset 0. Therefore,
+ * we use the "swizzled" down chunks to calculate the "swizzled"
+ * scaled coordinates and then we undo the "swizzle" operation.
+ *
+ * TODO: In the future, this is something that should be handled
+ * by the particular chunk index rather than manually
+ * here. Likely, the chunk index ops should get a new
+ * callback that accepts a chunk index and provides the
+ * caller with the scaled coordinates for that chunk.
+ */
+ H5VM_array_calc_pre(chunk_ud.chunk_idx, io_info->dset->shared->ndims,
+ idx_info->layout->u.earray.swizzled_down_chunks, scaled_coords);
+
+ H5VM_unswizzle_coords(hsize_t, scaled_coords, idx_info->layout->u.earray.unlim_dim);
+ }
+ else {
+ H5VM_array_calc_pre(chunk_ud.chunk_idx, io_info->dset->shared->ndims,
+ io_info->dset->shared->layout.u.chunk.down_chunks, scaled_coords);
+ }
+
+ scaled_coords[io_info->dset->shared->ndims] = 0;
+
+#ifndef NDEBUG
+ /*
+ * If a matching local chunk entry is found, the
+ * `chunk_info` structure (which contains the chunk's
+ * pre-computed scaled coordinates) will be valid
+ * for this rank. Compare those coordinates against
+ * the calculated coordinates above to make sure
+ * they match.
+ */
+ for (size_t dbg_idx = 0; dbg_idx < chunk_list_num_entries; dbg_idx++) {
+ if (coll_entry->index_info.chunk_idx == chunk_list[dbg_idx].index_info.chunk_idx) {
+ hbool_t coords_match = !HDmemcmp(scaled_coords, chunk_list[dbg_idx].chunk_info->scaled,
+ io_info->dset->shared->ndims * sizeof(hsize_t));
+
+ HDassert(coords_match && "Calculated scaled coordinates for chunk didn't match "
+ "chunk's actual scaled coordinates!");
+ break;
+ }
+ }
+#endif
+
+ if ((idx_info->storage->ops->insert)(idx_info, &chunk_ud, io_info->dset) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTINSERT, FAIL, "unable to insert chunk address into index")
+ }
+
+done:
+ H5MM_free(gathered_array);
+ H5MM_free(counts_disps_array);
+
+ if (send_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&send_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ }
+ if (recv_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&recv_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ }
+
+#ifdef H5Dmpio_DEBUG
+ H5D_MPIO_TIME_STOP(mpi_rank);
+ H5D_MPIO_TRACE_EXIT(mpi_rank);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__mpio_collective_filtered_chunk_reinsert() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D__mpio_get_chunk_redistribute_info_types
+ *
+ * Purpose: Constructs MPI derived datatypes for communicating the
+ * info from a H5D_filtered_collective_io_info_t structure
+ * that is necessary for redistributing shared chunks during a
+ * collective write of filtered chunks.
+ *
+ * Both datatypes describe the same five fields in the same
+ * order: the chunk's H5F_block_t, the chunk index (sent as
+ * HSIZE_AS_MPI_TYPE) and the orig_owner, new_owner and
+ * num_writers values (each sent as MPI_INT). They differ
+ * only in the displacements used to locate those fields.
+ *
+ * The datatype returned through `contig_type` has an extent
+ * equal to the size of an H5D_chunk_redistribute_info_t
+ * structure and is suitable for communicating that structure
+ * type.
+ * NOTE(review): `contig_type` is the direct result of
+ * MPI_Type_create_struct and is not explicitly resized here,
+ * so the stated extent relies on MPI deriving it from the
+ * field layout - confirm.
+ *
+ * The datatype returned through `resized_type` has an extent
+ * equal to the size of an H5D_filtered_collective_io_info_t
+ * structure. This makes it suitable for sending an array of
+ * those structures, while extracting out just the info
+ * necessary for the chunk redistribution operation during
+ * communication.
+ *
+ * Parameters: contig_type - OUT: committed type using field offsets
+ * within H5D_chunk_redistribute_info_t
+ * contig_type_derived - OUT: set to TRUE once `contig_type`
+ * has been created; FALSE otherwise
+ * resized_type - OUT: committed type using field offsets
+ * within H5D_filtered_collective_io_info_t (the
+ * H5F_block_t read is `chunk_current`), resized to
+ * sizeof(H5D_filtered_collective_io_info_t)
+ * resized_type_derived - OUT: set to TRUE once
+ * `resized_type` has been created; FALSE otherwise
+ *
+ * All four pointers must be non-NULL (asserted).
+ *
+ * On failure, any output type that was already created is
+ * freed here and its `*_derived` flag is reset to FALSE. The
+ * intermediate struct and file block types are released
+ * before returning, whether or not the call succeeded.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D__mpio_get_chunk_redistribute_info_types(MPI_Datatype *contig_type, hbool_t *contig_type_derived,
+ MPI_Datatype *resized_type, hbool_t *resized_type_derived)
+{
+ MPI_Datatype struct_type = MPI_DATATYPE_NULL;
+ hbool_t struct_type_derived = FALSE;
+ MPI_Datatype chunk_block_type = MPI_DATATYPE_NULL;
+ hbool_t chunk_block_type_derived = FALSE;
+ MPI_Datatype types[5];
+ MPI_Aint displacements[5];
+ int block_lengths[5];
+ int field_count;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+ HDassert(contig_type);
+ HDassert(contig_type_derived);
+ HDassert(resized_type);
+ HDassert(resized_type_derived);
+
+ *contig_type_derived = FALSE;
+ *resized_type_derived = FALSE;
+
+ /* Create struct type for the inner H5F_block_t structure */
+ if (H5F_mpi_get_file_block_type(FALSE, &chunk_block_type, &chunk_block_type_derived) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't create derived type for chunk file description")
+
+ field_count = 5;
+ HDassert(field_count == (sizeof(types) / sizeof(MPI_Datatype)));
+
+ /*
+ * Create structure type to pack chunk H5F_block_t structure
+ * next to chunk_idx, orig_owner, new_owner and num_writers
+ * fields
+ */
+ block_lengths[0] = 1;
+ block_lengths[1] = 1;
+ block_lengths[2] = 1;
+ block_lengths[3] = 1;
+ block_lengths[4] = 1;
+ displacements[0] = offsetof(H5D_chunk_redistribute_info_t, chunk_block);
+ displacements[1] = offsetof(H5D_chunk_redistribute_info_t, chunk_idx);
+ displacements[2] = offsetof(H5D_chunk_redistribute_info_t, orig_owner);
+ displacements[3] = offsetof(H5D_chunk_redistribute_info_t, new_owner);
+ displacements[4] = offsetof(H5D_chunk_redistribute_info_t, num_writers);
+ types[0] = chunk_block_type;
+ types[1] = HSIZE_AS_MPI_TYPE;
+ types[2] = MPI_INT;
+ types[3] = MPI_INT;
+ types[4] = MPI_INT;
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Type_create_struct(field_count, block_lengths, displacements, types, contig_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code)
+ *contig_type_derived = TRUE; /* From here on, the failure path below frees contig_type */
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(contig_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+ /* Create struct type to extract the chunk_current, chunk_idx, orig_owner,
+ * new_owner and num_writers fields from a H5D_filtered_collective_io_info_t
+ * structure. Same field order and MPI types as above; only the
+ * displacements differ.
+ */
+ block_lengths[0] = 1;
+ block_lengths[1] = 1;
+ block_lengths[2] = 1;
+ block_lengths[3] = 1;
+ block_lengths[4] = 1;
+ displacements[0] = offsetof(H5D_filtered_collective_io_info_t, chunk_current);
+ displacements[1] = offsetof(H5D_filtered_collective_io_info_t, index_info.chunk_idx);
+ displacements[2] = offsetof(H5D_filtered_collective_io_info_t, orig_owner);
+ displacements[3] = offsetof(H5D_filtered_collective_io_info_t, new_owner);
+ displacements[4] = offsetof(H5D_filtered_collective_io_info_t, num_writers);
+ types[0] = chunk_block_type;
+ types[1] = HSIZE_AS_MPI_TYPE;
+ types[2] = MPI_INT;
+ types[3] = MPI_INT;
+ types[4] = MPI_INT;
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Type_create_struct(field_count, block_lengths, displacements, types, &struct_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code)
+ struct_type_derived = TRUE;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_create_resized(
+ struct_type, 0, sizeof(H5D_filtered_collective_io_info_t), resized_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_resized failed", mpi_code)
+ *resized_type_derived = TRUE; /* From here on, the failure path below frees resized_type */
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(resized_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+done: /* Intermediate types are always released; the output types only on failure */
+ if (struct_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&struct_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ }
+ if (chunk_block_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&chunk_block_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ }
+
+ if (ret_value < 0) {
+ if (*resized_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(resized_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ *resized_type_derived = FALSE;
+ }
+ if (*contig_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(contig_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ *contig_type_derived = FALSE;
+ }
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__mpio_get_chunk_redistribute_info_types() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D__mpio_get_chunk_alloc_info_types
+ *
+ * Purpose: Constructs MPI derived datatypes for communicating the info
+ * from a H5D_filtered_collective_io_info_t structure that is
+ * necessary for re-allocating file space during a collective
+ * write of filtered chunks.
+ *
+ * Both datatypes describe the same three fields in the same
+ * order: the chunk_current and chunk_new H5F_block_t
+ * structures, followed by the chunk index (sent as
+ * HSIZE_AS_MPI_TYPE). They differ only in the displacements
+ * used to locate those fields.
+ *
+ * The datatype returned through `contig_type` has an extent
+ * equal to the size of an H5D_chunk_alloc_info_t structure
+ * and is suitable for communicating that structure type.
+ * NOTE(review): `contig_type` is the direct result of
+ * MPI_Type_create_struct and is not explicitly resized here,
+ * so the stated extent relies on MPI deriving it from the
+ * field layout - confirm.
+ *
+ * The datatype returned through `resized_type` has an extent
+ * equal to the size of an H5D_filtered_collective_io_info_t
+ * structure. This makes it suitable for sending an array of
+ * those structures, while extracting out just the info
+ * necessary for the chunk file space reallocation operation
+ * during communication.
+ *
+ * Parameters: contig_type - OUT: committed type using field offsets
+ * within H5D_chunk_alloc_info_t
+ * contig_type_derived - OUT: set to TRUE once `contig_type`
+ * has been created; FALSE otherwise
+ * resized_type - OUT: committed type using field offsets
+ * within H5D_filtered_collective_io_info_t, resized to
+ * sizeof(H5D_filtered_collective_io_info_t)
+ * resized_type_derived - OUT: set to TRUE once
+ * `resized_type` has been created; FALSE otherwise
+ *
+ * All four pointers must be non-NULL (asserted).
+ *
+ * On failure, any output type that was already created is
+ * freed here and its `*_derived` flag is reset to FALSE. The
+ * intermediate struct and file block types are released
+ * before returning, whether or not the call succeeded.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D__mpio_get_chunk_alloc_info_types(MPI_Datatype *contig_type, hbool_t *contig_type_derived,
+ MPI_Datatype *resized_type, hbool_t *resized_type_derived)
+{
+ MPI_Datatype struct_type = MPI_DATATYPE_NULL;
+ hbool_t struct_type_derived = FALSE;
+ MPI_Datatype chunk_block_type = MPI_DATATYPE_NULL;
+ hbool_t chunk_block_type_derived = FALSE;
+ MPI_Datatype types[3];
+ MPI_Aint displacements[3];
+ int block_lengths[3];
+ int field_count;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+ HDassert(contig_type);
+ HDassert(contig_type_derived);
+ HDassert(resized_type);
+ HDassert(resized_type_derived);
+
+ *contig_type_derived = FALSE;
+ *resized_type_derived = FALSE;
+
+ /* Create struct type for the inner H5F_block_t structure */
+ if (H5F_mpi_get_file_block_type(FALSE, &chunk_block_type, &chunk_block_type_derived) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't create derived type for chunk file description")
+
+ field_count = 3;
+ HDassert(field_count == (sizeof(types) / sizeof(MPI_Datatype)));
+
+ /*
+ * Create structure type to pack both chunk H5F_block_t structures
+ * next to chunk_idx field
+ */
+ block_lengths[0] = 1;
+ block_lengths[1] = 1;
+ block_lengths[2] = 1;
+ displacements[0] = offsetof(H5D_chunk_alloc_info_t, chunk_current);
+ displacements[1] = offsetof(H5D_chunk_alloc_info_t, chunk_new);
+ displacements[2] = offsetof(H5D_chunk_alloc_info_t, chunk_idx);
+ types[0] = chunk_block_type;
+ types[1] = chunk_block_type;
+ types[2] = HSIZE_AS_MPI_TYPE;
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Type_create_struct(field_count, block_lengths, displacements, types, contig_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code)
+ *contig_type_derived = TRUE; /* From here on, the failure path below frees contig_type */
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(contig_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+ /*
+ * Create struct type to extract the chunk_current, chunk_new and chunk_idx
+ * fields from a H5D_filtered_collective_io_info_t structure. Same field
+ * order and MPI types as above; only the displacements differ.
+ */
+ block_lengths[0] = 1;
+ block_lengths[1] = 1;
+ block_lengths[2] = 1;
+ displacements[0] = offsetof(H5D_filtered_collective_io_info_t, chunk_current);
+ displacements[1] = offsetof(H5D_filtered_collective_io_info_t, chunk_new);
+ displacements[2] = offsetof(H5D_filtered_collective_io_info_t, index_info.chunk_idx);
+ types[0] = chunk_block_type;
+ types[1] = chunk_block_type;
+ types[2] = HSIZE_AS_MPI_TYPE;
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Type_create_struct(field_count, block_lengths, displacements, types, &struct_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code)
+ struct_type_derived = TRUE;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_create_resized(
+ struct_type, 0, sizeof(H5D_filtered_collective_io_info_t), resized_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_resized failed", mpi_code)
+ *resized_type_derived = TRUE; /* From here on, the failure path below frees resized_type */
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(resized_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+done: /* Intermediate types are always released; the output types only on failure */
+ if (struct_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&struct_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ }
+ if (chunk_block_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&chunk_block_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ }
+
+ if (ret_value < 0) {
+ if (*resized_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(resized_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ *resized_type_derived = FALSE;
+ }
+ if (*contig_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(contig_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ *contig_type_derived = FALSE;
+ }
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__mpio_get_chunk_alloc_info_types() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D__mpio_get_chunk_insert_info_types
+ *
+ * Purpose: Constructs MPI derived datatypes for communicating the
+ * information necessary when reinserting chunks into a
+ * dataset's chunk index. This includes the chunk's new offset
+ * and size (H5F_block_t) and the inner `index_info` structure
+ * of a H5D_filtered_collective_io_info_t structure.
+ *
+ * Both datatypes describe the same four fields in the same
+ * order: an H5F_block_t, then the chunk_idx (sent as
+ * HSIZE_AS_MPI_TYPE), filter_mask (MPI_UNSIGNED) and
+ * need_insert (MPI_C_BOOL) members of `index_info`. They
+ * differ only in the displacements used to locate those
+ * fields and in the extent they are resized to.
+ *
+ * The datatype returned through `contig_type` has an extent
+ * equal to the size of an H5D_chunk_insert_info_t structure
+ * and is suitable for communicating that structure type.
+ * NOTE(review): the extent is actually computed as
+ * sizeof(H5F_block_t) + sizeof(H5D_chunk_index_info_t); this
+ * equals sizeof(H5D_chunk_insert_info_t) only if that
+ * structure holds exactly those two members with no extra
+ * padding - confirm against its definition.
+ *
+ * The datatype returned through `resized_type` has an extent
+ * equal to the size of the encompassing
+ * H5D_filtered_collective_io_info_t structure. This makes it
+ * suitable for sending an array of
+ * H5D_filtered_collective_io_info_t structures, while
+ * extracting out just the information needed during
+ * communication. The H5F_block_t it reads is `chunk_new`.
+ *
+ * Parameters: contig_type - OUT: committed, resized type using field
+ * offsets within H5D_chunk_insert_info_t
+ * contig_type_derived - OUT: set to TRUE once `contig_type`
+ * has been created; FALSE otherwise
+ * resized_type - OUT: committed, resized type using field
+ * offsets within H5D_filtered_collective_io_info_t
+ * resized_type_derived - OUT: set to TRUE once
+ * `resized_type` has been created; FALSE otherwise
+ *
+ * All four pointers must be non-NULL (asserted).
+ *
+ * On failure, any output type that was already created is
+ * freed here and its `*_derived` flag is reset to FALSE. The
+ * intermediate struct and file block types are released
+ * before returning, whether or not the call succeeded.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D__mpio_get_chunk_insert_info_types(MPI_Datatype *contig_type, hbool_t *contig_type_derived,
+ MPI_Datatype *resized_type, hbool_t *resized_type_derived)
+{
+ MPI_Datatype struct_type = MPI_DATATYPE_NULL;
+ hbool_t struct_type_derived = FALSE;
+ MPI_Datatype chunk_block_type = MPI_DATATYPE_NULL;
+ hbool_t chunk_block_type_derived = FALSE;
+ MPI_Aint contig_type_extent;
+ MPI_Datatype types[4];
+ MPI_Aint displacements[4];
+ int block_lengths[4];
+ int field_count;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+ HDassert(contig_type);
+ HDassert(contig_type_derived);
+ HDassert(resized_type);
+ HDassert(resized_type_derived);
+
+ *contig_type_derived = FALSE;
+ *resized_type_derived = FALSE;
+
+ /* Create struct type for an H5F_block_t structure */
+ if (H5F_mpi_get_file_block_type(FALSE, &chunk_block_type, &chunk_block_type_derived) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't create derived type for chunk file description")
+
+ field_count = 4;
+ HDassert(field_count == (sizeof(types) / sizeof(MPI_Datatype)));
+
+ /*
+ * Create struct type to pack information into memory as follows:
+ *
+ * Chunk's new Offset/Size (H5F_block_t) ->
+ * Chunk Index Info (H5D_chunk_index_info_t)
+ */
+ block_lengths[0] = 1;
+ block_lengths[1] = 1;
+ block_lengths[2] = 1;
+ block_lengths[3] = 1;
+ displacements[0] = offsetof(H5D_chunk_insert_info_t, chunk_block);
+ displacements[1] = offsetof(H5D_chunk_insert_info_t, index_info.chunk_idx);
+ displacements[2] = offsetof(H5D_chunk_insert_info_t, index_info.filter_mask);
+ displacements[3] = offsetof(H5D_chunk_insert_info_t, index_info.need_insert);
+ types[0] = chunk_block_type;
+ types[1] = HSIZE_AS_MPI_TYPE;
+ types[2] = MPI_UNSIGNED;
+ types[3] = MPI_C_BOOL; /* NOTE(review): presumes hbool_t is layout-compatible with C bool - confirm */
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Type_create_struct(field_count, block_lengths, displacements, types, &struct_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code)
+ struct_type_derived = TRUE;
+
+ contig_type_extent = (MPI_Aint)(sizeof(H5F_block_t) + sizeof(H5D_chunk_index_info_t));
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_create_resized(struct_type, 0, contig_type_extent, contig_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_resized failed", mpi_code)
+ *contig_type_derived = TRUE; /* From here on, the failure path below frees contig_type */
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(contig_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+ struct_type_derived = FALSE; /* Cleared before the free so `done` cannot free struct_type a second time */
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&struct_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+
+ /*
+ * Create struct type to correctly extract all needed
+ * information from a H5D_filtered_collective_io_info_t
+ * structure. `block_lengths` and `types` are reused
+ * unchanged from above; only the displacements are
+ * overwritten.
+ */
+ displacements[0] = offsetof(H5D_filtered_collective_io_info_t, chunk_new);
+ displacements[1] = offsetof(H5D_filtered_collective_io_info_t, index_info.chunk_idx);
+ displacements[2] = offsetof(H5D_filtered_collective_io_info_t, index_info.filter_mask);
+ displacements[3] = offsetof(H5D_filtered_collective_io_info_t, index_info.need_insert);
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Type_create_struct(field_count, block_lengths, displacements, types, &struct_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code)
+ struct_type_derived = TRUE;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_create_resized(
+ struct_type, 0, sizeof(H5D_filtered_collective_io_info_t), resized_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_resized failed", mpi_code)
+ *resized_type_derived = TRUE; /* From here on, the failure path below frees resized_type */
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(resized_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+done: /* Intermediate types are always released; the output types only on failure */
+ if (struct_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&struct_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ }
+ if (chunk_block_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&chunk_block_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ }
+
+ if (ret_value < 0) {
+ if (*resized_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(resized_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ *resized_type_derived = FALSE;
+ }
+ if (*contig_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(contig_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ *contig_type_derived = FALSE;
+ }
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__mpio_get_chunk_insert_info_types() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5D__mpio_collective_filtered_io_type
+ *
+ * Purpose: Constructs a MPI derived datatype for both the memory and
+ * the file for a collective I/O operation on filtered chunks.
+ * The datatype contains the chunk offsets and lengths in the
+ * file and the locations of the chunk data buffers to read
+ * into/write from.
+ *
+ * For a write, every entry in `chunk_list` is included and
+ * its `chunk_new` block supplies the file offset and length.
+ * For a read, only entries with `need_read` set are included
+ * and their `chunk_current` block is used instead.
+ *
+ * - No chunk selected: both types are left as MPI_BYTE and
+ * both `*_derived` flags stay FALSE.
+ * - One chunk selected: a single committed contiguous type
+ * of the chunk's length is returned through both
+ * `new_mem_type` and `new_file_type`; only
+ * `file_type_derived` is set so the type is freed once.
+ * - Several chunks selected: two committed hindexed types
+ * are returned. Memory displacements are relative to the
+ * first selected chunk's data buffer and file
+ * displacements are relative to the first selected
+ * chunk's file offset.
+ *
+ * Parameters: chunk_list - IN: chunk entries; may be NULL only when
+ * `num_entries` is 0. Debug builds assert that the selected
+ * entries are in strictly ascending file offset order.
+ * num_entries - IN: number of entries in `chunk_list`
+ * op_type - IN: H5D_IO_OP_READ or H5D_IO_OP_WRITE
+ * new_mem_type / mem_type_derived - OUT: memory type and
+ * whether it is a derived type
+ * new_file_type / file_type_derived - OUT: file type and
+ * whether it is a derived type
+ *
+ * On failure, any derived output type already created is
+ * freed here and its `*_derived` flag is reset to FALSE.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D__mpio_collective_filtered_io_type(H5D_filtered_collective_io_info_t *chunk_list, size_t num_entries,
+ H5D_io_op_type_t op_type, MPI_Datatype *new_mem_type,
+ hbool_t *mem_type_derived, MPI_Datatype *new_file_type,
+ hbool_t *file_type_derived)
+{
+ MPI_Aint *io_buf_array = NULL; /* Relative displacements of filtered chunk data buffers */
+ MPI_Aint *file_offset_array = NULL; /* Chunk offsets in the file */
+ int * length_array = NULL; /* Filtered Chunk lengths */
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+ HDassert(chunk_list || 0 == num_entries);
+ HDassert(new_mem_type);
+ HDassert(mem_type_derived);
+ HDassert(new_file_type);
+ HDassert(file_type_derived);
+
+ /* Defaults returned when this rank has no chunks to read or write */
+ *mem_type_derived = FALSE;
+ *file_type_derived = FALSE;
+ *new_mem_type = MPI_BYTE;
+ *new_file_type = MPI_BYTE;
+
+ if (num_entries > 0) {
+ H5F_block_t *chunk_block;
+ size_t last_valid_idx = 0;
+ size_t i;
+ int chunk_count;
+
+ /*
+ * Determine number of chunks for I/O operation and
+ * setup for derived datatype creation if I/O operation
+ * includes multiple chunks
+ */
+ if (num_entries == 1) {
+ /* Set last valid index to 0 for contiguous datatype creation */
+ last_valid_idx = 0;
+
+ if (op_type == H5D_IO_OP_WRITE)
+ chunk_count = 1;
+ else
+ chunk_count = chunk_list[0].need_read ? 1 : 0;
+ }
+ else {
+ MPI_Aint chunk_buf;
+ MPI_Aint base_buf;
+ haddr_t base_offset = HADDR_UNDEF;
+
+ H5_CHECK_OVERFLOW(num_entries, size_t, int);
+
+ /* Allocate arrays */
+ if (NULL == (length_array = H5MM_malloc((size_t)num_entries * sizeof(int))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "memory allocation failed for filtered collective I/O length array")
+ if (NULL == (io_buf_array = H5MM_malloc((size_t)num_entries * sizeof(MPI_Aint))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "memory allocation failed for filtered collective I/O buf length array")
+ if (NULL == (file_offset_array = H5MM_malloc((size_t)num_entries * sizeof(MPI_Aint))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "memory allocation failed for filtered collective I/O offset array")
+
+ /*
+ * If doing a write, we can set the base chunk offset
+ * and base chunk data buffer right away.
+ *
+ * If doing a read, some chunks may be skipped over
+ * for reading if they aren't yet allocated in the
+ * file. Therefore, we have to find the first chunk
+ * actually being read in order to set the base chunk
+ * offset and base chunk data buffer.
+ */
+ if (op_type == H5D_IO_OP_WRITE) {
+#if MPI_VERSION >= 3
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_address(chunk_list[0].buf, &base_buf)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_address failed", mpi_code)
+#else
+ base_buf = (MPI_Aint)chunk_list[0].buf;
+#endif
+
+ base_offset = chunk_list[0].chunk_new.offset;
+ }
+
+ for (i = 0, chunk_count = 0; i < num_entries; i++) {
+ if (op_type == H5D_IO_OP_READ) {
+ /*
+ * If this chunk isn't being read, don't add it
+ * to the MPI type we're building up for I/O
+ */
+ if (!chunk_list[i].need_read)
+ continue;
+
+ /*
+ * If this chunk is being read, go ahead and
+ * set the base chunk offset and base chunk
+ * data buffer if we haven't already
+ */
+ if (!H5F_addr_defined(base_offset)) {
+#if MPI_VERSION >= 3
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_address(chunk_list[i].buf, &base_buf)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_address failed", mpi_code)
+#else
+ base_buf = (MPI_Aint)chunk_list[i].buf;
+#endif
+
+ base_offset = chunk_list[i].chunk_current.offset;
+ }
+ }
+
+ /* Set convenience pointer for current chunk block */
+ chunk_block =
+ (op_type == H5D_IO_OP_READ) ? &chunk_list[i].chunk_current : &chunk_list[i].chunk_new;
+
+ /*
+ * Set the current chunk entry's offset in the file, relative to
+ * the first chunk entry
+ */
+ HDassert(H5F_addr_defined(chunk_block->offset));
+ file_offset_array[chunk_count] = (MPI_Aint)(chunk_block->offset - base_offset);
+
+ /*
+ * Ensure the chunk list is sorted in ascending ordering of
+ * offset in the file
+ */
+ if (chunk_count)
+ HDassert(file_offset_array[chunk_count] > file_offset_array[chunk_count - 1]);
+
+ /* Set the current chunk entry's size for the I/O operation */
+ H5_CHECK_OVERFLOW(chunk_block->length, hsize_t, int);
+ length_array[chunk_count] = (int)chunk_block->length;
+
+ /*
+ * Set the displacement of the chunk entry's chunk data buffer,
+ * relative to the first entry's data buffer
+ *
+ * MPI_Aint_diff was introduced in MPI 3.1, so it is used for
+ * MPI 3.1 and every later version. The version test compares
+ * the major version first: testing only
+ * "MPI_VERSION >= 3 && MPI_SUBVERSION >= 1" would wrongly
+ * reject x.0 releases newer than 3.1 (e.g. MPI 4.0).
+ */
+#if (MPI_VERSION > 3) || (MPI_VERSION == 3 && MPI_SUBVERSION >= 1)
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_address(chunk_list[i].buf, &chunk_buf)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_address failed", mpi_code)
+
+ io_buf_array[chunk_count] = MPI_Aint_diff(chunk_buf, base_buf);
+#else
+ chunk_buf = (MPI_Aint)chunk_list[i].buf;
+ io_buf_array[chunk_count] = chunk_buf - base_buf;
+#endif
+
+ /*
+ * Set last valid index in case only a single chunk will
+ * be involved in the I/O operation
+ */
+ last_valid_idx = i;
+
+ chunk_count++;
+ } /* end for */
+ }
+
+ /*
+ * Create derived datatypes for the chunk list if this
+ * rank has any chunks to work on
+ */
+ if (chunk_count > 0) {
+ if (chunk_count == 1) {
+ int chunk_len;
+
+ /* Single chunk - use a contiguous type for both memory and file */
+
+ /* Ensure that we can cast chunk size to an int for MPI */
+ chunk_block = (op_type == H5D_IO_OP_READ) ? &chunk_list[last_valid_idx].chunk_current
+ : &chunk_list[last_valid_idx].chunk_new;
+ H5_CHECKED_ASSIGN(chunk_len, int, chunk_block->length, hsize_t);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_contiguous(chunk_len, MPI_BYTE, new_file_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code)
+ *new_mem_type = *new_file_type;
+
+ /*
+ * Since we use the same datatype for both memory and file, only
+ * mark the file type as derived so the caller doesn't try to
+ * free the same type twice
+ */
+ *mem_type_derived = FALSE;
+ *file_type_derived = TRUE;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(new_file_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+ }
+ else {
+ HDassert(file_offset_array);
+ HDassert(length_array);
+ HDassert(io_buf_array);
+
+ /* Multiple chunks - use an hindexed type for both memory and file */
+
+ /* Create memory MPI type */
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed(
+ chunk_count, length_array, io_buf_array, MPI_BYTE, new_mem_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
+ *mem_type_derived = TRUE;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(new_mem_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+ /* Create file MPI type */
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Type_create_hindexed(chunk_count, length_array, file_offset_array,
+ MPI_BYTE, new_file_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
+ *file_type_derived = TRUE;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(new_file_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+ }
+ }
+ } /* end if */
done:
- if (chunk_entry->async_info.receive_buffer_array)
- H5MM_free(chunk_entry->async_info.receive_buffer_array);
- if (chunk_entry->async_info.receive_requests_array)
- H5MM_free(chunk_entry->async_info.receive_requests_array);
- if (tmp_gath_buf)
- H5MM_free(tmp_gath_buf);
- if (file_iter_init && H5S_SELECT_ITER_RELEASE(file_iter) < 0)
- HDONE_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release selection iterator")
- if (file_iter)
- H5MM_free(file_iter);
- if (mem_iter_init && H5S_SELECT_ITER_RELEASE(mem_iter) < 0)
- HDONE_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "couldn't release selection iterator")
- if (mem_iter)
- H5MM_free(mem_iter);
- if (dataspace)
- if (H5S_close(dataspace) < 0)
- HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "can't close dataspace")
+ /* The scratch arrays are only needed while the MPI types are being built */
+ if (file_offset_array)
+ H5MM_free(file_offset_array);
+ if (io_buf_array)
+ H5MM_free(io_buf_array);
+ if (length_array)
+ H5MM_free(length_array);
+
+ /* On failure, hand nothing derived back to the caller */
+ if (ret_value < 0) {
+ if (*file_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(new_file_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ *file_type_derived = FALSE;
+ }
+ if (*mem_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(new_mem_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ *mem_type_derived = FALSE;
+ }
+ }
FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D__filtered_collective_chunk_entry_io() */
+} /* end H5D__mpio_collective_filtered_io_type() */
+
+#ifdef H5Dmpio_DEBUG
+
+/*-------------------------------------------------------------------------
+ * Function: H5D__mpio_dump_collective_filtered_chunk_list
+ *
+ * Purpose: Debugging aid that prints every entry of a filtered
+ * collective chunk list through the H5D_MPIO_DEBUG and
+ * H5D_MPIO_DEBUG_VA macros. Only compiled when
+ * H5Dmpio_DEBUG is defined.
+ *
+ * For each entry this prints the current and new file blocks,
+ * the chunk index info, and the remaining bookkeeping fields.
+ * At most the first four scaled coordinates are printed;
+ * positions at or beyond the chunk's dataspace rank are
+ * shown as 0.
+ *
+ * Parameters: chunk_list - IN: entries to print; each entry's
+ * `chunk_info` must be non-NULL (asserted)
+ * chunk_list_num_entries - IN: number of entries
+ * mpi_rank - IN: rank passed through to the debug macros
+ *
+ * Return: Always returns SUCCEED
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5D__mpio_dump_collective_filtered_chunk_list(H5D_filtered_collective_io_info_t *chunk_list,
+ size_t chunk_list_num_entries, int mpi_rank)
+{
+ H5D_filtered_collective_io_info_t *chunk_entry;
+ size_t i;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC_NOERR
+
+ H5D_MPIO_DEBUG(mpi_rank, "CHUNK LIST: [");
+ for (i = 0; i < chunk_list_num_entries; i++) {
+ unsigned chunk_rank;
+
+ chunk_entry = &chunk_list[i];
+
+ HDassert(chunk_entry->chunk_info);
+ chunk_rank = (unsigned)H5S_GET_EXTENT_NDIMS(chunk_entry->chunk_info->fspace);
+
+ H5D_MPIO_DEBUG(mpi_rank, " {");
+ H5D_MPIO_DEBUG_VA(mpi_rank, " - Entry %zu -", i);
+
+ /* Offsets are haddr_t and lengths are hsize_t, so each gets its own format macro */
+ H5D_MPIO_DEBUG(mpi_rank, " - Chunk Fspace Info -");
+ H5D_MPIO_DEBUG_VA(mpi_rank,
+ " Chunk Current Info: { Offset: %" PRIuHADDR ", Length: %" PRIuHSIZE " }",
+ chunk_entry->chunk_current.offset, chunk_entry->chunk_current.length);
+ H5D_MPIO_DEBUG_VA(mpi_rank, " Chunk New Info: { Offset: %" PRIuHADDR ", Length: %" PRIuHSIZE " }",
+ chunk_entry->chunk_new.offset, chunk_entry->chunk_new.length);
+
+ /* Guard each coordinate by the chunk's rank so `scaled` is not read past its used entries */
+ H5D_MPIO_DEBUG(mpi_rank, " - Chunk Insert Info -");
+ H5D_MPIO_DEBUG_VA(mpi_rank,
+ " Chunk Scaled Coords (4-d): { %" PRIuHSIZE ", %" PRIuHSIZE ", %" PRIuHSIZE
+ ", %" PRIuHSIZE " }",
+ chunk_rank < 1 ? 0 : chunk_entry->chunk_info->scaled[0],
+ chunk_rank < 2 ? 0 : chunk_entry->chunk_info->scaled[1],
+ chunk_rank < 3 ? 0 : chunk_entry->chunk_info->scaled[2],
+ chunk_rank < 4 ? 0 : chunk_entry->chunk_info->scaled[3]);
+ H5D_MPIO_DEBUG_VA(mpi_rank, " Chunk Index: %" PRIuHSIZE, chunk_entry->index_info.chunk_idx);
+ H5D_MPIO_DEBUG_VA(mpi_rank, " Filter Mask: %u", chunk_entry->index_info.filter_mask);
+ H5D_MPIO_DEBUG_VA(mpi_rank, " Need Insert: %s",
+ chunk_entry->index_info.need_insert ? "YES" : "NO");
+
+ H5D_MPIO_DEBUG(mpi_rank, " - Other Info -");
+ H5D_MPIO_DEBUG_VA(mpi_rank, " Chunk Info Ptr: %p", (void *)chunk_entry->chunk_info);
+ H5D_MPIO_DEBUG_VA(mpi_rank, " Need Read: %s", chunk_entry->need_read ? "YES" : "NO");
+ H5D_MPIO_DEBUG_VA(mpi_rank, " Chunk I/O Size: %zu", chunk_entry->io_size);
+ H5D_MPIO_DEBUG_VA(mpi_rank, " Chunk Buffer Size: %zu", chunk_entry->chunk_buf_size);
+ H5D_MPIO_DEBUG_VA(mpi_rank, " Original Owner: %d", chunk_entry->orig_owner);
+ H5D_MPIO_DEBUG_VA(mpi_rank, " New Owner: %d", chunk_entry->new_owner);
+ H5D_MPIO_DEBUG_VA(mpi_rank, " # of Writers: %d", chunk_entry->num_writers);
+ H5D_MPIO_DEBUG_VA(mpi_rank, " Chunk Data Buffer Ptr: %p", (void *)chunk_entry->buf);
+
+ H5D_MPIO_DEBUG(mpi_rank, " }");
+ }
+ H5D_MPIO_DEBUG(mpi_rank, "]");
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D__mpio_dump_collective_filtered_chunk_list() */
+
+#endif
+
#endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5Dpkg.h b/src/H5Dpkg.h
index 49c95a5..a424929 100644
--- a/src/H5Dpkg.h
+++ b/src/H5Dpkg.h
@@ -559,6 +559,7 @@ H5_DLL herr_t H5D__alloc_storage(const H5D_io_info_t *io_info, H5D_time_alloc_t
hbool_t full_overwrite, hsize_t old_dim[]);
H5_DLL herr_t H5D__get_storage_size(const H5D_t *dset, hsize_t *storage_size);
H5_DLL herr_t H5D__get_chunk_storage_size(H5D_t *dset, const hsize_t *offset, hsize_t *storage_size);
+H5_DLL herr_t H5D__chunk_index_empty(const H5D_t *dset, hbool_t *empty);
H5_DLL herr_t H5D__get_num_chunks(const H5D_t *dset, const H5S_t *space, hsize_t *nchunks);
H5_DLL herr_t H5D__get_chunk_info(const H5D_t *dset, const H5S_t *space, hsize_t chk_idx, hsize_t *coord,
unsigned *filter_mask, haddr_t *offset, hsize_t *size);
@@ -591,6 +592,10 @@ H5_DLL herr_t H5D__select_read(const H5D_io_info_t *io_info, const H5D_type_info
H5_DLL herr_t H5D__select_write(const H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
hsize_t nelmts, H5S_t *file_space, H5S_t *mem_space);
+/* Functions that perform direct copying between memory buffers */
+H5_DLL herr_t H5D_select_io_mem(void *dst_buf, const H5S_t *dst_space, const void *src_buf,
+ const H5S_t *src_space, size_t elmt_size, size_t nelmts);
+
/* Functions that perform scatter-gather serial I/O operations */
H5_DLL herr_t H5D__scatter_mem(const void *_tscat_buf, H5S_sel_iter_t *iter, size_t nelmts, void *_buf);
H5_DLL size_t H5D__gather_mem(const void *_buf, H5S_sel_iter_t *iter, size_t nelmts,
@@ -635,7 +640,13 @@ H5_DLL herr_t H5D__chunk_allocate(const H5D_io_info_t *io_info, hbool_t full_ov
const hsize_t old_dim[]);
H5_DLL herr_t H5D__chunk_file_alloc(const H5D_chk_idx_info_t *idx_info, const H5F_block_t *old_chunk,
H5F_block_t *new_chunk, hbool_t *need_insert, const hsize_t *scaled);
+H5_DLL void * H5D__chunk_mem_alloc(size_t size, const H5O_pline_t *pline);
+H5_DLL void H5D__chunk_mem_free(void *chk, const void *_pline);
+H5_DLL void * H5D__chunk_mem_xfree(void *chk, const void *pline);
+H5_DLL void * H5D__chunk_mem_realloc(void *chk, size_t size, const H5O_pline_t *pline);
H5_DLL herr_t H5D__chunk_update_old_edge_chunks(H5D_t *dset, hsize_t old_dim[]);
+H5_DLL hbool_t H5D__chunk_is_partial_edge_chunk(unsigned dset_ndims, const uint32_t *chunk_dims,
+ const hsize_t *chunk_scaled, const hsize_t *dset_dims);
H5_DLL herr_t H5D__chunk_prune_by_extent(H5D_t *dset, const hsize_t *old_dim);
H5_DLL herr_t H5D__chunk_set_sizes(H5D_t *dset);
#ifdef H5_HAVE_PARALLEL
@@ -694,11 +705,11 @@ H5_DLL herr_t H5D__fill_term(H5D_fill_buf_info_t *fb_info);
#ifdef H5_HAVE_PARALLEL
-#ifdef H5S_DEBUG
+#ifdef H5D_DEBUG
#ifndef H5Dmpio_DEBUG
#define H5Dmpio_DEBUG
#endif /*H5Dmpio_DEBUG*/
-#endif /*H5S_DEBUG*/
+#endif /*H5D_DEBUG*/
/* MPI-IO function to read, it will select either regular or irregular read */
H5_DLL herr_t H5D__mpio_select_read(const H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
hsize_t nelmts, H5S_t *file_space, H5S_t *mem_space);
@@ -727,6 +738,8 @@ H5_DLL herr_t H5D__chunk_collective_write(H5D_io_info_t *io_info, const H5D_type
* memory and the file */
H5_DLL htri_t H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space,
const H5S_t *mem_space, const H5D_type_info_t *type_info);
+H5_DLL herr_t H5D__mpio_get_no_coll_cause_strings(char *local_cause, size_t local_cause_len,
+ char *global_cause, size_t global_cause_len);
#endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5Dselect.c b/src/H5Dselect.c
index e64d657..f464ca5 100644
--- a/src/H5Dselect.c
+++ b/src/H5Dselect.c
@@ -105,6 +105,9 @@ H5D__select_io(const H5D_io_info_t *io_info, size_t elmt_size, size_t nelmts, H5
HDassert(io_info->store);
HDassert(io_info->u.rbuf);
+ if (elmt_size == 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADVALUE, FAIL, "invalid elmt_size of 0")
+
/* Check for only one element in selection */
if (nelmts == 1) {
hsize_t single_mem_off; /* Offset in memory */
@@ -226,8 +229,6 @@ H5D__select_io(const H5D_io_info_t *io_info, size_t elmt_size, size_t nelmts, H5
/* Decrement number of elements left to process */
HDassert(((size_t)tmp_file_len % elmt_size) == 0);
- if (elmt_size == 0)
- HGOTO_ERROR(H5E_DATASPACE, H5E_BADVALUE, FAIL, "Resulted in division by zero")
nelmts -= ((size_t)tmp_file_len / elmt_size);
} /* end while */
} /* end else */
@@ -257,6 +258,188 @@ done:
} /* end H5D__select_io() */
/*-------------------------------------------------------------------------
+ * Function: H5D_select_io_mem
+ *
+ * Purpose: Perform memory copies directly between two memory buffers
+ * according to the selections in the `dst_space` and
+ * `src_space` dataspaces.
+ *
+ * Note: This routine is [basically] the same as H5D__select_io,
+ * with the only difference being that the readvv/writevv
+ * calls are exchanged for H5VM_memcpyvv calls. Changes should
+ * be made to both routines.
+ *
+ * Parameters:
+ * dst_buf - buffer that receives the copied data (asserted non-NULL)
+ * dst_space - dataspace whose selection gives the offsets written in
+ * `dst_buf` (asserted non-NULL)
+ * src_buf - buffer the data is copied from (asserted non-NULL)
+ * src_space - dataspace whose selection gives the offsets read from
+ * `src_buf` (asserted non-NULL)
+ * elmt_size - size of one element in bytes; a value of 0 is rejected
+ * with an error because it is used as a divisor
+ * nelmts - number of elements to copy
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5D_select_io_mem(void *dst_buf, const H5S_t *dst_space, const void *src_buf, const H5S_t *src_space,
+ size_t elmt_size, size_t nelmts)
+{
+ H5S_sel_iter_t *dst_sel_iter = NULL; /* Destination dataspace iteration info */
+ H5S_sel_iter_t *src_sel_iter = NULL; /* Source dataspace iteration info */
+ hbool_t dst_sel_iter_init = FALSE; /* Destination dataspace selection iterator initialized? */
+ hbool_t src_sel_iter_init = FALSE; /* Source dataspace selection iterator initialized? */
+ hsize_t * dst_off = NULL; /* Pointer to sequence offsets in destination buffer */
+ hsize_t * src_off = NULL; /* Pointer to sequence offsets in source buffer */
+ size_t * dst_len = NULL; /* Pointer to sequence lengths in destination buffer */
+ size_t * src_len = NULL; /* Pointer to sequence lengths in source buffer */
+ size_t curr_dst_seq; /* Current destination buffer sequence to operate on */
+ size_t curr_src_seq; /* Current source buffer sequence to operate on */
+ size_t dst_nseq; /* Number of sequences generated for destination buffer */
+ size_t src_nseq; /* Number of sequences generated for source buffer */
+ size_t dxpl_vec_size; /* Vector length from API context's DXPL */
+ size_t vec_size; /* Vector length */
+ ssize_t bytes_copied; /* Result of the most recent H5VM_memcpyvv call (negative on failure) */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ HDassert(dst_buf);
+ HDassert(dst_space);
+ HDassert(src_buf);
+ HDassert(src_space);
+
+ /* Reject a zero element size up front: it is used as a divisor below */
+ if (elmt_size == 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADVALUE, FAIL, "invalid elmt_size of 0")
+
+ /* Check for only one element in selection */
+ /* (this path uses stack variables only: no iterators or vector arrays are allocated) */
+ if (nelmts == 1) {
+ hsize_t single_dst_off; /* Offset in dst_space */
+ hsize_t single_src_off; /* Offset in src_space */
+ size_t single_dst_len; /* Length in dst_space */
+ size_t single_src_len; /* Length in src_space */
+
+ /* Get offset of first element in selections */
+ if (H5S_SELECT_OFFSET(dst_space, &single_dst_off) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't retrieve destination selection offset")
+ if (H5S_SELECT_OFFSET(src_space, &single_src_off) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't retrieve source selection offset")
+
+ /* Set up necessary information for I/O operation */
+ /* (one sequence per buffer; offsets are scaled from elements to bytes) */
+ dst_nseq = src_nseq = 1;
+ curr_dst_seq = curr_src_seq = 0;
+ single_dst_off *= elmt_size;
+ single_src_off *= elmt_size;
+ single_dst_len = single_src_len = elmt_size;
+
+ /* Perform vectorized memcpy from src_buf to dst_buf */
+ if ((bytes_copied =
+ H5VM_memcpyvv(dst_buf, dst_nseq, &curr_dst_seq, &single_dst_len, &single_dst_off, src_buf,
+ src_nseq, &curr_src_seq, &single_src_len, &single_src_off)) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "vectorized memcpy failed")
+
+ HDassert(((size_t)bytes_copied % elmt_size) == 0);
+ }
+ else {
+ unsigned sel_iter_flags = H5S_SEL_ITER_GET_SEQ_LIST_SORTED | H5S_SEL_ITER_SHARE_WITH_DATASPACE;
+ size_t dst_nelem; /* Number of elements used in destination buffer sequences */
+ size_t src_nelem; /* Number of elements used in source buffer sequences */
+
+ /* Get info from API context */
+ if (H5CX_get_vec_size(&dxpl_vec_size) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "can't retrieve I/O vector size")
+
+ /* Allocate the vector I/O arrays */
+ /* (sized to the larger of the DXPL vector size and H5D_IO_VECTOR_SIZE) */
+ if (dxpl_vec_size > H5D_IO_VECTOR_SIZE)
+ vec_size = dxpl_vec_size;
+ else
+ vec_size = H5D_IO_VECTOR_SIZE;
+
+ if (NULL == (dst_len = H5FL_SEQ_MALLOC(size_t, vec_size)))
+ HGOTO_ERROR(H5E_IO, H5E_CANTALLOC, FAIL, "can't allocate I/O length vector array")
+ if (NULL == (dst_off = H5FL_SEQ_MALLOC(hsize_t, vec_size)))
+ HGOTO_ERROR(H5E_IO, H5E_CANTALLOC, FAIL, "can't allocate I/O offset vector array")
+ if (NULL == (src_len = H5FL_SEQ_MALLOC(size_t, vec_size)))
+ HGOTO_ERROR(H5E_IO, H5E_CANTALLOC, FAIL, "can't allocate I/O length vector array")
+ if (NULL == (src_off = H5FL_SEQ_MALLOC(hsize_t, vec_size)))
+ HGOTO_ERROR(H5E_IO, H5E_CANTALLOC, FAIL, "can't allocate I/O offset vector array")
+
+ /* Allocate the dataspace selection iterators */
+ if (NULL == (dst_sel_iter = H5FL_MALLOC(H5S_sel_iter_t)))
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTALLOC, FAIL, "can't allocate destination selection iterator")
+ if (NULL == (src_sel_iter = H5FL_MALLOC(H5S_sel_iter_t)))
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTALLOC, FAIL, "can't allocate source selection iterator")
+
+ /* Initialize destination selection iterator */
+ if (H5S_select_iter_init(dst_sel_iter, dst_space, elmt_size, sel_iter_flags) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTINIT, FAIL, "unable to initialize selection iterator")
+ dst_sel_iter_init = TRUE; /* Destination selection iteration info has been initialized */
+
+ /* Initialize source selection iterator */
+ /* (unlike the destination iterator, H5S_SEL_ITER_GET_SEQ_LIST_SORTED is not passed here) */
+ if (H5S_select_iter_init(src_sel_iter, src_space, elmt_size, H5S_SEL_ITER_SHARE_WITH_DATASPACE) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTINIT, FAIL, "unable to initialize selection iterator")
+ src_sel_iter_init = TRUE; /* Source selection iteration info has been initialized */
+
+ /* Initialize sequence counts */
+ /* (both "current >= count" checks below are true on the first pass, forcing an initial fetch) */
+ curr_dst_seq = curr_src_seq = 0;
+ dst_nseq = src_nseq = 0;
+
+ /* Loop until all elements are processed */
+ while (nelmts > 0) {
+ /* Check if more destination buffer sequences are needed */
+ if (curr_dst_seq >= dst_nseq) {
+ /* Get sequences for destination selection */
+ if (H5S_SELECT_ITER_GET_SEQ_LIST(dst_sel_iter, vec_size, nelmts, &dst_nseq, &dst_nelem,
+ dst_off, dst_len) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "sequence length generation failed")
+
+ /* Start at the beginning of the sequences again */
+ curr_dst_seq = 0;
+ }
+
+ /* Check if more source buffer sequences are needed */
+ if (curr_src_seq >= src_nseq) {
+ /* Get sequences for source selection */
+ if (H5S_SELECT_ITER_GET_SEQ_LIST(src_sel_iter, vec_size, nelmts, &src_nseq, &src_nelem,
+ src_off, src_len) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "sequence length generation failed")
+
+ /* Start at the beginning of the sequences again */
+ curr_src_seq = 0;
+ } /* end if */
+
+ /* Perform vectorized memcpy from src_buf to dst_buf */
+ /* (curr_dst_seq/curr_src_seq are passed by address; presumably H5VM_memcpyvv
+ * advances them past the sequences it consumed - confirm against H5VM) */
+ if ((bytes_copied = H5VM_memcpyvv(dst_buf, dst_nseq, &curr_dst_seq, dst_len, dst_off, src_buf,
+ src_nseq, &curr_src_seq, src_len, src_off)) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "vectorized memcpy failed")
+
+ /* Decrement number of elements left to process */
+ HDassert(((size_t)bytes_copied % elmt_size) == 0);
+ nelmts -= ((size_t)bytes_copied / elmt_size);
+ }
+ }
+
+done:
+ /* Release selection iterators */
+ /* (the *_init flags separate "allocated" from "initialized": an iterator that was
+ * allocated but never initialized is freed without being released) */
+ if (src_sel_iter) {
+ if (src_sel_iter_init && H5S_SELECT_ITER_RELEASE(src_sel_iter) < 0)
+ HDONE_ERROR(H5E_DATASPACE, H5E_CANTRELEASE, FAIL, "unable to release selection iterator")
+
+ src_sel_iter = H5FL_FREE(H5S_sel_iter_t, src_sel_iter);
+ }
+ if (dst_sel_iter) {
+ if (dst_sel_iter_init && H5S_SELECT_ITER_RELEASE(dst_sel_iter) < 0)
+ HDONE_ERROR(H5E_DATASPACE, H5E_CANTRELEASE, FAIL, "unable to release selection iterator")
+
+ dst_sel_iter = H5FL_FREE(H5S_sel_iter_t, dst_sel_iter);
+ }
+
+ /* Release vector arrays, if allocated */
+ if (src_off)
+ src_off = H5FL_SEQ_FREE(hsize_t, src_off);
+ if (src_len)
+ src_len = H5FL_SEQ_FREE(size_t, src_len);
+ if (dst_off)
+ dst_off = H5FL_SEQ_FREE(hsize_t, dst_off);
+ if (dst_len)
+ dst_len = H5FL_SEQ_FREE(size_t, dst_len);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_select_io_mem() */
+
+/*-------------------------------------------------------------------------
* Function: H5D__select_read
*
* Purpose: Reads directly from file into application memory.
diff --git a/src/H5ES.c b/src/H5ES.c
index 9abaa54..ad42000 100644
--- a/src/H5ES.c
+++ b/src/H5ES.c
@@ -269,7 +269,7 @@ H5ESget_requests(hid_t es_id, H5_iter_order_t order, hid_t *connector_ids, void
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_API(FAIL)
- H5TRACE5("e", "iIo*i**xx", es_id, order, connector_ids, requests, count);
+ H5TRACE6("e", "iIo*i**xzx", es_id, order, connector_ids, requests, array_len, count);
/* Check arguments */
if (NULL == (es = H5I_object_verify(es_id, H5I_EVENTSET)))
diff --git a/src/H5FD.c b/src/H5FD.c
index 397da34..20f69cb 100644
--- a/src/H5FD.c
+++ b/src/H5FD.c
@@ -928,9 +928,10 @@ H5FD_cmp(const H5FD_t *f1, const H5FD_t *f2)
{
int ret_value = -1; /* Return value */
- FUNC_ENTER_NOAPI_NOERR /* return value is arbitrary */
+ FUNC_ENTER_NOAPI_NOERR; /* return value is arbitrary */
- if ((!f1 || !f1->cls) && (!f2 || !f2->cls)) HGOTO_DONE(0)
+ if ((!f1 || !f1->cls) && (!f2 || !f2->cls))
+ HGOTO_DONE(0)
if (!f1 || !f1->cls)
HGOTO_DONE(-1)
if (!f2 || !f2->cls)
diff --git a/src/H5FDmirror_priv.h b/src/H5FDmirror_priv.h
index 6a7b13e..f647c21 100644
--- a/src/H5FDmirror_priv.h
+++ b/src/H5FDmirror_priv.h
@@ -28,10 +28,10 @@ extern "C" {
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
*/
-/* The maximum allowed size for a receiving buffer when accepting bytes to
+/* Define the maximum allowed size for a receiving buffer when accepting bytes to
* write. Writes larger than this size are performed by multiple accept-write
* steps by the Writer. */
-#define H5FD_MIRROR_DATA_BUFFER_MAX H5_GB /* 1 Gigabyte */
+#define H5FD_MIRROR_DATA_BUFFER_MAX (1024 * 1024 * 1024) /* 1 Gigabyte */
#define H5FD_MIRROR_XMIT_CURR_VERSION 1
#define H5FD_MIRROR_XMIT_MAGIC 0x87F8005B
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index 1feff43..4aa8a96 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -188,6 +188,41 @@ H5FD__mpio_parse_debug_str(const char *s)
FUNC_LEAVE_NOAPI_VOID
} /* end H5FD__mpio_parse_debug_str() */
+
+/*---------------------------------------------------------------------------
+ * Function: H5FD__mem_t_to_str
+ *
+ * Purpose: Returns a string representing the enum value in an H5FD_mem_t
+ * enum. The function sits inside the H5FDmpio_DEBUG conditional
+ * block and is used to label the memory type in the driver's
+ * debug trace output.
+ *
+ * Parameters:
+ * mem_type - the H5FD_mem_t value to translate
+ *
+ * Returns: H5FD_mem_t enum value string. The result is always a string
+ * literal, so the caller must not modify or free it. Any value
+ * without an explicit case yields "(Unknown)".
+ *
+ *---------------------------------------------------------------------------
+ */
+static const char *
+H5FD__mem_t_to_str(H5FD_mem_t mem_type)
+{
+ switch (mem_type) {
+ case H5FD_MEM_NOLIST:
+ return "H5FD_MEM_NOLIST";
+ case H5FD_MEM_DEFAULT:
+ return "H5FD_MEM_DEFAULT";
+ case H5FD_MEM_SUPER:
+ return "H5FD_MEM_SUPER";
+ case H5FD_MEM_BTREE:
+ return "H5FD_MEM_BTREE";
+ case H5FD_MEM_DRAW:
+ return "H5FD_MEM_DRAW";
+ case H5FD_MEM_GHEAP:
+ return "H5FD_MEM_GHEAP";
+ case H5FD_MEM_LHEAP:
+ return "H5FD_MEM_LHEAP";
+ case H5FD_MEM_OHDR:
+ return "H5FD_MEM_OHDR";
+ default:
+ /* Catch-all for any value not listed above */
+ return "(Unknown)";
+ }
+}
#endif /* H5FDmpio_DEBUG */
/*-------------------------------------------------------------------------
@@ -994,7 +1029,6 @@ H5FD__mpio_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out
*flags |= H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */
*flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
*flags |= H5FD_FEAT_HAS_MPI; /* This driver uses MPI */
- *flags |= H5FD_FEAT_ALLOCATE_EARLY; /* Allocate space early instead of late */
*flags |= H5FD_FEAT_DEFAULT_VFD_COMPATIBLE; /* VFD creates a file which can be opened with the default
VFD */
} /* end if */
@@ -1172,6 +1206,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
int n;
#endif
hbool_t use_view_this_time = FALSE;
+ hbool_t derived_type = FALSE;
hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */
#ifdef H5FDmpio_DEBUG
hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
@@ -1199,8 +1234,6 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
if (H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off /*out*/) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
size_i = (int)size;
- if ((hsize_t)size_i != size)
- HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from size to size_i")
/* Only look for MPI views for raw data transfers */
if (type == H5FD_MEM_DRAW) {
@@ -1304,6 +1337,21 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
} /* end if */
else {
+ if (size != (hsize_t)size_i) {
+ /* If HERE, then we need to work around the integer size limit
+ * of 2GB. The input size_t size variable cannot fit into an integer,
+ * but we can get around that limitation by creating a different datatype
+ * and then setting the integer size (or element count) to 1 when using
+ * the derived_type.
+ */
+
+ if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+
+ derived_type = TRUE;
+ size_i = 1;
+ }
+
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
HDfprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
@@ -1366,8 +1414,8 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
- HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_read = %lld\n", __func__, file->mpi_rank,
- (long)mpi_off, bytes_read);
+ HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_read = %lld type = %s\n", __func__, file->mpi_rank,
+ (long)mpi_off, bytes_read, H5FD__mem_t_to_str(type));
#endif
/*
@@ -1377,6 +1425,9 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
HDmemset((char *)buf + bytes_read, 0, (size_t)n);
done:
+ if (derived_type)
+ MPI_Type_free(&buf_type);
+
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
HDfprintf(stderr, "%s: (%d) Leaving\n", __func__, file->mpi_rank);
@@ -1489,20 +1540,6 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h
*/
mpi_off = 0;
} /* end if */
- else if (size != (hsize_t)size_i) {
- /* If HERE, then we need to work around the integer size limit
- * of 2GB. The input size_t size variable cannot fit into an integer,
- * but we can get around that limitation by creating a different datatype
- * and then setting the integer size (or element count) to 1 when using
- * the derived_type.
- */
-
- if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
- HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
-
- derived_type = TRUE;
- size_i = 1;
- }
/* Write the data. */
if (use_view_this_time) {
@@ -1548,6 +1585,21 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
} /* end if */
else {
+ if (size != (hsize_t)size_i) {
+ /* If HERE, then we need to work around the integer size limit
+ * of 2GB. The input size_t size variable cannot fit into an integer,
+ * but we can get around that limitation by creating a different datatype
+ * and then setting the integer size (or element count) to 1 when using
+ * the derived_type.
+ */
+
+ if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+
+ derived_type = TRUE;
+ size_i = 1;
+ }
+
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
HDfprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
@@ -1583,8 +1635,8 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
- HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_written = %lld\n", __func__, file->mpi_rank,
- (long)mpi_off, bytes_written);
+ HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_written = %lld type = %s\n", __func__,
+ file->mpi_rank, (long)mpi_off, bytes_written, H5FD__mem_t_to_str(type));
#endif
/* Each process will keep track of its perceived EOF value locally, and
diff --git a/src/H5Fmpi.c b/src/H5Fmpi.c
index 53d2d78..02d8d52 100644
--- a/src/H5Fmpi.c
+++ b/src/H5Fmpi.c
@@ -31,11 +31,12 @@
/***********/
/* Headers */
/***********/
-#include "H5private.h" /* Generic Functions */
-#include "H5Eprivate.h" /* Error handling */
-#include "H5Fpkg.h" /* File access */
-#include "H5FDprivate.h" /* File drivers */
-#include "H5Iprivate.h" /* IDs */
+#include "H5private.h" /* Generic Functions */
+#include "H5CXprivate.h" /* API Contexts */
+#include "H5Eprivate.h" /* Error handling */
+#include "H5Fpkg.h" /* File access */
+#include "H5FDprivate.h" /* File drivers */
+#include "H5Iprivate.h" /* IDs */
#include "H5VLnative_private.h" /* Native VOL connector */
@@ -402,4 +403,189 @@ H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mpi_comm)
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5F_mpi_retrieve_comm */
+
+/*-------------------------------------------------------------------------
+ * Function: H5F_get_coll_metadata_reads
+ *
+ * Purpose: Determines whether collective metadata reads should be
+ * performed. This routine is meant to be the single source of
+ * truth for the collective metadata reads status, as it
+ * coordinates between the file-global flag and the flag set
+ * for the current operation in the current API context.
+ *
+ * Precedence, highest first:
+ * 1. file flag H5P_FORCE_FALSE -> FALSE
+ * 2. file flag H5P_USER_TRUE -> TRUE
+ * 3. any other file flag -> value from the API context
+ *
+ * Parameters:
+ * file - file to query (asserted non-NULL, with a non-NULL `shared`)
+ *
+ * Return: TRUE/FALSE (can't fail)
+ *
+ *-------------------------------------------------------------------------
+ */
+hbool_t
+H5F_get_coll_metadata_reads(const H5F_t *file)
+{
+ H5P_coll_md_read_flag_t file_flag = H5P_USER_FALSE; /* File-global collective metadata reads flag */
+ hbool_t ret_value = FALSE; /* Return value */
+
+ FUNC_ENTER_NOAPI_NOERR
+
+ HDassert(file && file->shared);
+
+ /* Retrieve the file-global flag */
+ file_flag = H5F_COLL_MD_READ(file);
+
+ /* If file flag is set to H5P_FORCE_FALSE, exit early
+ * with FALSE, since collective metadata reads have
+ * been explicitly disabled somewhere in the library.
+ */
+ if (H5P_FORCE_FALSE == file_flag)
+ ret_value = FALSE;
+ else {
+ /* If file flag is set to H5P_USER_TRUE, ignore
+ * any settings in the API context. A file-global
+ * setting of H5P_USER_TRUE for collective metadata
+ * reads should ignore any settings on an Access
+ * Property List for an individual operation.
+ */
+ if (H5P_USER_TRUE == file_flag)
+ ret_value = TRUE;
+ else {
+ /* Get the collective metadata reads flag from
+ * the current API context.
+ */
+ ret_value = H5CX_get_coll_metadata_read();
+ }
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5F_get_coll_metadata_reads() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5F_set_coll_metadata_reads
+ *
+ * Purpose: Used to temporarily modify the collective metadata reads
+ * status. This is useful for cases where either:
+ *
+ * * Collective metadata reads are enabled, but need to be
+ * disabled for an operation about to occur that may trigger
+ * an independent metadata read (such as only rank 0 doing
+ * something)
+ *
+ * * Metadata reads are currently independent, but it is
+ * guaranteed that the application has maintained
+ * collectivity at the interface level (e.g., an operation
+ * that modifies metadata is being performed). In this case,
+ * it should be safe to enable collective metadata reads,
+ * barring any internal library issues that may occur
+ *
+ * After completion, the `file_flag` parameter will be set to
+ * the previous value of the file-global collective metadata
+ * reads flag. The `context_flag` parameter will be set to the
+ * previous value of the API context's collective metadata
+ * reads flag. Another call to this routine should be made to
+ * restore these values (see the warning below).
+ *
+ * !! WARNING !!
+ * It is dangerous to modify the collective metadata reads
+ * status, as this can cause crashes, hangs and corruption in
+ * the HDF5 file when improperly done. Therefore, the
+ * `file_flag` and `context_flag` parameters are both
+ * mandatory, and it is assumed that the caller will guarantee
+ * these settings are restored with another call to this
+ * routine once the bracketed operation is complete.
+ * !! WARNING !!
+ *
+ * Parameters:
+ * file - file whose flag is modified (asserted non-NULL,
+ * with a non-NULL `shared`)
+ * file_flag - IN: new value for the file-global flag
+ * OUT: previous value of the file-global flag
+ * (asserted non-NULL)
+ * context_flag - IN: new value for the API context flag
+ * OUT: previous value of the API context flag
+ * (asserted non-NULL)
+ *
+ * Return: Nothing
+ *
+ *-------------------------------------------------------------------------
+ */
+void
+H5F_set_coll_metadata_reads(H5F_t *file, H5P_coll_md_read_flag_t *file_flag, hbool_t *context_flag)
+{
+ H5P_coll_md_read_flag_t prev_file_flag = H5P_USER_FALSE; /* File-global flag before this call */
+ hbool_t prev_context_flag = FALSE; /* API context flag before this call */
+
+ FUNC_ENTER_NOAPI_NOERR
+
+ HDassert(file && file->shared);
+ HDassert(file_flag);
+ HDassert(context_flag);
+
+ /* Save old state */
+ prev_file_flag = H5F_COLL_MD_READ(file);
+ prev_context_flag = H5CX_get_coll_metadata_read();
+
+ /* Set new desired state */
+ /* (each flag is swapped only when the requested value differs from the current
+ * one; when they are equal the caller's variable already holds the previous
+ * value, so nothing needs to be written) */
+ if (prev_file_flag != *file_flag) {
+ file->shared->coll_md_read = *file_flag;
+ *file_flag = prev_file_flag;
+ }
+ if (prev_context_flag != *context_flag) {
+ H5CX_set_coll_metadata_read(*context_flag);
+ *context_flag = prev_context_flag;
+ }
+
+ FUNC_LEAVE_NOAPI_VOID
+} /* end H5F_set_coll_metadata_reads() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5F_mpi_get_file_block_type
+ *
+ * Purpose: Creates an MPI derived datatype for communicating an
+ * H5F_block_t structure. If `commit` is specified as TRUE,
+ * the resulting datatype will be committed and ready for
+ * use in communication. Otherwise, the type is only suitable
+ * for building other derived types.
+ *
+ * If TRUE is returned through `new_type_derived`, this lets
+ * the caller know that the datatype has been derived and
+ * should be freed with MPI_Type_free once it is no longer
+ * needed.
+ *
+ * Parameters:
+ * commit - TRUE to also commit the new datatype
+ * new_type - OUT: the created MPI datatype
+ * (asserted non-NULL)
+ * new_type_derived - OUT: TRUE if `new_type` must be freed by the
+ * caller; reset to FALSE if this routine
+ * fails (asserted non-NULL)
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_mpi_get_file_block_type(hbool_t commit, MPI_Datatype *new_type, hbool_t *new_type_derived)
+{
+ MPI_Datatype types[2]; /* MPI type of each H5F_block_t field */
+ MPI_Aint displacements[2]; /* Byte offset of each field within H5F_block_t */
+ int block_lengths[2]; /* Element count of each field */
+ int field_count; /* Number of fields described */
+ int mpi_code; /* MPI return code */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ HDassert(new_type);
+ HDassert(new_type_derived);
+
+ *new_type_derived = FALSE;
+
+ /* field_count must agree with the number of entries in types[] */
+ field_count = 2;
+ HDassert(field_count == sizeof(types) / sizeof(MPI_Datatype));
+
+ /* Describe the `offset` and `length` members, one element each.
+ * HADDR_AS_MPI_TYPE / HSIZE_AS_MPI_TYPE are presumably the MPI
+ * equivalents of haddr_t / hsize_t - confirm against their definitions.
+ */
+ block_lengths[0] = 1;
+ block_lengths[1] = 1;
+ displacements[0] = offsetof(H5F_block_t, offset);
+ displacements[1] = offsetof(H5F_block_t, length);
+ types[0] = HADDR_AS_MPI_TYPE;
+ types[1] = HSIZE_AS_MPI_TYPE;
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Type_create_struct(field_count, block_lengths, displacements, types, new_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code)
+ /* Flag the type as derived as soon as it exists so the error path below frees it */
+ *new_type_derived = TRUE;
+
+ if (commit && MPI_SUCCESS != (mpi_code = MPI_Type_commit(new_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+done:
+ /* On failure, free any type that was created and clear the flag so the
+ * caller is not told to free it again */
+ if (ret_value < 0) {
+ if (*new_type_derived) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(new_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ *new_type_derived = FALSE;
+ }
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5F_mpi_get_file_block_type() */
+
#endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h
index a5ccbab..67e153e 100644
--- a/src/H5Fprivate.h
+++ b/src/H5Fprivate.h
@@ -962,6 +962,9 @@ H5_DLL MPI_Comm H5F_mpi_get_comm(const H5F_t *f);
H5_DLL int H5F_shared_mpi_get_size(const H5F_shared_t *f_sh);
H5_DLL int H5F_mpi_get_size(const H5F_t *f);
H5_DLL herr_t H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mpi_comm);
+H5_DLL herr_t H5F_mpi_get_file_block_type(hbool_t commit, MPI_Datatype *new_type, hbool_t *new_type_derived);
+H5_DLL hbool_t H5F_get_coll_metadata_reads(const H5F_t *f);
+H5_DLL void H5F_set_coll_metadata_reads(H5F_t *f, H5P_coll_md_read_flag_t *file_flag, hbool_t *context_flag);
#endif /* H5_HAVE_PARALLEL */
/* External file cache routines */
diff --git a/src/H5Ocache.c b/src/H5Ocache.c
index ba47da3..c7586cc 100644
--- a/src/H5Ocache.c
+++ b/src/H5Ocache.c
@@ -346,7 +346,7 @@ H5O__cache_deserialize(const void *image, size_t len, void *_udata, hbool_t *dir
done:
/* Release the [possibly partially initialized] object header on errors */
if (!ret_value && oh)
- if (H5O__free(oh) < 0)
+ if (H5O__free(oh, FALSE) < 0)
HDONE_ERROR(H5E_OHDR, H5E_CANTRELEASE, NULL, "unable to destroy object header data")
FUNC_LEAVE_NOAPI(ret_value)
@@ -639,7 +639,7 @@ H5O__cache_free_icr(void *_thing)
HDassert(oh->cache_info.type == H5AC_OHDR);
/* Destroy object header */
- if (H5O__free(oh) < 0)
+ if (H5O__free(oh, FALSE) < 0)
HGOTO_ERROR(H5E_OHDR, H5E_CANTRELEASE, FAIL, "can't destroy object header")
done:
@@ -1242,7 +1242,7 @@ H5O__prefix_deserialize(const uint8_t *_image, H5O_cache_ud_t *udata)
/* Save the object header for later use in 'deserialize' callback */
udata->oh = oh;
- if (H5O__free(saved_oh) < 0)
+ if (H5O__free(saved_oh, FALSE) < 0)
HGOTO_ERROR(H5E_OHDR, H5E_CANTRELEASE, FAIL, "can't destroy object header")
udata->free_oh = FALSE;
}
@@ -1255,7 +1255,7 @@ H5O__prefix_deserialize(const uint8_t *_image, H5O_cache_ud_t *udata)
done:
/* Release the [possibly partially initialized] object header on errors */
if (ret_value < 0 && oh)
- if (H5O__free(oh) < 0)
+ if (H5O__free(oh, FALSE) < 0)
HDONE_ERROR(H5E_OHDR, H5E_CANTRELEASE, FAIL, "unable to destroy object header data")
FUNC_LEAVE_NOAPI(ret_value)
diff --git a/src/H5Ocopy.c b/src/H5Ocopy.c
index 05dfc72..0b0bb55 100644
--- a/src/H5Ocopy.c
+++ b/src/H5Ocopy.c
@@ -33,7 +33,6 @@
#include "H5Aprivate.h" /* Attributes */
#include "H5CXprivate.h" /* API Contexts */
#include "H5Eprivate.h" /* Error handling */
-#include "H5ESprivate.h" /* Event Sets */
#include "H5FLprivate.h" /* Free lists */
#include "H5Iprivate.h" /* IDs */
#include "H5HGprivate.h" /* Global Heaps */
@@ -772,7 +771,7 @@ done:
/* Free destination object header on failure */
if (ret_value < 0) {
if (oh_dst && !inserted) {
- if (H5O__free(oh_dst) < 0)
+ if (H5O__free(oh_dst, TRUE) < 0)
HDONE_ERROR(H5E_OHDR, H5E_CANTFREE, FAIL, "unable to destroy object header data")
if (H5O_loc_reset(oloc_dst) < 0)
HDONE_ERROR(H5E_OHDR, H5E_CANTFREE, FAIL, "unable to destroy object header data")
diff --git a/src/H5Ocopy_ref.c b/src/H5Ocopy_ref.c
index f1f8aaf..1cda3ea 100644
--- a/src/H5Ocopy_ref.c
+++ b/src/H5Ocopy_ref.c
@@ -288,21 +288,22 @@ H5O__copy_expand_ref_object2(H5O_loc_t *src_oloc, hid_t tid_src, const H5T_t *dt
size_t nbytes_src, H5O_loc_t *dst_oloc, H5G_loc_t *dst_root_loc, void *buf_dst,
size_t ref_count, H5O_copy_t *cpy_info)
{
- H5T_t * dt_mem = NULL; /* Memory datatype */
- H5T_t * dt_dst = NULL; /* Destination datatype */
- hid_t tid_mem = H5I_INVALID_HID; /* Datatype ID for memory datatype */
- hid_t tid_dst = H5I_INVALID_HID; /* Datatype ID for memory datatype */
- H5T_path_t *tpath_src_mem = NULL, *tpath_mem_dst = NULL; /* Datatype conversion paths */
- size_t i; /* Local index variable */
- hbool_t reg_tid_src = (tid_src == H5I_INVALID_HID);
- hid_t dst_loc_id = H5I_INVALID_HID;
- void * conv_buf = NULL; /* Buffer for converting data */
- size_t conv_buf_size = 0; /* Buffer size */
- void * reclaim_buf = NULL; /* Buffer for reclaiming data */
- H5S_t * buf_space = NULL; /* Dataspace describing buffer */
- hsize_t buf_dim[1] = {ref_count}; /* Dimension for buffer */
- size_t token_size = H5F_SIZEOF_ADDR(src_oloc->file);
- herr_t ret_value = SUCCEED;
+ H5T_t * dt_mem = NULL; /* Memory datatype */
+ H5T_t * dt_dst = NULL; /* Destination datatype */
+ hid_t tid_mem = H5I_INVALID_HID; /* Datatype ID for memory datatype */
+ hid_t tid_dst = H5I_INVALID_HID; /* Datatype ID for memory datatype */
+ H5T_path_t * tpath_src_mem = NULL, *tpath_mem_dst = NULL; /* Datatype conversion paths */
+ size_t i; /* Local index variable */
+ hbool_t reg_tid_src = (tid_src == H5I_INVALID_HID);
+ hid_t dst_loc_id = H5I_INVALID_HID;
+ void * conv_buf = NULL; /* Buffer for converting data */
+ size_t conv_buf_size = 0; /* Buffer size */
+ void * reclaim_buf = NULL; /* Buffer for reclaiming data */
+ H5S_t * buf_space = NULL; /* Dataspace describing buffer */
+ hsize_t buf_dim[1] = {ref_count}; /* Dimension for buffer */
+ size_t token_size = H5F_SIZEOF_ADDR(src_oloc->file);
+ const unsigned char zeros[H5R_REF_BUF_SIZE] = {0};
+ herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
@@ -353,29 +354,34 @@ H5O__copy_expand_ref_object2(H5O_loc_t *src_oloc, hid_t tid_src, const H5T_t *dt
/* Making equivalent references in the destination file */
for (i = 0; i < ref_count; i++) {
- H5R_ref_t * ref_ptr = (H5R_ref_t *)conv_buf;
- H5R_ref_priv_t *ref = (H5R_ref_priv_t *)&ref_ptr[i];
- H5O_token_t tmp_token = {0};
-
- /* Get src object address */
- if (H5R__get_obj_token(ref, &tmp_token, &token_size) < 0)
- HGOTO_ERROR(H5E_OHDR, H5E_CANTGET, FAIL, "unable to get object token")
- if (H5VL_native_token_to_addr(src_oloc->file, H5I_FILE, tmp_token, &src_oloc->addr) < 0)
- HGOTO_ERROR(H5E_OHDR, H5E_CANTUNSERIALIZE, FAIL, "can't deserialize object token into address")
-
- /* Attempt to copy object from source to destination file */
- if (H5O__copy_obj_by_ref(src_oloc, dst_oloc, dst_root_loc, cpy_info) < 0)
- HGOTO_ERROR(H5E_OHDR, H5E_CANTCOPY, FAIL, "unable to copy object")
-
- /* Set dst object address */
- if (H5VL_native_addr_to_token(dst_oloc->file, H5I_FILE, dst_oloc->addr, &tmp_token) < 0)
- HGOTO_ERROR(H5E_OHDR, H5E_CANTSERIALIZE, FAIL, "can't serialize address into object token")
- if (H5R__set_obj_token(ref, (const H5O_token_t *)&tmp_token, token_size) < 0)
- HGOTO_ERROR(H5E_OHDR, H5E_CANTSET, FAIL, "unable to set object token")
- /* Do not set app_ref since references are released once the copy is done */
- if (H5R__set_loc_id(ref, dst_loc_id, TRUE, FALSE) < 0)
- HGOTO_ERROR(H5E_OHDR, H5E_CANTSET, FAIL, "unable to set destination loc id")
- } /* end for */
+ H5R_ref_t * ref_ptr = (H5R_ref_t *)conv_buf;
+ H5R_ref_priv_t *ref = (H5R_ref_priv_t *)&ref_ptr[i];
+
+ /* Check for null reference - only expand reference if it is not null */
+ if (HDmemcmp(ref, zeros, H5R_REF_BUF_SIZE)) {
+ H5O_token_t tmp_token = {0};
+
+ /* Get src object address */
+ if (H5R__get_obj_token(ref, &tmp_token, &token_size) < 0)
+ HGOTO_ERROR(H5E_OHDR, H5E_CANTGET, FAIL, "unable to get object token")
+ if (H5VL_native_token_to_addr(src_oloc->file, H5I_FILE, tmp_token, &src_oloc->addr) < 0)
+ HGOTO_ERROR(H5E_OHDR, H5E_CANTUNSERIALIZE, FAIL,
+ "can't deserialize object token into address")
+
+ /* Attempt to copy object from source to destination file */
+ if (H5O__copy_obj_by_ref(src_oloc, dst_oloc, dst_root_loc, cpy_info) < 0)
+ HGOTO_ERROR(H5E_OHDR, H5E_CANTCOPY, FAIL, "unable to copy object")
+
+ /* Set dst object address */
+ if (H5VL_native_addr_to_token(dst_oloc->file, H5I_FILE, dst_oloc->addr, &tmp_token) < 0)
+ HGOTO_ERROR(H5E_OHDR, H5E_CANTSERIALIZE, FAIL, "can't serialize address into object token")
+ if (H5R__set_obj_token(ref, (const H5O_token_t *)&tmp_token, token_size) < 0)
+ HGOTO_ERROR(H5E_OHDR, H5E_CANTSET, FAIL, "unable to set object token")
+ /* Do not set app_ref since references are released once the copy is done */
+ if (H5R__set_loc_id(ref, dst_loc_id, TRUE, FALSE) < 0)
+ HGOTO_ERROR(H5E_OHDR, H5E_CANTSET, FAIL, "unable to set destination loc id")
+ } /* end if */
+ } /* end for */
/* Copy into another buffer, to reclaim memory later */
if (NULL == (reclaim_buf = H5FL_BLK_MALLOC(type_conv, conv_buf_size)))
diff --git a/src/H5Odtype.c b/src/H5Odtype.c
index fa49924..9af79f4 100644
--- a/src/H5Odtype.c
+++ b/src/H5Odtype.c
@@ -1731,7 +1731,7 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
case H5T_NO_CLASS:
case H5T_NCLASSES:
default:
- HDsprintf(buf, "H5T_CLASS_%d", (int)(dt->shared->type));
+ HDsnprintf(buf, sizeof(buf), "H5T_CLASS_%d", (int)(dt->shared->type));
s = buf;
break;
} /* end switch */
@@ -1746,7 +1746,7 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
HDfprintf(stream, "%*s%-*s %u\n", indent, "", fwidth,
"Number of members:", dt->shared->u.compnd.nmembs);
for (i = 0; i < dt->shared->u.compnd.nmembs; i++) {
- HDsprintf(buf, "Member %u:", i);
+ HDsnprintf(buf, sizeof(buf), "Member %u:", i);
HDfprintf(stream, "%*s%-*s %s\n", indent, "", fwidth, buf, dt->shared->u.compnd.memb[i].name);
HDfprintf(stream, "%*s%-*s %lu\n", indent + 3, "", MAX(0, fwidth - 3),
"Byte offset:", (unsigned long)(dt->shared->u.compnd.memb[i].offset));
@@ -1759,7 +1759,7 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
HDfprintf(stream, "%*s%-*s %u\n", indent, "", fwidth,
"Number of members:", dt->shared->u.enumer.nmembs);
for (i = 0; i < dt->shared->u.enumer.nmembs; i++) {
- HDsprintf(buf, "Member %u:", i);
+ HDsnprintf(buf, sizeof(buf), "Member %u:", i);
HDfprintf(stream, "%*s%-*s %s\n", indent, "", fwidth, buf, dt->shared->u.enumer.name[i]);
HDfprintf(stream, "%*s%-*s 0x", indent, "", fwidth, "Raw bytes of value:");
for (k = 0; k < dt->shared->parent->shared->size; k++)
@@ -1799,13 +1799,14 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
case H5T_CSET_RESERVED_13:
case H5T_CSET_RESERVED_14:
case H5T_CSET_RESERVED_15:
- HDsprintf(buf, "H5T_CSET_RESERVED_%d", (int)(dt->shared->u.atomic.u.s.cset));
+ HDsnprintf(buf, sizeof(buf), "H5T_CSET_RESERVED_%d", (int)(dt->shared->u.atomic.u.s.cset));
s = buf;
break;
case H5T_CSET_ERROR:
default:
- HDsprintf(buf, "Unknown character set: %d", (int)(dt->shared->u.atomic.u.s.cset));
+ HDsnprintf(buf, sizeof(buf), "Unknown character set: %d",
+ (int)(dt->shared->u.atomic.u.s.cset));
s = buf;
break;
} /* end switch */
@@ -1837,13 +1838,14 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
case H5T_STR_RESERVED_13:
case H5T_STR_RESERVED_14:
case H5T_STR_RESERVED_15:
- HDsprintf(buf, "H5T_STR_RESERVED_%d", (int)(dt->shared->u.atomic.u.s.pad));
+ HDsnprintf(buf, sizeof(buf), "H5T_STR_RESERVED_%d", (int)(dt->shared->u.atomic.u.s.pad));
s = buf;
break;
case H5T_STR_ERROR:
default:
- HDsprintf(buf, "Unknown string padding: %d", (int)(dt->shared->u.atomic.u.s.pad));
+ HDsnprintf(buf, sizeof(buf), "Unknown string padding: %d",
+ (int)(dt->shared->u.atomic.u.s.pad));
s = buf;
break;
} /* end switch */
@@ -1862,7 +1864,7 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
case H5T_VLEN_BADTYPE:
case H5T_VLEN_MAXTYPE:
default:
- HDsprintf(buf, "H5T_VLEN_%d", dt->shared->u.vlen.type);
+ HDsnprintf(buf, sizeof(buf), "H5T_VLEN_%d", dt->shared->u.vlen.type);
s = buf;
break;
} /* end switch */
@@ -1880,7 +1882,7 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
case H5T_LOC_BADLOC:
case H5T_LOC_MAXLOC:
default:
- HDsprintf(buf, "H5T_LOC_%d", (int)dt->shared->u.vlen.loc);
+ HDsnprintf(buf, sizeof(buf), "H5T_LOC_%d", (int)dt->shared->u.vlen.loc);
s = buf;
break;
} /* end switch */
@@ -1911,13 +1913,13 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
case H5T_CSET_RESERVED_13:
case H5T_CSET_RESERVED_14:
case H5T_CSET_RESERVED_15:
- HDsprintf(buf, "H5T_CSET_RESERVED_%d", (int)(dt->shared->u.vlen.cset));
+ HDsnprintf(buf, sizeof(buf), "H5T_CSET_RESERVED_%d", (int)(dt->shared->u.vlen.cset));
s = buf;
break;
case H5T_CSET_ERROR:
default:
- HDsprintf(buf, "Unknown character set: %d", (int)(dt->shared->u.vlen.cset));
+ HDsnprintf(buf, sizeof(buf), "Unknown character set: %d", (int)(dt->shared->u.vlen.cset));
s = buf;
break;
} /* end switch */
@@ -1949,13 +1951,13 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
case H5T_STR_RESERVED_13:
case H5T_STR_RESERVED_14:
case H5T_STR_RESERVED_15:
- HDsprintf(buf, "H5T_STR_RESERVED_%d", (int)(dt->shared->u.vlen.pad));
+ HDsnprintf(buf, sizeof(buf), "H5T_STR_RESERVED_%d", (int)(dt->shared->u.vlen.pad));
s = buf;
break;
case H5T_STR_ERROR:
default:
- HDsprintf(buf, "Unknown string padding: %d", (int)(dt->shared->u.vlen.pad));
+ HDsnprintf(buf, sizeof(buf), "Unknown string padding: %d", (int)(dt->shared->u.vlen.pad));
s = buf;
break;
} /* end switch */
@@ -1995,7 +1997,7 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
case H5T_ORDER_ERROR:
default:
- HDsprintf(buf, "H5T_ORDER_%d", dt->shared->u.atomic.order);
+ HDsnprintf(buf, sizeof(buf), "H5T_ORDER_%d", dt->shared->u.atomic.order);
s = buf;
break;
} /* end switch */
@@ -2069,9 +2071,9 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
case H5T_NPAD:
default:
if (dt->shared->u.atomic.u.f.pad < 0)
- HDsprintf(buf, "H5T_PAD_%d", -(dt->shared->u.atomic.u.f.pad));
+ HDsnprintf(buf, sizeof(buf), "H5T_PAD_%d", -(dt->shared->u.atomic.u.f.pad));
else
- HDsprintf(buf, "bit-%d", dt->shared->u.atomic.u.f.pad);
+ HDsnprintf(buf, sizeof(buf), "bit-%d", dt->shared->u.atomic.u.f.pad);
s = buf;
break;
} /* end switch */
@@ -2092,7 +2094,7 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
case H5T_NORM_ERROR:
default:
- HDsprintf(buf, "H5T_NORM_%d", (int)(dt->shared->u.atomic.u.f.norm));
+ HDsnprintf(buf, sizeof(buf), "H5T_NORM_%d", (int)(dt->shared->u.atomic.u.f.norm));
s = buf;
} /* end switch */
HDfprintf(stream, "%*s%-*s %s\n", indent, "", fwidth, "Normalization:", s);
@@ -2129,7 +2131,7 @@ H5O__dtype_debug(H5F_t *f, const void *mesg, FILE *stream, int indent, int fwidt
case H5T_SGN_ERROR:
case H5T_NSGN:
default:
- HDsprintf(buf, "H5T_SGN_%d", (int)(dt->shared->u.atomic.u.i.sign));
+ HDsnprintf(buf, sizeof(buf), "H5T_SGN_%d", (int)(dt->shared->u.atomic.u.i.sign));
s = buf;
break;
} /* end switch */
diff --git a/src/H5Oint.c b/src/H5Oint.c
index ee79b0c..2348790 100644
--- a/src/H5Oint.c
+++ b/src/H5Oint.c
@@ -289,7 +289,7 @@ H5O_create(H5F_t *f, size_t size_hint, size_t initial_rc, hid_t ocpl_id, H5O_loc
HGOTO_ERROR(H5E_OHDR, H5E_BADVALUE, FAIL, "Can't apply object header to file")
done:
- if ((FAIL == ret_value) && (NULL != oh) && (H5O__free(oh) < 0))
+ if ((FAIL == ret_value) && (NULL != oh) && (H5O__free(oh, TRUE) < 0))
HDONE_ERROR(H5E_OHDR, H5E_CANTFREE, FAIL, "can't delete object header")
FUNC_LEAVE_NOAPI(ret_value)
@@ -353,7 +353,7 @@ H5O_create_ohdr(H5F_t *f, hid_t ocpl_id)
ret_value = oh;
done:
- if ((NULL == ret_value) && (NULL != oh) && (H5O__free(oh) < 0))
+ if ((NULL == ret_value) && (NULL != oh) && (H5O__free(oh, TRUE) < 0))
HDONE_ERROR(H5E_OHDR, H5E_CANTFREE, NULL, "can't delete object header")
FUNC_LEAVE_NOAPI(ret_value)
@@ -3014,7 +3014,7 @@ H5O_get_proxy(const H5O_t *oh)
*-------------------------------------------------------------------------
*/
herr_t
-H5O__free(H5O_t *oh)
+H5O__free(H5O_t *oh, hbool_t force)
{
unsigned u; /* Local index variable */
herr_t ret_value = SUCCEED; /* Return value */
@@ -3038,10 +3038,12 @@ H5O__free(H5O_t *oh)
for (u = 0; u < oh->nmesgs; u++) {
#ifndef NDEBUG
/* Verify that message is clean, unless it could have been marked
- * dirty by decoding */
+ * dirty by decoding, or if this is a forced free (in case of
+ * failure during creation of the object some messages may be dirty)
+ */
if (oh->ndecode_dirtied && oh->mesg[u].dirty)
oh->ndecode_dirtied--;
- else
+ else if (!force)
HDassert(oh->mesg[u].dirty == 0);
#endif /* NDEBUG */
diff --git a/src/H5Opkg.h b/src/H5Opkg.h
index ebfe636..1fe918d 100644
--- a/src/H5Opkg.h
+++ b/src/H5Opkg.h
@@ -551,7 +551,7 @@ H5_DLL herr_t H5O__visit(H5G_loc_t *loc, const char *obj_name, H5_index_t idx_ty
H5O_iterate2_t op, void *op_data, unsigned fields);
H5_DLL herr_t H5O__inc_rc(H5O_t *oh);
H5_DLL herr_t H5O__dec_rc(H5O_t *oh);
-H5_DLL herr_t H5O__free(H5O_t *oh);
+H5_DLL herr_t H5O__free(H5O_t *oh, hbool_t force);
/* Object header message routines */
H5_DLL herr_t H5O__msg_alloc(H5F_t *f, H5O_t *oh, const H5O_msg_class_t *type, unsigned *mesg_flags,
diff --git a/src/H5PLpath.c b/src/H5PLpath.c
index 87ff831..b86fd6e 100644
--- a/src/H5PLpath.c
+++ b/src/H5PLpath.c
@@ -709,7 +709,7 @@ H5PL__path_table_iterate_process_path(const char *plugin_path, H5PL_iterate_type
/* Specify a file mask. *.* = We want everything! -
* skip the path if the directory can't be opened */
- HDsprintf(service, "%s\\*.dll", plugin_path);
+ HDsnprintf(service, sizeof(service), "%s\\*.dll", plugin_path);
if ((hFind = FindFirstFileA(service, &fdFile)) == INVALID_HANDLE_VALUE)
HGOTO_DONE(H5_ITER_CONT)
@@ -934,7 +934,7 @@ H5PL__find_plugin_in_path(const H5PL_search_params_t *search_params, hbool_t *fo
*found = FALSE;
/* Specify a file mask. *.* = We want everything! */
- HDsprintf(service, "%s\\*.dll", dir);
+ HDsnprintf(service, sizeof(service), "%s\\*.dll", dir);
if ((hFind = FindFirstFileA(service, &fdFile)) == INVALID_HANDLE_VALUE)
HGOTO_ERROR(H5E_PLUGIN, H5E_OPENERROR, FAIL, "can't open directory")
diff --git a/src/H5Pfapl.c b/src/H5Pfapl.c
index 47c17db..2c3caa8 100644
--- a/src/H5Pfapl.c
+++ b/src/H5Pfapl.c
@@ -5284,15 +5284,14 @@ H5P__decode_coll_md_read_flag_t(const void **_pp, void *_value)
* Function: H5Pset_all_coll_metadata_ops
*
* Purpose: Tell the library whether the metadata read operations will
- * be done collectively (1) or not (0). Default is independent.
- * With collective mode, the library will optimize access to
- * metadata operations on the file.
+ * be done collectively (1) or not (0). Default is independent.
+ * With collective mode, the library will optimize access to
+ * metadata operations on the file.
*
* Note: This routine accepts file access property lists, link
- * access property lists, attribute access property lists,
- * dataset access property lists, group access property lists,
- * named datatype access property lists,
- * and dataset transfer property lists.
+ * access property lists, attribute access property lists,
+ * dataset access property lists, group access property lists
+ * and named datatype access property lists.
*
* Return: Non-negative on success/Negative on failure
*
@@ -5312,7 +5311,7 @@ H5Pset_all_coll_metadata_ops(hid_t plist_id, hbool_t is_collective)
H5TRACE2("e", "ib", plist_id, is_collective);
/* Compare the property list's class against the other class */
- /* (Dataset, group, attribute, and named datype access property lists
+ /* (Dataset, group, attribute, and named datatype access property lists
* are sub-classes of link access property lists -QAK)
*/
if (TRUE != H5P_isa_class(plist_id, H5P_LINK_ACCESS) && TRUE != H5P_isa_class(plist_id, H5P_FILE_ACCESS))
@@ -5342,10 +5341,9 @@ done:
* Purpose: Gets information about collective metadata read mode.
*
* Note: This routine accepts file access property lists, link
- * access property lists, attribute access property lists,
- * dataset access property lists, group access property lists,
- * named datatype access property lists,
- * and dataset transfer property lists.
+ * access property lists, attribute access property lists,
+ * dataset access property lists, group access property lists,
+ * and named datatype access property lists.
*
* Return: Non-negative on success/Negative on failure
*
@@ -5363,7 +5361,7 @@ H5Pget_all_coll_metadata_ops(hid_t plist_id, hbool_t *is_collective /*out*/)
H5TRACE2("e", "ix", plist_id, is_collective);
/* Compare the property list's class against the other class */
- /* (Dataset, group, attribute, and named datype access property lists
+ /* (Dataset, group, attribute, and named datatype access property lists
* are sub-classes of link access property lists -QAK)
*/
if (TRUE != H5P_isa_class(plist_id, H5P_LINK_ACCESS) && TRUE != H5P_isa_class(plist_id, H5P_FILE_ACCESS))
diff --git a/src/H5Tcommit.c b/src/H5Tcommit.c
index e99494a..a3a1aa0 100644
--- a/src/H5Tcommit.c
+++ b/src/H5Tcommit.c
@@ -1127,7 +1127,7 @@ H5T_open(const H5G_loc_t *loc)
done:
if (ret_value == NULL) {
if (dt) {
- if (shared_fo == NULL) { /* Need to free shared of */
+ if (shared_fo == NULL) { /* Need to free shared file object */
if (dt->shared->owned_vol_obj && H5VL_free_object(dt->shared->owned_vol_obj) < 0)
HDONE_ERROR(H5E_DATATYPE, H5E_CANTCLOSEOBJ, NULL, "unable to close owned VOL object")
dt->shared = H5FL_FREE(H5T_shared_t, dt->shared);
diff --git a/src/H5Tnative.c b/src/H5Tnative.c
index 44c6d45..d56ac99 100644
--- a/src/H5Tnative.c
+++ b/src/H5Tnative.c
@@ -43,7 +43,7 @@ static herr_t H5T__cmp_offset(size_t *comp_size, size_t *offset, size_t elem_siz
*
* Purpose: High-level API to return the native type of a datatype.
* The native type is chosen by matching the size and class of
- * querried datatype from the following native premitive
+ * queried datatype from the following native primitive
* datatypes:
* H5T_NATIVE_CHAR H5T_NATIVE_UCHAR
* H5T_NATIVE_SHORT H5T_NATIVE_USHORT
@@ -56,7 +56,7 @@ static herr_t H5T__cmp_offset(size_t *comp_size, size_t *offset, size_t elem_siz
* H5T_NATIVE_LDOUBLE
*
* Compound, array, enum, and VL types all choose among these
- * types for their members. Time, Bifield, Opaque, Reference
+ * types for their members. Time, Bitfield, Opaque, Reference
* types are only copy out.
*
* Return: Success: Returns the native data type if successful.
@@ -696,7 +696,7 @@ H5_GCC_DIAG_OFF("duplicated-branches")
/*-------------------------------------------------------------------------
* Function: H5T__get_native_float
*
- * Purpose: Returns the native floatt type of a datatype.
+ * Purpose: Returns the native float type of a datatype.
*
* Return: Success: Returns the native data type if successful.
*
diff --git a/src/H5VLcallback.c b/src/H5VLcallback.c
index 4cf4d53..0c5c73d 100644
--- a/src/H5VLcallback.c
+++ b/src/H5VLcallback.c
@@ -30,7 +30,7 @@
#include "H5private.h" /* Generic Functions */
#include "H5Eprivate.h" /* Error handling */
#include "H5ESprivate.h" /* Event Sets */
-#include "H5Fprivate.h" /* File access */
+#include "H5Fprivate.h" /* File access */
#include "H5Iprivate.h" /* IDs */
#include "H5MMprivate.h" /* Memory management */
#include "H5Pprivate.h" /* Property lists */
diff --git a/src/H5VLnative.h b/src/H5VLnative.h
index fe8ede2..5e43c4e 100644
--- a/src/H5VLnative.h
+++ b/src/H5VLnative.h
@@ -74,7 +74,7 @@ typedef union H5VL_native_attr_optional_args_t {
#define H5VL_NATIVE_DATASET_CHUNK_WRITE 7 /* H5Dchunk_write */
#define H5VL_NATIVE_DATASET_GET_VLEN_BUF_SIZE 8 /* H5Dvlen_get_buf_size */
#define H5VL_NATIVE_DATASET_GET_OFFSET 9 /* H5Dget_offset */
-#define H5VL_NATIVE_DATASET_CHUNK_ITER 10 /* H5Dget_offset */
+#define H5VL_NATIVE_DATASET_CHUNK_ITER 10 /* H5Dchunk_iter */
/* NOTE: If values over 1023 are added, the H5VL_RESERVED_NATIVE_OPTIONAL macro
* must be updated.
*/
@@ -208,8 +208,8 @@ typedef union H5VL_native_dataset_optional_args_t {
#ifdef H5_HAVE_PARALLEL
#define H5VL_NATIVE_FILE_GET_MPI_ATOMICITY 26 /* H5Fget_mpi_atomicity */
#define H5VL_NATIVE_FILE_SET_MPI_ATOMICITY 27 /* H5Fset_mpi_atomicity */
-#endif /* H5_HAVE_PARALLEL */
-#define H5VL_NATIVE_FILE_POST_OPEN 28 /* Adjust file after open, with wrapping context */
+#endif
+#define H5VL_NATIVE_FILE_POST_OPEN 28 /* Adjust file after open, with wrapping context */
/* NOTE: If values over 1023 are added, the H5VL_RESERVED_NATIVE_OPTIONAL macro
* must be updated.
*/
diff --git a/src/H5Z.c b/src/H5Z.c
index bcdd837..763eac2 100644
--- a/src/H5Z.c
+++ b/src/H5Z.c
@@ -594,14 +594,9 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void H5_ATTR_PARA
/* Do a global flush if the file is opened for write */
if (H5F_ACC_RDWR & H5F_INTENT(f)) {
-/* When parallel HDF5 is defined, check for collective metadata reads on this
- * file and set the flag for metadata I/O in the API context. -QAK, 2018/02/14
- */
#ifdef H5_HAVE_PARALLEL
/* Check if MPIO driver is used */
if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
- H5P_coll_md_read_flag_t coll_md_read; /* Do all metadata reads collectively */
-
/* Sanity check for collectively calling H5Zunregister, if requested */
/* (Sanity check assumes that a barrier on one file's comm
* is sufficient (i.e. that there aren't different comms for
@@ -621,13 +616,8 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void H5_ATTR_PARA
/* Set the "sanity checked" flag */
object->sanity_checked = TRUE;
} /* end if */
-
- /* Check whether to use the collective metadata read DXPL */
- coll_md_read = H5F_COLL_MD_READ(f);
- if (H5P_USER_TRUE == coll_md_read)
- H5CX_set_coll_metadata_read(TRUE);
- } /* end if */
-#endif /* H5_HAVE_PARALLEL */
+ } /* end if */
+#endif /* H5_HAVE_PARALLEL */
/* Call the flush routine for mounted file hierarchies */
if (H5F_flush_mounts((H5F_t *)obj_ptr) < 0)
diff --git a/src/H5Znbit.c b/src/H5Znbit.c
index 21363bc..905d417 100644
--- a/src/H5Znbit.c
+++ b/src/H5Znbit.c
@@ -975,7 +975,7 @@ H5Z__filter_nbit(unsigned flags, size_t cd_nelmts, const unsigned cd_values[], s
/* input; decompress */
if (flags & H5Z_FLAG_REVERSE) {
- size_out = d_nelmts * cd_values[4]; /* cd_values[4] stores datatype size */
+ size_out = d_nelmts * (size_t)cd_values[4]; /* cd_values[4] stores datatype size */
/* allocate memory space for decompressed buffer */
if (NULL == (outbuf = (unsigned char *)H5MM_malloc(size_out)))
@@ -1170,7 +1170,8 @@ H5Z__nbit_decompress_one_array(unsigned char *data, size_t data_offset, unsigned
n = total_size / p.size;
for (i = 0; i < n; i++)
- H5Z__nbit_decompress_one_atomic(data, data_offset + i * p.size, buffer, j, buf_len, &p);
+ H5Z__nbit_decompress_one_atomic(data, data_offset + i * (size_t)p.size, buffer, j, buf_len,
+ &p);
break;
case H5Z_NBIT_ARRAY:
@@ -1178,8 +1179,8 @@ H5Z__nbit_decompress_one_array(unsigned char *data, size_t data_offset, unsigned
n = total_size / base_size; /* number of base_type elements inside the array datatype */
begin_index = *parms_index;
for (i = 0; i < n; i++) {
- if (H5Z__nbit_decompress_one_array(data, data_offset + i * base_size, buffer, j, buf_len,
- parms, parms_index) < 0)
+ if (H5Z__nbit_decompress_one_array(data, data_offset + i * (size_t)base_size, buffer, j,
+ buf_len, parms, parms_index) < 0)
HGOTO_ERROR(H5E_PLINE, H5E_CANTFILTER, FAIL, "can't decompress array")
*parms_index = begin_index;
}
@@ -1190,8 +1191,8 @@ H5Z__nbit_decompress_one_array(unsigned char *data, size_t data_offset, unsigned
n = total_size / base_size; /* number of base_type elements inside the array datatype */
begin_index = *parms_index;
for (i = 0; i < n; i++) {
- if (H5Z__nbit_decompress_one_compound(data, data_offset + i * base_size, buffer, j, buf_len,
- parms, parms_index) < 0)
+ if (H5Z__nbit_decompress_one_compound(data, data_offset + i * (size_t)base_size, buffer, j,
+ buf_len, parms, parms_index) < 0)
HGOTO_ERROR(H5E_PLINE, H5E_CANTFILTER, FAIL, "can't decompress compound")
*parms_index = begin_index;
}
@@ -1291,7 +1292,7 @@ H5Z__nbit_decompress(unsigned char *data, unsigned d_nelmts, unsigned char *buff
FUNC_ENTER_STATIC
/* may not have to initialize to zeros */
- HDmemset(data, 0, d_nelmts * parms[4]);
+ HDmemset(data, 0, d_nelmts * (size_t)parms[4]);
/* initialization before the loop */
j = 0;
@@ -1309,7 +1310,7 @@ H5Z__nbit_decompress(unsigned char *data, unsigned d_nelmts, unsigned char *buff
HGOTO_ERROR(H5E_PLINE, H5E_BADTYPE, FAIL, "invalid datatype precision/offset")
for (i = 0; i < d_nelmts; i++)
- H5Z__nbit_decompress_one_atomic(data, i * p.size, buffer, &j, &buf_len, &p);
+ H5Z__nbit_decompress_one_atomic(data, i * (size_t)p.size, buffer, &j, &buf_len, &p);
break;
case H5Z_NBIT_ARRAY:
@@ -1468,7 +1469,7 @@ H5Z__nbit_compress_one_array(unsigned char *data, size_t data_offset, unsigned c
p.offset = parms[(*parms_index)++];
n = total_size / p.size;
for (i = 0; i < n; i++)
- H5Z__nbit_compress_one_atomic(data, data_offset + i * p.size, buffer, j, buf_len, &p);
+ H5Z__nbit_compress_one_atomic(data, data_offset + i * (size_t)p.size, buffer, j, buf_len, &p);
break;
case H5Z_NBIT_ARRAY:
@@ -1476,8 +1477,8 @@ H5Z__nbit_compress_one_array(unsigned char *data, size_t data_offset, unsigned c
n = total_size / base_size; /* number of base_type elements inside the array datatype */
begin_index = *parms_index;
for (i = 0; i < n; i++) {
- H5Z__nbit_compress_one_array(data, data_offset + i * base_size, buffer, j, buf_len, parms,
- parms_index);
+ H5Z__nbit_compress_one_array(data, data_offset + i * (size_t)base_size, buffer, j, buf_len,
+ parms, parms_index);
*parms_index = begin_index;
}
break;
@@ -1487,8 +1488,8 @@ H5Z__nbit_compress_one_array(unsigned char *data, size_t data_offset, unsigned c
n = total_size / base_size; /* number of base_type elements inside the array datatype */
begin_index = *parms_index;
for (i = 0; i < n; i++) {
- H5Z__nbit_compress_one_compound(data, data_offset + i * base_size, buffer, j, buf_len, parms,
- parms_index);
+ H5Z__nbit_compress_one_compound(data, data_offset + i * (size_t)base_size, buffer, j, buf_len,
+ parms, parms_index);
*parms_index = begin_index;
}
break;
@@ -1574,7 +1575,7 @@ H5Z__nbit_compress(unsigned char *data, unsigned d_nelmts, unsigned char *buffer
p.offset = parms[7];
for (i = 0; i < d_nelmts; i++)
- H5Z__nbit_compress_one_atomic(data, i * p.size, buffer, &new_size, &buf_len, &p);
+ H5Z__nbit_compress_one_atomic(data, i * (size_t)p.size, buffer, &new_size, &buf_len, &p);
break;
case H5Z_NBIT_ARRAY:
diff --git a/src/H5Zscaleoffset.c b/src/H5Zscaleoffset.c
index 5bdc51c..46c1a10 100644
--- a/src/H5Zscaleoffset.c
+++ b/src/H5Zscaleoffset.c
@@ -1240,7 +1240,7 @@ H5Z__filter_scaleoffset(unsigned flags, size_t cd_nelmts, const unsigned cd_valu
p.minbits = minbits;
/* calculate size of output buffer after decompression */
- size_out = d_nelmts * p.size;
+ size_out = d_nelmts * (size_t)p.size;
/* allocate memory space for decompressed buffer */
if (NULL == (outbuf = (unsigned char *)H5MM_malloc(size_out)))
@@ -1403,7 +1403,7 @@ H5Z__scaleoffset_convert(void *buf, unsigned d_nelmts, unsigned dtype_size)
unsigned char *buffer, temp;
buffer = (unsigned char *)buf;
- for (i = 0; i < d_nelmts * dtype_size; i += dtype_size)
+ for (i = 0; i < d_nelmts * (size_t)dtype_size; i += dtype_size)
for (j = 0; j < dtype_size / 2; j++) {
/* swap pair of bytes */
temp = buffer[i + j];
@@ -1681,7 +1681,7 @@ H5Z__scaleoffset_decompress(unsigned char *data, unsigned d_nelmts, unsigned cha
unsigned buf_len;
/* must initialize to zeros */
- for (i = 0; i < d_nelmts * p.size; i++)
+ for (i = 0; i < d_nelmts * (size_t)p.size; i++)
data[i] = 0;
/* initialization before the loop */
diff --git a/src/H5mpi.c b/src/H5mpi.c
index aea0104..15fb785 100644
--- a/src/H5mpi.c
+++ b/src/H5mpi.c
@@ -549,4 +549,237 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5_mpio_create_large_type() */
+/*-------------------------------------------------------------------------
+ * Function: H5_mpio_gatherv_alloc
+ *
+ * Purpose: A wrapper around MPI_(All)gatherv that performs allocation
+ * of the receive buffer on the caller's behalf. This
+ * routine's parameters are as follows:
+ *
+ * `send_buf` - The buffer that data will be sent from for
+ * the calling MPI rank. Analogous to
+ * MPI_(All)gatherv's `sendbuf` parameter.
+ *
+ * `send_count` - The number of `send_type` elements in the
+ * send buffer. Analogous to MPI_(All)gatherv's
+ * `sendcount` parameter.
+ *
+ * `send_type` - The MPI Datatype of the elements in the send
+ * buffer. Analogous to MPI_(All)gatherv's
+ * `sendtype` parameter.
+ *
+ * `recv_counts` - An array containing the number of elements
+ * to be received from each MPI rank.
+ * Analogous to MPI_(All)gatherv's `recvcounts`
+ * parameter.
+ *
+ * `displacements` - An array containing the displacements
+ * in the receive buffer where data from
+ * each MPI rank should be placed. Analogous
+ * to MPI_(All)gatherv's `displs` parameter.
+ *
+ * `recv_type` - The MPI Datatype of the elements in the
+ * receive buffer. Analogous to
+ * MPI_(All)gatherv's `recvtype` parameter.
+ *
+ * `allgather` - Specifies whether the gather operation to be
+ * performed should be MPI_Allgatherv (TRUE) or
+ * MPI_Gatherv (FALSE).
+ *
+ * `root` - For MPI_Gatherv operations, specifies the rank
+ * that will receive the data sent by other ranks.
+ * Analogous to MPI_Gatherv's `root` parameter. For
+ * MPI_Allgatherv operations, this parameter is
+ * ignored.
+ *
+ * `comm` - Specifies the MPI Communicator for the operation.
+ * Analogous to MPI_(All)gatherv's `comm` parameter.
+ *
+ * `mpi_rank` - Specifies the calling rank's rank value, as
+ * obtained by calling MPI_Comm_rank on the
+ * MPI Communicator `comm`.
+ *
+ * `mpi_size` - Specifies the MPI Communicator size, as
+ * obtained by calling MPI_Comm_size on the
+ * MPI Communicator `comm`.
+ *
+ * `out_buf` - Resulting buffer that is allocated and
+ * returned to the caller after data has been
+ * gathered into it. Returned only to the rank
+ * specified by `root` for MPI_Gatherv
+ * operations, or to all ranks for
+ * MPI_Allgatherv operations.
+ *
+ * `out_buf_num_entries` - The number of elements in the
+ * resulting buffer, in terms of
+ * the MPI Datatype provided for
+ * `recv_type` (the sum of the first
+ * `mpi_size` entries of `recv_counts`).
+ *
+ * Notes: This routine is collective across `comm`.
+ *
+ * On ranks that receive data (every rank when `allgather` is
+ * TRUE, otherwise only `root`) the receive buffer is allocated
+ * with H5MM_malloc and handed back through `out_buf`.
+ * Presumably the caller releases it with H5MM_free - TODO
+ * confirm against the callers.
+ *
+ * If the computed receive buffer size is 0 on such a rank, the
+ * routine exits early via HGOTO_DONE(SUCCEED), before the
+ * gather call and without writing `out_buf` or
+ * `out_buf_num_entries`.
+ *
+ * When a negative value is returned, any receive buffer that
+ * was allocated here has already been freed.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_mpio_gatherv_alloc(void *send_buf, int send_count, MPI_Datatype send_type, const int recv_counts[],
+ const int displacements[], MPI_Datatype recv_type, hbool_t allgather, int root,
+ MPI_Comm comm, int mpi_rank, int mpi_size, void **out_buf, size_t *out_buf_num_entries)
+{
+ size_t recv_buf_num_entries = 0;
+ void * recv_buf = NULL;
+#if MPI_VERSION >= 3
+ MPI_Count type_lb;
+ MPI_Count type_extent;
+#else
+ MPI_Aint type_lb;
+ MPI_Aint type_extent;
+#endif
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ HDassert(send_buf || send_count == 0);
+ if (allgather || (mpi_rank == root))
+ HDassert(out_buf && out_buf_num_entries);
+
+ /* Retrieve the extent of the MPI Datatype being used; it is the
+ * per-element size used below to compute the receive buffer size
+ */
+#if MPI_VERSION >= 3
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_get_extent_x(recv_type, &type_lb, &type_extent)))
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_get_extent(recv_type, &type_lb, &type_extent)))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_get_extent(_x) failed", mpi_code)
+
+ if (type_extent < 0)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "MPI recv_type had a negative extent")
+
+ /*
+ * Calculate the total size of the buffer being
+ * returned and allocate it. Only ranks that will
+ * receive data do this: all ranks for an allgather,
+ * otherwise just `root`.
+ *
+ * NOTE(review): the entry-count sum and the multiplication
+ * by the type extent are done in size_t without an overflow
+ * check, and each recv_counts[] entry is cast from int -
+ * presumably callers never pass negative counts; confirm.
+ */
+ if (allgather || (mpi_rank == root)) {
+ size_t i;
+ size_t buf_size;
+
+ for (i = 0, recv_buf_num_entries = 0; i < (size_t)mpi_size; i++)
+ recv_buf_num_entries += (size_t)recv_counts[i];
+ buf_size = recv_buf_num_entries * (size_t)type_extent;
+
+ /* If our buffer size is 0, there's nothing to do.
+ * NOTE(review): in MPI_Gatherv mode only `root` evaluates this
+ * check, so it can leave without calling MPI_Gatherv while the
+ * other ranks still call it - confirm this is safe when every
+ * receive count is 0.
+ */
+ if (buf_size == 0)
+ HGOTO_DONE(SUCCEED)
+
+ if (NULL == (recv_buf = H5MM_malloc(buf_size)))
+ /* Push an error, but still participate in collective gather operation.
+ * NOTE(review): recv_buf is NULL on this path when the gather
+ * below runs - confirm the MPI call tolerates that.
+ */
+ HDONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate receive buffer")
+ }
+
+ /* Perform gather operation */
+ if (allgather) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Allgatherv(send_buf, send_count, send_type, recv_buf, recv_counts,
+ displacements, recv_type, comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Allgatherv failed", mpi_code)
+ }
+ else {
+ if (MPI_SUCCESS != (mpi_code = MPI_Gatherv(send_buf, send_count, send_type, recv_buf, recv_counts,
+ displacements, recv_type, root, comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Gatherv failed", mpi_code)
+ }
+
+ if (allgather || (mpi_rank == root)) {
+ *out_buf = recv_buf;
+ *out_buf_num_entries = recv_buf_num_entries;
+ }
+
+done:
+ if (ret_value < 0) {
+ if (recv_buf)
+ H5MM_free(recv_buf);
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5_mpio_gatherv_alloc() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5_mpio_gatherv_alloc_simple
+ *
+ * Purpose: A slightly simplified interface to H5_mpio_gatherv_alloc
+ * which calculates the receive counts and receive buffer
+ * displacements for the caller.
+ *
+ * Each rank's `send_count` is first collected with
+ * MPI_Allgather (when `allgather` is TRUE) or with MPI_Gather
+ * to `root` (when it is FALSE). The displacements are then
+ * laid out back to back in rank order, starting at 0, and
+ * everything is passed on to H5_mpio_gatherv_alloc.
+ *
+ * `send_buf`, `send_count`, `send_type`, `recv_type`,
+ * `allgather`, `root`, `comm`, `mpi_rank`, `mpi_size`,
+ * `out_buf` and `out_buf_num_entries` are forwarded unchanged
+ * to H5_mpio_gatherv_alloc.
+ *
+ * Notes: This routine is collective across `comm`.
+ *
+ * The temporary counts/displacements array is freed before
+ * returning, on success and on failure alike.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_mpio_gatherv_alloc_simple(void *send_buf, int send_count, MPI_Datatype send_type, MPI_Datatype recv_type,
+ hbool_t allgather, int root, MPI_Comm comm, int mpi_rank, int mpi_size,
+ void **out_buf, size_t *out_buf_num_entries)
+{
+ int * recv_counts_disps_array = NULL;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ HDassert(send_buf || send_count == 0);
+ if (allgather || (mpi_rank == root))
+ HDassert(out_buf && out_buf_num_entries);
+
+ /*
+ * Allocate array to store the receive counts of each rank, as well as
+ * the displacements into the final array where each rank will place
+ * their data. The first half of the array contains the receive counts
+ * (in rank order), while the latter half contains the displacements
+ * (also in rank order).
+ *
+ * Only ranks that will receive data allocate it; on every other rank
+ * the pointer stays NULL.
+ */
+ if (allgather || (mpi_rank == root)) {
+ if (NULL ==
+ (recv_counts_disps_array = H5MM_malloc(2 * (size_t)mpi_size * sizeof(*recv_counts_disps_array))))
+ /* Push an error, but still participate in collective gather operation.
+ * NOTE(review): on this path the array pointer is NULL, yet it is
+ * still used as the receive buffer of the count gather below and
+ * indexed when the displacements are filled in - confirm this
+ * failure path cannot write through NULL.
+ */
+ HDONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate receive counts and displacements array")
+ }
+
+ /* Collect each rank's send count to interested ranks */
+ if (allgather) {
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Allgather(&send_count, 1, MPI_INT, recv_counts_disps_array, 1, MPI_INT, comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code)
+ }
+ else {
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Gather(&send_count, 1, MPI_INT, recv_counts_disps_array, 1, MPI_INT, root, comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code)
+ }
+
+ /* Set the displacements into the receive buffer for the gather operation.
+ * The displacement for rank 0 is 0; for rank i it is the displacement of
+ * rank i - 1 plus the count received from rank i - 1, i.e. a running sum
+ * of the preceding counts.
+ * NOTE(review): the running sum is kept in int, so a large enough total
+ * count would overflow it - confirm callers stay within int range.
+ */
+ if (allgather || (mpi_rank == root)) {
+ size_t i;
+ int * displacements_ptr;
+
+ displacements_ptr = &recv_counts_disps_array[mpi_size];
+
+ *displacements_ptr = 0;
+ for (i = 1; i < (size_t)mpi_size; i++)
+ displacements_ptr[i] = displacements_ptr[i - 1] + recv_counts_disps_array[i - 1];
+ }
+
+ /* Perform gather operation.
+ * NOTE(review): on ranks that did not allocate the array (non-root ranks
+ * of an MPI_Gatherv), `&recv_counts_disps_array[mpi_size]` is formed from
+ * a NULL pointer before being passed down - confirm this is acceptable.
+ */
+ if (H5_mpio_gatherv_alloc(send_buf, send_count, send_type, recv_counts_disps_array,
+ &recv_counts_disps_array[mpi_size], recv_type, allgather, root, comm, mpi_rank,
+ mpi_size, out_buf, out_buf_num_entries) < 0)
+ HGOTO_ERROR(H5E_LIB, H5E_CANTGATHER, FAIL, "can't gather data")
+
+done:
+ if (recv_counts_disps_array)
+ H5MM_free(recv_counts_disps_array);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5_mpio_gatherv_alloc_simple() */
+
#endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5private.h b/src/H5private.h
index fa52e7f..d67163f 100644
--- a/src/H5private.h
+++ b/src/H5private.h
@@ -387,6 +387,25 @@
#define HSSIZET_MAX ((hssize_t)LLONG_MAX)
#define HSSIZET_MIN (~(HSSIZET_MAX))
+#ifdef H5_HAVE_PARALLEL
+
+/* Define a type for safely sending size_t values with MPI.
+ * H5_SIZE_T_AS_MPI_TYPE is set to the MPI unsigned integer datatype whose
+ * C counterpart has the same maximum value as size_t. SIZE_MAX is compared
+ * against UCHAR_MAX, USHRT_MAX, UINT_MAX, ULONG_MAX and ULLONG_MAX in that
+ * order and the first match wins; if none matches, compilation stops with
+ * an #error.
+ */
+#if SIZE_MAX == UCHAR_MAX
+#define H5_SIZE_T_AS_MPI_TYPE MPI_UNSIGNED_CHAR
+#elif SIZE_MAX == USHRT_MAX
+#define H5_SIZE_T_AS_MPI_TYPE MPI_UNSIGNED_SHORT
+#elif SIZE_MAX == UINT_MAX
+#define H5_SIZE_T_AS_MPI_TYPE MPI_UNSIGNED
+#elif SIZE_MAX == ULONG_MAX
+#define H5_SIZE_T_AS_MPI_TYPE MPI_UNSIGNED_LONG
+#elif SIZE_MAX == ULLONG_MAX
+#define H5_SIZE_T_AS_MPI_TYPE MPI_UNSIGNED_LONG_LONG
+#else
+#error "no suitable MPI type for size_t"
+#endif
+
+#endif /* H5_HAVE_PARALLEL */
+
/*
* Types and max sizes for POSIX I/O.
* OS X (Darwin) is odd since the max I/O size does not match the types.
@@ -508,6 +527,9 @@
#define H5_GCC_CLANG_DIAG_ON(x)
#endif
+/* Function pointer typedef for qsort comparison callbacks.
+ *
+ * The callback receives pointers to the two elements being compared and
+ * returns an int; under the standard qsort() contract that value is negative,
+ * zero or positive when the first element orders before, equal to or after
+ * the second.
+ */
+typedef int (*H5_sort_func_cb_t)(const void *, const void *);
+
/* Typedefs and functions for timing certain parts of the library. */
/* A set of elapsed/user/system times emitted as a time point by the
@@ -2602,7 +2624,8 @@ struct h5_long_options {
*/
};
-H5_DLL int H5_get_option(int argc, const char **argv, const char *opt, const struct h5_long_options *l_opt);
+H5_DLL int H5_get_option(int argc, const char *const *argv, const char *opt,
+ const struct h5_long_options *l_opt);
#ifdef H5_HAVE_PARALLEL
/* Generic MPI functions */
@@ -2616,6 +2639,14 @@ H5_DLL herr_t H5_mpi_comm_cmp(MPI_Comm comm1, MPI_Comm comm2, int *result);
H5_DLL herr_t H5_mpi_info_cmp(MPI_Info info1, MPI_Info info2, int *result);
H5_DLL herr_t H5_mpio_create_large_type(hsize_t num_elements, MPI_Aint stride_bytes, MPI_Datatype old_type,
MPI_Datatype *new_type);
+H5_DLL herr_t H5_mpio_gatherv_alloc(void *send_buf, int send_count, MPI_Datatype send_type,
+ const int recv_counts[], const int displacements[],
+ MPI_Datatype recv_type, hbool_t allgather, int root, MPI_Comm comm,
+ int mpi_rank, int mpi_size, void **out_buf, size_t *out_buf_num_entries); /* Gather using
+                                * receive counts and displacements supplied by the caller */
+H5_DLL herr_t H5_mpio_gatherv_alloc_simple(void *send_buf, int send_count, MPI_Datatype send_type,
+ MPI_Datatype recv_type, hbool_t allgather, int root, MPI_Comm comm,
+ int mpi_rank, int mpi_size, void **out_buf,
+ size_t *out_buf_num_entries); /* Convenience form: collects the ranks' send_count values,
+                                * derives the displacements as a running sum of those counts
+                                * (starting at 0), then calls H5_mpio_gatherv_alloc() and
+                                * reports an error if that call returns a negative value */
#endif /* H5_HAVE_PARALLEL */
/* Functions for debugging */
diff --git a/src/H5public.h b/src/H5public.h
index 6a3911c..3f9848a 100644
--- a/src/H5public.h
+++ b/src/H5public.h
@@ -83,7 +83,7 @@
/**
* For tweaks, bug-fixes, or development
*/
-#define H5_VERS_RELEASE 1
+#define H5_VERS_RELEASE 2
/**
* For pre-releases like \c snap0. Empty string for official releases.
*/
@@ -91,7 +91,7 @@
/**
* Full version string
*/
-#define H5_VERS_INFO "HDF5 library version: 1.13.1-1"
+#define H5_VERS_INFO "HDF5 library version: 1.13.2-1"
#define H5check() H5check_version(H5_VERS_MAJOR, H5_VERS_MINOR, H5_VERS_RELEASE)
@@ -289,6 +289,11 @@ typedef long long ssize_t;
* \internal Defined as a (minimum) 64-bit integer type.
*/
typedef uint64_t hsize_t;
+
+#ifdef H5_HAVE_PARALLEL
+#define HSIZE_AS_MPI_TYPE MPI_UINT64_T /* MPI datatype for hsize_t, which is typedef'd as uint64_t */
+#endif /* H5_HAVE_PARALLEL */
+
/**
* The size of file objects. Used when negative values are needed to indicate errors.
*
@@ -323,7 +328,7 @@ typedef uint64_t haddr_t;
#define HADDR_MAX (HADDR_UNDEF - 1)
#ifdef H5_HAVE_PARALLEL
-#define HADDR_AS_MPI_TYPE MPI_LONG_LONG_INT
+#define HADDR_AS_MPI_TYPE MPI_UINT64_T
#endif
//! <!-- [H5_iter_order_t_snip] -->
diff --git a/src/H5system.c b/src/H5system.c
index 9a966b0..ee9077e 100644
--- a/src/H5system.c
+++ b/src/H5system.c
@@ -956,7 +956,7 @@ const char *H5_optarg; /* Flag argument (or value) */
*-------------------------------------------------------------------------
*/
int
-H5_get_option(int argc, const char **argv, const char *opts, const struct h5_long_options *l_opts)
+H5_get_option(int argc, const char *const *argv, const char *opts, const struct h5_long_options *l_opts)
{
static int sp = 1; /* character index in current token */
int optchar = '?'; /* option character passed back to user */
diff --git a/src/H5timer.c b/src/H5timer.c
index b2cc5f0..b5dba97 100644
--- a/src/H5timer.c
+++ b/src/H5timer.c
@@ -193,17 +193,26 @@ H5_now_usec(void)
struct timespec ts;
HDclock_gettime(CLOCK_MONOTONIC, &ts);
- now = (uint64_t)(ts.tv_sec * (1000 * 1000)) + (uint64_t)(ts.tv_nsec / 1000);
+
+ /* Cast all values in this expression to uint64_t to ensure that all intermediate
+ * calculations are done in 64 bit, to prevent overflow */
+ now = ((uint64_t)ts.tv_sec * ((uint64_t)1000 * (uint64_t)1000)) +
+ ((uint64_t)ts.tv_nsec / (uint64_t)1000);
}
#elif defined(H5_HAVE_GETTIMEOFDAY)
{
struct timeval now_tv;
HDgettimeofday(&now_tv, NULL);
- now = (uint64_t)(now_tv.tv_sec * (1000 * 1000)) + (uint64_t)now_tv.tv_usec;
+
+ /* Cast all values in this expression to uint64_t to ensure that all intermediate
+ * calculations are done in 64 bit, to prevent overflow */
+ now = ((uint64_t)now_tv.tv_sec * ((uint64_t)1000 * (uint64_t)1000)) + (uint64_t)now_tv.tv_usec;
}
#else /* H5_HAVE_GETTIMEOFDAY */
- now = (uint64_t)(HDtime(NULL) * (1000 * 1000));
+ /* Cast all values in this expression to uint64_t to ensure that all intermediate calculations
+ * are done in 64 bit, to prevent overflow */
+ now = ((uint64_t)HDtime(NULL) * ((uint64_t)1000 * (uint64_t)1000));
#endif /* H5_HAVE_GETTIMEOFDAY */
return (now);
diff --git a/src/H5trace.c b/src/H5trace.c
index 5d15fee..8790a88 100644
--- a/src/H5trace.c
+++ b/src/H5trace.c
@@ -4023,7 +4023,7 @@ H5_trace(const double *returning, const char *func, const char *type, ...)
H5_timer_get_times(function_timer, &function_times);
H5_timer_get_times(running_timer, &running_times);
- HDsprintf(tmp, "%.6f", (function_times.elapsed - running_times.elapsed));
+ HDsnprintf(tmp, sizeof(tmp), "%.6f", (function_times.elapsed - running_times.elapsed));
H5RS_asprintf_cat(rs, " %*s ", (int)HDstrlen(tmp), "");
}
for (i = 0; i < current_depth; i++)