path: root/src
author    Scot Breitenfeld <brtnfld@hdfgroup.org>    2019-01-07 23:08:33 (GMT)
committer Scot Breitenfeld <brtnfld@hdfgroup.org>    2019-01-07 23:08:33 (GMT)
commit    d9b1ec3ce8672cd9c308f72baedc8a6f7bb9474c (patch)
tree      32ff8b32b33a2947c27cbff5a05baee35c23d58b /src
parent    5dfe00629588a54dbfb6f2d09dfbd88177e37cc2 (diff)
parent    ab5fe769ab711f736238abc89ef215a6ecff5f7e (diff)
download  hdf5-d9b1ec3ce8672cd9c308f72baedc8a6f7bb9474c.zip
          hdf5-d9b1ec3ce8672cd9c308f72baedc8a6f7bb9474c.tar.gz
          hdf5-d9b1ec3ce8672cd9c308f72baedc8a6f7bb9474c.tar.bz2
Merge pull request #1439 in HDFFV/hdf5 from rank0_bcast to develop
* commit 'ab5fe769ab711f736238abc89ef215a6ecff5f7e':
  - HDFFV-10625 -- Implemented a process-0 read and then broadcast for collective read of full (H5S_ALL), contiguous, atomic datasets by all the processes in the file communicator.
  - indent change
  - changed logic statement in if
  - Added chunking test, fixed issue with CX set
  - Correct another git merge failure.
  - Correct misplaced line from git merge.
  - Updated and refined version of Scot's "rank 0 bcast" changes.
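The pattern this merge implements is simple at its core: when every rank collectively reads the same full, contiguous, fixed-size dataset, only rank 0 touches the file and the result is broadcast to the rest. A minimal standalone sketch of the idea in MPI C follows; 'fh' is assumed to be a file handle already opened collectively on 'comm', and the offset/count are identical on all ranks (this is an illustration of the technique, not the HDF5 code itself):

    #include <mpi.h>

    /* Sketch of the read-with-rank0-and-bcast pattern (assumes 'fh' was
     * opened collectively on 'comm' and all ranks pass identical
     * arguments; payload must fit an 'int' count, i.e. well under 2GB) */
    static int
    read_rank0_and_bcast(MPI_File fh, MPI_Comm comm, MPI_Offset off,
                         void *buf, int count)
    {
        int rank, mpi_code;
        MPI_Status st;

        MPI_Comm_rank(comm, &rank);

        /* Only rank 0 touches the file, with an *independent* read */
        if(rank == 0)
            if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(fh, off, buf,
                    count, MPI_BYTE, &st)))
                return mpi_code;

        /* All ranks (including rank 0) participate in the broadcast */
        return MPI_Bcast(buf, count, MPI_BYTE, 0, comm);
    }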
Diffstat (limited to 'src')
-rw-r--r--  src/H5CX.c         |  92
-rw-r--r--  src/H5CXprivate.h  |   3
-rw-r--r--  src/H5Dio.c        |  18
-rw-r--r--  src/H5Dmpio.c      | 115
-rw-r--r--  src/H5Dprivate.h   |   8
-rw-r--r--  src/H5FDmpio.c     |  43
-rw-r--r--  src/H5Pdxpl.c      |   3
-rw-r--r--  src/H5Ppublic.h    |   3
-rw-r--r--  src/H5T.c          | 106
-rw-r--r--  src/H5Tprivate.h   |   1
10 files changed, 343 insertions, 49 deletions
diff --git a/src/H5CX.c b/src/H5CX.c
index 1f91ee2..0d20132 100644
--- a/src/H5CX.c
+++ b/src/H5CX.c
@@ -198,6 +198,7 @@ typedef struct H5CX_t {
MPI_Datatype btype; /* MPI datatype for buffer, when using collective I/O */
MPI_Datatype ftype; /* MPI datatype for file, when using collective I/O */
hbool_t mpi_file_flushing; /* Whether an MPI-opened file is being flushed */
+ hbool_t rank0_bcast; /* Whether a dataset meets read-with-rank0-and-bcast requirements */
#endif /* H5_HAVE_PARALLEL */
/* Cached DXPL properties */
@@ -261,6 +262,8 @@ typedef struct H5CX_t {
hbool_t mpio_coll_chunk_multi_ratio_coll_set; /* Whether instrumented "collective chunk multi ratio coll" value is set */
int mpio_coll_chunk_multi_ratio_ind; /* Instrumented "collective chunk multi ratio ind" value (H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME) */
hbool_t mpio_coll_chunk_multi_ratio_ind_set; /* Whether instrumented "collective chunk multi ratio ind" value is set */
+ hbool_t mpio_coll_rank0_bcast; /* Instrumented "collective rank 0 bcast" value (H5D_XFER_COLL_RANK0_BCAST_NAME) */
+ hbool_t mpio_coll_rank0_bcast_set; /* Whether instrumented "collective rank 0 bcast" value is set */
#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
#endif /* H5_HAVE_PARALLEL */
@@ -1254,6 +1257,32 @@ H5CX_get_mpi_file_flushing(void)
FUNC_LEAVE_NOAPI((*head)->ctx.mpi_file_flushing)
} /* end H5CX_get_mpi_file_flushing() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5CX_get_mpio_rank0_bcast
+ *
+ * Purpose: Retrieves whether the dataset meets the read-with-rank0-and-bcast requirements for the current API call context.
+ *
+ * Return: TRUE / FALSE (can't fail)
+ *
+ * Programmer: M. Breitenfeld
+ * December 31, 2018
+ *
+ *-------------------------------------------------------------------------
+ */
+hbool_t
+H5CX_get_mpio_rank0_bcast(void)
+{
+ H5CX_node_t **head = H5CX_get_my_context(); /* Get the pointer to the head of the API context, for this thread */
+
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ /* Sanity check */
+ HDassert(head && *head);
+
+ FUNC_LEAVE_NOAPI((*head)->ctx.rank0_bcast)
+} /* end H5CX_get_mpio_rank0_bcast() */
#endif /* H5_HAVE_PARALLEL */
@@ -2185,6 +2214,34 @@ H5CX_set_mpi_file_flushing(hbool_t flushing)
FUNC_LEAVE_NOAPI_VOID
} /* end H5CX_set_mpi_file_flushing() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5CX_set_mpio_rank0_bcast
+ *
+ * Purpose: Sets the "dataset meets read-with-rank0-and-bcast requirements" flag for the current API call context.
+ *
+ * Return: <none>
+ *
+ * Programmer: M. Breitenfeld
+ * December 31, 2018
+ *
+ *-------------------------------------------------------------------------
+ */
+void
+H5CX_set_mpio_rank0_bcast(hbool_t rank0_bcast)
+{
+ H5CX_node_t **head = H5CX_get_my_context(); /* Get the pointer to the head of the API context, for this thread */
+
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ /* Sanity checks */
+ HDassert(head && *head);
+
+ (*head)->ctx.rank0_bcast = rank0_bcast;
+
+ FUNC_LEAVE_NOAPI_VOID
+} /* end H5CX_set_mpio_rank0_bcast() */
#endif /* H5_HAVE_PARALLEL */
@@ -2596,6 +2653,40 @@ H5CX_test_set_mpio_coll_chunk_multi_ratio_ind(int mpio_coll_chunk_multi_ratio_in
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5CX_test_set_mpio_coll_chunk_multi_ratio_ind() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5CX_test_set_mpio_coll_rank0_bcast
+ *
+ * Purpose: Sets the instrumented "read-with-rank0-and-bcast" flag for the current API call context.
+ *
+ * Note: Only sets value if property set in DXPL
+ *
+ * Return: Non-negative on success / Negative on failure
+ *
+ * Programmer: Quincey Koziol
+ * January 2, 2019
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5CX_test_set_mpio_coll_rank0_bcast(hbool_t mpio_coll_rank0_bcast)
+{
+ H5CX_node_t **head = H5CX_get_my_context(); /* Get the pointer to the head of the API context, for this thread */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI_NOINIT
+
+ /* Sanity checks */
+ HDassert(head && *head);
+ HDassert(!((*head)->ctx.dxpl_id == H5P_DEFAULT ||
+ (*head)->ctx.dxpl_id == H5P_DATASET_XFER_DEFAULT));
+
+ H5CX_TEST_SET_PROP(H5D_XFER_COLL_RANK0_BCAST_NAME, mpio_coll_rank0_bcast)
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5CX_test_set_mpio_coll_rank0_bcast() */
#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
#endif /* H5_HAVE_PARALLEL */
@@ -2640,6 +2731,7 @@ H5CX__pop_common(void)
H5CX_SET_PROP(H5D_XFER_COLL_CHUNK_LINK_NUM_FALSE_NAME, mpio_coll_chunk_link_num_false)
H5CX_SET_PROP(H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME, mpio_coll_chunk_multi_ratio_coll)
H5CX_SET_PROP(H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME, mpio_coll_chunk_multi_ratio_ind)
+ H5CX_SET_PROP(H5D_XFER_COLL_RANK0_BCAST_NAME, mpio_coll_rank0_bcast)
#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
#endif /* H5_HAVE_PARALLEL */
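The new flag rides on H5CX's existing machinery: each API call pushes a per-thread context node, H5D__mpio_opt_possible() records the decision with the setter, and H5FD__mpio_read() reads it back with the getter later in the same call. Below is a minimal sketch of that per-thread context pattern with a simplified stand-in for H5CX_node_t (the names are illustrative, not HDF5 internals):

    #include <stdbool.h>
    #include <stddef.h>

    /* Simplified stand-in for H5CX_node_t: one node per in-flight API call */
    typedef struct api_ctx {
        bool            rank0_bcast;  /* read-with-rank0-and-bcast decision */
        struct api_ctx *prev;         /* next context down the stack */
    } api_ctx_t;

    /* One context stack per thread, as H5CX_get_my_context() provides */
    static _Thread_local api_ctx_t *ctx_head;

    /* Setter: called where the collective-I/O decision is made */
    static void ctx_set_rank0_bcast(bool flag) { ctx_head->rank0_bcast = flag; }

    /* Getter: called later in the same API call, e.g. the VFD read path */
    static bool ctx_get_rank0_bcast(void) { return ctx_head->rank0_bcast; }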
diff --git a/src/H5CXprivate.h b/src/H5CXprivate.h
index 46289c4..46d25d0 100644
--- a/src/H5CXprivate.h
+++ b/src/H5CXprivate.h
@@ -77,6 +77,7 @@ H5_DLL H5AC_ring_t H5CX_get_ring(void);
H5_DLL hbool_t H5CX_get_coll_metadata_read(void);
H5_DLL herr_t H5CX_get_mpi_coll_datatypes(MPI_Datatype *btype, MPI_Datatype *ftype);
H5_DLL hbool_t H5CX_get_mpi_file_flushing(void);
+H5_DLL hbool_t H5CX_get_mpio_rank0_bcast(void);
#endif /* H5_HAVE_PARALLEL */
/* "Getter" routines for DXPL properties cached in API context */
@@ -112,6 +113,7 @@ H5_DLL void H5CX_set_coll_metadata_read(hbool_t cmdr);
H5_DLL herr_t H5CX_set_mpi_coll_datatypes(MPI_Datatype btype, MPI_Datatype ftype);
H5_DLL herr_t H5CX_set_mpio_coll_opt(H5FD_mpio_collective_opt_t mpio_coll_opt);
H5_DLL void H5CX_set_mpi_file_flushing(hbool_t flushing);
+H5_DLL void H5CX_set_mpio_rank0_bcast(hbool_t rank0_bcast);
#endif /* H5_HAVE_PARALLEL */
/* "Setter" routines for DXPL properties cached in API context */
@@ -137,6 +139,7 @@ H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_link_num_true(int mpio_coll_chunk_li
H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_link_num_false(int mpio_coll_chunk_link_num_false);
H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_multi_ratio_coll(int mpio_coll_chunk_multi_ratio_coll);
H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_multi_ratio_ind(int mpio_coll_chunk_multi_ratio_ind);
+H5_DLL herr_t H5CX_test_set_mpio_coll_rank0_bcast(hbool_t rank0_bcast);
#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
#endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5Dio.c b/src/H5Dio.c
index fe85d23..6062dff 100644
--- a/src/H5Dio.c
+++ b/src/H5Dio.c
@@ -657,22 +657,12 @@ H5D__write(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space,
/* Various MPI based checks */
#ifdef H5_HAVE_PARALLEL
- if H5F_HAS_FEATURE(dataset->oloc.file, H5FD_FEAT_HAS_MPI) {
- /* If MPI based VFD is used, no VL datatype support yet. */
+ if(H5F_HAS_FEATURE(dataset->oloc.file, H5FD_FEAT_HAS_MPI)) {
+ /* If MPI based VFD is used, no VL or region reference datatype support yet. */
/* This is because they use the global heap in the file and we don't */
/* support parallel access of that yet */
- if(H5T_detect_class(type_info.mem_type, H5T_VLEN, FALSE) > 0)
- HGOTO_ERROR(H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing VL datatypes yet")
-
- /* If MPI based VFD is used, no VL datatype support yet. */
- /* This is because they use the global heap in the file and we don't */
- /* support parallel access of that yet */
- /* We should really use H5T_detect_class() here, but it will be difficult
- * to detect the type of the reference if it is nested... -QAK
- */
- if(H5T_get_class(type_info.mem_type, TRUE) == H5T_REFERENCE &&
- H5T_get_ref_type(type_info.mem_type) == H5R_DATASET_REGION)
- HGOTO_ERROR(H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing region reference datatypes yet")
+ if(H5T_is_vl_storage(type_info.mem_type) > 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing VL or region reference datatypes yet")
} /* end if */
else {
H5FD_mpio_xfer_t io_xfer_mode; /* MPI I/O transfer mode */
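For applications, the consolidated check above means a parallel write of any variable-length-storage datatype now fails with a single error, whether the VL part is a string, a sequence, or a region reference nested in a compound. A hedged sketch of a program that would hit this path (file and dataset names are hypothetical; error checking trimmed for brevity):

    #include <hdf5.h>
    #include <mpi.h>

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);

        hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
        H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);
        hid_t file = H5Fcreate("vl_test.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl);

        hid_t vl_type = H5Tvlen_create(H5T_NATIVE_INT); /* VL sequence of ints */
        hsize_t dim = 4;
        hid_t space = H5Screate_simple(1, &dim, NULL);
        hid_t dset = H5Dcreate2(file, "vl_data", vl_type, space,
                                H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);

        hvl_t buf[4] = {{0, NULL}, {0, NULL}, {0, NULL}, {0, NULL}};

        /* Expected to fail: "Parallel IO does not support writing VL or
         * region reference datatypes yet" */
        herr_t status = H5Dwrite(dset, vl_type, H5S_ALL, H5S_ALL,
                                 H5P_DEFAULT, buf);

        H5Dclose(dset); H5Sclose(space); H5Tclose(vl_type);
        H5Fclose(file); H5Pclose(fapl);
        MPI_Finalize();
        return status < 0 ? 0 : 1;
    }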
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index 2c06800..f5da33d 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -37,7 +37,6 @@
#include "H5Eprivate.h" /* Error handling */
#include "H5Fprivate.h" /* File access */
#include "H5FDprivate.h" /* File drivers */
-#include "H5FDmpi.h" /* MPI-based file drivers */
#include "H5Iprivate.h" /* IDs */
#include "H5MMprivate.h" /* Memory management */
#include "H5Oprivate.h" /* Object headers */
@@ -89,10 +88,20 @@
/******************/
/* Combine chunk address and chunk info into a struct for better performance. */
typedef struct H5D_chunk_addr_info_t {
- haddr_t chunk_addr;
- H5D_chunk_info_t chunk_info;
+ haddr_t chunk_addr;
+ H5D_chunk_info_t chunk_info;
} H5D_chunk_addr_info_t;
+/* Rank 0 Bcast values */
+typedef enum H5D_mpio_no_rank0_bcast_cause_t {
+ H5D_MPIO_RANK0_BCAST = 0x00,
+ H5D_MPIO_RANK0_NOT_H5S_ALL = 0x01,
+ H5D_MPIO_RANK0_NOT_CONTIGUOUS = 0x02,
+ H5D_MPIO_RANK0_NOT_FIXED_SIZE = 0x04,
+ H5D_MPIO_RANK0_GREATER_THAN_2GB = 0x08
+} H5D_mpio_no_rank0_bcast_cause_t;
+
+
/*
* Information about a single chunk when performing collective filtered I/O. All
* of the fields of one of these structs are initialized at the start of collective
@@ -281,8 +290,10 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space,
const H5S_t *mem_space, const H5D_type_info_t *type_info)
{
H5FD_mpio_xfer_t io_xfer_mode; /* MPI I/O transfer mode */
- unsigned local_cause = 0; /* Local reason(s) for breaking collective mode */
- unsigned global_cause = 0; /* Global reason(s) for breaking collective mode */
+ unsigned local_cause[2] = {0,0}; /* [0] Local reason(s) for breaking collective mode */
+ /* [1] Reason(s) the read-with-rank0-and-bcast optimization doesn't apply */
+ unsigned global_cause[2] = {0,0}; /* Global reason(s) for breaking collective mode */
+ htri_t is_vl_storage; /* Whether the dataset's datatype is stored in a variable-length form */
htri_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_PACKAGE
@@ -296,36 +307,37 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space,
/* For independent I/O, get out quickly and don't try to form consensus */
if(H5CX_get_io_xfer_mode(&io_xfer_mode) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
+ /* Set error flag, but keep going */
+ local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE;
if(io_xfer_mode == H5FD_MPIO_INDEPENDENT)
- local_cause |= H5D_MPIO_SET_INDEPENDENT;
+ local_cause[0] |= H5D_MPIO_SET_INDEPENDENT;
/* Optimized MPI types flag must be set */
/* (based on 'HDF5_MPI_OPT_TYPES' environment variable) */
if(!H5FD_mpi_opt_types_g)
- local_cause |= H5D_MPIO_MPI_OPT_TYPES_ENV_VAR_DISABLED;
+ local_cause[0] |= H5D_MPIO_MPI_OPT_TYPES_ENV_VAR_DISABLED;
/* Don't allow collective operations if datatype conversions need to happen */
if(!type_info->is_conv_noop)
- local_cause |= H5D_MPIO_DATATYPE_CONVERSION;
+ local_cause[0] |= H5D_MPIO_DATATYPE_CONVERSION;
/* Don't allow collective operations if data transform operations should occur */
if(!type_info->is_xform_noop)
- local_cause |= H5D_MPIO_DATA_TRANSFORMS;
+ local_cause[0] |= H5D_MPIO_DATA_TRANSFORMS;
/* Check whether these are both simple or scalar dataspaces */
if(!((H5S_SIMPLE == H5S_GET_EXTENT_TYPE(mem_space) || H5S_SCALAR == H5S_GET_EXTENT_TYPE(mem_space))
&& (H5S_SIMPLE == H5S_GET_EXTENT_TYPE(file_space) || H5S_SCALAR == H5S_GET_EXTENT_TYPE(file_space))))
- local_cause |= H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES;
+ local_cause[0] |= H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES;
/* Dataset storage must be contiguous or chunked */
if(!(io_info->dset->shared->layout.type == H5D_CONTIGUOUS ||
io_info->dset->shared->layout.type == H5D_CHUNKED))
- local_cause |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET;
+ local_cause[0] |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET;
/* check if external-file storage is used */
if(io_info->dset->shared->dcpl_cache.efl.nused > 0)
- local_cause |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET;
+ local_cause[0] |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET;
/* The handling of memory space is different for chunking and contiguous
* storage. For contiguous storage, mem_space and file_space won't change
@@ -340,31 +352,84 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space,
* is less than 3. The functions needed (MPI_Mprobe and MPI_Imrecv) will
* not be available.
*/
- if (io_info->op_type == H5D_IO_OP_WRITE &&
- io_info->dset->shared->layout.type == H5D_CHUNKED &&
- io_info->dset->shared->dcpl_cache.pline.nused > 0)
- local_cause |= H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED;
+ if(io_info->op_type == H5D_IO_OP_WRITE &&
+ io_info->dset->shared->layout.type == H5D_CHUNKED &&
+ io_info->dset->shared->dcpl_cache.pline.nused > 0)
+ local_cause[0] |= H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED;
#endif
+ /* Check if we are able to do an MPI_Bcast of the data from one rank
+ * instead of having all the processes involved in the collective I/O call.
+ */
+
+ /* Check to see if the process is reading the entire dataset */
+ if(H5S_GET_SELECT_TYPE(file_space) != H5S_SEL_ALL)
+ local_cause[1] |= H5D_MPIO_RANK0_NOT_H5S_ALL;
+ /* Only perform this optimization for contiguous datasets, currently */
+ else if(H5D_CONTIGUOUS != io_info->dset->shared->layout.type)
+ /* Flag that the dataset is not contiguous, so the data can't be
+ * read by one proc and then broadcast to the others.
+ */
+ local_cause[1] |= H5D_MPIO_RANK0_NOT_CONTIGUOUS;
+ else if((is_vl_storage = H5T_is_vl_storage(type_info->dset_type)) < 0)
+ local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE;
+ else if(is_vl_storage)
+ local_cause[1] |= H5D_MPIO_RANK0_NOT_FIXED_SIZE;
+ else {
+ size_t type_size; /* Size of dataset's datatype */
+
+ /* Retrieve the size of the dataset's datatype */
+ if(0 == (type_size = H5T_GET_SIZE(type_info->dset_type)))
+ local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE;
+ else {
+ hssize_t snelmts; /* [Signed] # of elements in dataset's dataspace */
+
+ /* Retrieve the number of elements in the dataset's dataspace */
+ if((snelmts = H5S_GET_EXTENT_NPOINTS(file_space)) < 0)
+ local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE;
+ else {
+ hsize_t dset_size;
+
+ /* Determine dataset size */
+ dset_size = ((hsize_t)snelmts) * type_size;
+
+ /* If the size of the dataset is less than 2GB then do an MPI_Bcast
+ * of the data from one process instead of having all the processes
+ * involved in the collective I/O.
+ */
+ if(dset_size > ((hsize_t)(2.0F * H5_GB) - 1))
+ local_cause[1] |= H5D_MPIO_RANK0_GREATER_THAN_2GB;
+ } /* end else */
+ } /* end else */
+ } /* end else */
+
/* Check for independent I/O */
- if(local_cause & H5D_MPIO_SET_INDEPENDENT)
- global_cause = local_cause;
+ if(local_cause[0] & H5D_MPIO_SET_INDEPENDENT)
+ global_cause[0] = local_cause[0];
else {
int mpi_code; /* MPI error code */
/* Form consensus opinion among all processes about whether to perform
* collective I/O
*/
- if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_cause, &global_cause, 1, MPI_UNSIGNED, MPI_BOR, io_info->comm)))
+ if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_cause, &global_cause, 2, MPI_UNSIGNED, MPI_BOR, io_info->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
} /* end else */
/* Set the local & global values of no-collective-cause in the API context */
- H5CX_set_mpio_local_no_coll_cause(local_cause);
- H5CX_set_mpio_global_no_coll_cause(global_cause);
+ H5CX_set_mpio_local_no_coll_cause(local_cause[0]);
+ H5CX_set_mpio_global_no_coll_cause(global_cause[0]);
+
+ /* Set read-with-rank0-and-bcast flag if possible */
+ if(global_cause[0] == 0 && global_cause[1] == 0) {
+ H5CX_set_mpio_rank0_bcast(TRUE);
+#ifdef H5_HAVE_INSTRUMENTED_LIBRARY
+ H5CX_test_set_mpio_coll_rank0_bcast(TRUE);
+#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
+ } /* end if */
/* Set the return value, based on the global cause */
- ret_value = global_cause > 0 ? FALSE : TRUE;
+ ret_value = global_cause[0] > 0 ? FALSE : TRUE;
done:
FUNC_LEAVE_NOAPI(ret_value)
@@ -3069,8 +3134,8 @@ H5D__filtered_collective_chunk_entry_io(H5D_filtered_collective_io_info_t *chunk
chunk_entry->chunk_states.new_chunk.length = chunk_entry->chunk_states.chunk_current.length;
/* Currently, these chunk reads are done independently and will likely
- * cause issues with collective metadata reads enabled. In the future,
- * this should be refactored to use collective chunk reads - JTH */
+ * cause issues with collective metadata reads enabled. In the future,
+ * this should be refactored to use collective chunk reads - JTH */
/* Get the original state of parallel I/O transfer mode */
if(H5CX_get_io_xfer_mode(&xfer_mode) < 0)
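Widening the cause word to a two-element array keeps consensus down to a single MPI_Allreduce: slot 0 carries the break-collective reasons, slot 1 the rank0-bcast disqualifiers, and a bitwise-OR reduction merges both across all ranks at once. A self-contained sketch of the pattern (the two flag values are illustrative stand-ins for the real checks):

    #include <mpi.h>

    /* Returns 1 if rank0-bcast is possible everywhere, 0 if not, -1 on error */
    static int
    form_consensus(MPI_Comm comm, int not_h5s_all, int not_contiguous)
    {
        unsigned local_cause[2]  = {0, 0};
        unsigned global_cause[2] = {0, 0};

        if(not_h5s_all)
            local_cause[1] |= 0x01;     /* ~ H5D_MPIO_RANK0_NOT_H5S_ALL */
        if(not_contiguous)
            local_cause[1] |= 0x02;     /* ~ H5D_MPIO_RANK0_NOT_CONTIGUOUS */

        /* One reduction merges every rank's reasons for both decisions */
        if(MPI_SUCCESS != MPI_Allreduce(local_cause, global_cause, 2,
                MPI_UNSIGNED, MPI_BOR, comm))
            return -1;

        /* Collective I/O needs slot 0 empty on all ranks; the rank0-bcast
         * path additionally needs slot 1 empty on all ranks */
        return (global_cause[0] == 0 && global_cause[1] == 0);
    }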
diff --git a/src/H5Dprivate.h b/src/H5Dprivate.h
index aaa3db2..6fb7889 100644
--- a/src/H5Dprivate.h
+++ b/src/H5Dprivate.h
@@ -95,7 +95,13 @@
/* Definitions for all collective chunk instrumentation properties */
#define H5D_XFER_COLL_CHUNK_SIZE sizeof(unsigned)
#define H5D_XFER_COLL_CHUNK_DEF 1
-#define H5D_XFER_COLL_CHUNK_FIX 0
+
+/* General collective I/O instrumentation properties */
+#define H5D_XFER_COLL_RANK0_BCAST_NAME "coll_rank0_bcast"
+
+/* Definitions for general collective I/O instrumentation properties */
+#define H5D_XFER_COLL_RANK0_BCAST_SIZE sizeof(hbool_t)
+#define H5D_XFER_COLL_RANK0_BCAST_DEF FALSE
#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
/* Default temporary buffer size */
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index d160858..3ab90aa 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -1354,6 +1354,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
int n;
#endif
hbool_t use_view_this_time = FALSE;
+ hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */
herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
@@ -1437,8 +1438,25 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
if(H5FD_mpio_Debug[(int)'r'])
HDfprintf(stdout, "%s: doing MPI collective IO\n", FUNC);
#endif
- if(MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
- HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
+ /* Check whether we should read from rank 0 and broadcast to other ranks */
+ if(H5CX_get_mpio_rank0_bcast()) {
+#ifdef H5FDmpio_DEBUG
+ if(H5FD_mpio_Debug[(int)'r'])
+ HDfprintf(stdout, "%s: doing read-rank0-and-MPI_Bcast\n", FUNC);
+#endif
+ /* Indicate path we've taken */
+ rank0_bcast = TRUE;
+
+ /* Read on rank 0, then Bcast to the other ranks */
+ if(file->mpi_rank == 0)
+ if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+ if(MPI_SUCCESS != (mpi_code = MPI_Bcast(buf, size_i, buf_type, 0, file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+ } /* end if */
+ else
+ if(MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
} /* end if */
else {
#ifdef H5FDmpio_DEBUG
@@ -1460,13 +1478,26 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
- /* How many bytes were actually read? */
+ /* Only retrieve bytes read if this rank _actually_ participated in I/O */
+ if(!rank0_bcast || file->mpi_rank == 0) {
+ /* How many bytes were actually read? */
#if MPI_VERSION >= 3
- if(MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
+ if(MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
#else
- if(MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+ if(MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
#endif
- HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ } /* end if */
+
+ /* If the rank0-bcast feature was used, broadcast the # of bytes read to
+ * other ranks, which didn't perform any I/O.
+ */
+ /* NOTE: This could be optimized further to be combined with the broadcast
+ * of the data. (QAK - 2019/1/2)
+ */
+ if(rank0_bcast)
+ if(MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_LONG_LONG, 0, file->comm))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0)
/* Get the type's size */
#if MPI_VERSION >= 3
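Only the rank that actually issued the file read has a meaningful MPI_Status, so the byte count that drives short-read handling must itself be broadcast before any rank uses it. A hedged sketch of that bookkeeping ('did_io' is true only on rank 0 in the bcast path; types simplified relative to the MPI_VERSION switch above):

    #include <mpi.h>

    static long long
    shared_bytes_read(MPI_Comm comm, int did_io, MPI_Status *st)
    {
        long long bytes_read = 0;

        if(did_io) {                /* only rank 0 read in this path */
            int n;
            MPI_Get_elements(st, MPI_BYTE, &n);
            bytes_read = n;
        }

        /* Every rank learns rank 0's count, mirroring the Bcast above */
        MPI_Bcast(&bytes_read, 1, MPI_LONG_LONG, 0, comm);
        return bytes_read;
    }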
diff --git a/src/H5Pdxpl.c b/src/H5Pdxpl.c
index bfc1d93..8338d84 100644
--- a/src/H5Pdxpl.c
+++ b/src/H5Pdxpl.c
@@ -2017,6 +2017,7 @@ done:
FUNC_LEAVE_API(ret_value)
} /* end H5Pget_mpio_actual_io_mode() */
+
/*-------------------------------------------------------------------------
* Function: H5Pget_mpio_no_collective_cause
*
@@ -2053,8 +2054,6 @@ H5Pget_mpio_no_collective_cause(hid_t plist_id, uint32_t *local_no_collective_ca
done:
FUNC_LEAVE_API(ret_value)
} /* end H5Pget_mpio_no_collective_cause() */
-
-
#endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5Ppublic.h b/src/H5Ppublic.h
index 2f094ea..078fe74 100644
--- a/src/H5Ppublic.h
+++ b/src/H5Ppublic.h
@@ -167,7 +167,8 @@ typedef enum H5D_mpio_no_collective_cause_t {
H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES = 0x10,
H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET = 0x20,
H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED = 0x40,
- H5D_MPIO_NO_COLLECTIVE_MAX_CAUSE = 0x80
+ H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE = 0x80,
+ H5D_MPIO_NO_COLLECTIVE_MAX_CAUSE = 0x100
} H5D_mpio_no_collective_cause_t;
/********************/
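Applications can observe the new cause bit through the existing H5Pget_mpio_no_collective_cause() query after an I/O call. A hedged usage sketch ('dxpl_id' is assumed to be the transfer property list just used for the read or write):

    #include <hdf5.h>
    #include <stdio.h>

    static void
    check_collective(hid_t dxpl_id)
    {
        uint32_t local_cause = 0, global_cause = 0;

        if(H5Pget_mpio_no_collective_cause(dxpl_id, &local_cause,
                &global_cause) < 0)
            return;

        if(global_cause == H5D_MPIO_COLLECTIVE)
            printf("collective I/O was performed\n");
        else if(global_cause & H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE)
            fprintf(stderr, "error while checking if collective I/O was possible\n");
    }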
diff --git a/src/H5T.c b/src/H5T.c
index 01ace87..9544488 100644
--- a/src/H5T.c
+++ b/src/H5T.c
@@ -295,6 +295,7 @@ static htri_t H5T__compiler_conv(H5T_t *src, H5T_t *dst);
static herr_t H5T__set_size(H5T_t *dt, size_t size);
static herr_t H5T__close_cb(H5T_t *dt);
static H5T_path_t *H5T__path_find_real(const H5T_t *src, const H5T_t *dst, const char *name, H5T_conv_func_t *conv);
+static hbool_t H5T__detect_reg_ref(const H5T_t *dt);
/*****************************/
@@ -5506,6 +5507,111 @@ done:
/*-------------------------------------------------------------------------
+ * Function: H5T__detect_reg_ref
+ *
+ * Purpose: Check whether a datatype contains (or is) a region reference
+ * datatype.
+ *
+ * Return: TRUE (1) or FALSE (0) on success
+ * (Can't fail)
+ *
+ * Programmer: Quincey Koziol
+ * Saturday, January 5, 2019
+ *
+ *-------------------------------------------------------------------------
+ */
+static hbool_t
+H5T__detect_reg_ref(const H5T_t *dt)
+{
+ unsigned u; /* Local index variable */
+ hbool_t ret_value = FALSE; /* Return value */
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Sanity checks */
+ HDassert(dt);
+
+ /* Check if this datatype is a region reference */
+ if(H5T_REFERENCE == dt->shared->type && H5R_DATASET_REGION == dt->shared->u.atomic.u.r.rtype)
+ HGOTO_DONE(TRUE);
+
+ /* Check for types that might have the correct type as a component */
+ switch(dt->shared->type) {
+ case H5T_COMPOUND:
+ /* Iterate over all the compound datatype's fields */
+ for(u = 0; u < dt->shared->u.compnd.nmembs; u++)
+ /* Recurse on field's datatype */
+ if(H5T__detect_reg_ref(dt->shared->u.compnd.memb[u].type))
+ HGOTO_DONE(TRUE);
+ break;
+
+ case H5T_ARRAY:
+ case H5T_VLEN:
+ case H5T_ENUM:
+ HGOTO_DONE(H5T__detect_reg_ref(dt->shared->parent));
+ break;
+
+ case H5T_NO_CLASS:
+ case H5T_INTEGER:
+ case H5T_FLOAT:
+ case H5T_TIME:
+ case H5T_STRING:
+ case H5T_BITFIELD:
+ case H5T_OPAQUE:
+ case H5T_REFERENCE:
+ case H5T_NCLASSES:
+ default:
+ break;
+ } /* end switch */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5T__detect_reg_ref() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5T_is_vl_storage
+ *
+ * Purpose: Check if a datatype will be stored in a variable-length form.
+ *
+ * Notes: Currently, only variable-length strings & sequences and region
+ * references are stored in a variable-length form.
+ *
+ * Return:
+ * One of two values on success:
+ * TRUE - If the datatype will be stored in a variable-length form
+ * FALSE - If the datatype will NOT be stored in a variable-length form
+ * <0 is returned on failure
+ *
+ * Programmer: Quincey Koziol
+ * Saturday, January 5, 2019
+ *
+ *-------------------------------------------------------------------------
+ */
+htri_t
+H5T_is_vl_storage(const H5T_t *dt)
+{
+ htri_t ret_value = FALSE;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Sanity check */
+ HDassert(dt);
+
+ /* VL and region reference datatypes are stored in variable-length form */
+ if(H5T_detect_class(dt, H5T_VLEN, FALSE))
+ ret_value = TRUE;
+ else if(H5T_detect_class(dt, H5T_REFERENCE, FALSE))
+ ret_value = H5T__detect_reg_ref(dt);
+ else
+ ret_value = FALSE;
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5T_is_vl_storage() */
+
+
+/*-------------------------------------------------------------------------
* Function: H5T_upgrade_version_cb
*
* Purpose: H5T__visit callback to Upgrade the version of a datatype
diff --git a/src/H5Tprivate.h b/src/H5Tprivate.h
index 6b6446f..3dcbb2c 100644
--- a/src/H5Tprivate.h
+++ b/src/H5Tprivate.h
@@ -145,6 +145,7 @@ H5_DLL H5T_t *H5T_get_actual_type(H5T_t *dt);
H5_DLL herr_t H5T_save_refresh_state(hid_t tid, struct H5O_shared_t *cached_H5O_shared);
H5_DLL herr_t H5T_restore_refresh_state(hid_t tid, struct H5O_shared_t *cached_H5O_shared);
H5_DLL hbool_t H5T_already_vol_managed(const H5T_t *dt);
+H5_DLL htri_t H5T_is_vl_storage(const H5T_t *dt);
/* Reference specific functions */
H5_DLL H5R_type_t H5T_get_ref_type(const H5T_t *dt);
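H5T__detect_reg_ref() has to chase region references through compound members as well as array/VL/enum parent types. A toy version of the same recursion over a simplified type tree (the types and names here are illustrative only, not the H5T_t internals):

    #include <stdbool.h>
    #include <stddef.h>

    typedef enum { T_INT, T_REGREF, T_COMPOUND, T_ARRAY } toy_class_t;

    typedef struct toy_type {
        toy_class_t        cls;
        struct toy_type  **members;   /* compound fields, or NULL */
        size_t             nmembers;
        struct toy_type   *parent;    /* array/VL/enum base type, or NULL */
    } toy_type_t;

    static bool
    detect_reg_ref(const toy_type_t *t)
    {
        if(t->cls == T_REGREF)              /* found one directly */
            return true;
        if(t->cls == T_COMPOUND)            /* recurse on each field */
            for(size_t u = 0; u < t->nmembers; u++)
                if(detect_reg_ref(t->members[u]))
                    return true;
        if(t->cls == T_ARRAY && t->parent)  /* recurse on the base type */
            return detect_reg_ref(t->parent);
        return false;
    }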