path: root/src
author     Quincey Koziol <koziol@hdfgroup.org>        2019-01-06 04:31:42 (GMT)
committer  M. Scot Breitenfeld <brtnfld@hdfgroup.org>  2019-01-07 22:55:59 (GMT)
commit     fed17ed3838d2cf73f8848c9d340a9139c0c02dc (patch)
tree       32ff8b32b33a2947c27cbff5a05baee35c23d58b /src
parent     5dfe00629588a54dbfb6f2d09dfbd88177e37cc2 (diff)
HDFFV-10625 -- Implemented a process-0 read and then broadcast for collective read of full (H5S_ALL), contiguous, atomic datasets by all the processes in the file communicator.
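
In essence, the patch replaces the collective MPI_File_read_at_all() call in H5FD__mpio_read() with a read on rank 0 followed by an MPI_Bcast whenever every rank reads the same full, contiguous, fixed-size (and smaller than 2GB) dataset. A minimal standalone sketch of that MPI pattern; the file name, element count, and lack of error handling are illustrative and not taken from this commit:

/* Sketch of the read-rank0-and-bcast pattern this commit wires into
 * H5FD__mpio_read(); file name, buffer size and error handling are
 * illustrative only.
 */
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_File    fh;
    MPI_Status  status;
    int         rank;
    const int   count = 1024;                   /* assumed element count */
    int        *buf   = malloc(count * sizeof(int));

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* All ranks open the file together, as the MPI-IO VFD does */
    MPI_File_open(MPI_COMM_WORLD, "data.bin", MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);

    /* Instead of MPI_File_read_at_all() on every rank, only rank 0 reads... */
    if(rank == 0)
        MPI_File_read_at(fh, 0, buf, count, MPI_INT, &status);

    /* ...and then broadcasts the data to the other ranks */
    MPI_Bcast(buf, count, MPI_INT, 0, MPI_COMM_WORLD);

    MPI_File_close(&fh);
    free(buf);
    MPI_Finalize();
    return 0;
}
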
Diffstat (limited to 'src')
-rw-r--r--   src/H5CX.c          92
-rw-r--r--   src/H5CXprivate.h    3
-rw-r--r--   src/H5Dio.c         18
-rw-r--r--   src/H5Dmpio.c      115
-rw-r--r--   src/H5Dprivate.h     8
-rw-r--r--   src/H5FDmpio.c      43
-rw-r--r--   src/H5Pdxpl.c        3
-rw-r--r--   src/H5Ppublic.h      3
-rw-r--r--   src/H5T.c          106
-rw-r--r--   src/H5Tprivate.h     1
10 files changed, 343 insertions, 49 deletions
diff --git a/src/H5CX.c b/src/H5CX.c
index 1f91ee2..0d20132 100644
--- a/src/H5CX.c
+++ b/src/H5CX.c
@@ -198,6 +198,7 @@ typedef struct H5CX_t {
MPI_Datatype btype; /* MPI datatype for buffer, when using collective I/O */
MPI_Datatype ftype; /* MPI datatype for file, when using collective I/O */
hbool_t mpi_file_flushing; /* Whether an MPI-opened file is being flushed */
+ hbool_t rank0_bcast; /* Whether a dataset meets read-with-rank0-and-bcast requirements */
#endif /* H5_HAVE_PARALLEL */
/* Cached DXPL properties */
@@ -261,6 +262,8 @@ typedef struct H5CX_t {
hbool_t mpio_coll_chunk_multi_ratio_coll_set; /* Whether instrumented "collective chunk multi ratio coll" value is set */
int mpio_coll_chunk_multi_ratio_ind; /* Instrumented "collective chunk multi ratio ind" value (H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME) */
hbool_t mpio_coll_chunk_multi_ratio_ind_set; /* Whether instrumented "collective chunk multi ratio ind" value is set */
+ hbool_t mpio_coll_rank0_bcast; /* Instrumented "read-with-rank0-and-bcast" value (H5D_XFER_COLL_RANK0_BCAST_NAME) */
+ hbool_t mpio_coll_rank0_bcast_set; /* Whether instrumented "read-with-rank0-and-bcast" value is set */
#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
#endif /* H5_HAVE_PARALLEL */
@@ -1254,6 +1257,32 @@ H5CX_get_mpi_file_flushing(void)
FUNC_LEAVE_NOAPI((*head)->ctx.mpi_file_flushing)
} /* end H5CX_get_mpi_file_flushing() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5CX_get_mpio_rank0_bcast
+ *
+ * Purpose: Retrieves whether the dataset meets the read-with-rank0-and-bcast requirements for the current API call context.
+ *
+ * Return: Non-negative on success / Negative on failure
+ *
+ * Programmer: M. Breitenfeld
+ * December 31, 2018
+ *
+ *-------------------------------------------------------------------------
+ */
+hbool_t
+H5CX_get_mpio_rank0_bcast(void)
+{
+ H5CX_node_t **head = H5CX_get_my_context(); /* Get the pointer to the head of the API context, for this thread */
+
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ /* Sanity check */
+ HDassert(head && *head);
+
+ FUNC_LEAVE_NOAPI((*head)->ctx.rank0_bcast)
+} /* end H5CX_get_mpio_rank0_bcast() */
#endif /* H5_HAVE_PARALLEL */
@@ -2185,6 +2214,34 @@ H5CX_set_mpi_file_flushing(hbool_t flushing)
FUNC_LEAVE_NOAPI_VOID
} /* end H5CX_set_mpi_file_flushing() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5CX_set_mpio_rank0_bcast
+ *
+ * Purpose: Sets the "dataset meets read-with-rank0-and-bcast requirements" flag for the current API call context.
+ *
+ * Return: <none>
+ *
+ * Programmer: M. Breitenfeld
+ * December 31, 2018
+ *
+ *-------------------------------------------------------------------------
+ */
+void
+H5CX_set_mpio_rank0_bcast(hbool_t rank0_bcast)
+{
+ H5CX_node_t **head = H5CX_get_my_context(); /* Get the pointer to the head of the API context, for this thread */
+
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ /* Sanity checks */
+ HDassert(head && *head);
+
+ (*head)->ctx.rank0_bcast = rank0_bcast;
+
+ FUNC_LEAVE_NOAPI_VOID
+} /* end H5CX_set_mpio_rank0_bcast() */
#endif /* H5_HAVE_PARALLEL */
@@ -2596,6 +2653,40 @@ H5CX_test_set_mpio_coll_chunk_multi_ratio_ind(int mpio_coll_chunk_multi_ratio_in
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5CX_test_set_mpio_coll_chunk_multi_ratio_ind() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5CX_test_set_mpio_coll_rank0_bcast
+ *
+ * Purpose: Sets the instrumented "read-with-rank0-bcast" flag for the current API call context.
+ *
+ * Note: Only sets value if property set in DXPL
+ *
+ * Return: Non-negative on success / Negative on failure
+ *
+ * Programmer: Quincey Koziol
+ * January 2, 2019
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5CX_test_set_mpio_coll_rank0_bcast(hbool_t mpio_coll_rank0_bcast)
+{
+ H5CX_node_t **head = H5CX_get_my_context(); /* Get the pointer to the head of the API context, for this thread */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI_NOINIT
+
+ /* Sanity checks */
+ HDassert(head && *head);
+ HDassert(!((*head)->ctx.dxpl_id == H5P_DEFAULT ||
+ (*head)->ctx.dxpl_id == H5P_DATASET_XFER_DEFAULT));
+
+ H5CX_TEST_SET_PROP(H5D_XFER_COLL_RANK0_BCAST_NAME, mpio_coll_rank0_bcast)
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5CX_test_set_mpio_coll_rank0_bcast() */
#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
#endif /* H5_HAVE_PARALLEL */
@@ -2640,6 +2731,7 @@ H5CX__pop_common(void)
H5CX_SET_PROP(H5D_XFER_COLL_CHUNK_LINK_NUM_FALSE_NAME, mpio_coll_chunk_link_num_false)
H5CX_SET_PROP(H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME, mpio_coll_chunk_multi_ratio_coll)
H5CX_SET_PROP(H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME, mpio_coll_chunk_multi_ratio_ind)
+ H5CX_SET_PROP(H5D_XFER_COLL_RANK0_BCAST_NAME, mpio_coll_rank0_bcast)
#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
#endif /* H5_HAVE_PARALLEL */
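
The H5CX.c additions follow the library's per-thread API-context pattern: the flag lives in the thread-local context node and small getter/setter routines (H5CX_get_mpio_rank0_bcast / H5CX_set_mpio_rank0_bcast) expose it to the layers that need it within a single API call. A generic, hedged sketch of that pattern outside HDF5; the names and the C11 _Thread_local storage are illustrative, not the library's internals:

/* Generic per-thread "API context" flag sketch: a thread-local struct holds
 * per-call state, and tiny accessors hide it from the layers that consume it.
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct ctx_t {
    bool rank0_bcast;   /* does the current read qualify for rank-0-and-bcast? */
} ctx_t;

static _Thread_local ctx_t my_ctx = { false };

static void ctx_set_rank0_bcast(bool flag) { my_ctx.rank0_bcast = flag; }
static bool ctx_get_rank0_bcast(void)      { return my_ctx.rank0_bcast; }

int main(void)
{
    /* The feasibility check (H5D__mpio_opt_possible() in the patch) would set
     * the flag once the dataset qualifies... */
    ctx_set_rank0_bcast(true);

    /* ...and the low-level read path (H5FD__mpio_read()) queries it later in
     * the same API call to choose the rank-0-read-plus-broadcast path. */
    if(ctx_get_rank0_bcast())
        printf("read on rank 0, then MPI_Bcast to the other ranks\n");

    return 0;
}
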
diff --git a/src/H5CXprivate.h b/src/H5CXprivate.h
index 46289c4..46d25d0 100644
--- a/src/H5CXprivate.h
+++ b/src/H5CXprivate.h
@@ -77,6 +77,7 @@ H5_DLL H5AC_ring_t H5CX_get_ring(void);
H5_DLL hbool_t H5CX_get_coll_metadata_read(void);
H5_DLL herr_t H5CX_get_mpi_coll_datatypes(MPI_Datatype *btype, MPI_Datatype *ftype);
H5_DLL hbool_t H5CX_get_mpi_file_flushing(void);
+H5_DLL hbool_t H5CX_get_mpio_rank0_bcast(void);
#endif /* H5_HAVE_PARALLEL */
/* "Getter" routines for DXPL properties cached in API context */
@@ -112,6 +113,7 @@ H5_DLL void H5CX_set_coll_metadata_read(hbool_t cmdr);
H5_DLL herr_t H5CX_set_mpi_coll_datatypes(MPI_Datatype btype, MPI_Datatype ftype);
H5_DLL herr_t H5CX_set_mpio_coll_opt(H5FD_mpio_collective_opt_t mpio_coll_opt);
H5_DLL void H5CX_set_mpi_file_flushing(hbool_t flushing);
+H5_DLL void H5CX_set_mpio_rank0_bcast(hbool_t rank0_bcast);
#endif /* H5_HAVE_PARALLEL */
/* "Setter" routines for DXPL properties cached in API context */
@@ -137,6 +139,7 @@ H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_link_num_true(int mpio_coll_chunk_li
H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_link_num_false(int mpio_coll_chunk_link_num_false);
H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_multi_ratio_coll(int mpio_coll_chunk_multi_ratio_coll);
H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_multi_ratio_ind(int mpio_coll_chunk_multi_ratio_ind);
+H5_DLL herr_t H5CX_test_set_mpio_coll_rank0_bcast(hbool_t rank0_bcast);
#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
#endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5Dio.c b/src/H5Dio.c
index fe85d23..6062dff 100644
--- a/src/H5Dio.c
+++ b/src/H5Dio.c
@@ -657,22 +657,12 @@ H5D__write(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space,
/* Various MPI based checks */
#ifdef H5_HAVE_PARALLEL
- if H5F_HAS_FEATURE(dataset->oloc.file, H5FD_FEAT_HAS_MPI) {
- /* If MPI based VFD is used, no VL datatype support yet. */
+ if(H5F_HAS_FEATURE(dataset->oloc.file, H5FD_FEAT_HAS_MPI)) {
+ /* If MPI based VFD is used, no VL or region reference datatype support yet. */
/* This is because they use the global heap in the file and we don't */
/* support parallel access of that yet */
- if(H5T_detect_class(type_info.mem_type, H5T_VLEN, FALSE) > 0)
- HGOTO_ERROR(H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing VL datatypes yet")
-
- /* If MPI based VFD is used, no VL datatype support yet. */
- /* This is because they use the global heap in the file and we don't */
- /* support parallel access of that yet */
- /* We should really use H5T_detect_class() here, but it will be difficult
- * to detect the type of the reference if it is nested... -QAK
- */
- if(H5T_get_class(type_info.mem_type, TRUE) == H5T_REFERENCE &&
- H5T_get_ref_type(type_info.mem_type) == H5R_DATASET_REGION)
- HGOTO_ERROR(H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing region reference datatypes yet")
+ if(H5T_is_vl_storage(type_info.mem_type) > 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing VL or region reference datatypes yet")
} /* end if */
else {
H5FD_mpio_xfer_t io_xfer_mode; /* MPI I/O transfer mode */
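
The H5Dio.c change collapses the separate VL and region-reference checks into a single H5T_is_vl_storage() test. A hedged sketch, from the application side, of what the check guards against: writing a variable-length dataset in a file opened with an MPI-based driver, which is expected to fail with the H5E_UNSUPPORTED error above. The file handle, dataset name and dimensions are illustrative only:

/* Assumes file_id was opened with an MPI-based file driver (e.g. via
 * H5Pset_fapl_mpio()); error checking is omitted for brevity. */
#include "hdf5.h"

static herr_t
try_parallel_vl_write(hid_t file_id, const void *buf)
{
    hid_t   vl_type_id = H5Tvlen_create(H5T_NATIVE_INT);    /* VL sequence of int */
    hsize_t dims[1]    = {10};
    hid_t   space_id   = H5Screate_simple(1, dims, NULL);
    hid_t   dset_id    = H5Dcreate2(file_id, "vl_data", vl_type_id, space_id,
                                    H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
    herr_t  status;

    /* With an MPI-based file driver, this write is expected to fail with the
     * "Parallel IO does not support writing VL or region reference datatypes
     *  yet" error raised by the check above. */
    status = H5Dwrite(dset_id, vl_type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf);

    H5Dclose(dset_id);
    H5Sclose(space_id);
    H5Tclose(vl_type_id);

    return status;
}
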
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index 2c06800..f5da33d 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -37,7 +37,6 @@
#include "H5Eprivate.h" /* Error handling */
#include "H5Fprivate.h" /* File access */
#include "H5FDprivate.h" /* File drivers */
-#include "H5FDmpi.h" /* MPI-based file drivers */
#include "H5Iprivate.h" /* IDs */
#include "H5MMprivate.h" /* Memory management */
#include "H5Oprivate.h" /* Object headers */
@@ -89,10 +88,20 @@
/******************/
/* Combine chunk address and chunk info into a struct for better performance. */
typedef struct H5D_chunk_addr_info_t {
- haddr_t chunk_addr;
- H5D_chunk_info_t chunk_info;
+ haddr_t chunk_addr;
+ H5D_chunk_info_t chunk_info;
} H5D_chunk_addr_info_t;
+/* Rank 0 Bcast values */
+typedef enum H5D_mpio_no_rank0_bcast_cause_t {
+ H5D_MPIO_RANK0_BCAST = 0x00,
+ H5D_MPIO_RANK0_NOT_H5S_ALL = 0x01,
+ H5D_MPIO_RANK0_NOT_CONTIGUOUS = 0x02,
+ H5D_MPIO_RANK0_NOT_FIXED_SIZE = 0x04,
+ H5D_MPIO_RANK0_GREATER_THAN_2GB = 0x08
+} H5D_mpio_no_rank0_bcast_cause_t;
+
+
/*
* Information about a single chunk when performing collective filtered I/O. All
* of the fields of one of these structs are initialized at the start of collective
@@ -281,8 +290,10 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space,
const H5S_t *mem_space, const H5D_type_info_t *type_info)
{
H5FD_mpio_xfer_t io_xfer_mode; /* MPI I/O transfer mode */
- unsigned local_cause = 0; /* Local reason(s) for breaking collective mode */
- unsigned global_cause = 0; /* Global reason(s) for breaking collective mode */
+ unsigned local_cause[2] = {0,0}; /* [0] Local reason(s) for breaking collective mode */
+                                      /* [1] Reason(s) the dataset doesn't qualify for read-with-rank0-and-bcast */
+ unsigned global_cause[2] = {0,0}; /* Global reason(s) for breaking collective mode */
+ htri_t is_vl_storage; /* Whether the dataset's datatype is stored in a variable-length form */
htri_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_PACKAGE
@@ -296,36 +307,37 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space,
/* For independent I/O, get out quickly and don't try to form consensus */
if(H5CX_get_io_xfer_mode(&io_xfer_mode) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
+ /* Set error flag, but keep going */
+ local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE;
if(io_xfer_mode == H5FD_MPIO_INDEPENDENT)
- local_cause |= H5D_MPIO_SET_INDEPENDENT;
+ local_cause[0] |= H5D_MPIO_SET_INDEPENDENT;
/* Optimized MPI types flag must be set */
/* (based on 'HDF5_MPI_OPT_TYPES' environment variable) */
if(!H5FD_mpi_opt_types_g)
- local_cause |= H5D_MPIO_MPI_OPT_TYPES_ENV_VAR_DISABLED;
+ local_cause[0] |= H5D_MPIO_MPI_OPT_TYPES_ENV_VAR_DISABLED;
/* Don't allow collective operations if datatype conversions need to happen */
if(!type_info->is_conv_noop)
- local_cause |= H5D_MPIO_DATATYPE_CONVERSION;
+ local_cause[0] |= H5D_MPIO_DATATYPE_CONVERSION;
/* Don't allow collective operations if data transform operations should occur */
if(!type_info->is_xform_noop)
- local_cause |= H5D_MPIO_DATA_TRANSFORMS;
+ local_cause[0] |= H5D_MPIO_DATA_TRANSFORMS;
/* Check whether these are both simple or scalar dataspaces */
if(!((H5S_SIMPLE == H5S_GET_EXTENT_TYPE(mem_space) || H5S_SCALAR == H5S_GET_EXTENT_TYPE(mem_space))
&& (H5S_SIMPLE == H5S_GET_EXTENT_TYPE(file_space) || H5S_SCALAR == H5S_GET_EXTENT_TYPE(file_space))))
- local_cause |= H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES;
+ local_cause[0] |= H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES;
/* Dataset storage must be contiguous or chunked */
if(!(io_info->dset->shared->layout.type == H5D_CONTIGUOUS ||
io_info->dset->shared->layout.type == H5D_CHUNKED))
- local_cause |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET;
+ local_cause[0] |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET;
/* check if external-file storage is used */
if(io_info->dset->shared->dcpl_cache.efl.nused > 0)
- local_cause |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET;
+ local_cause[0] |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET;
/* The handling of memory space is different for chunking and contiguous
* storage. For contiguous storage, mem_space and file_space won't change
@@ -340,31 +352,84 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space,
* is less than 3. The functions needed (MPI_Mprobe and MPI_Imrecv) will
* not be available.
*/
- if (io_info->op_type == H5D_IO_OP_WRITE &&
- io_info->dset->shared->layout.type == H5D_CHUNKED &&
- io_info->dset->shared->dcpl_cache.pline.nused > 0)
- local_cause |= H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED;
+ if(io_info->op_type == H5D_IO_OP_WRITE &&
+ io_info->dset->shared->layout.type == H5D_CHUNKED &&
+ io_info->dset->shared->dcpl_cache.pline.nused > 0)
+ local_cause[0] |= H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED;
#endif
+ /* Check if we are able to do an MPI_Bcast of the data from one rank
+ * instead of having all the processes involved in the collective I/O call.
+ */
+
+ /* Check to see if the process is reading the entire dataset */
+ if(H5S_GET_SELECT_TYPE(file_space) != H5S_SEL_ALL)
+ local_cause[1] |= H5D_MPIO_RANK0_NOT_H5S_ALL;
+ /* Only perform this optimization for contiguous datasets, currently */
+ else if(H5D_CONTIGUOUS != io_info->dset->shared->layout.type)
+ local_cause[1] |= H5D_MPIO_RANK0_NOT_CONTIGUOUS;
+ else if((is_vl_storage = H5T_is_vl_storage(type_info->dset_type)) < 0)
+ local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE;
+ else if(is_vl_storage)
+ local_cause[1] |= H5D_MPIO_RANK0_NOT_FIXED_SIZE;
+ else {
+ size_t type_size; /* Size of dataset's datatype */
+
+ /* Retrieve the size of the dataset's datatype */
+ if(0 == (type_size = H5T_GET_SIZE(type_info->dset_type)))
+ local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE;
+ else {
+ hssize_t snelmts; /* [Signed] # of elements in dataset's dataspace */
+
+ /* Retrieve the number of elements in the dataset's dataspace */
+ if((snelmts = H5S_GET_EXTENT_NPOINTS(file_space)) < 0)
+ local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE;
+ else {
+ hsize_t dset_size;
+
+ /* Determine dataset size */
+ dset_size = ((hsize_t)snelmts) * type_size;
+
+ /* If the size of the dataset is less than 2GB then do an MPI_Bcast
+ * of the data from one process instead of having all the processes
+ * involved in the collective I/O.
+ */
+ if(dset_size > ((hsize_t)(2.0F * H5_GB) - 1))
+ local_cause[1] |= H5D_MPIO_RANK0_GREATER_THAN_2GB;
+ } /* end else */
+ } /* end else */
+ } /* end else */
+
/* Check for independent I/O */
- if(local_cause & H5D_MPIO_SET_INDEPENDENT)
- global_cause = local_cause;
+ if(local_cause[0] & H5D_MPIO_SET_INDEPENDENT)
+ global_cause[0] = local_cause[0];
else {
int mpi_code; /* MPI error code */
/* Form consensus opinion among all processes about whether to perform
* collective I/O
*/
- if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_cause, &global_cause, 1, MPI_UNSIGNED, MPI_BOR, io_info->comm)))
+ if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_cause, &global_cause, 2, MPI_UNSIGNED, MPI_BOR, io_info->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
} /* end else */
/* Set the local & global values of no-collective-cause in the API context */
- H5CX_set_mpio_local_no_coll_cause(local_cause);
- H5CX_set_mpio_global_no_coll_cause(global_cause);
+ H5CX_set_mpio_local_no_coll_cause(local_cause[0]);
+ H5CX_set_mpio_global_no_coll_cause(global_cause[0]);
+
+ /* Set read-with-rank0-and-bcast flag if possible */
+ if(global_cause[0] == 0 && global_cause[1] == 0) {
+ H5CX_set_mpio_rank0_bcast(TRUE);
+#ifdef H5_HAVE_INSTRUMENTED_LIBRARY
+ H5CX_test_set_mpio_coll_rank0_bcast(TRUE);
+#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
+ } /* end if */
/* Set the return value, based on the global cause */
- ret_value = global_cause > 0 ? FALSE : TRUE;
+ ret_value = global_cause[0] > 0 ? FALSE : TRUE;
done:
FUNC_LEAVE_NOAPI(ret_value)
@@ -3069,8 +3134,8 @@ H5D__filtered_collective_chunk_entry_io(H5D_filtered_collective_io_info_t *chunk
chunk_entry->chunk_states.new_chunk.length = chunk_entry->chunk_states.chunk_current.length;
/* Currently, these chunk reads are done independently and will likely
- * cause issues with collective metadata reads enabled. In the future,
- * this should be refactored to use collective chunk reads - JTH */
+ * cause issues with collective metadata reads enabled. In the future,
+ * this should be refactored to use collective chunk reads - JTH */
/* Get the original state of parallel I/O transfer mode */
if(H5CX_get_io_xfer_mode(&xfer_mode) < 0)
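
The core of the H5Dmpio.c change is the widened consensus step: each rank now builds a two-element bitmask ([0] = reasons to break collective I/O, [1] = reasons the rank0-bcast path can't be used) and a bitwise-OR MPI_Allreduce combines them, so the optimization is taken only when no rank objects on either count. A runnable sketch of that consensus pattern, including the under-2GB size test; the cause bit value, element count and datatype size below are made up for illustration:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    unsigned           local_cause[2]  = {0, 0};
    unsigned           global_cause[2] = {0, 0};
    unsigned long long nelmts    = 1000000ULL;  /* assumed # of elements */
    size_t             type_size = 8;           /* assumed datatype size */

    MPI_Init(&argc, &argv);

    /* Example eligibility check: the whole dataset must stay under 2 GiB,
     * mirroring the H5D_MPIO_RANK0_GREATER_THAN_2GB test in the patch. */
    if(nelmts * (unsigned long long)type_size > (2ULL * 1024 * 1024 * 1024) - 1)
        local_cause[1] |= 0x08;   /* stand-in for H5D_MPIO_RANK0_GREATER_THAN_2GB */

    /* Form consensus: a bit set on any rank ends up set on every rank */
    MPI_Allreduce(local_cause, global_cause, 2, MPI_UNSIGNED, MPI_BOR, MPI_COMM_WORLD);

    if(global_cause[0] == 0 && global_cause[1] == 0)
        printf("all ranks agree: use read-rank0-and-bcast\n");

    MPI_Finalize();
    return 0;
}
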
diff --git a/src/H5Dprivate.h b/src/H5Dprivate.h
index aaa3db2..6fb7889 100644
--- a/src/H5Dprivate.h
+++ b/src/H5Dprivate.h
@@ -95,7 +95,13 @@
/* Definitions for all collective chunk instrumentation properties */
#define H5D_XFER_COLL_CHUNK_SIZE sizeof(unsigned)
#define H5D_XFER_COLL_CHUNK_DEF 1
-#define H5D_XFER_COLL_CHUNK_FIX 0
+
+/* General collective I/O instrumentation properties */
+#define H5D_XFER_COLL_RANK0_BCAST_NAME "coll_rank0_bcast"
+
+/* Definitions for general collective I/O instrumentation properties */
+#define H5D_XFER_COLL_RANK0_BCAST_SIZE sizeof(hbool_t)
+#define H5D_XFER_COLL_RANK0_BCAST_DEF FALSE
#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
/* Default temporary buffer size */
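
The new H5D_XFER_COLL_RANK0_BCAST_* symbols define a temporary DXPL property that exists only in instrumented (H5_HAVE_INSTRUMENTED_LIBRARY) builds, so the parallel tests can confirm the optimization actually fired. A hedged sketch of how a test might use it through the generic property API; the helper name and surrounding read are assumptions — only the property name, size and default come from this patch:

#include "hdf5.h"

static int
rank0_bcast_was_used(hid_t dxpl_id)
{
    hbool_t prop = FALSE;   /* matches H5D_XFER_COLL_RANK0_BCAST_DEF */

    /* Register the temporary property on this DXPL (no callbacks needed) */
    H5Pinsert2(dxpl_id, "coll_rank0_bcast", sizeof(hbool_t), &prop,
               NULL, NULL, NULL, NULL, NULL, NULL);

    /* ... perform the H5Dread() with this DXPL here ... */

    /* In an instrumented build, the library writes the property back when the
     * rank0-bcast path was taken */
    H5Pget(dxpl_id, "coll_rank0_bcast", &prop);

    return (int)prop;
}
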
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index d160858..3ab90aa 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -1354,6 +1354,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
int n;
#endif
hbool_t use_view_this_time = FALSE;
+ hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */
herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
@@ -1437,8 +1438,25 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
if(H5FD_mpio_Debug[(int)'r'])
HDfprintf(stdout, "%s: doing MPI collective IO\n", FUNC);
#endif
- if(MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
- HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
+ /* Check whether we should read from rank 0 and broadcast to other ranks */
+ if(H5CX_get_mpio_rank0_bcast()) {
+#ifdef H5FDmpio_DEBUG
+ if(H5FD_mpio_Debug[(int)'r'])
+ HDfprintf(stdout, "%s: doing read-rank0-and-MPI_Bcast\n", FUNC);
+#endif
+ /* Indicate path we've taken */
+ rank0_bcast = TRUE;
+
+ /* Read on rank 0 Bcast to other ranks */
+ if(file->mpi_rank == 0)
+ if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+ if(MPI_SUCCESS != (mpi_code = MPI_Bcast(buf, size_i, buf_type, 0, file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+ } /* end if */
+ else
+ if(MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
} /* end if */
else {
#ifdef H5FDmpio_DEBUG
@@ -1460,13 +1478,26 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
- /* How many bytes were actually read? */
+ /* Only retrieve bytes read if this rank _actually_ participated in I/O */
+ if(!rank0_bcast || (rank0_bcast && file->mpi_rank == 0) ) {
+ /* How many bytes were actually read? */
#if MPI_VERSION >= 3
- if(MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
+ if(MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
#else
- if(MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+ if(MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
#endif
- HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ } /* end if */
+
+ /* If the rank0-bcast feature was used, broadcast the # of bytes read to
+ * other ranks, which didn't perform any I/O.
+ */
+ /* NOTE: This could be optimized further to be combined with the broadcast
+ * of the data. (QAK - 2019/1/2)
+ */
+ if(rank0_bcast)
+ if(MPI_SUCCESS != (mpi_code = MPI_Bcast(&bytes_read, 1, MPI_LONG_LONG, 0, file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
/* Get the type's size */
#if MPI_VERSION >= 3
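
One subtlety in the H5FDmpio.c read path: when only rank 0 performs the read, the MPI_Status on every other rank is never filled in, so the actual byte count has to be broadcast as well (the extra MPI_Bcast above). A tiny sketch of just that step, with a made-up byte count standing in for the MPI_Get_elements_x() result on rank 0:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int       rank;
    long long bytes_read = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if(rank == 0)
        bytes_read = 4096;  /* would come from MPI_Get_elements_x() on rank 0 */

    /* Every other rank learns the count from rank 0 */
    MPI_Bcast(&bytes_read, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);

    printf("rank %d: bytes_read = %lld\n", rank, bytes_read);
    MPI_Finalize();
    return 0;
}
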
diff --git a/src/H5Pdxpl.c b/src/H5Pdxpl.c
index bfc1d93..8338d84 100644
--- a/src/H5Pdxpl.c
+++ b/src/H5Pdxpl.c
@@ -2017,6 +2017,7 @@ done:
FUNC_LEAVE_API(ret_value)
} /* end H5Pget_mpio_actual_io_mode() */
+
/*-------------------------------------------------------------------------
* Function: H5Pget_mpio_no_collective_cause
*
@@ -2053,8 +2054,6 @@ H5Pget_mpio_no_collective_cause(hid_t plist_id, uint32_t *local_no_collective_ca
done:
FUNC_LEAVE_API(ret_value)
} /* end H5Pget_mpio_no_collective_cause() */
-
-
#endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5Ppublic.h b/src/H5Ppublic.h
index 2f094ea..078fe74 100644
--- a/src/H5Ppublic.h
+++ b/src/H5Ppublic.h
@@ -167,7 +167,8 @@ typedef enum H5D_mpio_no_collective_cause_t {
H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES = 0x10,
H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET = 0x20,
H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED = 0x40,
- H5D_MPIO_NO_COLLECTIVE_MAX_CAUSE = 0x80
+ H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE = 0x80,
+ H5D_MPIO_NO_COLLECTIVE_MAX_CAUSE = 0x100
} H5D_mpio_no_collective_cause_t;
/********************/
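
The H5Ppublic.h change adds H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE (0x80) and moves the max-cause sentinel to 0x100, so applications can see when the feasibility check itself failed. A hedged sketch of inspecting that bit after a collective read; the helper name is illustrative and dxpl_id is assumed to be the transfer property list used for the preceding H5Dread():

#include "hdf5.h"
#include <stdint.h>
#include <stdio.h>

static void
report_no_collective_cause(hid_t dxpl_id)
{
    uint32_t local_cause  = 0;
    uint32_t global_cause = 0;

    if(H5Pget_mpio_no_collective_cause(dxpl_id, &local_cause, &global_cause) < 0)
        return;

    if(global_cause & H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE)
        printf("collective I/O was broken because the feasibility check itself "
               "failed on at least one rank\n");
}
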
diff --git a/src/H5T.c b/src/H5T.c
index 01ace87..9544488 100644
--- a/src/H5T.c
+++ b/src/H5T.c
@@ -295,6 +295,7 @@ static htri_t H5T__compiler_conv(H5T_t *src, H5T_t *dst);
static herr_t H5T__set_size(H5T_t *dt, size_t size);
static herr_t H5T__close_cb(H5T_t *dt);
static H5T_path_t *H5T__path_find_real(const H5T_t *src, const H5T_t *dst, const char *name, H5T_conv_func_t *conv);
+static hbool_t H5T__detect_reg_ref(const H5T_t *dt);
/*****************************/
@@ -5506,6 +5507,111 @@ done:
/*-------------------------------------------------------------------------
+ * Function: H5T__detect_reg_ref
+ *
+ * Purpose: Check whether a datatype contains (or is) a region reference
+ * datatype.
+ *
+ * Return: TRUE (1) or FALSE (0) on success
+ * (Can't fail)
+ *
+ * Programmer: Quincey Koziol
+ * Saturday, January 5, 2019
+ *
+ *-------------------------------------------------------------------------
+ */
+static hbool_t
+H5T__detect_reg_ref(const H5T_t *dt)
+{
+ unsigned u; /* Local index variable */
+ hbool_t ret_value = FALSE; /* Return value */
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Sanity checks */
+ HDassert(dt);
+
+ /* Check if this datatype is a region reference */
+ if(H5T_REFERENCE == dt->shared->type && H5R_DATASET_REGION == dt->shared->u.atomic.u.r.rtype)
+ HGOTO_DONE(TRUE);
+
+ /* Check for types that might have the correct type as a component */
+ switch(dt->shared->type) {
+ case H5T_COMPOUND:
+ /* Iterate over all the compound datatype's fields */
+ for(u = 0; u < dt->shared->u.compnd.nmembs; u++)
+ /* Recurse on field's datatype */
+ if(H5T__detect_reg_ref(dt->shared->u.compnd.memb[u].type))
+ HGOTO_DONE(TRUE);
+ break;
+
+ case H5T_ARRAY:
+ case H5T_VLEN:
+ case H5T_ENUM:
+ HGOTO_DONE(H5T__detect_reg_ref(dt->shared->parent));
+ break;
+
+ case H5T_NO_CLASS:
+ case H5T_INTEGER:
+ case H5T_FLOAT:
+ case H5T_TIME:
+ case H5T_STRING:
+ case H5T_BITFIELD:
+ case H5T_OPAQUE:
+ case H5T_REFERENCE:
+ case H5T_NCLASSES:
+ default:
+ break;
+ } /* end switch */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5T__detect_reg_ref() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5T_is_vl_storage
+ *
+ * Purpose: Check if a datatype will be stored in a variable-length form.
+ *
+ * Notes: Currently, only variable-length strings & sequences and region
+ * references are stored in a variable-length form.
+ *
+ * Return:
+ * One of two values on success:
+ * TRUE - If the datatype will be stored in a variable-length form
+ * FALSE - If the datatype will NOT be stored in a variable-length form
+ * <0 is returned on failure
+ *
+ * Programmer: Quincey Koziol
+ * Saturday, January 5, 2019
+ *
+ *-------------------------------------------------------------------------
+ */
+htri_t
+H5T_is_vl_storage(const H5T_t *dt)
+{
+ htri_t ret_value = FALSE;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Sanity check */
+ HDassert(dt);
+
+ /* VL and region reference datatypes are stored in variable-length form */
+ if(H5T_detect_class(dt, H5T_VLEN, FALSE))
+ ret_value = TRUE;
+ else if(H5T_detect_class(dt, H5T_REFERENCE, FALSE))
+ ret_value = H5T__detect_reg_ref(dt);
+ else
+ ret_value = FALSE;
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5T_is_vl_storage() */
+
+
+/*-------------------------------------------------------------------------
* Function: H5T_upgrade_version_cb
*
* Purpose: H5T__visit callback to Upgrade the version of a datatype
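
H5T__detect_reg_ref() above walks a datatype recursively because a region reference can hide inside compound, array, VL or enum members. A hedged public-API analogue of the same walk (H5Tdetect_class() alone can only report that some reference is present, not which kind); the function name is illustrative and error checking is omitted:

#include "hdf5.h"

static int
detect_region_ref(hid_t type_id)
{
    H5T_class_t cls   = H5Tget_class(type_id);
    int         found = 0;
    int         u, nmembs;

    /* A bare reference type: check whether it is the region-reference type */
    if(H5T_REFERENCE == cls)
        return H5Tequal(type_id, H5T_STD_REF_DSETREG) > 0;

    if(H5T_COMPOUND == cls) {
        /* Recurse on each compound member's datatype */
        nmembs = H5Tget_nmembers(type_id);
        for(u = 0; u < nmembs && !found; u++) {
            hid_t memb_id = H5Tget_member_type(type_id, (unsigned)u);

            found = detect_region_ref(memb_id);
            H5Tclose(memb_id);
        } /* end for */
    } /* end if */
    else if(H5T_ARRAY == cls || H5T_VLEN == cls || H5T_ENUM == cls) {
        /* Recurse on the base (parent) datatype */
        hid_t super_id = H5Tget_super(type_id);

        found = detect_region_ref(super_id);
        H5Tclose(super_id);
    } /* end else-if */

    return found;
}
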
diff --git a/src/H5Tprivate.h b/src/H5Tprivate.h
index 6b6446f..3dcbb2c 100644
--- a/src/H5Tprivate.h
+++ b/src/H5Tprivate.h
@@ -145,6 +145,7 @@ H5_DLL H5T_t *H5T_get_actual_type(H5T_t *dt);
H5_DLL herr_t H5T_save_refresh_state(hid_t tid, struct H5O_shared_t *cached_H5O_shared);
H5_DLL herr_t H5T_restore_refresh_state(hid_t tid, struct H5O_shared_t *cached_H5O_shared);
H5_DLL hbool_t H5T_already_vol_managed(const H5T_t *dt);
+H5_DLL htri_t H5T_is_vl_storage(const H5T_t *dt);
/* Reference specific functions */
H5_DLL H5R_type_t H5T_get_ref_type(const H5T_t *dt);