author    | M. Scot Breitenfeld <brtnfld@hdfgroup.org> | 2018-12-12 21:39:18 (GMT)
committer | M. Scot Breitenfeld <brtnfld@hdfgroup.org> | 2019-01-10 23:10:05 (GMT)
commit    | 6e489435e9ca484a4927d3bb83f2c0058147d7ba (patch)
tree      | 8aab3b95bda0cee3ffdd4fa0ea018172fb074fe7 /src
parent    | a043211d9edf2df3ea8049bda4be6bdcbf0970e9 (diff)
merged:
HDFFV-10652
HDFFV-10443
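
Together, the merged changes teach the collective read path to service a read of an entire (H5S_ALL), contiguous, fixed-size dataset smaller than 2 GB by having rank 0 read the data and MPI_Bcast it to the remaining ranks, and they consolidate the parallel-write checks for variable-length and region reference datatypes into a new H5T_is_vl_storage() routine. As a rough sketch of an application pattern that could qualify for the new read-with-rank0-and-bcast path (the file and dataset names below are hypothetical, not part of this change):

```c
#include <mpi.h>
#include <hdf5.h>

int
main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    /* Open the file with the MPI-IO file driver */
    hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);
    hid_t file = H5Fopen("example.h5", H5F_ACC_RDONLY, fapl);

    /* Request collective transfer; with this change, the library may satisfy
     * the read by having rank 0 read and MPI_Bcast to the other ranks. */
    hid_t dxpl = H5Pcreate(H5P_DATASET_XFER);
    H5Pset_dxpl_mpio(dxpl, H5FD_MPIO_COLLECTIVE);

    /* Every rank reads the whole (small, contiguous, fixed-size) dataset */
    hid_t dset = H5Dopen2(file, "/small_dset", H5P_DEFAULT);
    double buf[1024];   /* Assumes the dataset holds 1024 doubles */
    H5Dread(dset, H5T_NATIVE_DOUBLE, H5S_ALL, H5S_ALL, dxpl, buf);

    H5Dclose(dset);
    H5Pclose(dxpl);
    H5Fclose(file);
    H5Pclose(fapl);
    MPI_Finalize();
    return 0;
}
```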
Diffstat (limited to 'src')
-rw-r--r-- | src/H5CX.c        |  92
-rw-r--r-- | src/H5CXprivate.h |   4
-rw-r--r-- | src/H5Dio.c       |  18
-rw-r--r-- | src/H5Dmpio.c     | 121
-rw-r--r-- | src/H5Dprivate.h  |   8
-rw-r--r-- | src/H5FDmpio.c    |  89
-rw-r--r-- | src/H5Pdxpl.c     |   3
-rw-r--r-- | src/H5Ppublic.h   |   3
-rw-r--r-- | src/H5T.c         | 109
-rw-r--r-- | src/H5Tprivate.h  |   1
10 files changed, 374 insertions, 74 deletions
diff --git a/src/H5CX.c b/src/H5CX.c
--- a/src/H5CX.c
+++ b/src/H5CX.c
@@ -198,6 +198,7 @@ typedef struct H5CX_t {
     MPI_Datatype btype;         /* MPI datatype for buffer, when using collective I/O */
     MPI_Datatype ftype;         /* MPI datatype for file, when using collective I/O */
     hbool_t mpi_file_flushing;  /* Whether an MPI-opened file is being flushed */
+    hbool_t rank0_bcast;        /* Whether a dataset meets read-with-rank0-and-bcast requirements */
 #endif /* H5_HAVE_PARALLEL */
 
     /* Cached DXPL properties */
@@ -261,6 +262,8 @@ typedef struct H5CX_t {
     hbool_t mpio_coll_chunk_multi_ratio_coll_set;   /* Whether instrumented "collective chunk multi ratio coll" value is set */
     int mpio_coll_chunk_multi_ratio_ind;            /* Instrumented "collective chunk multi ratio ind" value (H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME) */
     hbool_t mpio_coll_chunk_multi_ratio_ind_set;    /* Whether instrumented "collective chunk multi ratio ind" value is set */
+    hbool_t mpio_coll_rank0_bcast;                  /* Instrumented "collective rank 0 bcast" value (H5D_XFER_COLL_RANK0_BCAST_NAME) */
+    hbool_t mpio_coll_rank0_bcast_set;              /* Whether instrumented "collective rank 0 bcast" value is set */
 #endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
 #endif /* H5_HAVE_PARALLEL */
@@ -1106,6 +1109,32 @@ H5CX_get_mpi_file_flushing(void)
     FUNC_LEAVE_NOAPI((*head)->ctx.mpi_file_flushing)
 } /* end H5CX_get_mpi_file_flushing() */
+
+
+/*-------------------------------------------------------------------------
+ * Function:    H5CX_get_mpio_rank0_bcast
+ *
+ * Purpose:     Retrieves whether the dataset meets the read-with-rank0-and-bcast
+ *              requirements for the current API call context.
+ *
+ * Return:      TRUE / FALSE (can't fail)
+ *
+ * Programmer:  M. Breitenfeld
+ *              December 31, 2018
+ *
+ *-------------------------------------------------------------------------
+ */
+hbool_t
+H5CX_get_mpio_rank0_bcast(void)
+{
+    H5CX_node_t **head = H5CX_get_my_context();     /* Get the pointer to the head of the API context, for this thread */
+
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+    /* Sanity check */
+    HDassert(head && *head);
+
+    FUNC_LEAVE_NOAPI((*head)->ctx.rank0_bcast)
+} /* end H5CX_get_mpio_rank0_bcast() */
 #endif /* H5_HAVE_PARALLEL */
@@ -2037,6 +2066,34 @@ H5CX_set_mpi_file_flushing(hbool_t flushing)
     FUNC_LEAVE_NOAPI_VOID
 } /* end H5CX_set_mpi_file_flushing() */
+
+
+/*-------------------------------------------------------------------------
+ * Function:    H5CX_set_mpio_rank0_bcast
+ *
+ * Purpose:     Sets the "dataset meets read-with-rank0-and-bcast requirements"
+ *              flag for the current API call context.
+ *
+ * Return:      <none>
+ *
+ * Programmer:  M. Breitenfeld
+ *              December 31, 2018
+ *
+ *-------------------------------------------------------------------------
+ */
+void
+H5CX_set_mpio_rank0_bcast(hbool_t rank0_bcast)
+{
+    H5CX_node_t **head = H5CX_get_my_context();     /* Get the pointer to the head of the API context, for this thread */
+
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+    /* Sanity checks */
+    HDassert(head && *head);
+
+    (*head)->ctx.rank0_bcast = rank0_bcast;
+
+    FUNC_LEAVE_NOAPI_VOID
+} /* end H5CX_set_mpio_rank0_bcast() */
 #endif /* H5_HAVE_PARALLEL */
@@ -2448,6 +2505,40 @@ H5CX_test_set_mpio_coll_chunk_multi_ratio_ind(int mpio_coll_chunk_multi_ratio_in
 done:
     FUNC_LEAVE_NOAPI(ret_value)
 } /* end H5CX_test_set_mpio_coll_chunk_multi_ratio_ind() */
+
+
+/*-------------------------------------------------------------------------
+ * Function:    H5CX_test_set_mpio_coll_rank0_bcast
+ *
+ * Purpose:     Sets the instrumented "read-with-rank0-and-bcast" flag for the
+ *              current API call context.
+ *
+ * Note:        Only sets value if property set in DXPL
+ *
+ * Return:      Non-negative on success / Negative on failure
+ *
+ * Programmer:  Quincey Koziol
+ *              January 2, 2019
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5CX_test_set_mpio_coll_rank0_bcast(hbool_t mpio_coll_rank0_bcast)
+{
+    H5CX_node_t **head = H5CX_get_my_context();     /* Get the pointer to the head of the API context, for this thread */
+    herr_t ret_value = SUCCEED;                     /* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    /* Sanity checks */
+    HDassert(head && *head);
+    HDassert(!((*head)->ctx.dxpl_id == H5P_DEFAULT ||
+            (*head)->ctx.dxpl_id == H5P_DATASET_XFER_DEFAULT));
+
+    H5CX_TEST_SET_PROP(H5D_XFER_COLL_RANK0_BCAST_NAME, mpio_coll_rank0_bcast)
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5CX_test_set_mpio_coll_rank0_bcast() */
 #endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
 #endif /* H5_HAVE_PARALLEL */
@@ -2488,6 +2579,7 @@ H5CX__pop_common(void)
     H5CX_SET_PROP(H5D_XFER_COLL_CHUNK_LINK_NUM_FALSE_NAME, mpio_coll_chunk_link_num_false)
     H5CX_SET_PROP(H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME, mpio_coll_chunk_multi_ratio_coll)
     H5CX_SET_PROP(H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME, mpio_coll_chunk_multi_ratio_ind)
+    H5CX_SET_PROP(H5D_XFER_COLL_RANK0_BCAST_NAME, mpio_coll_rank0_bcast)
 #endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
 #endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5CXprivate.h b/src/H5CXprivate.h
index 44a4067..6b8e3b2 100644
--- a/src/H5CXprivate.h
+++ b/src/H5CXprivate.h
@@ -73,6 +73,7 @@ H5_DLL H5AC_ring_t H5CX_get_ring(void);
 H5_DLL hbool_t H5CX_get_coll_metadata_read(void);
 H5_DLL herr_t H5CX_get_mpi_coll_datatypes(MPI_Datatype *btype, MPI_Datatype *ftype);
 H5_DLL hbool_t H5CX_get_mpi_file_flushing(void);
+H5_DLL hbool_t H5CX_get_mpio_rank0_bcast(void);
 #endif /* H5_HAVE_PARALLEL */
 
 /* "Getter" routines for DXPL properties cached in API context */
@@ -108,6 +109,7 @@ H5_DLL void H5CX_set_coll_metadata_read(hbool_t cmdr);
 H5_DLL herr_t H5CX_set_mpi_coll_datatypes(MPI_Datatype btype, MPI_Datatype ftype);
 H5_DLL herr_t H5CX_set_mpio_coll_opt(H5FD_mpio_collective_opt_t mpio_coll_opt);
 H5_DLL void H5CX_set_mpi_file_flushing(hbool_t flushing);
+H5_DLL void H5CX_set_mpio_rank0_bcast(hbool_t rank0_bcast);
 #endif /* H5_HAVE_PARALLEL */
 
 /* "Setter" routines for DXPL properties cached in API context */
@@ -126,6 +128,7 @@ H5_DLL void H5CX_set_mpio_actual_chunk_opt(H5D_mpio_actual_chunk_opt_mode_t chun
 H5_DLL void H5CX_set_mpio_actual_io_mode(H5D_mpio_actual_io_mode_t actual_io_mode);
 H5_DLL void H5CX_set_mpio_local_no_coll_cause(uint32_t mpio_local_no_coll_cause);
 H5_DLL void H5CX_set_mpio_global_no_coll_cause(uint32_t mpio_global_no_coll_cause);
+H5_DLL void H5CX_set_mpio_Proc0_BCast(hbool_t mpio_Proc0_BCast);
 #ifdef H5_HAVE_INSTRUMENTED_LIBRARY
 H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_link_hard(int mpio_coll_chunk_link_hard);
 H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_multi_hard(int mpio_coll_chunk_multi_hard);
@@ -133,6 +136,7 @@ H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_link_num_true(int mpio_coll_chunk_li
 H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_link_num_false(int mpio_coll_chunk_link_num_false);
 H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_multi_ratio_coll(int mpio_coll_chunk_multi_ratio_coll);
 H5_DLL herr_t H5CX_test_set_mpio_coll_chunk_multi_ratio_ind(int mpio_coll_chunk_multi_ratio_ind);
+H5_DLL herr_t H5CX_test_set_mpio_coll_rank0_bcast(hbool_t rank0_bcast);
 #endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
 #endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5Dio.c b/src/H5Dio.c
index ed384ca..c762389 100644
--- a/src/H5Dio.c
+++ b/src/H5Dio.c
@@ -685,22 +685,12 @@ H5D__write(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space,
 
     /* Various MPI based checks */
 #ifdef H5_HAVE_PARALLEL
-    if H5F_HAS_FEATURE(dataset->oloc.file, H5FD_FEAT_HAS_MPI) {
-        /* If MPI based VFD is used, no VL datatype support yet. */
+    if(H5F_HAS_FEATURE(dataset->oloc.file, H5FD_FEAT_HAS_MPI)) {
+        /* If MPI based VFD is used, no VL or region reference datatype support yet. */
         /* This is because they use the global heap in the file and we don't */
         /* support parallel access of that yet */
-        if(H5T_detect_class(type_info.mem_type, H5T_VLEN, FALSE) > 0)
-            HGOTO_ERROR(H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing VL datatypes yet")
-
-        /* If MPI based VFD is used, no VL datatype support yet. */
-        /* This is because they use the global heap in the file and we don't */
-        /* support parallel access of that yet */
-        /* We should really use H5T_detect_class() here, but it will be difficult
-         * to detect the type of the reference if it is nested... -QAK
-         */
-        if(H5T_get_class(type_info.mem_type, TRUE) == H5T_REFERENCE &&
-                H5T_get_ref_type(type_info.mem_type) == H5R_DATASET_REGION)
-            HGOTO_ERROR(H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing region reference datatypes yet")
+        if(H5T_is_vl_storage(type_info.mem_type) > 0)
+            HGOTO_ERROR(H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing VL or region reference datatypes yet")
     } /* end if */
     else {
         H5FD_mpio_xfer_t io_xfer_mode;      /* MPI I/O transfer mode */
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index baec4c4..20b2066 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -37,7 +37,6 @@
 #include "H5Eprivate.h"         /* Error handling */
 #include "H5Fprivate.h"         /* File access */
 #include "H5FDprivate.h"        /* File drivers */
-#include "H5FDmpi.h"            /* MPI-based file drivers */
 #include "H5Iprivate.h"         /* IDs */
 #include "H5MMprivate.h"        /* Memory management */
 #include "H5Oprivate.h"         /* Object headers */
@@ -83,16 +82,30 @@
 /* Macros to represent the regularity of the selection for multiple chunk IO case. */
 #define H5D_CHUNK_SELECT_REG 1
 
+/* Macros for reasons to not enable read-proc-and-bcast. */
+#define H5D_MPIO_PROC0_BCAST        0x00
+#define H5D_MPIO_NOT_H5S_ALL        0x01
+#define H5D_MPIO_GREATER_THAN_2GB   0x02
 
 /******************/
 /* Local Typedefs */
 /******************/
 /* Combine chunk address and chunk info into a struct for better performance. */
 typedef struct H5D_chunk_addr_info_t {
-  haddr_t chunk_addr;
-  H5D_chunk_info_t chunk_info;
+    haddr_t chunk_addr;
+    H5D_chunk_info_t chunk_info;
 } H5D_chunk_addr_info_t;
 
+/* Rank 0 Bcast values */
+typedef enum H5D_mpio_no_rank0_bcast_cause_t {
+    H5D_MPIO_RANK0_BCAST            = 0x00,
+    H5D_MPIO_RANK0_NOT_H5S_ALL      = 0x01,
+    H5D_MPIO_RANK0_NOT_CONTIGUOUS   = 0x02,
+    H5D_MPIO_RANK0_NOT_FIXED_SIZE   = 0x04,
+    H5D_MPIO_RANK0_GREATER_THAN_2GB = 0x08
+} H5D_mpio_no_rank0_bcast_cause_t;
+
+
 /*
  * Information about a single chunk when performing collective filtered I/O. All
  * of the fields of one of these structs are initialized at the start of collective
@@ -281,9 +294,12 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space,
     const H5S_t *mem_space, const H5D_type_info_t *type_info)
 {
     H5FD_mpio_xfer_t io_xfer_mode;      /* MPI I/O transfer mode */
-    unsigned local_cause = 0;           /* Local reason(s) for breaking collective mode */
-    unsigned global_cause = 0;          /* Global reason(s) for breaking collective mode */
+    unsigned local_cause[2] = {0,0};    /* [0] Local reason(s) for breaking collective mode */
+                                        /* [1] Flag if dataset is both: H5S_ALL and small */
+    unsigned global_cause[2] = {0,0};   /* Global reason(s) for breaking collective mode */
+    htri_t is_vl_storage;               /* Whether the dataset's datatype is stored in a variable-length form */
     htri_t ret_value = SUCCEED;         /* Return value */
+    hbool_t H5FD_MPIO_Proc0_BCast;      /* Flag if dataset is both: H5S_ALL and < 2GB */
 
     FUNC_ENTER_PACKAGE
@@ -296,36 +312,37 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space,
 
     /* For independent I/O, get out quickly and don't try to form consensus */
     if(H5CX_get_io_xfer_mode(&io_xfer_mode) < 0)
-        HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
+        /* Set error flag, but keep going */
+        local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE;
     if(io_xfer_mode == H5FD_MPIO_INDEPENDENT)
-        local_cause |= H5D_MPIO_SET_INDEPENDENT;
+        local_cause[0] |= H5D_MPIO_SET_INDEPENDENT;
 
     /* Optimized MPI types flag must be set */
     /* (based on 'HDF5_MPI_OPT_TYPES' environment variable) */
     if(!H5FD_mpi_opt_types_g)
-        local_cause |= H5D_MPIO_MPI_OPT_TYPES_ENV_VAR_DISABLED;
+        local_cause[0] |= H5D_MPIO_MPI_OPT_TYPES_ENV_VAR_DISABLED;
 
     /* Don't allow collective operations if datatype conversions need to happen */
     if(!type_info->is_conv_noop)
-        local_cause |= H5D_MPIO_DATATYPE_CONVERSION;
+        local_cause[0] |= H5D_MPIO_DATATYPE_CONVERSION;
 
     /* Don't allow collective operations if data transform operations should occur */
     if(!type_info->is_xform_noop)
-        local_cause |= H5D_MPIO_DATA_TRANSFORMS;
+        local_cause[0] |= H5D_MPIO_DATA_TRANSFORMS;
 
     /* Check whether these are both simple or scalar dataspaces */
     if(!((H5S_SIMPLE == H5S_GET_EXTENT_TYPE(mem_space) || H5S_SCALAR == H5S_GET_EXTENT_TYPE(mem_space))
            && (H5S_SIMPLE == H5S_GET_EXTENT_TYPE(file_space) || H5S_SCALAR == H5S_GET_EXTENT_TYPE(file_space))))
-        local_cause |= H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES;
+        local_cause[0] |= H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES;
 
     /* Dataset storage must be contiguous or chunked */
     if(!(io_info->dset->shared->layout.type == H5D_CONTIGUOUS ||
           io_info->dset->shared->layout.type == H5D_CHUNKED))
-        local_cause |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET;
+        local_cause[0] |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET;
 
     /* check if external-file storage is used */
     if(io_info->dset->shared->dcpl_cache.efl.nused > 0)
-        local_cause |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET;
+        local_cause[0] |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET;
 
     /* The handling of memory space is different for chunking and contiguous
      * storage.  For contiguous storage, mem_space and file_space won't change
@@ -340,31 +357,85 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space,
      * is less than 3. The functions needed (MPI_Mprobe and MPI_Imrecv) will
      * not be available.
      */
-    if (io_info->op_type == H5D_IO_OP_WRITE &&
-            io_info->dset->shared->layout.type == H5D_CHUNKED &&
-            io_info->dset->shared->dcpl_cache.pline.nused > 0)
-        local_cause |= H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED;
+    if(io_info->op_type == H5D_IO_OP_WRITE &&
+            io_info->dset->shared->layout.type == H5D_CHUNKED &&
+            io_info->dset->shared->dcpl_cache.pline.nused > 0)
+        local_cause[0] |= H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED;
 #endif
 
+    /* Check if we are able to do a MPI_Bcast of the data from one rank
+     * instead of having all the processes involved in the collective I/O call.
+     */
+
+    /* Check to see if the process is reading the entire dataset */
+    if(H5S_GET_SELECT_TYPE(file_space) != H5S_SEL_ALL)
+        local_cause[1] |= H5D_MPIO_RANK0_NOT_H5S_ALL;
+    /* Only perform this optimization for contiguous datasets, currently */
+    else if(H5D_CONTIGUOUS != io_info->dset->shared->layout.type)
+        /* Flag to do a MPI_Bcast of the data from one proc instead of
+         * having all the processes involved in the collective I/O.
+         */
+        local_cause[1] |= H5D_MPIO_RANK0_NOT_CONTIGUOUS;
+    else if((is_vl_storage = H5T_is_vl_storage(type_info->dset_type)) < 0)
+        local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE;
+    else if(is_vl_storage)
+        local_cause[1] |= H5D_MPIO_RANK0_NOT_FIXED_SIZE;
+    else {
+        size_t type_size;       /* Size of dataset's datatype */
+
+        /* Retrieve the size of the dataset's datatype */
+        if(0 == (type_size = H5T_GET_SIZE(type_info->dset_type)))
+            local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE;
+        else {
+            hssize_t snelmts;   /* [Signed] # of elements in dataset's dataspace */
+
+            /* Retrieve the number of elements in the dataset's dataspace */
+            if((snelmts = H5S_GET_EXTENT_NPOINTS(file_space)) < 0)
+                local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE;
+            else {
+                hsize_t dset_size;
+
+                /* Determine dataset size */
+                dset_size = ((hsize_t)snelmts) * type_size;
+
+                /* If the size of the dataset is less than 2GB then do an MPI_Bcast
+                 * of the data from one process instead of having all the processes
+                 * involved in the collective I/O.
+                 */
+                if(dset_size > ((hsize_t)(2.0F * H5_GB) - 1))
+                    local_cause[1] |= H5D_MPIO_RANK0_GREATER_THAN_2GB;
+            } /* end else */
+        } /* end else */
+    } /* end else */
+
     /* Check for independent I/O */
-    if(local_cause & H5D_MPIO_SET_INDEPENDENT)
-        global_cause = local_cause;
+    if(local_cause[0] & H5D_MPIO_SET_INDEPENDENT)
+        global_cause[0] = local_cause[0];
     else {
         int mpi_code;               /* MPI error code */
 
         /* Form consensus opinion among all processes about whether to perform
         * collective I/O
         */
-        if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_cause, &global_cause, 1, MPI_UNSIGNED, MPI_BOR, io_info->comm)))
+        if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_cause, &global_cause, 2, MPI_UNSIGNED, MPI_BOR, io_info->comm)))
            HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
+
    } /* end else */
 
    /* Set the local & global values of no-collective-cause in the API context */
-    H5CX_set_mpio_local_no_coll_cause(local_cause);
-    H5CX_set_mpio_global_no_coll_cause(global_cause);
+    H5CX_set_mpio_local_no_coll_cause(local_cause[0]);
+    H5CX_set_mpio_global_no_coll_cause(global_cause[0]);
+
+    /* Set read-with-rank0-and-bcast flag if possible */
+    if(global_cause[0] == 0 && global_cause[1] == 0) {
+        H5CX_set_mpio_rank0_bcast(TRUE);
+#ifdef H5_HAVE_INSTRUMENTED_LIBRARY
+        H5CX_test_set_mpio_coll_rank0_bcast(TRUE);
+#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
+    } /* end if */
 
    /* Set the return value, based on the global cause */
-    ret_value = global_cause > 0 ? FALSE : TRUE;
+    ret_value = global_cause[0] > 0 ? FALSE : TRUE;
 
 done:
    FUNC_LEAVE_NOAPI(ret_value)
@@ -3076,8 +3147,8 @@ H5D__filtered_collective_chunk_entry_io(H5D_filtered_collective_io_info_t *chunk
         chunk_entry->chunk_states.new_chunk.length = chunk_entry->chunk_states.chunk_current.length;
 
         /* Currently, these chunk reads are done independently and will likely
-         * cause issues with collective metadata reads enabled. In the future,
-         * this should be refactored to use collective chunk reads - JTH */
+         * cause issues with collective metadata reads enabled. In the future,
+         * this should be refactored to use collective chunk reads - JTH */
 
         /* Get the original state of parallel I/O transfer mode */
         if(H5CX_get_io_xfer_mode(&xfer_mode) < 0)
diff --git a/src/H5Dprivate.h b/src/H5Dprivate.h
index ccbae9d..00e1e28 100644
--- a/src/H5Dprivate.h
+++ b/src/H5Dprivate.h
@@ -95,7 +95,13 @@
 /* Definitions for all collective chunk instrumentation properties */
 #define H5D_XFER_COLL_CHUNK_SIZE        sizeof(unsigned)
 #define H5D_XFER_COLL_CHUNK_DEF         1
-#define H5D_XFER_COLL_CHUNK_FIX         0
+
+/* General collective I/O instrumentation properties */
+#define H5D_XFER_COLL_RANK0_BCAST_NAME  "coll_rank0_bcast"
+
+/* Definitions for general collective I/O instrumentation properties */
+#define H5D_XFER_COLL_RANK0_BCAST_SIZE  sizeof(hbool_t)
+#define H5D_XFER_COLL_RANK0_BCAST_DEF   FALSE
 #endif /* H5_HAVE_INSTRUMENTED_LIBRARY */
 
 /* Default temporary buffer size */
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index 87f8b6a..689f43c 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -1414,31 +1414,32 @@
 static herr_t
 H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
     hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size, void *buf/*out*/)
 {
-    H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
-    MPI_Offset mpi_off;
-    MPI_Status mpi_stat; /* Status from I/O operation */
-    int mpi_code; /* mpi return code */
-    MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
-    int size_i; /* Integer copy of 'size' to read */
+    H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
+    MPI_Offset mpi_off;
+    MPI_Status mpi_stat;                /* Status from I/O operation */
+    int mpi_code;                       /* mpi return code */
+    MPI_Datatype buf_type = MPI_BYTE;   /* MPI description of the selection in memory */
+    int size_i;                         /* Integer copy of 'size' to read */
 #if MPI_VERSION >= 3
-    MPI_Count bytes_read; /* Number of bytes read in */
+    MPI_Count bytes_read = 0;           /* Number of bytes read in */
     MPI_Count type_size;                /* MPI datatype used for I/O's size */
     MPI_Count io_size;                  /* Actual number of bytes requested */
     MPI_Count n;
 #else
-    int bytes_read; /* Number of bytes read in */
+    int bytes_read = 0;                 /* Number of bytes read in */
     int type_size;                      /* MPI datatype used for I/O's size */
     int io_size;                        /* Actual number of bytes requested */
-    int n;
+    int n;
 #endif
-    hbool_t use_view_this_time = FALSE;
-    herr_t ret_value = SUCCEED;
+    hbool_t use_view_this_time = FALSE;
+    hbool_t rank0_bcast = FALSE;        /* If read-with-rank0-and-bcast flag was used */
+    herr_t ret_value = SUCCEED;
 
     FUNC_ENTER_NOAPI_NOINIT
 
 #ifdef H5FDmpio_DEBUG
     if (H5FD_mpio_Debug[(int)'t'])
-        fprintf(stdout, "Entering H5FD_mpio_read\n" );
+        fprintf(stdout, "Entering H5FD_mpio_read\n");
 #endif
     HDassert(file);
     HDassert(H5FD_MPIO==file->pub.driver_id);
@@ -1457,12 +1458,12 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
 #ifdef H5FDmpio_DEBUG
     if (H5FD_mpio_Debug[(int)'r'])
         fprintf(stdout, "in H5FD_mpio_read mpi_off=%ld size_i=%d\n",
-                (long)mpi_off, size_i );
+                (long)mpi_off, size_i);
 #endif
 
     /* Only look for MPI views for raw data transfers */
     if(type == H5FD_MEM_DRAW) {
-        H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
+        H5FD_mpio_xfer_t xfer_mode;     /* I/O transfer mode */
 
         /* Get the transfer mode from the API context */
         if(H5CX_get_io_xfer_mode(&xfer_mode) < 0)
@@ -1475,7 +1476,7 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
          * could mean "use MPI_BYTE" by convention).
          */
         if(xfer_mode==H5FD_MPIO_COLLECTIVE) {
-            MPI_Datatype file_type;
+            MPI_Datatype file_type;
 
             /* Remember that views are used */
             use_view_this_time = TRUE;
@@ -1502,8 +1503,8 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
         H5FD_mpio_collective_opt_t coll_opt_mode;
 
 #ifdef H5FDmpio_DEBUG
-        if (H5FD_mpio_Debug[(int)'t'])
-            fprintf(stdout, "H5FD_mpio_read: using MPIO collective mode\n");
+        if(H5FD_mpio_Debug[(int)'t'])
+            fprintf(stdout, "H5FD_mpio_read: using MPIO collective mode\n");
 #endif
         /* Get the collective_opt property to check whether the application wants to do IO individually. */
         if(H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
@@ -1514,8 +1515,25 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
             if(H5FD_mpio_Debug[(int)'t'])
                 fprintf(stdout, "H5FD_mpio_read: doing MPI collective IO\n");
 #endif
-            if(MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
-                HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
+            /* Check whether we should read from rank 0 and broadcast to other ranks */
+            if(H5CX_get_mpio_rank0_bcast()) {
+#ifdef H5FDmpio_DEBUG
+                if(H5FD_mpio_Debug[(int)'r'])
+                    HDfprintf(stdout, "%s: doing read-rank0-and-MPI_Bcast\n", FUNC);
+#endif
+                /* Indicate path we've taken */
+                rank0_bcast = TRUE;
+
+                /* Read on rank 0, then Bcast to other ranks */
+                if(file->mpi_rank == 0)
+                    if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
+                        HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+                if(MPI_SUCCESS != (mpi_code = MPI_Bcast(buf, size_i, buf_type, 0, file->comm)))
+                    HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+            } /* end if */
+            else
+                if(MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
+                    HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
         } /* end if */
         else {
 #ifdef H5FDmpio_DEBUG
@@ -1534,24 +1552,37 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
                 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
     } else {
         if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
-            HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+            HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
     }
 
-    /* How many bytes were actually read? */
+    /* Only retrieve bytes read if this rank _actually_ participated in I/O */
+    if(!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) {
+        /* How many bytes were actually read? */
 #if MPI_VERSION >= 3
-    if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
+        if(MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
 #else
-    if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+        if(MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
 #endif
-        HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+            HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+    } /* end if */
+
+    /* If the rank0-bcast feature was used, broadcast the # of bytes read to
+     * other ranks, which didn't perform any I/O.
+     */
+    /* NOTE: This could be optimized further to be combined with the broadcast
+     * of the data.  (QAK - 2019/1/2)
+     */
+    if(rank0_bcast)
+        if(MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_LONG_LONG, 0, file->comm))
+            HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0)
 
     /* Get the type's size */
 #if MPI_VERSION >= 3
-    if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
+    if(MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
 #else
-    if (MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
+    if(MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
 #endif
-        HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
+        HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
 
     /* Compute the actual number of bytes requested */
     io_size=type_size*size_i;
@@ -1564,12 +1595,12 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
      * This gives us zeroes beyond end of physical MPI file.
      */
     if ((n=(io_size-bytes_read)) > 0)
-        HDmemset((char*)buf+bytes_read, 0, (size_t)n);
+        HDmemset((char*)buf+bytes_read, 0, (size_t)n);
 
 done:
 #ifdef H5FDmpio_DEBUG
     if (H5FD_mpio_Debug[(int)'t'])
-        fprintf(stdout, "Leaving H5FD_mpio_read\n" );
+        fprintf(stdout, "Leaving H5FD_mpio_read\n");
 #endif
 
     FUNC_LEAVE_NOAPI(ret_value)
diff --git a/src/H5Pdxpl.c b/src/H5Pdxpl.c
index dcfe89f..1c97589 100644
--- a/src/H5Pdxpl.c
+++ b/src/H5Pdxpl.c
@@ -2017,6 +2017,7 @@ done:
     FUNC_LEAVE_API(ret_value)
 } /* end H5Pget_mpio_actual_io_mode() */
 
+
 /*-------------------------------------------------------------------------
  * Function:    H5Pget_mpio_no_collective_cause
  *
@@ -2053,8 +2054,6 @@ H5Pget_mpio_no_collective_cause(hid_t plist_id, uint32_t *local_no_collective_ca
 done:
     FUNC_LEAVE_API(ret_value)
 } /* end H5Pget_mpio_no_collective_cause() */
-
-
 #endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5Ppublic.h b/src/H5Ppublic.h
index b566085..5d6dea8 100644
--- a/src/H5Ppublic.h
+++ b/src/H5Ppublic.h
@@ -165,7 +165,8 @@ typedef enum H5D_mpio_no_collective_cause_t {
     H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES = 0x10,
     H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET = 0x20,
     H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED = 0x40,
-    H5D_MPIO_NO_COLLECTIVE_MAX_CAUSE = 0x80
+    H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE = 0x80,
+    H5D_MPIO_NO_COLLECTIVE_MAX_CAUSE = 0x100
 } H5D_mpio_no_collective_cause_t;
 
 /********************/
diff --git a/src/H5T.c b/src/H5T.c
--- a/src/H5T.c
+++ b/src/H5T.c
@@ -301,8 +301,8 @@ static herr_t H5T__unregister(H5T_pers_t pers, const char *name, H5T_t *src,
 static htri_t H5T__compiler_conv(H5T_t *src, H5T_t *dst);
 static herr_t H5T__set_size(H5T_t *dt, size_t size);
 static herr_t H5T__close_cb(H5T_t *dt);
-static H5T_path_t *H5T__path_find_real(const H5T_t *src, const H5T_t *dst,
-    const char *name, H5T_conv_func_t *conv);
+static H5T_path_t *H5T__path_find_real(const H5T_t *src, const H5T_t *dst, const char *name, H5T_conv_func_t *conv);
+static hbool_t H5T__detect_reg_ref(const H5T_t *dt);
 
 
 /*****************************/
@@ -5517,6 +5517,111 @@ done:
 
 
 /*-------------------------------------------------------------------------
+ * Function:    H5T__detect_reg_ref
+ *
+ * Purpose:     Check whether a datatype contains (or is) a region reference
+ *              datatype.
+ *
+ * Return:      TRUE (1) or FALSE (0) on success
+ *              (Can't fail)
+ *
+ * Programmer:  Quincey Koziol
+ *              Saturday, January 5, 2019
+ *
+ *-------------------------------------------------------------------------
+ */
+static hbool_t
+H5T__detect_reg_ref(const H5T_t *dt)
+{
+    unsigned u;                 /* Local index variable */
+    hbool_t ret_value = FALSE;  /* Return value */
+
+    FUNC_ENTER_STATIC_NOERR
+
+    /* Sanity checks */
+    HDassert(dt);
+
+    /* Check if this datatype is a region reference */
+    if(H5T_REFERENCE == dt->shared->type && H5R_DATASET_REGION == dt->shared->u.atomic.u.r.rtype)
+        HGOTO_DONE(TRUE);
+
+    /* Check for types that might have the correct type as a component */
+    switch(dt->shared->type) {
+        case H5T_COMPOUND:
+            /* Iterate over all the compound datatype's fields */
+            for(u = 0; u < dt->shared->u.compnd.nmembs; u++)
+                /* Recurse on field's datatype */
+                if(H5T__detect_reg_ref(dt->shared->u.compnd.memb[u].type))
+                    HGOTO_DONE(TRUE);
+            break;
+
+        case H5T_ARRAY:
+        case H5T_VLEN:
+        case H5T_ENUM:
+            HGOTO_DONE(H5T__detect_reg_ref(dt->shared->parent));
+            break;
+
+        case H5T_NO_CLASS:
+        case H5T_INTEGER:
+        case H5T_FLOAT:
+        case H5T_TIME:
+        case H5T_STRING:
+        case H5T_BITFIELD:
+        case H5T_OPAQUE:
+        case H5T_REFERENCE:
+        case H5T_NCLASSES:
+        default:
+            break;
+    } /* end switch */
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5T__detect_reg_ref() */
+
+
+/*-------------------------------------------------------------------------
+ * Function:    H5T_is_vl_storage
+ *
+ * Purpose:     Check if a datatype will be stored in a variable-length form.
+ *
+ * Notes:       Currently, only variable-length strings & sequences and region
+ *              references are stored in a variable-length form.
+ *
+ * Return:
+ *  One of two values on success:
+ *      TRUE - If the datatype will be stored in a variable-length form
+ *      FALSE - If the datatype will NOT be stored in a variable-length form
+ *  <0 is returned on failure
+ *
+ * Programmer:  Quincey Koziol
+ *              Saturday, January 5, 2019
+ *
+ *-------------------------------------------------------------------------
+ */
+htri_t
+H5T_is_vl_storage(const H5T_t *dt)
+{
+    htri_t ret_value = FALSE;
+
+    FUNC_ENTER_NOAPI(FAIL)
+
+    /* Sanity check */
+    HDassert(dt);
+
+    /* VL and region reference datatypes are stored in variable-length form */
+    if(H5T_detect_class(dt, H5T_VLEN, FALSE))
+        ret_value = TRUE;
+    else if(H5T_detect_class(dt, H5T_REFERENCE, FALSE))
+        ret_value = H5T__detect_reg_ref(dt);
+    else
+        ret_value = FALSE;
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5T_is_vl_storage() */
+
+
+/*-------------------------------------------------------------------------
  * Function:    H5T_upgrade_version_cb
  *
  * Purpose:     H5T__visit callback to upgrade the version of a datatype
diff --git a/src/H5Tprivate.h b/src/H5Tprivate.h
index 89cdcfd..aff3f6d 100644
--- a/src/H5Tprivate.h
+++ b/src/H5Tprivate.h
@@ -138,6 +138,7 @@ H5_DLL herr_t H5T_set_version(H5F_t *f, H5T_t *dt);
 H5_DLL herr_t H5T_patch_file(H5T_t *dt, H5F_t *f);
 H5_DLL herr_t H5T_patch_vlen_file(H5T_t *dt, H5F_t *f);
 H5_DLL htri_t H5T_is_variable_str(const H5T_t *dt);
+H5_DLL htri_t H5T_is_vl_storage(const H5T_t *dt);
 
 /* Reference specific functions */
 H5_DLL H5R_type_t H5T_get_ref_type(const H5T_t *dt);
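
Because H5D_mpio_no_collective_cause_t gains H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE (with H5D_MPIO_NO_COLLECTIVE_MAX_CAUSE moving to 0x100), applications built against this change can observe the new cause bit through the existing H5Pget_mpio_no_collective_cause() query after a transfer. A minimal sketch, assuming dxpl is the transfer property list used for the preceding H5Dread()/H5Dwrite() call:

```c
#include <stdio.h>
#include <hdf5.h>

/* Report why collective I/O was broken, if it was; 'dxpl' is assumed to be
 * the data transfer property list used for the preceding read/write call. */
static void
report_no_collective_cause(hid_t dxpl)
{
    uint32_t local_cause = 0, global_cause = 0;

    if(H5Pget_mpio_no_collective_cause(dxpl, &local_cause, &global_cause) < 0)
        return;

    if(global_cause == H5D_MPIO_COLLECTIVE)
        printf("collective I/O was performed on all ranks\n");
    else if(global_cause & H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE)
        printf("an error occurred while checking if collective I/O was possible\n");
    else
        printf("collective I/O was broken, cause bits: 0x%x\n", (unsigned)global_cause);
}
```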