diff options
author | M. Scot Breitenfeld <brtnfld@hdfgroup.org> | 2019-01-08 15:55:29 (GMT) |
---|---|---|
committer | M. Scot Breitenfeld <brtnfld@hdfgroup.org> | 2019-01-08 15:55:29 (GMT) |
commit | 1421059cfba7af05bb4a675820e9c5a6a73e77fe (patch) | |
tree | 156a3e1be5afeff69cdcfbcd6e60b521c41b3a66 /src/H5Dmpio.c | |
parent | b4828e7feb8fd11514683c6eef8809356751760a (diff) | |
parent | 74a41f92a48d553a33b80d84d0981623327074af (diff) | |
download | hdf5-1421059cfba7af05bb4a675820e9c5a6a73e77fe.zip hdf5-1421059cfba7af05bb4a675820e9c5a6a73e77fe.tar.gz hdf5-1421059cfba7af05bb4a675820e9c5a6a73e77fe.tar.bz2 |
Merge remote-tracking branch 'upstream/develop' into develop
Diffstat (limited to 'src/H5Dmpio.c')
-rw-r--r-- | src/H5Dmpio.c | 110 |
1 files changed, 69 insertions, 41 deletions
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c index 8850c59..f5da33d 100644 --- a/src/H5Dmpio.c +++ b/src/H5Dmpio.c @@ -37,7 +37,6 @@ #include "H5Eprivate.h" /* Error handling */ #include "H5Fprivate.h" /* File access */ #include "H5FDprivate.h" /* File drivers */ -#include "H5FDmpi.h" /* MPI-based file drivers */ #include "H5Iprivate.h" /* IDs */ #include "H5MMprivate.h" /* Memory management */ #include "H5Oprivate.h" /* Object headers */ @@ -83,20 +82,26 @@ /* Macros to represent the regularity of the selection for multiple chunk IO case. */ #define H5D_CHUNK_SELECT_REG 1 -/* Macros for reasons to not enable read-proc-and-bcast. */ -#define H5D_MPIO_PROC0_BCAST 0x00 -#define H5D_MPIO_NOT_H5S_ALL 0x01 -#define H5D_MPIO_GREATER_THAN_2GB 0x02 /******************/ /* Local Typedefs */ /******************/ /* Combine chunk address and chunk info into a struct for better performance. */ typedef struct H5D_chunk_addr_info_t { - haddr_t chunk_addr; - H5D_chunk_info_t chunk_info; + haddr_t chunk_addr; + H5D_chunk_info_t chunk_info; } H5D_chunk_addr_info_t; +/* Rank 0 Bcast values */ +typedef enum H5D_mpio_no_rank0_bcast_cause_t { + H5D_MPIO_RANK0_BCAST = 0x00, + H5D_MPIO_RANK0_NOT_H5S_ALL = 0x01, + H5D_MPIO_RANK0_NOT_CONTIGUOUS = 0x02, + H5D_MPIO_RANK0_NOT_FIXED_SIZE = 0x04, + H5D_MPIO_RANK0_GREATER_THAN_2GB = 0x08 +} H5D_mpio_no_rank0_bcast_cause_t; + + /* * Information about a single chunk when performing collective filtered I/O. All * of the fields of one of these structs are initialized at the start of collective @@ -285,11 +290,11 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space, const H5S_t *mem_space, const H5D_type_info_t *type_info) { H5FD_mpio_xfer_t io_xfer_mode; /* MPI I/O transfer mode */ - unsigned local_cause[2] = {0,H5D_MPIO_PROC0_BCAST}; /* [0] Local reason(s) for breaking collective mode */ - /* [1] Flag if dataset is both: H5S_ALL and small */ - unsigned global_cause[2] = {0,H5D_MPIO_PROC0_BCAST}; /* Global reason(s) for breaking collective mode */ + unsigned local_cause[2] = {0,0}; /* [0] Local reason(s) for breaking collective mode */ + /* [1] Flag if dataset is both: H5S_ALL and small */ + unsigned global_cause[2] = {0,0}; /* Global reason(s) for breaking collective mode */ + htri_t is_vl_storage; /* Whether the dataset's datatype is stored in a variable-length form */ htri_t ret_value = SUCCEED; /* Return value */ - hbool_t H5FD_MPIO_Proc0_BCast; /* Flag if dataset is both: H5S_ALL and < 2GB */ FUNC_ENTER_PACKAGE @@ -302,7 +307,8 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space, /* For independent I/O, get out quickly and don't try to form consensus */ if(H5CX_get_io_xfer_mode(&io_xfer_mode) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode") + /* Set error flag, but keep going */ + local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE; if(io_xfer_mode == H5FD_MPIO_INDEPENDENT) local_cause[0] |= H5D_MPIO_SET_INDEPENDENT; @@ -346,34 +352,56 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space, * is less than 3. The functions needed (MPI_Mprobe and MPI_Imrecv) will * not be available. */ - if (io_info->op_type == H5D_IO_OP_WRITE && - io_info->dset->shared->layout.type == H5D_CHUNKED && - io_info->dset->shared->dcpl_cache.pline.nused > 0) + if(io_info->op_type == H5D_IO_OP_WRITE && + io_info->dset->shared->layout.type == H5D_CHUNKED && + io_info->dset->shared->dcpl_cache.pline.nused > 0) local_cause[0] |= H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED; #endif - H5FD_MPIO_Proc0_BCast = FALSE; - /* Check to see if all the processes are reading the same data */ - if((H5S_GET_SELECT_TYPE(file_space) != H5S_SEL_ALL)) { - /* Flag to do a MPI_Bcast of the data from one proc instead of - * having all the processes involved in the persistent I/O. - */ - local_cause[1] |= H5D_MPIO_NOT_H5S_ALL; - } + /* Check if we are able to do a MPI_Bcast of the data from one rank + * instead of having all the processes involved in the collective I/O call. + */ + + /* Check to see if the process is reading the entire dataset */ + if(H5S_GET_SELECT_TYPE(file_space) != H5S_SEL_ALL) + local_cause[1] |= H5D_MPIO_RANK0_NOT_H5S_ALL; + /* Only perform this optimization for contigous datasets, currently */ + else if(H5D_CONTIGUOUS != io_info->dset->shared->layout.type) + /* Flag to do a MPI_Bcast of the data from one proc instead of + * having all the processes involved in the collective I/O. + */ + local_cause[1] |= H5D_MPIO_RANK0_NOT_CONTIGUOUS; + else if((is_vl_storage = H5T_is_vl_storage(type_info->dset_type)) < 0) + local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE; + else if(is_vl_storage) + local_cause[1] |= H5D_MPIO_RANK0_NOT_FIXED_SIZE; else { + size_t type_size; /* Size of dataset's datatype */ - /* If the size of the dataset is less than 2GB then do an MPI_Bcast - * of the data from one process instead of having all the processes - * involved in the persistent I/O. - */ + /* Retrieve the size of the dataset's datatype */ + if(0 == (type_size = H5T_GET_SIZE(type_info->dset_type))) + local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE; + else { + hssize_t snelmts; /* [Signed] # of elements in dataset's dataspace */ + + /* Retrieve the size of the dataset's datatype */ + if((snelmts = H5S_GET_EXTENT_NPOINTS(file_space)) < 0) + local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE; + else { + hsize_t dset_size; - hsize_t dset_storage_size; - H5D__get_storage_size(io_info->dset, &dset_storage_size); + /* Determine dataset size */ + dset_size = ((hsize_t)snelmts) * type_size; - if(dset_storage_size > ((hsize_t)(H5_2GB) - 1) ) { - local_cause[1] |= H5D_MPIO_GREATER_THAN_2GB; - } - } + /* If the size of the dataset is less than 2GB then do an MPI_Bcast + * of the data from one process instead of having all the processes + * involved in the collective I/O. + */ + if(dset_size > ((hsize_t)(2.0F * H5_GB) - 1)) + local_cause[1] |= H5D_MPIO_RANK0_GREATER_THAN_2GB; + } /* end else */ + } /* end else */ + } /* end else */ /* Check for independent I/O */ if(local_cause[0] & H5D_MPIO_SET_INDEPENDENT) @@ -386,23 +414,23 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space, */ if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_cause, &global_cause, 2, MPI_UNSIGNED, MPI_BOR, io_info->comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) - } /* end else */ /* Set the local & global values of no-collective-cause in the API context */ H5CX_set_mpio_local_no_coll_cause(local_cause[0]); H5CX_set_mpio_global_no_coll_cause(global_cause[0]); + /* Set read-with-rank0-and-bcast flag if possible */ + if(global_cause[0] == 0 && global_cause[1] == 0) { + H5CX_set_mpio_rank0_bcast(TRUE); +#ifdef H5_HAVE_INSTRUMENTED_LIBRARY + H5CX_test_set_mpio_coll_rank0_bcast(TRUE); +#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */ + } /* end if */ + /* Set the return value, based on the global cause */ ret_value = global_cause[0] > 0 ? FALSE : TRUE; - /* read-proc0-and-bcast if collective and H5S_ALL */ - if(global_cause[0] == 0 && global_cause[1] == H5D_MPIO_PROC0_BCAST) - H5FD_MPIO_Proc0_BCast = TRUE; - - /* Set Flag if dataset is both: H5S_ALL and < 2GB in the API context */ - H5CX_set_mpio_Proc0_BCast(H5FD_MPIO_Proc0_BCast); - done: FUNC_LEAVE_NOAPI(ret_value) } /* H5D__mpio_opt_possible() */ |