From 2023495f320ed22febe41eb6c95a4bc687a8b676 Mon Sep 17 00:00:00 2001 From: Neil Fortner Date: Fri, 3 Sep 2021 12:02:44 -0500 Subject: Add support for independent parallel I/O with selection I/O. Add HDF5_USE_SELECTION_IO env var to control selection I/O (default off). --- src/H5.c | 10 +++++ src/H5Dchunk.c | 24 +++++----- src/H5Dcompact.c | 4 +- src/H5Dcontig.c | 31 +++++++------ src/H5Defl.c | 4 +- src/H5Dio.c | 11 ++++- src/H5Dpkg.h | 3 +- src/H5FDint.c | 12 ++--- src/H5FDmpio.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++-------- 9 files changed, 167 insertions(+), 62 deletions(-) diff --git a/src/H5.c b/src/H5.c index cbd240a..0c7c8c1 100644 --- a/src/H5.c +++ b/src/H5.c @@ -147,6 +147,7 @@ done: herr_t H5_init_library(void) { + char *env_use_select_io = NULL; herr_t ret_value = SUCCEED; /* Set the 'library initialized' flag as early as possible, to avoid @@ -278,6 +279,15 @@ H5_init_library(void) if (H5VL_init_phase2() < 0) HGOTO_ERROR(H5E_FUNC, H5E_CANTINIT, FAIL, "unable to initialize vol interface") + /* Check for HDF5_USE_SELECTION_IO env variable */ + env_use_select_io = HDgetenv("HDF5_USE_SELECTION_IO"); + if (NULL != env_use_select_io && HDstrcmp(env_use_select_io, "") + && HDstrcmp(env_use_select_io, "0") && HDstrcmp(env_use_select_io, "no") + && HDstrcmp(env_use_select_io, "No") && HDstrcmp(env_use_select_io, "NO") + && HDstrcmp(env_use_select_io, "false") && HDstrcmp(env_use_select_io, "False") + && HDstrcmp(env_use_select_io, "FALSE")) + H5_use_selection_io_g = TRUE; + /* Debugging? 
*/ H5__debug_mask("-all"); H5__debug_mask(HDgetenv("HDF5_DEBUG")); diff --git a/src/H5Dchunk.c b/src/H5Dchunk.c index 5447233..dcc3baa 100644 --- a/src/H5Dchunk.c +++ b/src/H5Dchunk.c @@ -254,7 +254,7 @@ typedef struct H5D_chunk_coll_info_t { /* Chunked layout operation callbacks */ static herr_t H5D__chunk_construct(H5F_t *f, H5D_t *dset); static herr_t H5D__chunk_init(H5F_t *f, const H5D_t *dset, hid_t dapl_id); -static herr_t H5D__chunk_io_init(const H5D_io_info_t *io_info, const H5D_type_info_t *type_info, +static herr_t H5D__chunk_io_init(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_t nelmts, const H5S_t *file_space, const H5S_t *mem_space, H5D_chunk_map_t *fm); static herr_t H5D__chunk_io_init_selections(const H5D_io_info_t *io_info, const H5D_type_info_t *type_info, @@ -1057,7 +1057,7 @@ H5D__chunk_is_data_cached(const H5D_shared_t *shared_dset) *------------------------------------------------------------------------- */ static herr_t -H5D__chunk_io_init(const H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_t nelmts, +H5D__chunk_io_init(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_t nelmts, const H5S_t *file_space, const H5S_t *mem_space, H5D_chunk_map_t *fm) { const H5D_t *dataset = io_info->dset; /* Local pointer to dataset info */ @@ -1065,6 +1065,7 @@ H5D__chunk_io_init(const H5D_io_info_t *io_info, const H5D_type_info_t *type_inf htri_t file_space_normalized = FALSE; /* File dataspace was normalized */ unsigned f_ndims; /* The number of dimensions of the file's dataspace */ int sm_ndims; /* The number of dimensions of the memory buffer's dataspace (signed) */ + htri_t use_selection_io = FALSE; /* Whether to use selection I/O */ unsigned u; /* Local index variable */ herr_t ret_value = SUCCEED; /* Return value */ @@ -1120,6 +1121,11 @@ H5D__chunk_io_init(const H5D_io_info_t *io_info, const H5D_type_info_t *type_inf if (H5D__chunk_io_init_selections(io_info, type_info, fm) < 0) 
HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to create file and memory chunk selections") + /* Check if we're performing selection I/O and save the result */ + if ((use_selection_io = H5D__chunk_may_use_select_io(io_info)) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't check if selection I/O is possible") + io_info->use_select_io = (hbool_t)use_selection_io; + done: /* Reset the global dataspace info */ fm->file_space = NULL; @@ -2535,7 +2541,6 @@ H5D__chunk_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_ H5S_t * chunk_file_spaces_static[8]; /* Static buffer for chunk_file_spaces */ haddr_t * chunk_addrs = NULL; /* Array of chunk addresses */ haddr_t chunk_addrs_static[8]; /* Static buffer for chunk_addrs */ - htri_t use_selection_io = FALSE; /* Whether to use selection I/O */ herr_t ret_value = SUCCEED; /*return value */ FUNC_ENTER_STATIC @@ -2567,12 +2572,8 @@ H5D__chunk_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_ skip_missing_chunks = TRUE; } - /* Check if we're performing selection I/O */ - if ((use_selection_io = H5D__chunk_may_use_select_io(io_info)) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't check if selection I/O is possible") - /* Different blocks depending on whether we're using selection I/O */ - if (use_selection_io) { + if (io_info->use_select_io) { size_t num_chunks; size_t element_sizes[2] = {type_info->dst_type_size, 0}; void * bufs[2] = {io_info->u.rbuf, NULL}; @@ -2815,7 +2816,6 @@ H5D__chunk_write(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize H5S_t * chunk_file_spaces_static[8]; /* Static buffer for chunk_file_spaces */ haddr_t * chunk_addrs = NULL; /* Array of chunk addresses */ haddr_t chunk_addrs_static[8]; /* Static buffer for chunk_addrs */ - htri_t use_selection_io = FALSE; /* Whether to use selection I/O */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_STATIC @@ -2843,12 +2843,8 @@ H5D__chunk_write(H5D_io_info_t *io_info, const 
H5D_type_info_t *type_info, hsize /* Initialize temporary compact storage info */ cpt_store.compact.dirty = &cpt_dirty; - /* Check if we're performing selection I/O */ - if ((use_selection_io = H5D__chunk_may_use_select_io(io_info)) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't check if selection I/O is possible") - /* Different blocks depending on whether we're using selection I/O */ - if (use_selection_io) { + if (io_info->use_select_io) { size_t num_chunks; size_t element_sizes[2] = {type_info->dst_type_size, 0}; const void *bufs[2] = {io_info->u.wbuf, NULL}; diff --git a/src/H5Dcompact.c b/src/H5Dcompact.c index fe41298..9a6d4b2 100644 --- a/src/H5Dcompact.c +++ b/src/H5Dcompact.c @@ -54,7 +54,7 @@ /* Layout operation callbacks */ static herr_t H5D__compact_construct(H5F_t *f, H5D_t *dset); static hbool_t H5D__compact_is_space_alloc(const H5O_storage_t *storage); -static herr_t H5D__compact_io_init(const H5D_io_info_t *io_info, const H5D_type_info_t *type_info, +static herr_t H5D__compact_io_init(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_t nelmts, const H5S_t *file_space, const H5S_t *mem_space, H5D_chunk_map_t *cm); static ssize_t H5D__compact_readvv(const H5D_io_info_t *io_info, size_t dset_max_nseq, size_t *dset_curr_seq, @@ -227,7 +227,7 @@ H5D__compact_is_space_alloc(const H5O_storage_t H5_ATTR_UNUSED *storage) *------------------------------------------------------------------------- */ static herr_t -H5D__compact_io_init(const H5D_io_info_t *io_info, const H5D_type_info_t H5_ATTR_UNUSED *type_info, +H5D__compact_io_init(H5D_io_info_t *io_info, const H5D_type_info_t H5_ATTR_UNUSED *type_info, hsize_t H5_ATTR_UNUSED nelmts, const H5S_t H5_ATTR_UNUSED *file_space, const H5S_t H5_ATTR_UNUSED *mem_space, H5D_chunk_map_t H5_ATTR_UNUSED *cm) { diff --git a/src/H5Dcontig.c b/src/H5Dcontig.c index 3b104b8..d2a84da 100644 --- a/src/H5Dcontig.c +++ b/src/H5Dcontig.c @@ -91,7 +91,7 @@ typedef struct H5D_contig_writevv_ud_t { /* 
Layout operation callbacks */ static herr_t H5D__contig_construct(H5F_t *f, H5D_t *dset); static herr_t H5D__contig_init(H5F_t *f, const H5D_t *dset, hid_t dapl_id); -static herr_t H5D__contig_io_init(const H5D_io_info_t *io_info, const H5D_type_info_t *type_info, +static herr_t H5D__contig_io_init(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_t nelmts, const H5S_t *file_space, const H5S_t *mem_space, H5D_chunk_map_t *cm); static ssize_t H5D__contig_readvv(const H5D_io_info_t *io_info, size_t dset_max_nseq, size_t *dset_curr_seq, @@ -551,16 +551,25 @@ H5D__contig_is_data_cached(const H5D_shared_t *shared_dset) *------------------------------------------------------------------------- */ static herr_t -H5D__contig_io_init(const H5D_io_info_t *io_info, const H5D_type_info_t H5_ATTR_UNUSED *type_info, +H5D__contig_io_init(H5D_io_info_t *io_info, const H5D_type_info_t H5_ATTR_UNUSED *type_info, hsize_t H5_ATTR_UNUSED nelmts, const H5S_t H5_ATTR_UNUSED *file_space, const H5S_t H5_ATTR_UNUSED *mem_space, H5D_chunk_map_t H5_ATTR_UNUSED *cm) { - FUNC_ENTER_STATIC_NOERR + htri_t use_selection_io = FALSE; /* Whether to use selection I/O */ + htri_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC io_info->store->contig.dset_addr = io_info->dset->shared->layout.storage.u.contig.addr; io_info->store->contig.dset_size = io_info->dset->shared->layout.storage.u.contig.size; - FUNC_LEAVE_NOAPI(SUCCEED) + /* Check if we're performing selection I/O */ + if ((use_selection_io = H5D__contig_may_use_select_io(io_info, H5D_IO_OP_READ)) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't check if selection I/O is possible") + io_info->use_select_io = (hbool_t)use_selection_io; + +done: + FUNC_LEAVE_NOAPI(ret_value) } /* end H5D__contig_io_init() */ /*------------------------------------------------------------------------- @@ -632,7 +641,6 @@ herr_t H5D__contig_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_t nelmts, const H5S_t 
*file_space, const H5S_t *mem_space, H5D_chunk_map_t H5_ATTR_UNUSED *fm) { - htri_t use_selection_io = FALSE; /* Whether to use selection I/O */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_PACKAGE @@ -644,11 +652,7 @@ H5D__contig_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize HDassert(mem_space); HDassert(file_space); - /* Check if we're performing selection I/O */ - if ((use_selection_io = H5D__contig_may_use_select_io(io_info, H5D_IO_OP_READ)) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't check if selection I/O is possible") - - if (use_selection_io) { + if (io_info->use_select_io) { size_t dst_type_size = type_info->dst_type_size; /* Issue selection I/O call (we can skip the page buffer because we've @@ -684,7 +688,6 @@ herr_t H5D__contig_write(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_t nelmts, const H5S_t *file_space, const H5S_t *mem_space, H5D_chunk_map_t H5_ATTR_UNUSED *fm) { - htri_t use_selection_io = FALSE; /* Whether to use selection I/O */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_PACKAGE @@ -696,11 +699,7 @@ H5D__contig_write(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsiz HDassert(mem_space); HDassert(file_space); - /* Check if we're performing selection I/O */ - if ((use_selection_io = H5D__contig_may_use_select_io(io_info, H5D_IO_OP_WRITE)) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't check if selection I/O is possible") - - if (use_selection_io) { + if (io_info->use_select_io) { size_t dst_type_size = type_info->dst_type_size; /* Issue selection I/O call (we can skip the page buffer because we've diff --git a/src/H5Defl.c b/src/H5Defl.c index 85c9dba..fea70b0 100644 --- a/src/H5Defl.c +++ b/src/H5Defl.c @@ -61,7 +61,7 @@ typedef struct H5D_efl_writevv_ud_t { /* Layout operation callbacks */ static herr_t H5D__efl_construct(H5F_t *f, H5D_t *dset); -static herr_t H5D__efl_io_init(const H5D_io_info_t *io_info, const H5D_type_info_t 
*type_info, hsize_t nelmts, +static herr_t H5D__efl_io_init(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_t nelmts, const H5S_t *file_space, const H5S_t *mem_space, H5D_chunk_map_t *cm); static ssize_t H5D__efl_readvv(const H5D_io_info_t *io_info, size_t dset_max_nseq, size_t *dset_curr_seq, size_t dset_len_arr[], hsize_t dset_offset_arr[], size_t mem_max_nseq, @@ -197,7 +197,7 @@ H5D__efl_is_space_alloc(const H5O_storage_t H5_ATTR_UNUSED *storage) *------------------------------------------------------------------------- */ static herr_t -H5D__efl_io_init(const H5D_io_info_t *io_info, const H5D_type_info_t H5_ATTR_UNUSED *type_info, +H5D__efl_io_init(H5D_io_info_t *io_info, const H5D_type_info_t H5_ATTR_UNUSED *type_info, hsize_t H5_ATTR_UNUSED nelmts, const H5S_t H5_ATTR_UNUSED *file_space, const H5S_t H5_ATTR_UNUSED *mem_space, H5D_chunk_map_t H5_ATTR_UNUSED *cm) { diff --git a/src/H5Dio.c b/src/H5Dio.c index 03400cf..cc5f5bb 100644 --- a/src/H5Dio.c +++ b/src/H5Dio.c @@ -622,6 +622,10 @@ H5D__ioinfo_init(H5D_t *dset, const H5D_type_info_t *type_info, H5D_storage_t *s io_info->io_ops.single_write = H5D__scatgath_write; } /* end else */ + /* Start with selection I/O off, layout callback will turn it on if + * appropriate */ + io_info->use_select_io = FALSE; + #ifdef H5_HAVE_PARALLEL /* Determine if the file was opened with an MPI VFD */ io_info->using_mpi_vfd = H5F_HAS_FEATURE(dset->oloc.file, H5FD_FEAT_HAS_MPI); @@ -841,8 +845,11 @@ H5D__ioinfo_adjust(H5D_io_info_t *io_info, const H5D_t *dset, const H5S_t *file_ H5CX_set_mpio_actual_io_mode(H5D_MPIO_NO_COLLECTIVE); } /* end if */ - /* Make any parallel I/O adjustments */ - if (io_info->using_mpi_vfd) { + /* Make any parallel I/O adjustments. Do not use collective code path if + * we're using selection I/O - in this case the file driver will handle it. + */ + /* Check for selection/vector support in file driver? 
-NAF */ + if (io_info->using_mpi_vfd /*&& !H5_use_selection_io_g*/) { H5FD_mpio_xfer_t xfer_mode; /* Parallel transfer for this request */ htri_t opt; /* Flag whether a selection is optimizable */ diff --git a/src/H5Dpkg.h b/src/H5Dpkg.h index de04ac9..f9014a7 100644 --- a/src/H5Dpkg.h +++ b/src/H5Dpkg.h @@ -121,7 +121,7 @@ typedef herr_t (*H5D_layout_construct_func_t)(H5F_t *f, H5D_t *dset); typedef herr_t (*H5D_layout_init_func_t)(H5F_t *f, const H5D_t *dset, hid_t dapl_id); typedef hbool_t (*H5D_layout_is_space_alloc_func_t)(const H5O_storage_t *storage); typedef hbool_t (*H5D_layout_is_data_cached_func_t)(const H5D_shared_t *shared_dset); -typedef herr_t (*H5D_layout_io_init_func_t)(const struct H5D_io_info_t *io_info, +typedef herr_t (*H5D_layout_io_init_func_t)(struct H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_t nelmts, const H5S_t *file_space, const H5S_t *mem_space, struct H5D_chunk_map_t *cm); @@ -223,6 +223,7 @@ typedef struct H5D_io_info_t { H5D_layout_ops_t layout_ops; /* Dataset layout I/O operation function pointers */ H5D_io_ops_t io_ops; /* I/O operation function pointers */ H5D_io_op_type_t op_type; + hbool_t use_select_io; /* Whether to use selection I/O */ union { void * rbuf; /* Pointer to buffer for read */ const void *wbuf; /* Pointer to buffer to write */ diff --git a/src/H5FDint.c b/src/H5FDint.c index bdb7c68..f558b59 100644 --- a/src/H5FDint.c +++ b/src/H5FDint.c @@ -884,15 +884,15 @@ H5FD__read_selection_translate(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uin void *tmp_ptr; /* Reallocate arrays */ - if (NULL == (tmp_ptr = H5MM_realloc(addrs, vec_arr_nalloc * 2))) + if (NULL == (tmp_ptr = H5MM_realloc(addrs, vec_arr_nalloc * sizeof(*addrs) * 2))) HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory reallocation failed for address list") addrs = tmp_ptr; - if (NULL == (tmp_ptr = H5MM_realloc(sizes, vec_arr_nalloc * 2))) + if (NULL == (tmp_ptr = H5MM_realloc(sizes, vec_arr_nalloc * sizeof(*sizes) * 2))) 
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory reallocation failed for size list") sizes = tmp_ptr; - if (NULL == (tmp_ptr = H5MM_realloc(vec_bufs, vec_arr_nalloc * 2))) + if (NULL == (tmp_ptr = H5MM_realloc(vec_bufs, vec_arr_nalloc * sizeof(*vec_bufs) * 2))) HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory reallocation failed for buffer list") vec_bufs = tmp_ptr; @@ -1503,15 +1503,15 @@ H5FD__write_selection_translate(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, ui void *tmp_ptr; /* Reallocate arrays */ - if (NULL == (tmp_ptr = H5MM_realloc(addrs, vec_arr_nalloc * 2))) + if (NULL == (tmp_ptr = H5MM_realloc(addrs, vec_arr_nalloc * sizeof(*addrs) * 2))) HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory reallocation failed for address list") addrs = tmp_ptr; - if (NULL == (tmp_ptr = H5MM_realloc(sizes, vec_arr_nalloc * 2))) + if (NULL == (tmp_ptr = H5MM_realloc(sizes, vec_arr_nalloc * sizeof(*sizes) * 2))) HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory reallocation failed for size list") sizes = tmp_ptr; - if (NULL == (tmp_ptr = H5MM_realloc(vec_bufs, vec_arr_nalloc * 2))) + if (NULL == (tmp_ptr = H5MM_realloc(vec_bufs, vec_arr_nalloc * sizeof(*vec_bufs) * 2))) HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory reallocation failed for buffer list") vec_bufs = tmp_ptr; diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index 1db700a..9a99b45 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -1281,7 +1281,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU #endif HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0) - /* Get the type's size */ + /* Get the type's size */ #if MPI_VERSION >= 3 if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size))) #else @@ -1592,6 +1592,17 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */ H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */ 
int size_i; +#if MPI_VERSION >= 3 + MPI_Count bytes_read = 0; /* Number of bytes read in */ + MPI_Count type_size; /* MPI datatype used for I/O's size */ + MPI_Count io_size; /* Actual number of bytes requested */ + MPI_Count n; +#else + int bytes_read = 0; /* Number of bytes read in */ + int type_size; /* MPI datatype used for I/O's size */ + int io_size; /* Actual number of bytes requested */ + int n; +#endif herr_t ret_value = SUCCEED; FUNC_ENTER_STATIC @@ -1615,19 +1626,6 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou HDassert((count == 0) || (sizes[0] != 0)); HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST)); - /* sort the vector I/O request into increasing address order if required - * - * If the vector is already sorted, the base addresses of types, addrs, sizes, - * and bufs will be returned in s_types, s_addrs, s_sizes, and s_bufs respectively. - * - * If the vector was not already sorted, new, sorted versions of types, addrs, sizes, and bufs - * are allocated, populated, and returned in s_types, s_addrs, s_sizes, and s_bufs respectively. - * In this case, this function must free the memory allocated for the sorted vectors. - */ - if (H5FD_sort_vector_io_req(&vector_was_sorted, count, types, addrs, sizes, bufs, &s_types, &s_addrs, - &s_sizes, &s_bufs) < 0) - HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "can't sort vector I/O request") - /* Get the transfer mode from the API context * * This flag is set to H5FD_MPIO_COLLECTIVE if the API call is @@ -1643,6 +1641,19 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou if (count > 0) { /* create MPI derived types describing the vector write */ + /* sort the vector I/O request into increasing address order if required + * + * If the vector is already sorted, the base addresses of types, addrs, sizes, + * and bufs will be returned in s_types, s_addrs, s_sizes, and s_bufs respectively. 
+ * + * If the vector was not already sorted, new, sorted versions of types, addrs, sizes, and bufs + * are allocated, populated, and returned in s_types, s_addrs, s_sizes, and s_bufs respectively. + * In this case, this function must free the memory allocated for the sorted vectors. + */ + if (H5FD_sort_vector_io_req(&vector_was_sorted, count, types, addrs, sizes, bufs, &s_types, &s_addrs, + &s_sizes, &s_bufs) < 0) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "can't sort vector I/O request") + if ((NULL == (mpi_block_lengths = (int *)HDmalloc((size_t)count * sizeof(int)))) || (NULL == (mpi_displacments = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint)))) || (NULL == (mpi_bufs = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint))))) { @@ -1810,9 +1821,60 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, H5FD_mpi_native_g, file->info))) HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + + /* How many bytes were actually read? 
*/ +#if MPI_VERSION >= 3 + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read))) +#else + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) +#endif + HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code) + + /* Get the type's size */ +#if MPI_VERSION >= 3 + if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size))) +#else + if (MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size))) +#endif + HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code) + + /* Compute the actual number of bytes requested */ + io_size = type_size * size_i; + + /* Check for read failure */ + if (bytes_read < 0 || bytes_read > io_size) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed") + + /* Check for incomplete read */ + n = io_size - bytes_read; + if (n > 0) { + i = (int)count - 1; + + /* Iterate over sorted array in reverse, filling in zeroes to + * sections of the buffers that were not read to */ + do { + HDassert(i >= 0); + +#if MPI_VERSION >= 3 + io_size = MIN(n, (MPI_Count)s_sizes[i]); + bytes_read = (MPI_Count)s_sizes[i] - io_size; +#else + io_size = MIN(n, (int)s_sizes[i]); + bytes_read = (int)s_sizes[i] - io_size; +#endif + HDassert(bytes_read >= 0); + + HDmemset((char *)bufs[i] + bytes_read, 0, (size_t)io_size); + + n -= io_size; + i--; + } while (n > 0); + } } else if (count > 0) { + haddr_t max_addr = HADDR_MAX; + /* The read is part of an independent operation. 
As a result, * we can't use MPI_File_set_view() (since it it a collective operation), * and thus there is no point in setting up an MPI derived type, as @@ -1832,7 +1894,7 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou for (i = 0; i < (int)count; i++) { - if (H5FD_mpi_haddr_to_MPIOff(s_addrs[i], &mpi_off) < 0) + if (H5FD_mpi_haddr_to_MPIOff(addrs[i], &mpi_off) < 0) HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off") @@ -1845,16 +1907,46 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou } else { - size = s_sizes[i]; + size = sizes[i]; } } size_i = (int)size; /* todo: fix potential for overflow */ - if (MPI_SUCCESS != - (mpi_code = MPI_File_read_at(file->f, mpi_off, s_bufs[i], size_i, MPI_BYTE, &mpi_stat))) + /* Check if we actually need to do I/O */ + if (addrs[i] < max_addr) { + /* Issue read */ + if (MPI_SUCCESS != + (mpi_code = MPI_File_read_at(file->f, mpi_off, bufs[i], size_i, MPI_BYTE, &mpi_stat))) - HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code) + HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code) + + /* How many bytes were actually read? */ +#if MPI_VERSION >= 3 + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, MPI_BYTE, &bytes_read))) +#else + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) +#endif + HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code) + + /* Check for read failure */ + if (bytes_read < 0 || bytes_read > size_i) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed") + + /* + * If we didn't read the entire I/O, fill in zeroes beyond end of + * the physical MPI file and don't issue any more reads at higher + * addresses. 
+ */ + if ((n = (size_i - bytes_read)) > 0) { + HDmemset((char *)bufs[i] + bytes_read, 0, (size_t)n); + max_addr = addrs[i] + (haddr_t)bytes_read; + } + } + else { + /* Read is past the max address, fill in zeroes */ + HDmemset((char *)bufs[i], 0, size); + } } } -- cgit v0.12