Diffstat (limited to 'src/H5FDmpio.c')
-rw-r--r-- | src/H5FDmpio.c | 1135 |
1 file changed, 1098 insertions, 37 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index 4aa8a96..6b639f9 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -87,49 +87,66 @@ static herr_t H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, ha
                                void *buf);
 static herr_t H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
                                const void *buf);
+static herr_t H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count,
+                                     H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], void *bufs[]);
+static herr_t H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count,
+                                      H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
+                                      const void *bufs[]);
 static herr_t H5FD__mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
 static herr_t H5FD__mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
 static herr_t H5FD__mpio_delete(const char *filename, hid_t fapl_id);
 static herr_t H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void *input,
                              void **output);
 
+/* Other functions */
+static herr_t H5FD__mpio_vector_build_types(
+    uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], H5_flexible_const_ptr_t bufs[],
+    haddr_t *s_addrs[], size_t *s_sizes[], H5_flexible_const_ptr_t *s_bufs[], hbool_t *vector_was_sorted,
+    MPI_Offset *mpi_off, H5_flexible_const_ptr_t *mpi_bufs_base, int *size_i, MPI_Datatype *buf_type,
+    hbool_t *buf_type_created, MPI_Datatype *file_type, hbool_t *file_type_created, char *unused);
+
 /* The MPIO file driver information */
 static const H5FD_class_t H5FD_mpio_g = {
-    H5_VFD_MPIO,           /* value                */
-    "mpio",                /* name                 */
-    HADDR_MAX,             /* maxaddr              */
-    H5F_CLOSE_SEMI,        /* fc_degree            */
-    H5FD__mpio_term,       /* terminate            */
-    NULL,                  /* sb_size              */
-    NULL,                  /* sb_encode            */
-    NULL,                  /* sb_decode            */
-    0,                     /* fapl_size            */
-    NULL,                  /* fapl_get             */
-    NULL,                  /* fapl_copy            */
-    NULL,                  /* fapl_free            */
-    0,                     /* dxpl_size            */
-    NULL,                  /* dxpl_copy            */
-    NULL,                  /* dxpl_free            */
-    H5FD__mpio_open,       /* open                 */
-    H5FD__mpio_close,      /* close                */
-    NULL,                  /* cmp                  */
-    H5FD__mpio_query,      /* query                */
-    NULL,                  /* get_type_map         */
-    NULL,                  /* alloc                */
-    NULL,                  /* free                 */
-    H5FD__mpio_get_eoa,    /* get_eoa              */
-    H5FD__mpio_set_eoa,    /* set_eoa              */
-    H5FD__mpio_get_eof,    /* get_eof              */
-    H5FD__mpio_get_handle, /* get_handle           */
-    H5FD__mpio_read,       /* read                 */
-    H5FD__mpio_write,      /* write                */
-    H5FD__mpio_flush,      /* flush                */
-    H5FD__mpio_truncate,   /* truncate             */
-    NULL,                  /* lock                 */
-    NULL,                  /* unlock               */
-    H5FD__mpio_delete,     /* del                  */
-    H5FD__mpio_ctl,        /* ctl                  */
-    H5FD_FLMAP_DICHOTOMY   /* fl_map               */
+    H5FD_CLASS_VERSION,      /* struct version       */
+    H5_VFD_MPIO,             /* value                */
+    "mpio",                  /* name                 */
+    HADDR_MAX,               /* maxaddr              */
+    H5F_CLOSE_SEMI,          /* fc_degree            */
+    H5FD__mpio_term,         /* terminate            */
+    NULL,                    /* sb_size              */
+    NULL,                    /* sb_encode            */
+    NULL,                    /* sb_decode            */
+    0,                       /* fapl_size            */
+    NULL,                    /* fapl_get             */
+    NULL,                    /* fapl_copy            */
+    NULL,                    /* fapl_free            */
+    0,                       /* dxpl_size            */
+    NULL,                    /* dxpl_copy            */
+    NULL,                    /* dxpl_free            */
+    H5FD__mpio_open,         /* open                 */
+    H5FD__mpio_close,        /* close                */
+    NULL,                    /* cmp                  */
+    H5FD__mpio_query,        /* query                */
+    NULL,                    /* get_type_map         */
+    NULL,                    /* alloc                */
+    NULL,                    /* free                 */
+    H5FD__mpio_get_eoa,      /* get_eoa              */
+    H5FD__mpio_set_eoa,      /* set_eoa              */
+    H5FD__mpio_get_eof,      /* get_eof              */
+    H5FD__mpio_get_handle,   /* get_handle           */
+    H5FD__mpio_read,         /* read                 */
+    H5FD__mpio_write,        /* write                */
+    H5FD__mpio_read_vector,  /* read_vector          */
+    H5FD__mpio_write_vector, /* write_vector         */
+    NULL,                    /* read_selection       */
+    NULL,                    /* write_selection      */
+    H5FD__mpio_flush,        /* flush                */
+    H5FD__mpio_truncate,     /* truncate             */
+    NULL,                    /* lock                 */
+    NULL,                    /* unlock               */
+    H5FD__mpio_delete,       /* del                  */
+    H5FD__mpio_ctl,          /* ctl                  */
+    H5FD_FLMAP_DICHOTOMY     /* fl_map               */
 };
 
 #ifdef H5FDmpio_DEBUG
@@ -1415,7 +1432,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
 #ifdef H5FDmpio_DEBUG
     if (H5FD_mpio_debug_r_flag)
         HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_read = %lld type = %s\n", __func__, file->mpi_rank,
-                  (long)mpi_off, bytes_read, H5FD__mem_t_to_str(type));
+                  (long)mpi_off, (long long)bytes_read, H5FD__mem_t_to_str(type));
 #endif
 
     /*
@@ -1636,7 +1653,7 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h
 #ifdef H5FDmpio_DEBUG
     if (H5FD_mpio_debug_w_flag)
         HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_written = %lld type = %s\n", __func__,
-                  file->mpi_rank, (long)mpi_off, bytes_written, H5FD__mem_t_to_str(type));
+                  file->mpi_rank, (long)mpi_off, (long long)bytes_written, H5FD__mem_t_to_str(type));
 #endif
 
     /* Each process will keep track of its perceived EOF value locally, and
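
The two one-line hunks above fix a printf varargs mismatch: the "%lld" conversion promises a long long argument, but bytes_read and bytes_written are declared as MPI_Count or int depending on the MPI version, so the argument must be cast to the exact type the format string names. A minimal standalone sketch of the pattern (my_count_t is a hypothetical stand-in for such a version-dependent counter, not an HDF5 type):

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical counter type whose width varies with the MPI version */
    typedef int64_t my_count_t;

    int
    main(void)
    {
        my_count_t bytes_read = 42;

        /* Undefined behavior if my_count_t is not long long:
         *     printf("bytes_read = %lld\n", bytes_read);
         * Correct: cast the argument to match the conversion specifier. */
        printf("bytes_read = %lld\n", (long long)bytes_read);

        return 0;
    }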
@@ -1663,6 +1680,1050 @@ done:
 } /* end H5FD__mpio_write() */
 
 /*-------------------------------------------------------------------------
+ * Function:    H5FD__mpio_vector_build_types
+ *
+ * Purpose:     Build MPI datatypes and calculate offset, base buffer, and
+ *              size for MPIO vector I/O.  Spun off from common code in
+ *              H5FD__mpio_read_vector() and H5FD__mpio_write_vector().
+ *
+ * Return:      Success:    SUCCEED.
+ *              Failure:    FAIL.
+ *
+ * Programmer:  Neil Fortner
+ *              March 14, 2022
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__mpio_vector_build_types(uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
+                              H5_flexible_const_ptr_t bufs[], haddr_t *s_addrs[], size_t *s_sizes[],
+                              H5_flexible_const_ptr_t *s_bufs[], hbool_t *vector_was_sorted,
+                              MPI_Offset *mpi_off, H5_flexible_const_ptr_t *mpi_bufs_base, int *size_i,
+                              MPI_Datatype *buf_type, hbool_t *buf_type_created, MPI_Datatype *file_type,
+                              hbool_t *file_type_created, char *unused)
+{
+    hsize_t       bigio_count; /* Transition point to create derived type */
+    hbool_t       fixed_size = FALSE;
+    size_t        size;
+    H5FD_mem_t *  s_types           = NULL;
+    int *         mpi_block_lengths = NULL;
+    MPI_Aint      mpi_bufs_base_Aint;
+    MPI_Aint *    mpi_bufs          = NULL;
+    MPI_Aint *    mpi_displacements = NULL;
+    MPI_Datatype *sub_types         = NULL;
+    uint8_t *     sub_types_created = NULL;
+    int           i;
+    int           j;
+    int           mpi_code; /* MPI return code */
+    herr_t        ret_value = SUCCEED;
+
+    FUNC_ENTER_STATIC
+
+    /* Sanity checks */
+    HDassert(s_sizes);
+    HDassert(s_bufs);
+    HDassert(vector_was_sorted);
+    HDassert(*vector_was_sorted);
+    HDassert(mpi_off);
+    HDassert(mpi_bufs_base);
+    HDassert(size_i);
+    HDassert(buf_type);
+    HDassert(buf_type_created);
+    HDassert(!*buf_type_created);
+    HDassert(file_type);
+    HDassert(file_type_created);
+    HDassert(!*file_type_created);
+    HDassert(unused);
+
+    /* Get big I/O transition point (may be lower than 2G for testing) */
+    bigio_count = H5_mpi_get_bigio_count();
+
+    if (count == 1) {
+        /* Single block.  Just use a series of MPI_BYTEs for the file view. */
+        *size_i        = (int)sizes[0];
+        *buf_type      = MPI_BYTE;
+        *file_type     = MPI_BYTE;
+        *mpi_bufs_base = bufs[0];
+
+        /* Setup s_addrs, s_sizes and s_bufs (needed for incomplete read filling code and eof
+         * calculation code) */
+        *s_addrs = addrs;
+        *s_sizes = sizes;
+        *s_bufs  = bufs;
+
+        /* some numeric conversions */
+        if (H5FD_mpi_haddr_to_MPIOff(addrs[0], mpi_off) < 0)
+            HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI offset")
+
+        /* Check for size overflow */
+        if (sizes[0] > bigio_count) {
+            /* We need to work around the integer size limit of 2GB.  The input size_t size
+             * variable cannot fit into an integer, but we can get around that limitation by
+             * creating a different datatype and then setting the integer size (or element
+             * count) to 1 when using the derived_type. */
+
+            if (H5_mpio_create_large_type(sizes[0], 0, MPI_BYTE, buf_type) < 0)
+                HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+            *buf_type_created = TRUE;
+
+            if (H5_mpio_create_large_type(sizes[0], 0, MPI_BYTE, file_type) < 0)
+                HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+            *file_type_created = TRUE;
+
+            *size_i = 1;
+        }
+    }
+    else if (count > 0) { /* create MPI derived types describing the vector write */
+
+        /* sort the vector I/O request into increasing address order if required
+         *
+         * If the vector is already sorted, the base addresses of types, addrs, sizes,
+         * and bufs will be returned in s_types, s_addrs, s_sizes, and s_bufs respectively.
+         *
+         * If the vector was not already sorted, new, sorted versions of types, addrs, sizes, and bufs
+         * are allocated, populated, and returned in s_types, s_addrs, s_sizes, and s_bufs respectively.
+         * In this case, this function must free the memory allocated for the sorted vectors.
+         */
+        if (H5FD_sort_vector_io_req(vector_was_sorted, count, types, addrs, sizes, bufs, &s_types, s_addrs,
+                                    s_sizes, s_bufs) < 0)
+            HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "can't sort vector I/O request")
+
+        if ((NULL == (mpi_block_lengths = (int *)HDmalloc((size_t)count * sizeof(int)))) ||
+            (NULL == (mpi_displacements = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint)))) ||
+            (NULL == (mpi_bufs = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint))))) {
+
+            HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc mpi block lengths / displacement")
+        }
+
+        /* when we setup mpi_bufs[] below, all addresses are offsets from
+         * mpi_bufs_base.
+         *
+         * Since these offsets must all be positive, we must scan through
+         * s_bufs[] to find the smallest value, and choose that for
+         * mpi_bufs_base.
+         */
+
+        j = 0; /* guess at the index of the smallest value of s_bufs[] */
+
+        for (i = 1; i < (int)count; i++) {
+
+            if ((*s_bufs)[i].cvp < (*s_bufs)[j].cvp) {
+
+                j = i;
+            }
+        }
+
+        *mpi_bufs_base = (*s_bufs)[j];
+
+        if (MPI_SUCCESS != (mpi_code = MPI_Get_address(mpi_bufs_base->cvp, &mpi_bufs_base_Aint)))
+
+            HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] to mpi_bufs_base failed", mpi_code)
+
+        *size_i = 1;
+
+        fixed_size = FALSE;
+
+        /* load the mpi_block_lengths and mpi_displacements arrays */
+        for (i = 0; i < (int)count; i++) {
+            /* Determine size of this vector element */
+            if (!fixed_size) {
+                if ((*s_sizes)[i] == 0) {
+                    HDassert(*vector_was_sorted);
+                    fixed_size = TRUE;
+                    size       = sizes[i - 1];
+                }
+                else {
+                    size = (*s_sizes)[i];
+                }
+            }
+
+            /* Add to block lengths and displacements arrays */
+            mpi_block_lengths[i] = (int)size;
+            mpi_displacements[i] = (MPI_Aint)(*s_addrs)[i];
+
+            /* convert s_bufs[i] to MPI_Aint... */
+            if (MPI_SUCCESS != (mpi_code = MPI_Get_address((*s_bufs)[i].cvp, &(mpi_bufs[i]))))
+                HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] - mpi_bufs_base failed", mpi_code)
+
+            /* ... and then subtract mpi_bufs_base_Aint from it. */
+#if ((MPI_VERSION > 3) || ((MPI_VERSION == 3) && (MPI_SUBVERSION >= 1)))
+            mpi_bufs[i] = MPI_Aint_diff(mpi_bufs[i], mpi_bufs_base_Aint);
+#else
+            mpi_bufs[i] = mpi_bufs[i] - mpi_bufs_base_Aint;
+#endif
+
+            /* Check for size overflow */
+            if (size > bigio_count) {
+                /* We need to work around the integer size limit of 2GB.  The input size_t size
+                 * variable cannot fit into an integer, but we can get around that limitation by
+                 * creating a different datatype and then setting the integer size (or element
+                 * count) to 1 when using the derived_type. */
+
+                /* Allocate arrays to keep track of types and whether they were created, if
+                 * necessary */
+                if (!sub_types) {
+                    HDassert(!sub_types_created);
+
+                    if (NULL == (sub_types = (MPI_Datatype *)HDmalloc((size_t)count * sizeof(MPI_Datatype))))
+                        HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types array")
+                    if (NULL == (sub_types_created = (uint8_t *)HDcalloc((size_t)count, 1))) {
+                        HDfree(sub_types);
+                        sub_types = NULL;
+                        HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types created array")
+                    }
+
+                    /* Initialize sub_types to all MPI_BYTE */
+                    for (j = 0; j < (int)count; j++)
+                        sub_types[j] = MPI_BYTE;
+                }
+                HDassert(sub_types_created);
+
+                /* Create type for large block */
+                if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &sub_types[i]) < 0)
+                    HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+                sub_types_created[i] = TRUE;
+
+                /* Only one of these large types for this vector element */
+                mpi_block_lengths[i] = 1;
+            }
+            else
+                HDassert(size == (size_t)mpi_block_lengths[i]);
+        }
+
+        /* create the memory MPI derived types */
+        if (sub_types) {
+            if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths, mpi_bufs,
+                                                                  sub_types, buf_type)))
+                HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for buf_type failed", mpi_code)
+        }
+        else if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, mpi_bufs,
+                                                                     MPI_BYTE, buf_type)))
+            HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for buf_type failed", mpi_code)
+
+        *buf_type_created = TRUE;
+
+        if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(buf_type)))
+
+            HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for buf_type failed", mpi_code)
+
+        /* create the file MPI derived type */
+        if (sub_types) {
+            if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths,
+                                                                  mpi_displacements, sub_types, file_type)))
+                HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for file_type failed", mpi_code)
+        }
+        else if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths,
+                                                                     mpi_displacements, MPI_BYTE, file_type)))
+            HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for file_type failed", mpi_code)
+
+        *file_type_created = TRUE;
+
+        if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(file_type)))
+
+            HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for file_type failed", mpi_code)
+
+        /* Free up memory used to build types */
+        HDassert(mpi_block_lengths);
+        HDfree(mpi_block_lengths);
+        mpi_block_lengths = NULL;
+
+        HDassert(mpi_displacements);
+        HDfree(mpi_displacements);
+        mpi_displacements = NULL;
+
+        HDassert(mpi_bufs);
+        HDfree(mpi_bufs);
+        mpi_bufs = NULL;
+
+        if (sub_types) {
+            HDassert(sub_types_created);
+
+            for (i = 0; i < (int)count; i++)
+                if (sub_types_created[i])
+                    MPI_Type_free(&sub_types[i]);
+
+            HDfree(sub_types);
+            sub_types = NULL;
+            HDfree(sub_types_created);
+            sub_types_created = NULL;
+        }
+
+        /* some numeric conversions */
+        if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, mpi_off) < 0)
+            HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0")
+    }
+    else {
+        /* setup for null participation in the collective operation. */
+        *buf_type  = MPI_BYTE;
+        *file_type = MPI_BYTE;
+
+        /* Set non-NULL pointer for I/O operation */
+        mpi_bufs_base->vp = unused;
+
+        /* MPI count to read */
+        *size_i = 0;
+
+        /* some numeric conversions */
+        if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, mpi_off) < 0)
+            HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0")
+    }
+
+done:
+    /* free the sorted types vector if it was allocated (the other sorted
+     * vectors are returned to the caller, which is responsible for freeing them) */
+    if (!*vector_was_sorted)
+        if (s_types) {
+            HDfree(s_types);
+            s_types = NULL;
+        }
+
+    /* Clean up on error */
+    if (ret_value < 0) {
+        if (mpi_block_lengths) {
+            HDfree(mpi_block_lengths);
+            mpi_block_lengths = NULL;
+        }
+
+        if (mpi_displacements) {
+            HDfree(mpi_displacements);
+            mpi_displacements = NULL;
+        }
+
+        if (mpi_bufs) {
+            HDfree(mpi_bufs);
+            mpi_bufs = NULL;
+        }
+
+        if (sub_types) {
+            HDassert(sub_types_created);
+
+            for (i = 0; i < (int)count; i++)
+                if (sub_types_created[i])
+                    MPI_Type_free(&sub_types[i]);
+
+            HDfree(sub_types);
+            sub_types = NULL;
+            HDfree(sub_types_created);
+            sub_types_created = NULL;
+        }
+    }
+
+    /* Make sure we cleaned up */
+    HDassert(!mpi_block_lengths);
+    HDassert(!mpi_displacements);
+    HDassert(!mpi_bufs);
+    HDassert(!sub_types);
+    HDassert(!sub_types_created);
+
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__mpio_vector_build_types() */
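
The function above reduces an arbitrary I/O vector to two committed derived types: a memory type whose displacements are offsets from the lowest buffer address, and a file type whose displacements are the sorted file offsets. The following self-contained sketch shows the same hindexed construction in plain MPI for two 16-byte blocks per rank and performs one collective write through a file view; the file name and offsets are invented, and error checking is elided:

    #include <mpi.h>
    #include <string.h>

    int
    main(int argc, char **argv)
    {
        char         buf_a[16], buf_b[16];
        int          lengths[2] = {16, 16};
        MPI_Aint     mem_disps[2], file_disps[2];
        MPI_Datatype mem_type, file_type;
        MPI_File     fh;
        int          rank;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        memset(buf_a, 'a', sizeof(buf_a));
        memset(buf_b, 'b', sizeof(buf_b));

        /* Memory type: absolute addresses used with MPI_BOTTOM.  (The driver
         * above instead rebases on the smallest buffer address so that all
         * displacements are non-negative offsets from mpi_bufs_base.) */
        MPI_Get_address(buf_a, &mem_disps[0]);
        MPI_Get_address(buf_b, &mem_disps[1]);

        /* File type: displacements are the file offsets of each block */
        file_disps[0] = (MPI_Aint)(rank * 64);
        file_disps[1] = (MPI_Aint)(rank * 64 + 32);

        MPI_Type_create_hindexed(2, lengths, mem_disps, MPI_BYTE, &mem_type);
        MPI_Type_create_hindexed(2, lengths, file_disps, MPI_BYTE, &file_type);
        MPI_Type_commit(&mem_type);
        MPI_Type_commit(&file_type);

        /* The file view makes the scattered file blocks appear contiguous,
         * so a single collective call transfers the whole vector */
        MPI_File_open(MPI_COMM_WORLD, "vector.bin", MPI_MODE_CREATE | MPI_MODE_WRONLY,
                      MPI_INFO_NULL, &fh);
        MPI_File_set_view(fh, 0, MPI_BYTE, file_type, "native", MPI_INFO_NULL);
        MPI_File_write_at_all(fh, 0, MPI_BOTTOM, 1, mem_type, MPI_STATUS_IGNORE);
        MPI_File_close(&fh);

        MPI_Type_free(&mem_type);
        MPI_Type_free(&file_type);
        MPI_Finalize();
        return 0;
    }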
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__mpio_read_vector()
+ *
+ * Purpose:     The behaviour of this function depends on the value of
+ *              the io_xfer_mode obtained from the context.
+ *
+ *              If it is H5FD_MPIO_COLLECTIVE, this is a collective
+ *              operation, which allows us to use MPI_File_set_view, and
+ *              then perform the entire vector read in a single MPI call.
+ *
+ *              Do this (if count is positive) by constructing memory
+ *              and file derived types from the supplied vector, using
+ *              the file type to set the file view, and then reading the
+ *              memory type from file.  Note that this read is
+ *              either independent or collective depending on the
+ *              value of mpio_coll_opt -- again obtained from the context.
+ *
+ *              If count is zero, participate in the collective read
+ *              (if so configured) with an empty read.
+ *
+ *              Finally, set the file view back to its default state.
+ *
+ *              In contrast, if io_xfer_mode is H5FD_MPIO_INDEPENDENT,
+ *              this call is independent, and thus we cannot use
+ *              MPI_File_set_view().
+ *
+ *              In this case, simply walk the vector, and issue an
+ *              independent read for each entry.
+ *
+ * Return:      Success:    SUCCEED.
+ *              Failure:    FAIL.
+ *
+ * Programmer:  John Mainzer
+ *              March 15, 2021
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[],
+                       haddr_t addrs[], size_t sizes[], void *bufs[])
+{
+    H5FD_mpio_t *              file              = (H5FD_mpio_t *)_file;
+    hbool_t                    vector_was_sorted = TRUE;
+    haddr_t *                  s_addrs           = NULL;
+    size_t *                   s_sizes           = NULL;
+    void **                    s_bufs            = NULL;
+    char                       unused            = 0; /* Unused, except for non-NULL pointer value */
+    void *                     mpi_bufs_base     = NULL;
+    MPI_Datatype               buf_type          = MPI_BYTE; /* MPI description of the selection in memory */
+    hbool_t                    buf_type_created  = FALSE;
+    MPI_Datatype               file_type         = MPI_BYTE; /* MPI description of the selection in file */
+    hbool_t                    file_type_created = FALSE;
+    int                        i;
+    int                        mpi_code; /* MPI return code */
+    MPI_Offset                 mpi_off = 0;
+    MPI_Status                 mpi_stat;      /* Status from I/O operation */
+    H5FD_mpio_xfer_t           xfer_mode;     /* I/O transfer mode */
+    H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */
+    int                        size_i;
+#if MPI_VERSION >= 3
+    MPI_Count bytes_read = 0; /* Number of bytes read in */
+    MPI_Count type_size;      /* MPI datatype used for I/O's size */
+    MPI_Count io_size;        /* Actual number of bytes requested */
+    MPI_Count n;
+#else
+    int bytes_read = 0; /* Number of bytes read in */
+    int type_size;      /* MPI datatype used for I/O's size */
+    int io_size;        /* Actual number of bytes requested */
+    int n;
+#endif
+    hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */
+#ifdef H5FDmpio_DEBUG
+    hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+    hbool_t H5FD_mpio_debug_r_flag = (H5FD_mpio_debug_flags_s[(int)'r'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+#endif
+    herr_t ret_value = SUCCEED;
+
+    FUNC_ENTER_STATIC
+
+#ifdef H5FDmpio_DEBUG
+    if (H5FD_mpio_debug_t_flag)
+        HDfprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
+#endif
+
+    /* Sanity checks */
+    HDassert(file);
+    HDassert(H5FD_MPIO == file->pub.driver_id);
+    HDassert((types) || (count == 0));
+    HDassert((addrs) || (count == 0));
+    HDassert((sizes) || (count == 0));
+    HDassert((bufs) || (count == 0));
+
+    /* verify that the first elements of the sizes and types arrays are
+     * valid.
+     */
+    HDassert((count == 0) || (sizes[0] != 0));
+    HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST));
+
+    /* Get the transfer mode from the API context
+     *
+     * This flag is set to H5FD_MPIO_COLLECTIVE if the API call is
+     * collective, and to H5FD_MPIO_INDEPENDENT if it is not.
+     *
+     * While this doesn't mean that we are actually about to do a collective
+     * read, it does mean that all ranks are here, so we can use MPI_File_set_view().
+     */
+    if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
+        HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
+
+    if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
+        /* Build MPI types, etc. */
+        if (H5FD__mpio_vector_build_types(count, types, addrs, sizes, (H5_flexible_const_ptr_t *)bufs,
+                                          &s_addrs, &s_sizes, (H5_flexible_const_ptr_t **)&s_bufs,
+                                          &vector_was_sorted, &mpi_off,
+                                          (H5_flexible_const_ptr_t *)&mpi_bufs_base, &size_i, &buf_type,
+                                          &buf_type_created, &file_type, &file_type_created, &unused) < 0)
+            HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't build MPI datatypes for I/O")
+
+        /* free sorted addrs vector if it exists */
+        if (!vector_was_sorted)
+            if (s_addrs) {
+                HDfree(s_addrs);
+                s_addrs = NULL;
+            }
+
+        /* Portably initialize MPI status variable */
+        HDmemset(&mpi_stat, 0, sizeof(mpi_stat));
+
+#ifdef H5FDmpio_DEBUG
+        if (H5FD_mpio_debug_r_flag)
+            HDfprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", __func__, (long)mpi_off, size_i);
+#endif
+
+        /* Setup the file view. */
+        if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type,
+                                                         H5FD_mpi_native_g, file->info)))
+            HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+
+        /* Reset mpi_off to 0 since the view now starts at the data offset */
+        if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0)
+            HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0")
+
+        /* Get the collective_opt property to check whether the application wants to do IO individually. */
+        if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
+
+            HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property")
+
+        /* Read the data. */
+#ifdef H5FDmpio_DEBUG
+        if (H5FD_mpio_debug_r_flag)
+            HDfprintf(stdout, "%s: using MPIO collective mode\n", __func__);
+#endif
+        if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
+#ifdef H5FDmpio_DEBUG
+            if (H5FD_mpio_debug_r_flag)
+                HDfprintf(stdout, "%s: doing MPI collective IO\n", __func__);
+#endif
+            /* Check whether we should read from rank 0 and broadcast to other ranks */
+            if (H5CX_get_mpio_rank0_bcast()) {
+#ifdef H5FDmpio_DEBUG
+                if (H5FD_mpio_debug_r_flag)
+                    HDfprintf(stdout, "%s: doing read-rank0-and-MPI_Bcast\n", __func__);
+#endif
+                /* Indicate path we've taken */
+                rank0_bcast = TRUE;
+
+                /* Read on rank 0, Bcast to other ranks */
+                if (file->mpi_rank == 0)
+                    if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i,
+                                                                    buf_type, &mpi_stat)))
+                        HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+                if (MPI_SUCCESS != (mpi_code = MPI_Bcast(mpi_bufs_base, size_i, buf_type, 0, file->comm)))
+                    HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+            } /* end if */
+            else if (MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, mpi_bufs_base, size_i,
+                                                                     buf_type, &mpi_stat)))
+                HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
+        } /* end if */
+        else if (size_i > 0) {
+#ifdef H5FDmpio_DEBUG
+            if (H5FD_mpio_debug_r_flag)
+                HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__);
+#endif
+
+            if (MPI_SUCCESS !=
+                (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat)))
+
+                HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+
+        } /* end else */
+
+        /* Reset the file view */
+        if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
+                                                         H5FD_mpi_native_g, file->info)))
+            HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+
+        /* Only retrieve bytes read if this rank _actually_ participated in I/O */
+        if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) {
+            /* How many bytes were actually read? */
+#if MPI_VERSION >= 3
+            if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
+#else
+            if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+#endif
+                HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+        } /* end if */
+
+        /* If the rank0-bcast feature was used, broadcast the # of bytes read to
+         * other ranks, which didn't perform any I/O.
+         */
+        /* NOTE: This could be optimized further to be combined with the broadcast
+         *       of the data.  (QAK - 2019/1/2)
+         *       Or have rank 0 clear the unread parts of the buffer prior to
+         *       the bcast.  (NAF - 2021/9/15)
+         */
+        if (rank0_bcast)
+#if MPI_VERSION >= 3
+            if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_COUNT, 0, file->comm))
+#else
+            if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_INT, 0, file->comm))
+#endif
+                HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0)
+
+        /* Get the type's size */
+#if MPI_VERSION >= 3
+        if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
+#else
+        if (MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
+#endif
+            HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
+
+        /* Compute the actual number of bytes requested */
+        io_size = type_size * size_i;
+
+        /* Check for read failure */
+        if (bytes_read < 0 || bytes_read > io_size)
+            HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed")
+
+        /* Check for incomplete read */
+        n = io_size - bytes_read;
+        if (n > 0) {
+            i = (int)count - 1;
+
+            /* Iterate over sorted array in reverse, filling in zeroes to
+             * sections of the buffers that were not read to */
+            do {
+                HDassert(i >= 0);
+
+#if MPI_VERSION >= 3
+                io_size    = MIN(n, (MPI_Count)s_sizes[i]);
+                bytes_read = (MPI_Count)s_sizes[i] - io_size;
+#else
+                io_size    = MIN(n, (int)s_sizes[i]);
+                bytes_read = (int)s_sizes[i] - io_size;
+#endif
+                HDassert(bytes_read >= 0);
+
+                HDmemset((char *)s_bufs[i] + bytes_read, 0, (size_t)io_size);
+
+                n -= io_size;
+                i--;
+            } while (n > 0);
+        }
+    }
+    else if (count > 0) {
+        haddr_t max_addr   = HADDR_MAX;
+        hbool_t fixed_size = FALSE;
+        size_t  size;
+
+        /* The read is part of an independent operation.  As a result,
+         * we can't use MPI_File_set_view() (since it is a collective operation),
+         * and thus we can't use the above code to construct the MPI datatypes.
+         * In the future, we could write code to detect when a contiguous slab
+         * in the file selection spans multiple vector elements and construct a
+         * memory datatype to match this larger block in the file, but for now
+         * just read in each element of the vector in a separate
+         * MPI_File_read_at() call.
+         *
+         * We could also just detect the case when the entire file selection is
+         * contiguous, which would allow us to use
+         * H5FD__mpio_vector_build_types() to construct the memory datatype.
+         */
+
+#ifdef H5FDmpio_DEBUG
+        if (H5FD_mpio_debug_r_flag)
+            HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__);
+#endif
+
+        /* Loop over vector elements */
+        for (i = 0; i < (int)count; i++) {
+            /* Convert address to mpi offset */
+            if (H5FD_mpi_haddr_to_MPIOff(addrs[i], &mpi_off) < 0)
+                HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
+
+            /* Calculate I/O size */
+            if (!fixed_size) {
+                if (sizes[i] == 0) {
+                    fixed_size = TRUE;
+                    size       = sizes[i - 1];
+                }
+                else {
+                    size = sizes[i];
+                }
+            }
+            size_i = (int)size;
+
+            if (size != (size_t)size_i) {
+                /* If we are here, then we need to work around the integer size limit
+                 * of 2GB.  The input size_t size variable cannot fit into an integer,
+                 * but we can get around that limitation by creating a different datatype
+                 * and then setting the integer size (or element count) to 1 when using
+                 * the derived_type.
+                 */
+
+                if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
+                    HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+
+                buf_type_created = TRUE;
+                size_i           = 1;
+            }
+
+            /* Check if we actually need to do I/O */
+            if (addrs[i] < max_addr) {
+                /* Portably initialize MPI status variable */
+                HDmemset(&mpi_stat, 0, sizeof(mpi_stat));
+
+                /* Issue read */
+                if (MPI_SUCCESS !=
+                    (mpi_code = MPI_File_read_at(file->f, mpi_off, bufs[i], size_i, buf_type, &mpi_stat)))
+
+                    HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+
+                /* How many bytes were actually read? */
+#if MPI_VERSION >= 3
+                if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, MPI_BYTE, &bytes_read)))
+#else
+                if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+#endif
+                    HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+
+                /* Compute the actual number of bytes requested */
+#if MPI_VERSION >= 3
+                io_size = (MPI_Count)size;
+#else
+                io_size = (int)size;
+#endif
+
+                /* Check for read failure */
+                if (bytes_read < 0 || bytes_read > io_size)
+                    HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed")
+
+                /*
+                 * If we didn't read the entire I/O, fill in zeroes beyond end of
+                 * the physical MPI file and don't issue any more reads at higher
+                 * addresses.
+                 */
+                if ((n = (io_size - bytes_read)) > 0) {
+                    HDmemset((char *)bufs[i] + bytes_read, 0, (size_t)n);
+                    max_addr = addrs[i] + (haddr_t)bytes_read;
+                }
+            }
+            else {
+                /* Read is past the max address, fill in zeroes */
+                HDmemset((char *)bufs[i], 0, size);
+            }
+        }
+    }
+
+done:
+    if (buf_type_created) {
+        MPI_Type_free(&buf_type);
+    }
+
+    if (file_type_created) {
+        MPI_Type_free(&file_type);
+    }
+
+    /* free sorted vectors if they exist */
+    if (!vector_was_sorted) {
+        if (s_addrs) {
+            HDfree(s_addrs);
+            s_addrs = NULL;
+        }
+        if (s_sizes) {
+            HDfree(s_sizes);
+            s_sizes = NULL;
+        }
+        if (s_bufs) {
+            HDfree(s_bufs);
+            s_bufs = NULL;
+        }
+    }
+
+#ifdef H5FDmpio_DEBUG
+    if (H5FD_mpio_debug_t_flag)
+        HDfprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", __func__, file->mpi_rank, ret_value);
+#endif
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5FD__mpio_read_vector() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__mpio_write_vector
+ *
+ * Purpose:     The behaviour of this function depends on the value of
+ *              the io_xfer_mode obtained from the context.
+ *
+ *              If it is H5FD_MPIO_COLLECTIVE, this is a collective
+ *              operation, which allows us to use MPI_File_set_view, and
+ *              then perform the entire vector write in a single MPI call.
+ *
+ *              Do this (if count is positive) by constructing memory
+ *              and file derived types from the supplied vector, using
+ *              the file type to set the file view, and then writing the
+ *              memory type to file.  Note that this write is
+ *              either independent or collective depending on the
+ *              value of mpio_coll_opt -- again obtained from the context.
+ *
+ *              If count is zero, participate in the collective write
+ *              (if so configured) with an empty write.
+ *
+ *              Finally, set the file view back to its default state.
+ *
+ *              In contrast, if io_xfer_mode is H5FD_MPIO_INDEPENDENT,
+ *              this call is independent, and thus we cannot use
+ *              MPI_File_set_view().
+ *
+ *              In this case, simply walk the vector, and issue an
+ *              independent write for each entry.
+ *
+ * Return:      Success:    SUCCEED.
+ *              Failure:    FAIL.
+ *
+ * Programmer:  John Mainzer
+ *              March 15, 2021
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[],
+                        haddr_t addrs[], size_t sizes[], const void *bufs[])
+{
+    H5FD_mpio_t *              file              = (H5FD_mpio_t *)_file;
+    hbool_t                    vector_was_sorted = TRUE;
+    haddr_t *                  s_addrs           = NULL;
+    size_t *                   s_sizes           = NULL;
+    const void **              s_bufs            = NULL;
+    char                       unused            = 0; /* Unused, except for non-NULL pointer value */
+    const void *               mpi_bufs_base     = NULL;
+    MPI_Datatype               buf_type          = MPI_BYTE; /* MPI description of the selection in memory */
+    hbool_t                    buf_type_created  = FALSE;
+    MPI_Datatype               file_type         = MPI_BYTE; /* MPI description of the selection in file */
+    hbool_t                    file_type_created = FALSE;
+    int                        i;
+    int                        mpi_code; /* MPI return code */
+    MPI_Offset                 mpi_off = 0;
+    MPI_Status                 mpi_stat;      /* Status from I/O operation */
+    H5FD_mpio_xfer_t           xfer_mode;     /* I/O transfer mode */
+    H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */
+    int                        size_i;
+#ifdef H5FDmpio_DEBUG
+    hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+    hbool_t H5FD_mpio_debug_w_flag = (H5FD_mpio_debug_flags_s[(int)'w'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+#endif
+    haddr_t max_addr  = 0;
+    herr_t  ret_value = SUCCEED;
+
+    FUNC_ENTER_STATIC
+
+#ifdef H5FDmpio_DEBUG
+    if (H5FD_mpio_debug_t_flag)
+        HDfprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
+#endif
+
+    /* Sanity checks */
+    HDassert(file);
+    HDassert(H5FD_MPIO == file->pub.driver_id);
+    HDassert((types) || (count == 0));
+    HDassert((addrs) || (count == 0));
+    HDassert((sizes) || (count == 0));
+    HDassert((bufs) || (count == 0));
+
+    /* verify that the first elements of the sizes and types arrays are
+     * valid.
+     */
+    HDassert((count == 0) || (sizes[0] != 0));
+    HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST));
+
+    /* Verify that no data is written between the MPI_Barrier()s during a file flush */
+    HDassert(!H5CX_get_mpi_file_flushing());
+
+    /* Get the transfer mode from the API context
+     *
+     * This flag is set to H5FD_MPIO_COLLECTIVE if the API call is
+     * collective, and to H5FD_MPIO_INDEPENDENT if it is not.
+     *
+     * While this doesn't mean that we are actually about to do a collective
+     * write, it does mean that all ranks are here, so we can use MPI_File_set_view().
+     */
+    if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
+        HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
+
+    if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
+        /* Build MPI types, etc. */
+        if (H5FD__mpio_vector_build_types(count, types, addrs, sizes, (H5_flexible_const_ptr_t *)bufs,
+                                          &s_addrs, &s_sizes, (H5_flexible_const_ptr_t **)&s_bufs,
+                                          &vector_was_sorted, &mpi_off,
+                                          (H5_flexible_const_ptr_t *)&mpi_bufs_base, &size_i, &buf_type,
+                                          &buf_type_created, &file_type, &file_type_created, &unused) < 0)
+            HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't build MPI datatypes for I/O")
+
+        /* Compute max addr written to */
+        if (count > 0)
+            max_addr = s_addrs[count - 1] + (haddr_t)(s_sizes[count - 1]);
+
+        /* free sorted vectors if they exist */
+        if (!vector_was_sorted) {
+            if (s_addrs) {
+                HDfree(s_addrs);
+                s_addrs = NULL;
+            }
+            if (s_sizes) {
+                HDfree(s_sizes);
+                s_sizes = NULL;
+            }
+            if (s_bufs) {
+                HDfree(s_bufs);
+                s_bufs = NULL;
+            }
+        }
+
+        /* Portably initialize MPI status variable */
+        HDmemset(&mpi_stat, 0, sizeof(MPI_Status));
+
+#ifdef H5FDmpio_DEBUG
+        if (H5FD_mpio_debug_w_flag)
+            HDfprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", __func__, (long)mpi_off, size_i);
+#endif
+
+        /* Setup the file view. */
+        if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type,
+                                                         H5FD_mpi_native_g, file->info)))
+            HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+
+        /* Reset mpi_off to 0 since the view now starts at the data offset */
+        if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0)
+            HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0")
+
+        /* Get the collective_opt property to check whether the application wants to do IO individually. */
+        if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
+            HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property")
+
+        /* Write the data. */
+#ifdef H5FDmpio_DEBUG
+        if (H5FD_mpio_debug_w_flag)
+            HDfprintf(stdout, "%s: using MPIO collective mode\n", __func__);
+#endif
+
+        if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
+#ifdef H5FDmpio_DEBUG
+            if (H5FD_mpio_debug_w_flag)
+                HDfprintf(stdout, "%s: doing MPI collective IO\n", __func__);
+#endif
+
+            if (MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, mpi_bufs_base, size_i,
+                                                                 buf_type, &mpi_stat)))
+                HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code)
+        } /* end if */
+        else if (size_i > 0) {
+#ifdef H5FDmpio_DEBUG
+            if (H5FD_mpio_debug_w_flag)
+                HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__);
+#endif
+
+            if (MPI_SUCCESS !=
+                (mpi_code = MPI_File_write_at(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat)))
+                HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
+        } /* end else */
+
+        /* Reset the file view */
+        if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
+                                                         H5FD_mpi_native_g, file->info)))
+            HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+    }
+    else if (count > 0) {
+        hbool_t fixed_size = FALSE;
+        size_t  size;
+
+        /* The write is part of an independent operation.  As a result,
+         * we can't use MPI_File_set_view() (since it is a collective operation),
+         * and thus we can't use the above code to construct the MPI datatypes.
+         * In the future, we could write code to detect when a contiguous slab
+         * in the file selection spans multiple vector elements and construct a
+         * memory datatype to match this larger block in the file, but for now
+         * just write each element of the vector in a separate
+         * MPI_File_write_at() call.
+         *
+         * We could also just detect the case when the entire file selection is
+         * contiguous, which would allow us to use
+         * H5FD__mpio_vector_build_types() to construct the memory datatype.
+         */
+
+#ifdef H5FDmpio_DEBUG
+        if (H5FD_mpio_debug_w_flag)
+            HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__);
+#endif
+
+        /* Loop over vector elements */
+        for (i = 0; i < (int)count; i++) {
+            /* Convert address to mpi offset */
+            if (H5FD_mpi_haddr_to_MPIOff(addrs[i], &mpi_off) < 0)
+                HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
+
+            /* Calculate I/O size */
+            if (!fixed_size) {
+                if (sizes[i] == 0) {
+                    fixed_size = TRUE;
+                    size       = sizes[i - 1];
+                }
+                else {
+                    size = sizes[i];
+                }
+            }
+            size_i = (int)size;
+
+            if (size != (size_t)size_i) {
+                /* If we are here, then we need to work around the integer size limit
+                 * of 2GB.  The input size_t size variable cannot fit into an integer,
+                 * but we can get around that limitation by creating a different datatype
+                 * and then setting the integer size (or element count) to 1 when using
+                 * the derived_type.
+                 */
+
+                if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
+                    HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+
+                buf_type_created = TRUE;
+                size_i           = 1;
+            }
+
+            /* Perform write */
+            if (MPI_SUCCESS !=
+                (mpi_code = MPI_File_write_at(file->f, mpi_off, bufs[i], size_i, buf_type, &mpi_stat)))
+
+                HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
+
+            /* Check if this is the highest address written to so far */
+            if (addrs[i] + size > max_addr)
+                max_addr = addrs[i] + size;
+        }
+    }
+
+    /* Each process will keep track of its perceived EOF value locally, and
+     * ultimately we will reduce this value to the maximum amongst all
+     * processes, but until then keep the actual eof at HADDR_UNDEF just in
+     * case something bad happens before that point.  (better to have a value
+     * we know is wrong sitting around than one that could only potentially
+     * be wrong.)
+     */
+    file->eof = HADDR_UNDEF;
+
+    /* check to see if the local eof has been extended, and update if so */
+    if (max_addr > file->local_eof)
+        file->local_eof = max_addr;
+
+done:
+    if (buf_type_created)
+        MPI_Type_free(&buf_type);
+
+    if (file_type_created)
+        MPI_Type_free(&file_type);
+
+    /* Cleanup on error */
+    if (ret_value < 0 && !vector_was_sorted) {
+        if (s_addrs) {
+            HDfree(s_addrs);
+            s_addrs = NULL;
+        }
+        if (s_sizes) {
+            HDfree(s_sizes);
+            s_sizes = NULL;
+        }
+        if (s_bufs) {
+            HDfree(s_bufs);
+            s_bufs = NULL;
+        }
+    }
+
+    /* Make sure we cleaned up */
+    HDassert(vector_was_sorted || !s_addrs);
+    HDassert(vector_was_sorted || !s_sizes);
+    HDassert(vector_was_sorted || !s_bufs);
+
+#ifdef H5FDmpio_DEBUG
+    if (H5FD_mpio_debug_t_flag)
+        HDfprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", __func__, file->mpi_rank, ret_value);
+#endif
+
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__mpio_write_vector() */
+
+/*-------------------------------------------------------------------------
  * Function:    H5FD__mpio_flush
  *
  * Purpose:     Makes sure that all data is on disk.  This is collective.
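
From the caller's side, these callbacks are reached through the VFD dispatch layer: a single call passes parallel arrays of memory types, file addresses, sizes, and buffers. A hedged sketch of a vector write (this assumes the public H5FDwrite_vector() wrapper that exposes the callback above, an already-opened H5FD_t, and invented file offsets):

    #include "hdf5.h"

    /* Sketch only: error checks elided, offsets invented for illustration */
    static herr_t
    write_two_blocks(H5FD_t *file, hid_t dxpl_id)
    {
        H5FD_mem_t  types[2] = {H5FD_MEM_DRAW, H5FD_MEM_DRAW};
        haddr_t     addrs[2] = {0, 4096};
        int         a[1024]  = {0};
        int         b[1024]  = {0};
        size_t      sizes[2] = {sizeof(a), sizeof(b)};
        const void *bufs[2]  = {a, b};

        /* One call covers both blocks; under H5FD_MPIO_COLLECTIVE the mpio
         * driver above turns this into a single MPI_File_write_at_all() */
        return H5FDwrite_vector(file, dxpl_id, (uint32_t)2, types, addrs, sizes, bufs);
    }

Note the convention visible in the fixed_size logic above: a 0 entry in sizes[] means every remaining element reuses the previous size, so uniform vectors can be described with a short sizes array.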