diff options
Diffstat (limited to 'src/H5FDmpio.c')
-rw-r--r-- | src/H5FDmpio.c | 908 |
1 files changed, 870 insertions, 38 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index b5c2684..3ec6802 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -84,6 +84,10 @@ static herr_t H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, h void *buf); static herr_t H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, const void *buf); +static herr_t H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, + H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], void *bufs[]); +static herr_t H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, + H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], void *bufs[]); static herr_t H5FD__mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); static herr_t H5FD__mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); static int H5FD__mpio_mpi_rank(const H5FD_t *_file); @@ -94,44 +98,44 @@ static MPI_Comm H5FD__mpio_communicator(const H5FD_t *_file); static const H5FD_class_mpi_t H5FD_mpio_g = { { /* Start of superclass information */ - "mpio", /*name */ - HADDR_MAX, /*maxaddr */ - H5F_CLOSE_SEMI, /*fc_degree */ - H5FD__mpio_term, /*terminate */ - NULL, /*sb_size */ - NULL, /*sb_encode */ - NULL, /*sb_decode */ - 0, /*fapl_size */ - NULL, /*fapl_get */ - NULL, /*fapl_copy */ - NULL, /*fapl_free */ - 0, /*dxpl_size */ - NULL, /*dxpl_copy */ - NULL, /*dxpl_free */ - H5FD__mpio_open, /*open */ - H5FD__mpio_close, /*close */ - NULL, /*cmp */ - H5FD__mpio_query, /*query */ - NULL, /*get_type_map */ - NULL, /*alloc */ - NULL, /*free */ - H5FD__mpio_get_eoa, /*get_eoa */ - H5FD__mpio_set_eoa, /*set_eoa */ - H5FD__mpio_get_eof, /*get_eof */ - H5FD__mpio_get_handle, /*get_handle */ - H5FD__mpio_read, /*read */ - H5FD__mpio_write, /*write */ - NULL, /*read_vector */ - NULL, /*write_vector */ - H5FD__mpio_flush, /*flush */ - H5FD__mpio_truncate, /*truncate */ - NULL, /*lock */ - NULL, /*unlock */ - H5FD_FLMAP_DICHOTOMY /*fl_map */ - }, /* End of superclass information */ - H5FD__mpio_mpi_rank, /*get_rank */ - H5FD__mpio_mpi_size, /*get_size */ - H5FD__mpio_communicator /*get_comm */ + "mpio", /*name */ + HADDR_MAX, /*maxaddr */ + H5F_CLOSE_SEMI, /*fc_degree */ + H5FD__mpio_term, /*terminate */ + NULL, /*sb_size */ + NULL, /*sb_encode */ + NULL, /*sb_decode */ + 0, /*fapl_size */ + NULL, /*fapl_get */ + NULL, /*fapl_copy */ + NULL, /*fapl_free */ + 0, /*dxpl_size */ + NULL, /*dxpl_copy */ + NULL, /*dxpl_free */ + H5FD__mpio_open, /*open */ + H5FD__mpio_close, /*close */ + NULL, /*cmp */ + H5FD__mpio_query, /*query */ + NULL, /*get_type_map */ + NULL, /*alloc */ + NULL, /*free */ + H5FD__mpio_get_eoa, /*get_eoa */ + H5FD__mpio_set_eoa, /*set_eoa */ + H5FD__mpio_get_eof, /*get_eof */ + H5FD__mpio_get_handle, /*get_handle */ + H5FD__mpio_read, /*read */ + H5FD__mpio_write, /*write */ + H5FD__mpio_read_vector, /*read_vector */ + H5FD__mpio_write_vector, /*write_vector */ + H5FD__mpio_flush, /*flush */ + H5FD__mpio_truncate, /*truncate */ + NULL, /*lock */ + NULL, /*unlock */ + H5FD_FLMAP_DICHOTOMY /*fl_map */ + }, /* End of superclass information */ + H5FD__mpio_mpi_rank, /*get_rank */ + H5FD__mpio_mpi_size, /*get_size */ + H5FD__mpio_communicator /*get_comm */ }; #ifdef H5FDmpio_DEBUG @@ -1521,6 +1525,834 @@ done: } /* end H5FD__mpio_write() */ /*------------------------------------------------------------------------- + * Function: H5FD__mpio_read_vector() + * + * Purpose: The behaviour of this function dependes on the value of + * the io_xfer_mode obtained from the context. + * + * If it is H5FD_MPIO_COLLECTIVE, this is a collective + * operation, which allows us to use MPI_File_set_view, and + * then perform the entire vector read in a single MPI call. + * + * Do this (if count is positive), by constructing memory + * and file derived types from the supplied vector, using + * file type to set the file view, and then reading the + * the memory type from file. Note that this read is + * either independent or collective depending on the + * value of mpio_coll_opt -- again obtained from the context. + * + * If count is zero, participate in the collective read + * (if so configured) with an empty read. + * + * Finally, set the file view back to its default state. + * + * In contrast, if io_xfer_mode is H5FD_MPIO_INDEPENDENT, + * this call is independent, and thus we cannot use + * MPI_File_set_view(). + * + * In this case, simply walk the vector, and issue an + * independent read for each entry. + * + * WARNING: At present, this function makes no provision + * entries of size greater than 2 GB in the vector. This + * will have to be fixed before release. + * + * Return: Success: SUCCEED. + * Failure: FAIL. + * + * Programmer: John Mainzer + * March 15, 2021 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[], + haddr_t addrs[], size_t sizes[], void *bufs[]) +{ + H5FD_mpio_t * file = (H5FD_mpio_t *)_file; + hbool_t vector_was_sorted = TRUE; + hbool_t fixed_size = FALSE; + size_t size; + H5FD_mem_t * s_types = NULL; + haddr_t * s_addrs = NULL; + size_t * s_sizes = NULL; + void ** s_bufs = NULL; + int * mpi_block_lengths = NULL; + char unused = 0; /* Unused, except for non-NULL pointer value */ + void * mpi_bufs_base = NULL; + MPI_Aint mpi_bufs_base_Aint; + MPI_Aint * mpi_bufs = NULL; + MPI_Aint * mpi_displacments = NULL; + MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */ + hbool_t buf_type_created = FALSE; + MPI_Datatype file_type = MPI_BYTE; /* MPI description of the selection in file */ + hbool_t file_type_created = FALSE; + int i; + int j; + int mpi_code; /* MPI return code */ + MPI_Offset mpi_off = 0; + MPI_Status mpi_stat; /* Status from I/O operation */ + H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */ + H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */ + int size_i; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'t']) + HDfprintf(stdout, "%s: Entering\n", FUNC); +#endif + + /* Sanity checks */ + HDassert(file); + HDassert(H5FD_MPIO == file->pub.driver_id); + HDassert((types) || (count == 0)); + HDassert((addrs) || (count == 0)); + HDassert((sizes) || (count == 0)); + HDassert((bufs) || (count == 0)); + + /* verify that the first elements of the sizes and types arrays are + * valid. + */ + HDassert((count == 0) || (sizes[0] != 0)); + HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST)); + + + /* sort the vector I/O request into increasing address order if required + * + * If the vector is already sorted, the base addresses of types, addrs, sizes, + * and bufs will be returned in s_types, s_addrs, s_sizes, and s_bufs respectively. + * + * If the vector was not already sorted, new, sorted versions of types, addrs, sizes, and bufs + * are allocated, populated, and returned in s_types, s_addrs, s_sizes, and s_bufs respectively. + * In this case, this function must free the memory allocated for the sorted vectors. + */ + if (H5FD_sort_vector_io_req(&vector_was_sorted, count, types, addrs, sizes, bufs, &s_types, &s_addrs, + &s_sizes, &s_bufs) < 0) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "can't sort vector I/O request") + + + /* Get the transfer mode from the API context + * + * This flag is set to H5FD_MPIO_COLLECTIVE if the API call is + * collective, and to H5FD_MPIO_INDEPENDENT if it is not. + * + * While this doesn't mean that we are actually about to do a collective + * read, it does mean that all ranks are here, so we can use MPI_File_set_view(). + */ + if (H5CX_get_io_xfer_mode(&xfer_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode") + + if (xfer_mode == H5FD_MPIO_COLLECTIVE) { + + if (count > 0) { /* create MPI derived types describing the vector write */ + + if ((NULL == (mpi_block_lengths = (int *)HDmalloc((size_t)count * sizeof(int)))) || + (NULL == (mpi_displacments = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint)))) || + (NULL == (mpi_bufs = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint))))) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc mpi block lengths / displacement") + } + + /* when we setup mpi_bufs[] below, all addresses are offsets from + * mpi_bufs_base. + * + * Since these offsets must all be positive, we must scan through + * s_bufs[] to find the smallest value, and choose that for + * mpi_bufs_base. + */ + + j = 0; /* guess at the index of the smallest value of s_bufs[] */ + + for (i = 1; i < (int)count; i++) { + + if (s_bufs[i] < s_bufs[j]) { + + j = i; + } + } + + mpi_bufs_base = s_bufs[j]; + + if (MPI_SUCCESS != (mpi_code = MPI_Get_address(mpi_bufs_base, &mpi_bufs_base_Aint))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] to mpi_bufs_base failed", mpi_code) + + size_i = 1; + + fixed_size = FALSE; + + /* load the mpi_block_lengths and mpi_displacements arrays */ + for (i = 0; i < (int)count; i++) { + + if (!fixed_size) { + + if (sizes[i] == 0) { + + fixed_size = TRUE; + size = sizes[i - 1]; + } + else { + + size = s_sizes[i]; + } + } + + /* There is an obvious possibility of an overflow here, as size_t + * will typically be 64 bits, where as int will typically be 32 bits. + * This must be fixed, but it should be good enough for initial + * correctness testing. + * JRM -- 3/17/21 + */ + mpi_block_lengths[i] = (int)size; + mpi_displacments[i] = (MPI_Aint)s_addrs[i]; + + /* convert s_bufs[i] to MPI_Aint... */ + if (MPI_SUCCESS != (mpi_code = MPI_Get_address(s_bufs[i], &(mpi_bufs[i])))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] - mpi_bufs_base failed", mpi_code) + + /*... and then subtract mpi_bufs_base_Aint from it. */ +#if ((MPI_VERSION > 3) || ((MPI_VERSION == 3) && (MPI_SUBVERSION >= 1))) + mpi_bufs[i] = MPI_Aint_diff(mpi_bufs[i], mpi_bufs_base_Aint); +#else + mpi_bufs[i] = mpi_bufs[i] - mpi_bufs_base_Aint; +#endif + } + + + /* create the memory MPI derived types */ + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, mpi_bufs, + MPI_BYTE, &buf_type))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for buf_type failed", mpi_code) + + buf_type_created = TRUE; + + if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&buf_type))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for buf_type failed", mpi_code) + + + /* create the file MPI derived type */ + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, + mpi_displacments, MPI_BYTE, &file_type))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for file_type failed", mpi_code) + + file_type_created = TRUE; + + if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&file_type))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for file_type failed", mpi_code) + } + else { + + /* setup for null participation in the collective operation. */ + + buf_type = MPI_BYTE; + file_type = MPI_BYTE; + + /* Set non-NULL pointer for I/O operation */ + mpi_bufs_base = (void *)(&unused); + + /* MPI count to read */ + size_i = 0; + } + + /* Portably initialize MPI status variable */ + HDmemset(&mpi_stat, 0, sizeof(MPI_Status)); + + /* some numeric conversions */ + if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0") + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'r']) + HDfprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", FUNC, (long)mpi_off, size_i); +#endif + + /* Setup the file view. */ + if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, + H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + + /* Get the collective_opt property to check whether the application wants to do IO individually. + */ + if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0) + + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property") + + /* Read the data. */ +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'r']) + HDfprintf(stdout, "%s: using MPIO collective mode\n", FUNC); +#endif + + if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'r']) + HDfprintf(stdout, "%s: doing MPI collective IO\n", FUNC); +#endif + + if (MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, mpi_bufs_base, size_i, + buf_type, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code) + } /* end if */ + else if (size_i > 0) { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'r']) + HDfprintf(stdout, "%s: doing MPI independent IO\n", FUNC); +#endif + + if (MPI_SUCCESS != + (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat))) + + HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code) + + } /* end else */ + + /* Reset the file view */ + if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, + H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + } + else if (count > 0) { + + /* The read is part of an independent operation. As a result, + * we can't use MPI_File_set_view() (since it it a collective operation), + * and thus there is no point in setting up an MPI derived type, as + * (to the best of my knowlege) MPI I/O doesn't have support for + * non-contiguous I/O in independent mode. + * + * Thus we have to read in each element of the vector in a separate + * MPI_File_read_at() call. + */ + + fixed_size = FALSE; + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'r']) + HDfprintf(stdout, "%s: doing MPI independent IO\n", FUNC); +#endif + + for (i = 0; i < (int)count; i++) { + + if (H5FD_mpi_haddr_to_MPIOff(s_addrs[i], &mpi_off) < 0) + + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off") + + if (!fixed_size) { + + if (sizes[i] == 0) { + + fixed_size = TRUE; + size = sizes[i - 1]; + } + else { + + size = s_sizes[i]; + } + } + + size_i = (int)size; /* todo: fix potential for overflow */ + + if (MPI_SUCCESS != + (mpi_code = MPI_File_read_at(file->f, mpi_off, s_bufs[i], size_i, MPI_BYTE, &mpi_stat))) + + HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code) + } + } + +done: + + if (!vector_was_sorted) { /* free sorted vectors if they exist */ + + if (s_types) { + + HDfree(s_types); + s_types = NULL; + } + + if (s_addrs) { + + HDfree(s_addrs); + s_addrs = NULL; + } + + if (s_sizes) { + + HDfree(s_sizes); + s_sizes = NULL; + } + + if (s_bufs) { + + HDfree(s_bufs); + s_bufs = NULL; + } + } + + if (mpi_block_lengths) { + + HDfree(mpi_block_lengths); + mpi_block_lengths = NULL; + } + + if (mpi_displacments) { + + HDfree(mpi_displacments); + mpi_displacments = NULL; + } + + if (mpi_bufs) { + + HDfree(mpi_bufs); + mpi_bufs = NULL; + } + + if (buf_type_created) { + MPI_Type_free(&buf_type); + } + + if (file_type_created) { + MPI_Type_free(&file_type); + } + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'t']) + HDfprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", FUNC, file->mpi_rank, ret_value); +#endif + + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FD__mpio_read_vector() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__mpio_write_vector + * + * Purpose: The behaviour of this function dependes on the value of + * the io_xfer_mode obtained from the context. + * + * If it is H5FD_MPIO_COLLECTIVE, this is a collective + * operation, which allows us to use MPI_File_set_view, and + * then perform the entire vector write in a single MPI call. + * + * Do this (if count is positive), by constructing memory + * and file derived types from the supplied vector, using + * file type to set the file view, and then writing the + * the memory type to file. Note that this write is + * either independent or collective depending on the + * value of mpio_coll_opt -- again obtained from the context. + * + * If count is zero, participate in the collective write + * (if so configured) with an empty write. + * + * Finally, set the file view back to its default state. + * + * In contrast, if io_xfer_mode is H5FD_MPIO_INDEPENDENT, + * this call is independent, and thus we cannot use + * MPI_File_set_view(). + * + * In this case, simply walk the vector, and issue an + * independent write for each entry. + * + * WARNING: At present, this function makes no provision + * entries of size greater than 2 GB in the vector. This + * will have to be fixed before release. + * + * Return: Success: SUCCEED. + * Failure: FAIL. + * + * Programmer: John Mainzer + * March 15, 2021 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[], + haddr_t addrs[], size_t sizes[], void *bufs[]) +{ + H5FD_mpio_t * file = (H5FD_mpio_t *)_file; + hbool_t vector_was_sorted = TRUE; + hbool_t fixed_size = FALSE; + size_t size; + H5FD_mem_t * s_types = NULL; + haddr_t * s_addrs = NULL; + size_t * s_sizes = NULL; + void ** s_bufs = NULL; + int * mpi_block_lengths = NULL; + char unused = 0; /* Unused, except for non-NULL pointer value */ + void * mpi_bufs_base = NULL; + MPI_Aint mpi_bufs_base_Aint; + MPI_Aint * mpi_bufs = NULL; + MPI_Aint * mpi_displacments = NULL; + MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */ + hbool_t buf_type_created = FALSE; + MPI_Datatype file_type = MPI_BYTE; /* MPI description of the selection in file */ + hbool_t file_type_created = FALSE; + int i; + int j; + int mpi_code; /* MPI return code */ + MPI_Offset mpi_off = 0; + MPI_Status mpi_stat; /* Status from I/O operation */ + H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */ + H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */ + int size_i; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'t']) + HDfprintf(stdout, "%s: Entering\n", FUNC); +#endif + + /* Sanity checks */ + HDassert(file); + HDassert(H5FD_MPIO == file->pub.driver_id); + HDassert((types) || (count == 0)); + HDassert((addrs) || (count == 0)); + HDassert((sizes) || (count == 0)); + HDassert((bufs) || (count == 0)); + + /* verify that the first elements of the sizes and types arrays are + * valid. + */ + HDassert((count == 0) || (sizes[0] != 0)); + HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST)); + + + /* Verify that no data is written when between MPI_Barrier()s during file flush */ + + HDassert(!H5CX_get_mpi_file_flushing()); + + /* sort the vector I/O request into increasing address order if required + * + * If the vector is already sorted, the base addresses of types, addrs, sizes, + * and bufs will be returned in s_types, s_addrs, s_sizes, and s_bufs respectively. + * + * If the vector was not already sorted, new, sorted versions of types, addrs, sizes, and bufs + * are allocated, populated, and returned in s_types, s_addrs, s_sizes, and s_bufs respectively. + * In this case, this function must free the memory allocated for the sorted vectors. + */ + if (H5FD_sort_vector_io_req(&vector_was_sorted, count, types, addrs, sizes, bufs, &s_types, &s_addrs, + &s_sizes, &s_bufs) < 0) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "can't sort vector I/O request") + + + /* Get the transfer mode from the API context + * + * This flag is set to H5FD_MPIO_COLLECTIVE if the API call is + * collective, and to H5FD_MPIO_INDEPENDENT if it is not. + * + * While this doesn't mean that we are actually about to do a collective + * write, it does mean that all ranks are here, so we can use MPI_File_set_view(). + */ + if (H5CX_get_io_xfer_mode(&xfer_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode") + + if (xfer_mode == H5FD_MPIO_COLLECTIVE) { + + if (count > 0) { /* create MPI derived types describing the vector write */ + + if ((NULL == (mpi_block_lengths = (int *)HDmalloc((size_t)count * sizeof(int)))) || + (NULL == (mpi_displacments = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint)))) || + (NULL == (mpi_bufs = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint))))) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc mpi block lengths / displacement") + } + + /* when we setup mpi_bufs[] below, all addresses are offsets from + * mpi_bufs_base. + * + * Since these offsets must all be positive, we must scan through + * s_bufs[] to find the smallest value, and choose that for + * mpi_bufs_base. + */ + + j = 0; /* guess at the index of the smallest value of s_bufs[] */ + + for (i = 1; i < (int)count; i++) { + + if (s_bufs[i] < s_bufs[j]) { + + j = i; + } + } + + mpi_bufs_base = s_bufs[j]; + + if (MPI_SUCCESS != (mpi_code = MPI_Get_address(mpi_bufs_base, &mpi_bufs_base_Aint))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] to mpi_bufs_base failed", mpi_code) + + size_i = 1; + + fixed_size = FALSE; + + /* load the mpi_block_lengths and mpi_displacements arrays */ + for (i = 0; i < (int)count; i++) { + + if (!fixed_size) { + + if (sizes[i] == 0) { + + fixed_size = TRUE; + size = sizes[i - 1]; + } + else { + + size = s_sizes[i]; + } + } + + /* There is an obvious possibility of an overflow here, as size_t + * will typically be 64 bits, where as int will typically be 32 bits. + * This must be fixed, but it should be good enough for initial + * correctness testing. + * JRM -- 3/17/21 + */ + mpi_block_lengths[i] = (int)size; + mpi_displacments[i] = (MPI_Aint)s_addrs[i]; + + /* convert s_bufs[i] to MPI_Aint... */ + if (MPI_SUCCESS != (mpi_code = MPI_Get_address(s_bufs[i], &(mpi_bufs[i])))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] - mpi_bufs_base failed", mpi_code) + + /*... and then subtract mpi_bufs_base_Aint from it. */ +#if ((MPI_VERSION > 3) || ((MPI_VERSION == 3) && (MPI_SUBVERSION >= 1))) + mpi_bufs[i] = MPI_Aint_diff(mpi_bufs[i], mpi_bufs_base_Aint); +#else + mpi_bufs[i] = mpi_bufs[i] - mpi_bufs_base_Aint; +#endif + } + + + /* create the memory MPI derived types */ + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, mpi_bufs, + MPI_BYTE, &buf_type))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for buf_type failed", mpi_code) + + buf_type_created = TRUE; + + if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&buf_type))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for buf_type failed", mpi_code) + + /* create the file MPI derived type */ + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, + mpi_displacments, MPI_BYTE, &file_type))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for file_type failed", mpi_code) + + file_type_created = TRUE; + + if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&file_type))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for file_type failed", mpi_code) + } + else { + + /* setup for null participation in the collective operation. */ + + buf_type = MPI_BYTE; + file_type = MPI_BYTE; + + /* Set non-NULL pointer for I/O operation */ + mpi_bufs_base = (void *)(&unused); + + /* MPI count to write */ + size_i = 0; + } + + /* Portably initialize MPI status variable */ + HDmemset(&mpi_stat, 0, sizeof(MPI_Status)); + + /* some numeric conversions */ + if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0") + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'w']) + HDfprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", FUNC, (long)mpi_off, size_i); +#endif + + /* Setup the file view. */ + if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, + H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + + /* Get the collective_opt property to check whether the application wants to do IO individually. + */ + if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0) + + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property") + + /* Write the data. */ +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'w']) + HDfprintf(stdout, "%s: using MPIO collective mode\n", FUNC); +#endif + + if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'w']) + HDfprintf(stdout, "%s: doing MPI collective IO\n", FUNC); +#endif + + if (MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, mpi_bufs_base, size_i, + buf_type, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code) + } /* end if */ + else if (size_i > 0) { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'w']) + HDfprintf(stdout, "%s: doing MPI independent IO\n", FUNC); +#endif + + if (MPI_SUCCESS != + (mpi_code = MPI_File_write_at(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code) + } /* end else */ + + /* Reset the file view */ + if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, + H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + } + else if (count > 0) { + + /* The write is part of an independent operation. As a result, + * we can't use MPI_File_set_view() (since it it a collective operation), + * and thus there is no point in setting up an MPI derived type, as + * (to the best of my knowlege) MPI I/O doesn't have support for + * non-contiguous I/O in independent mode. + * + * Thus we have to write out each element of the vector in a separate + * MPI_File_write_at() call. + */ + + fixed_size = FALSE; + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'w']) + HDfprintf(stdout, "%s: doing MPI independent IO\n", FUNC); +#endif + + for (i = 0; i < (int)count; i++) { + + if (H5FD_mpi_haddr_to_MPIOff(s_addrs[i], &mpi_off) < 0) + + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off") + + if (!fixed_size) { + + if (sizes[i] == 0) { + + fixed_size = TRUE; + size = sizes[i - 1]; + } + else { + + size = s_sizes[i]; + } + } + + size_i = (int)size; /* todo: fix potential for overflow */ + + if (MPI_SUCCESS != + (mpi_code = MPI_File_write_at(file->f, mpi_off, s_bufs[i], size_i, MPI_BYTE, &mpi_stat))) + + HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code) + } + } + + /* Each process will keep track of its perceived EOF value locally, and + * ultimately we will reduce this value to the maximum amongst all + * processes, but until then keep the actual eof at HADDR_UNDEF just in + * case something bad happens before that point. (rather have a value + * we know is wrong sitting around rather than one that could only + * potentially be wrong.) + */ + file->eof = HADDR_UNDEF; + + /* check to see if the local eof has changed been extended, and update if so. + * Since the vector write request has been sorted in increasing address order, + * we need only look at the address and size of the last element in the vector. + */ + if ((count > 0) && ((s_addrs[count - 1] + (haddr_t)(s_sizes[count - 1])) > file->local_eof)) { + + file->local_eof = (s_addrs[count - 1] + (haddr_t)(s_sizes[count - 1])); + } + +done: + + if (!vector_was_sorted) { /* free sorted vectors if they exist */ + + if (s_types) { + + HDfree(s_types); + s_types = NULL; + } + + if (s_addrs) { + + HDfree(s_addrs); + s_addrs = NULL; + } + + if (s_sizes) { + + HDfree(s_sizes); + s_sizes = NULL; + } + + if (s_bufs) { + + HDfree(s_bufs); + s_bufs = NULL; + } + } + + if (mpi_block_lengths) { + + HDfree(mpi_block_lengths); + mpi_block_lengths = NULL; + } + + if (mpi_displacments) { + + HDfree(mpi_displacments); + mpi_displacments = NULL; + } + + if (mpi_bufs) { + + HDfree(mpi_bufs); + mpi_bufs = NULL; + } + + if (buf_type_created) { + MPI_Type_free(&buf_type); + } + + if (file_type_created) { + MPI_Type_free(&file_type); + } + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_Debug[(int)'t']) + HDfprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", FUNC, file->mpi_rank, ret_value); +#endif + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__mpio_write_vector() */ + +/*------------------------------------------------------------------------- * Function: H5FD__mpio_flush * * Purpose: Makes sure that all data is on disk. This is collective. |