author    Neil Fortner <fortnern@gmail.com>  2022-03-26 19:30:53 (GMT)
committer GitHub <noreply@github.com>        2022-03-26 19:30:53 (GMT)
commit    42b767fc67ad1e13735e3cee2077f2e108f9463e (patch)
tree      42267295f94bb67dca39ba6dd2dd9d1ac89ee0bd /src/H5FDmpio.c
parent    25ef608e2f1678c04a81b11ed9443768cdbd9dbd (diff)
Merge initial version of selection I/O feature into develop (#1367)
Diffstat (limited to 'src/H5FDmpio.c')
-rw-r--r--  src/H5FDmpio.c | 1135
1 file changed, 1098 insertions, 37 deletions
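The new read_vector/write_vector callbacks added by this commit describe a request as parallel arrays of memory types, file addresses, sizes, and buffers; as the driver code below relies on, a zero entry in sizes[] means that every remaining element reuses the previous size. A minimal standalone sketch of that calling convention (plain C, hypothetical array contents and a hypothetical resolve_sizes() helper, not the HDF5 API):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Resolve the effective size of each vector element, honoring the
 * "sizes[i] == 0 repeats the previous size" shorthand used by the driver. */
static void
resolve_sizes(uint32_t count, const size_t sizes[], size_t out[])
{
    size_t cur        = 0;
    int    fixed_size = 0;

    for (uint32_t i = 0; i < count; i++) {
        if (!fixed_size) {
            if (sizes[i] == 0)
                fixed_size = 1; /* cur already holds sizes[i - 1] */
            else
                cur = sizes[i];
        }
        out[i] = cur;
    }
}

int
main(void)
{
    /* Three blocks; the trailing 0 means "same size as the previous entry" */
    uint64_t addrs[3] = {0, 4096, 8192};
    size_t   sizes[3] = {4096, 2048, 0};
    size_t   eff[3];

    resolve_sizes(3, sizes, eff);
    for (int i = 0; i < 3; i++)
        printf("block %d: addr %llu, size %zu\n", i, (unsigned long long)addrs[i], eff[i]);
    return 0;
}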
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index 4aa8a96..6b639f9 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -87,49 +87,66 @@ static herr_t H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, ha
void *buf);
static herr_t H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
const void *buf);
+static herr_t H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count,
+ H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], void *bufs[]);
+static herr_t H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count,
+ H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
+ const void *bufs[]);
static herr_t H5FD__mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
static herr_t H5FD__mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
static herr_t H5FD__mpio_delete(const char *filename, hid_t fapl_id);
static herr_t H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void *input,
void **output);
+/* Other functions */
+static herr_t H5FD__mpio_vector_build_types(
+ uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], H5_flexible_const_ptr_t bufs[],
+ haddr_t *s_addrs[], size_t *s_sizes[], H5_flexible_const_ptr_t *s_bufs[], hbool_t *vector_was_sorted,
+ MPI_Offset *mpi_off, H5_flexible_const_ptr_t *mpi_bufs_base, int *size_i, MPI_Datatype *buf_type,
+ hbool_t *buf_type_created, MPI_Datatype *file_type, hbool_t *file_type_created, char *unused);
+
/* The MPIO file driver information */
static const H5FD_class_t H5FD_mpio_g = {
- H5_VFD_MPIO, /* value */
- "mpio", /* name */
- HADDR_MAX, /* maxaddr */
- H5F_CLOSE_SEMI, /* fc_degree */
- H5FD__mpio_term, /* terminate */
- NULL, /* sb_size */
- NULL, /* sb_encode */
- NULL, /* sb_decode */
- 0, /* fapl_size */
- NULL, /* fapl_get */
- NULL, /* fapl_copy */
- NULL, /* fapl_free */
- 0, /* dxpl_size */
- NULL, /* dxpl_copy */
- NULL, /* dxpl_free */
- H5FD__mpio_open, /* open */
- H5FD__mpio_close, /* close */
- NULL, /* cmp */
- H5FD__mpio_query, /* query */
- NULL, /* get_type_map */
- NULL, /* alloc */
- NULL, /* free */
- H5FD__mpio_get_eoa, /* get_eoa */
- H5FD__mpio_set_eoa, /* set_eoa */
- H5FD__mpio_get_eof, /* get_eof */
- H5FD__mpio_get_handle, /* get_handle */
- H5FD__mpio_read, /* read */
- H5FD__mpio_write, /* write */
- H5FD__mpio_flush, /* flush */
- H5FD__mpio_truncate, /* truncate */
- NULL, /* lock */
- NULL, /* unlock */
- H5FD__mpio_delete, /* del */
- H5FD__mpio_ctl, /* ctl */
- H5FD_FLMAP_DICHOTOMY /* fl_map */
+ H5FD_CLASS_VERSION, /* struct version */
+ H5_VFD_MPIO, /* value */
+ "mpio", /* name */
+ HADDR_MAX, /* maxaddr */
+ H5F_CLOSE_SEMI, /* fc_degree */
+ H5FD__mpio_term, /* terminate */
+ NULL, /* sb_size */
+ NULL, /* sb_encode */
+ NULL, /* sb_decode */
+ 0, /* fapl_size */
+ NULL, /* fapl_get */
+ NULL, /* fapl_copy */
+ NULL, /* fapl_free */
+ 0, /* dxpl_size */
+ NULL, /* dxpl_copy */
+ NULL, /* dxpl_free */
+ H5FD__mpio_open, /* open */
+ H5FD__mpio_close, /* close */
+ NULL, /* cmp */
+ H5FD__mpio_query, /* query */
+ NULL, /* get_type_map */
+ NULL, /* alloc */
+ NULL, /* free */
+ H5FD__mpio_get_eoa, /* get_eoa */
+ H5FD__mpio_set_eoa, /* set_eoa */
+ H5FD__mpio_get_eof, /* get_eof */
+ H5FD__mpio_get_handle, /* get_handle */
+ H5FD__mpio_read, /* read */
+ H5FD__mpio_write, /* write */
+ H5FD__mpio_read_vector, /* read_vector */
+ H5FD__mpio_write_vector, /* write_vector */
+ NULL, /* read_selection */
+ NULL, /* write_selection */
+ H5FD__mpio_flush, /* flush */
+ H5FD__mpio_truncate, /* truncate */
+ NULL, /* lock */
+ NULL, /* unlock */
+ H5FD__mpio_delete, /* del */
+ H5FD__mpio_ctl, /* ctl */
+ H5FD_FLMAP_DICHOTOMY /* fl_map */
};
#ifdef H5FDmpio_DEBUG
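The hunk above moves the driver class to a versioned struct (H5FD_CLASS_VERSION) and adds optional read_vector/write_vector slots, with read_selection/write_selection left NULL for now. A generic sketch of that "optional callback with a scalar fallback" pattern follows; the struct and function names are hypothetical and this is not HDF5's internal dispatch code:

#include <stddef.h>
#include <stdint.h>

typedef int (*read_fn)(void *file, uint64_t addr, size_t size, void *buf);
typedef int (*read_vector_fn)(void *file, uint32_t count, const uint64_t addrs[],
                              const size_t sizes[], void *bufs[]);

struct vfd_class {
    unsigned       version;     /* struct version, bumped when new slots are added */
    read_fn        read;        /* required */
    read_vector_fn read_vector; /* optional; may be NULL */
};

static int
do_read_vector(const struct vfd_class *cls, void *file, uint32_t count,
               const uint64_t addrs[], const size_t sizes[], void *bufs[])
{
    /* Use the native vector callback when the driver provides one */
    if (cls->version >= 1 && cls->read_vector)
        return cls->read_vector(file, count, addrs, sizes, bufs);

    /* Otherwise fall back to one scalar read per vector element */
    for (uint32_t i = 0; i < count; i++)
        if (cls->read(file, addrs[i], sizes[i], bufs[i]) < 0)
            return -1;
    return 0;
}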
@@ -1415,7 +1432,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_read = %lld type = %s\n", __func__, file->mpi_rank,
- (long)mpi_off, bytes_read, H5FD__mem_t_to_str(type));
+ (long)mpi_off, (long long)bytes_read, H5FD__mem_t_to_str(type));
#endif
/*
@@ -1636,7 +1653,7 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_written = %lld type = %s\n", __func__,
- file->mpi_rank, (long)mpi_off, bytes_written, H5FD__mem_t_to_str(type));
+ file->mpi_rank, (long)mpi_off, (long long)bytes_written, H5FD__mem_t_to_str(type));
#endif
/* Each process will keep track of its perceived EOF value locally, and
@@ -1663,6 +1680,1050 @@ done:
} /* end H5FD__mpio_write() */
/*-------------------------------------------------------------------------
+ * Function: H5FD__mpio_vector_build_types
+ *
+ * Purpose: Build MPI datatypes and calculate offset, base buffer, and
+ * size for MPIO vector I/O. Spun off from common code in
+ *              H5FD__mpio_read_vector() and H5FD__mpio_write_vector().
+ *
+ * Return: Success: SUCCEED.
+ * Failure: FAIL.
+ *
+ * Programmer: Neil Fortner
+ * March 14, 2022
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__mpio_vector_build_types(uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
+ H5_flexible_const_ptr_t bufs[], haddr_t *s_addrs[], size_t *s_sizes[],
+ H5_flexible_const_ptr_t *s_bufs[], hbool_t *vector_was_sorted,
+ MPI_Offset *mpi_off, H5_flexible_const_ptr_t *mpi_bufs_base, int *size_i,
+ MPI_Datatype *buf_type, hbool_t *buf_type_created, MPI_Datatype *file_type,
+ hbool_t *file_type_created, char *unused)
+{
+ hsize_t bigio_count; /* Transition point to create derived type */
+ hbool_t fixed_size = FALSE;
+ size_t size;
+ H5FD_mem_t * s_types = NULL;
+ int * mpi_block_lengths = NULL;
+ MPI_Aint mpi_bufs_base_Aint;
+ MPI_Aint * mpi_bufs = NULL;
+ MPI_Aint * mpi_displacements = NULL;
+ MPI_Datatype *sub_types = NULL;
+ uint8_t * sub_types_created = NULL;
+ int i;
+ int j;
+ int mpi_code; /* MPI return code */
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(s_sizes);
+ HDassert(s_bufs);
+ HDassert(vector_was_sorted);
+ HDassert(*vector_was_sorted);
+ HDassert(mpi_off);
+ HDassert(mpi_bufs_base);
+ HDassert(size_i);
+ HDassert(buf_type);
+ HDassert(buf_type_created);
+ HDassert(!*buf_type_created);
+ HDassert(file_type);
+ HDassert(file_type_created);
+ HDassert(!*file_type_created);
+ HDassert(unused);
+
+    /* Get big I/O transition point (may be lower than 2G for testing) */
+ bigio_count = H5_mpi_get_bigio_count();
+
+ if (count == 1) {
+ /* Single block. Just use a series of MPI_BYTEs for the file view.
+ */
+ *size_i = (int)sizes[0];
+ *buf_type = MPI_BYTE;
+ *file_type = MPI_BYTE;
+ *mpi_bufs_base = bufs[0];
+
+ /* Setup s_addrs, s_sizes and s_bufs (needed for incomplete read filling code and eof
+ * calculation code) */
+ *s_addrs = addrs;
+ *s_sizes = sizes;
+ *s_bufs = bufs;
+
+ /* some numeric conversions */
+ if (H5FD_mpi_haddr_to_MPIOff(addrs[0], mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI offset")
+
+ /* Check for size overflow */
+ if (sizes[0] > bigio_count) {
+ /* We need to work around the integer size limit of 2GB. The input size_t size
+ * variable cannot fit into an integer, but we can get around that limitation by
+ * creating a different datatype and then setting the integer size (or element
+ * count) to 1 when using the derived_type. */
+
+ if (H5_mpio_create_large_type(sizes[0], 0, MPI_BYTE, buf_type) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+ *buf_type_created = TRUE;
+
+ if (H5_mpio_create_large_type(sizes[0], 0, MPI_BYTE, file_type) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+ *file_type_created = TRUE;
+
+ *size_i = 1;
+ }
+ }
+ else if (count > 0) { /* create MPI derived types describing the vector write */
+
+ /* sort the vector I/O request into increasing address order if required
+ *
+ * If the vector is already sorted, the base addresses of types, addrs, sizes,
+ * and bufs will be returned in s_types, s_addrs, s_sizes, and s_bufs respectively.
+ *
+ * If the vector was not already sorted, new, sorted versions of types, addrs, sizes, and bufs
+ * are allocated, populated, and returned in s_types, s_addrs, s_sizes, and s_bufs respectively.
+ * In this case, this function must free the memory allocated for the sorted vectors.
+ */
+ if (H5FD_sort_vector_io_req(vector_was_sorted, count, types, addrs, sizes, bufs, &s_types, s_addrs,
+ s_sizes, s_bufs) < 0)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "can't sort vector I/O request")
+
+ if ((NULL == (mpi_block_lengths = (int *)HDmalloc((size_t)count * sizeof(int)))) ||
+ (NULL == (mpi_displacements = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint)))) ||
+ (NULL == (mpi_bufs = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint))))) {
+
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc mpi block lengths / displacement")
+ }
+
+ /* when we setup mpi_bufs[] below, all addresses are offsets from
+ * mpi_bufs_base.
+ *
+ * Since these offsets must all be positive, we must scan through
+ * s_bufs[] to find the smallest value, and choose that for
+ * mpi_bufs_base.
+ */
+
+ j = 0; /* guess at the index of the smallest value of s_bufs[] */
+
+ for (i = 1; i < (int)count; i++) {
+
+ if ((*s_bufs)[i].cvp < (*s_bufs)[j].cvp) {
+
+ j = i;
+ }
+ }
+
+ *mpi_bufs_base = (*s_bufs)[j];
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_address(mpi_bufs_base->cvp, &mpi_bufs_base_Aint)))
+
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] to mpi_bufs_base failed", mpi_code)
+
+ *size_i = 1;
+
+ fixed_size = FALSE;
+
+ /* load the mpi_block_lengths and mpi_displacements arrays */
+ for (i = 0; i < (int)count; i++) {
+ /* Determine size of this vector element */
+ if (!fixed_size) {
+ if ((*s_sizes)[i] == 0) {
+                HDassert(*vector_was_sorted);
+ fixed_size = TRUE;
+ size = sizes[i - 1];
+ }
+ else {
+ size = (*s_sizes)[i];
+ }
+ }
+
+ /* Add to block lengths and displacements arrays */
+ mpi_block_lengths[i] = (int)size;
+ mpi_displacements[i] = (MPI_Aint)(*s_addrs)[i];
+
+ /* convert s_bufs[i] to MPI_Aint... */
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_address((*s_bufs)[i].cvp, &(mpi_bufs[i]))))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] - mpi_bufs_base failed", mpi_code)
+
+ /*... and then subtract mpi_bufs_base_Aint from it. */
+#if ((MPI_VERSION > 3) || ((MPI_VERSION == 3) && (MPI_SUBVERSION >= 1)))
+ mpi_bufs[i] = MPI_Aint_diff(mpi_bufs[i], mpi_bufs_base_Aint);
+#else
+ mpi_bufs[i] = mpi_bufs[i] - mpi_bufs_base_Aint;
+#endif
+
+ /* Check for size overflow */
+ if (size > bigio_count) {
+ /* We need to work around the integer size limit of 2GB. The input size_t size
+ * variable cannot fit into an integer, but we can get around that limitation by
+ * creating a different datatype and then setting the integer size (or element
+ * count) to 1 when using the derived_type. */
+
+ /* Allocate arrays to keep track of types and whether they were created, if
+ * necessary */
+ if (!sub_types) {
+ HDassert(!sub_types_created);
+
+                    if (NULL == (sub_types = (MPI_Datatype *)HDmalloc((size_t)count * sizeof(MPI_Datatype))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types array")
+ if (NULL == (sub_types_created = (uint8_t *)HDcalloc((size_t)count, 1))) {
+ H5MM_free(sub_types);
+ sub_types = NULL;
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types created array")
+ }
+
+ /* Initialize sub_types to all MPI_BYTE */
+ for (j = 0; j < (int)count; j++)
+ sub_types[j] = MPI_BYTE;
+ }
+ HDassert(sub_types_created);
+
+ /* Create type for large block */
+ if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &sub_types[i]) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+ sub_types_created[i] = TRUE;
+
+ /* Only one of these large types for this vector element */
+ mpi_block_lengths[i] = 1;
+ }
+ else
+ HDassert(size == (size_t)mpi_block_lengths[i]);
+ }
+
+ /* create the memory MPI derived types */
+ if (sub_types) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths, mpi_bufs,
+ sub_types, buf_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for buf_type failed", mpi_code)
+ }
+ else if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, mpi_bufs,
+ MPI_BYTE, buf_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for buf_type failed", mpi_code)
+
+ *buf_type_created = TRUE;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(buf_type)))
+
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for buf_type failed", mpi_code)
+
+ /* create the file MPI derived type */
+ if (sub_types) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths,
+ mpi_displacements, sub_types, file_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for file_type failed", mpi_code)
+ }
+ else if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths,
+ mpi_displacements, MPI_BYTE, file_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for file_type failed", mpi_code)
+
+ *file_type_created = TRUE;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(file_type)))
+
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for file_type failed", mpi_code)
+
+ /* Free up memory used to build types */
+ HDassert(mpi_block_lengths);
+ HDfree(mpi_block_lengths);
+ mpi_block_lengths = NULL;
+
+ HDassert(mpi_displacements);
+ HDfree(mpi_displacements);
+ mpi_displacements = NULL;
+
+ HDassert(mpi_bufs);
+ HDfree(mpi_bufs);
+ mpi_bufs = NULL;
+
+ if (sub_types) {
+            HDassert(sub_types_created);
+
+ for (i = 0; i < (int)count; i++)
+ if (sub_types_created[i])
+ MPI_Type_free(&sub_types[i]);
+
+ HDfree(sub_types);
+ sub_types = NULL;
+ HDfree(sub_types_created);
+ sub_types_created = NULL;
+ }
+
+ /* some numeric conversions */
+ if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0")
+ }
+ else {
+ /* setup for null participation in the collective operation. */
+ *buf_type = MPI_BYTE;
+ *file_type = MPI_BYTE;
+
+ /* Set non-NULL pointer for I/O operation */
+ mpi_bufs_base->vp = unused;
+
+ /* MPI count to read */
+ *size_i = 0;
+
+ /* some numeric conversions */
+ if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0")
+ }
+
+done:
+ /* free sorted vectors if they exist */
+    if (!*vector_was_sorted)
+ if (s_types) {
+ HDfree(s_types);
+ s_types = NULL;
+ }
+
+ /* Clean up on error */
+ if (ret_value < 0) {
+ if (mpi_block_lengths) {
+ HDfree(mpi_block_lengths);
+ mpi_block_lengths = NULL;
+ }
+
+ if (mpi_displacements) {
+ HDfree(mpi_displacements);
+ mpi_displacements = NULL;
+ }
+
+ if (mpi_bufs) {
+ HDfree(mpi_bufs);
+ mpi_bufs = NULL;
+ }
+
+ if (sub_types) {
+ HDassert(sub_types_created);
+
+ for (i = 0; i < (int)count; i++)
+ if (sub_types_created[i])
+ MPI_Type_free(&sub_types[i]);
+
+ HDfree(sub_types);
+ sub_types = NULL;
+ HDfree(sub_types_created);
+ sub_types_created = NULL;
+ }
+ }
+
+ /* Make sure we cleaned up */
+ HDassert(!mpi_block_lengths);
+ HDassert(!mpi_displacements);
+ HDassert(!mpi_bufs);
+ HDassert(!sub_types);
+ HDassert(!sub_types_created);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__mpio_vector_build_types() */
+
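For readers unfamiliar with the MPI pattern the helper above prepares, here is a standalone sketch of the same idea in plain MPI: memory displacements are taken relative to the lowest buffer address, one hindexed type describes memory and another describes the file offsets, the file type becomes the file view, and a single collective call moves every block. The function name is hypothetical, the size-repeat and >2 GiB cases are skipped, and error handling is omitted:

#include <mpi.h>
#include <stdlib.h>

static void
vector_read_sketch(MPI_File fh, int count, void *bufs[], int lens[], const MPI_Offset file_offs[])
{
    MPI_Aint     base, *mem_displs, *file_displs;
    MPI_Datatype mem_type, file_type;
    int          i, base_idx = 0;

    mem_displs  = malloc((size_t)count * sizeof(MPI_Aint));
    file_displs = malloc((size_t)count * sizeof(MPI_Aint));

    /* Pick the lowest buffer address as the base so all displacements are >= 0 */
    for (i = 1; i < count; i++)
        if ((char *)bufs[i] < (char *)bufs[base_idx])
            base_idx = i;
    MPI_Get_address(bufs[base_idx], &base);

    for (i = 0; i < count; i++) {
        MPI_Aint a;

        MPI_Get_address(bufs[i], &a);
        mem_displs[i]  = MPI_Aint_diff(a, base); /* MPI >= 3.1; older MPIs subtract directly */
        file_displs[i] = (MPI_Aint)file_offs[i]; /* absolute file offsets */
    }

    MPI_Type_create_hindexed(count, lens, mem_displs, MPI_BYTE, &mem_type);
    MPI_Type_create_hindexed(count, lens, file_displs, MPI_BYTE, &file_type);
    MPI_Type_commit(&mem_type);
    MPI_Type_commit(&file_type);

    /* The file type becomes the view; the memory type drives one collective read */
    MPI_File_set_view(fh, 0, MPI_BYTE, file_type, "native", MPI_INFO_NULL);
    MPI_File_read_at_all(fh, 0, bufs[base_idx], 1, mem_type, MPI_STATUS_IGNORE);
    MPI_File_set_view(fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);

    MPI_Type_free(&mem_type);
    MPI_Type_free(&file_type);
    free(mem_displs);
    free(file_displs);
}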
+/*-------------------------------------------------------------------------
+ * Function: H5FD__mpio_read_vector()
+ *
+ * Purpose:     The behaviour of this function depends on the value of
+ * the io_xfer_mode obtained from the context.
+ *
+ * If it is H5FD_MPIO_COLLECTIVE, this is a collective
+ * operation, which allows us to use MPI_File_set_view, and
+ * then perform the entire vector read in a single MPI call.
+ *
+ * Do this (if count is positive), by constructing memory
+ * and file derived types from the supplied vector, using
+ *              file type to set the file view, and then reading
+ * the memory type from file. Note that this read is
+ * either independent or collective depending on the
+ * value of mpio_coll_opt -- again obtained from the context.
+ *
+ * If count is zero, participate in the collective read
+ * (if so configured) with an empty read.
+ *
+ * Finally, set the file view back to its default state.
+ *
+ * In contrast, if io_xfer_mode is H5FD_MPIO_INDEPENDENT,
+ * this call is independent, and thus we cannot use
+ * MPI_File_set_view().
+ *
+ * In this case, simply walk the vector, and issue an
+ * independent read for each entry.
+ *
+ * Return: Success: SUCCEED.
+ * Failure: FAIL.
+ *
+ * Programmer: John Mainzer
+ * March 15, 2021
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[],
+ haddr_t addrs[], size_t sizes[], void *bufs[])
+{
+ H5FD_mpio_t * file = (H5FD_mpio_t *)_file;
+ hbool_t vector_was_sorted = TRUE;
+ haddr_t * s_addrs = NULL;
+ size_t * s_sizes = NULL;
+ void ** s_bufs = NULL;
+ char unused = 0; /* Unused, except for non-NULL pointer value */
+ void * mpi_bufs_base = NULL;
+ MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
+ hbool_t buf_type_created = FALSE;
+ MPI_Datatype file_type = MPI_BYTE; /* MPI description of the selection in file */
+ hbool_t file_type_created = FALSE;
+ int i;
+ int mpi_code; /* MPI return code */
+ MPI_Offset mpi_off = 0;
+ MPI_Status mpi_stat; /* Status from I/O operation */
+ H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
+ H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */
+ int size_i;
+#if MPI_VERSION >= 3
+ MPI_Count bytes_read = 0; /* Number of bytes read in */
+ MPI_Count type_size; /* MPI datatype used for I/O's size */
+ MPI_Count io_size; /* Actual number of bytes requested */
+ MPI_Count n;
+#else
+ int bytes_read = 0; /* Number of bytes read in */
+ int type_size; /* MPI datatype used for I/O's size */
+ int io_size; /* Actual number of bytes requested */
+ int n;
+#endif
+ hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */
+#ifdef H5FDmpio_DEBUG
+ hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+ hbool_t H5FD_mpio_debug_r_flag = (H5FD_mpio_debug_flags_s[(int)'r'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+#endif
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_t_flag)
+ HDfprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
+#endif
+
+ /* Sanity checks */
+ HDassert(file);
+ HDassert(H5FD_MPIO == file->pub.driver_id);
+ HDassert((types) || (count == 0));
+ HDassert((addrs) || (count == 0));
+ HDassert((sizes) || (count == 0));
+ HDassert((bufs) || (count == 0));
+
+ /* verify that the first elements of the sizes and types arrays are
+ * valid.
+ */
+ HDassert((count == 0) || (sizes[0] != 0));
+ HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST));
+
+ /* Get the transfer mode from the API context
+ *
+ * This flag is set to H5FD_MPIO_COLLECTIVE if the API call is
+ * collective, and to H5FD_MPIO_INDEPENDENT if it is not.
+ *
+ * While this doesn't mean that we are actually about to do a collective
+ * read, it does mean that all ranks are here, so we can use MPI_File_set_view().
+ */
+ if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
+
+ if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
+ /* Build MPI types, etc. */
+ if (H5FD__mpio_vector_build_types(count, types, addrs, sizes, (H5_flexible_const_ptr_t *)bufs,
+ &s_addrs, &s_sizes, (H5_flexible_const_ptr_t **)&s_bufs,
+ &vector_was_sorted, &mpi_off,
+ (H5_flexible_const_ptr_t *)&mpi_bufs_base, &size_i, &buf_type,
+ &buf_type_created, &file_type, &file_type_created, &unused) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't build MPI datatypes for I/O")
+
+ /* free sorted addrs vector if it exists */
+ if (!vector_was_sorted)
+ if (s_addrs) {
+ HDfree(s_addrs);
+ s_addrs = NULL;
+ }
+
+ /* Portably initialize MPI status variable */
+ HDmemset(&mpi_stat, 0, sizeof(mpi_stat));
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ HDfprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", __func__, (long)mpi_off, size_i);
+#endif
+
+ /* Setup the file view. */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type,
+ H5FD_mpi_native_g, file->info)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+
+ /* Reset mpi_off to 0 since the view now starts at the data offset */
+ if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0")
+
+ /* Get the collective_opt property to check whether the application wants to do IO individually.
+ */
+ if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
+
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property")
+
+ /* Read the data. */
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ HDfprintf(stdout, "%s: using MPIO collective mode\n", __func__);
+#endif
+ if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ HDfprintf(stdout, "%s: doing MPI collective IO\n", __func__);
+#endif
+ /* Check whether we should read from rank 0 and broadcast to other ranks */
+ if (H5CX_get_mpio_rank0_bcast()) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ HDfprintf(stdout, "%s: doing read-rank0-and-MPI_Bcast\n", __func__);
+#endif
+ /* Indicate path we've taken */
+ rank0_bcast = TRUE;
+
+ /* Read on rank 0 Bcast to other ranks */
+ if (file->mpi_rank == 0)
+ if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i,
+ buf_type, &mpi_stat)))
+                    HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(mpi_bufs_base, size_i, buf_type, 0, file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+ } /* end if */
+ else if (MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, mpi_bufs_base, size_i,
+ buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
+ } /* end if */
+ else if (size_i > 0) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__);
+#endif
+
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat)))
+
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+
+ } /* end else */
+
+ /* Reset the file view */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
+ H5FD_mpi_native_g, file->info)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+
+ /* Only retrieve bytes read if this rank _actually_ participated in I/O */
+ if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) {
+ /* How many bytes were actually read? */
+#if MPI_VERSION >= 3
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ } /* end if */
+
+ /* If the rank0-bcast feature was used, broadcast the # of bytes read to
+ * other ranks, which didn't perform any I/O.
+ */
+ /* NOTE: This could be optimized further to be combined with the broadcast
+ * of the data. (QAK - 2019/1/2)
+ * Or have rank 0 clear the unread parts of the buffer prior to
+ * the bcast. (NAF - 2021/9/15)
+ */
+ if (rank0_bcast)
+#if MPI_VERSION >= 3
+ if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_COUNT, 0, file->comm))
+#else
+ if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_INT, 0, file->comm))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0)
+
+ /* Get the type's size */
+#if MPI_VERSION >= 3
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
+
+ /* Compute the actual number of bytes requested */
+ io_size = type_size * size_i;
+
+ /* Check for read failure */
+ if (bytes_read < 0 || bytes_read > io_size)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed")
+
+ /* Check for incomplete read */
+ n = io_size - bytes_read;
+ if (n > 0) {
+ i = (int)count - 1;
+
+ /* Iterate over sorted array in reverse, filling in zeroes to
+ * sections of the buffers that were not read to */
+ do {
+ HDassert(i >= 0);
+
+#if MPI_VERSION >= 3
+ io_size = MIN(n, (MPI_Count)s_sizes[i]);
+ bytes_read = (MPI_Count)s_sizes[i] - io_size;
+#else
+ io_size = MIN(n, (int)s_sizes[i]);
+ bytes_read = (int)s_sizes[i] - io_size;
+#endif
+ HDassert(bytes_read >= 0);
+
+ HDmemset((char *)s_bufs[i] + bytes_read, 0, (size_t)io_size);
+
+ n -= io_size;
+ i--;
+ } while (n > 0);
+ }
+ }
+ else if (count > 0) {
+ haddr_t max_addr = HADDR_MAX;
+ hbool_t fixed_size = FALSE;
+ size_t size;
+
+ /* The read is part of an independent operation. As a result,
+ * we can't use MPI_File_set_view() (since it it a collective operation),
+         * we can't use MPI_File_set_view() (since it is a collective operation),
+ * In the future, we could write code to detect when a contiguous slab
+ * in the file selection spans multiple vector elements and construct a
+ * memory datatype to match this larger block in the file, but for now
+ * just read in each element of the vector in a separate
+ * MPI_File_read_at() call.
+ *
+ * We could also just detect the case when the entire file selection is
+ * contiguous, which would allow us to use
+ * H5FD__mpio_vector_build_types() to construct the memory datatype.
+ */
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__);
+#endif
+
+ /* Loop over vector elements */
+ for (i = 0; i < (int)count; i++) {
+ /* Convert address to mpi offset */
+ if (H5FD_mpi_haddr_to_MPIOff(addrs[i], &mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
+
+ /* Calculate I/O size */
+ if (!fixed_size) {
+ if (sizes[i] == 0) {
+ fixed_size = TRUE;
+ size = sizes[i - 1];
+ }
+ else {
+ size = sizes[i];
+ }
+ }
+ size_i = (int)size;
+
+ if (size != (size_t)size_i) {
+ /* If HERE, then we need to work around the integer size limit
+ * of 2GB. The input size_t size variable cannot fit into an integer,
+ * but we can get around that limitation by creating a different datatype
+ * and then setting the integer size (or element count) to 1 when using
+ * the derived_type.
+ */
+
+ if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+
+ buf_type_created = TRUE;
+ size_i = 1;
+ }
+
+ /* Check if we actually need to do I/O */
+ if (addrs[i] < max_addr) {
+ /* Portably initialize MPI status variable */
+ HDmemset(&mpi_stat, 0, sizeof(mpi_stat));
+
+ /* Issue read */
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_File_read_at(file->f, mpi_off, bufs[i], size_i, buf_type, &mpi_stat)))
+
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+
+ /* How many bytes were actually read? */
+#if MPI_VERSION >= 3
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, MPI_BYTE, &bytes_read)))
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+
+ /* Compute the actual number of bytes requested */
+#if MPI_VERSION >= 3
+ io_size = (MPI_Count)size;
+#else
+ io_size = (int)size;
+#endif
+
+ /* Check for read failure */
+ if (bytes_read < 0 || bytes_read > io_size)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed")
+
+ /*
+ * If we didn't read the entire I/O, fill in zeroes beyond end of
+ * the physical MPI file and don't issue any more reads at higher
+ * addresses.
+ */
+ if ((n = (io_size - bytes_read)) > 0) {
+ HDmemset((char *)bufs[i] + bytes_read, 0, (size_t)n);
+ max_addr = addrs[i] + (haddr_t)bytes_read;
+ }
+ }
+ else {
+ /* Read is past the max address, fill in zeroes */
+ HDmemset((char *)bufs[i], 0, size);
+ }
+ }
+ }
+
+done:
+ if (buf_type_created) {
+ MPI_Type_free(&buf_type);
+ }
+
+ if (file_type_created) {
+ MPI_Type_free(&file_type);
+ }
+
+ /* free sorted vectors if they exist */
+ if (!vector_was_sorted) {
+ if (s_addrs) {
+ HDfree(s_addrs);
+ s_addrs = NULL;
+ }
+ if (s_sizes) {
+ HDfree(s_sizes);
+ s_sizes = NULL;
+ }
+ if (s_bufs) {
+ HDfree(s_bufs);
+ s_bufs = NULL;
+ }
+ }
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_t_flag)
+ HDfprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", __func__, file->mpi_rank, ret_value);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5FD__mpio_read_vector() */
+
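The read-on-rank-0-and-broadcast branch above reduces to the following plain-MPI pattern, which pays off when every rank wants the same bytes (for example, metadata reads); the helper name is hypothetical and error handling is omitted:

#include <mpi.h>

static void
rank0_bcast_read(MPI_File fh, MPI_Comm comm, MPI_Offset off, void *buf, int nbytes)
{
    int rank;

    MPI_Comm_rank(comm, &rank);

    /* Only rank 0 touches the file; everyone else receives the data via broadcast */
    if (rank == 0)
        MPI_File_read_at(fh, off, buf, nbytes, MPI_BYTE, MPI_STATUS_IGNORE);
    MPI_Bcast(buf, nbytes, MPI_BYTE, 0, comm);
}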
+/*-------------------------------------------------------------------------
+ * Function: H5FD__mpio_write_vector
+ *
+ * Purpose:     The behaviour of this function depends on the value of
+ * the io_xfer_mode obtained from the context.
+ *
+ * If it is H5FD_MPIO_COLLECTIVE, this is a collective
+ * operation, which allows us to use MPI_File_set_view, and
+ * then perform the entire vector write in a single MPI call.
+ *
+ * Do this (if count is positive), by constructing memory
+ * and file derived types from the supplied vector, using
+ *              file type to set the file view, and then writing
+ * the memory type to file. Note that this write is
+ * either independent or collective depending on the
+ * value of mpio_coll_opt -- again obtained from the context.
+ *
+ * If count is zero, participate in the collective write
+ * (if so configured) with an empty write.
+ *
+ * Finally, set the file view back to its default state.
+ *
+ * In contrast, if io_xfer_mode is H5FD_MPIO_INDEPENDENT,
+ * this call is independent, and thus we cannot use
+ * MPI_File_set_view().
+ *
+ * In this case, simply walk the vector, and issue an
+ * independent write for each entry.
+ *
+ * Return: Success: SUCCEED.
+ * Failure: FAIL.
+ *
+ * Programmer: John Mainzer
+ * March 15, 2021
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[],
+ haddr_t addrs[], size_t sizes[], const void *bufs[])
+{
+ H5FD_mpio_t * file = (H5FD_mpio_t *)_file;
+ hbool_t vector_was_sorted = TRUE;
+ haddr_t * s_addrs = NULL;
+ size_t * s_sizes = NULL;
+ const void ** s_bufs = NULL;
+ char unused = 0; /* Unused, except for non-NULL pointer value */
+ const void * mpi_bufs_base = NULL;
+ MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
+ hbool_t buf_type_created = FALSE;
+ MPI_Datatype file_type = MPI_BYTE; /* MPI description of the selection in file */
+ hbool_t file_type_created = FALSE;
+ int i;
+ int mpi_code; /* MPI return code */
+ MPI_Offset mpi_off = 0;
+ MPI_Status mpi_stat; /* Status from I/O operation */
+ H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
+ H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */
+ int size_i;
+#ifdef H5FDmpio_DEBUG
+ hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+ hbool_t H5FD_mpio_debug_w_flag = (H5FD_mpio_debug_flags_s[(int)'w'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+#endif
+ haddr_t max_addr = 0;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_t_flag)
+ HDfprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
+#endif
+
+ /* Sanity checks */
+ HDassert(file);
+ HDassert(H5FD_MPIO == file->pub.driver_id);
+ HDassert((types) || (count == 0));
+ HDassert((addrs) || (count == 0));
+ HDassert((sizes) || (count == 0));
+ HDassert((bufs) || (count == 0));
+
+ /* verify that the first elements of the sizes and types arrays are
+ * valid.
+ */
+ HDassert((count == 0) || (sizes[0] != 0));
+ HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST));
+
+    /* Verify that no data is written between MPI_Barrier()s during a file flush */
+
+ HDassert(!H5CX_get_mpi_file_flushing());
+
+ /* Get the transfer mode from the API context
+ *
+ * This flag is set to H5FD_MPIO_COLLECTIVE if the API call is
+ * collective, and to H5FD_MPIO_INDEPENDENT if it is not.
+ *
+ * While this doesn't mean that we are actually about to do a collective
+ * write, it does mean that all ranks are here, so we can use MPI_File_set_view().
+ */
+ if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
+
+ if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
+ /* Build MPI types, etc. */
+ if (H5FD__mpio_vector_build_types(count, types, addrs, sizes, (H5_flexible_const_ptr_t *)bufs,
+ &s_addrs, &s_sizes, (H5_flexible_const_ptr_t **)&s_bufs,
+ &vector_was_sorted, &mpi_off,
+ (H5_flexible_const_ptr_t *)&mpi_bufs_base, &size_i, &buf_type,
+ &buf_type_created, &file_type, &file_type_created, &unused) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't build MPI datatypes for I/O")
+
+        /* Compute max addr written to */
+ if (count > 0)
+ max_addr = s_addrs[count - 1] + (haddr_t)(s_sizes[count - 1]);
+
+ /* free sorted vectors if they exist */
+ if (!vector_was_sorted) {
+ if (s_addrs) {
+ HDfree(s_addrs);
+ s_addrs = NULL;
+ }
+ if (s_sizes) {
+ HDfree(s_sizes);
+ s_sizes = NULL;
+ }
+ if (s_bufs) {
+ HDfree(s_bufs);
+ s_bufs = NULL;
+ }
+ }
+
+ /* Portably initialize MPI status variable */
+ HDmemset(&mpi_stat, 0, sizeof(MPI_Status));
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ HDfprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", __func__, (long)mpi_off, size_i);
+#endif
+
+ /* Setup the file view. */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type,
+ H5FD_mpi_native_g, file->info)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+
+ /* Reset mpi_off to 0 since the view now starts at the data offset */
+ if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0")
+
+ /* Get the collective_opt property to check whether the application wants to do IO individually.
+ */
+ if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property")
+
+ /* Write the data. */
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ HDfprintf(stdout, "%s: using MPIO collective mode\n", __func__);
+#endif
+
+ if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ HDfprintf(stdout, "%s: doing MPI collective IO\n", __func__);
+#endif
+
+ if (MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, mpi_bufs_base, size_i,
+ buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code)
+ } /* end if */
+ else if (size_i > 0) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__);
+#endif
+
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_File_write_at(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
+ } /* end else */
+
+ /* Reset the file view */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
+ H5FD_mpi_native_g, file->info)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+ }
+ else if (count > 0) {
+ hbool_t fixed_size = FALSE;
+ size_t size;
+
+        /* The write is part of an independent operation. As a result,
+         * we can't use MPI_File_set_view() (since it is a collective operation),
+         * and thus we can't use the above code to construct the MPI datatypes.
+         * In the future, we could write code to detect when a contiguous slab
+         * in the file selection spans multiple vector elements and construct a
+         * memory datatype to match this larger block in the file, but for now
+         * just write each element of the vector in a separate
+         * MPI_File_write_at() call.
+ *
+ * We could also just detect the case when the entire file selection is
+ * contiguous, which would allow us to use
+ * H5FD__mpio_vector_build_types() to construct the memory datatype.
+ */
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__);
+#endif
+
+ /* Loop over vector elements */
+ for (i = 0; i < (int)count; i++) {
+ /* Convert address to mpi offset */
+ if (H5FD_mpi_haddr_to_MPIOff(addrs[i], &mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
+
+ /* Calculate I/O size */
+ if (!fixed_size) {
+ if (sizes[i] == 0) {
+ fixed_size = TRUE;
+ size = sizes[i - 1];
+ }
+ else {
+ size = sizes[i];
+ }
+ }
+ size_i = (int)size;
+
+ if (size != (size_t)size_i) {
+ /* If HERE, then we need to work around the integer size limit
+ * of 2GB. The input size_t size variable cannot fit into an integer,
+ * but we can get around that limitation by creating a different datatype
+ * and then setting the integer size (or element count) to 1 when using
+ * the derived_type.
+ */
+
+ if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+
+ buf_type_created = TRUE;
+ size_i = 1;
+ }
+
+ /* Perform write */
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_File_write_at(file->f, mpi_off, bufs[i], size_i, buf_type, &mpi_stat)))
+
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
+
+ /* Check if this is the highest address written to so far */
+ if (addrs[i] + size > max_addr)
+ max_addr = addrs[i] + size;
+ }
+ }
+
+ /* Each process will keep track of its perceived EOF value locally, and
+ * ultimately we will reduce this value to the maximum amongst all
+ * processes, but until then keep the actual eof at HADDR_UNDEF just in
+     * case something bad happens before that point. (Better to have a value
+     * we know is wrong than one that is only potentially wrong.)
+ */
+ file->eof = HADDR_UNDEF;
+
+    /* check to see if the local eof has been extended, and update if so */
+ if (max_addr > file->local_eof)
+ file->local_eof = max_addr;
+
+done:
+ if (buf_type_created)
+ MPI_Type_free(&buf_type);
+
+ if (file_type_created)
+ MPI_Type_free(&file_type);
+
+ /* Cleanup on error */
+ if (ret_value < 0 && !vector_was_sorted) {
+ if (s_addrs) {
+ HDfree(s_addrs);
+ s_addrs = NULL;
+ }
+ if (s_sizes) {
+ HDfree(s_sizes);
+ s_sizes = NULL;
+ }
+ if (s_bufs) {
+ HDfree(s_bufs);
+ s_bufs = NULL;
+ }
+ }
+
+ /* Make sure we cleaned up */
+ HDassert(vector_was_sorted || !s_addrs);
+ HDassert(vector_was_sorted || !s_sizes);
+ HDassert(vector_was_sorted || !s_bufs);
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_t_flag)
+ HDfprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", __func__, file->mpi_rank, ret_value);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__mpio_write_vector() */
+
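Both vector functions lean on H5_mpio_create_large_type() when a block exceeds the ~2 GiB limit of the int count argument of MPI I/O calls. The sketch below shows the general technique (wrap the bytes in a derived type and pass a count of 1); it is an illustration of the idea, not that function:

#include <mpi.h>
#include <stddef.h>

/* Build a datatype describing nbytes contiguous bytes so an I/O call can use
 * a count of 1 even when nbytes does not fit in an int. */
static void
make_large_byte_type(size_t nbytes, MPI_Datatype *newtype)
{
    const size_t chunk   = 1024 * 1024 * 1024; /* 1 GiB chunks */
    int          nchunks = (int)(nbytes / chunk);
    int          rem     = (int)(nbytes % chunk);
    MPI_Datatype chunk_type;
    MPI_Datatype types[2];
    int          lens[2];
    MPI_Aint     displs[2];

    MPI_Type_contiguous((int)chunk, MPI_BYTE, &chunk_type);

    types[0]  = chunk_type; /* nchunks full 1 GiB blocks */
    types[1]  = MPI_BYTE;   /* plus the remainder in single bytes */
    lens[0]   = nchunks;
    lens[1]   = rem;
    displs[0] = 0;
    displs[1] = (MPI_Aint)((size_t)nchunks * chunk);

    MPI_Type_create_struct(2, lens, displs, types, newtype);
    MPI_Type_commit(newtype);
    MPI_Type_free(&chunk_type);
}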
+/*-------------------------------------------------------------------------
* Function: H5FD__mpio_flush
*
* Purpose: Makes sure that all data is on disk. This is collective.