Diffstat (limited to 'src/H5FDmpio.c')
-rw-r--r--   src/H5FDmpio.c   1026
1 file changed, 981 insertions, 45 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index a76b183..aa83be4 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -88,11 +88,20 @@ static herr_t H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_i
static herr_t H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count,
H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
const void *bufs[]);
-static herr_t H5FD__mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
-static herr_t H5FD__mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
-static herr_t H5FD__mpio_delete(const char *filename, hid_t fapl_id);
-static herr_t H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void *input,
- void **output);
+
+static herr_t H5FD__mpio_read_selection(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id,
+ size_t count, hid_t mem_space_ids[], hid_t file_space_ids[],
+ haddr_t offsets[], size_t element_sizes[], void *bufs[]);
+
+static herr_t H5FD__mpio_write_selection(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id,
+ size_t count, hid_t mem_space_ids[], hid_t file_space_ids[],
+ haddr_t offsets[], size_t element_sizes[], const void *bufs[]);
+
+static herr_t H5FD__mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
+static herr_t H5FD__mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
+static herr_t H5FD__mpio_delete(const char *filename, hid_t fapl_id);
+static herr_t H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void *input,
+ void **output);
/* Other functions */
static herr_t H5FD__mpio_vector_build_types(
@@ -101,48 +110,55 @@ static herr_t H5FD__mpio_vector_build_types(
MPI_Offset *mpi_off, H5_flexible_const_ptr_t *mpi_bufs_base, int *size_i, MPI_Datatype *buf_type,
hbool_t *buf_type_created, MPI_Datatype *file_type, hbool_t *file_type_created, char *unused);
+static herr_t H5FD__selection_build_types(hbool_t io_op_write, size_t num_pieces, H5_flexible_const_ptr_t mbb,
+ H5S_t **file_spaces, H5S_t **mem_spaces, haddr_t offsets[],
+ H5_flexible_const_ptr_t bufs[], size_t src_element_sizes[],
+ size_t dst_element_sizes[], MPI_Datatype *final_ftype,
+ hbool_t *final_ftype_is_derived, MPI_Datatype *final_mtype,
+ hbool_t *final_mtype_is_derived);
+
/* The MPIO file driver information */
static const H5FD_class_t H5FD_mpio_g = {
- H5FD_CLASS_VERSION, /* struct version */
- H5_VFD_MPIO, /* value */
- "mpio", /* name */
- HADDR_MAX, /* maxaddr */
- H5F_CLOSE_SEMI, /* fc_degree */
- H5FD__mpio_term, /* terminate */
- NULL, /* sb_size */
- NULL, /* sb_encode */
- NULL, /* sb_decode */
- 0, /* fapl_size */
- NULL, /* fapl_get */
- NULL, /* fapl_copy */
- NULL, /* fapl_free */
- 0, /* dxpl_size */
- NULL, /* dxpl_copy */
- NULL, /* dxpl_free */
- H5FD__mpio_open, /* open */
- H5FD__mpio_close, /* close */
- NULL, /* cmp */
- H5FD__mpio_query, /* query */
- NULL, /* get_type_map */
- NULL, /* alloc */
- NULL, /* free */
- H5FD__mpio_get_eoa, /* get_eoa */
- H5FD__mpio_set_eoa, /* set_eoa */
- H5FD__mpio_get_eof, /* get_eof */
- H5FD__mpio_get_handle, /* get_handle */
- H5FD__mpio_read, /* read */
- H5FD__mpio_write, /* write */
- H5FD__mpio_read_vector, /* read_vector */
- H5FD__mpio_write_vector, /* write_vector */
- NULL, /* read_selection */
- NULL, /* write_selection */
- H5FD__mpio_flush, /* flush */
- H5FD__mpio_truncate, /* truncate */
- NULL, /* lock */
- NULL, /* unlock */
- H5FD__mpio_delete, /* del */
- H5FD__mpio_ctl, /* ctl */
- H5FD_FLMAP_DICHOTOMY /* fl_map */
+ H5FD_CLASS_VERSION, /* struct version */
+ H5_VFD_MPIO, /* value */
+ "mpio", /* name */
+ HADDR_MAX, /* maxaddr */
+ H5F_CLOSE_SEMI, /* fc_degree */
+ H5FD__mpio_term, /* terminate */
+ NULL, /* sb_size */
+ NULL, /* sb_encode */
+ NULL, /* sb_decode */
+ 0, /* fapl_size */
+ NULL, /* fapl_get */
+ NULL, /* fapl_copy */
+ NULL, /* fapl_free */
+ 0, /* dxpl_size */
+ NULL, /* dxpl_copy */
+ NULL, /* dxpl_free */
+ H5FD__mpio_open, /* open */
+ H5FD__mpio_close, /* close */
+ NULL, /* cmp */
+ H5FD__mpio_query, /* query */
+ NULL, /* get_type_map */
+ NULL, /* alloc */
+ NULL, /* free */
+ H5FD__mpio_get_eoa, /* get_eoa */
+ H5FD__mpio_set_eoa, /* set_eoa */
+ H5FD__mpio_get_eof, /* get_eof */
+ H5FD__mpio_get_handle, /* get_handle */
+ H5FD__mpio_read, /* read */
+ H5FD__mpio_write, /* write */
+ H5FD__mpio_read_vector, /* read_vector */
+ H5FD__mpio_write_vector, /* write_vector */
+ H5FD__mpio_read_selection, /* read_selection */
+ H5FD__mpio_write_selection, /* write_selection */
+ H5FD__mpio_flush, /* flush */
+ H5FD__mpio_truncate, /* truncate */
+ NULL, /* lock */
+ NULL, /* unlock */
+ H5FD__mpio_delete, /* del */
+ H5FD__mpio_ctl, /* ctl */
+ H5FD_FLMAP_DICHOTOMY /* fl_map */
};
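
With H5FD__mpio_read_selection() and H5FD__mpio_write_selection() registered in the class table above, selection I/O requests can now reach the MPIO driver directly instead of being translated into vector or scalar I/O. As a minimal sketch (not part of this patch; the file name is hypothetical), an application routes I/O through this driver by attaching it to a file access property list:

    /* Hedged example: open a file through the MPIO VFD so the selection
     * callbacks registered above become reachable (parallel HDF5 build
     * assumed; "example.h5" is a made-up name) */
    #include <mpi.h>
    #include "hdf5.h"

    int
    main(int argc, char *argv[])
    {
        hid_t fapl_id, file_id;

        MPI_Init(&argc, &argv);

        /* Attach the MPI-IO driver to a file access property list */
        fapl_id = H5Pcreate(H5P_FILE_ACCESS);
        H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);

        /* All ranks open (here: create) the file collectively */
        file_id = H5Fcreate("example.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);

        H5Fclose(file_id);
        H5Pclose(fapl_id);
        MPI_Finalize();
        return 0;
    }
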
#ifdef H5FDmpio_DEBUG
@@ -2717,6 +2733,926 @@ done:
} /* end H5FD__mpio_write_vector() */
/*-------------------------------------------------------------------------
+ * Function: H5FD__selection_build_types
+ *
+ * Purpose: Build MPI derived datatype for each piece and then
+ * build MPI final derived datatype for file and memory.
+ *
+ * Note: This is derived from H5D__link_piece_collective_io() in
+ * src/H5Dmpio.c.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__selection_build_types(hbool_t io_op_write, size_t num_pieces, H5_flexible_const_ptr_t mbb,
+ H5S_t **file_spaces, H5S_t **mem_spaces, haddr_t offsets[],
+ H5_flexible_const_ptr_t bufs[], size_t src_element_sizes[],
+ size_t dst_element_sizes[], MPI_Datatype *final_ftype,
+ hbool_t *final_ftype_is_derived, MPI_Datatype *final_mtype,
+ hbool_t *final_mtype_is_derived)
+{
+
+ MPI_Datatype *piece_mtype = NULL;
+ MPI_Datatype *piece_ftype = NULL;
+ MPI_Aint *piece_file_disp_array = NULL;
+ MPI_Aint *piece_mem_disp_array = NULL;
+ hbool_t *piece_mft_is_derived_array =
+ NULL; /* Flags to indicate each piece's MPI file datatype is derived */
+ hbool_t *piece_mmt_is_derived_array =
+ NULL; /* Flags to indicate each piece's MPI memory datatype is derived */
+ int *piece_mpi_file_counts = NULL; /* Count of MPI file datatype for each piece */
+ int *piece_mpi_mem_counts = NULL; /* Count of MPI memory datatype for each piece */
+
+ haddr_t base_file_addr;
+ size_t i; /* Local index variable */
+ int mpi_code; /* MPI return code */
+
+ hbool_t extend_src_sizes = FALSE;
+ hbool_t extend_dst_sizes = FALSE;
+ hbool_t extend_bufs = FALSE;
+ H5_flexible_const_ptr_t buf;
+ size_t src_element_size, dst_element_size;
+
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_PACKAGE
+
+ /* Allocate information for num_pieces */
+ if (NULL == (piece_mtype = (MPI_Datatype *)H5MM_malloc(num_pieces * sizeof(MPI_Datatype))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece memory datatype buffer");
+ if (NULL == (piece_ftype = (MPI_Datatype *)H5MM_malloc(num_pieces * sizeof(MPI_Datatype))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece file datatype buffer");
+ if (NULL == (piece_file_disp_array = (MPI_Aint *)H5MM_malloc(num_pieces * sizeof(MPI_Aint))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece file displacement buffer");
+ if (NULL == (piece_mem_disp_array = (MPI_Aint *)H5MM_calloc(num_pieces * sizeof(MPI_Aint))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece memory displacement buffer");
+ if (NULL == (piece_mpi_mem_counts = (int *)H5MM_calloc(num_pieces * sizeof(int))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece memory counts buffer");
+ if (NULL == (piece_mpi_file_counts = (int *)H5MM_calloc(num_pieces * sizeof(int))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece file counts buffer");
+ if (NULL == (piece_mmt_is_derived_array = (hbool_t *)H5MM_calloc(num_pieces * sizeof(hbool_t))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
+ "couldn't allocate piece memory is derived datatype flags buffer");
+ if (NULL == (piece_mft_is_derived_array = (hbool_t *)H5MM_calloc(num_pieces * sizeof(hbool_t))))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
+ "couldn't allocate piece file is derived datatype flags buffer");
+
+ /* save lowest file address */
+ base_file_addr = offsets[0];
+
+ /* Obtain MPI derived datatype from all individual pieces */
+ /* Iterate over selected pieces for this process */
+ for (i = 0; i < num_pieces; i++) {
+ hsize_t *permute_map = NULL; /* array that holds the mapping from the old,
+ out-of-order displacements to the in-order
+ displacements of the MPI datatypes of the
+ point selection of the file space */
+ hbool_t is_permuted = FALSE;
+
+ if (!extend_src_sizes) {
+ if (src_element_sizes[i] == 0) {
+ extend_src_sizes = TRUE;
+ src_element_size = src_element_sizes[i - 1];
+ }
+ else
+ src_element_size = src_element_sizes[i];
+ }
+
+ if (!extend_dst_sizes) {
+ if (dst_element_sizes[i] == 0) {
+ extend_dst_sizes = TRUE;
+ dst_element_size = dst_element_sizes[i - 1];
+ }
+ else
+                dst_element_size = dst_element_sizes[i];
+ }
+
+ if (!extend_bufs) {
+ if (bufs[i].cvp == NULL) {
+ extend_bufs = TRUE;
+ buf = bufs[i - 1];
+ }
+ else
+ buf = bufs[i];
+ }
+
+ /* Obtain disk and memory MPI derived datatype */
+ /* NOTE: The permute_map array can be allocated within H5S_mpio_space_type
+ * and will be fed into the next call to H5S_mpio_space_type
+ * where it will be freed.
+ */
+ if (H5S_mpio_space_type(file_spaces[i], src_element_size, &piece_ftype[i], /* OUT: datatype created */
+ &piece_mpi_file_counts[i], /* OUT */
+ &(piece_mft_is_derived_array[i]), /* OUT */
+ TRUE, /* this is a file space,
+ so permute the
+ datatype if the point
+ selections are out of
+ order */
+ &permute_map, /* OUT: a map to indicate the
+ permutation of points
+ selected in case they
+ are out of order */
+ &is_permuted /* OUT */) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI file type");
+
+ /* Sanity check */
+ if (is_permuted)
+ assert(permute_map);
+
+ if (H5S_mpio_space_type(mem_spaces[i], dst_element_size, &piece_mtype[i], &piece_mpi_mem_counts[i],
+ &(piece_mmt_is_derived_array[i]), FALSE, /* this is a memory
+ space, so if the file
+ space is not
+ permuted, there is no
+ need to permute the
+ datatype if the point
+ selections are out of
+ order*/
+ &permute_map, /* IN: the permutation map
+ generated by the
+ file_space selection
+ and applied to the
+ memory selection */
+ &is_permuted /* IN */) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI buf type");
+
+ /* Sanity check */
+ if (is_permuted)
+ assert(!permute_map);
+
+ /* Piece address relative to the first piece addr
+ * Assign piece address to MPI displacement
+ * (assume MPI_Aint big enough to hold it) */
+ piece_file_disp_array[i] = (MPI_Aint)offsets[i] - (MPI_Aint)base_file_addr;
+
+ if (io_op_write) {
+ piece_mem_disp_array[i] = (MPI_Aint)buf.cvp - (MPI_Aint)mbb.cvp;
+ }
+ else {
+ piece_mem_disp_array[i] = (MPI_Aint)buf.vp - (MPI_Aint)mbb.vp;
+ }
+ } /* end for */
+
+ /* Create final MPI derived datatype for the file */
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)num_pieces, piece_mpi_file_counts,
+ piece_file_disp_array, piece_ftype, final_ftype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(final_ftype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
+ *final_ftype_is_derived = TRUE;
+
+ /* Create final MPI derived datatype for memory */
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)num_pieces, piece_mpi_mem_counts,
+ piece_mem_disp_array, piece_mtype, final_mtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(final_mtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
+ *final_mtype_is_derived = TRUE;
+
+ /* Free the file & memory MPI datatypes for each piece */
+ for (i = 0; i < num_pieces; i++) {
+ if (piece_mmt_is_derived_array[i])
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(piece_mtype + i)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+
+ if (piece_mft_is_derived_array[i])
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(piece_ftype + i)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+ } /* end for */
+
+done:
+
+ /* Release resources */
+ if (piece_mtype)
+ H5MM_xfree(piece_mtype);
+ if (piece_ftype)
+ H5MM_xfree(piece_ftype);
+ if (piece_file_disp_array)
+ H5MM_xfree(piece_file_disp_array);
+ if (piece_mem_disp_array)
+ H5MM_xfree(piece_mem_disp_array);
+ if (piece_mpi_mem_counts)
+ H5MM_xfree(piece_mpi_mem_counts);
+ if (piece_mpi_file_counts)
+ H5MM_xfree(piece_mpi_file_counts);
+ if (piece_mmt_is_derived_array)
+ H5MM_xfree(piece_mmt_is_derived_array);
+ if (piece_mft_is_derived_array)
+ H5MM_xfree(piece_mft_is_derived_array);
+
+ FUNC_LEAVE_NOAPI(ret_value);
+
+} /* H5FD__selection_build_types() */
+
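The routine above follows the same pattern as H5D__link_piece_collective_io(): every piece contributes a (count, displacement, datatype) triple, and MPI_Type_create_struct() glues the triples into one derived type whose displacements are relative to the lowest file address (file type) or to mpi_bufs_base (memory type). Note also the extension convention handled at the top of the loop: once element_sizes[] reaches a 0 entry or bufs[] reaches a NULL entry, all remaining pieces reuse the previous value. A self-contained sketch of the struct-type pattern, assuming just two byte-addressed pieces (names and the two-piece shape are illustrative only):

    /* Hedged sketch: combine two byte ranges into one MPI derived type,
     * using the same MPI_Type_create_struct()/MPI_Type_commit() sequence
     * as the routine above */
    #include <mpi.h>

    static int
    build_two_piece_type(int lengths[2], MPI_Aint disps[2], MPI_Datatype *out)
    {
        MPI_Datatype piece_types[2] = {MPI_BYTE, MPI_BYTE};

        /* Each block is lengths[i] bytes at displacement disps[i],
         * relative to the view displacement (the lowest file address) */
        if (MPI_Type_create_struct(2, lengths, disps, piece_types, out) != MPI_SUCCESS)
            return -1;

        return MPI_Type_commit(out); /* MPI_SUCCESS on success */
    }
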
+/*-------------------------------------------------------------------------
+ * Function: H5FD__mpio_read_selection
+ *
+ * Purpose:     The behavior of this function depends on the transfer
+ *              mode obtained from the API context.
+ *
+ * If the transfer mode is H5FD_MPIO_COLLECTIVE:
+ * --sort the selections
+ * --set mpi_bufs_base
+ * --build the MPI derived types
+ * --perform MPI_File_set_view()
+ *              --perform MPI_File_read_at_all() or MPI_File_read_at(),
+ *                depending on whether the collective opt mode is
+ *                H5FD_MPIO_COLLECTIVE_IO
+ *
+ * If this is not H5FD_MPIO_COLLECTIVE:
+ * --undo possible base address addition in internal routines
+ *              --call H5FD_read_from_selection() to perform vector
+ *                or scalar reads for the selections
+ *
+ * Return: Success: SUCCEED.
+ * Failure: FAIL.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__mpio_read_selection(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, size_t count,
+ hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[],
+ size_t element_sizes[], void *bufs[] /* out */)
+{
+ H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
+ MPI_Offset mpi_off;
+ MPI_Status mpi_stat; /* Status from I/O operation */
+ int size_i; /* Integer copy of 'size' to read */
+
+ H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
+ H5FD_mpio_collective_opt_t coll_opt_mode;
+
+ MPI_Datatype final_mtype; /* Final memory MPI datatype for all pieces with selection */
+ hbool_t final_mtype_is_derived = FALSE;
+
+ MPI_Datatype final_ftype; /* Final file MPI datatype for all pieces with selection */
+ hbool_t final_ftype_is_derived = FALSE;
+
+ hid_t *s_mem_space_ids = NULL;
+ hid_t *s_file_space_ids = NULL;
+ haddr_t *s_offsets = NULL;
+ size_t *s_element_sizes = NULL;
+ H5_flexible_const_ptr_t *s_bufs = NULL;
+ hbool_t selection_was_sorted = TRUE;
+
+ uint32_t i, j;
+ H5S_t **s_mem_spaces = NULL;
+ H5S_t **s_file_spaces = NULL;
+ haddr_t tmp_offset = 0;
+ void *mpi_bufs_base = NULL;
+ char unused = 0; /* Unused, except for non-NULL pointer value */
+
+#if H5_CHECK_MPI_VERSION(3, 0)
+ MPI_Count bytes_read = 0; /* Number of bytes read in */
+ MPI_Count type_size; /* MPI datatype used for I/O's size */
+ MPI_Count io_size; /* Actual number of bytes requested */
+ MPI_Count n;
+#else
+ int bytes_read = 0; /* Number of bytes read in */
+ int type_size; /* MPI datatype used for I/O's size */
+ int io_size; /* Actual number of bytes requested */
+ int n;
+#endif
+ hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */
+#ifdef H5FDmpio_DEBUG
+ hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+ hbool_t H5FD_mpio_debug_r_flag = (H5FD_mpio_debug_flags_s[(int)'r'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+#endif
+ int mpi_code; /* MPI return code */
+ H5_flexible_const_ptr_t mbb;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_PACKAGE
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_t_flag)
+ fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
+#endif
+
+ /* Sanity checks */
+ assert(file);
+ assert(H5FD_MPIO == file->pub.driver_id);
+ assert((count == 0) || (mem_space_ids));
+ assert((count == 0) || (file_space_ids));
+ assert((count == 0) || (offsets));
+ assert((count == 0) || (element_sizes));
+ assert((count == 0) || (bufs));
+
+ /* Verify that the first elements of the element_sizes and bufs arrays are
+ * valid. */
+ assert((count == 0) || (element_sizes[0] != 0));
+ assert((count == 0) || (bufs[0] != NULL));
+
+ /* Portably initialize MPI status variable */
+ memset(&mpi_stat, 0, sizeof(MPI_Status));
+
+ /* Get the transfer mode from the API context */
+ if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode");
+
+    /*
+     * Set up for a fancy transfer using complex types built from the
+     * selections (collective mode), or fall back to per-selection I/O
+     * (independent mode).
+     */
+ if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
+
+ if (count) {
+ if (H5FD_sort_selection_io_req(&selection_was_sorted, count, mem_space_ids, file_space_ids,
+ offsets, element_sizes, (H5_flexible_const_ptr_t *)bufs,
+ &s_mem_space_ids, &s_file_space_ids, &s_offsets, &s_element_sizes,
+ &s_bufs) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't sort selection I/O request");
+
+ tmp_offset = s_offsets[0];
+
+ if (NULL == (s_file_spaces = H5MM_malloc(count * sizeof(H5S_t *))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "memory allocation failed for file space list");
+ if (NULL == (s_mem_spaces = H5MM_malloc(count * sizeof(H5S_t *))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "memory allocation failed for memory space list");
+
+ for (i = 0; i < count; i++) {
+ if (NULL == (s_mem_spaces[i] = (H5S_t *)H5I_object_verify(s_mem_space_ids[i], H5I_DATASPACE)))
+                    HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, FAIL,
+                                "can't retrieve memory dataspace from ID");
+ if (NULL ==
+ (s_file_spaces[i] = (H5S_t *)H5I_object_verify(s_file_space_ids[i], H5I_DATASPACE)))
+                    HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, FAIL,
+                                "can't retrieve file dataspace from ID");
+ }
+
+            /* When we set up the memory displacements below, all buffer
+             * addresses are expressed as offsets from mpi_bufs_base.
+             *
+             * Since these offsets must all be non-negative, scan through
+             * s_bufs[] to find the smallest address, and choose that for
+             * mpi_bufs_base.
+             */
+ j = 0; /* guess at the index of the smallest value of s_bufs[] */
+
+ if (s_bufs[j + 1].vp != NULL) {
+ for (i = 1; i < count; i++)
+ if (s_bufs[i].vp < s_bufs[j].vp)
+ j = i;
+ }
+
+ mpi_bufs_base = s_bufs[j].vp;
+ mbb.vp = mpi_bufs_base;
+
+ if (H5FD__selection_build_types(FALSE, count, mbb, s_file_spaces, s_mem_spaces, s_offsets, s_bufs,
+ s_element_sizes, s_element_sizes, &final_ftype,
+ &final_ftype_is_derived, &final_mtype,
+ &final_mtype_is_derived) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't build type for MPI-IO");
+
+ /* We have a single, complicated MPI datatype for both memory & file */
+ size_i = 1;
+ }
+ else {
+
+ /* No chunks selected for this process */
+ size_i = 0;
+
+ mpi_bufs_base = &unused;
+
+ /* Set the MPI datatype */
+ final_ftype = MPI_BYTE;
+ final_mtype = MPI_BYTE;
+ }
+
+ /* some numeric conversions */
+ if (H5FD_mpi_haddr_to_MPIOff(tmp_offset, &mpi_off /*out*/) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off");
+
+ /*
+ * Set the file view when we are using MPI derived types
+ */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, final_ftype,
+ H5FD_mpi_native_g, file->info)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code);
+
+    /* When using derived types, the lowest file address was used as the
+     * displacement in MPI_File_set_view() above, so reset mpi_off to 0:
+     * the view now starts at the data offset */
+ if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0");
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ fprintf(stderr, "%s: (%d) using MPIO collective mode\n", __func__, file->mpi_rank);
+#endif
+ /* Get the collective_opt property to check whether the application wants to do IO individually. */
+ if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property");
+
+ if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ fprintf(stderr, "%s: (%d) doing MPI collective IO\n", __func__, file->mpi_rank);
+#endif
+ /* Check whether we should read from rank 0 and broadcast to other ranks */
+ if (H5CX_get_mpio_rank0_bcast()) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ fprintf(stderr, "%s: (%d) doing read-rank0-and-MPI_Bcast\n", __func__, file->mpi_rank);
+#endif
+ /* Indicate path we've taken */
+ rank0_bcast = TRUE;
+
+ /* Read on rank 0 Bcast to other ranks */
+ if (file->mpi_rank == 0) {
+ /* If MPI_File_read_at fails, push an error, but continue
+ * to participate in following MPI_Bcast */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i,
+ final_mtype, &mpi_stat)))
+ HMPI_DONE_ERROR(FAIL, "MPI_File_read_at failed", mpi_code);
+ }
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(mpi_bufs_base, size_i, final_mtype, 0, file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ } /* end if */
+ else
+ /* Perform collective read operation */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, mpi_bufs_base, size_i,
+ final_mtype, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code);
+ } /* end if */
+ else {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
+#endif
+
+ /* Perform independent read operation */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i,
+ final_mtype, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code);
+ } /* end else */
+
+ /*
+ * Reset the file view when we used MPI derived types
+ */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
+ H5FD_mpi_native_g, file->info)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code);
+
+ /* Only retrieve bytes read if this rank _actually_ participated in I/O */
+ if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) {
+ /* How many bytes were actually read? */
+#if H5_CHECK_MPI_VERSION(3, 0)
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, final_mtype, &bytes_read))) {
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) {
+#endif
+ if (rank0_bcast && file->mpi_rank == 0) {
+ /* If MPI_Get_elements(_x) fails for a rank 0 bcast strategy,
+ * push an error, but continue to participate in the following
+ * MPI_Bcast.
+ */
+ bytes_read = -1;
+ HMPI_DONE_ERROR(FAIL, "MPI_Get_elements failed", mpi_code);
+ }
+ else
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code);
+ }
+ } /* end if */
+
+ /* If the rank0-bcast feature was used, broadcast the # of bytes read to
+ * other ranks, which didn't perform any I/O.
+ */
+ /* NOTE: This could be optimized further to be combined with the broadcast
+ * of the data. (QAK - 2019/1/2)
+ */
+ if (rank0_bcast)
+#if H5_CHECK_MPI_VERSION(3, 0)
+ if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_COUNT, 0, file->comm))
+#else
+ if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_INT, 0, file->comm))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0);
+
+ /* Get the type's size */
+#if H5_CHECK_MPI_VERSION(3, 0)
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(final_mtype, &type_size)))
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_size(final_mtype, &type_size)))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code);
+
+ /* Compute the actual number of bytes requested */
+ io_size = type_size * size_i;
+
+ /* Check for read failure */
+ if (bytes_read < 0 || bytes_read > io_size)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed");
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ fprintf(stderr, "%s: (%d) mpi_off = %ld bytes_read = %lld type = %s\n", __func__,
+ file->mpi_rank, (long)mpi_off, (long long)bytes_read, H5FD__mem_t_to_str(type));
+#endif
+
+        /*
+         * This gives us zeroes beyond the end of the physical MPI file.
+         */
+ if ((n = (io_size - bytes_read)) > 0)
+ memset((char *)bufs[0] + bytes_read, 0, (size_t)n);
+
+ } /* end if */
+ else {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
+#endif
+ if (_file->base_addr > 0) {
+ /* Undo base address addition in internal routines before passing down to the mpio driver */
+ for (i = 0; i < count; i++) {
+ assert(offsets[i] >= _file->base_addr);
+ offsets[i] -= _file->base_addr;
+ }
+ }
+
+ if (H5FD_read_from_selection(_file, type, (uint32_t)count, mem_space_ids, file_space_ids, offsets,
+ element_sizes, bufs) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "read vector from selection failed");
+ }
+
+done:
+ /* Free the MPI buf and file types, if they were derived */
+ if (final_mtype_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&final_mtype)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+ if (final_ftype_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&final_ftype)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+
+ /* Cleanup dataspace arrays */
+ if (s_mem_spaces)
+ s_mem_spaces = H5MM_xfree(s_mem_spaces);
+ if (s_file_spaces)
+ s_file_spaces = H5MM_xfree(s_file_spaces);
+
+ if (!selection_was_sorted) {
+ free(s_mem_space_ids);
+ s_mem_space_ids = NULL;
+ free(s_file_space_ids);
+ s_file_space_ids = NULL;
+ free(s_offsets);
+ s_offsets = NULL;
+ free(s_element_sizes);
+ s_element_sizes = NULL;
+ free(s_bufs);
+ s_bufs = NULL;
+ }
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_t_flag)
+ fprintf(stderr, "%s: (%d) Leaving\n", __func__, file->mpi_rank);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5FD__mpio_read_selection() */
+
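In collective mode, the two derived types built by H5FD__selection_build_types() are consumed by a single MPI_File_set_view() / read pair: the lowest file address becomes the view displacement, so the read itself happens at offset 0 with count 1 of the memory type. (The rank0-bcast optimization replaces the collective read with MPI_File_read_at() on rank 0 followed by MPI_Bcast().) A stripped-down sketch of the main sequence, assuming the file handle and committed types already exist (H5FD_mpi_native_g corresponds to the "native" data representation):

    /* Hedged sketch of the collective read path above: set the view,
     * read one instance of the memory type at offset 0, restore the
     * default byte-stream view (all names are illustrative) */
    #include <mpi.h>

    static int
    collective_read(MPI_File fh, MPI_Offset disp, MPI_Datatype ftype,
                    void *buf, MPI_Datatype mtype)
    {
        MPI_Status status;

        /* The view absorbs the file offset: ftype's displacements are
         * relative to disp, so the read starts at offset 0 */
        if (MPI_File_set_view(fh, disp, MPI_BYTE, ftype, "native", MPI_INFO_NULL) != MPI_SUCCESS)
            return -1;

        if (MPI_File_read_at_all(fh, 0, buf, 1, mtype, &status) != MPI_SUCCESS)
            return -1;

        /* Restore the default view for subsequent raw-byte I/O */
        return MPI_File_set_view(fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
    }
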
+/*-------------------------------------------------------------------------
+ * Function: H5FD__mpio_write_selection
+ *
+ * Purpose:     The behavior of this function depends on the transfer
+ *              mode obtained from the API context.
+ *
+ * If the transfer mode is H5FD_MPIO_COLLECTIVE:
+ * --sort the selections
+ * --set mpi_bufs_base
+ * --build the MPI derived types
+ * --perform MPI_File_set_view()
+ *              --perform MPI_File_write_at_all() or MPI_File_write_at(),
+ *                depending on whether the collective opt mode is
+ *                H5FD_MPIO_COLLECTIVE_IO
+ *              --calculate and set the file's local eof for the bytes
+ *                written
+ *
+ * If this is not H5FD_MPIO_COLLECTIVE:
+ * --undo possible base address addition in internal routines
+ *              --call H5FD_write_from_selection() to perform vector
+ *                or scalar writes for the selections
+ *
+ * Return: Success: SUCCEED.
+ * Failure: FAIL.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__mpio_write_selection(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, size_t count,
+ hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[],
+ size_t element_sizes[], const void *bufs[])
+{
+ H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
+ MPI_Offset mpi_off;
+ MPI_Offset save_mpi_off; /* Use at the end of the routine for setting local_eof */
+ MPI_Status mpi_stat; /* Status from I/O operation */
+
+ int size_i;
+ H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
+ H5FD_mpio_collective_opt_t coll_opt_mode;
+
+ MPI_Datatype final_mtype; /* Final memory MPI datatype for all pieces with selection */
+ hbool_t final_mtype_is_derived = FALSE;
+
+ MPI_Datatype final_ftype; /* Final file MPI datatype for all pieces with selection */
+ hbool_t final_ftype_is_derived = FALSE;
+
+ hid_t *s_mem_space_ids = NULL;
+ hid_t *s_file_space_ids = NULL;
+ haddr_t *s_offsets = NULL;
+ size_t *s_element_sizes = NULL;
+ H5_flexible_const_ptr_t *s_bufs = NULL;
+ hbool_t selection_was_sorted = TRUE;
+ const void *mpi_bufs_base = NULL;
+
+ uint32_t i, j;
+ H5S_t **s_mem_spaces = NULL;
+ H5S_t **s_file_spaces = NULL;
+ haddr_t tmp_offset = 0;
+ char unused = 0; /* Unused, except for non-NULL pointer value */
+ H5_flexible_const_ptr_t mbb;
+
+#if H5_CHECK_MPI_VERSION(3, 0)
+ MPI_Count bytes_written;
+ MPI_Count type_size; /* MPI datatype used for I/O's size */
+ MPI_Count io_size; /* Actual number of bytes requested */
+#else
+ int bytes_written;
+ int type_size; /* MPI datatype used for I/O's size */
+ int io_size; /* Actual number of bytes requested */
+#endif
+
+#ifdef H5FDmpio_DEBUG
+ hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+ hbool_t H5FD_mpio_debug_w_flag = (H5FD_mpio_debug_flags_s[(int)'w'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+#endif
+ int mpi_code; /* MPI return code */
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_PACKAGE
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_t_flag)
+ fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
+#endif
+
+ /* Sanity checks */
+ assert(file);
+ assert(H5FD_MPIO == file->pub.driver_id);
+ assert((count == 0) || (mem_space_ids));
+ assert((count == 0) || (file_space_ids));
+ assert((count == 0) || (offsets));
+ assert((count == 0) || (element_sizes));
+ assert((count == 0) || (bufs));
+
+ /* Verify that the first elements of the element_sizes and bufs arrays are
+ * valid. */
+ assert((count == 0) || (element_sizes[0] != 0));
+ assert((count == 0) || (bufs[0] != NULL));
+
+    /* Verify that no data is written while between the MPI_Barrier()s of a file flush */
+ assert(!H5CX_get_mpi_file_flushing());
+
+ /* Portably initialize MPI status variable */
+ memset(&mpi_stat, 0, sizeof(MPI_Status));
+
+ /* Get the transfer mode from the API context */
+ if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode");
+
+ if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
+
+ if (count) {
+ if (H5FD_sort_selection_io_req(&selection_was_sorted, count, mem_space_ids, file_space_ids,
+ offsets, element_sizes, (H5_flexible_const_ptr_t *)bufs,
+ &s_mem_space_ids, &s_file_space_ids, &s_offsets, &s_element_sizes,
+ &s_bufs) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't sort selection I/O request");
+
+ tmp_offset = s_offsets[0];
+
+ if (NULL == (s_file_spaces = H5MM_malloc(count * sizeof(H5S_t *))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "memory allocation failed for file space list");
+ if (NULL == (s_mem_spaces = H5MM_malloc(count * sizeof(H5S_t *))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "memory allocation failed for memory space list");
+
+ for (i = 0; i < count; i++) {
+ if (NULL ==
+ (s_file_spaces[i] = (H5S_t *)H5I_object_verify(s_file_space_ids[i], H5I_DATASPACE)))
+                    HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, FAIL,
+                                "can't retrieve file dataspace from ID");
+ if (NULL == (s_mem_spaces[i] = (H5S_t *)H5I_object_verify(s_mem_space_ids[i], H5I_DATASPACE)))
+                    HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, FAIL,
+                                "can't retrieve memory dataspace from ID");
+ }
+
+            /* When we set up the memory displacements below, all buffer
+             * addresses are expressed as offsets from mpi_bufs_base.
+             *
+             * Since these offsets must all be non-negative, scan through
+             * s_bufs[] to find the smallest address, and choose that for
+             * mpi_bufs_base.
+             */
+ j = 0; /* guess at the index of the smallest value of s_bufs[] */
+
+ if (s_bufs[j + 1].cvp != NULL) {
+ for (i = 1; i < count; i++)
+ if (s_bufs[i].cvp < s_bufs[j].cvp)
+ j = i;
+ }
+
+ mpi_bufs_base = s_bufs[j].cvp;
+ mbb.cvp = mpi_bufs_base;
+
+ if (H5FD__selection_build_types(TRUE, count, mbb, s_file_spaces, s_mem_spaces, s_offsets, s_bufs,
+ s_element_sizes, s_element_sizes, &final_ftype,
+ &final_ftype_is_derived, &final_mtype,
+ &final_mtype_is_derived) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't build type for MPI-IO");
+
+ /* We have a single, complicated MPI datatype for both memory & file */
+ size_i = 1;
+ }
+ else {
+
+ /* No chunks selected for this process */
+ size_i = 0;
+
+ mpi_bufs_base = &unused;
+
+ /* Set the MPI datatype */
+ final_ftype = MPI_BYTE;
+ final_mtype = MPI_BYTE;
+ }
+
+ /* some numeric conversions */
+ if (H5FD_mpi_haddr_to_MPIOff(tmp_offset, &mpi_off /*out*/) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off");
+
+ /* To be used at the end of the routine for setting local_eof */
+ save_mpi_off = mpi_off;
+
+ /*
+ * Set the file view when we are using MPI derived types
+ */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, final_ftype,
+ H5FD_mpi_native_g, file->info)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code);
+
+ /* Reset mpi_off to 0 since the view now starts at the data offset */
+ if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0");
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ fprintf(stderr, "%s: (%d) using MPIO collective mode\n", __func__, file->mpi_rank);
+#endif
+
+ /* Get the collective_opt property to check whether the application wants to do IO individually. */
+ if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property");
+
+ if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ fprintf(stderr, "%s: (%d) doing MPI collective IO\n", __func__, file->mpi_rank);
+#endif
+
+ /* Perform collective write operation */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, mpi_bufs_base, size_i,
+ final_mtype, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code);
+
+        /* Do MPI_File_sync when needed by the underlying ROMIO driver */
+ if (file->mpi_file_sync_required) {
+ if (MPI_SUCCESS != (mpi_code = MPI_File_sync(file->f)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_sync failed", mpi_code);
+ }
+ }
+ else {
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
+#endif
+ /* Perform independent write operation */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_write_at(file->f, mpi_off, mpi_bufs_base, size_i,
+ final_mtype, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code);
+ } /* end else */
+
+ /* Reset the file view when we used MPI derived types */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
+ H5FD_mpi_native_g, file->info)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code);
+
+ /* How many bytes were actually written */
+#if H5_CHECK_MPI_VERSION(3, 0)
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, final_mtype, &bytes_written)))
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_written)))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code);
+
+ /* Get the type's size */
+#if H5_CHECK_MPI_VERSION(3, 0)
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(final_mtype, &type_size)))
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_size(final_mtype, &type_size)))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code);
+
+ /* Compute the actual number of bytes requested */
+ io_size = type_size * size_i;
+
+ /* Check for write failure */
+ if (bytes_written != io_size || bytes_written < 0)
+ HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed");
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ fprintf(stderr, "%s: (%d) mpi_off = %ld bytes_written = %lld type = %s\n", __func__,
+ file->mpi_rank, (long)mpi_off, (long long)bytes_written, H5FD__mem_t_to_str(type));
+#endif
+
+        /* Each process will keep track of its perceived EOF value locally, and
+         * ultimately we will reduce this value to the maximum amongst all
+         * processes, but until then keep the actual eof at HADDR_UNDEF just in
+         * case something bad happens before that point. (Better to have a
+         * value we know is wrong than one that is only potentially wrong.) */
+ file->eof = HADDR_UNDEF;
+
+ if (bytes_written && (((haddr_t)bytes_written + (haddr_t)save_mpi_off) > file->local_eof))
+ file->local_eof = (haddr_t)save_mpi_off + (haddr_t)bytes_written;
+ }
+ else { /* Not H5FD_MPIO_COLLECTIVE */
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
+#endif
+ if (_file->base_addr > 0) {
+ /* Undo base address addition in internal routines before passing down to the mpio driver */
+ for (i = 0; i < count; i++) {
+ assert(offsets[i] >= _file->base_addr);
+ offsets[i] -= _file->base_addr;
+ }
+ }
+
+ if (H5FD_write_from_selection(_file, type, (uint32_t)count, mem_space_ids, file_space_ids, offsets,
+ element_sizes, bufs) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "write vector from selection failed");
+ }
+
+done:
+ /* Free the MPI buf and file types, if they were derived */
+ if (final_mtype_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&final_mtype)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+ if (final_ftype_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&final_ftype)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+
+ /* Cleanup dataspace arrays */
+ if (s_mem_spaces)
+ s_mem_spaces = H5MM_xfree(s_mem_spaces);
+ if (s_file_spaces)
+ s_file_spaces = H5MM_xfree(s_file_spaces);
+
+ if (!selection_was_sorted) {
+ free(s_mem_space_ids);
+ s_mem_space_ids = NULL;
+ free(s_file_space_ids);
+ s_file_space_ids = NULL;
+ free(s_offsets);
+ s_offsets = NULL;
+ free(s_element_sizes);
+ s_element_sizes = NULL;
+ free(s_bufs);
+ s_bufs = NULL;
+ }
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_t_flag)
+ fprintf(stderr, "%s: (%d) Leaving: ret_value = %d\n", __func__, file->mpi_rank, ret_value);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5FD__mpio_write_selection() */
+
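Which branch of the two selection routines above runs is controlled entirely by data transfer properties, not by arguments. A hedged application-side sketch (the dataset, dataspace, and buffer handles are assumed to exist): H5FD_MPIO_COLLECTIVE steers execution into the collective block, and the collective-opt property selects MPI_File_write_at_all() versus MPI_File_write_at():

    /* Hedged example: request collective transfer for a write; dset_id,
     * mem_space_id, file_space_id, and buf are assumed to exist */
    hid_t dxpl_id = H5Pcreate(H5P_DATASET_XFER);

    H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE);

    /* Optional: keep collective access semantics but perform the actual
     * I/O independently inside the driver */
    /* H5Pset_dxpl_mpio_collective_opt(dxpl_id, H5FD_MPIO_INDIVIDUAL_IO); */

    H5Dwrite(dset_id, H5T_NATIVE_INT, mem_space_id, file_space_id, dxpl_id, buf);
    H5Pclose(dxpl_id);
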
+/*-------------------------------------------------------------------------
* Function: H5FD__mpio_flush
*
* Purpose: Makes sure that all data is on disk. This is collective.