summaryrefslogtreecommitdiffstats
path: root/src/H5FDmpio.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/H5FDmpio.c')
-rw-r--r--src/H5FDmpio.c1546
1 files changed, 1346 insertions, 200 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index dd40399..2a5e462 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -45,6 +45,9 @@ static hid_t H5FD_MPIO_g = 0;
/* (Can be changed by setting "HDF5_MPI_OPT_TYPES" environment variable to '0' or '1') */
hbool_t H5FD_mpi_opt_types_g = TRUE;
+/* Whether the driver initialized MPI on its own */
+hbool_t H5FD_mpi_self_initialized = FALSE;
+
/*
* The view is set to this value
*/
@@ -72,66 +75,78 @@ typedef struct H5FD_mpio_t {
/* Private Prototypes */
/* Callbacks */
-static herr_t H5FD__mpio_term(void);
-static H5FD_t * H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr);
-static herr_t H5FD__mpio_close(H5FD_t *_file);
-static herr_t H5FD__mpio_query(const H5FD_t *_f1, unsigned long *flags);
-static haddr_t H5FD__mpio_get_eoa(const H5FD_t *_file, H5FD_mem_t type);
-static herr_t H5FD__mpio_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr);
-static haddr_t H5FD__mpio_get_eof(const H5FD_t *_file, H5FD_mem_t type);
-static herr_t H5FD__mpio_get_handle(H5FD_t *_file, hid_t fapl, void **file_handle);
-static herr_t H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
- void *buf);
-static herr_t H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
- const void *buf);
-static herr_t H5FD__mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
-static herr_t H5FD__mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
-static herr_t H5FD__mpio_delete(const char *filename, hid_t fapl_id);
-static int H5FD__mpio_mpi_rank(const H5FD_t *_file);
-static int H5FD__mpio_mpi_size(const H5FD_t *_file);
-static MPI_Comm H5FD__mpio_communicator(const H5FD_t *_file);
+static herr_t H5FD__mpio_term(void);
+static H5FD_t *H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr);
+static herr_t H5FD__mpio_close(H5FD_t *_file);
+static herr_t H5FD__mpio_query(const H5FD_t *_f1, unsigned long *flags);
+static haddr_t H5FD__mpio_get_eoa(const H5FD_t *_file, H5FD_mem_t type);
+static herr_t H5FD__mpio_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr);
+static haddr_t H5FD__mpio_get_eof(const H5FD_t *_file, H5FD_mem_t type);
+static herr_t H5FD__mpio_get_handle(H5FD_t *_file, hid_t fapl, void **file_handle);
+static herr_t H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
+ void *buf);
+static herr_t H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
+ const void *buf);
+static herr_t H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count,
+ H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], void *bufs[]);
+static herr_t H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count,
+ H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
+ const void *bufs[]);
+static herr_t H5FD__mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
+static herr_t H5FD__mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
+static herr_t H5FD__mpio_delete(const char *filename, hid_t fapl_id);
+static herr_t H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void *input,
+ void **output);
+
+/* Other functions */
+static herr_t H5FD__mpio_vector_build_types(
+ uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], H5_flexible_const_ptr_t bufs[],
+ haddr_t *s_addrs[], size_t *s_sizes[], H5_flexible_const_ptr_t *s_bufs[], hbool_t *vector_was_sorted,
+ MPI_Offset *mpi_off, H5_flexible_const_ptr_t *mpi_bufs_base, int *size_i, MPI_Datatype *buf_type,
+ hbool_t *buf_type_created, MPI_Datatype *file_type, hbool_t *file_type_created, char *unused);
/* The MPIO file driver information */
-static const H5FD_class_mpi_t H5FD_mpio_g = {
- {
- /* Start of superclass information */
- "mpio", /* name */
- HADDR_MAX, /* maxaddr */
- H5F_CLOSE_SEMI, /* fc_degree */
- H5FD__mpio_term, /* terminate */
- NULL, /* sb_size */
- NULL, /* sb_encode */
- NULL, /* sb_decode */
- 0, /* fapl_size */
- NULL, /* fapl_get */
- NULL, /* fapl_copy */
- NULL, /* fapl_free */
- 0, /* dxpl_size */
- NULL, /* dxpl_copy */
- NULL, /* dxpl_free */
- H5FD__mpio_open, /* open */
- H5FD__mpio_close, /* close */
- NULL, /* cmp */
- H5FD__mpio_query, /* query */
- NULL, /* get_type_map */
- NULL, /* alloc */
- NULL, /* free */
- H5FD__mpio_get_eoa, /* get_eoa */
- H5FD__mpio_set_eoa, /* set_eoa */
- H5FD__mpio_get_eof, /* get_eof */
- H5FD__mpio_get_handle, /* get_handle */
- H5FD__mpio_read, /* read */
- H5FD__mpio_write, /* write */
- H5FD__mpio_flush, /* flush */
- H5FD__mpio_truncate, /* truncate */
- NULL, /* lock */
- NULL, /* unlock */
- H5FD__mpio_delete, /* del */
- H5FD_FLMAP_DICHOTOMY /* fl_map */
- }, /* End of superclass information */
- H5FD__mpio_mpi_rank, /* get_rank */
- H5FD__mpio_mpi_size, /* get_size */
- H5FD__mpio_communicator /* get_comm */
+static const H5FD_class_t H5FD_mpio_g = {
+ H5FD_CLASS_VERSION, /* struct version */
+ H5_VFD_MPIO, /* value */
+ "mpio", /* name */
+ HADDR_MAX, /* maxaddr */
+ H5F_CLOSE_SEMI, /* fc_degree */
+ H5FD__mpio_term, /* terminate */
+ NULL, /* sb_size */
+ NULL, /* sb_encode */
+ NULL, /* sb_decode */
+ 0, /* fapl_size */
+ NULL, /* fapl_get */
+ NULL, /* fapl_copy */
+ NULL, /* fapl_free */
+ 0, /* dxpl_size */
+ NULL, /* dxpl_copy */
+ NULL, /* dxpl_free */
+ H5FD__mpio_open, /* open */
+ H5FD__mpio_close, /* close */
+ NULL, /* cmp */
+ H5FD__mpio_query, /* query */
+ NULL, /* get_type_map */
+ NULL, /* alloc */
+ NULL, /* free */
+ H5FD__mpio_get_eoa, /* get_eoa */
+ H5FD__mpio_set_eoa, /* set_eoa */
+ H5FD__mpio_get_eof, /* get_eof */
+ H5FD__mpio_get_handle, /* get_handle */
+ H5FD__mpio_read, /* read */
+ H5FD__mpio_write, /* write */
+ H5FD__mpio_read_vector, /* read_vector */
+ H5FD__mpio_write_vector, /* write_vector */
+ NULL, /* read_selection */
+ NULL, /* write_selection */
+ H5FD__mpio_flush, /* flush */
+ H5FD__mpio_truncate, /* truncate */
+ NULL, /* lock */
+ NULL, /* unlock */
+ H5FD__mpio_delete, /* del */
+ H5FD__mpio_ctl, /* ctl */
+ H5FD_FLMAP_DICHOTOMY /* fl_map */
};
#ifdef H5FDmpio_DEBUG
@@ -157,35 +172,6 @@ static int H5FD_mpio_debug_rank_s = -1;
(H5FD_mpio_debug_rank_s < 0 || H5FD_mpio_debug_rank_s == (file)->mpi_rank)
#endif
-/*--------------------------------------------------------------------------
-NAME
- H5FD__init_package -- Initialize interface-specific information
-
-USAGE
- herr_t H5FD__init_package()
-
-RETURNS
- SUCCEED/FAIL
-
-DESCRIPTION
- Initializes any interface-specific data or routines. (Just calls
- H5FD_mpio_init currently).
-
---------------------------------------------------------------------------*/
-static herr_t
-H5FD__init_package(void)
-{
- herr_t ret_value = SUCCEED;
-
- FUNC_ENTER_STATIC
-
- if (H5FD_mpio_init() < 0)
- HGOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "unable to initialize mpio VFD")
-
-done:
- FUNC_LEAVE_NOAPI(ret_value)
-} /* H5FD__init_package() */
-
#ifdef H5FDmpio_DEBUG
/*---------------------------------------------------------------------------
@@ -219,6 +205,41 @@ H5FD__mpio_parse_debug_str(const char *s)
FUNC_LEAVE_NOAPI_VOID
} /* end H5FD__mpio_parse_debug_str() */
+
+/*---------------------------------------------------------------------------
+ * Function: H5FD__mem_t_to_str
+ *
+ * Purpose: Returns a string representing the enum value in an H5FD_mem_t
+ * enum
+ *
+ * Returns: H5FD_mem_t enum value string
+ *
+ *---------------------------------------------------------------------------
+ */
+static const char *
+H5FD__mem_t_to_str(H5FD_mem_t mem_type)
+{
+ switch (mem_type) {
+ case H5FD_MEM_NOLIST:
+ return "H5FD_MEM_NOLIST";
+ case H5FD_MEM_DEFAULT:
+ return "H5FD_MEM_DEFAULT";
+ case H5FD_MEM_SUPER:
+ return "H5FD_MEM_SUPER";
+ case H5FD_MEM_BTREE:
+ return "H5FD_MEM_BTREE";
+ case H5FD_MEM_DRAW:
+ return "H5FD_MEM_DRAW";
+ case H5FD_MEM_GHEAP:
+ return "H5FD_MEM_GHEAP";
+ case H5FD_MEM_LHEAP:
+ return "H5FD_MEM_LHEAP";
+ case H5FD_MEM_OHDR:
+ return "H5FD_MEM_OHDR";
+ default:
+ return "(Unknown)";
+ }
+}
#endif /* H5FDmpio_DEBUG */
/*-------------------------------------------------------------------------
@@ -239,13 +260,30 @@ hid_t
H5FD_mpio_init(void)
{
static int H5FD_mpio_Debug_inited = 0;
+ char * env = NULL;
hid_t ret_value = H5I_INVALID_HID; /* Return value */
FUNC_ENTER_NOAPI(H5I_INVALID_HID)
/* Register the MPI-IO VFD, if it isn't already */
- if (H5I_VFL != H5I_get_type(H5FD_MPIO_g))
- H5FD_MPIO_g = H5FD_register((const H5FD_class_t *)&H5FD_mpio_g, sizeof(H5FD_class_mpi_t), FALSE);
+ if (H5I_VFL != H5I_get_type(H5FD_MPIO_g)) {
+ H5FD_MPIO_g = H5FD_register((const H5FD_class_t *)&H5FD_mpio_g, sizeof(H5FD_class_t), FALSE);
+
+ /* Check if MPI driver has been loaded dynamically */
+ env = HDgetenv(HDF5_DRIVER);
+ if (env && !HDstrcmp(env, "mpio")) {
+ int mpi_initialized = 0;
+
+ /* Initialize MPI if not already initialized */
+ if (MPI_SUCCESS != MPI_Initialized(&mpi_initialized))
+ HGOTO_ERROR(H5E_VFL, H5E_UNINITIALIZED, H5I_INVALID_HID, "can't check if MPI is initialized")
+ if (!mpi_initialized) {
+ if (MPI_SUCCESS != MPI_Init(NULL, NULL))
+ HGOTO_ERROR(H5E_VFL, H5E_CANTINIT, H5I_INVALID_HID, "can't initialize MPI")
+ H5FD_mpi_self_initialized = TRUE;
+ }
+ }
+ }
if (!H5FD_mpio_Debug_inited) {
const char *s; /* String for environment variables */
@@ -292,6 +330,17 @@ H5FD__mpio_term(void)
{
FUNC_ENTER_STATIC_NOERR
+ /* Terminate MPI if the driver initialized it */
+ if (H5FD_mpi_self_initialized) {
+ int mpi_finalized = 0;
+
+ MPI_Finalized(&mpi_finalized);
+ if (!mpi_finalized)
+ MPI_Finalize();
+
+ H5FD_mpi_self_initialized = FALSE;
+ }
+
/* Reset VFL ID */
H5FD_MPIO_g = 0;
@@ -353,7 +402,7 @@ H5Pset_fapl_mpio(hid_t fapl_id, MPI_Comm comm, MPI_Info info)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI info object")
/* duplication is done during driver setting. */
- ret_value = H5P_set_driver(plist, H5FD_MPIO, NULL);
+ ret_value = H5P_set_driver(plist, H5FD_MPIO, NULL, NULL);
done:
FUNC_LEAVE_API(ret_value)
@@ -808,11 +857,16 @@ H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t H5_ATTR
if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list")
- /* Get the MPI communicator and info object from the property list */
- if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0)
- HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI communicator")
- if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0)
- HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI info object")
+ if (H5FD_mpi_self_initialized) {
+ comm = MPI_COMM_WORLD;
+ }
+ else {
+ /* Get the MPI communicator and info object from the property list */
+ if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI communicator")
+ if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI info object")
+ }
/* Get the MPI rank of this process and the total number of processes */
if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank)))
@@ -862,14 +916,19 @@ H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t H5_ATTR
file->mpi_size = mpi_size;
/* Only processor p0 will get the filesize and broadcast it. */
- if (mpi_rank == 0)
+ if (mpi_rank == 0) {
+ /* If MPI_File_get_size fails, broadcast file size as -1 to signal error */
if (MPI_SUCCESS != (mpi_code = MPI_File_get_size(fh, &file_size)))
- HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code)
+ file_size = (MPI_Offset)-1;
+ }
/* Broadcast file size */
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&file_size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, comm)))
HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code)
+ if (file_size < 0)
+ HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code)
+
/* Determine if the file should be truncated */
if (file_size && (flags & H5F_ACC_TRUNC)) {
/* Truncate the file */
@@ -987,7 +1046,6 @@ H5FD__mpio_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out
*flags |= H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */
*flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
*flags |= H5FD_FEAT_HAS_MPI; /* This driver uses MPI */
- *flags |= H5FD_FEAT_ALLOCATE_EARLY; /* Allocate space early instead of late */
*flags |= H5FD_FEAT_DEFAULT_VFD_COMPATIBLE; /* VFD creates a file which can be opened with the default
VFD */
} /* end if */
@@ -1153,7 +1211,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
MPI_Status mpi_stat; /* Status from I/O operation */
MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
int size_i; /* Integer copy of 'size' to read */
-#if MPI_VERSION >= 3
+#if H5_CHECK_MPI_VERSION(3, 0)
MPI_Count bytes_read = 0; /* Number of bytes read in */
MPI_Count type_size; /* MPI datatype used for I/O's size */
MPI_Count io_size; /* Actual number of bytes requested */
@@ -1165,6 +1223,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
int n;
#endif
hbool_t use_view_this_time = FALSE;
+ hbool_t derived_type = FALSE;
hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */
#ifdef H5FDmpio_DEBUG
hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
@@ -1192,8 +1251,6 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
if (H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off /*out*/) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
size_i = (int)size;
- if ((hsize_t)size_i != size)
- HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from size to size_i")
/* Only look for MPI views for raw data transfers */
if (type == H5FD_MEM_DRAW) {
@@ -1260,10 +1317,14 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
rank0_bcast = TRUE;
/* Read on rank 0 Bcast to other ranks */
- if (file->mpi_rank == 0)
+ if (file->mpi_rank == 0) {
+ /* If MPI_File_read_at fails, push an error, but continue
+ * to participate in following MPI_Bcast */
if (MPI_SUCCESS !=
(mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
- HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+ HMPI_DONE_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+ }
+
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(buf, size_i, buf_type, 0, file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
} /* end if */
@@ -1293,6 +1354,21 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
} /* end if */
else {
+ if (size != (hsize_t)size_i) {
+ /* If HERE, then we need to work around the integer size limit
+ * of 2GB. The input size_t size variable cannot fit into an integer,
+ * but we can get around that limitation by creating a different datatype
+ * and then setting the integer size (or element count) to 1 when using
+ * the derived_type.
+ */
+
+ if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+
+ derived_type = TRUE;
+ size_i = 1;
+ }
+
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
HDfprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
@@ -1306,12 +1382,22 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
/* Only retrieve bytes read if this rank _actually_ participated in I/O */
if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) {
/* How many bytes were actually read? */
-#if MPI_VERSION >= 3
- if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
+#if H5_CHECK_MPI_VERSION(3, 0)
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read))) {
#else
- if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) {
#endif
- HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ if (rank0_bcast && file->mpi_rank == 0) {
+ /* If MPI_Get_elements(_x) fails for a rank 0 bcast strategy,
+ * push an error, but continue to participate in the following
+ * MPI_Bcast.
+ */
+ bytes_read = -1;
+ HMPI_DONE_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ }
+ else
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ }
} /* end if */
/* If the rank0-bcast feature was used, broadcast the # of bytes read to
@@ -1321,7 +1407,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
* of the data. (QAK - 2019/1/2)
*/
if (rank0_bcast)
-#if MPI_VERSION >= 3
+#if H5_CHECK_MPI_VERSION(3, 0)
if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_COUNT, 0, file->comm))
#else
if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_INT, 0, file->comm))
@@ -1329,7 +1415,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0)
/* Get the type's size */
-#if MPI_VERSION >= 3
+#if H5_CHECK_MPI_VERSION(3, 0)
if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
#else
if (MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
@@ -1345,8 +1431,8 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
- HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_read = %lld\n", __func__, file->mpi_rank,
- (long)mpi_off, bytes_read);
+ HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_read = %lld type = %s\n", __func__, file->mpi_rank,
+ (long)mpi_off, (long long)bytes_read, H5FD__mem_t_to_str(type));
#endif
/*
@@ -1356,6 +1442,9 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
HDmemset((char *)buf + bytes_read, 0, (size_t)n);
done:
+ if (derived_type)
+ MPI_Type_free(&buf_type);
+
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
HDfprintf(stderr, "%s: (%d) Leaving\n", __func__, file->mpi_rank);
@@ -1393,7 +1482,7 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h
MPI_Offset mpi_off;
MPI_Status mpi_stat; /* Status from I/O operation */
MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
-#if MPI_VERSION >= 3
+#if H5_CHECK_MPI_VERSION(3, 0)
MPI_Count bytes_written;
MPI_Count type_size; /* MPI datatype used for I/O's size */
MPI_Count io_size; /* Actual number of bytes requested */
@@ -1468,20 +1557,6 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h
*/
mpi_off = 0;
} /* end if */
- else if (size != (hsize_t)size_i) {
- /* If HERE, then we need to work around the integer size limit
- * of 2GB. The input size_t size variable cannot fit into an integer,
- * but we can get around that limitation by creating a different datatype
- * and then setting the integer size (or element count) to 1 when using
- * the derived_type.
- */
-
- if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
- HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
-
- derived_type = TRUE;
- size_i = 1;
- }
/* Write the data. */
if (use_view_this_time) {
@@ -1527,6 +1602,21 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
} /* end if */
else {
+ if (size != (hsize_t)size_i) {
+ /* If HERE, then we need to work around the integer size limit
+ * of 2GB. The input size_t size variable cannot fit into an integer,
+ * but we can get around that limitation by creating a different datatype
+ * and then setting the integer size (or element count) to 1 when using
+ * the derived_type.
+ */
+
+ if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+
+ derived_type = TRUE;
+ size_i = 1;
+ }
+
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
HDfprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
@@ -1538,7 +1628,7 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h
} /* end else */
/* How many bytes were actually written? */
-#if MPI_VERSION >= 3
+#if H5_CHECK_MPI_VERSION(3, 0)
if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_written)))
#else
if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_written)))
@@ -1546,7 +1636,7 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h
HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
/* Get the type's size */
-#if MPI_VERSION >= 3
+#if H5_CHECK_MPI_VERSION(3, 0)
if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
#else
if (MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
@@ -1562,8 +1652,8 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
- HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_written = %lld\n", __func__, file->mpi_rank,
- (long)mpi_off, bytes_written);
+ HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_written = %lld type = %s\n", __func__,
+ file->mpi_rank, (long)mpi_off, (long long)bytes_written, H5FD__mem_t_to_str(type));
#endif
/* Each process will keep track of its perceived EOF value locally, and
@@ -1590,6 +1680,1050 @@ done:
} /* end H5FD__mpio_write() */
/*-------------------------------------------------------------------------
+ * Function: H5FD__mpio_vector_build_types
+ *
+ * Purpose: Build MPI datatypes and calculate offset, base buffer, and
+ * size for MPIO vector I/O. Spun off from common code in
+ * H5FD__mpio_vector_read() and H5FD__mpio_vector_write().
+ *
+ * Return: Success: SUCCEED.
+ * Failure: FAIL.
+ *
+ * Programmer: Neil Fortner
+ * March 14, 2022
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__mpio_vector_build_types(uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
+ H5_flexible_const_ptr_t bufs[], haddr_t *s_addrs[], size_t *s_sizes[],
+ H5_flexible_const_ptr_t *s_bufs[], hbool_t *vector_was_sorted,
+ MPI_Offset *mpi_off, H5_flexible_const_ptr_t *mpi_bufs_base, int *size_i,
+ MPI_Datatype *buf_type, hbool_t *buf_type_created, MPI_Datatype *file_type,
+ hbool_t *file_type_created, char *unused)
+{
+ hsize_t bigio_count; /* Transition point to create derived type */
+ hbool_t fixed_size = FALSE;
+ size_t size;
+ H5FD_mem_t * s_types = NULL;
+ int * mpi_block_lengths = NULL;
+ MPI_Aint mpi_bufs_base_Aint;
+ MPI_Aint * mpi_bufs = NULL;
+ MPI_Aint * mpi_displacements = NULL;
+ MPI_Datatype *sub_types = NULL;
+ uint8_t * sub_types_created = NULL;
+ int i;
+ int j;
+ int mpi_code; /* MPI return code */
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(s_sizes);
+ HDassert(s_bufs);
+ HDassert(vector_was_sorted);
+ HDassert(*vector_was_sorted);
+ HDassert(mpi_off);
+ HDassert(mpi_bufs_base);
+ HDassert(size_i);
+ HDassert(buf_type);
+ HDassert(buf_type_created);
+ HDassert(!*buf_type_created);
+ HDassert(file_type);
+ HDassert(file_type_created);
+ HDassert(!*file_type_created);
+ HDassert(unused);
+
+ /* Get bio I/O transition point (may be lower than 2G for testing) */
+ bigio_count = H5_mpi_get_bigio_count();
+
+ if (count == 1) {
+ /* Single block. Just use a series of MPI_BYTEs for the file view.
+ */
+ *size_i = (int)sizes[0];
+ *buf_type = MPI_BYTE;
+ *file_type = MPI_BYTE;
+ *mpi_bufs_base = bufs[0];
+
+ /* Setup s_addrs, s_sizes and s_bufs (needed for incomplete read filling code and eof
+ * calculation code) */
+ *s_addrs = addrs;
+ *s_sizes = sizes;
+ *s_bufs = bufs;
+
+ /* some numeric conversions */
+ if (H5FD_mpi_haddr_to_MPIOff(addrs[0], mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI offset")
+
+ /* Check for size overflow */
+ if (sizes[0] > bigio_count) {
+ /* We need to work around the integer size limit of 2GB. The input size_t size
+ * variable cannot fit into an integer, but we can get around that limitation by
+ * creating a different datatype and then setting the integer size (or element
+ * count) to 1 when using the derived_type. */
+
+ if (H5_mpio_create_large_type(sizes[0], 0, MPI_BYTE, buf_type) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+ *buf_type_created = TRUE;
+
+ if (H5_mpio_create_large_type(sizes[0], 0, MPI_BYTE, file_type) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+ *file_type_created = TRUE;
+
+ *size_i = 1;
+ }
+ }
+ else if (count > 0) { /* create MPI derived types describing the vector write */
+
+ /* sort the vector I/O request into increasing address order if required
+ *
+ * If the vector is already sorted, the base addresses of types, addrs, sizes,
+ * and bufs will be returned in s_types, s_addrs, s_sizes, and s_bufs respectively.
+ *
+ * If the vector was not already sorted, new, sorted versions of types, addrs, sizes, and bufs
+ * are allocated, populated, and returned in s_types, s_addrs, s_sizes, and s_bufs respectively.
+ * In this case, this function must free the memory allocated for the sorted vectors.
+ */
+ if (H5FD_sort_vector_io_req(vector_was_sorted, count, types, addrs, sizes, bufs, &s_types, s_addrs,
+ s_sizes, s_bufs) < 0)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "can't sort vector I/O request")
+
+ if ((NULL == (mpi_block_lengths = (int *)HDmalloc((size_t)count * sizeof(int)))) ||
+ (NULL == (mpi_displacements = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint)))) ||
+ (NULL == (mpi_bufs = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint))))) {
+
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc mpi block lengths / displacement")
+ }
+
+ /* when we setup mpi_bufs[] below, all addresses are offsets from
+ * mpi_bufs_base.
+ *
+ * Since these offsets must all be positive, we must scan through
+ * s_bufs[] to find the smallest value, and choose that for
+ * mpi_bufs_base.
+ */
+
+ j = 0; /* guess at the index of the smallest value of s_bufs[] */
+
+ for (i = 1; i < (int)count; i++) {
+
+ if ((*s_bufs)[i].cvp < (*s_bufs)[j].cvp) {
+
+ j = i;
+ }
+ }
+
+ *mpi_bufs_base = (*s_bufs)[j];
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_address(mpi_bufs_base->cvp, &mpi_bufs_base_Aint)))
+
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] to mpi_bufs_base failed", mpi_code)
+
+ *size_i = 1;
+
+ fixed_size = FALSE;
+
+ /* load the mpi_block_lengths and mpi_displacements arrays */
+ for (i = 0; i < (int)count; i++) {
+ /* Determine size of this vector element */
+ if (!fixed_size) {
+ if ((*s_sizes)[i] == 0) {
+ HDassert(vector_was_sorted);
+ fixed_size = TRUE;
+ size = sizes[i - 1];
+ }
+ else {
+ size = (*s_sizes)[i];
+ }
+ }
+
+ /* Add to block lengths and displacements arrays */
+ mpi_block_lengths[i] = (int)size;
+ mpi_displacements[i] = (MPI_Aint)(*s_addrs)[i];
+
+ /* convert s_bufs[i] to MPI_Aint... */
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_address((*s_bufs)[i].cvp, &(mpi_bufs[i]))))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] - mpi_bufs_base failed", mpi_code)
+
+ /*... and then subtract mpi_bufs_base_Aint from it. */
+#if ((MPI_VERSION > 3) || ((MPI_VERSION == 3) && (MPI_SUBVERSION >= 1)))
+ mpi_bufs[i] = MPI_Aint_diff(mpi_bufs[i], mpi_bufs_base_Aint);
+#else
+ mpi_bufs[i] = mpi_bufs[i] - mpi_bufs_base_Aint;
+#endif
+
+ /* Check for size overflow */
+ if (size > bigio_count) {
+ /* We need to work around the integer size limit of 2GB. The input size_t size
+ * variable cannot fit into an integer, but we can get around that limitation by
+ * creating a different datatype and then setting the integer size (or element
+ * count) to 1 when using the derived_type. */
+
+ /* Allocate arrays to keep track of types and whether they were created, if
+ * necessary */
+ if (!sub_types) {
+ HDassert(!sub_types_created);
+
+ if (NULL == (sub_types = (int *)HDmalloc((size_t)count * sizeof(MPI_Datatype))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types array")
+ if (NULL == (sub_types_created = (uint8_t *)HDcalloc((size_t)count, 1))) {
+ H5MM_free(sub_types);
+ sub_types = NULL;
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types created array")
+ }
+
+ /* Initialize sub_types to all MPI_BYTE */
+ for (j = 0; j < (int)count; j++)
+ sub_types[j] = MPI_BYTE;
+ }
+ HDassert(sub_types_created);
+
+ /* Create type for large block */
+ if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &sub_types[i]) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+ sub_types_created[i] = TRUE;
+
+ /* Only one of these large types for this vector element */
+ mpi_block_lengths[i] = 1;
+ }
+ else
+ HDassert(size == (size_t)mpi_block_lengths[i]);
+ }
+
+ /* create the memory MPI derived types */
+ if (sub_types) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths, mpi_bufs,
+ sub_types, buf_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for buf_type failed", mpi_code)
+ }
+ else if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, mpi_bufs,
+ MPI_BYTE, buf_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for buf_type failed", mpi_code)
+
+ *buf_type_created = TRUE;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(buf_type)))
+
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for buf_type failed", mpi_code)
+
+ /* create the file MPI derived type */
+ if (sub_types) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths,
+ mpi_displacements, sub_types, file_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for file_type failed", mpi_code)
+ }
+ else if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths,
+ mpi_displacements, MPI_BYTE, file_type)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for file_type failed", mpi_code)
+
+ *file_type_created = TRUE;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(file_type)))
+
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for file_type failed", mpi_code)
+
+ /* Free up memory used to build types */
+ HDassert(mpi_block_lengths);
+ HDfree(mpi_block_lengths);
+ mpi_block_lengths = NULL;
+
+ HDassert(mpi_displacements);
+ HDfree(mpi_displacements);
+ mpi_displacements = NULL;
+
+ HDassert(mpi_bufs);
+ HDfree(mpi_bufs);
+ mpi_bufs = NULL;
+
+ if (sub_types) {
+ HDassert(sub_types);
+
+ for (i = 0; i < (int)count; i++)
+ if (sub_types_created[i])
+ MPI_Type_free(&sub_types[i]);
+
+ HDfree(sub_types);
+ sub_types = NULL;
+ HDfree(sub_types_created);
+ sub_types_created = NULL;
+ }
+
+ /* some numeric conversions */
+ if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0")
+ }
+ else {
+ /* setup for null participation in the collective operation. */
+ *buf_type = MPI_BYTE;
+ *file_type = MPI_BYTE;
+
+ /* Set non-NULL pointer for I/O operation */
+ mpi_bufs_base->vp = unused;
+
+ /* MPI count to read */
+ *size_i = 0;
+
+ /* some numeric conversions */
+ if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0")
+ }
+
+done:
+ /* free sorted vectors if they exist */
+ if (!vector_was_sorted)
+ if (s_types) {
+ HDfree(s_types);
+ s_types = NULL;
+ }
+
+ /* Clean up on error */
+ if (ret_value < 0) {
+ if (mpi_block_lengths) {
+ HDfree(mpi_block_lengths);
+ mpi_block_lengths = NULL;
+ }
+
+ if (mpi_displacements) {
+ HDfree(mpi_displacements);
+ mpi_displacements = NULL;
+ }
+
+ if (mpi_bufs) {
+ HDfree(mpi_bufs);
+ mpi_bufs = NULL;
+ }
+
+ if (sub_types) {
+ HDassert(sub_types_created);
+
+ for (i = 0; i < (int)count; i++)
+ if (sub_types_created[i])
+ MPI_Type_free(&sub_types[i]);
+
+ HDfree(sub_types);
+ sub_types = NULL;
+ HDfree(sub_types_created);
+ sub_types_created = NULL;
+ }
+ }
+
+ /* Make sure we cleaned up */
+ HDassert(!mpi_block_lengths);
+ HDassert(!mpi_displacements);
+ HDassert(!mpi_bufs);
+ HDassert(!sub_types);
+ HDassert(!sub_types_created);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__mpio_vector_build_types() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD__mpio_read_vector()
+ *
+ * Purpose: The behaviour of this function dependes on the value of
+ * the io_xfer_mode obtained from the context.
+ *
+ * If it is H5FD_MPIO_COLLECTIVE, this is a collective
+ * operation, which allows us to use MPI_File_set_view, and
+ * then perform the entire vector read in a single MPI call.
+ *
+ * Do this (if count is positive), by constructing memory
+ * and file derived types from the supplied vector, using
+ * file type to set the file view, and then reading the
+ * the memory type from file. Note that this read is
+ * either independent or collective depending on the
+ * value of mpio_coll_opt -- again obtained from the context.
+ *
+ * If count is zero, participate in the collective read
+ * (if so configured) with an empty read.
+ *
+ * Finally, set the file view back to its default state.
+ *
+ * In contrast, if io_xfer_mode is H5FD_MPIO_INDEPENDENT,
+ * this call is independent, and thus we cannot use
+ * MPI_File_set_view().
+ *
+ * In this case, simply walk the vector, and issue an
+ * independent read for each entry.
+ *
+ * Return: Success: SUCCEED.
+ * Failure: FAIL.
+ *
+ * Programmer: John Mainzer
+ * March 15, 2021
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[],
+ haddr_t addrs[], size_t sizes[], void *bufs[])
+{
+ H5FD_mpio_t * file = (H5FD_mpio_t *)_file;
+ hbool_t vector_was_sorted = TRUE;
+ haddr_t * s_addrs = NULL;
+ size_t * s_sizes = NULL;
+ void ** s_bufs = NULL;
+ char unused = 0; /* Unused, except for non-NULL pointer value */
+ void * mpi_bufs_base = NULL;
+ MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
+ hbool_t buf_type_created = FALSE;
+ MPI_Datatype file_type = MPI_BYTE; /* MPI description of the selection in file */
+ hbool_t file_type_created = FALSE;
+ int i;
+ int mpi_code; /* MPI return code */
+ MPI_Offset mpi_off = 0;
+ MPI_Status mpi_stat; /* Status from I/O operation */
+ H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
+ H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */
+ int size_i;
+#if MPI_VERSION >= 3
+ MPI_Count bytes_read = 0; /* Number of bytes read in */
+ MPI_Count type_size; /* MPI datatype used for I/O's size */
+ MPI_Count io_size; /* Actual number of bytes requested */
+ MPI_Count n;
+#else
+ int bytes_read = 0; /* Number of bytes read in */
+ int type_size; /* MPI datatype used for I/O's size */
+ int io_size; /* Actual number of bytes requested */
+ int n;
+#endif
+ hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */
+#ifdef H5FDmpio_DEBUG
+ hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+ hbool_t H5FD_mpio_debug_r_flag = (H5FD_mpio_debug_flags_s[(int)'r'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+#endif
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_t_flag)
+ HDfprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
+#endif
+
+ /* Sanity checks */
+ HDassert(file);
+ HDassert(H5FD_MPIO == file->pub.driver_id);
+ HDassert((types) || (count == 0));
+ HDassert((addrs) || (count == 0));
+ HDassert((sizes) || (count == 0));
+ HDassert((bufs) || (count == 0));
+
+ /* verify that the first elements of the sizes and types arrays are
+ * valid.
+ */
+ HDassert((count == 0) || (sizes[0] != 0));
+ HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST));
+
+ /* Get the transfer mode from the API context
+ *
+ * This flag is set to H5FD_MPIO_COLLECTIVE if the API call is
+ * collective, and to H5FD_MPIO_INDEPENDENT if it is not.
+ *
+ * While this doesn't mean that we are actually about to do a collective
+ * read, it does mean that all ranks are here, so we can use MPI_File_set_view().
+ */
+ if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
+
+ if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
+ /* Build MPI types, etc. */
+ if (H5FD__mpio_vector_build_types(count, types, addrs, sizes, (H5_flexible_const_ptr_t *)bufs,
+ &s_addrs, &s_sizes, (H5_flexible_const_ptr_t **)&s_bufs,
+ &vector_was_sorted, &mpi_off,
+ (H5_flexible_const_ptr_t *)&mpi_bufs_base, &size_i, &buf_type,
+ &buf_type_created, &file_type, &file_type_created, &unused) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't build MPI datatypes for I/O")
+
+ /* free sorted addrs vector if it exists */
+ if (!vector_was_sorted)
+ if (s_addrs) {
+ HDfree(s_addrs);
+ s_addrs = NULL;
+ }
+
+ /* Portably initialize MPI status variable */
+ HDmemset(&mpi_stat, 0, sizeof(mpi_stat));
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ HDfprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", __func__, (long)mpi_off, size_i);
+#endif
+
+ /* Setup the file view. */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type,
+ H5FD_mpi_native_g, file->info)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+
+ /* Reset mpi_off to 0 since the view now starts at the data offset */
+ if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0")
+
+ /* Get the collective_opt property to check whether the application wants to do IO individually.
+ */
+ if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
+
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property")
+
+ /* Read the data. */
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ HDfprintf(stdout, "%s: using MPIO collective mode\n", __func__);
+#endif
+ if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ HDfprintf(stdout, "%s: doing MPI collective IO\n", __func__);
+#endif
+ /* Check whether we should read from rank 0 and broadcast to other ranks */
+ if (H5CX_get_mpio_rank0_bcast()) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ HDfprintf(stdout, "%s: doing read-rank0-and-MPI_Bcast\n", __func__);
+#endif
+ /* Indicate path we've taken */
+ rank0_bcast = TRUE;
+
+ /* Read on rank 0 Bcast to other ranks */
+ if (file->mpi_rank == 0)
+ if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i,
+ buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(mpi_bufs_base, size_i, buf_type, 0, file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+ } /* end if */
+ else if (MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, mpi_bufs_base, size_i,
+ buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
+ } /* end if */
+ else if (size_i > 0) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__);
+#endif
+
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat)))
+
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+
+ } /* end else */
+
+ /* Reset the file view */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
+ H5FD_mpi_native_g, file->info)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+
+ /* Only retrieve bytes read if this rank _actually_ participated in I/O */
+ if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) {
+ /* How many bytes were actually read? */
+#if MPI_VERSION >= 3
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ } /* end if */
+
+ /* If the rank0-bcast feature was used, broadcast the # of bytes read to
+ * other ranks, which didn't perform any I/O.
+ */
+ /* NOTE: This could be optimized further to be combined with the broadcast
+ * of the data. (QAK - 2019/1/2)
+ * Or have rank 0 clear the unread parts of the buffer prior to
+ * the bcast. (NAF - 2021/9/15)
+ */
+ if (rank0_bcast)
+#if MPI_VERSION >= 3
+ if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_COUNT, 0, file->comm))
+#else
+ if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_INT, 0, file->comm))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0)
+
+ /* Get the type's size */
+#if MPI_VERSION >= 3
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
+
+ /* Compute the actual number of bytes requested */
+ io_size = type_size * size_i;
+
+ /* Check for read failure */
+ if (bytes_read < 0 || bytes_read > io_size)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed")
+
+ /* Check for incomplete read */
+ n = io_size - bytes_read;
+ if (n > 0) {
+ i = (int)count - 1;
+
+ /* Iterate over sorted array in reverse, filling in zeroes to
+ * sections of the buffers that were not read to */
+ do {
+ HDassert(i >= 0);
+
+#if MPI_VERSION >= 3
+ io_size = MIN(n, (MPI_Count)s_sizes[i]);
+ bytes_read = (MPI_Count)s_sizes[i] - io_size;
+#else
+ io_size = MIN(n, (int)s_sizes[i]);
+ bytes_read = (int)s_sizes[i] - io_size;
+#endif
+ HDassert(bytes_read >= 0);
+
+ HDmemset((char *)s_bufs[i] + bytes_read, 0, (size_t)io_size);
+
+ n -= io_size;
+ i--;
+ } while (n > 0);
+ }
+ }
+ else if (count > 0) {
+ haddr_t max_addr = HADDR_MAX;
+ hbool_t fixed_size = FALSE;
+ size_t size;
+
+ /* The read is part of an independent operation. As a result,
+ * we can't use MPI_File_set_view() (since it it a collective operation),
+ * and thus we can't use the above code to construct the MPI datatypes.
+ * In the future, we could write code to detect when a contiguous slab
+ * in the file selection spans multiple vector elements and construct a
+ * memory datatype to match this larger block in the file, but for now
+ * just read in each element of the vector in a separate
+ * MPI_File_read_at() call.
+ *
+ * We could also just detect the case when the entire file selection is
+ * contiguous, which would allow us to use
+ * H5FD__mpio_vector_build_types() to construct the memory datatype.
+ */
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_r_flag)
+ HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__);
+#endif
+
+ /* Loop over vector elements */
+ for (i = 0; i < (int)count; i++) {
+ /* Convert address to mpi offset */
+ if (H5FD_mpi_haddr_to_MPIOff(addrs[i], &mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
+
+ /* Calculate I/O size */
+ if (!fixed_size) {
+ if (sizes[i] == 0) {
+ fixed_size = TRUE;
+ size = sizes[i - 1];
+ }
+ else {
+ size = sizes[i];
+ }
+ }
+ size_i = (int)size;
+
+ if (size != (size_t)size_i) {
+ /* If HERE, then we need to work around the integer size limit
+ * of 2GB. The input size_t size variable cannot fit into an integer,
+ * but we can get around that limitation by creating a different datatype
+ * and then setting the integer size (or element count) to 1 when using
+ * the derived_type.
+ */
+
+ if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+
+ buf_type_created = TRUE;
+ size_i = 1;
+ }
+
+ /* Check if we actually need to do I/O */
+ if (addrs[i] < max_addr) {
+ /* Portably initialize MPI status variable */
+ HDmemset(&mpi_stat, 0, sizeof(mpi_stat));
+
+ /* Issue read */
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_File_read_at(file->f, mpi_off, bufs[i], size_i, buf_type, &mpi_stat)))
+
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+
+ /* How many bytes were actually read? */
+#if MPI_VERSION >= 3
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, MPI_BYTE, &bytes_read)))
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+#endif
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+
+ /* Compute the actual number of bytes requested */
+#if MPI_VERSION >= 3
+ io_size = (MPI_Count)size;
+#else
+ io_size = (int)size;
+#endif
+
+ /* Check for read failure */
+ if (bytes_read < 0 || bytes_read > io_size)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed")
+
+ /*
+ * If we didn't read the entire I/O, fill in zeroes beyond end of
+ * the physical MPI file and don't issue any more reads at higher
+ * addresses.
+ */
+ if ((n = (io_size - bytes_read)) > 0) {
+ HDmemset((char *)bufs[i] + bytes_read, 0, (size_t)n);
+ max_addr = addrs[i] + (haddr_t)bytes_read;
+ }
+ }
+ else {
+ /* Read is past the max address, fill in zeroes */
+ HDmemset((char *)bufs[i], 0, size);
+ }
+ }
+ }
+
+done:
+ if (buf_type_created) {
+ MPI_Type_free(&buf_type);
+ }
+
+ if (file_type_created) {
+ MPI_Type_free(&file_type);
+ }
+
+ /* free sorted vectors if they exist */
+ if (!vector_was_sorted) {
+ if (s_addrs) {
+ HDfree(s_addrs);
+ s_addrs = NULL;
+ }
+ if (s_sizes) {
+ HDfree(s_sizes);
+ s_sizes = NULL;
+ }
+ if (s_bufs) {
+ HDfree(s_bufs);
+ s_bufs = NULL;
+ }
+ }
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_t_flag)
+ HDfprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", __func__, file->mpi_rank, ret_value);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5FD__mpio_read_vector() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD__mpio_write_vector
+ *
+ * Purpose: The behaviour of this function dependes on the value of
+ * the io_xfer_mode obtained from the context.
+ *
+ * If it is H5FD_MPIO_COLLECTIVE, this is a collective
+ * operation, which allows us to use MPI_File_set_view, and
+ * then perform the entire vector write in a single MPI call.
+ *
+ * Do this (if count is positive), by constructing memory
+ * and file derived types from the supplied vector, using
+ * file type to set the file view, and then writing the
+ * the memory type to file. Note that this write is
+ * either independent or collective depending on the
+ * value of mpio_coll_opt -- again obtained from the context.
+ *
+ * If count is zero, participate in the collective write
+ * (if so configured) with an empty write.
+ *
+ * Finally, set the file view back to its default state.
+ *
+ * In contrast, if io_xfer_mode is H5FD_MPIO_INDEPENDENT,
+ * this call is independent, and thus we cannot use
+ * MPI_File_set_view().
+ *
+ * In this case, simply walk the vector, and issue an
+ * independent write for each entry.
+ *
+ * Return: Success: SUCCEED.
+ * Failure: FAIL.
+ *
+ * Programmer: John Mainzer
+ * March 15, 2021
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[],
+ haddr_t addrs[], size_t sizes[], const void *bufs[])
+{
+ H5FD_mpio_t * file = (H5FD_mpio_t *)_file;
+ hbool_t vector_was_sorted = TRUE;
+ haddr_t * s_addrs = NULL;
+ size_t * s_sizes = NULL;
+ const void ** s_bufs = NULL;
+ char unused = 0; /* Unused, except for non-NULL pointer value */
+ const void * mpi_bufs_base = NULL;
+ MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
+ hbool_t buf_type_created = FALSE;
+ MPI_Datatype file_type = MPI_BYTE; /* MPI description of the selection in file */
+ hbool_t file_type_created = FALSE;
+ int i;
+ int mpi_code; /* MPI return code */
+ MPI_Offset mpi_off = 0;
+ MPI_Status mpi_stat; /* Status from I/O operation */
+ H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
+ H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */
+ int size_i;
+#ifdef H5FDmpio_DEBUG
+ hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+ hbool_t H5FD_mpio_debug_w_flag = (H5FD_mpio_debug_flags_s[(int)'w'] && H5FD_MPIO_TRACE_THIS_RANK(file));
+#endif
+ haddr_t max_addr = 0;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_t_flag)
+ HDfprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
+#endif
+
+ /* Sanity checks */
+ HDassert(file);
+ HDassert(H5FD_MPIO == file->pub.driver_id);
+ HDassert((types) || (count == 0));
+ HDassert((addrs) || (count == 0));
+ HDassert((sizes) || (count == 0));
+ HDassert((bufs) || (count == 0));
+
+ /* verify that the first elements of the sizes and types arrays are
+ * valid.
+ */
+ HDassert((count == 0) || (sizes[0] != 0));
+ HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST));
+
+ /* Verify that no data is written when between MPI_Barrier()s during file flush */
+
+ HDassert(!H5CX_get_mpi_file_flushing());
+
+ /* Get the transfer mode from the API context
+ *
+ * This flag is set to H5FD_MPIO_COLLECTIVE if the API call is
+ * collective, and to H5FD_MPIO_INDEPENDENT if it is not.
+ *
+ * While this doesn't mean that we are actually about to do a collective
+ * write, it does mean that all ranks are here, so we can use MPI_File_set_view().
+ */
+ if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
+
+ if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
+ /* Build MPI types, etc. */
+ if (H5FD__mpio_vector_build_types(count, types, addrs, sizes, (H5_flexible_const_ptr_t *)bufs,
+ &s_addrs, &s_sizes, (H5_flexible_const_ptr_t **)&s_bufs,
+ &vector_was_sorted, &mpi_off,
+ (H5_flexible_const_ptr_t *)&mpi_bufs_base, &size_i, &buf_type,
+ &buf_type_created, &file_type, &file_type_created, &unused) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't build MPI datatypes for I/O")
+
+ /* Compute max addr writted to */
+ if (count > 0)
+ max_addr = s_addrs[count - 1] + (haddr_t)(s_sizes[count - 1]);
+
+ /* free sorted vectors if they exist */
+ if (!vector_was_sorted) {
+ if (s_addrs) {
+ HDfree(s_addrs);
+ s_addrs = NULL;
+ }
+ if (s_sizes) {
+ HDfree(s_sizes);
+ s_sizes = NULL;
+ }
+ if (s_bufs) {
+ HDfree(s_bufs);
+ s_bufs = NULL;
+ }
+ }
+
+ /* Portably initialize MPI status variable */
+ HDmemset(&mpi_stat, 0, sizeof(MPI_Status));
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ HDfprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", __func__, (long)mpi_off, size_i);
+#endif
+
+ /* Setup the file view. */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type,
+ H5FD_mpi_native_g, file->info)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+
+ /* Reset mpi_off to 0 since the view now starts at the data offset */
+ if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0")
+
+ /* Get the collective_opt property to check whether the application wants to do IO individually.
+ */
+ if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property")
+
+ /* Write the data. */
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ HDfprintf(stdout, "%s: using MPIO collective mode\n", __func__);
+#endif
+
+ if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ HDfprintf(stdout, "%s: doing MPI collective IO\n", __func__);
+#endif
+
+ if (MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, mpi_bufs_base, size_i,
+ buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code)
+ } /* end if */
+ else if (size_i > 0) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__);
+#endif
+
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_File_write_at(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
+ } /* end else */
+
+ /* Reset the file view */
+ if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
+ H5FD_mpi_native_g, file->info)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+ }
+ else if (count > 0) {
+ hbool_t fixed_size = FALSE;
+ size_t size;
+
+ /* The read is part of an independent operation. As a result,
+ * we can't use MPI_File_set_view() (since it it a collective operation),
+ * and thus we can't use the above code to construct the MPI datatypes.
+ * In the future, we could write code to detect when a contiguous slab
+ * in the file selection spans multiple vector elements and construct a
+ * memory datatype to match this larger block in the file, but for now
+ * just read in each element of the vector in a separate
+ * MPI_File_read_at() call.
+ *
+ * We could also just detect the case when the entire file selection is
+ * contiguous, which would allow us to use
+ * H5FD__mpio_vector_build_types() to construct the memory datatype.
+ */
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_w_flag)
+ HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__);
+#endif
+
+ /* Loop over vector elements */
+ for (i = 0; i < (int)count; i++) {
+ /* Convert address to mpi offset */
+ if (H5FD_mpi_haddr_to_MPIOff(addrs[i], &mpi_off) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
+
+ /* Calculate I/O size */
+ if (!fixed_size) {
+ if (sizes[i] == 0) {
+ fixed_size = TRUE;
+ size = sizes[i - 1];
+ }
+ else {
+ size = sizes[i];
+ }
+ }
+ size_i = (int)size;
+
+ if (size != (size_t)size_i) {
+ /* If HERE, then we need to work around the integer size limit
+ * of 2GB. The input size_t size variable cannot fit into an integer,
+ * but we can get around that limitation by creating a different datatype
+ * and then setting the integer size (or element count) to 1 when using
+ * the derived_type.
+ */
+
+ if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype")
+
+ buf_type_created = TRUE;
+ size_i = 1;
+ }
+
+ /* Perform write */
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_File_write_at(file->f, mpi_off, bufs[i], size_i, buf_type, &mpi_stat)))
+
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
+
+ /* Check if this is the highest address written to so far */
+ if (addrs[i] + size > max_addr)
+ max_addr = addrs[i] + size;
+ }
+ }
+
+ /* Each process will keep track of its perceived EOF value locally, and
+ * ultimately we will reduce this value to the maximum amongst all
+ * processes, but until then keep the actual eof at HADDR_UNDEF just in
+ * case something bad happens before that point. (rather have a value
+ * we know is wrong sitting around rather than one that could only
+ * potentially be wrong.)
+ */
+ file->eof = HADDR_UNDEF;
+
+ /* check to see if the local eof has changed been extended, and update if so */
+ if (max_addr > file->local_eof)
+ file->local_eof = max_addr;
+
+done:
+ if (buf_type_created)
+ MPI_Type_free(&buf_type);
+
+ if (file_type_created)
+ MPI_Type_free(&file_type);
+
+ /* Cleanup on error */
+ if (ret_value < 0 && !vector_was_sorted) {
+ if (s_addrs) {
+ HDfree(s_addrs);
+ s_addrs = NULL;
+ }
+ if (s_sizes) {
+ HDfree(s_sizes);
+ s_sizes = NULL;
+ }
+ if (s_bufs) {
+ HDfree(s_bufs);
+ s_bufs = NULL;
+ }
+ }
+
+ /* Make sure we cleaned up */
+ HDassert(vector_was_sorted || !s_addrs);
+ HDassert(vector_was_sorted || !s_sizes);
+ HDassert(vector_was_sorted || !s_bufs);
+
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_debug_t_flag)
+ HDfprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", __func__, file->mpi_rank, ret_value);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__mpio_write_vector() */
+
+/*-------------------------------------------------------------------------
* Function: H5FD__mpio_flush
*
* Purpose: Makes sure that all data is on disk. This is collective.
@@ -1701,17 +2835,19 @@ H5FD__mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR
HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
/* Only processor p0 will get the filesize and broadcast it. */
- /* (Note that throwing an error here will cause non-rank 0 processes
- * to hang in following Bcast. -QAK, 3/17/2018)
- */
- if (0 == file->mpi_rank)
+ if (0 == file->mpi_rank) {
+ /* If MPI_File_get_size fails, broadcast file size as -1 to signal error */
if (MPI_SUCCESS != (mpi_code = MPI_File_get_size(file->f, &size)))
- HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code)
+ size = (MPI_Offset)-1;
+ }
/* Broadcast file size */
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+ if (size < 0)
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code)
+
if (H5FD_mpi_haddr_to_MPIOff(file->eoa, &needed_eof) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset")
@@ -1774,11 +2910,16 @@ H5FD__mpio_delete(const char *filename, hid_t fapl_id)
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
HDassert(H5FD_MPIO == H5P_peek_driver(plist));
- /* Get the MPI communicator and info from the fapl */
- if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0)
- HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI info object")
- if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0)
- HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI communicator")
+ if (H5FD_mpi_self_initialized) {
+ comm = MPI_COMM_WORLD;
+ }
+ else {
+ /* Get the MPI communicator and info from the fapl */
+ if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI info object")
+ if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0)
+ HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI communicator")
+ }
/* Get the MPI rank of this process */
if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank)))
@@ -1789,96 +2930,101 @@ H5FD__mpio_delete(const char *filename, hid_t fapl_id)
HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
/* Delete the file */
- if (mpi_rank == 0)
+ if (mpi_rank == 0) {
+ /* If MPI_File_delete fails, push an error but
+ * still participate in the following MPI_Barrier
+ */
if (MPI_SUCCESS != (mpi_code = MPI_File_delete(filename, info)))
- HMPI_GOTO_ERROR(FAIL, "MPI_File_delete failed", mpi_code)
+ HMPI_DONE_ERROR(FAIL, "MPI_File_delete failed", mpi_code)
+ }
/* Set up a barrier (don't want processes to run ahead of the delete) */
if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
done:
+ /* Free duplicated MPI Communicator and Info objects */
+ if (H5_mpi_comm_free(&comm) < 0)
+ HDONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI communicator")
+ if (H5_mpi_info_free(&info) < 0)
+ HDONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI info object")
+
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_delete() */
/*-------------------------------------------------------------------------
- * Function: H5FD__mpio_mpi_rank
+ * Function: H5FD__mpio_ctl
*
- * Purpose: Returns the MPI rank for a process
+ * Purpose: MPIO version of the ctl callback.
*
- * Return: Success: non-negative
- * Failure: negative
+ * The desired operation is specified by the op_code
+ * parameter.
*
- * Programmer: Quincey Koziol
- * Thursday, May 16, 2002
+ * The flags parameter controls management of op_codes that
+ * are unknown to the callback
*
- *-------------------------------------------------------------------------
- */
-static int
-H5FD__mpio_mpi_rank(const H5FD_t *_file)
-{
- const H5FD_mpio_t *file = (const H5FD_mpio_t *)_file;
-
- FUNC_ENTER_STATIC_NOERR
-
- /* Sanity checks */
- HDassert(file);
- HDassert(H5FD_MPIO == file->pub.driver_id);
-
- FUNC_LEAVE_NOAPI(file->mpi_rank)
-} /* end H5FD__mpio_mpi_rank() */
-
-/*-------------------------------------------------------------------------
- * Function: H5FD__mpio_mpi_size
+ * The input and output parameters allow op_code specific
+ * input and output
*
- * Purpose: Returns the number of MPI processes
+ * At present, the supported op codes are:
*
- * Return: Success: non-negative
- * Failure: negative
+ * H5FD_CTL__GET_MPI_COMMUNICATOR_OPCODE
+ * H5FD_CTL__GET_MPI_RANK_OPCODE
+ * H5FD_CTL__GET_MPI_SIZE_OPCODE
*
- * Programmer: Quincey Koziol
- * Thursday, May 16, 2002
+ * Note that these opcodes must be supported by all VFDs that
+ * support MPI.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: JRM -- 8/3/21
*
*-------------------------------------------------------------------------
*/
-static int
-H5FD__mpio_mpi_size(const H5FD_t *_file)
+static herr_t
+H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void H5_ATTR_UNUSED *input,
+ void **output)
{
- const H5FD_mpio_t *file = (const H5FD_mpio_t *)_file;
+ H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
+ herr_t ret_value = SUCCEED; /* Return value */
- FUNC_ENTER_STATIC_NOERR
+ FUNC_ENTER_NOAPI(FAIL)
/* Sanity checks */
HDassert(file);
HDassert(H5FD_MPIO == file->pub.driver_id);
- FUNC_LEAVE_NOAPI(file->mpi_size)
-} /* end H5FD__mpio_mpi_size() */
+ switch (op_code) {
-/*-------------------------------------------------------------------------
- * Function: H5FD__mpio_communicator
- *
- * Purpose: Returns the MPI communicator for the file.
- *
- * Return: Success: The communicator
- * Failure: Can't fail
- *
- * Programmer: Robb Matzke
- * Monday, August 9, 1999
- *
- *-------------------------------------------------------------------------
- */
-static MPI_Comm
-H5FD__mpio_communicator(const H5FD_t *_file)
-{
- const H5FD_mpio_t *file = (const H5FD_mpio_t *)_file;
+ case H5FD_CTL__GET_MPI_COMMUNICATOR_OPCODE:
+ HDassert(output);
+ HDassert(*output);
+ **((MPI_Comm **)output) = file->comm;
+ break;
- FUNC_ENTER_STATIC_NOERR
+ case H5FD_CTL__GET_MPI_RANK_OPCODE:
+ HDassert(output);
+ HDassert(*output);
+ **((int **)output) = file->mpi_rank;
+ break;
- /* Sanity checks */
- HDassert(file);
- HDassert(H5FD_MPIO == file->pub.driver_id);
+ case H5FD_CTL__GET_MPI_SIZE_OPCODE:
+ HDassert(output);
+ HDassert(*output);
+ **((int **)output) = file->mpi_size;
+ break;
+
+ default: /* unknown op code */
+ if (flags & H5FD_CTL__FAIL_IF_UNKNOWN_FLAG) {
+
+ HGOTO_ERROR(H5E_VFL, H5E_FCNTL, FAIL, "unknown op_code and fail if unknown")
+ }
+ break;
+ }
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
- FUNC_LEAVE_NOAPI(file->comm)
-} /* end H5FD__mpio_communicator() */
+} /* end H5FD__mpio_ctl() */
#endif /* H5_HAVE_PARALLEL */