diff options
Diffstat (limited to 'src/H5FDmpio.c')
-rw-r--r-- | src/H5FDmpio.c | 1546 |
1 files changed, 1346 insertions, 200 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index dd40399..2a5e462 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -45,6 +45,9 @@ static hid_t H5FD_MPIO_g = 0; /* (Can be changed by setting "HDF5_MPI_OPT_TYPES" environment variable to '0' or '1') */ hbool_t H5FD_mpi_opt_types_g = TRUE; +/* Whether the driver initialized MPI on its own */ +hbool_t H5FD_mpi_self_initialized = FALSE; + /* * The view is set to this value */ @@ -72,66 +75,78 @@ typedef struct H5FD_mpio_t { /* Private Prototypes */ /* Callbacks */ -static herr_t H5FD__mpio_term(void); -static H5FD_t * H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr); -static herr_t H5FD__mpio_close(H5FD_t *_file); -static herr_t H5FD__mpio_query(const H5FD_t *_f1, unsigned long *flags); -static haddr_t H5FD__mpio_get_eoa(const H5FD_t *_file, H5FD_mem_t type); -static herr_t H5FD__mpio_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr); -static haddr_t H5FD__mpio_get_eof(const H5FD_t *_file, H5FD_mem_t type); -static herr_t H5FD__mpio_get_handle(H5FD_t *_file, hid_t fapl, void **file_handle); -static herr_t H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, - void *buf); -static herr_t H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, - const void *buf); -static herr_t H5FD__mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); -static herr_t H5FD__mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); -static herr_t H5FD__mpio_delete(const char *filename, hid_t fapl_id); -static int H5FD__mpio_mpi_rank(const H5FD_t *_file); -static int H5FD__mpio_mpi_size(const H5FD_t *_file); -static MPI_Comm H5FD__mpio_communicator(const H5FD_t *_file); +static herr_t H5FD__mpio_term(void); +static H5FD_t *H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr); +static herr_t H5FD__mpio_close(H5FD_t *_file); +static herr_t H5FD__mpio_query(const H5FD_t *_f1, unsigned long *flags); +static haddr_t H5FD__mpio_get_eoa(const H5FD_t *_file, H5FD_mem_t type); +static herr_t H5FD__mpio_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr); +static haddr_t H5FD__mpio_get_eof(const H5FD_t *_file, H5FD_mem_t type); +static herr_t H5FD__mpio_get_handle(H5FD_t *_file, hid_t fapl, void **file_handle); +static herr_t H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, + void *buf); +static herr_t H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, + const void *buf); +static herr_t H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, + H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], void *bufs[]); +static herr_t H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, + H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], + const void *bufs[]); +static herr_t H5FD__mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); +static herr_t H5FD__mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); +static herr_t H5FD__mpio_delete(const char *filename, hid_t fapl_id); +static herr_t H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void *input, + void **output); + +/* Other functions */ +static herr_t H5FD__mpio_vector_build_types( + uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], H5_flexible_const_ptr_t bufs[], + haddr_t *s_addrs[], size_t *s_sizes[], H5_flexible_const_ptr_t *s_bufs[], hbool_t *vector_was_sorted, + MPI_Offset *mpi_off, H5_flexible_const_ptr_t *mpi_bufs_base, int *size_i, MPI_Datatype *buf_type, + hbool_t *buf_type_created, MPI_Datatype *file_type, hbool_t *file_type_created, char *unused); /* The MPIO file driver information */ -static const H5FD_class_mpi_t H5FD_mpio_g = { - { - /* Start of superclass information */ - "mpio", /* name */ - HADDR_MAX, /* maxaddr */ - H5F_CLOSE_SEMI, /* fc_degree */ - H5FD__mpio_term, /* terminate */ - NULL, /* sb_size */ - NULL, /* sb_encode */ - NULL, /* sb_decode */ - 0, /* fapl_size */ - NULL, /* fapl_get */ - NULL, /* fapl_copy */ - NULL, /* fapl_free */ - 0, /* dxpl_size */ - NULL, /* dxpl_copy */ - NULL, /* dxpl_free */ - H5FD__mpio_open, /* open */ - H5FD__mpio_close, /* close */ - NULL, /* cmp */ - H5FD__mpio_query, /* query */ - NULL, /* get_type_map */ - NULL, /* alloc */ - NULL, /* free */ - H5FD__mpio_get_eoa, /* get_eoa */ - H5FD__mpio_set_eoa, /* set_eoa */ - H5FD__mpio_get_eof, /* get_eof */ - H5FD__mpio_get_handle, /* get_handle */ - H5FD__mpio_read, /* read */ - H5FD__mpio_write, /* write */ - H5FD__mpio_flush, /* flush */ - H5FD__mpio_truncate, /* truncate */ - NULL, /* lock */ - NULL, /* unlock */ - H5FD__mpio_delete, /* del */ - H5FD_FLMAP_DICHOTOMY /* fl_map */ - }, /* End of superclass information */ - H5FD__mpio_mpi_rank, /* get_rank */ - H5FD__mpio_mpi_size, /* get_size */ - H5FD__mpio_communicator /* get_comm */ +static const H5FD_class_t H5FD_mpio_g = { + H5FD_CLASS_VERSION, /* struct version */ + H5_VFD_MPIO, /* value */ + "mpio", /* name */ + HADDR_MAX, /* maxaddr */ + H5F_CLOSE_SEMI, /* fc_degree */ + H5FD__mpio_term, /* terminate */ + NULL, /* sb_size */ + NULL, /* sb_encode */ + NULL, /* sb_decode */ + 0, /* fapl_size */ + NULL, /* fapl_get */ + NULL, /* fapl_copy */ + NULL, /* fapl_free */ + 0, /* dxpl_size */ + NULL, /* dxpl_copy */ + NULL, /* dxpl_free */ + H5FD__mpio_open, /* open */ + H5FD__mpio_close, /* close */ + NULL, /* cmp */ + H5FD__mpio_query, /* query */ + NULL, /* get_type_map */ + NULL, /* alloc */ + NULL, /* free */ + H5FD__mpio_get_eoa, /* get_eoa */ + H5FD__mpio_set_eoa, /* set_eoa */ + H5FD__mpio_get_eof, /* get_eof */ + H5FD__mpio_get_handle, /* get_handle */ + H5FD__mpio_read, /* read */ + H5FD__mpio_write, /* write */ + H5FD__mpio_read_vector, /* read_vector */ + H5FD__mpio_write_vector, /* write_vector */ + NULL, /* read_selection */ + NULL, /* write_selection */ + H5FD__mpio_flush, /* flush */ + H5FD__mpio_truncate, /* truncate */ + NULL, /* lock */ + NULL, /* unlock */ + H5FD__mpio_delete, /* del */ + H5FD__mpio_ctl, /* ctl */ + H5FD_FLMAP_DICHOTOMY /* fl_map */ }; #ifdef H5FDmpio_DEBUG @@ -157,35 +172,6 @@ static int H5FD_mpio_debug_rank_s = -1; (H5FD_mpio_debug_rank_s < 0 || H5FD_mpio_debug_rank_s == (file)->mpi_rank) #endif -/*-------------------------------------------------------------------------- -NAME - H5FD__init_package -- Initialize interface-specific information - -USAGE - herr_t H5FD__init_package() - -RETURNS - SUCCEED/FAIL - -DESCRIPTION - Initializes any interface-specific data or routines. (Just calls - H5FD_mpio_init currently). - ---------------------------------------------------------------------------*/ -static herr_t -H5FD__init_package(void) -{ - herr_t ret_value = SUCCEED; - - FUNC_ENTER_STATIC - - if (H5FD_mpio_init() < 0) - HGOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "unable to initialize mpio VFD") - -done: - FUNC_LEAVE_NOAPI(ret_value) -} /* H5FD__init_package() */ - #ifdef H5FDmpio_DEBUG /*--------------------------------------------------------------------------- @@ -219,6 +205,41 @@ H5FD__mpio_parse_debug_str(const char *s) FUNC_LEAVE_NOAPI_VOID } /* end H5FD__mpio_parse_debug_str() */ + +/*--------------------------------------------------------------------------- + * Function: H5FD__mem_t_to_str + * + * Purpose: Returns a string representing the enum value in an H5FD_mem_t + * enum + * + * Returns: H5FD_mem_t enum value string + * + *--------------------------------------------------------------------------- + */ +static const char * +H5FD__mem_t_to_str(H5FD_mem_t mem_type) +{ + switch (mem_type) { + case H5FD_MEM_NOLIST: + return "H5FD_MEM_NOLIST"; + case H5FD_MEM_DEFAULT: + return "H5FD_MEM_DEFAULT"; + case H5FD_MEM_SUPER: + return "H5FD_MEM_SUPER"; + case H5FD_MEM_BTREE: + return "H5FD_MEM_BTREE"; + case H5FD_MEM_DRAW: + return "H5FD_MEM_DRAW"; + case H5FD_MEM_GHEAP: + return "H5FD_MEM_GHEAP"; + case H5FD_MEM_LHEAP: + return "H5FD_MEM_LHEAP"; + case H5FD_MEM_OHDR: + return "H5FD_MEM_OHDR"; + default: + return "(Unknown)"; + } +} #endif /* H5FDmpio_DEBUG */ /*------------------------------------------------------------------------- @@ -239,13 +260,30 @@ hid_t H5FD_mpio_init(void) { static int H5FD_mpio_Debug_inited = 0; + char * env = NULL; hid_t ret_value = H5I_INVALID_HID; /* Return value */ FUNC_ENTER_NOAPI(H5I_INVALID_HID) /* Register the MPI-IO VFD, if it isn't already */ - if (H5I_VFL != H5I_get_type(H5FD_MPIO_g)) - H5FD_MPIO_g = H5FD_register((const H5FD_class_t *)&H5FD_mpio_g, sizeof(H5FD_class_mpi_t), FALSE); + if (H5I_VFL != H5I_get_type(H5FD_MPIO_g)) { + H5FD_MPIO_g = H5FD_register((const H5FD_class_t *)&H5FD_mpio_g, sizeof(H5FD_class_t), FALSE); + + /* Check if MPI driver has been loaded dynamically */ + env = HDgetenv(HDF5_DRIVER); + if (env && !HDstrcmp(env, "mpio")) { + int mpi_initialized = 0; + + /* Initialize MPI if not already initialized */ + if (MPI_SUCCESS != MPI_Initialized(&mpi_initialized)) + HGOTO_ERROR(H5E_VFL, H5E_UNINITIALIZED, H5I_INVALID_HID, "can't check if MPI is initialized") + if (!mpi_initialized) { + if (MPI_SUCCESS != MPI_Init(NULL, NULL)) + HGOTO_ERROR(H5E_VFL, H5E_CANTINIT, H5I_INVALID_HID, "can't initialize MPI") + H5FD_mpi_self_initialized = TRUE; + } + } + } if (!H5FD_mpio_Debug_inited) { const char *s; /* String for environment variables */ @@ -292,6 +330,17 @@ H5FD__mpio_term(void) { FUNC_ENTER_STATIC_NOERR + /* Terminate MPI if the driver initialized it */ + if (H5FD_mpi_self_initialized) { + int mpi_finalized = 0; + + MPI_Finalized(&mpi_finalized); + if (!mpi_finalized) + MPI_Finalize(); + + H5FD_mpi_self_initialized = FALSE; + } + /* Reset VFL ID */ H5FD_MPIO_g = 0; @@ -353,7 +402,7 @@ H5Pset_fapl_mpio(hid_t fapl_id, MPI_Comm comm, MPI_Info info) HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI info object") /* duplication is done during driver setting. */ - ret_value = H5P_set_driver(plist, H5FD_MPIO, NULL); + ret_value = H5P_set_driver(plist, H5FD_MPIO, NULL, NULL); done: FUNC_LEAVE_API(ret_value) @@ -808,11 +857,16 @@ H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t H5_ATTR if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS))) HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list") - /* Get the MPI communicator and info object from the property list */ - if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0) - HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI communicator") - if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0) - HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI info object") + if (H5FD_mpi_self_initialized) { + comm = MPI_COMM_WORLD; + } + else { + /* Get the MPI communicator and info object from the property list */ + if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI communicator") + if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI info object") + } /* Get the MPI rank of this process and the total number of processes */ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank))) @@ -862,14 +916,19 @@ H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t H5_ATTR file->mpi_size = mpi_size; /* Only processor p0 will get the filesize and broadcast it. */ - if (mpi_rank == 0) + if (mpi_rank == 0) { + /* If MPI_File_get_size fails, broadcast file size as -1 to signal error */ if (MPI_SUCCESS != (mpi_code = MPI_File_get_size(fh, &file_size))) - HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code) + file_size = (MPI_Offset)-1; + } /* Broadcast file size */ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&file_size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, comm))) HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code) + if (file_size < 0) + HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code) + /* Determine if the file should be truncated */ if (file_size && (flags & H5F_ACC_TRUNC)) { /* Truncate the file */ @@ -987,7 +1046,6 @@ H5FD__mpio_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out *flags |= H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */ *flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */ *flags |= H5FD_FEAT_HAS_MPI; /* This driver uses MPI */ - *flags |= H5FD_FEAT_ALLOCATE_EARLY; /* Allocate space early instead of late */ *flags |= H5FD_FEAT_DEFAULT_VFD_COMPATIBLE; /* VFD creates a file which can be opened with the default VFD */ } /* end if */ @@ -1153,7 +1211,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU MPI_Status mpi_stat; /* Status from I/O operation */ MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */ int size_i; /* Integer copy of 'size' to read */ -#if MPI_VERSION >= 3 +#if H5_CHECK_MPI_VERSION(3, 0) MPI_Count bytes_read = 0; /* Number of bytes read in */ MPI_Count type_size; /* MPI datatype used for I/O's size */ MPI_Count io_size; /* Actual number of bytes requested */ @@ -1165,6 +1223,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU int n; #endif hbool_t use_view_this_time = FALSE; + hbool_t derived_type = FALSE; hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */ #ifdef H5FDmpio_DEBUG hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file)); @@ -1192,8 +1251,6 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU if (H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off /*out*/) < 0) HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off") size_i = (int)size; - if ((hsize_t)size_i != size) - HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from size to size_i") /* Only look for MPI views for raw data transfers */ if (type == H5FD_MEM_DRAW) { @@ -1260,10 +1317,14 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU rank0_bcast = TRUE; /* Read on rank 0 Bcast to other ranks */ - if (file->mpi_rank == 0) + if (file->mpi_rank == 0) { + /* If MPI_File_read_at fails, push an error, but continue + * to participate in following MPI_Bcast */ if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat))) - HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code) + HMPI_DONE_ERROR(FAIL, "MPI_File_read_at failed", mpi_code) + } + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(buf, size_i, buf_type, 0, file->comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code) } /* end if */ @@ -1293,6 +1354,21 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) } /* end if */ else { + if (size != (hsize_t)size_i) { + /* If HERE, then we need to work around the integer size limit + * of 2GB. The input size_t size variable cannot fit into an integer, + * but we can get around that limitation by creating a different datatype + * and then setting the integer size (or element count) to 1 when using + * the derived_type. + */ + + if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + + derived_type = TRUE; + size_i = 1; + } + #ifdef H5FDmpio_DEBUG if (H5FD_mpio_debug_r_flag) HDfprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank); @@ -1306,12 +1382,22 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU /* Only retrieve bytes read if this rank _actually_ participated in I/O */ if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) { /* How many bytes were actually read? */ -#if MPI_VERSION >= 3 - if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read))) +#if H5_CHECK_MPI_VERSION(3, 0) + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read))) { #else - if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) { #endif - HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code) + if (rank0_bcast && file->mpi_rank == 0) { + /* If MPI_Get_elements(_x) fails for a rank 0 bcast strategy, + * push an error, but continue to participate in the following + * MPI_Bcast. + */ + bytes_read = -1; + HMPI_DONE_ERROR(FAIL, "MPI_Get_elements failed", mpi_code) + } + else + HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code) + } } /* end if */ /* If the rank0-bcast feature was used, broadcast the # of bytes read to @@ -1321,7 +1407,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU * of the data. (QAK - 2019/1/2) */ if (rank0_bcast) -#if MPI_VERSION >= 3 +#if H5_CHECK_MPI_VERSION(3, 0) if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_COUNT, 0, file->comm)) #else if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_INT, 0, file->comm)) @@ -1329,7 +1415,7 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0) /* Get the type's size */ -#if MPI_VERSION >= 3 +#if H5_CHECK_MPI_VERSION(3, 0) if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size))) #else if (MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size))) @@ -1345,8 +1431,8 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU #ifdef H5FDmpio_DEBUG if (H5FD_mpio_debug_r_flag) - HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_read = %lld\n", __func__, file->mpi_rank, - (long)mpi_off, bytes_read); + HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_read = %lld type = %s\n", __func__, file->mpi_rank, + (long)mpi_off, (long long)bytes_read, H5FD__mem_t_to_str(type)); #endif /* @@ -1356,6 +1442,9 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU HDmemset((char *)buf + bytes_read, 0, (size_t)n); done: + if (derived_type) + MPI_Type_free(&buf_type); + #ifdef H5FDmpio_DEBUG if (H5FD_mpio_debug_t_flag) HDfprintf(stderr, "%s: (%d) Leaving\n", __func__, file->mpi_rank); @@ -1393,7 +1482,7 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h MPI_Offset mpi_off; MPI_Status mpi_stat; /* Status from I/O operation */ MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */ -#if MPI_VERSION >= 3 +#if H5_CHECK_MPI_VERSION(3, 0) MPI_Count bytes_written; MPI_Count type_size; /* MPI datatype used for I/O's size */ MPI_Count io_size; /* Actual number of bytes requested */ @@ -1468,20 +1557,6 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h */ mpi_off = 0; } /* end if */ - else if (size != (hsize_t)size_i) { - /* If HERE, then we need to work around the integer size limit - * of 2GB. The input size_t size variable cannot fit into an integer, - * but we can get around that limitation by creating a different datatype - * and then setting the integer size (or element count) to 1 when using - * the derived_type. - */ - - if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0) - HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") - - derived_type = TRUE; - size_i = 1; - } /* Write the data. */ if (use_view_this_time) { @@ -1527,6 +1602,21 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) } /* end if */ else { + if (size != (hsize_t)size_i) { + /* If HERE, then we need to work around the integer size limit + * of 2GB. The input size_t size variable cannot fit into an integer, + * but we can get around that limitation by creating a different datatype + * and then setting the integer size (or element count) to 1 when using + * the derived_type. + */ + + if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + + derived_type = TRUE; + size_i = 1; + } + #ifdef H5FDmpio_DEBUG if (H5FD_mpio_debug_w_flag) HDfprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank); @@ -1538,7 +1628,7 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h } /* end else */ /* How many bytes were actually written? */ -#if MPI_VERSION >= 3 +#if H5_CHECK_MPI_VERSION(3, 0) if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_written))) #else if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_written))) @@ -1546,7 +1636,7 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code) /* Get the type's size */ -#if MPI_VERSION >= 3 +#if H5_CHECK_MPI_VERSION(3, 0) if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size))) #else if (MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size))) @@ -1562,8 +1652,8 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h #ifdef H5FDmpio_DEBUG if (H5FD_mpio_debug_w_flag) - HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_written = %lld\n", __func__, file->mpi_rank, - (long)mpi_off, bytes_written); + HDfprintf(stderr, "%s: (%d) mpi_off = %ld bytes_written = %lld type = %s\n", __func__, + file->mpi_rank, (long)mpi_off, (long long)bytes_written, H5FD__mem_t_to_str(type)); #endif /* Each process will keep track of its perceived EOF value locally, and @@ -1590,6 +1680,1050 @@ done: } /* end H5FD__mpio_write() */ /*------------------------------------------------------------------------- + * Function: H5FD__mpio_vector_build_types + * + * Purpose: Build MPI datatypes and calculate offset, base buffer, and + * size for MPIO vector I/O. Spun off from common code in + * H5FD__mpio_vector_read() and H5FD__mpio_vector_write(). + * + * Return: Success: SUCCEED. + * Failure: FAIL. + * + * Programmer: Neil Fortner + * March 14, 2022 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__mpio_vector_build_types(uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], + H5_flexible_const_ptr_t bufs[], haddr_t *s_addrs[], size_t *s_sizes[], + H5_flexible_const_ptr_t *s_bufs[], hbool_t *vector_was_sorted, + MPI_Offset *mpi_off, H5_flexible_const_ptr_t *mpi_bufs_base, int *size_i, + MPI_Datatype *buf_type, hbool_t *buf_type_created, MPI_Datatype *file_type, + hbool_t *file_type_created, char *unused) +{ + hsize_t bigio_count; /* Transition point to create derived type */ + hbool_t fixed_size = FALSE; + size_t size; + H5FD_mem_t * s_types = NULL; + int * mpi_block_lengths = NULL; + MPI_Aint mpi_bufs_base_Aint; + MPI_Aint * mpi_bufs = NULL; + MPI_Aint * mpi_displacements = NULL; + MPI_Datatype *sub_types = NULL; + uint8_t * sub_types_created = NULL; + int i; + int j; + int mpi_code; /* MPI return code */ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(s_sizes); + HDassert(s_bufs); + HDassert(vector_was_sorted); + HDassert(*vector_was_sorted); + HDassert(mpi_off); + HDassert(mpi_bufs_base); + HDassert(size_i); + HDassert(buf_type); + HDassert(buf_type_created); + HDassert(!*buf_type_created); + HDassert(file_type); + HDassert(file_type_created); + HDassert(!*file_type_created); + HDassert(unused); + + /* Get bio I/O transition point (may be lower than 2G for testing) */ + bigio_count = H5_mpi_get_bigio_count(); + + if (count == 1) { + /* Single block. Just use a series of MPI_BYTEs for the file view. + */ + *size_i = (int)sizes[0]; + *buf_type = MPI_BYTE; + *file_type = MPI_BYTE; + *mpi_bufs_base = bufs[0]; + + /* Setup s_addrs, s_sizes and s_bufs (needed for incomplete read filling code and eof + * calculation code) */ + *s_addrs = addrs; + *s_sizes = sizes; + *s_bufs = bufs; + + /* some numeric conversions */ + if (H5FD_mpi_haddr_to_MPIOff(addrs[0], mpi_off) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI offset") + + /* Check for size overflow */ + if (sizes[0] > bigio_count) { + /* We need to work around the integer size limit of 2GB. The input size_t size + * variable cannot fit into an integer, but we can get around that limitation by + * creating a different datatype and then setting the integer size (or element + * count) to 1 when using the derived_type. */ + + if (H5_mpio_create_large_type(sizes[0], 0, MPI_BYTE, buf_type) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + *buf_type_created = TRUE; + + if (H5_mpio_create_large_type(sizes[0], 0, MPI_BYTE, file_type) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + *file_type_created = TRUE; + + *size_i = 1; + } + } + else if (count > 0) { /* create MPI derived types describing the vector write */ + + /* sort the vector I/O request into increasing address order if required + * + * If the vector is already sorted, the base addresses of types, addrs, sizes, + * and bufs will be returned in s_types, s_addrs, s_sizes, and s_bufs respectively. + * + * If the vector was not already sorted, new, sorted versions of types, addrs, sizes, and bufs + * are allocated, populated, and returned in s_types, s_addrs, s_sizes, and s_bufs respectively. + * In this case, this function must free the memory allocated for the sorted vectors. + */ + if (H5FD_sort_vector_io_req(vector_was_sorted, count, types, addrs, sizes, bufs, &s_types, s_addrs, + s_sizes, s_bufs) < 0) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "can't sort vector I/O request") + + if ((NULL == (mpi_block_lengths = (int *)HDmalloc((size_t)count * sizeof(int)))) || + (NULL == (mpi_displacements = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint)))) || + (NULL == (mpi_bufs = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint))))) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc mpi block lengths / displacement") + } + + /* when we setup mpi_bufs[] below, all addresses are offsets from + * mpi_bufs_base. + * + * Since these offsets must all be positive, we must scan through + * s_bufs[] to find the smallest value, and choose that for + * mpi_bufs_base. + */ + + j = 0; /* guess at the index of the smallest value of s_bufs[] */ + + for (i = 1; i < (int)count; i++) { + + if ((*s_bufs)[i].cvp < (*s_bufs)[j].cvp) { + + j = i; + } + } + + *mpi_bufs_base = (*s_bufs)[j]; + + if (MPI_SUCCESS != (mpi_code = MPI_Get_address(mpi_bufs_base->cvp, &mpi_bufs_base_Aint))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] to mpi_bufs_base failed", mpi_code) + + *size_i = 1; + + fixed_size = FALSE; + + /* load the mpi_block_lengths and mpi_displacements arrays */ + for (i = 0; i < (int)count; i++) { + /* Determine size of this vector element */ + if (!fixed_size) { + if ((*s_sizes)[i] == 0) { + HDassert(vector_was_sorted); + fixed_size = TRUE; + size = sizes[i - 1]; + } + else { + size = (*s_sizes)[i]; + } + } + + /* Add to block lengths and displacements arrays */ + mpi_block_lengths[i] = (int)size; + mpi_displacements[i] = (MPI_Aint)(*s_addrs)[i]; + + /* convert s_bufs[i] to MPI_Aint... */ + if (MPI_SUCCESS != (mpi_code = MPI_Get_address((*s_bufs)[i].cvp, &(mpi_bufs[i])))) + HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] - mpi_bufs_base failed", mpi_code) + + /*... and then subtract mpi_bufs_base_Aint from it. */ +#if ((MPI_VERSION > 3) || ((MPI_VERSION == 3) && (MPI_SUBVERSION >= 1))) + mpi_bufs[i] = MPI_Aint_diff(mpi_bufs[i], mpi_bufs_base_Aint); +#else + mpi_bufs[i] = mpi_bufs[i] - mpi_bufs_base_Aint; +#endif + + /* Check for size overflow */ + if (size > bigio_count) { + /* We need to work around the integer size limit of 2GB. The input size_t size + * variable cannot fit into an integer, but we can get around that limitation by + * creating a different datatype and then setting the integer size (or element + * count) to 1 when using the derived_type. */ + + /* Allocate arrays to keep track of types and whether they were created, if + * necessary */ + if (!sub_types) { + HDassert(!sub_types_created); + + if (NULL == (sub_types = (int *)HDmalloc((size_t)count * sizeof(MPI_Datatype)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types array") + if (NULL == (sub_types_created = (uint8_t *)HDcalloc((size_t)count, 1))) { + H5MM_free(sub_types); + sub_types = NULL; + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types created array") + } + + /* Initialize sub_types to all MPI_BYTE */ + for (j = 0; j < (int)count; j++) + sub_types[j] = MPI_BYTE; + } + HDassert(sub_types_created); + + /* Create type for large block */ + if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &sub_types[i]) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + sub_types_created[i] = TRUE; + + /* Only one of these large types for this vector element */ + mpi_block_lengths[i] = 1; + } + else + HDassert(size == (size_t)mpi_block_lengths[i]); + } + + /* create the memory MPI derived types */ + if (sub_types) { + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths, mpi_bufs, + sub_types, buf_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for buf_type failed", mpi_code) + } + else if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, mpi_bufs, + MPI_BYTE, buf_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for buf_type failed", mpi_code) + + *buf_type_created = TRUE; + + if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(buf_type))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for buf_type failed", mpi_code) + + /* create the file MPI derived type */ + if (sub_types) { + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths, + mpi_displacements, sub_types, file_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for file_type failed", mpi_code) + } + else if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, + mpi_displacements, MPI_BYTE, file_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for file_type failed", mpi_code) + + *file_type_created = TRUE; + + if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(file_type))) + + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for file_type failed", mpi_code) + + /* Free up memory used to build types */ + HDassert(mpi_block_lengths); + HDfree(mpi_block_lengths); + mpi_block_lengths = NULL; + + HDassert(mpi_displacements); + HDfree(mpi_displacements); + mpi_displacements = NULL; + + HDassert(mpi_bufs); + HDfree(mpi_bufs); + mpi_bufs = NULL; + + if (sub_types) { + HDassert(sub_types); + + for (i = 0; i < (int)count; i++) + if (sub_types_created[i]) + MPI_Type_free(&sub_types[i]); + + HDfree(sub_types); + sub_types = NULL; + HDfree(sub_types_created); + sub_types_created = NULL; + } + + /* some numeric conversions */ + if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, mpi_off) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0") + } + else { + /* setup for null participation in the collective operation. */ + *buf_type = MPI_BYTE; + *file_type = MPI_BYTE; + + /* Set non-NULL pointer for I/O operation */ + mpi_bufs_base->vp = unused; + + /* MPI count to read */ + *size_i = 0; + + /* some numeric conversions */ + if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, mpi_off) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0") + } + +done: + /* free sorted vectors if they exist */ + if (!vector_was_sorted) + if (s_types) { + HDfree(s_types); + s_types = NULL; + } + + /* Clean up on error */ + if (ret_value < 0) { + if (mpi_block_lengths) { + HDfree(mpi_block_lengths); + mpi_block_lengths = NULL; + } + + if (mpi_displacements) { + HDfree(mpi_displacements); + mpi_displacements = NULL; + } + + if (mpi_bufs) { + HDfree(mpi_bufs); + mpi_bufs = NULL; + } + + if (sub_types) { + HDassert(sub_types_created); + + for (i = 0; i < (int)count; i++) + if (sub_types_created[i]) + MPI_Type_free(&sub_types[i]); + + HDfree(sub_types); + sub_types = NULL; + HDfree(sub_types_created); + sub_types_created = NULL; + } + } + + /* Make sure we cleaned up */ + HDassert(!mpi_block_lengths); + HDassert(!mpi_displacements); + HDassert(!mpi_bufs); + HDassert(!sub_types); + HDassert(!sub_types_created); + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__mpio_vector_build_types() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__mpio_read_vector() + * + * Purpose: The behaviour of this function dependes on the value of + * the io_xfer_mode obtained from the context. + * + * If it is H5FD_MPIO_COLLECTIVE, this is a collective + * operation, which allows us to use MPI_File_set_view, and + * then perform the entire vector read in a single MPI call. + * + * Do this (if count is positive), by constructing memory + * and file derived types from the supplied vector, using + * file type to set the file view, and then reading the + * the memory type from file. Note that this read is + * either independent or collective depending on the + * value of mpio_coll_opt -- again obtained from the context. + * + * If count is zero, participate in the collective read + * (if so configured) with an empty read. + * + * Finally, set the file view back to its default state. + * + * In contrast, if io_xfer_mode is H5FD_MPIO_INDEPENDENT, + * this call is independent, and thus we cannot use + * MPI_File_set_view(). + * + * In this case, simply walk the vector, and issue an + * independent read for each entry. + * + * Return: Success: SUCCEED. + * Failure: FAIL. + * + * Programmer: John Mainzer + * March 15, 2021 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[], + haddr_t addrs[], size_t sizes[], void *bufs[]) +{ + H5FD_mpio_t * file = (H5FD_mpio_t *)_file; + hbool_t vector_was_sorted = TRUE; + haddr_t * s_addrs = NULL; + size_t * s_sizes = NULL; + void ** s_bufs = NULL; + char unused = 0; /* Unused, except for non-NULL pointer value */ + void * mpi_bufs_base = NULL; + MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */ + hbool_t buf_type_created = FALSE; + MPI_Datatype file_type = MPI_BYTE; /* MPI description of the selection in file */ + hbool_t file_type_created = FALSE; + int i; + int mpi_code; /* MPI return code */ + MPI_Offset mpi_off = 0; + MPI_Status mpi_stat; /* Status from I/O operation */ + H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */ + H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */ + int size_i; +#if MPI_VERSION >= 3 + MPI_Count bytes_read = 0; /* Number of bytes read in */ + MPI_Count type_size; /* MPI datatype used for I/O's size */ + MPI_Count io_size; /* Actual number of bytes requested */ + MPI_Count n; +#else + int bytes_read = 0; /* Number of bytes read in */ + int type_size; /* MPI datatype used for I/O's size */ + int io_size; /* Actual number of bytes requested */ + int n; +#endif + hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */ +#ifdef H5FDmpio_DEBUG + hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file)); + hbool_t H5FD_mpio_debug_r_flag = (H5FD_mpio_debug_flags_s[(int)'r'] && H5FD_MPIO_TRACE_THIS_RANK(file)); +#endif + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_t_flag) + HDfprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank); +#endif + + /* Sanity checks */ + HDassert(file); + HDassert(H5FD_MPIO == file->pub.driver_id); + HDassert((types) || (count == 0)); + HDassert((addrs) || (count == 0)); + HDassert((sizes) || (count == 0)); + HDassert((bufs) || (count == 0)); + + /* verify that the first elements of the sizes and types arrays are + * valid. + */ + HDassert((count == 0) || (sizes[0] != 0)); + HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST)); + + /* Get the transfer mode from the API context + * + * This flag is set to H5FD_MPIO_COLLECTIVE if the API call is + * collective, and to H5FD_MPIO_INDEPENDENT if it is not. + * + * While this doesn't mean that we are actually about to do a collective + * read, it does mean that all ranks are here, so we can use MPI_File_set_view(). + */ + if (H5CX_get_io_xfer_mode(&xfer_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode") + + if (xfer_mode == H5FD_MPIO_COLLECTIVE) { + /* Build MPI types, etc. */ + if (H5FD__mpio_vector_build_types(count, types, addrs, sizes, (H5_flexible_const_ptr_t *)bufs, + &s_addrs, &s_sizes, (H5_flexible_const_ptr_t **)&s_bufs, + &vector_was_sorted, &mpi_off, + (H5_flexible_const_ptr_t *)&mpi_bufs_base, &size_i, &buf_type, + &buf_type_created, &file_type, &file_type_created, &unused) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't build MPI datatypes for I/O") + + /* free sorted addrs vector if it exists */ + if (!vector_was_sorted) + if (s_addrs) { + HDfree(s_addrs); + s_addrs = NULL; + } + + /* Portably initialize MPI status variable */ + HDmemset(&mpi_stat, 0, sizeof(mpi_stat)); + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_r_flag) + HDfprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", __func__, (long)mpi_off, size_i); +#endif + + /* Setup the file view. */ + if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, + H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + + /* Reset mpi_off to 0 since the view now starts at the data offset */ + if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0") + + /* Get the collective_opt property to check whether the application wants to do IO individually. + */ + if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0) + + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property") + + /* Read the data. */ +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_r_flag) + HDfprintf(stdout, "%s: using MPIO collective mode\n", __func__); +#endif + if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_r_flag) + HDfprintf(stdout, "%s: doing MPI collective IO\n", __func__); +#endif + /* Check whether we should read from rank 0 and broadcast to other ranks */ + if (H5CX_get_mpio_rank0_bcast()) { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_r_flag) + HDfprintf(stdout, "%s: doing read-rank0-and-MPI_Bcast\n", __func__); +#endif + /* Indicate path we've taken */ + rank0_bcast = TRUE; + + /* Read on rank 0 Bcast to other ranks */ + if (file->mpi_rank == 0) + if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i, + buf_type, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code) + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(mpi_bufs_base, size_i, buf_type, 0, file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code) + } /* end if */ + else if (MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, mpi_bufs_base, size_i, + buf_type, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code) + } /* end if */ + else if (size_i > 0) { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_r_flag) + HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__); +#endif + + if (MPI_SUCCESS != + (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat))) + + HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code) + + } /* end else */ + + /* Reset the file view */ + if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, + H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + + /* Only retrieve bytes read if this rank _actually_ participated in I/O */ + if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) { + /* How many bytes were actually read? */ +#if MPI_VERSION >= 3 + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read))) +#else + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) +#endif + HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code) + } /* end if */ + + /* If the rank0-bcast feature was used, broadcast the # of bytes read to + * other ranks, which didn't perform any I/O. + */ + /* NOTE: This could be optimized further to be combined with the broadcast + * of the data. (QAK - 2019/1/2) + * Or have rank 0 clear the unread parts of the buffer prior to + * the bcast. (NAF - 2021/9/15) + */ + if (rank0_bcast) +#if MPI_VERSION >= 3 + if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_COUNT, 0, file->comm)) +#else + if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_INT, 0, file->comm)) +#endif + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0) + + /* Get the type's size */ +#if MPI_VERSION >= 3 + if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size))) +#else + if (MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size))) +#endif + HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code) + + /* Compute the actual number of bytes requested */ + io_size = type_size * size_i; + + /* Check for read failure */ + if (bytes_read < 0 || bytes_read > io_size) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed") + + /* Check for incomplete read */ + n = io_size - bytes_read; + if (n > 0) { + i = (int)count - 1; + + /* Iterate over sorted array in reverse, filling in zeroes to + * sections of the buffers that were not read to */ + do { + HDassert(i >= 0); + +#if MPI_VERSION >= 3 + io_size = MIN(n, (MPI_Count)s_sizes[i]); + bytes_read = (MPI_Count)s_sizes[i] - io_size; +#else + io_size = MIN(n, (int)s_sizes[i]); + bytes_read = (int)s_sizes[i] - io_size; +#endif + HDassert(bytes_read >= 0); + + HDmemset((char *)s_bufs[i] + bytes_read, 0, (size_t)io_size); + + n -= io_size; + i--; + } while (n > 0); + } + } + else if (count > 0) { + haddr_t max_addr = HADDR_MAX; + hbool_t fixed_size = FALSE; + size_t size; + + /* The read is part of an independent operation. As a result, + * we can't use MPI_File_set_view() (since it it a collective operation), + * and thus we can't use the above code to construct the MPI datatypes. + * In the future, we could write code to detect when a contiguous slab + * in the file selection spans multiple vector elements and construct a + * memory datatype to match this larger block in the file, but for now + * just read in each element of the vector in a separate + * MPI_File_read_at() call. + * + * We could also just detect the case when the entire file selection is + * contiguous, which would allow us to use + * H5FD__mpio_vector_build_types() to construct the memory datatype. + */ + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_r_flag) + HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__); +#endif + + /* Loop over vector elements */ + for (i = 0; i < (int)count; i++) { + /* Convert address to mpi offset */ + if (H5FD_mpi_haddr_to_MPIOff(addrs[i], &mpi_off) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off") + + /* Calculate I/O size */ + if (!fixed_size) { + if (sizes[i] == 0) { + fixed_size = TRUE; + size = sizes[i - 1]; + } + else { + size = sizes[i]; + } + } + size_i = (int)size; + + if (size != (size_t)size_i) { + /* If HERE, then we need to work around the integer size limit + * of 2GB. The input size_t size variable cannot fit into an integer, + * but we can get around that limitation by creating a different datatype + * and then setting the integer size (or element count) to 1 when using + * the derived_type. + */ + + if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + + buf_type_created = TRUE; + size_i = 1; + } + + /* Check if we actually need to do I/O */ + if (addrs[i] < max_addr) { + /* Portably initialize MPI status variable */ + HDmemset(&mpi_stat, 0, sizeof(mpi_stat)); + + /* Issue read */ + if (MPI_SUCCESS != + (mpi_code = MPI_File_read_at(file->f, mpi_off, bufs[i], size_i, buf_type, &mpi_stat))) + + HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code) + + /* How many bytes were actually read? */ +#if MPI_VERSION >= 3 + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, MPI_BYTE, &bytes_read))) +#else + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) +#endif + HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code) + + /* Compute the actual number of bytes requested */ +#if MPI_VERSION >= 3 + io_size = (MPI_Count)size; +#else + io_size = (int)size; +#endif + + /* Check for read failure */ + if (bytes_read < 0 || bytes_read > io_size) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed") + + /* + * If we didn't read the entire I/O, fill in zeroes beyond end of + * the physical MPI file and don't issue any more reads at higher + * addresses. + */ + if ((n = (io_size - bytes_read)) > 0) { + HDmemset((char *)bufs[i] + bytes_read, 0, (size_t)n); + max_addr = addrs[i] + (haddr_t)bytes_read; + } + } + else { + /* Read is past the max address, fill in zeroes */ + HDmemset((char *)bufs[i], 0, size); + } + } + } + +done: + if (buf_type_created) { + MPI_Type_free(&buf_type); + } + + if (file_type_created) { + MPI_Type_free(&file_type); + } + + /* free sorted vectors if they exist */ + if (!vector_was_sorted) { + if (s_addrs) { + HDfree(s_addrs); + s_addrs = NULL; + } + if (s_sizes) { + HDfree(s_sizes); + s_sizes = NULL; + } + if (s_bufs) { + HDfree(s_bufs); + s_bufs = NULL; + } + } + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_t_flag) + HDfprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", __func__, file->mpi_rank, ret_value); +#endif + + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FD__mpio_read_vector() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__mpio_write_vector + * + * Purpose: The behaviour of this function dependes on the value of + * the io_xfer_mode obtained from the context. + * + * If it is H5FD_MPIO_COLLECTIVE, this is a collective + * operation, which allows us to use MPI_File_set_view, and + * then perform the entire vector write in a single MPI call. + * + * Do this (if count is positive), by constructing memory + * and file derived types from the supplied vector, using + * file type to set the file view, and then writing the + * the memory type to file. Note that this write is + * either independent or collective depending on the + * value of mpio_coll_opt -- again obtained from the context. + * + * If count is zero, participate in the collective write + * (if so configured) with an empty write. + * + * Finally, set the file view back to its default state. + * + * In contrast, if io_xfer_mode is H5FD_MPIO_INDEPENDENT, + * this call is independent, and thus we cannot use + * MPI_File_set_view(). + * + * In this case, simply walk the vector, and issue an + * independent write for each entry. + * + * Return: Success: SUCCEED. + * Failure: FAIL. + * + * Programmer: John Mainzer + * March 15, 2021 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[], + haddr_t addrs[], size_t sizes[], const void *bufs[]) +{ + H5FD_mpio_t * file = (H5FD_mpio_t *)_file; + hbool_t vector_was_sorted = TRUE; + haddr_t * s_addrs = NULL; + size_t * s_sizes = NULL; + const void ** s_bufs = NULL; + char unused = 0; /* Unused, except for non-NULL pointer value */ + const void * mpi_bufs_base = NULL; + MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */ + hbool_t buf_type_created = FALSE; + MPI_Datatype file_type = MPI_BYTE; /* MPI description of the selection in file */ + hbool_t file_type_created = FALSE; + int i; + int mpi_code; /* MPI return code */ + MPI_Offset mpi_off = 0; + MPI_Status mpi_stat; /* Status from I/O operation */ + H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */ + H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */ + int size_i; +#ifdef H5FDmpio_DEBUG + hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file)); + hbool_t H5FD_mpio_debug_w_flag = (H5FD_mpio_debug_flags_s[(int)'w'] && H5FD_MPIO_TRACE_THIS_RANK(file)); +#endif + haddr_t max_addr = 0; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_t_flag) + HDfprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank); +#endif + + /* Sanity checks */ + HDassert(file); + HDassert(H5FD_MPIO == file->pub.driver_id); + HDassert((types) || (count == 0)); + HDassert((addrs) || (count == 0)); + HDassert((sizes) || (count == 0)); + HDassert((bufs) || (count == 0)); + + /* verify that the first elements of the sizes and types arrays are + * valid. + */ + HDassert((count == 0) || (sizes[0] != 0)); + HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST)); + + /* Verify that no data is written when between MPI_Barrier()s during file flush */ + + HDassert(!H5CX_get_mpi_file_flushing()); + + /* Get the transfer mode from the API context + * + * This flag is set to H5FD_MPIO_COLLECTIVE if the API call is + * collective, and to H5FD_MPIO_INDEPENDENT if it is not. + * + * While this doesn't mean that we are actually about to do a collective + * write, it does mean that all ranks are here, so we can use MPI_File_set_view(). + */ + if (H5CX_get_io_xfer_mode(&xfer_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode") + + if (xfer_mode == H5FD_MPIO_COLLECTIVE) { + /* Build MPI types, etc. */ + if (H5FD__mpio_vector_build_types(count, types, addrs, sizes, (H5_flexible_const_ptr_t *)bufs, + &s_addrs, &s_sizes, (H5_flexible_const_ptr_t **)&s_bufs, + &vector_was_sorted, &mpi_off, + (H5_flexible_const_ptr_t *)&mpi_bufs_base, &size_i, &buf_type, + &buf_type_created, &file_type, &file_type_created, &unused) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't build MPI datatypes for I/O") + + /* Compute max addr writted to */ + if (count > 0) + max_addr = s_addrs[count - 1] + (haddr_t)(s_sizes[count - 1]); + + /* free sorted vectors if they exist */ + if (!vector_was_sorted) { + if (s_addrs) { + HDfree(s_addrs); + s_addrs = NULL; + } + if (s_sizes) { + HDfree(s_sizes); + s_sizes = NULL; + } + if (s_bufs) { + HDfree(s_bufs); + s_bufs = NULL; + } + } + + /* Portably initialize MPI status variable */ + HDmemset(&mpi_stat, 0, sizeof(MPI_Status)); + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_w_flag) + HDfprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", __func__, (long)mpi_off, size_i); +#endif + + /* Setup the file view. */ + if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, + H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + + /* Reset mpi_off to 0 since the view now starts at the data offset */ + if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0") + + /* Get the collective_opt property to check whether the application wants to do IO individually. + */ + if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property") + + /* Write the data. */ +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_w_flag) + HDfprintf(stdout, "%s: using MPIO collective mode\n", __func__); +#endif + + if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_w_flag) + HDfprintf(stdout, "%s: doing MPI collective IO\n", __func__); +#endif + + if (MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, mpi_bufs_base, size_i, + buf_type, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code) + } /* end if */ + else if (size_i > 0) { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_w_flag) + HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__); +#endif + + if (MPI_SUCCESS != + (mpi_code = MPI_File_write_at(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code) + } /* end else */ + + /* Reset the file view */ + if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, + H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + } + else if (count > 0) { + hbool_t fixed_size = FALSE; + size_t size; + + /* The read is part of an independent operation. As a result, + * we can't use MPI_File_set_view() (since it it a collective operation), + * and thus we can't use the above code to construct the MPI datatypes. + * In the future, we could write code to detect when a contiguous slab + * in the file selection spans multiple vector elements and construct a + * memory datatype to match this larger block in the file, but for now + * just read in each element of the vector in a separate + * MPI_File_read_at() call. + * + * We could also just detect the case when the entire file selection is + * contiguous, which would allow us to use + * H5FD__mpio_vector_build_types() to construct the memory datatype. + */ + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_w_flag) + HDfprintf(stdout, "%s: doing MPI independent IO\n", __func__); +#endif + + /* Loop over vector elements */ + for (i = 0; i < (int)count; i++) { + /* Convert address to mpi offset */ + if (H5FD_mpi_haddr_to_MPIOff(addrs[i], &mpi_off) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off") + + /* Calculate I/O size */ + if (!fixed_size) { + if (sizes[i] == 0) { + fixed_size = TRUE; + size = sizes[i - 1]; + } + else { + size = sizes[i]; + } + } + size_i = (int)size; + + if (size != (size_t)size_i) { + /* If HERE, then we need to work around the integer size limit + * of 2GB. The input size_t size variable cannot fit into an integer, + * but we can get around that limitation by creating a different datatype + * and then setting the integer size (or element count) to 1 when using + * the derived_type. + */ + + if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + + buf_type_created = TRUE; + size_i = 1; + } + + /* Perform write */ + if (MPI_SUCCESS != + (mpi_code = MPI_File_write_at(file->f, mpi_off, bufs[i], size_i, buf_type, &mpi_stat))) + + HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code) + + /* Check if this is the highest address written to so far */ + if (addrs[i] + size > max_addr) + max_addr = addrs[i] + size; + } + } + + /* Each process will keep track of its perceived EOF value locally, and + * ultimately we will reduce this value to the maximum amongst all + * processes, but until then keep the actual eof at HADDR_UNDEF just in + * case something bad happens before that point. (rather have a value + * we know is wrong sitting around rather than one that could only + * potentially be wrong.) + */ + file->eof = HADDR_UNDEF; + + /* check to see if the local eof has changed been extended, and update if so */ + if (max_addr > file->local_eof) + file->local_eof = max_addr; + +done: + if (buf_type_created) + MPI_Type_free(&buf_type); + + if (file_type_created) + MPI_Type_free(&file_type); + + /* Cleanup on error */ + if (ret_value < 0 && !vector_was_sorted) { + if (s_addrs) { + HDfree(s_addrs); + s_addrs = NULL; + } + if (s_sizes) { + HDfree(s_sizes); + s_sizes = NULL; + } + if (s_bufs) { + HDfree(s_bufs); + s_bufs = NULL; + } + } + + /* Make sure we cleaned up */ + HDassert(vector_was_sorted || !s_addrs); + HDassert(vector_was_sorted || !s_sizes); + HDassert(vector_was_sorted || !s_bufs); + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_t_flag) + HDfprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", __func__, file->mpi_rank, ret_value); +#endif + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__mpio_write_vector() */ + +/*------------------------------------------------------------------------- * Function: H5FD__mpio_flush * * Purpose: Makes sure that all data is on disk. This is collective. @@ -1701,17 +2835,19 @@ H5FD__mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) /* Only processor p0 will get the filesize and broadcast it. */ - /* (Note that throwing an error here will cause non-rank 0 processes - * to hang in following Bcast. -QAK, 3/17/2018) - */ - if (0 == file->mpi_rank) + if (0 == file->mpi_rank) { + /* If MPI_File_get_size fails, broadcast file size as -1 to signal error */ if (MPI_SUCCESS != (mpi_code = MPI_File_get_size(file->f, &size))) - HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code) + size = (MPI_Offset)-1; + } /* Broadcast file size */ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, file->comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code) + if (size < 0) + HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code) + if (H5FD_mpi_haddr_to_MPIOff(file->eoa, &needed_eof) < 0) HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset") @@ -1774,11 +2910,16 @@ H5FD__mpio_delete(const char *filename, hid_t fapl_id) HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list") HDassert(H5FD_MPIO == H5P_peek_driver(plist)); - /* Get the MPI communicator and info from the fapl */ - if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0) - HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI info object") - if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0) - HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI communicator") + if (H5FD_mpi_self_initialized) { + comm = MPI_COMM_WORLD; + } + else { + /* Get the MPI communicator and info from the fapl */ + if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI info object") + if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI communicator") + } /* Get the MPI rank of this process */ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank))) @@ -1789,96 +2930,101 @@ H5FD__mpio_delete(const char *filename, hid_t fapl_id) HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) /* Delete the file */ - if (mpi_rank == 0) + if (mpi_rank == 0) { + /* If MPI_File_delete fails, push an error but + * still participate in the following MPI_Barrier + */ if (MPI_SUCCESS != (mpi_code = MPI_File_delete(filename, info))) - HMPI_GOTO_ERROR(FAIL, "MPI_File_delete failed", mpi_code) + HMPI_DONE_ERROR(FAIL, "MPI_File_delete failed", mpi_code) + } /* Set up a barrier (don't want processes to run ahead of the delete) */ if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) done: + /* Free duplicated MPI Communicator and Info objects */ + if (H5_mpi_comm_free(&comm) < 0) + HDONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI communicator") + if (H5_mpi_info_free(&info) < 0) + HDONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI info object") + FUNC_LEAVE_NOAPI(ret_value) } /* end H5FD__mpio_delete() */ /*------------------------------------------------------------------------- - * Function: H5FD__mpio_mpi_rank + * Function: H5FD__mpio_ctl * - * Purpose: Returns the MPI rank for a process + * Purpose: MPIO version of the ctl callback. * - * Return: Success: non-negative - * Failure: negative + * The desired operation is specified by the op_code + * parameter. * - * Programmer: Quincey Koziol - * Thursday, May 16, 2002 + * The flags parameter controls management of op_codes that + * are unknown to the callback * - *------------------------------------------------------------------------- - */ -static int -H5FD__mpio_mpi_rank(const H5FD_t *_file) -{ - const H5FD_mpio_t *file = (const H5FD_mpio_t *)_file; - - FUNC_ENTER_STATIC_NOERR - - /* Sanity checks */ - HDassert(file); - HDassert(H5FD_MPIO == file->pub.driver_id); - - FUNC_LEAVE_NOAPI(file->mpi_rank) -} /* end H5FD__mpio_mpi_rank() */ - -/*------------------------------------------------------------------------- - * Function: H5FD__mpio_mpi_size + * The input and output parameters allow op_code specific + * input and output * - * Purpose: Returns the number of MPI processes + * At present, the supported op codes are: * - * Return: Success: non-negative - * Failure: negative + * H5FD_CTL__GET_MPI_COMMUNICATOR_OPCODE + * H5FD_CTL__GET_MPI_RANK_OPCODE + * H5FD_CTL__GET_MPI_SIZE_OPCODE * - * Programmer: Quincey Koziol - * Thursday, May 16, 2002 + * Note that these opcodes must be supported by all VFDs that + * support MPI. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: JRM -- 8/3/21 * *------------------------------------------------------------------------- */ -static int -H5FD__mpio_mpi_size(const H5FD_t *_file) +static herr_t +H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void H5_ATTR_UNUSED *input, + void **output) { - const H5FD_mpio_t *file = (const H5FD_mpio_t *)_file; + H5FD_mpio_t *file = (H5FD_mpio_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ - FUNC_ENTER_STATIC_NOERR + FUNC_ENTER_NOAPI(FAIL) /* Sanity checks */ HDassert(file); HDassert(H5FD_MPIO == file->pub.driver_id); - FUNC_LEAVE_NOAPI(file->mpi_size) -} /* end H5FD__mpio_mpi_size() */ + switch (op_code) { -/*------------------------------------------------------------------------- - * Function: H5FD__mpio_communicator - * - * Purpose: Returns the MPI communicator for the file. - * - * Return: Success: The communicator - * Failure: Can't fail - * - * Programmer: Robb Matzke - * Monday, August 9, 1999 - * - *------------------------------------------------------------------------- - */ -static MPI_Comm -H5FD__mpio_communicator(const H5FD_t *_file) -{ - const H5FD_mpio_t *file = (const H5FD_mpio_t *)_file; + case H5FD_CTL__GET_MPI_COMMUNICATOR_OPCODE: + HDassert(output); + HDassert(*output); + **((MPI_Comm **)output) = file->comm; + break; - FUNC_ENTER_STATIC_NOERR + case H5FD_CTL__GET_MPI_RANK_OPCODE: + HDassert(output); + HDassert(*output); + **((int **)output) = file->mpi_rank; + break; - /* Sanity checks */ - HDassert(file); - HDassert(H5FD_MPIO == file->pub.driver_id); + case H5FD_CTL__GET_MPI_SIZE_OPCODE: + HDassert(output); + HDassert(*output); + **((int **)output) = file->mpi_size; + break; + + default: /* unknown op code */ + if (flags & H5FD_CTL__FAIL_IF_UNKNOWN_FLAG) { + + HGOTO_ERROR(H5E_VFL, H5E_FCNTL, FAIL, "unknown op_code and fail if unknown") + } + break; + } + +done: + + FUNC_LEAVE_NOAPI(ret_value) - FUNC_LEAVE_NOAPI(file->comm) -} /* end H5FD__mpio_communicator() */ +} /* end H5FD__mpio_ctl() */ #endif /* H5_HAVE_PARALLEL */ |