Diffstat (limited to 'src/H5mpi.c')
| -rw-r--r-- | src/H5mpi.c | 838 |
1 file changed, 838 insertions, 0 deletions
diff --git a/src/H5mpi.c b/src/H5mpi.c new file mode 100644 index 0000000..31ad036 --- /dev/null +++ b/src/H5mpi.c @@ -0,0 +1,838 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://www.hdfgroup.org/licenses. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* + * Purpose: Common MPI routines + * + */ + +#include "H5private.h" /* Generic Functions */ +#include "H5Eprivate.h" /* Error handling */ +#include "H5MMprivate.h" /* Memory Management */ + +#ifdef H5_HAVE_PARALLEL + +/****************/ +/* Local Macros */ +/****************/ +#define TWO_GIG_LIMIT INT32_MAX +#ifndef H5_MAX_MPI_COUNT +#define H5_MAX_MPI_COUNT (1 << 30) +#endif + +/*******************/ +/* Local Variables */ +/*******************/ +static hsize_t bigio_count_g = H5_MAX_MPI_COUNT; + +/*------------------------------------------------------------------------- + * Function: H5_mpi_set_bigio_count + * + * Purpose: Allow us to programmatically change the switch point + * when we utilize derived datatypes. This is of + * particular interest for allowing nightly testing + * + * Return: The current/previous value of bigio_count_g. + * + * Programmer: Richard Warren, March 10, 2017 + * + *------------------------------------------------------------------------- + */ +hsize_t +H5_mpi_set_bigio_count(hsize_t new_count) +{ + hsize_t orig_count = bigio_count_g; + + if ((new_count > 0) && (new_count < (hsize_t)TWO_GIG_LIMIT)) { + bigio_count_g = new_count; + } + return orig_count; +} /* end H5_mpi_set_bigio_count() */ + +/*------------------------------------------------------------------------- + * Function: H5_mpi_get_bigio_count + * + * Purpose: Allow other HDF5 library functions to access + * the current value for bigio_count_g. + * + * Return: The current/previous value of bigio_count_g. + * + * Programmer: Richard Warren, October 7, 2019 + * + *------------------------------------------------------------------------- + */ +hsize_t +H5_mpi_get_bigio_count(void) +{ + return bigio_count_g; +} + +/*------------------------------------------------------------------------- + * Function: H5_mpi_comm_dup + * + * Purpose: Duplicate an MPI communicator. + * + * Does not duplicate MPI_COMM_NULL. Instead, comm_new will + * be set to MPI_COMM_NULL directly. + * + * The new communicator is returned via the comm_new pointer. 
+ * + * Return: SUCCEED/FAIL + * + *------------------------------------------------------------------------- + */ +herr_t +H5_mpi_comm_dup(MPI_Comm comm, MPI_Comm *comm_new) +{ + herr_t ret_value = SUCCEED; + MPI_Comm comm_dup = MPI_COMM_NULL; + int mpi_code; + + FUNC_ENTER_NOAPI(FAIL) + + /* Check arguments */ + if (!comm_new) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADVALUE, FAIL, "comm_new cannot be NULL") + + /* Handle MPI_COMM_NULL separately */ + if (MPI_COMM_NULL == comm) { + /* Don't duplicate MPI_COMM_NULL since that's an error in MPI */ + comm_dup = MPI_COMM_NULL; + } + else { + + /* Duplicate the MPI communicator */ + if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(comm, &comm_dup))) + HMPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code) + + /* Set MPI_ERRORS_RETURN on comm_dup so that MPI failures are not fatal, + * and return codes can be checked and handled. + */ + if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(comm_dup, MPI_ERRORS_RETURN))) + HMPI_GOTO_ERROR(FAIL, "MPI_Errhandler_set failed", mpi_code) + } + + /* Copy the new communicator to the return argument */ + *comm_new = comm_dup; + +done: + if (FAIL == ret_value) { + /* need to free anything created here */ + if (MPI_COMM_NULL != comm_dup) + MPI_Comm_free(&comm_dup); + } + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5_mpi_comm_dup() */ + +/*------------------------------------------------------------------------- + * Function: H5_mpi_info_dup + * + * Purpose: Duplicate an MPI info. + * + * If the info object is MPI_INFO_NULL, no duplicate + * is made but the same value assigned to the new info object + * handle. + * + * The new info is returned via the info_new pointer. + * + * Return: SUCCEED/FAIL + * + *------------------------------------------------------------------------- + */ +herr_t +H5_mpi_info_dup(MPI_Info info, MPI_Info *info_new) +{ + herr_t ret_value = SUCCEED; + MPI_Info info_dup = MPI_INFO_NULL; + int mpi_code; + + FUNC_ENTER_NOAPI(FAIL) + + /* Check arguments */ + if (!info_new) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADVALUE, FAIL, "info_new cannot be NULL") + + /* Duplicate the MPI info */ + if (info == MPI_INFO_NULL) { + /* Don't duplicate MPI_INFO_NULL. Just copy it. */ + info_dup = MPI_INFO_NULL; + } + else { + /* Duplicate the info */ + if (MPI_SUCCESS != (mpi_code = MPI_Info_dup(info, &info_dup))) + HMPI_GOTO_ERROR(FAIL, "MPI_Info_dup failed", mpi_code) + } + + /* Copy the new info to the return argument */ + *info_new = info_dup; + +done: + if (FAIL == ret_value) { + /* need to free anything created here */ + if (MPI_INFO_NULL != info_dup) + MPI_Info_free(&info_dup); + } + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5_mpi_info_dup() */ + +/*------------------------------------------------------------------------- + * Function: H5_mpi_comm_free + * + * Purpose: Free an MPI communicator. + * + * If comm is MPI_COMM_NULL this call does nothing. 
+ * + * Return: SUCCEED/FAIL + * + *------------------------------------------------------------------------- + */ +herr_t +H5_mpi_comm_free(MPI_Comm *comm) +{ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI(FAIL) + + /* Check arguments */ + if (!comm) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADVALUE, FAIL, "comm pointer cannot be NULL") + + /* Free the communicator */ + if (MPI_COMM_WORLD != *comm && MPI_COMM_NULL != *comm) + MPI_Comm_free(comm); + + *comm = MPI_COMM_NULL; + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* End H5_mpi_comm_free() */ + +/*------------------------------------------------------------------------- + * Function: H5_mpi_info_free + * + * Purpose: Free the MPI info. + * + * If info is MPI_INFO_NULL this call does nothing. + * + * Return: SUCCEED/FAIL + * + *------------------------------------------------------------------------- + */ +herr_t +H5_mpi_info_free(MPI_Info *info) +{ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI(FAIL) + + /* Check arguments */ + if (!info) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADVALUE, FAIL, "info pointer cannot be NULL") + + /* Free the info */ + if (MPI_INFO_NULL != *info) + MPI_Info_free(info); + + *info = MPI_INFO_NULL; + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* End H5_mpi_info_free() */ + +/*------------------------------------------------------------------------- + * Function: H5_mpi_comm_cmp + * + * Purpose: Compares two MPI communicators. + * + * Note that passing MPI_COMM_NULL to this function will not + * throw errors, unlike MPI_Comm_compare(). + * + * We consider MPI communicators to be the "same" when the + * groups are identical. We don't care about the context + * since that will always be different as we call MPI_Comm_dup + * when we store the communicator in the fapl. + * + * The out parameter is a value like strcmp. The value is + * undefined when the return value is FAIL. + * + * Return: SUCCEED/FAIL + * + *------------------------------------------------------------------------- + */ +herr_t +H5_mpi_comm_cmp(MPI_Comm comm1, MPI_Comm comm2, int *result) +{ + int mpi_code; + int mpi_result = MPI_IDENT; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI(FAIL) + + /* Check arguments */ + if (!result) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADVALUE, FAIL, "result cannot be NULL") + + /* Set out parameter to something reasonable in case something goes wrong */ + *result = 0; + + /* Can't pass MPI_COMM_NULL to MPI_Comm_compare() so we have to handle + * it in special cases. + * + * MPI_Comm can either be an integer type or a pointer. We cast them + * to intptr_t so we can compare them with < and > when needed. + */ + if (MPI_COMM_NULL == comm1 && MPI_COMM_NULL == comm2) { + /* Special case of both communicators being MPI_COMM_NULL */ + *result = 0; + } + else if (MPI_COMM_NULL == comm1 || MPI_COMM_NULL == comm2) { + + /* Special case of one communicator being MPI_COMM_NULL */ + *result = (intptr_t)comm1 < (intptr_t)comm2 ? -1 : 1; + } + else { + + /* Normal communicator compare */ + + /* Compare the MPI communicators */ + if (MPI_SUCCESS != (mpi_code = MPI_Comm_compare(comm1, comm2, &mpi_result))) + HMPI_GOTO_ERROR(FAIL, "MPI_Comm_compare failed", mpi_code) + + /* Set the result */ + if (MPI_IDENT == mpi_result || MPI_CONGRUENT == mpi_result) + *result = 0; + else + *result = (intptr_t)comm1 < (intptr_t)comm2 ? 
-1 : 1; + } + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5_mpi_comm_cmp() */ + +/*------------------------------------------------------------------------- + * Function: H5_mpi_info_cmp + * + * Purpose: Compares two MPI info objects. + * + * For our purposes, two mpi info objects are the "same" if + * they contain the same key-value pairs or are both + * MPI_INFO_NULL. + * + * The out parameter is a value like strcmp. The value is + * undefined when the return value is FAIL. + * + * Return: SUCCEED/FAIL + * + *------------------------------------------------------------------------- + */ +herr_t +H5_mpi_info_cmp(MPI_Info info1, MPI_Info info2, int *result) +{ + hbool_t same = FALSE; + char *key = NULL; + char *value1 = NULL; + char *value2 = NULL; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI(FAIL) + + /* Check arguments */ + if (!result) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADVALUE, FAIL, "result cannot be NULL") + + /* Check for MPI_INFO_NULL */ + if (MPI_INFO_NULL == info1 && MPI_INFO_NULL == info2) { + /* Special case of both info objects being MPI_INFO_NULL */ + same = TRUE; + } + else if (MPI_INFO_NULL == info1 || MPI_INFO_NULL == info2) { + + /* Special case of one info object being MPI_INFO_NULL */ + same = FALSE; + } + else { + int mpi_code; + int nkeys_1; + int nkeys_2; + + /* Check if the number of keys is the same */ + if (MPI_SUCCESS != (mpi_code = MPI_Info_get_nkeys(info1, &nkeys_1))) + HMPI_GOTO_ERROR(FAIL, "MPI_Info_get_nkeys failed", mpi_code) + if (MPI_SUCCESS != (mpi_code = MPI_Info_get_nkeys(info2, &nkeys_2))) + HMPI_GOTO_ERROR(FAIL, "MPI_Info_get_nkeys failed", mpi_code) + + if (nkeys_1 != nkeys_2) + same = FALSE; + else if (0 == nkeys_1 && 0 == nkeys_2) + same = TRUE; + else { + int i; + int flag1 = -1; + int flag2 = -1; + + /* Allocate buffers for iteration */ + if (NULL == (key = (char *)H5MM_malloc(MPI_MAX_INFO_KEY * sizeof(char)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "memory allocation failed") + if (NULL == (value1 = (char *)H5MM_malloc(MPI_MAX_INFO_VAL * sizeof(char)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "memory allocation failed") + if (NULL == (value2 = (char *)H5MM_malloc(MPI_MAX_INFO_VAL * sizeof(char)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "memory allocation failed") + + /* Iterate over the keys, comparing them */ + for (i = 0; i < nkeys_1; i++) { + + same = TRUE; + + /* Memset the buffers to zero */ + HDmemset(key, 0, MPI_MAX_INFO_KEY); + HDmemset(value1, 0, MPI_MAX_INFO_VAL); + HDmemset(value2, 0, MPI_MAX_INFO_VAL); + + /* Get the nth key */ + if (MPI_SUCCESS != (mpi_code = MPI_Info_get_nthkey(info1, i, key))) + HMPI_GOTO_ERROR(FAIL, "MPI_Info_get_nthkey failed", mpi_code) + + /* Get the values */ + if (MPI_SUCCESS != (mpi_code = MPI_Info_get(info1, key, MPI_MAX_INFO_VAL, value1, &flag1))) + HMPI_GOTO_ERROR(FAIL, "MPI_Info_get failed", mpi_code) + if (MPI_SUCCESS != (mpi_code = MPI_Info_get(info2, key, MPI_MAX_INFO_VAL, value2, &flag2))) + HMPI_GOTO_ERROR(FAIL, "MPI_Info_get failed", mpi_code) + + /* Compare values and flags */ + if (!flag1 || !flag2 || HDmemcmp(value1, value2, MPI_MAX_INFO_VAL)) { + same = FALSE; + break; + } + + } /* end for */ + } /* end else */ + } /* end else */ + + /* Set the output value + * + * MPI_Info can either be an integer type or a pointer. We cast them + * to intptr_t so we can compare them with < and > when needed. + */ + if (same) + *result = 0; + else + *result = (intptr_t)info1 < (intptr_t)info2 ? 
-1 : 1; + +done: + if (key) + H5MM_xfree(key); + if (value1) + H5MM_xfree(value1); + if (value2) + H5MM_xfree(value2); + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5_mpi_info_cmp() */ + +/*------------------------------------------------------------------------- + * Function: H5_mpio_create_large_type + * + * Purpose: Create a large datatype of size larger than what a 32 bit integer + * can hold. + * + * Return: Non-negative on success, negative on failure. + * + * *new_type the new datatype created + * + * Programmer: Mohamad Chaarawi + * + *------------------------------------------------------------------------- + */ +herr_t +H5_mpio_create_large_type(hsize_t num_elements, MPI_Aint stride_bytes, MPI_Datatype old_type, + MPI_Datatype *new_type) +{ + int num_big_types; /* num times the 2G datatype will be repeated */ + int remaining_bytes; /* the number of bytes left that can be held in an int value */ + hsize_t leftover; + int block_len[2]; + int mpi_code; /* MPI return code */ + MPI_Datatype inner_type, outer_type, leftover_type, type[2]; + MPI_Aint disp[2], old_extent; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Calculate how many Big MPI datatypes are needed to represent the buffer */ + num_big_types = (int)(num_elements / bigio_count_g); + leftover = (hsize_t)num_elements - (hsize_t)num_big_types * bigio_count_g; + H5_CHECKED_ASSIGN(remaining_bytes, int, leftover, hsize_t); + + /* Create a contiguous datatype of size equal to the largest + * number that a 32 bit integer can hold x size of old type. + * If the displacement is 0, then the type is contiguous, otherwise + * use type_hvector to create the type with the displacement provided + */ + if (0 == stride_bytes) { + if (MPI_SUCCESS != (mpi_code = MPI_Type_contiguous((int)bigio_count_g, old_type, &inner_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code) + } /* end if */ + else if (MPI_SUCCESS != + (mpi_code = MPI_Type_create_hvector((int)bigio_count_g, 1, stride_bytes, old_type, &inner_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hvector failed", mpi_code) + + /* Create a contiguous datatype of the buffer (minus the remaining < 2GB part) + * If a stride is present, use hvector type + */ + if (0 == stride_bytes) { + if (MPI_SUCCESS != (mpi_code = MPI_Type_contiguous(num_big_types, inner_type, &outer_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code) + } /* end if */ + else if (MPI_SUCCESS != + (mpi_code = MPI_Type_create_hvector(num_big_types, 1, stride_bytes, inner_type, &outer_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hvector failed", mpi_code) + + MPI_Type_free(&inner_type); + + /* If there is a remaining part create a contiguous/vector datatype and then + * use a struct datatype to encapsulate everything. + */ + if (remaining_bytes) { + if (stride_bytes == 0) { + if (MPI_SUCCESS != (mpi_code = MPI_Type_contiguous(remaining_bytes, old_type, &leftover_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code) + } /* end if */ + else if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hvector( + (int)(num_elements - (hsize_t)num_big_types * bigio_count_g), 1, + stride_bytes, old_type, &leftover_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hvector failed", mpi_code) + + /* As of version 4.0, OpenMPI now turns off MPI-1 API calls by default, + * so we're using the MPI-2 version even though we don't need the lb + * value. 
+ */ + { + MPI_Aint unused_lb_arg; + MPI_Type_get_extent(old_type, &unused_lb_arg, &old_extent); + } + + /* Set up the arguments for MPI_Type_create_struct() */ + type[0] = outer_type; + type[1] = leftover_type; + block_len[0] = 1; + block_len[1] = 1; + disp[0] = 0; + disp[1] = (old_extent + stride_bytes) * num_big_types * (MPI_Aint)bigio_count_g; + + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct(2, block_len, disp, type, new_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code) + + MPI_Type_free(&outer_type); + MPI_Type_free(&leftover_type); + } /* end if */ + else + /* There are no remaining bytes so just set the new type to + * the outer type created */ + *new_type = outer_type; + + if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(new_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5_mpio_create_large_type() */ + +/*------------------------------------------------------------------------- + * Function: H5_mpio_gatherv_alloc + * + * Purpose: A wrapper around MPI_(All)gatherv that performs allocation + * of the receive buffer on the caller's behalf. This + * routine's parameters are as follows: + * + * `send_buf` - The buffer that data will be sent from for + * the calling MPI rank. Analogous to + * MPI_(All)gatherv's `sendbuf` parameter. + * + * `send_count` - The number of `send_type` elements in the + * send buffer. Analogous to MPI_(All)gatherv's + * `sendcount` parameter. + * + * `send_type` - The MPI Datatype of the elements in the send + * buffer. Analogous to MPI_(All)gatherv's + * `sendtype` parameter. + * + * `recv_counts` - An array containing the number of elements + * to be received from each MPI rank. + * Analogous to MPI_(All)gatherv's `recvcount` + * parameter. + * + * `displacements` - An array containing the displacements + * in the receive buffer where data from + * each MPI rank should be placed. Analogous + * to MPI_(All)gatherv's `displs` parameter. + * + * `recv_type` - The MPI Datatype of the elements in the + * receive buffer. Analogous to + * MPI_(All)gatherv's `recvtype` parameter. + * + * `allgather` - Specifies whether the gather operation to be + * performed should be MPI_Allgatherv (TRUE) or + * MPI_Gatherv (FALSE). + * + * `root` - For MPI_Gatherv operations, specifies the rank + * that will receive the data sent by other ranks. + * Analogous to MPI_Gatherv's `root` parameter. For + * MPI_Allgatherv operations, this parameter is + * ignored. + * + * `comm` - Specifies the MPI Communicator for the operation. + * Analogous to MPI_(All)gatherv's `comm` parameter. + * + * `mpi_rank` - Specifies the calling rank's rank value, as + * obtained by calling MPI_Comm_rank on the + * MPI Communicator `comm`. + * + * `mpi_size` - Specifies the MPI Communicator size, as + * obtained by calling MPI_Comm_size on the + * MPI Communicator `comm`. + * + * `out_buf` - Resulting buffer that is allocated and + * returned to the caller after data has been + * gathered into it. Returned only to the rank + * specified by `root` for MPI_Gatherv + * operations, or to all ranks for + * MPI_Allgatherv operations. + * + * `out_buf_num_entries` - The number of elements in the + * resulting buffer, in terms of + * the MPI Datatype provided for + * `recv_type`. + * + * Notes: This routine is collective across `comm`. 
+ * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_mpio_gatherv_alloc(void *send_buf, int send_count, MPI_Datatype send_type, const int recv_counts[], + const int displacements[], MPI_Datatype recv_type, hbool_t allgather, int root, + MPI_Comm comm, int mpi_rank, int mpi_size, void **out_buf, size_t *out_buf_num_entries) +{ + size_t recv_buf_num_entries = 0; + void *recv_buf = NULL; +#if H5_CHECK_MPI_VERSION(3, 0) + MPI_Count type_lb; + MPI_Count type_extent; +#else + MPI_Aint type_lb; + MPI_Aint type_extent; +#endif + int mpi_code; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI(FAIL) + + HDassert(send_buf || send_count == 0); + if (allgather || (mpi_rank == root)) + HDassert(out_buf && out_buf_num_entries); + + /* Retrieve the extent of the MPI Datatype being used */ +#if H5_CHECK_MPI_VERSION(3, 0) + if (MPI_SUCCESS != (mpi_code = MPI_Type_get_extent_x(recv_type, &type_lb, &type_extent))) +#else + if (MPI_SUCCESS != (mpi_code = MPI_Type_get_extent(recv_type, &type_lb, &type_extent))) +#endif + HMPI_GOTO_ERROR(FAIL, "MPI_Type_get_extent(_x) failed", mpi_code) + + if (type_extent < 0) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "MPI recv_type had a negative extent") + + /* + * Calculate the total size of the buffer being + * returned and allocate it + */ + if (allgather || (mpi_rank == root)) { + size_t i; + size_t buf_size; + + for (i = 0, recv_buf_num_entries = 0; i < (size_t)mpi_size; i++) + recv_buf_num_entries += (size_t)recv_counts[i]; + buf_size = recv_buf_num_entries * (size_t)type_extent; + + /* If our buffer size is 0, there's nothing to do */ + if (buf_size == 0) + HGOTO_DONE(SUCCEED) + + if (NULL == (recv_buf = H5MM_malloc(buf_size))) + /* Push an error, but still participate in collective gather operation */ + HDONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate receive buffer") + } + + /* Perform gather operation */ + if (allgather) { + if (MPI_SUCCESS != (mpi_code = MPI_Allgatherv(send_buf, send_count, send_type, recv_buf, recv_counts, + displacements, recv_type, comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Allgatherv failed", mpi_code) + } + else { + if (MPI_SUCCESS != (mpi_code = MPI_Gatherv(send_buf, send_count, send_type, recv_buf, recv_counts, + displacements, recv_type, root, comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Gatherv failed", mpi_code) + } + + if (allgather || (mpi_rank == root)) { + *out_buf = recv_buf; + *out_buf_num_entries = recv_buf_num_entries; + } + +done: + if (ret_value < 0) { + if (recv_buf) + H5MM_free(recv_buf); + } + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5_mpio_gatherv_alloc() */ + +/*------------------------------------------------------------------------- + * Function: H5_mpio_gatherv_alloc_simple + * + * Purpose: A slightly simplified interface to H5_mpio_gatherv_alloc + * which calculates the receive counts and receive buffer + * displacements for the caller. + * + * Notes: This routine is collective across `comm`. 
+ * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_mpio_gatherv_alloc_simple(void *send_buf, int send_count, MPI_Datatype send_type, MPI_Datatype recv_type, + hbool_t allgather, int root, MPI_Comm comm, int mpi_rank, int mpi_size, + void **out_buf, size_t *out_buf_num_entries) +{ + int *recv_counts_disps_array = NULL; + int mpi_code; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI(FAIL) + + HDassert(send_buf || send_count == 0); + if (allgather || (mpi_rank == root)) + HDassert(out_buf && out_buf_num_entries); + + /* + * Allocate array to store the receive counts of each rank, as well as + * the displacements into the final array where each rank will place + * their data. The first half of the array contains the receive counts + * (in rank order), while the latter half contains the displacements + * (also in rank order). + */ + if (allgather || (mpi_rank == root)) { + if (NULL == + (recv_counts_disps_array = H5MM_malloc(2 * (size_t)mpi_size * sizeof(*recv_counts_disps_array)))) + /* Push an error, but still participate in collective gather operation */ + HDONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "couldn't allocate receive counts and displacements array") + } + + /* Collect each rank's send count to interested ranks */ + if (allgather) { + if (MPI_SUCCESS != + (mpi_code = MPI_Allgather(&send_count, 1, MPI_INT, recv_counts_disps_array, 1, MPI_INT, comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code) + } + else { + if (MPI_SUCCESS != + (mpi_code = MPI_Gather(&send_count, 1, MPI_INT, recv_counts_disps_array, 1, MPI_INT, root, comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code) + } + + /* Set the displacements into the receive buffer for the gather operation */ + if (allgather || (mpi_rank == root)) { + size_t i; + int *displacements_ptr; + + displacements_ptr = &recv_counts_disps_array[mpi_size]; + + *displacements_ptr = 0; + for (i = 1; i < (size_t)mpi_size; i++) + displacements_ptr[i] = displacements_ptr[i - 1] + recv_counts_disps_array[i - 1]; + } + + /* Perform gather operation */ + if (H5_mpio_gatherv_alloc(send_buf, send_count, send_type, recv_counts_disps_array, + &recv_counts_disps_array[mpi_size], recv_type, allgather, root, comm, mpi_rank, + mpi_size, out_buf, out_buf_num_entries) < 0) + HGOTO_ERROR(H5E_LIB, H5E_CANTGATHER, FAIL, "can't gather data") + +done: + if (recv_counts_disps_array) + H5MM_free(recv_counts_disps_array); + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5_mpio_gatherv_alloc_simple() */ + +/*------------------------------------------------------------------------- + * Function: H5_mpio_get_file_sync_required + * + * Purpose: Retrieve the MPI hint indicating whether the data written + * by the MPI ROMIO driver is immediately visible to all MPI + * ranks. + * + * Notes: This routine is designed for supporting UnifyFS that needs + * MPI_File_sync in order to make the written data available + * to all ranks. 
+ * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Houjun Tang, April 7, 2022 + * + *------------------------------------------------------------------------- + */ +herr_t +H5_mpio_get_file_sync_required(MPI_File fh, hbool_t *file_sync_required) +{ + MPI_Info info_used; + int flag; + char value[MPI_MAX_INFO_VAL]; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI(FAIL) + + HDassert(file_sync_required); + + *file_sync_required = FALSE; + + if (MPI_SUCCESS != MPI_File_get_info(fh, &info_used)) + HGOTO_ERROR(H5E_LIB, H5E_CANTGET, FAIL, "can't get MPI info") + + if (MPI_SUCCESS != + MPI_Info_get(info_used, "romio_visibility_immediate", MPI_MAX_INFO_VAL - 1, value, &flag)) + HGOTO_ERROR(H5E_LIB, H5E_CANTGET, FAIL, "can't get MPI info") + + if (flag && !HDstrcmp(value, "false")) + *file_sync_required = TRUE; + + if (MPI_SUCCESS != MPI_Info_free(&info_used)) + HGOTO_ERROR(H5E_LIB, H5E_CANTFREE, FAIL, "can't free MPI info") + + /* Force setting the flag via env variable (temp solution before the flag is implemented in MPI) */ + char *sync_env_var = HDgetenv("HDF5_DO_MPI_FILE_SYNC"); + if (sync_env_var && (!HDstrcmp(sync_env_var, "TRUE") || !HDstrcmp(sync_env_var, "1"))) + *file_sync_required = TRUE; + if (sync_env_var && (!HDstrcmp(sync_env_var, "FALSE") || !HDstrcmp(sync_env_var, "0"))) + *file_sync_required = FALSE; + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5_mpio_get_file_sync_required() */ +#endif /* H5_HAVE_PARALLEL */ |
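
For context, a minimal usage sketch of the new helpers follows. It is not part of this commit: the function example_gather_local_counts, its placement, and the surrounding setup are hypothetical illustrations. Only H5_mpi_comm_dup, H5_mpio_gatherv_alloc_simple, H5_mpi_comm_free, and the H5MM allocation routines are taken from the patch above (with the signatures shown there). The sketch assumes an internal caller built with H5_HAVE_PARALLEL and passed a valid, non-NULL communicator.

/* Hypothetical internal caller (not in this patch) showing the intended
 * lifecycle of the helpers added above: duplicate the caller's communicator,
 * gather one int from every rank into a wrapper-allocated buffer, then
 * release everything. Assumes H5_HAVE_PARALLEL and a valid communicator.
 */
#include "H5private.h"   /* Generic Functions */
#include "H5MMprivate.h" /* Memory Management */

static herr_t
example_gather_local_counts(MPI_Comm user_comm, int local_value)
{
    MPI_Comm dup_comm   = MPI_COMM_NULL;
    void    *gathered   = NULL;
    size_t   n_gathered = 0;
    int      mpi_rank   = 0;
    int      mpi_size   = 1;
    herr_t   ret_value  = SUCCEED;

    /* Keep a private duplicate so later MPI calls can't collide with the
     * caller's communicator; the duplicate gets MPI_ERRORS_RETURN set.
     */
    if (H5_mpi_comm_dup(user_comm, &dup_comm) < 0)
        return FAIL;

    MPI_Comm_rank(dup_comm, &mpi_rank);
    MPI_Comm_size(dup_comm, &mpi_size);

    /* Every rank contributes one MPI_INT. The wrapper computes the receive
     * counts/displacements and allocates the output buffer on all ranks
     * (allgather == TRUE), so the root argument is ignored here.
     */
    if (H5_mpio_gatherv_alloc_simple(&local_value, 1, MPI_INT, MPI_INT,
                                     TRUE /* allgather */, 0 /* root, ignored */,
                                     dup_comm, mpi_rank, mpi_size,
                                     &gathered, &n_gathered) < 0)
        ret_value = FAIL;

    /* ... use the n_gathered ints in gathered ... */

    gathered = H5MM_xfree(gathered);
    H5_mpi_comm_free(&dup_comm); /* never frees MPI_COMM_WORLD or MPI_COMM_NULL */

    return ret_value;
}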
