From 7983a584a0242f687efcd2dbd0deedfc0e960307 Mon Sep 17 00:00:00 2001 From: jhendersonHDF Date: Fri, 22 Jul 2022 15:03:44 -0500 Subject: Switch to vector I/O for collective metadata writes (#1911) * Switch to vector I/O for collective metadata writes * Committing clang-format changes Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> --- src/H5Cmpio.c | 103 ++++++++++++++++++++----------------------------------- src/H5Fio.c | 79 ++++++++++++++++++++++++++++++++++++++++++ src/H5Fprivate.h | 6 ++++ 3 files changed, 123 insertions(+), 65 deletions(-) diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c index a746e54..01c377b 100644 --- a/src/H5Cmpio.c +++ b/src/H5Cmpio.c @@ -947,16 +947,12 @@ H5C__collective_write(H5F_t *f) { H5AC_t * cache_ptr; H5FD_mpio_xfer_t orig_xfer_mode = H5FD_MPIO_COLLECTIVE; - void * base_buf; - int count; - int * length_array = NULL; - MPI_Aint * buf_array = NULL; - MPI_Aint * offset_array = NULL; - MPI_Datatype btype = MPI_BYTE; - MPI_Datatype ftype = MPI_BYTE; - int mpi_code; - char unused = 0; /* Unused, except for non-NULL pointer value */ - size_t buf_count; + const void ** bufs = NULL; + H5FD_mem_t * types = NULL; + haddr_t * addrs = NULL; + size_t * sizes = NULL; + uint32_t count32; + size_t count; herr_t ret_value = SUCCEED; FUNC_ENTER_PACKAGE @@ -976,22 +972,23 @@ H5C__collective_write(H5F_t *f) HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode") /* Get number of entries in collective write list */ - count = (int)H5SL_count(cache_ptr->coll_write_list); + count = H5SL_count(cache_ptr->coll_write_list); + H5_CHECKED_ASSIGN(count32, uint32_t, count, size_t); + if (count > 0) { H5SL_node_t * node; H5C_cache_entry_t *entry_ptr; + void * base_buf; int i; - /* Allocate arrays */ - if (NULL == (length_array = (int *)H5MM_malloc((size_t)count * sizeof(int)))) - HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, - "memory allocation failed for collective write table length array") - if (NULL == (buf_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint)))) - HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, - "memory allocation failed for collective buf table length array") - if (NULL == (offset_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint)))) - HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, - "memory allocation failed for collective offset table length array") + if (NULL == (addrs = H5MM_malloc(count * sizeof(*addrs)))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "couldn't allocate address array") + if (NULL == (sizes = H5MM_malloc(count * sizeof(*sizes)))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "couldn't allocate sizes array") + if (NULL == (bufs = H5MM_malloc(count * sizeof(*bufs)))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "couldn't allocate buffers array") + if (NULL == (types = H5MM_malloc(count * sizeof(*types)))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "couldn't allocate types array") /* Fill arrays */ node = H5SL_first(cache_ptr->coll_write_list); @@ -1000,10 +997,11 @@ H5C__collective_write(H5F_t *f) HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item") /* Set up initial array position & buffer base address */ - length_array[0] = (int)entry_ptr->size; - base_buf = entry_ptr->image_ptr; - buf_array[0] = (MPI_Aint)0; - offset_array[0] = (MPI_Aint)entry_ptr->addr; + base_buf = entry_ptr->image_ptr; + addrs[0] = entry_ptr->addr; + sizes[0] = entry_ptr->size; + bufs[0] = base_buf; + types[0] = H5FD_MEM_DEFAULT; node = H5SL_next(node); i = 1; @@ -1012,59 +1010,34 @@ H5C__collective_write(H5F_t *f) HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item") /* Set up array position */ - length_array[i] = (int)entry_ptr->size; - buf_array[i] = (MPI_Aint)entry_ptr->image_ptr - (MPI_Aint)base_buf; - offset_array[i] = (MPI_Aint)entry_ptr->addr; + addrs[i] = entry_ptr->addr; + sizes[i] = entry_ptr->size; + bufs[i] = entry_ptr->image_ptr; + types[i] = H5FD_MEM_DEFAULT; /* Advance to next node & array location */ node = H5SL_next(node); i++; } /* end while */ - /* Create memory MPI type */ - if (MPI_SUCCESS != - (mpi_code = MPI_Type_create_hindexed(count, length_array, buf_array, MPI_BYTE, &btype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code) - if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&btype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) - - /* Create file MPI type */ - if (MPI_SUCCESS != - (mpi_code = MPI_Type_create_hindexed(count, length_array, offset_array, MPI_BYTE, &ftype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code) - if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&ftype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) - - /* MPI count to write */ - buf_count = 1; + /* Optimization for vector I/O */ + if (count > 1) + types[1] = H5FD_MEM_NOLIST; } /* end if */ - else { - /* Set non-NULL pointer for I/O operation */ - base_buf = &unused; - - /* MPI count to write */ - buf_count = 0; - } /* end else */ /* Pass buf type, file type to the file driver */ - if (H5CX_set_mpi_coll_datatypes(btype, ftype) < 0) + if (H5CX_set_mpi_coll_datatypes(MPI_BYTE, MPI_BYTE) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O properties") - /* Write data */ - if (H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0, buf_count, base_buf) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_WRITEERROR, FAIL, "unable to write entries collectively") + /* Make vector write call */ + if (H5F_shared_vector_write(H5F_SHARED(f), count32, types, addrs, sizes, bufs) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_WRITEERROR, FAIL, "unable to write entries") done: - /* Free arrays */ - length_array = (int *)H5MM_xfree(length_array); - buf_array = (MPI_Aint *)H5MM_xfree(buf_array); - offset_array = (MPI_Aint *)H5MM_xfree(offset_array); - - /* Free MPI Types */ - if (MPI_BYTE != btype && MPI_SUCCESS != (mpi_code = MPI_Type_free(&btype))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) - if (MPI_BYTE != ftype && MPI_SUCCESS != (mpi_code = MPI_Type_free(&ftype))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + H5MM_xfree(types); + H5MM_xfree(bufs); + H5MM_xfree(sizes); + H5MM_xfree(addrs); /* Reset transfer mode in API context, if changed */ if (orig_xfer_mode != H5FD_MPIO_COLLECTIVE) diff --git a/src/H5Fio.c b/src/H5Fio.c index 53fec97..287dd61 100644 --- a/src/H5Fio.c +++ b/src/H5Fio.c @@ -327,6 +327,85 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5F_shared_select_write() */ +herr_t +H5F_shared_vector_read(H5F_shared_t *f_sh, uint32_t count, H5FD_mem_t types[], haddr_t addrs[], + size_t sizes[], void *bufs[]) +{ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + HDassert(f_sh); + HDassert((types) || (count == 0)); + HDassert((addrs) || (count == 0)); + HDassert((sizes) || (count == 0)); + HDassert((bufs) || (count == 0)); + + /* + * Note that we don't try to map global heap data to raw + * data here, as it may become expensive to check for when + * I/O vectors are large. This may change in the future, but, + * for now, assume the caller has done this already. + */ +#ifndef NDEBUG + for (uint32_t i = 0; i < count; i++) + HDassert(types[i] != H5FD_MEM_GHEAP); +#endif + + /* Pass down to file driver layer (bypass page buffer for now) */ + if (H5FD_read_vector(f_sh->lf, count, types, addrs, sizes, bufs) < 0) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "vector read through file driver failed") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} + +/*------------------------------------------------------------------------- + * Function: H5F_shared_vector_write + * + * Purpose: Writes data from `count` buffers (from the `bufs` array) to + * a file/server/etc. at the offsets provided in the `addrs` + * array, with the data sizes specified in the `sizes` array + * and data memory types specified in the `types` array. The + * addresses are relative to the base address for the file. + * + *------------------------------------------------------------------------- + */ +herr_t +H5F_shared_vector_write(H5F_shared_t *f_sh, uint32_t count, H5FD_mem_t types[], haddr_t addrs[], + size_t sizes[], const void *bufs[]) +{ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + HDassert(f_sh); + HDassert((types) || (count == 0)); + HDassert((addrs) || (count == 0)); + HDassert((sizes) || (count == 0)); + HDassert((bufs) || (count == 0)); + + /* + * Note that we don't try to map global heap data to raw + * data here, as it may become expensive to check for when + * I/O vectors are large. This may change in the future, but, + * for now, assume the caller has done this already. + */ +#ifndef NDEBUG + for (uint32_t i = 0; i < count; i++) + HDassert(types[i] != H5FD_MEM_GHEAP); +#endif + + /* Pass down to file driver layer (bypass page buffer for now) */ + if (H5FD_write_vector(f_sh->lf, count, types, addrs, sizes, bufs) < 0) + HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "vector write through file driver failed") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} + /*------------------------------------------------------------------------- * Function: H5F_flush_tagged_metadata * diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h index efce71d..134c606 100644 --- a/src/H5Fprivate.h +++ b/src/H5Fprivate.h @@ -932,6 +932,12 @@ H5_DLL herr_t H5F_shared_select_write(H5F_shared_t *f_sh, H5FD_mem_t type, uint3 struct H5S_t **mem_spaces, struct H5S_t **file_spaces, haddr_t offsets[], size_t element_sizes[], const void *bufs[]); +/* Functions that operate on I/O vectors */ +H5_DLL herr_t H5F_shared_vector_read(H5F_shared_t *f_sh, uint32_t count, H5FD_mem_t types[], haddr_t addrs[], + size_t sizes[], void *bufs[]); +H5_DLL herr_t H5F_shared_vector_write(H5F_shared_t *f_sh, uint32_t count, H5FD_mem_t types[], haddr_t addrs[], + size_t sizes[], const void *bufs[]); + /* Functions that flush or evict */ H5_DLL herr_t H5F_flush_tagged_metadata(H5F_t *f, haddr_t tag); H5_DLL herr_t H5F_evict_tagged_metadata(H5F_t *f, haddr_t tag); -- cgit v0.12