summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjhendersonHDF <jhenderson@hdfgroup.org>2022-07-22 20:03:44 (GMT)
committerGitHub <noreply@github.com>2022-07-22 20:03:44 (GMT)
commit7983a584a0242f687efcd2dbd0deedfc0e960307 (patch)
tree71b3f44265e031288b871b33ec44d23c074ad2d4
parent27bb358f7ab1d23f3f8ce081c6b4f1602033e4d7 (diff)
downloadhdf5-7983a584a0242f687efcd2dbd0deedfc0e960307.zip
hdf5-7983a584a0242f687efcd2dbd0deedfc0e960307.tar.gz
hdf5-7983a584a0242f687efcd2dbd0deedfc0e960307.tar.bz2
Switch to vector I/O for collective metadata writes (#1911)
* Switch to vector I/O for collective metadata writes * Committing clang-format changes Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
-rw-r--r--src/H5Cmpio.c103
-rw-r--r--src/H5Fio.c79
-rw-r--r--src/H5Fprivate.h6
3 files changed, 123 insertions, 65 deletions
diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c
index a746e54..01c377b 100644
--- a/src/H5Cmpio.c
+++ b/src/H5Cmpio.c
@@ -947,16 +947,12 @@ H5C__collective_write(H5F_t *f)
{
H5AC_t * cache_ptr;
H5FD_mpio_xfer_t orig_xfer_mode = H5FD_MPIO_COLLECTIVE;
- void * base_buf;
- int count;
- int * length_array = NULL;
- MPI_Aint * buf_array = NULL;
- MPI_Aint * offset_array = NULL;
- MPI_Datatype btype = MPI_BYTE;
- MPI_Datatype ftype = MPI_BYTE;
- int mpi_code;
- char unused = 0; /* Unused, except for non-NULL pointer value */
- size_t buf_count;
+ const void ** bufs = NULL;
+ H5FD_mem_t * types = NULL;
+ haddr_t * addrs = NULL;
+ size_t * sizes = NULL;
+ uint32_t count32;
+ size_t count;
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
@@ -976,22 +972,23 @@ H5C__collective_write(H5F_t *f)
HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode")
/* Get number of entries in collective write list */
- count = (int)H5SL_count(cache_ptr->coll_write_list);
+ count = H5SL_count(cache_ptr->coll_write_list);
+ H5_CHECKED_ASSIGN(count32, uint32_t, count, size_t);
+
if (count > 0) {
H5SL_node_t * node;
H5C_cache_entry_t *entry_ptr;
+ void * base_buf;
int i;
- /* Allocate arrays */
- if (NULL == (length_array = (int *)H5MM_malloc((size_t)count * sizeof(int))))
- HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL,
- "memory allocation failed for collective write table length array")
- if (NULL == (buf_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint))))
- HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL,
- "memory allocation failed for collective buf table length array")
- if (NULL == (offset_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint))))
- HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL,
- "memory allocation failed for collective offset table length array")
+ if (NULL == (addrs = H5MM_malloc(count * sizeof(*addrs))))
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "couldn't allocate address array")
+ if (NULL == (sizes = H5MM_malloc(count * sizeof(*sizes))))
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "couldn't allocate sizes array")
+ if (NULL == (bufs = H5MM_malloc(count * sizeof(*bufs))))
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "couldn't allocate buffers array")
+ if (NULL == (types = H5MM_malloc(count * sizeof(*types))))
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "couldn't allocate types array")
/* Fill arrays */
node = H5SL_first(cache_ptr->coll_write_list);
@@ -1000,10 +997,11 @@ H5C__collective_write(H5F_t *f)
HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item")
/* Set up initial array position & buffer base address */
- length_array[0] = (int)entry_ptr->size;
- base_buf = entry_ptr->image_ptr;
- buf_array[0] = (MPI_Aint)0;
- offset_array[0] = (MPI_Aint)entry_ptr->addr;
+ base_buf = entry_ptr->image_ptr;
+ addrs[0] = entry_ptr->addr;
+ sizes[0] = entry_ptr->size;
+ bufs[0] = base_buf;
+ types[0] = H5FD_MEM_DEFAULT;
node = H5SL_next(node);
i = 1;
@@ -1012,59 +1010,34 @@ H5C__collective_write(H5F_t *f)
HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item")
/* Set up array position */
- length_array[i] = (int)entry_ptr->size;
- buf_array[i] = (MPI_Aint)entry_ptr->image_ptr - (MPI_Aint)base_buf;
- offset_array[i] = (MPI_Aint)entry_ptr->addr;
+ addrs[i] = entry_ptr->addr;
+ sizes[i] = entry_ptr->size;
+ bufs[i] = entry_ptr->image_ptr;
+ types[i] = H5FD_MEM_DEFAULT;
/* Advance to next node & array location */
node = H5SL_next(node);
i++;
} /* end while */
- /* Create memory MPI type */
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Type_create_hindexed(count, length_array, buf_array, MPI_BYTE, &btype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
- if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&btype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
-
- /* Create file MPI type */
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Type_create_hindexed(count, length_array, offset_array, MPI_BYTE, &ftype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
- if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&ftype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
-
- /* MPI count to write */
- buf_count = 1;
+ /* Optimization for vector I/O */
+ if (count > 1)
+ types[1] = H5FD_MEM_NOLIST;
} /* end if */
- else {
- /* Set non-NULL pointer for I/O operation */
- base_buf = &unused;
-
- /* MPI count to write */
- buf_count = 0;
- } /* end else */
/* Pass buf type, file type to the file driver */
- if (H5CX_set_mpi_coll_datatypes(btype, ftype) < 0)
+ if (H5CX_set_mpi_coll_datatypes(MPI_BYTE, MPI_BYTE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O properties")
- /* Write data */
- if (H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0, buf_count, base_buf) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_WRITEERROR, FAIL, "unable to write entries collectively")
+ /* Make vector write call */
+ if (H5F_shared_vector_write(H5F_SHARED(f), count32, types, addrs, sizes, bufs) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_WRITEERROR, FAIL, "unable to write entries")
done:
- /* Free arrays */
- length_array = (int *)H5MM_xfree(length_array);
- buf_array = (MPI_Aint *)H5MM_xfree(buf_array);
- offset_array = (MPI_Aint *)H5MM_xfree(offset_array);
-
- /* Free MPI Types */
- if (MPI_BYTE != btype && MPI_SUCCESS != (mpi_code = MPI_Type_free(&btype)))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
- if (MPI_BYTE != ftype && MPI_SUCCESS != (mpi_code = MPI_Type_free(&ftype)))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ H5MM_xfree(types);
+ H5MM_xfree(bufs);
+ H5MM_xfree(sizes);
+ H5MM_xfree(addrs);
/* Reset transfer mode in API context, if changed */
if (orig_xfer_mode != H5FD_MPIO_COLLECTIVE)
diff --git a/src/H5Fio.c b/src/H5Fio.c
index 53fec97..287dd61 100644
--- a/src/H5Fio.c
+++ b/src/H5Fio.c
@@ -327,6 +327,85 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5F_shared_select_write() */
+/*-------------------------------------------------------------------------
+ * Function: H5F_shared_vector_read
+ *
+ * Purpose: Vector read entry point for a shared file. Forwards `count`
+ * I/O requests, described by the parallel `types`, `addrs`,
+ * `sizes` and `bufs` arrays, to the file driver layer by
+ * calling H5FD_read_vector() on `f_sh->lf`. The page buffer
+ * is bypassed.
+ *
+ * Each array is asserted to be non-NULL unless `count` is 0.
+ * In debug builds, no entry of `types` may be H5FD_MEM_GHEAP;
+ * the caller is expected to have mapped global heap data to
+ * raw data already.
+ *
+ * NOTE(review): addresses are presumably relative to the base
+ * address for the file, as documented for
+ * H5F_shared_vector_write() -- confirm.
+ *
+ * Return: SUCCEED, or FAIL when H5FD_read_vector() reports an error.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_shared_vector_read(H5F_shared_t *f_sh, uint32_t count, H5FD_mem_t types[], haddr_t addrs[],
+ size_t sizes[], void *bufs[])
+{
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Sanity checks: each array may be NULL only when count is 0 */
+ HDassert(f_sh);
+ HDassert((types) || (count == 0));
+ HDassert((addrs) || (count == 0));
+ HDassert((sizes) || (count == 0));
+ HDassert((bufs) || (count == 0));
+
+ /*
+ * Note that we don't try to map global heap data to raw
+ * data here, as it may become expensive to check for when
+ * I/O vectors are large. This may change in the future, but,
+ * for now, assume the caller has done this already.
+ *
+ * The loop below only exists when NDEBUG is not defined; it
+ * asserts that assumption for every entry of the vector.
+ */
+#ifndef NDEBUG
+ for (uint32_t i = 0; i < count; i++)
+ HDassert(types[i] != H5FD_MEM_GHEAP);
+#endif
+
+ /* Pass down to file driver layer (bypass page buffer for now) */
+ if (H5FD_read_vector(f_sh->lf, count, types, addrs, sizes, bufs) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "vector read through file driver failed")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5F_shared_vector_write
+ *
+ * Purpose: Writes data from `count` buffers (from the `bufs` array) to
+ * a file/server/etc. at the offsets provided in the `addrs`
+ * array, with the data sizes specified in the `sizes` array
+ * and data memory types specified in the `types` array. The
+ * addresses are relative to the base address for the file.
+ *
+ * Each array is asserted to be non-NULL unless `count` is 0.
+ * In debug builds, no entry of `types` may be H5FD_MEM_GHEAP;
+ * the caller is expected to have mapped global heap data to
+ * raw data already. The request is handed unchanged to
+ * H5FD_write_vector() on `f_sh->lf`, bypassing the page
+ * buffer.
+ *
+ * Return: SUCCEED, or FAIL when H5FD_write_vector() reports an error.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_shared_vector_write(H5F_shared_t *f_sh, uint32_t count, H5FD_mem_t types[], haddr_t addrs[],
+ size_t sizes[], const void *bufs[])
+{
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Sanity checks: each array may be NULL only when count is 0 */
+ HDassert(f_sh);
+ HDassert((types) || (count == 0));
+ HDassert((addrs) || (count == 0));
+ HDassert((sizes) || (count == 0));
+ HDassert((bufs) || (count == 0));
+
+ /*
+ * Note that we don't try to map global heap data to raw
+ * data here, as it may become expensive to check for when
+ * I/O vectors are large. This may change in the future, but,
+ * for now, assume the caller has done this already.
+ *
+ * The loop below only exists when NDEBUG is not defined; it
+ * asserts that assumption for every entry of the vector.
+ */
+#ifndef NDEBUG
+ for (uint32_t i = 0; i < count; i++)
+ HDassert(types[i] != H5FD_MEM_GHEAP);
+#endif
+
+ /* Pass down to file driver layer (bypass page buffer for now) */
+ if (H5FD_write_vector(f_sh->lf, count, types, addrs, sizes, bufs) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "vector write through file driver failed")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
+
/*-------------------------------------------------------------------------
* Function: H5F_flush_tagged_metadata
*
diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h
index efce71d..134c606 100644
--- a/src/H5Fprivate.h
+++ b/src/H5Fprivate.h
@@ -932,6 +932,12 @@ H5_DLL herr_t H5F_shared_select_write(H5F_shared_t *f_sh, H5FD_mem_t type, uint3
struct H5S_t **mem_spaces, struct H5S_t **file_spaces,
haddr_t offsets[], size_t element_sizes[], const void *bufs[]);
+/* Functions that operate on I/O vectors: `types`, `addrs`, `sizes` and `bufs`
+ * are parallel arrays describing `count` I/O requests. The implementations
+ * in H5Fio.c assert that each array is non-NULL unless `count` is 0 and
+ * pass the request to the file driver layer. */
+H5_DLL herr_t H5F_shared_vector_read(H5F_shared_t *f_sh, uint32_t count, H5FD_mem_t types[], haddr_t addrs[],
+ size_t sizes[], void *bufs[]);
+H5_DLL herr_t H5F_shared_vector_write(H5F_shared_t *f_sh, uint32_t count, H5FD_mem_t types[], haddr_t addrs[],
+ size_t sizes[], const void *bufs[]);
+
/* Functions that flush or evict */
H5_DLL herr_t H5F_flush_tagged_metadata(H5F_t *f, haddr_t tag);
H5_DLL herr_t H5F_evict_tagged_metadata(H5F_t *f, haddr_t tag);