From f8b1ea3f9973b690a60b63ebb431568ce0e19a29 Mon Sep 17 00:00:00 2001 From: Neil Fortner Date: Fri, 18 Feb 2022 14:03:52 -0600 Subject: Implement big I/O support for vector I/O requests in the MPIO file driver. --- src/H5FDmpio.c | 253 +++++++++++++++++++++++++++++++++++++++++++++--------- testpar/t_bigio.c | 2 - 2 files changed, 214 insertions(+), 41 deletions(-) diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index cfb8f110..14a2739 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -128,10 +128,10 @@ static const H5FD_class_t H5FD_mpio_g = { H5FD__mpio_get_handle, /* get_handle */ H5FD__mpio_read, /* read */ H5FD__mpio_write, /* write */ - H5FD__mpio_read_vector, /*read_vector */ - H5FD__mpio_write_vector, /*write_vector */ - NULL, /*read_selection */ - NULL, /*write_selection */ + H5FD__mpio_read_vector, /* read_vector */ + H5FD__mpio_write_vector, /* write_vector */ + NULL, /* read_selection */ + NULL, /* write_selection */ H5FD__mpio_flush, /* flush */ H5FD__mpio_truncate, /* truncate */ NULL, /* lock */ @@ -1700,6 +1700,8 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou hbool_t buf_type_created = FALSE; MPI_Datatype file_type = MPI_BYTE; /* MPI description of the selection in file */ hbool_t file_type_created = FALSE; + MPI_Datatype * sub_types = NULL; + uint8_t * sub_types_created = NULL; int i; int j; int mpi_code; /* MPI return code */ @@ -1759,6 +1761,10 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode") if (xfer_mode == H5FD_MPIO_COLLECTIVE) { + hsize_t bigio_count; /* Transition point to create derived type */ + + /* Get bio I/O transition point (may be lower than 2G for testing) */ + bigio_count = H5_mpi_get_bigio_count(); if (count == 1) { /* Single block. Just use a series of MPI_BYTEs for the file view. @@ -1775,6 +1781,24 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou /* some numeric conversions */ if (H5FD_mpi_haddr_to_MPIOff(addrs[0], &mpi_off) < 0) HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI offset") + + /* Check for size overflow */ + if (sizes[0] > bigio_count) { + /* We need to work around the integer size limit of 2GB. The input size_t size + * variable cannot fit into an integer, but we can get around that limitation by + * creating a different datatype and then setting the integer size (or element + * count) to 1 when using the derived_type. */ + + if (H5_mpio_create_large_type(sizes[0], 0, MPI_BYTE, &buf_type) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + buf_type_created = TRUE; + + if (H5_mpio_create_large_type(sizes[0], 0, MPI_BYTE, &file_type) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + file_type_created = TRUE; + + size_i = 1; + } } else if (count > 0) { /* create MPI derived types describing the vector write */ @@ -1787,7 +1811,7 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou * are allocated, populated, and returned in s_types, s_addrs, s_sizes, and s_bufs respectively. * In this case, this function must free the memory allocated for the sorted vectors. */ - if (H5FD_sort_vector_io_req(&vector_was_sorted, count, types, addrs, sizes, bufs, &s_types, + if (H5FD_sort_vector_io_req(&vector_was_sorted, count, types, addrs, sizes, (const void **)bufs, &s_types, &s_addrs, &s_sizes, &s_bufs) < 0) HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "can't sort vector I/O request") @@ -1842,12 +1866,7 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou } } - /* There is an obvious possibility of an overflow here, as size_t - * will typically be 64 bits, where as int will typically be 32 bits. - * This must be fixed, but it should be good enough for initial - * correctness testing. - * JRM -- 3/17/21 - */ + /* Add to block lengths and displacements arrays */ mpi_block_lengths[i] = (int)size; mpi_displacments[i] = (MPI_Aint)s_addrs[i]; @@ -1862,13 +1881,54 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou #else mpi_bufs[i] = mpi_bufs[i] - mpi_bufs_base_Aint; #endif + + /* Check for size overflow */ + if (size > bigio_count) { + /* We need to work around the integer size limit of 2GB. The input size_t size + * variable cannot fit into an integer, but we can get around that limitation by + * creating a different datatype and then setting the integer size (or element + * count) to 1 when using the derived_type. */ + + /* Allocate arrays to keep track of types and whether they were created, if + * necessary */ + if (!sub_types) { + HDassert(!sub_types_created); + + if (NULL == (sub_types = (int *)HDmalloc((size_t)count * sizeof(MPI_Datatype)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types array") + if (NULL == (sub_types_created = (uint8_t *)HDcalloc((size_t)count, 1))) { + sub_types = H5MM_free(sub_types); + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types created array") + } + + /* Initialize sub_types to all MPI_BYTE */ + for (j = 0; j < (int)count; j++) + sub_types[j] = MPI_BYTE; + } + HDassert(sub_types_created); + + /* Create type for large block */ + if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &sub_types[i]) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + sub_types_created[i] = TRUE; + + /* Only one of these large types for this vector element */ + mpi_block_lengths[i] = 1; + } + else + HDassert(size == (size_t)mpi_block_lengths[i]); } /* create the memory MPI derived types */ - if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, mpi_bufs, - MPI_BYTE, &buf_type))) - - HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for buf_type failed", mpi_code) + if (sub_types) { + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths, mpi_bufs, + sub_types, &buf_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for buf_type failed", mpi_code) + } + else + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, mpi_bufs, + MPI_BYTE, &buf_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for buf_type failed", mpi_code) buf_type_created = TRUE; @@ -1877,10 +1937,15 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for buf_type failed", mpi_code) /* create the file MPI derived type */ - if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, - mpi_displacments, MPI_BYTE, &file_type))) - - HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for file_type failed", mpi_code) + if (sub_types) { + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths, + mpi_displacments, sub_types, &file_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for file_type failed", mpi_code) + } + else + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, + mpi_displacments, MPI_BYTE, &file_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for file_type failed", mpi_code) file_type_created = TRUE; @@ -2090,7 +2155,22 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou } } - size_i = (int)size; /* todo: fix potential for overflow */ + size_i = (int)size; + + if (size != (size_t)size_i) { + /* If HERE, then we need to work around the integer size limit + * of 2GB. The input size_t size variable cannot fit into an integer, + * but we can get around that limitation by creating a different datatype + * and then setting the integer size (or element count) to 1 when using + * the derived_type. + */ + + if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + + buf_type_created = TRUE; + size_i = 1; + } /* Check if we actually need to do I/O */ if (addrs[i] < max_addr) { @@ -2099,7 +2179,7 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou /* Issue read */ if (MPI_SUCCESS != - (mpi_code = MPI_File_read_at(file->f, mpi_off, bufs[i], size_i, MPI_BYTE, &mpi_stat))) + (mpi_code = MPI_File_read_at(file->f, mpi_off, bufs[i], size_i, buf_type, &mpi_stat))) HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code) @@ -2111,8 +2191,15 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou #endif HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code) + /* Compute the actual number of bytes requested */ +#if MPI_VERSION >= 3 + io_size = (MPI_Count)sizes[i]; +#else + io_size = (int)sizes[i]; +#endif + /* Check for read failure */ - if (bytes_read < 0 || bytes_read > size_i) + if (bytes_read < 0 || bytes_read > io_size) HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed") /* @@ -2120,7 +2207,7 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou * the physical MPI file and don't issue any more reads at higher * addresses. */ - if ((n = (size_i - bytes_read)) > 0) { + if ((n = (io_size - bytes_read)) > 0) { HDmemset((char *)bufs[i] + bytes_read, 0, (size_t)n); max_addr = addrs[i] + (haddr_t)bytes_read; } @@ -2183,6 +2270,19 @@ done: MPI_Type_free(&buf_type); } + if (sub_types) { + HDassert(sub_types_created); + + for (i = 0; i < (int)count; i++) + if (sub_types_created[i]) + MPI_Type_free(&sub_types[i]); + + HDfree(sub_types); + sub_types = NULL; + HDfree(sub_types_created); + sub_types_created = NULL; + } + if (file_type_created) { MPI_Type_free(&file_type); } @@ -2259,6 +2359,8 @@ H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t co hbool_t buf_type_created = FALSE; MPI_Datatype file_type = MPI_BYTE; /* MPI description of the selection in file */ hbool_t file_type_created = FALSE; + MPI_Datatype * sub_types = NULL; + uint8_t * sub_types_created = NULL; int i; int j; int mpi_code; /* MPI return code */ @@ -2325,6 +2427,7 @@ H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t co if (xfer_mode == H5FD_MPIO_COLLECTIVE) { if (count > 0) { /* create MPI derived types describing the vector write */ + hsize_t bigio_count; /* Transition point to create derived type */ if ((NULL == (mpi_block_lengths = (int *)HDmalloc((size_t)count * sizeof(int)))) || (NULL == (mpi_displacments = (MPI_Aint *)HDmalloc((size_t)count * sizeof(MPI_Aint)))) || @@ -2361,6 +2464,9 @@ H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t co fixed_size = FALSE; + /* Get bio I/O transition point (may be lower than 2G for testing) */ + bigio_count = H5_mpi_get_bigio_count(); + /* load the mpi_block_lengths and mpi_displacements arrays */ for (i = 0; i < (int)count; i++) { @@ -2377,12 +2483,7 @@ H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t co } } - /* There is an obvious possibility of an overflow here, as size_t - * will typically be 64 bits, where as int will typically be 32 bits. - * This must be fixed, but it should be good enough for initial - * correctness testing. - * JRM -- 3/17/21 - */ + /* Add to block lengths and displacements arrays */ mpi_block_lengths[i] = (int)size; mpi_displacments[i] = (MPI_Aint)s_addrs[i]; @@ -2397,13 +2498,54 @@ H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t co #else mpi_bufs[i] = mpi_bufs[i] - mpi_bufs_base_Aint; #endif + + /* Check for size overflow */ + if (size > bigio_count) { + /* We need to work around the integer size limit of 2GB. The input size_t size + * variable cannot fit into an integer, but we can get around that limitation by + * creating a different datatype and then setting the integer size (or element + * count) to 1 when using the derived_type. */ + + /* Allocate arrays to keep track of types and whether they were created, if + * necessary */ + if (!sub_types) { + HDassert(!sub_types_created); + + if (NULL == (sub_types = (int *)HDmalloc((size_t)count * sizeof(MPI_Datatype)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types array") + if (NULL == (sub_types_created = (uint8_t *)HDcalloc((size_t)count, 1))) { + sub_types = H5MM_free(sub_types); + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types created array") + } + + /* Initialize sub_types to all MPI_BYTE */ + for (j = 0; j < (int)count; j++) + sub_types[j] = MPI_BYTE; + } + HDassert(sub_types_created); + + /* Create type for large block */ + if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &sub_types[i]) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + sub_types_created[i] = TRUE; + + /* Only one of these large types for this vector element */ + mpi_block_lengths[i] = 1; + } + else + HDassert(size == (size_t)mpi_block_lengths[i]); } /* create the memory MPI derived types */ - if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, mpi_bufs, - MPI_BYTE, &buf_type))) - - HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for buf_type failed", mpi_code) + if (sub_types) { + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths, mpi_bufs, + sub_types, &buf_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for buf_type failed", mpi_code) + } + else + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, mpi_bufs, + MPI_BYTE, &buf_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for buf_type failed", mpi_code) buf_type_created = TRUE; @@ -2412,10 +2554,15 @@ H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t co HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for buf_type failed", mpi_code) /* create the file MPI derived type */ - if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, - mpi_displacments, MPI_BYTE, &file_type))) - - HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for file_type failed", mpi_code) + if (sub_types) { + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths, + mpi_displacments, sub_types, &file_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for file_type failed", mpi_code) + } + else + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, + mpi_displacments, MPI_BYTE, &file_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for file_type failed", mpi_code) file_type_created = TRUE; @@ -2530,10 +2677,25 @@ H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t co } } - size_i = (int)size; /* todo: fix potential for overflow */ + size_i = (int)size; + + if (size != (size_t)size_i) { + /* If HERE, then we need to work around the integer size limit + * of 2GB. The input size_t size variable cannot fit into an integer, + * but we can get around that limitation by creating a different datatype + * and then setting the integer size (or element count) to 1 when using + * the derived_type. + */ + + if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype") + + buf_type_created = TRUE; + size_i = 1; + } if (MPI_SUCCESS != - (mpi_code = MPI_File_write_at(file->f, mpi_off, s_bufs[i], size_i, MPI_BYTE, &mpi_stat))) + (mpi_code = MPI_File_write_at(file->f, mpi_off, s_bufs[i], size_i, buf_type, &mpi_stat))) HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code) } @@ -2612,6 +2774,19 @@ done: MPI_Type_free(&file_type); } + if (sub_types) { + HDassert(sub_types_created); + + for (i = 0; i < (int)count; i++) + if (sub_types_created[i]) + MPI_Type_free(&sub_types[i]); + + HDfree(sub_types); + sub_types = NULL; + HDfree(sub_types_created); + sub_types_created = NULL; + } + #ifdef H5FDmpio_DEBUG if (H5FD_mpio_debug_t_flag) HDfprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", __func__, file->mpi_rank, ret_value); diff --git a/testpar/t_bigio.c b/testpar/t_bigio.c index dd2514b..0a971c5 100644 --- a/testpar/t_bigio.c +++ b/testpar/t_bigio.c @@ -1103,7 +1103,6 @@ dataset_big_read(void) static void single_rank_independent_io(void) { -#if HDF5_BIG_SELECTION_IO_WORKS if (mpi_rank_g == 0) HDprintf("\nSingle Rank Independent I/O\n"); @@ -1179,7 +1178,6 @@ single_rank_independent_io(void) HDremove(FILENAME[1]); } MPI_Barrier(MPI_COMM_WORLD); -#endif } /* -- cgit v0.12