diff options
author | Larry Knox <lrknox@hdfgroup.org> | 2023-08-08 16:06:05 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-08-08 16:06:05 (GMT) |
commit | f43d301f633b5e2cd9cde160ddb2c13578f9231f (patch) | |
tree | 3c05bb6a52a5cb19869721dab02e4931f6e48e39 /src | |
parent | 8ceb226bae916152510387da6f7f9710903882a8 (diff) | |
download | hdf5-f43d301f633b5e2cd9cde160ddb2c13578f9231f.zip hdf5-f43d301f633b5e2cd9cde160ddb2c13578f9231f.tar.gz hdf5-f43d301f633b5e2cd9cde160ddb2c13578f9231f.tar.bz2 |
Merge Implementation of the mpio driver with selection I/O. (#3360)
Diffstat (limited to 'src')
-rw-r--r-- | src/H5Dio.c | 30 | ||||
-rw-r--r-- | src/H5Dmpio.c | 22 | ||||
-rw-r--r-- | src/H5FD.c | 402 | ||||
-rw-r--r-- | src/H5FDdevelop.h | 12 | ||||
-rw-r--r-- | src/H5FDint.c | 700 | ||||
-rw-r--r-- | src/H5FDmpio.c | 1026 | ||||
-rw-r--r-- | src/H5FDprivate.h | 35 |
7 files changed, 2023 insertions, 204 deletions
diff --git a/src/H5Dio.c b/src/H5Dio.c index 89afde5..b978139 100644 --- a/src/H5Dio.c +++ b/src/H5Dio.c @@ -415,8 +415,14 @@ H5D__read(size_t count, H5D_dset_io_info_t *dset_info) HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode"); /* Only report the collective I/O mode if we're actually performing collective I/O */ - if (xfer_mode == H5FD_MPIO_COLLECTIVE) + if (xfer_mode == H5FD_MPIO_COLLECTIVE) { H5CX_set_mpio_actual_io_mode(io_info.actual_io_mode); + + /* If we did selection I/O, report that we used "link chunk" mode, since that's the most + * analogous to what selection I/O does */ + if (io_info.use_select_io == H5D_SELECTION_IO_MODE_ON) + H5CX_set_mpio_actual_chunk_opt(H5D_MPIO_LINK_CHUNK); + } } #endif /* H5_HAVE_PARALLEL */ } @@ -1131,20 +1137,16 @@ H5D__typeinfo_init_phase2(H5D_io_info_t *io_info) assert(io_info); /* If selection I/O mode is default (auto), enable it here if the VFD supports it (it will be turned off - * later if something else conflicts), otherwise disable it. If we're using the MPIO VFD, the automatic - * selection will happen in H5D__mpio_opt_possible() inside H5D__ioinfo_adjust(). */ -#ifdef H5_HAVE_PARALLEL - if (!io_info->using_mpi_vfd) -#endif /* H5_HAVE_PARALLEL */ - if (io_info->use_select_io == H5D_SELECTION_IO_MODE_DEFAULT) { - if (H5F_has_vector_select_io(io_info->dsets_info[0].dset->oloc.file, - io_info->op_type == H5D_IO_OP_WRITE)) - io_info->use_select_io = H5D_SELECTION_IO_MODE_ON; - else { - io_info->use_select_io = H5D_SELECTION_IO_MODE_OFF; - io_info->no_selection_io_cause |= H5D_SEL_IO_DEFAULT_OFF; - } + * later if something else conflicts), otherwise disable it */ + if (io_info->use_select_io == H5D_SELECTION_IO_MODE_DEFAULT) { + if (H5F_has_vector_select_io(io_info->dsets_info[0].dset->oloc.file, + io_info->op_type == H5D_IO_OP_WRITE)) + io_info->use_select_io = H5D_SELECTION_IO_MODE_ON; + else { + io_info->use_select_io = H5D_SELECTION_IO_MODE_OFF; + io_info->no_selection_io_cause |= H5D_SEL_IO_DEFAULT_OFF; } + } /* If we're doing type conversion and we might be doing selection I/O, check if the buffers are large * enough to handle the whole I/O */ diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c index 901907c..82bcf02 100644 --- a/src/H5Dmpio.c +++ b/src/H5Dmpio.c @@ -622,6 +622,9 @@ H5D__mpio_opt_possible(H5D_io_info_t *io_info) if (!H5FD_mpi_opt_types_g) local_cause[0] |= H5D_MPIO_MPI_OPT_TYPES_ENV_VAR_DISABLED; + /* Decision on whether to use selection I/O should have been made by now */ + assert(io_info->use_select_io != H5D_SELECTION_IO_MODE_DEFAULT); + /* Datatype conversions and transformations are allowed with selection I/O. If the selection I/O mode * is auto (default), disable collective for now and re-enable later if we can */ if (io_info->use_select_io != H5D_SELECTION_IO_MODE_ON) { @@ -731,25 +734,6 @@ H5D__mpio_opt_possible(H5D_io_info_t *io_info) HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) } /* end else */ - /* If the selection I/O mode is default (auto), decide here whether it should be on or off */ - if (io_info->use_select_io == H5D_SELECTION_IO_MODE_DEFAULT) { - /* If the only reason(s) we've disabled collective are type conversions and/or transforms, enable - * selection I/O and re-enable collective I/O since it's supported by selection I/O */ - if (global_cause[0] && !(global_cause[0] & ~((unsigned)H5D_MPIO_DATATYPE_CONVERSION | - (unsigned)H5D_MPIO_DATA_TRANSFORMS))) { - assert(!(local_cause[0] & - ~((unsigned)H5D_MPIO_DATATYPE_CONVERSION | (unsigned)H5D_MPIO_DATA_TRANSFORMS))); - local_cause[0] = 0; - global_cause[0] = 0; - io_info->use_select_io = H5D_SELECTION_IO_MODE_ON; - } - else { - /* Otherwise, there's currently no benefit to selection I/O, so leave it off */ - io_info->use_select_io = H5D_SELECTION_IO_MODE_OFF; - io_info->no_selection_io_cause |= H5D_SEL_IO_DEFAULT_OFF; - } - } - /* Set the local & global values of no-collective-cause in the API context */ H5CX_set_mpio_local_no_coll_cause(local_cause[0]); H5CX_set_mpio_global_no_coll_cause(global_cause[0]); @@ -1723,8 +1723,8 @@ H5FDread_selection(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, /* Call private function */ /* (Note compensating for base address addition in internal routine) */ - if (H5FD_read_selection_id(file, type, count, mem_space_ids, file_space_ids, offsets, element_sizes, - bufs) < 0) + if (H5FD_read_selection_id(SKIP_NO_CB, file, type, count, mem_space_ids, file_space_ids, offsets, + element_sizes, bufs) < 0) HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "file selection read request failed"); done: @@ -1820,8 +1820,9 @@ H5FDwrite_selection(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count /* Call private function */ /* (Note compensating for base address addition in internal routine) */ - if (H5FD_write_selection_id(file, type, count, mem_space_ids, file_space_ids, offsets, element_sizes, - bufs) < 0) + + if (H5FD_write_selection_id(SKIP_NO_CB, file, type, count, mem_space_ids, file_space_ids, offsets, + element_sizes, bufs) < 0) HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "file selection write request failed"); done: @@ -1829,6 +1830,399 @@ done: } /* end H5FDwrite_selection() */ /*------------------------------------------------------------------------- + * Purpose: This is similar to H5FDread_selection() with the + * exception noted below. + * + * Perform count reads from the specified file at the + * locations selected in the dataspaces in the file_spaces + * array, with each of those dataspaces starting at the file + * address specified by the corresponding element of the + * offsets array, and with the size of each element in the + * dataspace specified by the corresponding element of the + * element_sizes array. The memory type provided by type is + * the same for all selections. Data read is returned in + * the locations selected in the dataspaces in the + * mem_spaces array, within the buffers provided in the + * corresponding elements of the bufs array. + * + * If i > 0 and element_sizes[i] == 0, presume + * element_sizes[n] = element_sizes[i-1] for all n >= i and + * < count. + * + * Note: + * It will skip selection read call whether the underlying VFD + * supports selection reads or not. + * + * It will translate the selection read to a vector read call + * if vector reads are supported, or a series of scalar read + * calls otherwise. + * + * All reads are done according to the data transfer property + * list dxpl_id (which may be the constant H5P_DEFAULT). + * + * Return: Success: SUCCEED + * All reads have completed successfully, and + * the results have been written into the supplied + * buffers. + * + * Failure: FAIL + * The contents of supplied buffers are undefined. + * + *------------------------------------------------------------------------- + */ +herr_t +H5FDread_vector_from_selection(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, + hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[], + size_t element_sizes[], void *bufs[] /* out */) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_API(FAIL) + H5TRACE9("e", "*#MtiIu*i*i*a*zx", file, type, dxpl_id, count, mem_space_ids, file_space_ids, offsets, + element_sizes, bufs); + + /* Check arguments */ + if (!file) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL"); + + if (!file->cls) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file class pointer cannot be NULL"); + + if ((!mem_space_ids) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "mem_spaces parameter can't be NULL if count is positive"); + + if ((!file_space_ids) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file_spaces parameter can't be NULL if count is positive"); + + if ((!offsets) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "offsets parameter can't be NULL if count is positive"); + + if ((!element_sizes) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, + "element_sizes parameter can't be NULL if count is positive"); + + if ((!bufs) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs parameter can't be NULL if count is positive"); + + if ((count > 0) && (element_sizes[0] == 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sizes[0] can't be 0"); + + if ((count > 0) && (bufs[0] == NULL)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs[0] can't be NULL"); + + /* Get the default dataset transfer property list if the user didn't provide one */ + if (H5P_DEFAULT == dxpl_id) { + dxpl_id = H5P_DATASET_XFER_DEFAULT; + } + else { + if (TRUE != H5P_isa_class(dxpl_id, H5P_DATASET_XFER)) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list"); + } + + /* Call private function */ + /* (Note compensating for base address addition in internal routine) */ + if (H5FD_read_vector_from_selection(file, type, count, mem_space_ids, file_space_ids, offsets, + element_sizes, bufs) < 0) + HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "file selection read request failed"); + +done: + FUNC_LEAVE_API(ret_value) +} /* end H5FDread_vector_from_selection() */ + +/*------------------------------------------------------------------------- + * Purpose: This is similar to H5FDwrite_selection() with the + * exception noted below. + * + * Perform count writes to the specified file at the + * locations selected in the dataspaces in the file_spaces + * array, with each of those dataspaces starting at the file + * address specified by the corresponding element of the + * offsets array, and with the size of each element in the + * dataspace specified by the corresponding element of the + * element_sizes array. The memory type provided by type is + * the same for all selections. Data write is from + * the locations selected in the dataspaces in the + * mem_spaces array, within the buffers provided in the + * corresponding elements of the bufs array. + * + * If i > 0 and element_sizes[i] == 0, presume + * element_sizes[n] = element_sizes[i-1] for all n >= i and + * < count. + * + * Note: + * It will skip selection write call whether the underlying VFD + * supports selection writes or not. + * + * It will translate the selection write to a vector write call + * if vector writes are supported, or a series of scalar write + * calls otherwise. + * + * All writes are done according to the data transfer property + * list dxpl_id (which may be the constant H5P_DEFAULT). + * + * Return: Success: SUCCEED + * All writes have completed successfully + * + * Failure: FAIL + * One or more of the writes failed. + * + *------------------------------------------------------------------------- + */ +herr_t +H5FDwrite_vector_from_selection(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, + hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[], + size_t element_sizes[], const void *bufs[]) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_API(FAIL) + H5TRACE9("e", "*#MtiIu*i*i*a*z**x", file, type, dxpl_id, count, mem_space_ids, file_space_ids, offsets, + element_sizes, bufs); + + /* Check arguments */ + if (!file) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL"); + + if (!file->cls) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file class pointer cannot be NULL"); + + if ((!mem_space_ids) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "mem_spaces parameter can't be NULL if count is positive"); + + if ((!file_space_ids) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file_spaces parameter can't be NULL if count is positive"); + + if ((!offsets) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "offsets parameter can't be NULL if count is positive"); + + if ((!element_sizes) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, + "element_sizes parameter can't be NULL if count is positive"); + + if ((!bufs) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs parameter can't be NULL if count is positive"); + + if ((count > 0) && (element_sizes[0] == 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sizes[0] can't be 0"); + + if ((count > 0) && (bufs[0] == NULL)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs[0] can't be NULL"); + + /* Get the default dataset transfer property list if the user didn't provide one */ + if (H5P_DEFAULT == dxpl_id) { + dxpl_id = H5P_DATASET_XFER_DEFAULT; + } + else { + if (TRUE != H5P_isa_class(dxpl_id, H5P_DATASET_XFER)) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list"); + } + + /* Call private function */ + /* (Note compensating for base address addition in internal routine) */ + if (H5FD_write_vector_from_selection(file, type, count, mem_space_ids, file_space_ids, offsets, + element_sizes, bufs) < 0) + HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "file selection write request failed"); + +done: + FUNC_LEAVE_API(ret_value) +} /* end H5FDwrite_vector_from_selection() */ + +/*------------------------------------------------------------------------- + * Purpose: This is similar to H5FDread_selection() with the + * exception noted below. + * + * Perform count reads from the specified file at the + * locations selected in the dataspaces in the file_spaces + * array, with each of those dataspaces starting at the file + * address specified by the corresponding element of the + * offsets array, and with the size of each element in the + * dataspace specified by the corresponding element of the + * element_sizes array. The memory type provided by type is + * the same for all selections. Data read is returned in + * the locations selected in the dataspaces in the + * mem_spaces array, within the buffers provided in the + * corresponding elements of the bufs array. + * + * If i > 0 and element_sizes[i] == 0, presume + * element_sizes[n] = element_sizes[i-1] for all n >= i and + * < count. + * + * Note: + * It will skip selection and vector read calls whether the underlying + * VFD supports selection and vector reads or not. + * + * It will translate the selection read to a series of + * scalar read calls. + * + * All reads are done according to the data transfer property + * list dxpl_id (which may be the constant H5P_DEFAULT). + * + * Return: Success: SUCCEED + * All reads have completed successfully, and + * the results have been written into the supplied + * buffers. + * + * Failure: FAIL + * The contents of supplied buffers are undefined. + * + *------------------------------------------------------------------------- + */ +herr_t +H5FDread_from_selection(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, hid_t mem_space_ids[], + hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], void *bufs[]) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_API(FAIL) + H5TRACE9("e", "*#MtiIu*i*i*a*z**x", file, type, dxpl_id, count, mem_space_ids, file_space_ids, offsets, + element_sizes, bufs); + + /* Check arguments */ + if (!file) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL"); + + if (!file->cls) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file class pointer cannot be NULL"); + + if ((!mem_space_ids) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "mem_spaces parameter can't be NULL if count is positive"); + + if ((!file_space_ids) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file_spaces parameter can't be NULL if count is positive"); + + if ((!offsets) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "offsets parameter can't be NULL if count is positive"); + + if ((!element_sizes) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, + "element_sizes parameter can't be NULL if count is positive"); + + if ((!bufs) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs parameter can't be NULL if count is positive"); + + if ((count > 0) && (element_sizes[0] == 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sizes[0] can't be 0"); + + if ((count > 0) && (bufs[0] == NULL)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs[0] can't be NULL"); + + /* Get the default dataset transfer property list if the user didn't provide one */ + if (H5P_DEFAULT == dxpl_id) { + dxpl_id = H5P_DATASET_XFER_DEFAULT; + } + else { + if (TRUE != H5P_isa_class(dxpl_id, H5P_DATASET_XFER)) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list"); + } + + /* Call private function */ + /* (Note compensating for base address addition in internal routine) */ + if (H5FD_read_from_selection(file, type, count, mem_space_ids, file_space_ids, offsets, element_sizes, + bufs) < 0) + HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "file selection read request failed"); + +done: + FUNC_LEAVE_API(ret_value) +} /* end H5FDread_from_selection() */ + +/*------------------------------------------------------------------------- + * Purpose: This is similar to H5FDwrite_selection() with the + * exception noted below. + * + * Perform count writes to the specified file at the + * locations selected in the dataspaces in the file_spaces + * array, with each of those dataspaces starting at the file + * address specified by the corresponding element of the + * offsets array, and with the size of each element in the + * dataspace specified by the corresponding element of the + * element_sizes array. The memory type provided by type is + * the same for all selections. Data write is from + * the locations selected in the dataspaces in the + * mem_spaces array, within the buffers provided in the + * corresponding elements of the bufs array. + * + * If i > 0 and element_sizes[i] == 0, presume + * element_sizes[n] = element_sizes[i-1] for all n >= i and + * < count. + * + * Note: + * It will skip selection and vector write calls whether the underlying + * VFD supports selection and vector writes or not. + * + * It will translate the selection write to a series of + * scalar write calls. + * + * All writes are done according to the data transfer property + * list dxpl_id (which may be the constant H5P_DEFAULT). + * + * Return: Success: SUCCEED + * All writes have completed successfully + * + * Failure: FAIL + * One or more of the writes failed. + * + *------------------------------------------------------------------------- + */ +herr_t +H5FDwrite_from_selection(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, hid_t mem_space_ids[], + hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], + const void *bufs[]) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_API(FAIL) + H5TRACE9("e", "*#MtiIu*i*i*a*z**x", file, type, dxpl_id, count, mem_space_ids, file_space_ids, offsets, + element_sizes, bufs); + + /* Check arguments */ + if (!file) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL"); + + if (!file->cls) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file class pointer cannot be NULL"); + + if ((!mem_space_ids) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "mem_spaces parameter can't be NULL if count is positive"); + + if ((!file_space_ids) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file_spaces parameter can't be NULL if count is positive"); + + if ((!offsets) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "offsets parameter can't be NULL if count is positive"); + + if ((!element_sizes) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, + "element_sizes parameter can't be NULL if count is positive"); + + if ((!bufs) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs parameter can't be NULL if count is positive"); + + if ((count > 0) && (element_sizes[0] == 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sizes[0] can't be 0"); + + if ((count > 0) && (bufs[0] == NULL)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs[0] can't be NULL"); + + /* Get the default dataset transfer property list if the user didn't provide one */ + if (H5P_DEFAULT == dxpl_id) { + dxpl_id = H5P_DATASET_XFER_DEFAULT; + } + else { + if (TRUE != H5P_isa_class(dxpl_id, H5P_DATASET_XFER)) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list"); + } + + /* Call private function */ + /* (Note compensating for base address addition in internal routine) */ + if (H5FD_write_from_selection(file, type, count, mem_space_ids, file_space_ids, offsets, element_sizes, + bufs) < 0) + HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "file selection write request failed"); + +done: + FUNC_LEAVE_API(ret_value) +} /* end H5FDwrite_from_selection() */ + +/*------------------------------------------------------------------------- * Function: H5FDflush * * Purpose: Notify driver to flush all cached data. If the driver has no diff --git a/src/H5FDdevelop.h b/src/H5FDdevelop.h index cba2703..75e63b1 100644 --- a/src/H5FDdevelop.h +++ b/src/H5FDdevelop.h @@ -281,6 +281,18 @@ H5_DLL herr_t H5FDread_selection(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, H5_DLL herr_t H5FDwrite_selection(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, hid_t mem_spaces[], hid_t file_spaces[], haddr_t offsets[], size_t element_sizes[], const void *bufs[]); +H5_DLL herr_t H5FDread_vector_from_selection(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, + hid_t mem_spaces[], hid_t file_spaces[], haddr_t offsets[], + size_t element_sizes[], void *bufs[] /* out */); +H5_DLL herr_t H5FDwrite_vector_from_selection(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, + hid_t mem_spaces[], hid_t file_spaces[], haddr_t offsets[], + size_t element_sizes[], const void *bufs[] /* in */); +H5_DLL herr_t H5FDread_from_selection(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, + hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[], + size_t element_sizes[], void *bufs[] /* out */); +H5_DLL herr_t H5FDwrite_from_selection(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, + hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[], + size_t element_sizes[], const void *bufs[] /* in */); H5_DLL herr_t H5FDflush(H5FD_t *file, hid_t dxpl_id, hbool_t closing); H5_DLL herr_t H5FDtruncate(H5FD_t *file, hid_t dxpl_id, hbool_t closing); H5_DLL herr_t H5FDlock(H5FD_t *file, hbool_t rw); diff --git a/src/H5FDint.c b/src/H5FDint.c index 65b1424..fcc1b6c 100644 --- a/src/H5FDint.c +++ b/src/H5FDint.c @@ -62,17 +62,17 @@ /************************************************************************* * - * H5FD_vsrt_tmp_t + * H5FD_srt_tmp_t * - * Structure used to store vector I/O request addresses and the associated + * Structure used to store I/O request addresses and the associated * indexes in the addrs[] array for the purpose of determine the sorted * order. * - * This is done by allocating an array of H5FD_vsrt_tmp_t of length + * This is done by allocating an array of H5FD_srt_tmp_t of length * count, loading it with the contents of the addrs[] array and the * associated indices, and then sorting it. * - * This sorted array of H5FD_vsrt_tmp_t is then used to populate sorted + * This sorted array of H5FD_srt_tmp_t is then used to populate sorted * versions of the types[], addrs[], sizes[] and bufs[] vectors. * * addr: haddr_t containing the value of addrs[i], @@ -82,10 +82,10 @@ * *************************************************************************/ -typedef struct H5FD_vsrt_tmp_t { +typedef struct H5FD_srt_tmp_t { haddr_t addr; size_t index; -} H5FD_vsrt_tmp_t; +} H5FD_srt_tmp_t; /* Information needed for iterating over the registered VFD hid_t IDs. * The name or value of the new VFD that is being registered is stored @@ -109,12 +109,14 @@ typedef struct H5FD_get_driver_ud_t { /* Local Prototypes */ /********************/ static int H5FD__get_driver_cb(void *obj, hid_t id, void *_op_data); -static herr_t H5FD__read_selection_translate(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, - H5S_t **mem_spaces, H5S_t **file_spaces, haddr_t offsets[], - size_t element_sizes[], void *bufs[] /* out */); -static herr_t H5FD__write_selection_translate(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, - H5S_t **mem_spaces, H5S_t **file_spaces, haddr_t offsets[], - size_t element_sizes[], const void *bufs[]); +static herr_t H5FD__read_selection_translate(uint32_t skip_vector_cb, H5FD_t *file, H5FD_mem_t type, + hid_t dxpl_id, uint32_t count, H5S_t **mem_spaces, + H5S_t **file_spaces, haddr_t offsets[], size_t element_sizes[], + void *bufs[] /* out */); +static herr_t H5FD__write_selection_translate(uint32_t skip_vector_cb, H5FD_t *file, H5FD_mem_t type, + hid_t dxpl_id, uint32_t count, H5S_t **mem_spaces, + H5S_t **file_spaces, haddr_t offsets[], size_t element_sizes[], + const void *bufs[]); /*********************/ /* Package Variables */ @@ -734,8 +736,8 @@ done: * Function: H5FD__read_selection_translate * * Purpose: Translates a selection read call to a vector read call if - * vector reads are supported, or a series of scalar read - * calls otherwise. + * vector reads are supported and !skip_vector_cb, + * or a series of scalar read calls otherwise. * * Return: Success: SUCCEED * All reads have completed successfully, and @@ -748,8 +750,8 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD__read_selection_translate(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, - H5S_t **mem_spaces, H5S_t **file_spaces, haddr_t offsets[], +H5FD__read_selection_translate(uint32_t skip_vector_cb, H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, + uint32_t count, H5S_t **mem_spaces, H5S_t **file_spaces, haddr_t offsets[], size_t element_sizes[], void *bufs[] /* out */) { hbool_t extend_sizes = FALSE; @@ -797,7 +799,7 @@ H5FD__read_selection_translate(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uin assert((bufs) || (count == 0)); /* Check if we're using vector I/O */ - use_vector = file->cls->read_vector != NULL; + use_vector = (file->cls->read_vector != NULL) && (!skip_vector_cb); if (count > 0) { /* Verify that the first elements of the element_sizes and bufs arrays are @@ -1063,8 +1065,8 @@ done: * If the underlying VFD supports selection reads, pass the * call through directly. * - * If it doesn't, convert the vector read into a sequence - * of individual reads. + * If it doesn't, convert the selection read into a sequence + * of vector or scalar reads. * * Return: Success: SUCCEED * All reads have completed successfully, and @@ -1189,8 +1191,8 @@ H5FD_read_selection(H5FD_t *file, H5FD_mem_t type, uint32_t count, H5S_t **mem_s /* Otherwise, implement the selection read as a sequence of regular * or vector read calls. */ - if (H5FD__read_selection_translate(file, type, dxpl_id, count, mem_spaces, file_spaces, offsets, - element_sizes, bufs) < 0) + if (H5FD__read_selection_translate(SKIP_NO_CB, file, type, dxpl_id, count, mem_spaces, file_spaces, + offsets, element_sizes, bufs) < 0) HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "translation to vector or scalar read failed"); done: @@ -1228,6 +1230,15 @@ done: * Purpose: Like H5FD_read_selection(), but takes hid_t arrays instead * of H5S_t * arrays for the dataspaces. * + * Depending on the parameter skip_cb which is translated into + * skip_selection_cb and skip_vector_cb: + * + * --If the underlying VFD supports selection reads and !skip_selection_cb, + * pass the call through directly. + * + * --If it doesn't, convert the selection reads into a sequence of vector or + * scalar reads depending on skip_vector_cb. + * * Return: Success: SUCCEED * All reads have completed successfully, and * the results havce been into the supplied @@ -1239,7 +1250,7 @@ done: *------------------------------------------------------------------------- */ herr_t -H5FD_read_selection_id(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_space_ids[], +H5FD_read_selection_id(uint32_t skip_cb, H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], void *bufs[] /* out */) { @@ -1250,6 +1261,8 @@ H5FD_read_selection_id(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_ H5S_t **file_spaces = file_spaces_local; hid_t dxpl_id = H5I_INVALID_HID; /* DXPL for operation */ uint32_t i; + uint32_t skip_selection_cb; + uint32_t skip_vector_cb; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) @@ -1282,6 +1295,9 @@ H5FD_read_selection_id(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_ } #endif /* H5_HAVE_PARALLEL */ + skip_selection_cb = skip_cb & SKIP_SELECTION_CB; + skip_vector_cb = skip_cb & SKIP_VECTOR_CB; + if (file->base_addr > 0) { /* apply the base_addr offset to the offsets array. Must undo before @@ -1320,7 +1336,7 @@ H5FD_read_selection_id(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_ } /* if the underlying VFD supports selection read, make the call */ - if (file->cls->read_selection) { + if (!skip_selection_cb && file->cls->read_selection) { if ((file->cls->read_selection)(file, type, dxpl_id, count, mem_space_ids, file_space_ids, offsets, element_sizes, bufs) < 0) HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "driver read selection request failed"); @@ -1348,8 +1364,9 @@ H5FD_read_selection_id(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_ } /* Translate to vector or scalar I/O */ - if (H5FD__read_selection_translate(file, type, dxpl_id, count, mem_spaces, file_spaces, offsets, - element_sizes, bufs) < 0) + + if (H5FD__read_selection_translate(skip_vector_cb, file, type, dxpl_id, count, mem_spaces, + file_spaces, offsets, element_sizes, bufs) < 0) HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "translation to vector or scalar read failed"); } @@ -1378,8 +1395,8 @@ done: * Function: H5FD__write_selection_translate * * Purpose: Translates a selection write call to a vector write call - * if vector writes are supported, or a series of scalar - * write calls otherwise. + * if vector writes are supported and !skip_vector_cb, + * or a series of scalar write calls otherwise. * * Return: Success: SUCCEED * All writes have completed successfully. @@ -1390,8 +1407,8 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD__write_selection_translate(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, uint32_t count, - H5S_t **mem_spaces, H5S_t **file_spaces, haddr_t offsets[], +H5FD__write_selection_translate(uint32_t skip_vector_cb, H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, + uint32_t count, H5S_t **mem_spaces, H5S_t **file_spaces, haddr_t offsets[], size_t element_sizes[], const void *bufs[]) { hbool_t extend_sizes = FALSE; @@ -1439,7 +1456,7 @@ H5FD__write_selection_translate(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, ui assert((bufs) || (count == 0)); /* Check if we're using vector I/O */ - use_vector = file->cls->write_vector != NULL; + use_vector = (file->cls->write_vector != NULL) && (!skip_vector_cb); if (count > 0) { /* Verify that the first elements of the element_sizes and bufs arrays are @@ -1705,8 +1722,8 @@ done: * If the underlying VFD supports selection writes, pass the * call through directly. * - * If it doesn't, convert the vector write into a sequence - * of individual writes. + * If it doesn't, convert the selection write into a sequence + * of vector or scalar writes. * * Return: Success: SUCCEED * All writes have completed successfully. @@ -1823,8 +1840,9 @@ H5FD_write_selection(H5FD_t *file, H5FD_mem_t type, uint32_t count, H5S_t **mem_ /* Otherwise, implement the selection write as a sequence of regular * or vector write calls. */ - if (H5FD__write_selection_translate(file, type, dxpl_id, count, mem_spaces, file_spaces, offsets, - element_sizes, bufs) < 0) + + if (H5FD__write_selection_translate(SKIP_NO_CB, file, type, dxpl_id, count, mem_spaces, file_spaces, + offsets, element_sizes, bufs) < 0) HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "translation to vector or scalar write failed"); done: @@ -1862,6 +1880,15 @@ done: * Purpose: Like H5FD_write_selection(), but takes hid_t arrays * instead of H5S_t * arrays for the dataspaces. * + * Depending on the parameter skip_cb which is translated into + * skip_selection_cb and skip_vector_cb: + * + * --If the underlying VFD supports selection writes and !skip_selection_cb, + * pass the call through directly. + * + * --If it doesn't, convert the selection writes into a sequence of vector or + * scalar reads depending on skip_vector_cb. + * * Return: Success: SUCCEED * All writes have completed successfully. * @@ -1871,8 +1898,9 @@ done: *------------------------------------------------------------------------- */ herr_t -H5FD_write_selection_id(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_space_ids[], - hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], const void *bufs[]) +H5FD_write_selection_id(uint32_t skip_cb, H5FD_t *file, H5FD_mem_t type, uint32_t count, + hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[], + size_t element_sizes[], const void *bufs[]) { hbool_t offsets_cooked = FALSE; H5S_t *mem_spaces_local[H5FD_LOCAL_SEL_ARR_LEN]; @@ -1881,6 +1909,8 @@ H5FD_write_selection_id(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem H5S_t **file_spaces = file_spaces_local; hid_t dxpl_id = H5I_INVALID_HID; /* DXPL for operation */ uint32_t i; + uint32_t skip_selection_cb; + uint32_t skip_vector_cb; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) @@ -1913,6 +1943,9 @@ H5FD_write_selection_id(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem } #endif /* H5_HAVE_PARALLEL */ + skip_selection_cb = skip_cb & SKIP_SELECTION_CB; + skip_vector_cb = skip_cb & SKIP_VECTOR_CB; + if (file->base_addr > 0) { /* apply the base_addr offset to the offsets array. Must undo before @@ -1945,7 +1978,7 @@ H5FD_write_selection_id(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem } /* if the underlying VFD supports selection write, make the call */ - if (file->cls->write_selection) { + if (!skip_selection_cb && file->cls->write_selection) { if ((file->cls->write_selection)(file, type, dxpl_id, count, mem_space_ids, file_space_ids, offsets, element_sizes, bufs) < 0) HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "driver write selection request failed"); @@ -1973,8 +2006,9 @@ H5FD_write_selection_id(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem } /* Translate to vector or scalar I/O */ - if (H5FD__write_selection_translate(file, type, dxpl_id, count, mem_spaces, file_spaces, offsets, - element_sizes, bufs) < 0) + + if (H5FD__write_selection_translate(skip_vector_cb, file, type, dxpl_id, count, mem_spaces, + file_spaces, offsets, element_sizes, bufs) < 0) HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "translation to vector or scalar write failed"); } @@ -2000,6 +2034,209 @@ done: } /* end H5FD_write_selection_id() */ /*------------------------------------------------------------------------- + * Function: H5FD_read_vector_from_selection + * + * Purpose: Internal routine for H5FDread_vector_from_selection() + * + * It will translate the selection read to a vector read call + * if vector reads are supported, or a series of scalar read + * calls otherwise. + * + * Return: Success: SUCCEED + * All writes have completed successfully. + * + * Failure: FAIL + * One or more writes failed. + * + *------------------------------------------------------------------------- + */ +herr_t +H5FD_read_vector_from_selection(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_space_ids[], + hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], + void *bufs[]) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + assert(file); + assert(file->cls); + assert((mem_space_ids) || (count == 0)); + assert((file_space_ids) || (count == 0)); + assert((offsets) || (count == 0)); + assert((element_sizes) || (count == 0)); + assert((bufs) || (count == 0)); + + /* Verify that the first elements of the element_sizes and bufs arrays are + * valid. */ + assert((count == 0) || (element_sizes[0] != 0)); + assert((count == 0) || (bufs[0] != NULL)); + + /* Call private function */ + /* (Note compensating for base address addition in internal routine) */ + if (H5FD_read_selection_id(SKIP_SELECTION_CB, file, type, count, mem_space_ids, file_space_ids, offsets, + element_sizes, bufs) < 0) + HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "file selection read request failed"); + +done: + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FD_read_vector_from_selection() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_write_vector_from_selection + * + * Purpose: Internal routine for H5FDwrite_vector_from_selection() + * + * It will translate the selection write to a vector write call + * if vector writes are supported, or a series of scalar write + * calls otherwise. + * + * Return: Success: SUCCEED + * All writes have completed successfully. + * + * Failure: FAIL + * One or more writes failed. + * + *------------------------------------------------------------------------- + */ +herr_t +H5FD_write_vector_from_selection(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_space_ids[], + hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], + const void *bufs[]) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + assert(file); + assert(file->cls); + assert((mem_space_ids) || (count == 0)); + assert((file_space_ids) || (count == 0)); + assert((offsets) || (count == 0)); + assert((element_sizes) || (count == 0)); + assert((bufs) || (count == 0)); + + /* Verify that the first elements of the element_sizes and bufs arrays are + * valid. */ + assert((count == 0) || (element_sizes[0] != 0)); + assert((count == 0) || (bufs[0] != NULL)); + + /* Call private function */ + /* (Note compensating for base address addition in internal routine) */ + if (H5FD_write_selection_id(SKIP_SELECTION_CB, file, type, count, mem_space_ids, file_space_ids, offsets, + element_sizes, bufs) < 0) + HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "file selection write request failed"); + +done: + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FD_write_vector_from_selection() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_read_from_selection + * + * Purpose: Internal routine for H5FDread_from_selection() + * + * It will translate the selection read to a series of + * scalar read calls. + * + * Return: Success: SUCCEED + * All writes have completed successfully. + * + * Failure: FAIL + * One or more writes failed. + * + *------------------------------------------------------------------------- + */ +herr_t +H5FD_read_from_selection(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_space_ids[], + hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], void *bufs[]) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + assert(file); + assert(file->cls); + assert((mem_space_ids) || (count == 0)); + assert((file_space_ids) || (count == 0)); + assert((offsets) || (count == 0)); + assert((element_sizes) || (count == 0)); + assert((bufs) || (count == 0)); + + /* Verify that the first elements of the element_sizes and bufs arrays are + * valid. */ + assert((count == 0) || (element_sizes[0] != 0)); + assert((count == 0) || (bufs[0] != NULL)); + + /* Call private function */ + /* (Note compensating for base address addition in internal routine) */ + if (H5FD_read_selection_id(SKIP_SELECTION_CB | SKIP_VECTOR_CB, file, type, count, mem_space_ids, + file_space_ids, offsets, element_sizes, bufs) < 0) + HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "file selection read request failed"); + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FD_read_from_selection() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_write_from_selection + * + * Purpose: Internal routine for H5FDwrite_from_selection() + * + * It will translate the selection write to a series of + * scalar write calls. + * + * Return: Success: SUCCEED + * All writes have completed successfully. + * + * Failure: FAIL + * One or more writes failed. + * + *------------------------------------------------------------------------- + */ +herr_t +H5FD_write_from_selection(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_space_ids[], + hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], + const void *bufs[]) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + assert(file); + assert(file->cls); + assert((mem_space_ids) || (count == 0)); + assert((file_space_ids) || (count == 0)); + assert((offsets) || (count == 0)); + assert((element_sizes) || (count == 0)); + assert((bufs) || (count == 0)); + + /* Verify that the first elements of the element_sizes and bufs arrays are + * valid. */ + assert((count == 0) || (element_sizes[0] != 0)); + assert((count == 0) || (bufs[0] != NULL)); + + /* Call private function */ + /* (Note compensating for base address addition in internal routine) */ + if (H5FD_write_selection_id(SKIP_SELECTION_CB | SKIP_VECTOR_CB, file, type, count, mem_space_ids, + file_space_ids, offsets, element_sizes, bufs) < 0) + HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "file selection write request failed"); + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FD_write_from_selection() */ + +/*------------------------------------------------------------------------- * Function: H5FD_set_eoa * * Purpose: Private version of H5FDset_eoa() @@ -2138,6 +2375,119 @@ H5FD_driver_query(const H5FD_class_t *driver, unsigned long *flags /*out*/) FUNC_LEAVE_NOAPI(ret_value) } /* end H5FD_driver_query() */ +/*------------------------------------------------------------------------ + * Function: H5FD__vstr_tmp_cmp() + * + * Purpose: This is the comparison callback function used by qsort() + * in H5FD__sort_io_req_real( ) + * + *------------------------------------------------------------------------- + */ +static int +H5FD__srt_tmp_cmp(const void *element_1, const void *element_2) +{ + haddr_t addr_1 = ((const H5FD_srt_tmp_t *)element_1)->addr; + haddr_t addr_2 = ((const H5FD_srt_tmp_t *)element_2)->addr; + int ret_value = 0; /* Return value */ + + FUNC_ENTER_PACKAGE_NOERR + + /* Sanity checks */ + assert(H5_addr_defined(addr_1)); + assert(H5_addr_defined(addr_2)); + + /* Compare the addresses */ + if (H5_addr_gt(addr_1, addr_2)) + ret_value = 1; + else if (H5_addr_lt(addr_1, addr_2)) + ret_value = -1; + + FUNC_LEAVE_NOAPI(ret_value) +} /* H5FD__srt_tmp_cmp() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__sort_io_req_real() + * + * Purpose: Scan the addrs array to see if it is sorted. + * + * If sorted, return TRUE in *was_sorted. + * + * If not sorted, use qsort() to sort the array. + * Do this by allocating an array of struct H5FD_srt_tmp_t, + * where each instance of H5FD_srt_tmp_t has two fields, + * addr and index. Load the array with the contents of the + * addrs array and the index of the associated entry. + * Then sort the array using qsort(). + * Return *FALSE in was_sorted. + * + * This is a common routine used by: + * --H5FD_sort_vector_io_req () + * --H5FD_sort_selection_io_req() + * + * Return: SUCCEED/FAIL + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__sort_io_req_real(size_t count, haddr_t *addrs, hbool_t *was_sorted, struct H5FD_srt_tmp_t **srt_tmp) +{ + size_t i; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_PACKAGE + + /* Sanity checks */ + + /* scan the offsets array to see if it is sorted */ + for (i = 1; i < count; i++) { + assert(H5_addr_defined(addrs[i - 1])); + + if (H5_addr_gt(addrs[i - 1], addrs[i])) + break; + else if (H5_addr_eq(addrs[i - 1], addrs[i])) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "duplicate addr in selections"); + } + + /* if we traversed the entire array without breaking out, then + * the array was already sorted */ + if (i >= count) + *was_sorted = TRUE; + else + *was_sorted = FALSE; + + if (!(*was_sorted)) { + size_t srt_tmp_size; + + srt_tmp_size = (count * sizeof(struct H5FD_srt_tmp_t)); + + if (NULL == (*srt_tmp = (H5FD_srt_tmp_t *)malloc(srt_tmp_size))) + + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc srt_tmp"); + + for (i = 0; i < count; i++) { + (*srt_tmp)[i].addr = addrs[i]; + (*srt_tmp)[i].index = i; + } + + /* sort the srt_tmp array */ + qsort(*srt_tmp, count, sizeof(struct H5FD_srt_tmp_t), H5FD__srt_tmp_cmp); + + /* verify no duplicate entries */ + i = 1; + + for (i = 1; i < count; i++) { + assert(H5_addr_lt((*srt_tmp)[i - 1].addr, (*srt_tmp)[i].addr)); + + if (H5_addr_eq(addrs[i - 1], addrs[i])) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "duplicate addrs in array"); + } + } + +done: + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5FD__sort_io_req_real() */ + /*------------------------------------------------------------------------- * Function: H5FD_sort_vector_io_req * @@ -2169,38 +2519,15 @@ H5FD_driver_query(const H5FD_class_t *driver, unsigned long *flags /*out*/) * *------------------------------------------------------------------------- */ - -static int -H5FD__vsrt_tmp_cmp(const void *element_1, const void *element_2) -{ - haddr_t addr_1 = ((const H5FD_vsrt_tmp_t *)element_1)->addr; - haddr_t addr_2 = ((const H5FD_vsrt_tmp_t *)element_2)->addr; - int ret_value = 0; /* Return value */ - - FUNC_ENTER_PACKAGE_NOERR - - /* Sanity checks */ - assert(H5_addr_defined(addr_1)); - assert(H5_addr_defined(addr_2)); - - /* Compare the addresses */ - if (H5_addr_gt(addr_1, addr_2)) - ret_value = 1; - else if (H5_addr_lt(addr_1, addr_2)) - ret_value = -1; - - FUNC_LEAVE_NOAPI(ret_value) -} /* H5FD__vsrt_tmp_cmp() */ - herr_t H5FD_sort_vector_io_req(hbool_t *vector_was_sorted, uint32_t _count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], H5_flexible_const_ptr_t bufs[], H5FD_mem_t **s_types_ptr, haddr_t **s_addrs_ptr, size_t **s_sizes_ptr, H5_flexible_const_ptr_t **s_bufs_ptr) { - herr_t ret_value = SUCCEED; /* Return value */ - size_t count = (size_t)_count; - size_t i; - struct H5FD_vsrt_tmp_t *srt_tmp = NULL; + herr_t ret_value = SUCCEED; /* Return value */ + size_t count = (size_t)_count; + size_t i; + struct H5FD_srt_tmp_t *srt_tmp = NULL; FUNC_ENTER_NOAPI(FAIL) @@ -2224,22 +2551,12 @@ H5FD_sort_vector_io_req(hbool_t *vector_was_sorted, uint32_t _count, H5FD_mem_t assert((count == 0) || ((s_sizes_ptr) && (NULL == *s_sizes_ptr))); assert((count == 0) || ((s_bufs_ptr) && (NULL == *s_bufs_ptr))); - /* scan the addrs array to see if it is sorted */ - for (i = 1; i < count; i++) { - assert(H5_addr_defined(addrs[i - 1])); - - if (H5_addr_gt(addrs[i - 1], addrs[i])) - break; - else if (H5_addr_eq(addrs[i - 1], addrs[i])) - HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "duplicate addr in vector"); - } - - /* if we traversed the entire array without breaking out, then - * the array was already sorted */ - if (i >= count) - *vector_was_sorted = TRUE; - else - *vector_was_sorted = FALSE; + /* Sort the addrs array in increasing addr order, while + * maintaining the association between each addr, and the + * sizes[], types[], and bufs[] values at the same index. + */ + if (H5FD__sort_io_req_real(count, addrs, vector_was_sorted, &srt_tmp) < 0) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sorting error in selection offsets"); if (*vector_was_sorted) { @@ -2250,46 +2567,14 @@ H5FD_sort_vector_io_req(hbool_t *vector_was_sorted, uint32_t _count, H5FD_mem_t } else { - /* must sort the addrs array in increasing addr order, while - * maintaining the association between each addr, and the - * sizes[], types[], and bufs[] values at the same index. - * - * Do this by allocating an array of struct H5FD_vsrt_tmp_t, where - * each instance of H5FD_vsrt_tmp_t has two fields, addr and index. - * Load the array with the contents of the addrs array and - * the index of the associated entry. Sort the array, allocate - * the s_types_ptr, s_addrs_ptr, s_sizes_ptr, and s_bufs_ptr + /* + * Allocate the s_types_ptr, s_addrs_ptr, s_sizes_ptr, and s_bufs_ptr * arrays and populate them using the mapping provided by - * the sorted array of H5FD_vsrt_tmp_t. + * the sorted array of H5FD_srt_tmp_t. */ size_t j; size_t fixed_size_index = count; size_t fixed_type_index = count; - size_t srt_tmp_size; - - srt_tmp_size = (count * sizeof(struct H5FD_vsrt_tmp_t)); - - if (NULL == (srt_tmp = (H5FD_vsrt_tmp_t *)malloc(srt_tmp_size))) - - HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc srt_tmp"); - - for (i = 0; i < count; i++) { - srt_tmp[i].addr = addrs[i]; - srt_tmp[i].index = i; - } - - /* sort the srt_tmp array */ - qsort(srt_tmp, count, sizeof(struct H5FD_vsrt_tmp_t), H5FD__vsrt_tmp_cmp); - - /* verify no duplicate entries */ - i = 1; - - for (i = 1; i < count; i++) { - assert(H5_addr_lt(srt_tmp[i - 1].addr, srt_tmp[i].addr)); - - if (H5_addr_eq(addrs[i - 1], addrs[i])) - HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "duplicate addr in vector"); - } if ((NULL == (*s_types_ptr = (H5FD_mem_t *)malloc(count * sizeof(H5FD_mem_t)))) || (NULL == (*s_addrs_ptr = (haddr_t *)malloc(count * sizeof(haddr_t)))) || @@ -2379,6 +2664,187 @@ done: } /* end H5FD_sort_vector_io_req() */ /*------------------------------------------------------------------------- + * Purpose: Determine whether the supplied selection I/O request is + * sorted. + * + * if is is, set *selection_was_sorted to TRUE, set: + * + * *s_mem_space_ids_ptr = mem_space_ids; + * *s_file_space_ids_ptr = file_space_ids; + * *s_offsets_ptr = offsets; + * *s_element_sizes_ptr = element_sizes; + * *s_bufs_ptr = bufs; + * + * and return. + * + * If it is not sorted, duplicate the mem_space_ids, file_space_ids, + * offsets, element_sizes and bufs arrays, storing the base + * addresses of the new arrays in *s_mem_space_ids_ptr, + * s_file_space_ids_ptr, s_offsets_ptr, *s_element_sizes_ptr, + * and s_bufs_ptr respectively. Determine the sorted order + * of the selection I/O request, and load it into the new + * selections in sorted order. + * + * Note that in this case, it is the caller's responsibility + * to free the sorted vectors. + * + * Return: SUCCEED/FAIL + * + *------------------------------------------------------------------------- + */ +herr_t +H5FD_sort_selection_io_req(hbool_t *selection_was_sorted, size_t count, hid_t mem_space_ids[], + hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], + H5_flexible_const_ptr_t bufs[], hid_t **s_mem_space_ids_ptr, + hid_t **s_file_space_ids_ptr, haddr_t **s_offsets_ptr, + size_t **s_element_sizes_ptr, H5_flexible_const_ptr_t **s_bufs_ptr) +{ + size_t i; + struct H5FD_srt_tmp_t *srt_tmp = NULL; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + + assert(selection_was_sorted); + + assert((mem_space_ids) || (count == 0)); + assert((file_space_ids) || (count == 0)); + assert((offsets) || (count == 0)); + assert((element_sizes) || (count == 0)); + assert((bufs) || (count == 0)); + + /* verify that the first elements of the element_sizes and bufs arrays are + * valid. + */ + assert((count == 0) || (element_sizes[0] != 0)); + assert((count == 0) || (bufs[0].cvp != NULL)); + + assert((count == 0) || ((s_mem_space_ids_ptr) && (NULL == *s_mem_space_ids_ptr))); + assert((count == 0) || ((s_file_space_ids_ptr) && (NULL == *s_file_space_ids_ptr))); + assert((count == 0) || ((s_offsets_ptr) && (NULL == *s_offsets_ptr))); + assert((count == 0) || ((s_element_sizes_ptr) && (NULL == *s_element_sizes_ptr))); + assert((count == 0) || ((s_bufs_ptr) && (NULL == *s_bufs_ptr))); + + /* Sort the offsets array in increasing offset order, while + * maintaining the association between each offset, and the + * mem_space_ids[], file_space_ids[], element_sizes and bufs[] + * values at the same index. + */ + if (H5FD__sort_io_req_real(count, offsets, selection_was_sorted, &srt_tmp) < 0) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sorting error in selection offsets"); + + if (*selection_was_sorted) { + + *s_mem_space_ids_ptr = mem_space_ids; + *s_file_space_ids_ptr = file_space_ids; + *s_offsets_ptr = offsets; + *s_element_sizes_ptr = element_sizes; + *s_bufs_ptr = bufs; + } + else { + + /* + * Allocate the s_mem_space_ids_ptr, s_file_space_ids_ptr, s_offsets_ptr, + * s_element_sizes_ptr and s_bufs_ptr arrays and populate them using the + * mapping provided by the sorted array of H5FD_srt_tmp_t. + */ + size_t j; + size_t fixed_element_sizes_index = count; + size_t fixed_bufs_index = count; + + if ((NULL == (*s_mem_space_ids_ptr = (hid_t *)malloc(count * sizeof(hid_t)))) || + (NULL == (*s_file_space_ids_ptr = (hid_t *)malloc(count * sizeof(hid_t)))) || + (NULL == (*s_offsets_ptr = (haddr_t *)malloc(count * sizeof(haddr_t)))) || + (NULL == (*s_element_sizes_ptr = (size_t *)malloc(count * sizeof(size_t)))) || + (NULL == + (*s_bufs_ptr = (H5_flexible_const_ptr_t *)malloc(count * sizeof(H5_flexible_const_ptr_t))))) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sorted selection(s)"); + } + + assert(element_sizes[0] != 0); + assert(bufs[0].cvp != NULL); + + /* Scan the element_sizes and bufs array to determine if the fixed + * element_sizes / bufs optimization is in use, and if so, to determine + * the index of the last valid value on each array. + * We have already verified that the first + * elements of these arrays are valid so we can start at the second + * element (if it exists). + */ + for (i = 1; i < count && ((fixed_element_sizes_index == count) || (fixed_bufs_index == count)); i++) { + if ((fixed_element_sizes_index == count) && (element_sizes[i] == 0)) + fixed_element_sizes_index = i - 1; + if ((fixed_bufs_index == count) && (bufs[i].cvp == NULL)) + fixed_bufs_index = i - 1; + } + + assert(fixed_element_sizes_index <= count); + assert(fixed_bufs_index <= count); + + /* Populate the sorted arrays. Note that the index stored in srt_tmp + * refers to the index in the unsorted array, while the position of + * srt_tmp within the sorted array is the index in the sorted arrays */ + for (i = 0; i < count; i++) { + + j = srt_tmp[i].index; + + (*s_mem_space_ids_ptr)[i] = mem_space_ids[j]; + (*s_file_space_ids_ptr)[i] = file_space_ids[j]; + (*s_offsets_ptr)[i] = offsets[j]; + (*s_element_sizes_ptr)[i] = element_sizes[MIN(j, fixed_element_sizes_index)]; + (*s_bufs_ptr)[i] = bufs[MIN(j, fixed_bufs_index)]; + } + } + +done: + if (srt_tmp) { + free(srt_tmp); + srt_tmp = NULL; + } + + /* On failure, free the sorted arrays if they were allocated. + * Note that we only allocate these arrays if the original array + * was not sorted -- thus we check both for failure, and for + * the flag indicating that the original array was not sorted + * in increasing address order. + */ + if ((ret_value != SUCCEED) && (!(*selection_was_sorted))) { + + /* free space allocated for sorted arrays */ + if (*s_mem_space_ids_ptr) { + free(*s_mem_space_ids_ptr); + *s_mem_space_ids_ptr = NULL; + } + + if (*s_file_space_ids_ptr) { + free(*s_file_space_ids_ptr); + *s_file_space_ids_ptr = NULL; + } + + if (*s_offsets_ptr) { + free(*s_offsets_ptr); + *s_offsets_ptr = NULL; + } + + if (*s_element_sizes_ptr) { + free(*s_element_sizes_ptr); + *s_element_sizes_ptr = NULL; + } + + if (*s_bufs_ptr) { + free(*s_bufs_ptr); + *s_bufs_ptr = NULL; + } + } + + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FD_sort_selection_io_req() */ + +/*------------------------------------------------------------------------- * Function: H5FD_delete * * Purpose: Private version of H5FDdelete() diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index a76b183..aa83be4 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -88,11 +88,20 @@ static herr_t H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_i static herr_t H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], const void *bufs[]); -static herr_t H5FD__mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); -static herr_t H5FD__mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); -static herr_t H5FD__mpio_delete(const char *filename, hid_t fapl_id); -static herr_t H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void *input, - void **output); + +static herr_t H5FD__mpio_read_selection(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, + size_t count, hid_t mem_space_ids[], hid_t file_space_ids[], + haddr_t offsets[], size_t element_sizes[], void *bufs[]); + +static herr_t H5FD__mpio_write_selection(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, + size_t count, hid_t mem_space_ids[], hid_t file_space_ids[], + haddr_t offsets[], size_t element_sizes[], const void *bufs[]); + +static herr_t H5FD__mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); +static herr_t H5FD__mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); +static herr_t H5FD__mpio_delete(const char *filename, hid_t fapl_id); +static herr_t H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void *input, + void **output); /* Other functions */ static herr_t H5FD__mpio_vector_build_types( @@ -101,48 +110,55 @@ static herr_t H5FD__mpio_vector_build_types( MPI_Offset *mpi_off, H5_flexible_const_ptr_t *mpi_bufs_base, int *size_i, MPI_Datatype *buf_type, hbool_t *buf_type_created, MPI_Datatype *file_type, hbool_t *file_type_created, char *unused); +static herr_t H5FD__selection_build_types(hbool_t io_op_write, size_t num_pieces, H5_flexible_const_ptr_t mbb, + H5S_t **file_spaces, H5S_t **mem_spaces, haddr_t offsets[], + H5_flexible_const_ptr_t bufs[], size_t src_element_sizes[], + size_t dst_element_sizes[], MPI_Datatype *final_ftype, + hbool_t *final_ftype_is_derived, MPI_Datatype *final_mtype, + hbool_t *final_mtype_is_derived); + /* The MPIO file driver information */ static const H5FD_class_t H5FD_mpio_g = { - H5FD_CLASS_VERSION, /* struct version */ - H5_VFD_MPIO, /* value */ - "mpio", /* name */ - HADDR_MAX, /* maxaddr */ - H5F_CLOSE_SEMI, /* fc_degree */ - H5FD__mpio_term, /* terminate */ - NULL, /* sb_size */ - NULL, /* sb_encode */ - NULL, /* sb_decode */ - 0, /* fapl_size */ - NULL, /* fapl_get */ - NULL, /* fapl_copy */ - NULL, /* fapl_free */ - 0, /* dxpl_size */ - NULL, /* dxpl_copy */ - NULL, /* dxpl_free */ - H5FD__mpio_open, /* open */ - H5FD__mpio_close, /* close */ - NULL, /* cmp */ - H5FD__mpio_query, /* query */ - NULL, /* get_type_map */ - NULL, /* alloc */ - NULL, /* free */ - H5FD__mpio_get_eoa, /* get_eoa */ - H5FD__mpio_set_eoa, /* set_eoa */ - H5FD__mpio_get_eof, /* get_eof */ - H5FD__mpio_get_handle, /* get_handle */ - H5FD__mpio_read, /* read */ - H5FD__mpio_write, /* write */ - H5FD__mpio_read_vector, /* read_vector */ - H5FD__mpio_write_vector, /* write_vector */ - NULL, /* read_selection */ - NULL, /* write_selection */ - H5FD__mpio_flush, /* flush */ - H5FD__mpio_truncate, /* truncate */ - NULL, /* lock */ - NULL, /* unlock */ - H5FD__mpio_delete, /* del */ - H5FD__mpio_ctl, /* ctl */ - H5FD_FLMAP_DICHOTOMY /* fl_map */ + H5FD_CLASS_VERSION, /* struct version */ + H5_VFD_MPIO, /* value */ + "mpio", /* name */ + HADDR_MAX, /* maxaddr */ + H5F_CLOSE_SEMI, /* fc_degree */ + H5FD__mpio_term, /* terminate */ + NULL, /* sb_size */ + NULL, /* sb_encode */ + NULL, /* sb_decode */ + 0, /* fapl_size */ + NULL, /* fapl_get */ + NULL, /* fapl_copy */ + NULL, /* fapl_free */ + 0, /* dxpl_size */ + NULL, /* dxpl_copy */ + NULL, /* dxpl_free */ + H5FD__mpio_open, /* open */ + H5FD__mpio_close, /* close */ + NULL, /* cmp */ + H5FD__mpio_query, /* query */ + NULL, /* get_type_map */ + NULL, /* alloc */ + NULL, /* free */ + H5FD__mpio_get_eoa, /* get_eoa */ + H5FD__mpio_set_eoa, /* set_eoa */ + H5FD__mpio_get_eof, /* get_eof */ + H5FD__mpio_get_handle, /* get_handle */ + H5FD__mpio_read, /* read */ + H5FD__mpio_write, /* write */ + H5FD__mpio_read_vector, /* read_vector */ + H5FD__mpio_write_vector, /* write_vector */ + H5FD__mpio_read_selection, /* read_selection */ + H5FD__mpio_write_selection, /* write_selection */ + H5FD__mpio_flush, /* flush */ + H5FD__mpio_truncate, /* truncate */ + NULL, /* lock */ + NULL, /* unlock */ + H5FD__mpio_delete, /* del */ + H5FD__mpio_ctl, /* ctl */ + H5FD_FLMAP_DICHOTOMY /* fl_map */ }; #ifdef H5FDmpio_DEBUG @@ -2717,6 +2733,926 @@ done: } /* end H5FD__mpio_write_vector() */ /*------------------------------------------------------------------------- + * Function: H5FD__selection_build_types + * + * Purpose: Build MPI derived datatype for each piece and then + * build MPI final derived datatype for file and memory. + * + * Note: This is derived from H5D__link_piece_collective_io() in + * src/H5Dmpio.c. + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__selection_build_types(hbool_t io_op_write, size_t num_pieces, H5_flexible_const_ptr_t mbb, + H5S_t **file_spaces, H5S_t **mem_spaces, haddr_t offsets[], + H5_flexible_const_ptr_t bufs[], size_t src_element_sizes[], + size_t dst_element_sizes[], MPI_Datatype *final_ftype, + hbool_t *final_ftype_is_derived, MPI_Datatype *final_mtype, + hbool_t *final_mtype_is_derived) +{ + + MPI_Datatype *piece_mtype = NULL; + MPI_Datatype *piece_ftype = NULL; + MPI_Aint *piece_file_disp_array = NULL; + MPI_Aint *piece_mem_disp_array = NULL; + hbool_t *piece_mft_is_derived_array = + NULL; /* Flags to indicate each piece's MPI file datatype is derived */ + ; + hbool_t *piece_mmt_is_derived_array = + NULL; /* Flags to indicate each piece's MPI memory datatype is derived */ + int *piece_mpi_file_counts = NULL; /* Count of MPI file datatype for each piece */ + int *piece_mpi_mem_counts = NULL; /* Count of MPI memory datatype for each piece */ + + haddr_t base_file_addr; + size_t i; /* Local index variable */ + int mpi_code; /* MPI return code */ + + hbool_t extend_src_sizes = FALSE; + hbool_t extend_dst_sizes = FALSE; + hbool_t extend_bufs = FALSE; + H5_flexible_const_ptr_t buf; + size_t src_element_size, dst_element_size; + + herr_t ret_value = SUCCEED; + + FUNC_ENTER_PACKAGE + + /* Allocate information for num_pieces */ + if (NULL == (piece_mtype = (MPI_Datatype *)H5MM_malloc(num_pieces * sizeof(MPI_Datatype)))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece memory datatype buffer"); + if (NULL == (piece_ftype = (MPI_Datatype *)H5MM_malloc(num_pieces * sizeof(MPI_Datatype)))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece file datatype buffer"); + if (NULL == (piece_file_disp_array = (MPI_Aint *)H5MM_malloc(num_pieces * sizeof(MPI_Aint)))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece file displacement buffer"); + if (NULL == (piece_mem_disp_array = (MPI_Aint *)H5MM_calloc(num_pieces * sizeof(MPI_Aint)))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece memory displacement buffer"); + if (NULL == (piece_mpi_mem_counts = (int *)H5MM_calloc(num_pieces * sizeof(int)))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece memory counts buffer"); + if (NULL == (piece_mpi_file_counts = (int *)H5MM_calloc(num_pieces * sizeof(int)))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece file counts buffer"); + if (NULL == (piece_mmt_is_derived_array = (hbool_t *)H5MM_calloc(num_pieces * sizeof(hbool_t)))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, + "couldn't allocate piece memory is derived datatype flags buffer"); + if (NULL == (piece_mft_is_derived_array = (hbool_t *)H5MM_calloc(num_pieces * sizeof(hbool_t)))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, + "couldn't allocate piece file is derived datatype flags buffer"); + + /* save lowest file address */ + base_file_addr = offsets[0]; + + /* Obtain MPI derived datatype from all individual pieces */ + /* Iterate over selected pieces for this process */ + for (i = 0; i < num_pieces; i++) { + hsize_t *permute_map = NULL; /* array that holds the mapping from the old, + out-of-order displacements to the in-order + displacements of the MPI datatypes of the + point selection of the file space */ + hbool_t is_permuted = FALSE; + + if (!extend_src_sizes) { + if (src_element_sizes[i] == 0) { + extend_src_sizes = TRUE; + src_element_size = src_element_sizes[i - 1]; + } + else + src_element_size = src_element_sizes[i]; + } + + if (!extend_dst_sizes) { + if (dst_element_sizes[i] == 0) { + extend_dst_sizes = TRUE; + dst_element_size = dst_element_sizes[i - 1]; + } + else + dst_element_size = src_element_sizes[i]; + } + + if (!extend_bufs) { + if (bufs[i].cvp == NULL) { + extend_bufs = TRUE; + buf = bufs[i - 1]; + } + else + buf = bufs[i]; + } + + /* Obtain disk and memory MPI derived datatype */ + /* NOTE: The permute_map array can be allocated within H5S_mpio_space_type + * and will be fed into the next call to H5S_mpio_space_type + * where it will be freed. + */ + if (H5S_mpio_space_type(file_spaces[i], src_element_size, &piece_ftype[i], /* OUT: datatype created */ + &piece_mpi_file_counts[i], /* OUT */ + &(piece_mft_is_derived_array[i]), /* OUT */ + TRUE, /* this is a file space, + so permute the + datatype if the point + selections are out of + order */ + &permute_map, /* OUT: a map to indicate the + permutation of points + selected in case they + are out of order */ + &is_permuted /* OUT */) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI file type"); + + /* Sanity check */ + if (is_permuted) + assert(permute_map); + + if (H5S_mpio_space_type(mem_spaces[i], dst_element_size, &piece_mtype[i], &piece_mpi_mem_counts[i], + &(piece_mmt_is_derived_array[i]), FALSE, /* this is a memory + space, so if the file + space is not + permuted, there is no + need to permute the + datatype if the point + selections are out of + order*/ + &permute_map, /* IN: the permutation map + generated by the + file_space selection + and applied to the + memory selection */ + &is_permuted /* IN */) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI buf type"); + + /* Sanity check */ + if (is_permuted) + assert(!permute_map); + + /* Piece address relative to the first piece addr + * Assign piece address to MPI displacement + * (assume MPI_Aint big enough to hold it) */ + piece_file_disp_array[i] = (MPI_Aint)offsets[i] - (MPI_Aint)base_file_addr; + + if (io_op_write) { + piece_mem_disp_array[i] = (MPI_Aint)buf.cvp - (MPI_Aint)mbb.cvp; + } + else { + piece_mem_disp_array[i] = (MPI_Aint)buf.vp - (MPI_Aint)mbb.vp; + } + } /* end for */ + + /* Create final MPI derived datatype for the file */ + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)num_pieces, piece_mpi_file_counts, + piece_file_disp_array, piece_ftype, final_ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code); + + if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(final_ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + *final_ftype_is_derived = TRUE; + + /* Create final MPI derived datatype for memory */ + if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)num_pieces, piece_mpi_mem_counts, + piece_mem_disp_array, piece_mtype, final_mtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code); + + if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(final_mtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + *final_mtype_is_derived = TRUE; + + /* Free the file & memory MPI datatypes for each piece */ + for (i = 0; i < num_pieces; i++) { + if (piece_mmt_is_derived_array[i]) + if (MPI_SUCCESS != (mpi_code = MPI_Type_free(piece_mtype + i))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + + if (piece_mft_is_derived_array[i]) + if (MPI_SUCCESS != (mpi_code = MPI_Type_free(piece_ftype + i))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + } /* end for */ + +done: + + /* Release resources */ + if (piece_mtype) + H5MM_xfree(piece_mtype); + if (piece_ftype) + H5MM_xfree(piece_ftype); + if (piece_file_disp_array) + H5MM_xfree(piece_file_disp_array); + if (piece_mem_disp_array) + H5MM_xfree(piece_mem_disp_array); + if (piece_mpi_mem_counts) + H5MM_xfree(piece_mpi_mem_counts); + if (piece_mpi_file_counts) + H5MM_xfree(piece_mpi_file_counts); + if (piece_mmt_is_derived_array) + H5MM_xfree(piece_mmt_is_derived_array); + if (piece_mft_is_derived_array) + H5MM_xfree(piece_mft_is_derived_array); + + FUNC_LEAVE_NOAPI(ret_value); + +} /* H5FD__selection_build_types() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__mpio_read_selection + * + * Purpose: The behaviour of this function dependes on the value of + * the transfer mode obtained from the context. + * + * If the transfer mode is H5FD_MPIO_COLLECTIVE: + * --sort the selections + * --set mpi_bufs_base + * --build the MPI derived types + * --perform MPI_File_set_view() + * --perform MPI_File_read_at_all() or MPI_File_read_at() + * depending on whether this is a H5FD_MPIO_COLLECTIVE_IO + * + * If this is not H5FD_MPIO_COLLECTIVE: + * --undo possible base address addition in internal routines + * --call H5FD_read_vector_from_selection() to perform vector + * or scalar writes for the selections + * + * Return: Success: SUCCEED. + * Failure: FAIL. + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__mpio_read_selection(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, size_t count, + hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[], + size_t element_sizes[], void *bufs[] /* out */) +{ + H5FD_mpio_t *file = (H5FD_mpio_t *)_file; + MPI_Offset mpi_off; + MPI_Status mpi_stat; /* Status from I/O operation */ + int size_i; /* Integer copy of 'size' to read */ + + H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */ + H5FD_mpio_collective_opt_t coll_opt_mode; + + MPI_Datatype final_mtype; /* Final memory MPI datatype for all pieces with selection */ + hbool_t final_mtype_is_derived = FALSE; + + MPI_Datatype final_ftype; /* Final file MPI datatype for all pieces with selection */ + hbool_t final_ftype_is_derived = FALSE; + + hid_t *s_mem_space_ids = NULL; + hid_t *s_file_space_ids = NULL; + haddr_t *s_offsets = NULL; + size_t *s_element_sizes = NULL; + H5_flexible_const_ptr_t *s_bufs = NULL; + hbool_t selection_was_sorted = TRUE; + + uint32_t i, j; + H5S_t **s_mem_spaces = NULL; + H5S_t **s_file_spaces = NULL; + haddr_t tmp_offset = 0; + void *mpi_bufs_base = NULL; + char unused = 0; /* Unused, except for non-NULL pointer value */ + +#if H5_CHECK_MPI_VERSION(3, 0) + MPI_Count bytes_read = 0; /* Number of bytes read in */ + MPI_Count type_size; /* MPI datatype used for I/O's size */ + MPI_Count io_size; /* Actual number of bytes requested */ + MPI_Count n; +#else + int bytes_read = 0; /* Number of bytes read in */ + int type_size; /* MPI datatype used for I/O's size */ + int io_size; /* Actual number of bytes requested */ + int n; +#endif + hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */ +#ifdef H5FDmpio_DEBUG + hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file)); + hbool_t H5FD_mpio_debug_r_flag = (H5FD_mpio_debug_flags_s[(int)'r'] && H5FD_MPIO_TRACE_THIS_RANK(file)); +#endif + int mpi_code; /* MPI return code */ + H5_flexible_const_ptr_t mbb; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_PACKAGE + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_t_flag) + fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank); +#endif + + /* Sanity checks */ + assert(file); + assert(H5FD_MPIO == file->pub.driver_id); + assert((count == 0) || (mem_space_ids)); + assert((count == 0) || (file_space_ids)); + assert((count == 0) || (offsets)); + assert((count == 0) || (element_sizes)); + assert((count == 0) || (bufs)); + + /* Verify that the first elements of the element_sizes and bufs arrays are + * valid. */ + assert((count == 0) || (element_sizes[0] != 0)); + assert((count == 0) || (bufs[0] != NULL)); + + /* Portably initialize MPI status variable */ + memset(&mpi_stat, 0, sizeof(MPI_Status)); + + /* Get the transfer mode from the API context */ + if (H5CX_get_io_xfer_mode(&xfer_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode"); + + /* + * Set up for a fancy xfer using complex types, or single byte block. We + * wouldn't need to rely on the use_view field if MPI semantics allowed + * us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which + * could mean "use MPI_BYTE" by convention). + */ + if (xfer_mode == H5FD_MPIO_COLLECTIVE) { + + if (count) { + if (H5FD_sort_selection_io_req(&selection_was_sorted, count, mem_space_ids, file_space_ids, + offsets, element_sizes, (H5_flexible_const_ptr_t *)bufs, + &s_mem_space_ids, &s_file_space_ids, &s_offsets, &s_element_sizes, + &s_bufs) < 0) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't sort selection I/O request"); + + tmp_offset = s_offsets[0]; + + if (NULL == (s_file_spaces = H5MM_malloc(count * sizeof(H5S_t *)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "memory allocation failed for file space list"); + if (NULL == (s_mem_spaces = H5MM_malloc(count * sizeof(H5S_t *)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "memory allocation failed for memory space list"); + + for (i = 0; i < count; i++) { + if (NULL == (s_mem_spaces[i] = (H5S_t *)H5I_object_verify(s_mem_space_ids[i], H5I_DATASPACE))) + HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, H5I_INVALID_HID, + "can't retrieve memory dataspace from ID"); + if (NULL == + (s_file_spaces[i] = (H5S_t *)H5I_object_verify(s_file_space_ids[i], H5I_DATASPACE))) + HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, H5I_INVALID_HID, + "can't retrieve file dataspace from ID"); + } + + /* when we setup mpi_bufs[] below, all addresses are offsets from + * mpi_bufs_base. + * + * Since these offsets must all be positive, we must scan through + * s_bufs[] to find the smallest value, and choose that for + * mpi_bufs_base. + */ + j = 0; /* guess at the index of the smallest value of s_bufs[] */ + + if (s_bufs[j + 1].vp != NULL) { + for (i = 1; i < count; i++) + if (s_bufs[i].vp < s_bufs[j].vp) + j = i; + } + + mpi_bufs_base = s_bufs[j].vp; + mbb.vp = mpi_bufs_base; + + if (H5FD__selection_build_types(FALSE, count, mbb, s_file_spaces, s_mem_spaces, s_offsets, s_bufs, + s_element_sizes, s_element_sizes, &final_ftype, + &final_ftype_is_derived, &final_mtype, + &final_mtype_is_derived) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't build type for MPI-IO"); + + /* We have a single, complicated MPI datatype for both memory & file */ + size_i = 1; + } + else { + + /* No chunks selected for this process */ + size_i = 0; + + mpi_bufs_base = &unused; + + /* Set the MPI datatype */ + final_ftype = MPI_BYTE; + final_mtype = MPI_BYTE; + } + + /* some numeric conversions */ + if (H5FD_mpi_haddr_to_MPIOff(tmp_offset, &mpi_off /*out*/) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off"); + + /* + * Set the file view when we are using MPI derived types + */ + if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, final_ftype, + H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code); + + /* When using types, use the address as the displacement for + * MPI_File_set_view and reset the address for the read to zero + */ + /* Reset mpi_off to 0 since the view now starts at the data offset */ + if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0"); + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_r_flag) + fprintf(stderr, "%s: (%d) using MPIO collective mode\n", __func__, file->mpi_rank); +#endif + /* Get the collective_opt property to check whether the application wants to do IO individually. */ + if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property"); + + if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_r_flag) + fprintf(stderr, "%s: (%d) doing MPI collective IO\n", __func__, file->mpi_rank); +#endif + /* Check whether we should read from rank 0 and broadcast to other ranks */ + if (H5CX_get_mpio_rank0_bcast()) { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_r_flag) + fprintf(stderr, "%s: (%d) doing read-rank0-and-MPI_Bcast\n", __func__, file->mpi_rank); +#endif + /* Indicate path we've taken */ + rank0_bcast = TRUE; + + /* Read on rank 0 Bcast to other ranks */ + if (file->mpi_rank == 0) { + /* If MPI_File_read_at fails, push an error, but continue + * to participate in following MPI_Bcast */ + if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i, + final_mtype, &mpi_stat))) + HMPI_DONE_ERROR(FAIL, "MPI_File_read_at failed", mpi_code); + } + + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(mpi_bufs_base, size_i, final_mtype, 0, file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } /* end if */ + else + /* Perform collective read operation */ + if (MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, mpi_bufs_base, size_i, + final_mtype, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code); + } /* end if */ + else { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_r_flag) + fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank); +#endif + + /* Perform independent read operation */ + if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i, + final_mtype, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code); + } /* end else */ + + /* + * Reset the file view when we used MPI derived types + */ + if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, + H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code); + + /* Only retrieve bytes read if this rank _actually_ participated in I/O */ + if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) { + /* How many bytes were actually read? */ +#if H5_CHECK_MPI_VERSION(3, 0) + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, final_mtype, &bytes_read))) { +#else + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) { +#endif + if (rank0_bcast && file->mpi_rank == 0) { + /* If MPI_Get_elements(_x) fails for a rank 0 bcast strategy, + * push an error, but continue to participate in the following + * MPI_Bcast. + */ + bytes_read = -1; + HMPI_DONE_ERROR(FAIL, "MPI_Get_elements failed", mpi_code); + } + else + HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code); + } + } /* end if */ + + /* If the rank0-bcast feature was used, broadcast the # of bytes read to + * other ranks, which didn't perform any I/O. + */ + /* NOTE: This could be optimized further to be combined with the broadcast + * of the data. (QAK - 2019/1/2) + */ + if (rank0_bcast) +#if H5_CHECK_MPI_VERSION(3, 0) + if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_COUNT, 0, file->comm)) +#else + if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_INT, 0, file->comm)) +#endif + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0); + + /* Get the type's size */ +#if H5_CHECK_MPI_VERSION(3, 0) + if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(final_mtype, &type_size))) +#else + if (MPI_SUCCESS != (mpi_code = MPI_Type_size(final_mtype, &type_size))) +#endif + HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code); + + /* Compute the actual number of bytes requested */ + io_size = type_size * size_i; + + /* Check for read failure */ + if (bytes_read < 0 || bytes_read > io_size) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed"); + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_r_flag) + fprintf(stderr, "%s: (%d) mpi_off = %ld bytes_read = %lld type = %s\n", __func__, + file->mpi_rank, (long)mpi_off, (long long)bytes_read, H5FD__mem_t_to_str(type)); +#endif + + /* + * This gives us zeroes beyond end of physical MPI file. + */ + if ((n = (io_size - bytes_read)) > 0) + memset((char *)bufs[0] + bytes_read, 0, (size_t)n); + + } /* end if */ + else { +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_r_flag) + fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank); +#endif + if (_file->base_addr > 0) { + /* Undo base address addition in internal routines before passing down to the mpio driver */ + for (i = 0; i < count; i++) { + assert(offsets[i] >= _file->base_addr); + offsets[i] -= _file->base_addr; + } + } + + if (H5FD_read_from_selection(_file, type, (uint32_t)count, mem_space_ids, file_space_ids, offsets, + element_sizes, bufs) < 0) + HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "read vector from selection failed"); + } + +done: + /* Free the MPI buf and file types, if they were derived */ + if (final_mtype_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&final_mtype))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + if (final_ftype_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&final_ftype))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + + /* Cleanup dataspace arrays */ + if (s_mem_spaces) + s_mem_spaces = H5MM_xfree(s_mem_spaces); + if (s_file_spaces) + s_file_spaces = H5MM_xfree(s_file_spaces); + + if (!selection_was_sorted) { + free(s_mem_space_ids); + s_mem_space_ids = NULL; + free(s_file_space_ids); + s_file_space_ids = NULL; + free(s_offsets); + s_offsets = NULL; + free(s_element_sizes); + s_element_sizes = NULL; + free(s_bufs); + s_bufs = NULL; + } + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_t_flag) + fprintf(stderr, "%s: (%d) Leaving\n", __func__, file->mpi_rank); +#endif + + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FD__mpio_read_selection() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__mpio_write_selection + * + * Purpose: The behaviour of this function dependes on the value of + * the transfer mode obtained from the context. + * + * If the transfer mode is H5FD_MPIO_COLLECTIVE: + * --sort the selections + * --set mpi_bufs_base + * --build the MPI derived types + * --perform MPI_File_set_view() + * --perform MPI_File_write_at_all() or MPI_File_write_at() + * depending on whether this is a H5FD_MPIO_COLLECTIVE_IO + * --calculate and set the file's eof for the bytes written + * + * If this is not H5FD_MPIO_COLLECTIVE: + * --undo possible base address addition in internal routines + * --call H5FD_write_vector_from_selection() to perform vector + * or scalar writes for the selections + * + * Return: Success: SUCCEED. + * Failure: FAIL. + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__mpio_write_selection(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, size_t count, + hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[], + size_t element_sizes[], const void *bufs[]) +{ + H5FD_mpio_t *file = (H5FD_mpio_t *)_file; + MPI_Offset mpi_off; + MPI_Offset save_mpi_off; /* Use at the end of the routine for setting local_eof */ + MPI_Status mpi_stat; /* Status from I/O operation */ + + int size_i; + H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */ + H5FD_mpio_collective_opt_t coll_opt_mode; + + MPI_Datatype final_mtype; /* Final memory MPI datatype for all pieces with selection */ + hbool_t final_mtype_is_derived = FALSE; + + MPI_Datatype final_ftype; /* Final file MPI datatype for all pieces with selection */ + hbool_t final_ftype_is_derived = FALSE; + + hid_t *s_mem_space_ids = NULL; + hid_t *s_file_space_ids = NULL; + haddr_t *s_offsets = NULL; + size_t *s_element_sizes = NULL; + H5_flexible_const_ptr_t *s_bufs = NULL; + hbool_t selection_was_sorted = TRUE; + const void *mpi_bufs_base = NULL; + + uint32_t i, j; + H5S_t **s_mem_spaces = NULL; + H5S_t **s_file_spaces = NULL; + haddr_t tmp_offset = 0; + char unused = 0; /* Unused, except for non-NULL pointer value */ + H5_flexible_const_ptr_t mbb; + +#if H5_CHECK_MPI_VERSION(3, 0) + MPI_Count bytes_written; + MPI_Count type_size; /* MPI datatype used for I/O's size */ + MPI_Count io_size; /* Actual number of bytes requested */ +#else + int bytes_written; + int type_size; /* MPI datatype used for I/O's size */ + int io_size; /* Actual number of bytes requested */ +#endif + +#ifdef H5FDmpio_DEBUG + hbool_t H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file)); + hbool_t H5FD_mpio_debug_w_flag = (H5FD_mpio_debug_flags_s[(int)'w'] && H5FD_MPIO_TRACE_THIS_RANK(file)); +#endif + int mpi_code; /* MPI return code */ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_PACKAGE + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_t_flag) + fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank); +#endif + + /* Sanity checks */ + assert(file); + assert(H5FD_MPIO == file->pub.driver_id); + assert((count == 0) || (mem_space_ids)); + assert((count == 0) || (file_space_ids)); + assert((count == 0) || (offsets)); + assert((count == 0) || (element_sizes)); + assert((count == 0) || (bufs)); + + /* Verify that the first elements of the element_sizes and bufs arrays are + * valid. */ + assert((count == 0) || (element_sizes[0] != 0)); + assert((count == 0) || (bufs[0] != NULL)); + + /* Verify that no data is written when between MPI_Barrier()s during file flush */ + assert(!H5CX_get_mpi_file_flushing()); + + /* Portably initialize MPI status variable */ + memset(&mpi_stat, 0, sizeof(MPI_Status)); + + /* Get the transfer mode from the API context */ + if (H5CX_get_io_xfer_mode(&xfer_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode"); + + if (xfer_mode == H5FD_MPIO_COLLECTIVE) { + + if (count) { + if (H5FD_sort_selection_io_req(&selection_was_sorted, count, mem_space_ids, file_space_ids, + offsets, element_sizes, (H5_flexible_const_ptr_t *)bufs, + &s_mem_space_ids, &s_file_space_ids, &s_offsets, &s_element_sizes, + &s_bufs) < 0) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't sort selection I/O request"); + + tmp_offset = s_offsets[0]; + + if (NULL == (s_file_spaces = H5MM_malloc(count * sizeof(H5S_t *)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "memory allocation failed for file space list"); + if (NULL == (s_mem_spaces = H5MM_malloc(count * sizeof(H5S_t *)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "memory allocation failed for memory space list"); + + for (i = 0; i < count; i++) { + if (NULL == + (s_file_spaces[i] = (H5S_t *)H5I_object_verify(s_file_space_ids[i], H5I_DATASPACE))) + HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, H5I_INVALID_HID, + "can't retrieve file dataspace from ID"); + if (NULL == (s_mem_spaces[i] = (H5S_t *)H5I_object_verify(s_mem_space_ids[i], H5I_DATASPACE))) + HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, H5I_INVALID_HID, + "can't retrieve memory dataspace from ID"); + } + + /* when we setup mpi_bufs[] below, all addresses are offsets from + * mpi_bufs_base. + * + * Since these offsets must all be positive, we must scan through + * s_bufs[] to find the smallest value, and choose that for + * mpi_bufs_base. + */ + j = 0; /* guess at the index of the smallest value of s_bufs[] */ + + if (s_bufs[j + 1].cvp != NULL) { + for (i = 1; i < count; i++) + if (s_bufs[i].cvp < s_bufs[j].cvp) + j = i; + } + + mpi_bufs_base = s_bufs[j].cvp; + mbb.cvp = mpi_bufs_base; + + if (H5FD__selection_build_types(TRUE, count, mbb, s_file_spaces, s_mem_spaces, s_offsets, s_bufs, + s_element_sizes, s_element_sizes, &final_ftype, + &final_ftype_is_derived, &final_mtype, + &final_mtype_is_derived) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't build type for MPI-IO"); + + /* We have a single, complicated MPI datatype for both memory & file */ + size_i = 1; + } + else { + + /* No chunks selected for this process */ + size_i = 0; + + mpi_bufs_base = &unused; + + /* Set the MPI datatype */ + final_ftype = MPI_BYTE; + final_mtype = MPI_BYTE; + } + + /* some numeric conversions */ + if (H5FD_mpi_haddr_to_MPIOff(tmp_offset, &mpi_off /*out*/) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off"); + + /* To be used at the end of the routine for setting local_eof */ + save_mpi_off = mpi_off; + + /* + * Set the file view when we are using MPI derived types + */ + if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, final_ftype, + H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code); + + /* Reset mpi_off to 0 since the view now starts at the data offset */ + if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0"); + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_w_flag) + fprintf(stderr, "%s: (%d) using MPIO collective mode\n", __func__, file->mpi_rank); +#endif + + /* Get the collective_opt property to check whether the application wants to do IO individually. */ + if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property"); + + if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) { + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_w_flag) + fprintf(stderr, "%s: (%d) doing MPI collective IO\n", __func__, file->mpi_rank); +#endif + + /* Perform collective write operation */ + if (MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, mpi_bufs_base, size_i, + final_mtype, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code); + + /* Do MPI_File_sync when needed by underlying ROMIO driver */ + if (file->mpi_file_sync_required) { + if (MPI_SUCCESS != (mpi_code = MPI_File_sync(file->f))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_sync failed", mpi_code); + } + } + else { + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_w_flag) + fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank); +#endif + /* Perform independent write operation */ + if (MPI_SUCCESS != (mpi_code = MPI_File_write_at(file->f, mpi_off, mpi_bufs_base, size_i, + final_mtype, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code); + } /* end else */ + + /* Reset the file view when we used MPI derived types */ + if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, + H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code); + + /* How many bytes were actually written */ +#if H5_CHECK_MPI_VERSION(3, 0) + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, final_mtype, &bytes_written))) +#else + if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_written))) +#endif + HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code); + + /* Get the type's size */ +#if H5_CHECK_MPI_VERSION(3, 0) + if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(final_mtype, &type_size))) +#else + if (MPI_SUCCESS != (mpi_code = MPI_Type_size(final_mtype, &type_size))) +#endif + HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code); + + /* Compute the actual number of bytes requested */ + io_size = type_size * size_i; + + /* Check for write failure */ + if (bytes_written != io_size || bytes_written < 0) + HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed"); + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_w_flag) + fprintf(stderr, "%s: (%d) mpi_off = %ld bytes_written = %lld type = %s\n", __func__, + file->mpi_rank, (long)mpi_off, (long long)bytes_written, H5FD__mem_t_to_str(type)); +#endif + + /* Each process will keep track of its perceived EOF value locally, and + * ultimately we will reduce this value to the maximum amongst all + * processes, but until then keep the actual eof at HADDR_UNDEF just in + * case something bad happens before that point. (rather have a value + * we know is wrong sitting around rather than one that could only + * potentially be wrong.) */ + file->eof = HADDR_UNDEF; + + if (bytes_written && (((haddr_t)bytes_written + (haddr_t)save_mpi_off) > file->local_eof)) + file->local_eof = (haddr_t)save_mpi_off + (haddr_t)bytes_written; + } + else { /* Not H5FD_MPIO_COLLECTIVE */ + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_w_flag) + fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank); +#endif + if (_file->base_addr > 0) { + /* Undo base address addition in internal routines before passing down to the mpio driver */ + for (i = 0; i < count; i++) { + assert(offsets[i] >= _file->base_addr); + offsets[i] -= _file->base_addr; + } + } + + if (H5FD_write_from_selection(_file, type, (uint32_t)count, mem_space_ids, file_space_ids, offsets, + element_sizes, bufs) < 0) + HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "write vector from selection failed"); + } + +done: + /* Free the MPI buf and file types, if they were derived */ + if (final_mtype_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&final_mtype))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + if (final_ftype_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&final_ftype))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + + /* Cleanup dataspace arrays */ + if (s_mem_spaces) + s_mem_spaces = H5MM_xfree(s_mem_spaces); + if (s_file_spaces) + s_file_spaces = H5MM_xfree(s_file_spaces); + + if (!selection_was_sorted) { + free(s_mem_space_ids); + s_mem_space_ids = NULL; + free(s_file_space_ids); + s_file_space_ids = NULL; + free(s_offsets); + s_offsets = NULL; + free(s_element_sizes); + s_element_sizes = NULL; + free(s_bufs); + s_bufs = NULL; + } + +#ifdef H5FDmpio_DEBUG + if (H5FD_mpio_debug_t_flag) + fprintf(stderr, "%s: (%d) Leaving: ret_value = %d\n", __func__, file->mpi_rank, ret_value); +#endif + + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FD__mpio_write_selection() */ + +/*------------------------------------------------------------------------- * Function: H5FD__mpio_flush * * Purpose: Makes sure that all data is on disk. This is collective. diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h index c4ccfdd..6b8e2da 100644 --- a/src/H5FDprivate.h +++ b/src/H5FDprivate.h @@ -77,6 +77,10 @@ typedef struct { } \ } +#define SKIP_NO_CB 0x00u +#define SKIP_SELECTION_CB 0x01u +#define SKIP_VECTOR_CB 0x02u + /* Define structure to hold driver ID, info & configuration string for FAPLs */ typedef struct { hid_t driver_id; /* Driver's ID */ @@ -149,12 +153,27 @@ H5_DLL herr_t H5FD_read_selection(H5FD_t *file, H5FD_mem_t type, uint32_t count H5_DLL herr_t H5FD_write_selection(H5FD_t *file, H5FD_mem_t type, uint32_t count, struct H5S_t **mem_spaces, struct H5S_t **file_spaces, haddr_t offsets[], size_t element_sizes[], const void *bufs[]); -H5_DLL herr_t H5FD_read_selection_id(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_space_ids[], - hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], - void *bufs[] /* out */); -H5_DLL herr_t H5FD_write_selection_id(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_space_ids[], +H5_DLL herr_t H5FD_read_selection_id(uint32_t skip_cb, H5FD_t *file, H5FD_mem_t type, uint32_t count, + hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[], + size_t element_sizes[], void *bufs[] /* out */); +H5_DLL herr_t H5FD_write_selection_id(uint32_t skip_cb, H5FD_t *file, H5FD_mem_t type, uint32_t count, + hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[], + size_t element_sizes[], const void *bufs[]); +H5_DLL herr_t H5FD_read_vector_from_selection(H5FD_t *file, H5FD_mem_t type, uint32_t count, + hid_t mem_space_ids[], hid_t file_space_ids[], + haddr_t offsets[], size_t element_sizes[], void *bufs[]); + +H5_DLL herr_t H5FD_write_vector_from_selection(H5FD_t *file, H5FD_mem_t type, uint32_t count, + hid_t mem_space_ids[], hid_t file_space_ids[], + haddr_t offsets[], size_t element_sizes[], const void *bufs[]); + +H5_DLL herr_t H5FD_read_from_selection(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], - const void *bufs[]); + void *bufs[]); + +H5_DLL herr_t H5FD_write_from_selection(H5FD_t *file, H5FD_mem_t type, uint32_t count, hid_t mem_space_ids[], + hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], + const void *bufs[]); H5_DLL herr_t H5FD_flush(H5FD_t *file, hbool_t closing); H5_DLL herr_t H5FD_truncate(H5FD_t *file, hbool_t closing); H5_DLL herr_t H5FD_lock(H5FD_t *file, hbool_t rw); @@ -171,6 +190,12 @@ H5_DLL herr_t H5FD_sort_vector_io_req(hbool_t *vector_was_sorted, uint32_t count haddr_t addrs[], size_t sizes[], H5_flexible_const_ptr_t bufs[], H5FD_mem_t **s_types_ptr, haddr_t **s_addrs_ptr, size_t **s_sizes_ptr, H5_flexible_const_ptr_t **s_bufs_ptr); + +H5_DLL herr_t H5FD_sort_selection_io_req(hbool_t *selection_was_sorted, size_t count, hid_t mem_space_ids[], + hid_t file_space_ids[], haddr_t offsets[], size_t element_sizes[], + H5_flexible_const_ptr_t bufs[], hid_t **s_mem_space_ids, + hid_t **s_file_space_ids, haddr_t **s_offsets_ptr, + size_t **s_element_sizes_ptr, H5_flexible_const_ptr_t **s_bufs_ptr); H5_DLL herr_t H5FD_init(void); /* Function prototypes for MPI based VFDs*/ |