Diffstat (limited to 'src/H5Dmpio.c')
-rw-r--r-- | src/H5Dmpio.c | 1697
1 file changed, 804 insertions, 893 deletions
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c index 17ae0e5..3bddf0b 100644 --- a/src/H5Dmpio.c +++ b/src/H5Dmpio.c @@ -55,12 +55,15 @@ #define H5D_MULTI_CHUNK_IO 1 #define H5D_ONE_LINK_CHUNK_IO_MORE_OPT 2 #define H5D_MULTI_CHUNK_IO_MORE_OPT 3 +#define H5D_NO_IO 4 /***** Macros for One linked collective IO case. *****/ /* The default value to do one linked collective IO for all chunks. - If the average number of chunks per process is greater than this value, - the library will create an MPI derived datatype to link all chunks to do collective IO. - The user can set this value through an API. */ + * If the average number of chunks per process is greater than this + * value, the library will create an MPI derived datatype to link all + * chunks to do collective IO. The user can set this value through an + * API. + */ /* Macros to represent options on how to obtain chunk address for one linked-chunk IO case */ #define H5D_OBTAIN_ONE_CHUNK_ADDR_IND 0 @@ -71,10 +74,10 @@ #define H5D_ALL_CHUNK_ADDR_THRES_COL_NUM 10000 /***** Macros for multi-chunk collective IO case. *****/ -/* The default value of the threshold to do collective IO for this chunk. - If the average number of processes per chunk is greater than the default value, - collective IO is done for this chunk. -*/ +/* The default value of the threshold to do collective IO for this + * chunk. If the average number of processes per chunk is greater + * than the default value, collective IO is done for this chunk. + */ /* Macros to represent different IO modes(NONE, Independent or collective)for multiple chunk IO case */ #define H5D_CHUNK_IO_MODE_COL 1 @@ -107,12 +110,12 @@ * structure, given a pointer to a H5D_io_info_t * structure */ -#define H5D_MPIO_INIT_CHUNK_IDX_INFO(index_info, io_info_ptr) \ +#define H5D_MPIO_INIT_CHUNK_IDX_INFO(index_info, dset) \ do { \ - index_info.f = (io_info_ptr)->dset->oloc.file; \ - index_info.pline = &((io_info_ptr)->dset->shared->dcpl_cache.pline); \ - index_info.layout = &((io_info_ptr)->dset->shared->layout.u.chunk); \ - index_info.storage = &((io_info_ptr)->dset->shared->layout.storage.u.chunk); \ + index_info.f = (dset)->oloc.file; \ + index_info.pline = &((dset)->shared->dcpl_cache.pline); \ + index_info.layout = &((dset)->shared->layout.u.chunk); \ + index_info.storage = &((dset)->shared->layout.storage.u.chunk); \ } while (0) /* @@ -130,10 +133,12 @@ /* Local Typedefs */ /******************/ -/* Combine chunk address and chunk info into a struct for better performance. */ +/* Combine chunk/piece address and chunk/piece info into a struct for + * better performance. */ typedef struct H5D_chunk_addr_info_t { - haddr_t chunk_addr; - H5D_chunk_info_t chunk_info; + /* piece for multi-dset */ + haddr_t piece_addr; + H5D_piece_info_t piece_info; } H5D_chunk_addr_info_t; /* Rank 0 Bcast values */ @@ -187,7 +192,7 @@ typedef struct H5D_chunk_index_info_t { * need_insert - A flag which determines whether or not a chunk needs to be re-inserted into * the chunk index after the write operation. * - * chunk_info - A pointer to the chunk's H5D_chunk_info_t structure, which contains useful + * chunk_info - A pointer to the chunk's H5D_piece_info_t structure, which contains useful * information like the dataspaces containing the selection in the chunk. 
* * chunk_current - The address in the file and size of this chunk before the filtering @@ -241,7 +246,7 @@ typedef struct H5D_chunk_index_info_t { typedef struct H5D_filtered_collective_io_info_t { H5D_chunk_index_info_t index_info; - H5D_chunk_info_t *chunk_info; + H5D_piece_info_t *chunk_info; H5F_block_t chunk_current; H5F_block_t chunk_new; hbool_t need_read; @@ -281,65 +286,57 @@ typedef struct H5D_chunk_insert_info_t { /********************/ /* Local Prototypes */ /********************/ -static herr_t H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, - H5D_chunk_map_t *fm); -static herr_t H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, - H5D_chunk_map_t *fm, int mpi_rank, int mpi_size); -static herr_t H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, - const H5D_type_info_t *type_info, H5D_chunk_map_t *fm, +static herr_t H5D__piece_io(H5D_io_info_t *io_info); +static herr_t H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, H5D_dset_io_info_t *dset_info, + int mpi_rank, int mpi_size); +static herr_t H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, H5D_dset_io_info_t *dset_info, int mpi_rank, int mpi_size); -static herr_t H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, - H5D_chunk_map_t *fm, int sum_chunk, int mpi_rank, int mpi_size); -static herr_t H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, - H5D_chunk_map_t *fm, int mpi_rank, int mpi_size); -static herr_t H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, +static herr_t H5D__link_piece_collective_io(H5D_io_info_t *io_info, int mpi_rank); +static herr_t H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, H5D_dset_io_info_t *dset_info, + int mpi_rank, int mpi_size); +static herr_t H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_dset_io_info_t *di, H5S_t *file_space, H5S_t *mem_space); -static herr_t H5D__final_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, - hsize_t nelmts, MPI_Datatype mpi_file_type, MPI_Datatype mpi_buf_type); -static herr_t H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, - H5D_chunk_addr_info_t chunk_addr_info_array[], int many_chunk_opt, int mpi_rank, - int mpi_size); -static herr_t H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assign_io_mode[], +static herr_t H5D__final_collective_io(H5D_io_info_t *io_info, hsize_t mpi_buf_count, + MPI_Datatype mpi_file_type, MPI_Datatype mpi_buf_type); +static herr_t H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_dset_io_info_t *di, uint8_t assign_io_mode[], haddr_t chunk_addr[], int mpi_rank, int mpi_size); -static herr_t H5D__mpio_get_sum_chunk(const H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, - int *sum_chunkf); +static herr_t H5D__mpio_get_sum_chunk(const H5D_io_info_t *io_info, int *sum_chunkf); +static herr_t H5D__mpio_get_sum_chunk_dset(const H5D_io_info_t *io_info, const H5D_dset_io_info_t *dset_info, + int *sum_chunkf); static herr_t H5D__mpio_collective_filtered_chunk_io_setup(const H5D_io_info_t *io_info, - const H5D_type_info_t *type_info, - const H5D_chunk_map_t *fm, + const H5D_dset_io_info_t *di, H5D_filtered_collective_io_info_t **chunk_list, size_t *num_entries, int mpi_rank); static herr_t H5D__mpio_redistribute_shared_chunks(H5D_filtered_collective_io_info_t *chunk_list, size_t chunk_list_num_entries, - const H5D_io_info_t 
*io_info, const H5D_chunk_map_t *fm, - int mpi_rank, int mpi_size, + const H5D_io_info_t *io_info, int mpi_rank, int mpi_size, size_t **rank_chunks_assigned_map); static herr_t H5D__mpio_redistribute_shared_chunks_int(H5D_filtered_collective_io_info_t *chunk_list, - size_t *num_chunks_assigned_map, - hbool_t all_ranks_involved, - const H5D_io_info_t *io_info, - const H5D_chunk_map_t *fm, int mpi_rank, int mpi_size); + size_t *num_chunks_assigned_map, + hbool_t all_ranks_involved, + const H5D_io_info_t *io_info, int mpi_rank, + int mpi_size); static herr_t H5D__mpio_share_chunk_modification_data(H5D_filtered_collective_io_info_t *chunk_list, size_t *chunk_list_num_entries, H5D_io_info_t *io_info, - const H5D_type_info_t *type_info, int mpi_rank, + H5D_dset_io_info_t *dset_info, int mpi_rank, int mpi_size, H5D_filtered_collective_io_info_t **chunk_hash_table, unsigned char ***chunk_msg_bufs, int *chunk_msg_bufs_len); static herr_t H5D__mpio_collective_filtered_chunk_common_io(H5D_filtered_collective_io_info_t *chunk_list, - size_t chunk_list_num_entries, - const H5D_io_info_t *io_info, - const H5D_type_info_t *type_info, int mpi_size); + size_t chunk_list_num_entries, + const H5D_io_info_t *io_info, int mpi_size); static herr_t H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chunk_list, - size_t chunk_list_num_entries, - const H5D_io_info_t *io_info, - const H5D_type_info_t *type_info, int mpi_rank, + size_t chunk_list_num_entries, + const H5D_io_info_t *io_info, + const H5D_dset_io_info_t *di, int mpi_rank, int mpi_size); static herr_t H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *chunk_list, size_t chunk_list_num_entries, H5D_filtered_collective_io_info_t *chunk_hash_table, unsigned char **chunk_msg_bufs, int chunk_msg_bufs_len, const H5D_io_info_t *io_info, - const H5D_type_info_t *type_info, int mpi_rank, + const H5D_dset_io_info_t *di, int mpi_rank, int mpi_size); static herr_t H5D__mpio_collective_filtered_chunk_reallocate(H5D_filtered_collective_io_info_t *chunk_list, size_t chunk_list_num_entries, @@ -348,9 +345,9 @@ static herr_t H5D__mpio_collective_filtered_chunk_reallocate(H5D_filtered_collec H5D_chk_idx_info_t *idx_info, int mpi_rank, int mpi_size); static herr_t H5D__mpio_collective_filtered_chunk_reinsert(H5D_filtered_collective_io_info_t *chunk_list, - size_t chunk_list_num_entries, - size_t *num_chunks_assigned_map, - H5D_io_info_t *io_info, + size_t chunk_list_num_entries, + size_t *num_chunks_assigned_map, + H5D_io_info_t *io_info, H5D_dset_io_info_t *di, H5D_chk_idx_info_t *idx_info, int mpi_rank, int mpi_size); static herr_t H5D__mpio_get_chunk_redistribute_info_types(MPI_Datatype *contig_type, @@ -366,7 +363,7 @@ static herr_t H5D__mpio_collective_filtered_io_type(H5D_filtered_collective_io_i size_t num_entries, H5D_io_op_type_t op_type, MPI_Datatype *new_mem_type, hbool_t *mem_type_derived, MPI_Datatype *new_file_type, hbool_t *file_type_derived); -static int H5D__cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2); +static int H5D__cmp_piece_addr(const void *chunk_addr_info1, const void *chunk_addr_info2); static int H5D__cmp_filtered_collective_io_info_entry(const void *filtered_collective_io_info_entry1, const void *filtered_collective_io_info_entry2); static int H5D__cmp_chunk_redistribute_info(const void *entry1, const void *entry2); @@ -572,34 +569,40 @@ H5D__mpio_debug_init(void) * Function: H5D__mpio_opt_possible * * Purpose: Checks if an direct I/O transfer is possible between 
memory and - * the file. + * the file. + * + * This was derived from H5D__mpio_opt_possible for + * multi-dset work. * * Return: Success: Non-negative: TRUE or FALSE * Failure: Negative * - * Programmer: Quincey Koziol - * Wednesday, April 3, 2002 - * *------------------------------------------------------------------------- */ htri_t -H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space, const H5S_t *mem_space, - const H5D_type_info_t *type_info) +H5D__mpio_opt_possible(H5D_io_info_t *io_info) { - H5FD_mpio_xfer_t io_xfer_mode; /* MPI I/O transfer mode */ + H5FD_mpio_xfer_t io_xfer_mode; /* MPI I/O transfer mode */ + size_t i; + H5D_t *dset; + const H5S_t *file_space; + const H5S_t *mem_space; + H5D_type_info_t *type_info; unsigned local_cause[2] = {0, 0}; /* [0] Local reason(s) for breaking collective mode */ /* [1] Flag if dataset is both: H5S_ALL and small */ unsigned global_cause[2] = {0, 0}; /* Global reason(s) for breaking collective mode */ - htri_t is_vl_storage; /* Whether the dataset's datatype is stored in a variable-length form */ - htri_t ret_value = SUCCEED; /* Return value */ + htri_t is_vl_storage; /* Whether the dataset's datatype is stored in a variable-length form */ + htri_t ret_value = TRUE; /* Return value */ FUNC_ENTER_PACKAGE /* Check args */ HDassert(io_info); - HDassert(mem_space); - HDassert(file_space); - HDassert(type_info); + + for (i = 0; i < io_info->count; i++) { + HDassert(io_info->dsets_info[i].file_space); + HDassert(io_info->dsets_info[i].mem_space); + } /* For independent I/O, get out quickly and don't try to form consensus */ if (H5CX_get_io_xfer_mode(&io_xfer_mode) < 0) @@ -608,90 +611,103 @@ H5D__mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space, co if (io_xfer_mode == H5FD_MPIO_INDEPENDENT) local_cause[0] |= H5D_MPIO_SET_INDEPENDENT; - /* Optimized MPI types flag must be set */ - /* (based on 'HDF5_MPI_OPT_TYPES' environment variable) */ - if (!H5FD_mpi_opt_types_g) - local_cause[0] |= H5D_MPIO_MPI_OPT_TYPES_ENV_VAR_DISABLED; - - /* Don't allow collective operations if datatype conversions need to happen */ - if (!type_info->is_conv_noop) - local_cause[0] |= H5D_MPIO_DATATYPE_CONVERSION; - - /* Don't allow collective operations if data transform operations should occur */ - if (!type_info->is_xform_noop) - local_cause[0] |= H5D_MPIO_DATA_TRANSFORMS; - - /* Check whether these are both simple or scalar dataspaces */ - if (!((H5S_SIMPLE == H5S_GET_EXTENT_TYPE(mem_space) || H5S_SCALAR == H5S_GET_EXTENT_TYPE(mem_space)) && - (H5S_SIMPLE == H5S_GET_EXTENT_TYPE(file_space) || H5S_SCALAR == H5S_GET_EXTENT_TYPE(file_space)))) - local_cause[0] |= H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES; - - /* Dataset storage must be contiguous or chunked */ - if (!(io_info->dset->shared->layout.type == H5D_CONTIGUOUS || - io_info->dset->shared->layout.type == H5D_CHUNKED)) - local_cause[0] |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET; - - /* check if external-file storage is used */ - if (io_info->dset->shared->dcpl_cache.efl.nused > 0) - local_cause[0] |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET; - - /* The handling of memory space is different for chunking and contiguous - * storage. For contiguous storage, mem_space and file_space won't change - * when it it is doing disk IO. For chunking storage, mem_space will - * change for different chunks. So for chunking storage, whether we can - * use collective IO will defer until each chunk IO is reached. 
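The checks in this routine only set the rank-local local_cause flags; the final verdict has to be agreed on by every rank. A minimal sketch of the consensus step that follows these checks (the Allreduce itself is outside this excerpt, and the MPI_BOR reduction over both flag words is an assumption based on how local_cause/global_cause are used here):

    unsigned local_cause[2]  = {0, 0}; /* this rank's reasons to break collective I/O */
    unsigned global_cause[2] = {0, 0}; /* bitwise OR of every rank's reasons          */
    htri_t   ret_value       = TRUE;
    int      mpi_code;

    /* Combine all ranks' reasons so every rank reaches the same verdict */
    if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(local_cause, global_cause, 2,
                                                 MPI_UNSIGNED, MPI_BOR, io_info->comm)))
        HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)

    /* Collective I/O is possible only if no rank raised a blocking cause */
    ret_value = (global_cause[0] > 0) ? FALSE : TRUE;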
- */ + for (i = 0; i < io_info->count; i++) { + /* Check for skipped I/O */ + if (io_info->dsets_info[i].skip_io) + continue; + + /* Set convenience pointers */ + dset = io_info->dsets_info[i].dset; + file_space = io_info->dsets_info[i].file_space; + mem_space = io_info->dsets_info[i].mem_space; + type_info = &io_info->dsets_info[i].type_info; + + /* Optimized MPI types flag must be set */ + /* (based on 'HDF5_MPI_OPT_TYPES' environment variable) */ + if (!H5FD_mpi_opt_types_g) + local_cause[0] |= H5D_MPIO_MPI_OPT_TYPES_ENV_VAR_DISABLED; + + /* Don't allow collective operations if datatype conversions need to happen */ + if (!type_info->is_conv_noop) + local_cause[0] |= H5D_MPIO_DATATYPE_CONVERSION; + + /* Don't allow collective operations if data transform operations should occur */ + if (!type_info->is_xform_noop) + local_cause[0] |= H5D_MPIO_DATA_TRANSFORMS; + + /* Check whether these are both simple or scalar dataspaces */ + if (!((H5S_SIMPLE == H5S_GET_EXTENT_TYPE(mem_space) || + H5S_SCALAR == H5S_GET_EXTENT_TYPE(mem_space)) && + (H5S_SIMPLE == H5S_GET_EXTENT_TYPE(file_space) || + H5S_SCALAR == H5S_GET_EXTENT_TYPE(file_space)))) + local_cause[0] |= H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES; + + /* Dataset storage must be contiguous or chunked */ + if (!(dset->shared->layout.type == H5D_CONTIGUOUS || dset->shared->layout.type == H5D_CHUNKED)) + local_cause[0] |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET; + + /* check if external-file storage is used */ + if (dset->shared->dcpl_cache.efl.nused > 0) + local_cause[0] |= H5D_MPIO_NOT_CONTIGUOUS_OR_CHUNKED_DATASET; + + /* The handling of memory space is different for chunking and contiguous + * storage. For contiguous storage, mem_space and file_space won't change + * when it is doing disk IO. For chunking storage, mem_space will + * change for different chunks. So for chunking storage, whether we can + * use collective IO will defer until each chunk IO is reached. + */ #ifndef H5_HAVE_PARALLEL_FILTERED_WRITES - /* Don't allow writes to filtered datasets if the functionality is disabled */ - if (io_info->op_type == H5D_IO_OP_WRITE && io_info->dset->shared->dcpl_cache.pline.nused > 0) - local_cause[0] |= H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED; + /* Don't allow writes to filtered datasets if the functionality is disabled */ + if (io_info->op_type == H5D_IO_OP_WRITE && dset->shared->dcpl_cache.pline.nused > 0) + local_cause[0] |= H5D_MPIO_PARALLEL_FILTERED_WRITES_DISABLED; #endif - /* Check if we are able to do a MPI_Bcast of the data from one rank - * instead of having all the processes involved in the collective I/O call. - */ - - /* Check to see if the process is reading the entire dataset */ - if (H5S_GET_SELECT_TYPE(file_space) != H5S_SEL_ALL) - local_cause[1] |= H5D_MPIO_RANK0_NOT_H5S_ALL; - /* Only perform this optimization for contiguous datasets, currently */ - else if (H5D_CONTIGUOUS != io_info->dset->shared->layout.type) - /* Flag to do a MPI_Bcast of the data from one proc instead of - * having all the processes involved in the collective I/O. + /* Check if we are able to do a MPI_Bcast of the data from one rank + * instead of having all the processes involved in the collective I/O call. 
*/ - local_cause[1] |= H5D_MPIO_RANK0_NOT_CONTIGUOUS; - else if ((is_vl_storage = H5T_is_vl_storage(type_info->dset_type)) < 0) - local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE; - else if (is_vl_storage) - local_cause[1] |= H5D_MPIO_RANK0_NOT_FIXED_SIZE; - else { - size_t type_size; /* Size of dataset's datatype */ - /* Retrieve the size of the dataset's datatype */ - if (0 == (type_size = H5T_GET_SIZE(type_info->dset_type))) + /* Check to see if the process is reading the entire dataset */ + if (H5S_GET_SELECT_TYPE(file_space) != H5S_SEL_ALL) + local_cause[1] |= H5D_MPIO_RANK0_NOT_H5S_ALL; + /* Only perform this optimization for contiguous datasets, currently */ + else if (H5D_CONTIGUOUS != dset->shared->layout.type) + /* Flag to do a MPI_Bcast of the data from one proc instead of + * having all the processes involved in the collective I/O. + */ + local_cause[1] |= H5D_MPIO_RANK0_NOT_CONTIGUOUS; + else if ((is_vl_storage = H5T_is_vl_storage(type_info->dset_type)) < 0) local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE; + else if (is_vl_storage) + local_cause[1] |= H5D_MPIO_RANK0_NOT_FIXED_SIZE; else { - hssize_t snelmts; /* [Signed] # of elements in dataset's dataspace */ + size_t type_size; /* Size of dataset's datatype */ /* Retrieve the size of the dataset's datatype */ - if ((snelmts = H5S_GET_EXTENT_NPOINTS(file_space)) < 0) + if (0 == (type_size = H5T_GET_SIZE(type_info->dset_type))) local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE; else { - hsize_t dset_size; + hssize_t snelmts; /* [Signed] # of elements in dataset's dataspace */ - /* Determine dataset size */ - dset_size = ((hsize_t)snelmts) * type_size; + /* Retrieve the size of the dataset's datatype */ + if ((snelmts = H5S_GET_EXTENT_NPOINTS(file_space)) < 0) + local_cause[0] |= H5D_MPIO_ERROR_WHILE_CHECKING_COLLECTIVE_POSSIBLE; + else { + hsize_t dset_size; - /* If the size of the dataset is less than 2GB then do an MPI_Bcast - * of the data from one process instead of having all the processes - * involved in the collective I/O. - */ - if (dset_size > ((hsize_t)(2.0F * H5_GB) - 1)) - local_cause[1] |= H5D_MPIO_RANK0_GREATER_THAN_2GB; - } /* end else */ - } /* end else */ - } /* end else */ + /* Determine dataset size */ + dset_size = ((hsize_t)snelmts) * type_size; + + /* If the size of the dataset is less than 2GB then do an MPI_Bcast + * of the data from one process instead of having all the processes + * involved in the collective I/O. + */ + if (dset_size > ((hsize_t)(2.0F * H5_GB) - 1)) + local_cause[1] |= H5D_MPIO_RANK0_GREATER_THAN_2GB; + } /* end else */ + } /* end else */ + } /* end else */ + } /* end for loop */ /* Check for independent I/O */ if (local_cause[0] & H5D_MPIO_SET_INDEPENDENT) @@ -875,26 +891,29 @@ done: * * Purpose: MPI-IO function to read directly from the file into the app buffer. * - * Return: non-negative on success, negative on failure. + * This was derived from H5D__mpio_select_read for + * multi-dset work. * - * Programmer: + * Return: non-negative on success, negative on failure. 
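Both select routines in this hunk take their target file address and base memory address straight from the io_info, so the caller primes those two fields first. A sketch of that calling convention, mirroring the contiguous single-dataset case in H5D__piece_io further down (the loop index i and the direct call are illustrative; inside the library the call goes through the I/O ops table):

    /* Caller side: point the transfer at one dataset's contiguous storage */
    io_info->store_faddr = io_info->dsets_info[i].store->contig.dset_addr;
    io_info->base_maddr  = io_info->dsets_info[i].buf;

    /* mpi_buf_count bytes are then moved in one shot */
    if (H5D__mpio_select_read(io_info, mpi_buf_count, file_space, mem_space) < 0)
        HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "collective read failed")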
* *------------------------------------------------------------------------- */ herr_t -H5D__mpio_select_read(const H5D_io_info_t *io_info, const H5D_type_info_t H5_ATTR_UNUSED *type_info, - hsize_t mpi_buf_count, H5S_t H5_ATTR_UNUSED *file_space, +H5D__mpio_select_read(const H5D_io_info_t *io_info, hsize_t mpi_buf_count, H5S_t H5_ATTR_UNUSED *file_space, H5S_t H5_ATTR_UNUSED *mem_space) { - const H5D_contig_storage_t *store_contig = - &(io_info->store->contig); /* Contiguous storage info for this I/O operation */ + void *rbuf = NULL; herr_t ret_value = SUCCEED; FUNC_ENTER_PACKAGE + /* memory addr from a piece with lowest file addr */ + rbuf = io_info->base_maddr.vp; + + /*OKAY: CAST DISCARDS CONST QUALIFIER*/ H5_CHECK_OVERFLOW(mpi_buf_count, hsize_t, size_t); - if (H5F_shared_block_read(io_info->f_sh, H5FD_MEM_DRAW, store_contig->dset_addr, (size_t)mpi_buf_count, - io_info->u.rbuf) < 0) + if (H5F_shared_block_read(io_info->f_sh, H5FD_MEM_DRAW, io_info->store_faddr, (size_t)mpi_buf_count, + rbuf) < 0) HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "can't finish collective parallel read") done: @@ -906,27 +925,29 @@ done: * * Purpose: MPI-IO function to write directly from app buffer to file. * - * Return: non-negative on success, negative on failure. + * This was derived from H5D__mpio_select_write for + * multi-dset work. * - * Programmer: + * Return: non-negative on success, negative on failure. * *------------------------------------------------------------------------- */ herr_t -H5D__mpio_select_write(const H5D_io_info_t *io_info, const H5D_type_info_t H5_ATTR_UNUSED *type_info, - hsize_t mpi_buf_count, H5S_t H5_ATTR_UNUSED *file_space, +H5D__mpio_select_write(const H5D_io_info_t *io_info, hsize_t mpi_buf_count, H5S_t H5_ATTR_UNUSED *file_space, H5S_t H5_ATTR_UNUSED *mem_space) { - const H5D_contig_storage_t *store_contig = - &(io_info->store->contig); /* Contiguous storage info for this I/O operation */ - herr_t ret_value = SUCCEED; + const void *wbuf = NULL; + herr_t ret_value = SUCCEED; FUNC_ENTER_PACKAGE + /* memory addr from a piece with lowest file addr */ + wbuf = io_info->base_maddr.cvp; + /*OKAY: CAST DISCARDS CONST QUALIFIER*/ H5_CHECK_OVERFLOW(mpi_buf_count, hsize_t, size_t); - if (H5F_shared_block_write(io_info->f_sh, H5FD_MEM_DRAW, store_contig->dset_addr, (size_t)mpi_buf_count, - io_info->u.wbuf) < 0) + if (H5F_shared_block_write(io_info->f_sh, H5FD_MEM_DRAW, io_info->store_faddr, (size_t)mpi_buf_count, + wbuf) < 0) HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "can't finish collective parallel write") done: @@ -937,17 +958,15 @@ done: * Function: H5D__mpio_get_sum_chunk * * Purpose: Routine for obtaining total number of chunks to cover - * hyperslab selection selected by all processors. + * hyperslab selection selected by all processors. Operates + * on all datasets in the operation. * * Return: Non-negative on success/Negative on failure * - * Programmer: Muqun Yang - * Monday, Feb. 
13th, 2006 - * *------------------------------------------------------------------------- */ static herr_t -H5D__mpio_get_sum_chunk(const H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, int *sum_chunkf) +H5D__mpio_get_sum_chunk(const H5D_io_info_t *io_info, int *sum_chunkf) { int num_chunkf; /* Number of chunks to iterate over */ size_t ori_num_chunkf; @@ -958,7 +977,7 @@ H5D__mpio_get_sum_chunk(const H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, /* Get the number of chunks to perform I/O on */ num_chunkf = 0; - ori_num_chunkf = H5SL_count(fm->sel_chunks); + ori_num_chunkf = io_info->pieces_added; H5_CHECKED_ASSIGN(num_chunkf, int, ori_num_chunkf, size_t); /* Determine the summation of number of chunks for all processes */ @@ -971,85 +990,47 @@ done: } /* end H5D__mpio_get_sum_chunk() */ /*------------------------------------------------------------------------- - * Function: H5D__contig_collective_read - * - * Purpose: Reads directly from contiguous data in file into application - * memory using collective I/O. - * - * Return: Non-negative on success/Negative on failure - * - * Programmer: Quincey Koziol - * Tuesday, March 4, 2008 - * - *------------------------------------------------------------------------- - */ -herr_t -H5D__contig_collective_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, - hsize_t H5_ATTR_UNUSED nelmts, H5S_t *file_space, H5S_t *mem_space, - H5D_chunk_map_t H5_ATTR_UNUSED *fm) -{ - H5D_mpio_actual_io_mode_t actual_io_mode = H5D_MPIO_CONTIGUOUS_COLLECTIVE; - herr_t ret_value = SUCCEED; /* Return value */ - - FUNC_ENTER_PACKAGE - - /* Sanity check */ - HDassert(H5F_HAS_FEATURE(io_info->dset->oloc.file, H5FD_FEAT_HAS_MPI)); - - /* Call generic internal collective I/O routine */ - if (H5D__inter_collective_io(io_info, type_info, file_space, mem_space) < 0) - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't finish shared collective MPI-IO") - - /* Set the actual I/O mode property. internal_collective_io will not break to - * independent I/O, so we set it here. - */ - H5CX_set_mpio_actual_io_mode(actual_io_mode); - -done: - FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D__contig_collective_read() */ - -/*------------------------------------------------------------------------- - * Function: H5D__contig_collective_write + * Function: H5D__mpio_get_sum_chunk_dset * - * Purpose: Write directly to contiguous data in file from application - * memory using collective I/O. + * Purpose: Routine for obtaining total number of chunks to cover + * hyperslab selection selected by all processors. Operates + * on a single dataset. 
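Before the body below, a worked example with hypothetical numbers of how this helper's result feeds the one-link decision in H5D__piece_io:

    /* Hypothetical: 4 ranks, each with 300 selected chunks in this dataset */
    int      mpi_size   = 4;
    int      sum_chunkf = 300 * mpi_size; /* what the MPI_Allreduce below produces */
    unsigned avg_chunks = (unsigned)sum_chunkf / (unsigned)mpi_size; /* == 300 */

    /* H5D__piece_io: one-link chunk I/O iff avg_chunks >= one_link_chunk_io_threshold */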
* * Return: Non-negative on success/Negative on failure * - * Programmer: Quincey Koziol - * Tuesday, March 4, 2008 - * *------------------------------------------------------------------------- */ -herr_t -H5D__contig_collective_write(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, - hsize_t H5_ATTR_UNUSED nelmts, H5S_t *file_space, H5S_t *mem_space, - H5D_chunk_map_t H5_ATTR_UNUSED *fm) +static herr_t +H5D__mpio_get_sum_chunk_dset(const H5D_io_info_t *io_info, const H5D_dset_io_info_t *dset_info, + int *sum_chunkf) { - H5D_mpio_actual_io_mode_t actual_io_mode = H5D_MPIO_CONTIGUOUS_COLLECTIVE; - herr_t ret_value = SUCCEED; /* Return value */ + int num_chunkf; /* Number of chunks to iterate over */ + size_t ori_num_chunkf; + int mpi_code; /* MPI return code */ + herr_t ret_value = SUCCEED; FUNC_ENTER_PACKAGE - /* Sanity check */ - HDassert(H5F_HAS_FEATURE(io_info->dset->oloc.file, H5FD_FEAT_HAS_MPI)); + /* Check for non-chunked dataset, in this case we know the number of "chunks" + * is simply the mpi size */ + HDassert(dset_info->layout->type == H5D_CHUNKED); - /* Call generic internal collective I/O routine */ - if (H5D__inter_collective_io(io_info, type_info, file_space, mem_space) < 0) - HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "couldn't finish shared collective MPI-IO") + /* Get the number of chunks to perform I/O on */ + num_chunkf = 0; + ori_num_chunkf = H5SL_count(dset_info->layout_io_info.chunk_map->dset_sel_pieces); + H5_CHECKED_ASSIGN(num_chunkf, int, ori_num_chunkf, size_t); - /* Set the actual I/O mode property. internal_collective_io will not break to - * independent I/O, so we set it here. - */ - H5CX_set_mpio_actual_io_mode(actual_io_mode); + /* Determine the summation of number of chunks for all processes */ + if (MPI_SUCCESS != + (mpi_code = MPI_Allreduce(&num_chunkf, sum_chunkf, 1, MPI_INT, MPI_SUM, io_info->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) done: FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D__contig_collective_write() */ +} /* end H5D__mpio_get_sum_chunk_dset() */ /*------------------------------------------------------------------------- - * Function: H5D__chunk_collective_io + * Function: H5D__piece_io * * Purpose: Routine for * 1) choose an IO option: @@ -1085,34 +1066,34 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, H5D_chunk_map_t *fm) +H5D__piece_io(H5D_io_info_t *io_info) { H5FD_mpio_chunk_opt_t chunk_opt_mode; #ifdef H5Dmpio_DEBUG hbool_t log_file_flag = FALSE; FILE *debug_log_file = NULL; #endif -#ifdef H5_HAVE_INSTRUMENTED_LIBRARY - htri_t temp_not_link_io = FALSE; -#endif - int io_option = H5D_MULTI_CHUNK_IO_MORE_OPT; - int sum_chunk = -1; - int mpi_rank; - int mpi_size; - herr_t ret_value = SUCCEED; + int io_option = H5D_MULTI_CHUNK_IO_MORE_OPT; + hbool_t recalc_io_option = FALSE; + hbool_t use_multi_dset = FALSE; + unsigned one_link_chunk_io_threshold; /* Threshold to use single collective I/O for all chunks */ + int sum_chunk = -1; + int mpi_rank; + int mpi_size; + size_t i; + herr_t ret_value = SUCCEED; FUNC_ENTER_PACKAGE /* Sanity checks */ HDassert(io_info); HDassert(io_info->using_mpi_vfd); - HDassert(type_info); - HDassert(fm); + HDassert(io_info->count > 0); /* Obtain the current rank of the process and the number of ranks */ - if ((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0) + if ((mpi_rank = H5F_mpi_get_rank(io_info->dsets_info[0].dset->oloc.file)) < 0) 
HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain MPI rank") - if ((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0) + if ((mpi_size = H5F_mpi_get_size(io_info->dsets_info[0].dset->oloc.file)) < 0) HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain MPI size") #ifdef H5Dmpio_DEBUG @@ -1139,7 +1120,9 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf } #endif - /* Check the optional property list for the collective chunk IO optimization option */ + /* Check the optional property list for the collective chunk IO optimization option. + * Only set here if it's a static option, if it needs to be calculated using the + * number of chunks per process delay that calculation until later. */ if (H5CX_get_mpio_chunk_opt_mode(&chunk_opt_mode) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't get chunk optimization option") @@ -1148,81 +1131,190 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf /* direct request to multi-chunk-io */ else if (H5FD_MPIO_CHUNK_MULTI_IO == chunk_opt_mode) io_option = H5D_MULTI_CHUNK_IO; - /* via default path. branch by num threshold */ - else { - unsigned one_link_chunk_io_threshold; /* Threshold to use single collective I/O for all chunks */ + else + recalc_io_option = TRUE; - if (H5D__mpio_get_sum_chunk(io_info, fm, &sum_chunk) < 0) - HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, - "unable to obtain the total chunk number of all processes"); + /* Check if we can and should use multi dataset path */ + if (io_info->count > 1 && (io_option == H5D_ONE_LINK_CHUNK_IO || recalc_io_option)) { + /* Use multi dataset path for now */ + use_multi_dset = TRUE; - /* Get the chunk optimization option threshold */ - if (H5CX_get_mpio_chunk_opt_num(&one_link_chunk_io_threshold) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, - "couldn't get chunk optimization option threshold value") + /* Check for filtered datasets */ + for (i = 0; i < io_info->count; i++) + if (io_info->dsets_info[i].dset->shared->dcpl_cache.pline.nused > 0) { + use_multi_dset = FALSE; + break; + } - /* step 1: choose an IO option */ - /* If the average number of chunk per process is greater than a threshold, we will do one link chunked - * IO. */ - if ((unsigned)sum_chunk / (unsigned)mpi_size >= one_link_chunk_io_threshold) - io_option = H5D_ONE_LINK_CHUNK_IO_MORE_OPT; -#ifdef H5_HAVE_INSTRUMENTED_LIBRARY - else - temp_not_link_io = TRUE; -#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */ - } /* end else */ + /* Check if this I/O exceeds one linked chunk threshold */ + if (recalc_io_option && use_multi_dset) { + /* Get the chunk optimization option threshold */ + if (H5CX_get_mpio_chunk_opt_num(&one_link_chunk_io_threshold) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, + "couldn't get chunk optimization option threshold value") + + /* If the threshold is 0, no need to check number of chunks */ + if (one_link_chunk_io_threshold > 0) { + /* Get number of chunks for all processes */ + if (H5D__mpio_get_sum_chunk(io_info, &sum_chunk) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, + "unable to obtain the total chunk number of all processes"); + + /* If the average number of chunk per process is less than the threshold, we will do multi + * chunk IO. If this threshold is not exceeded for all datasets, no need to check it again + * for each individual dataset. 
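Stepping back, the branching that this and the following hunks implement can be summarized as below (a condensed sketch of the control flow, not library code):

    /* H5D__piece_io dispatch, condensed:
     *
     *   multiple datasets, none filtered, one-link requested or threshold met
     *       -> H5D__link_piece_collective_io()        (one linked I/O, all pieces)
     *   otherwise, per dataset:
     *       contiguous          -> H5D__inter_collective_io()
     *       chunked, filtered   -> link- or multi-chunk *filtered* path
     *       chunked, unfiltered -> H5D__link_piece_collective_io() if single
     *                              dataset, else H5D__multi_chunk_collective_io()
     */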
*/ + if ((unsigned)sum_chunk / (unsigned)mpi_size < one_link_chunk_io_threshold) { + recalc_io_option = FALSE; + use_multi_dset = FALSE; + } + } + } + /* Perform multi dataset I/O if appropriate */ + if (use_multi_dset) { #ifdef H5_HAVE_INSTRUMENTED_LIBRARY - { - /*** Set collective chunk user-input optimization APIs. ***/ - if (H5D_ONE_LINK_CHUNK_IO == io_option) { - if (H5CX_test_set_mpio_coll_chunk_link_hard(0) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "unable to set property value") - } /* end if */ - else if (H5D_MULTI_CHUNK_IO == io_option) { - if (H5CX_test_set_mpio_coll_chunk_multi_hard(0) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "unable to set property value") - } /* end else-if */ - else if (H5D_ONE_LINK_CHUNK_IO_MORE_OPT == io_option) { - if (H5CX_test_set_mpio_coll_chunk_link_num_true(0) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "unable to set property value") - } /* end if */ - else if (temp_not_link_io) { - if (H5CX_test_set_mpio_coll_chunk_link_num_false(0) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "unable to set property value") - } /* end if */ + /*** Set collective chunk user-input optimization API. ***/ + if (H5D_ONE_LINK_CHUNK_IO == io_option) { + if (H5CX_test_set_mpio_coll_chunk_link_hard(0) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "unable to set property value") + } /* end if */ +#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */ + + /* Perform unfiltered link chunk collective IO */ + if (H5D__link_piece_collective_io(io_info, mpi_rank) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish linked chunk MPI-IO") + } } -#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */ - /* step 2: Go ahead to do IO.*/ - switch (io_option) { - case H5D_ONE_LINK_CHUNK_IO: - case H5D_ONE_LINK_CHUNK_IO_MORE_OPT: - /* Check if there are any filters in the pipeline */ - if (io_info->dset->shared->dcpl_cache.pline.nused > 0) { - if (H5D__link_chunk_filtered_collective_io(io_info, type_info, fm, mpi_rank, mpi_size) < 0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish filtered linked chunk MPI-IO") - } /* end if */ - else - /* Perform unfiltered link chunk collective IO */ - if (H5D__link_chunk_collective_io(io_info, type_info, fm, sum_chunk, mpi_rank, mpi_size) < 0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish linked chunk MPI-IO") - break; + if (!use_multi_dset) { + /* Loop over datasets */ + for (i = 0; i < io_info->count; i++) { + if (io_info->dsets_info[i].layout->type == H5D_CONTIGUOUS) { + /* Contiguous: call H5D__inter_collective_io() directly */ + H5D_mpio_actual_io_mode_t actual_io_mode = H5D_MPIO_CONTIGUOUS_COLLECTIVE; - case H5D_MULTI_CHUNK_IO: /* direct request to do multi-chunk IO */ - default: /* multiple chunk IO via threshold */ - /* Check if there are any filters in the pipeline */ - if (io_info->dset->shared->dcpl_cache.pline.nused > 0) { - if (H5D__multi_chunk_filtered_collective_io(io_info, type_info, fm, mpi_rank, mpi_size) < 0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, - "couldn't finish optimized multiple filtered chunk MPI-IO") - } /* end if */ - else - /* Perform unfiltered multi chunk collective IO */ - if (H5D__multi_chunk_collective_io(io_info, type_info, fm, mpi_rank, mpi_size) < 0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish optimized multiple chunk MPI-IO") - break; - } /* end switch */ + io_info->store_faddr = io_info->dsets_info[i].store->contig.dset_addr; + io_info->base_maddr = io_info->dsets_info[i].buf; + + if (H5D__inter_collective_io(io_info, &io_info->dsets_info[i], + 
io_info->dsets_info[i].file_space, + io_info->dsets_info[i].mem_space) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish shared collective MPI-IO") + + /* Set the actual I/O mode property. internal_collective_io will not break to + * independent I/O, so we set it here. + */ + H5CX_set_mpio_actual_io_mode(actual_io_mode); + } + else { + /* Chunked I/O path */ + HDassert(io_info->dsets_info[i].layout->type == H5D_CHUNKED); + + /* Recalculate io_option if necessary */ + if (recalc_io_option) { + /* Get the chunk optimization option threshold */ + if (H5CX_get_mpio_chunk_opt_num(&one_link_chunk_io_threshold) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, + "couldn't get chunk optimization option threshold value") + + /* If the threshold is 0, no need to check number of chunks */ + if (one_link_chunk_io_threshold == 0) { + io_option = H5D_ONE_LINK_CHUNK_IO_MORE_OPT; + recalc_io_option = FALSE; + } + else { + /* Get number of chunks for all processes */ + if (H5D__mpio_get_sum_chunk_dset(io_info, &io_info->dsets_info[i], &sum_chunk) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, + "unable to obtain the total chunk number of all processes"); + + /* step 1: choose an IO option */ + /* If the average number of chunk per process is greater than a threshold, we will do + * one link chunked IO. */ + if ((unsigned)sum_chunk / (unsigned)mpi_size >= one_link_chunk_io_threshold) + io_option = H5D_ONE_LINK_CHUNK_IO_MORE_OPT; + else + io_option = H5D_MULTI_CHUNK_IO_MORE_OPT; + } + } + + /* step 2: Go ahead to do IO.*/ + switch (io_option) { + case H5D_ONE_LINK_CHUNK_IO: + case H5D_ONE_LINK_CHUNK_IO_MORE_OPT: + /* Check if there are any filters in the pipeline */ + if (io_info->dsets_info[i].dset->shared->dcpl_cache.pline.nused > 0) { + if (H5D__link_chunk_filtered_collective_io(io_info, &io_info->dsets_info[i], + mpi_rank, mpi_size) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, + "couldn't finish filtered linked chunk MPI-IO") + } /* end if */ + else { + /* If there is more than one dataset we cannot make the multi dataset call here, + * fall back to multi chunk */ + if (io_info->count > 1) { + io_option = H5D_MULTI_CHUNK_IO_MORE_OPT; + recalc_io_option = TRUE; + + if (H5D__multi_chunk_collective_io(io_info, &io_info->dsets_info[i], mpi_rank, + mpi_size) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, + "couldn't finish optimized multiple chunk MPI-IO") + } + else { + /* Perform unfiltered link chunk collective IO */ + if (H5D__link_piece_collective_io(io_info, mpi_rank) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, + "couldn't finish linked chunk MPI-IO") + } + } + + break; + + case H5D_MULTI_CHUNK_IO: /* direct request to do multi-chunk IO */ + default: /* multiple chunk IO via threshold */ + /* Check if there are any filters in the pipeline */ + if (io_info->dsets_info[i].dset->shared->dcpl_cache.pline.nused > 0) { + if (H5D__multi_chunk_filtered_collective_io(io_info, &io_info->dsets_info[i], + mpi_rank, mpi_size) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, + "couldn't finish optimized multiple filtered chunk MPI-IO") + } /* end if */ + else { + /* Perform unfiltered multi chunk collective IO */ + if (H5D__multi_chunk_collective_io(io_info, &io_info->dsets_info[i], mpi_rank, + mpi_size) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, + "couldn't finish optimized multiple chunk MPI-IO") + } + + break; + } /* end switch */ + +#ifdef H5_HAVE_INSTRUMENTED_LIBRARY + { + /*** Set collective chunk user-input optimization APIs. 
***/ + if (H5D_ONE_LINK_CHUNK_IO == io_option) { + if (H5CX_test_set_mpio_coll_chunk_link_hard(0) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "unable to set property value") + } /* end if */ + else if (H5D_MULTI_CHUNK_IO == io_option) { + if (H5CX_test_set_mpio_coll_chunk_multi_hard(0) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "unable to set property value") + } /* end else-if */ + else if (H5D_ONE_LINK_CHUNK_IO_MORE_OPT == io_option) { + if (H5CX_test_set_mpio_coll_chunk_link_num_true(0) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "unable to set property value") + } /* end if */ + else if (H5D_MULTI_CHUNK_IO_MORE_OPT == io_option) { + if (H5CX_test_set_mpio_coll_chunk_link_num_false(0) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "unable to set property value") + } /* end if */ + } +#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */ + } + } + } done: #ifdef H5Dmpio_DEBUG @@ -1236,13 +1328,13 @@ done: #endif FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D__chunk_collective_io */ +} /* end H5D__piece_io */ /*------------------------------------------------------------------------- - * Function: H5D__chunk_collective_read + * Function: H5D__collective_read * - * Purpose: Reads directly from chunks in file into application memory - * using collective I/O. + * Purpose: Read directly from pieces (chunks/contig) in file into + * application memory using collective I/O. * * Return: Non-negative on success/Negative on failure * @@ -1252,27 +1344,25 @@ done: *------------------------------------------------------------------------- */ herr_t -H5D__chunk_collective_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, - hsize_t H5_ATTR_UNUSED nelmts, H5S_t *file_space, H5S_t *mem_space, - H5D_chunk_map_t H5_ATTR_UNUSED *fm) +H5D__collective_read(H5D_io_info_t *io_info) { herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_PACKAGE /* Call generic selection operation */ - if (H5D__chunk_collective_io(io_info, type_info, fm) < 0) + if (H5D__piece_io(io_info) < 0) HGOTO_ERROR(H5E_DATASPACE, H5E_READERROR, FAIL, "read error") done: FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D__chunk_collective_read() */ +} /* end H5D__collective_read() */ /*------------------------------------------------------------------------- - * Function: H5D__chunk_collective_write + * Function: H5D__collective_write * - * Purpose: Write directly to chunks in file from application memory - * using collective I/O. + * Purpose: Write directly to pieces (chunks/contig) in file from + * application memory using collective I/O. 
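These thin wrappers pass everything through H5D_io_info_t, which replaces the old per-call argument lists. A rough, abridged view of the fields this file now leans on (illustrative only; the real declaration lives in the package header):

    /* Abridged for orientation -- not the actual declaration */
    struct /* H5D_io_info_t */ {
        size_t                   count;        /* number of datasets in this I/O      */
        H5D_dset_io_info_t      *dsets_info;   /* per-dataset spaces, type info, buf  */
        H5D_piece_info_t       **sel_pieces;   /* selected pieces across all datasets */
        size_t                   pieces_added; /* number of entries in sel_pieces     */
        haddr_t                  store_faddr;  /* base file address for the transfer  */
        H5_flexible_const_ptr_t  base_maddr;   /* base memory address (lowest faddr)  */
        H5D_io_op_type_t         op_type;      /* H5D_IO_OP_READ or H5D_IO_OP_WRITE   */
    };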
* * Return: Non-negative on success/Negative on failure * @@ -1282,31 +1372,30 @@ done: *------------------------------------------------------------------------- */ herr_t -H5D__chunk_collective_write(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, - hsize_t H5_ATTR_UNUSED nelmts, H5S_t H5_ATTR_UNUSED *file_space, - H5S_t H5_ATTR_UNUSED *mem_space, H5D_chunk_map_t *fm) +H5D__collective_write(H5D_io_info_t *io_info) { herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_PACKAGE /* Call generic selection operation */ - if (H5D__chunk_collective_io(io_info, type_info, fm) < 0) + if (H5D__piece_io(io_info) < 0) HGOTO_ERROR(H5E_DATASPACE, H5E_WRITEERROR, FAIL, "write error") done: FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D__chunk_collective_write() */ +} /* end H5D__collective_write() */ /*------------------------------------------------------------------------- - * Function: H5D__link_chunk_collective_io + * Function: H5D__link_piece_collective_io * - * Purpose: Routine for one collective IO with one MPI derived datatype to link with all chunks + * Purpose: Routine for single collective IO with one MPI derived datatype + * to link with all pieces (chunks + contig) * - * 1. Sort the chunk address and chunk info - * 2. Build up MPI derived datatype for each chunk - * 3. Build up the final MPI derived datatype - * 4. Use common collective IO routine to do MPI-IO + * 1. Use the piece addresses and piece info sorted in skiplist + * 2. Build up MPI derived datatype for each chunk + * 3. Build up the final MPI derived datatype + * 4. Use common collective IO routine to do MPI-IO * * Return: Non-negative on success/Negative on failure * @@ -1316,121 +1405,100 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, H5D_chunk_map_t *fm, - int sum_chunk, int mpi_rank, int mpi_size) +H5D__link_piece_collective_io(H5D_io_info_t *io_info, int mpi_rank) { - H5D_chunk_addr_info_t *chunk_addr_info_array = NULL; - MPI_Datatype chunk_final_mtype; /* Final memory MPI datatype for all chunks with selection */ - hbool_t chunk_final_mtype_is_derived = FALSE; - MPI_Datatype chunk_final_ftype; /* Final file MPI datatype for all chunks with selection */ - hbool_t chunk_final_ftype_is_derived = FALSE; - H5D_storage_t ctg_store; /* Storage info for "fake" contiguous dataset */ - size_t total_chunks; - MPI_Datatype *chunk_mtype = NULL; - MPI_Datatype *chunk_ftype = NULL; - MPI_Aint *chunk_disp_array = NULL; - MPI_Aint *chunk_mem_disp_array = NULL; - hbool_t *chunk_mft_is_derived_array = + MPI_Datatype chunk_final_mtype; /* Final memory MPI datatype for all chunks with selection */ + hbool_t chunk_final_mtype_is_derived = FALSE; + MPI_Datatype chunk_final_ftype; /* Final file MPI datatype for all chunks with selection */ + hbool_t chunk_final_ftype_is_derived = FALSE; + H5D_storage_t ctg_store; /* Storage info for "fake" contiguous dataset */ + MPI_Datatype *chunk_mtype = NULL; + MPI_Datatype *chunk_ftype = NULL; + MPI_Aint *chunk_file_disp_array = NULL; + MPI_Aint *chunk_mem_disp_array = NULL; + hbool_t *chunk_mft_is_derived_array = NULL; /* Flags to indicate each chunk's MPI file datatype is derived */ hbool_t *chunk_mbt_is_derived_array = - NULL; /* Flags to indicate each chunk's MPI memory datatype is derived */ - int *chunk_mpi_file_counts = NULL; /* Count of MPI file datatype for each chunk */ - int *chunk_mpi_mem_counts = NULL; /* Count of MPI memory datatype for each 
chunk */ - int mpi_code; /* MPI return code */ - herr_t ret_value = SUCCEED; + NULL; /* Flags to indicate each chunk's MPI memory datatype is derived */ + int *chunk_mpi_file_counts = NULL; /* Count of MPI file datatype for each chunk */ + int *chunk_mpi_mem_counts = NULL; /* Count of MPI memory datatype for each chunk */ + int mpi_code; /* MPI return code */ + H5D_mpio_actual_chunk_opt_mode_t actual_chunk_opt_mode = H5D_MPIO_LINK_CHUNK; + H5D_mpio_actual_io_mode_t actual_io_mode = 0; + size_t i; /* Local index variable */ + herr_t ret_value = SUCCEED; FUNC_ENTER_PACKAGE + /* set actual_io_mode */ + for (i = 0; i < io_info->count; i++) { + HDassert(io_info->dsets_info[i].dset->shared->dcpl_cache.pline.nused == 0); + if (io_info->dsets_info[i].layout->type == H5D_CHUNKED) + actual_io_mode |= H5D_MPIO_CHUNK_COLLECTIVE; + else if (io_info->dsets_info[i].layout->type == H5D_CONTIGUOUS) { + actual_io_mode |= H5D_MPIO_CONTIGUOUS_COLLECTIVE; + + /* if only single-dset */ + if (1 == io_info->count) + actual_chunk_opt_mode = H5D_MPIO_NO_CHUNK_OPTIMIZATION; + } + else + HGOTO_ERROR(H5E_IO, H5E_UNSUPPORTED, FAIL, "unsupported storage layout") + } + /* Set the actual-chunk-opt-mode property. */ - H5CX_set_mpio_actual_chunk_opt(H5D_MPIO_LINK_CHUNK); + H5CX_set_mpio_actual_chunk_opt(actual_chunk_opt_mode); /* Set the actual-io-mode property. * Link chunk I/O does not break to independent, so can set right away */ - H5CX_set_mpio_actual_io_mode(H5D_MPIO_CHUNK_COLLECTIVE); - - /* Get the sum # of chunks, if not already available */ - if (sum_chunk < 0) { - if (H5D__mpio_get_sum_chunk(io_info, fm, &sum_chunk) < 0) - HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, - "unable to obtain the total chunk number of all processes"); - } /* end if */ - - /* Retrieve total # of chunks in dataset */ - H5_CHECKED_ASSIGN(total_chunks, size_t, fm->layout->u.chunk.nchunks, hsize_t); - - /* Handle special case when dataspace dimensions only allow one chunk in - * the dataset. 
[This sometimes is used by developers who want the - * equivalent of compressed contiguous datasets - QAK] - */ - if (total_chunks == 1) { - H5SL_node_t *chunk_node; /* Pointer to chunk node for selection */ - H5S_t *fspace; /* Dataspace describing chunk & selection in it */ - H5S_t *mspace; /* Dataspace describing selection in memory corresponding to this chunk */ - - /* Check for this process having selection in this chunk */ - chunk_node = H5SL_first(fm->sel_chunks); - - if (chunk_node == NULL) { - /* Set the dataspace info for I/O to NULL, this process doesn't have any I/O to perform */ - fspace = mspace = NULL; - - /* Initialize chunk address */ - ctg_store.contig.dset_addr = 0; - } /* end if */ - else { - H5D_chunk_ud_t udata; /* User data for querying chunk info */ - H5D_chunk_info_t *chunk_info; /* Info for chunk in skiplist */ - - /* Get the chunk info, for the selection in the chunk */ - if (NULL == (chunk_info = (H5D_chunk_info_t *)H5SL_item(chunk_node))) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk info from skip list") - - /* Set the dataspace info for I/O */ - fspace = chunk_info->fspace; - mspace = chunk_info->mspace; - - /* Look up address of chunk */ - if (H5D__chunk_lookup(io_info->dset, chunk_info->scaled, &udata) < 0) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk address") - ctg_store.contig.dset_addr = udata.chunk_block.offset; - } /* end else */ - - /* Set up the base storage address for this chunk */ - io_info->store = &ctg_store; - -#ifdef H5Dmpio_DEBUG - H5D_MPIO_DEBUG(mpi_rank, "before inter_collective_io for total chunk = 1"); -#endif + H5CX_set_mpio_actual_io_mode(actual_io_mode); - /* Perform I/O */ - if (H5D__inter_collective_io(io_info, type_info, fspace, mspace) < 0) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't finish shared collective MPI-IO") - } /* end if */ - else { + /* Code block for actual actions (Build a MPI Type, IO) */ + { hsize_t mpi_buf_count; /* Number of MPI types */ size_t num_chunk; /* Number of chunks for this process */ - size_t u; /* Local index variable */ + + H5D_piece_info_t *piece_info; + + /* local variable for base address for buffer */ + H5_flexible_const_ptr_t base_buf_addr; + base_buf_addr.cvp = NULL; /* Get the number of chunks with a selection */ - num_chunk = H5SL_count(fm->sel_chunks); + num_chunk = io_info->pieces_added; H5_CHECK_OVERFLOW(num_chunk, size_t, int); #ifdef H5Dmpio_DEBUG - H5D_MPIO_DEBUG_VA(mpi_rank, "total_chunks = %zu, num_chunk = %zu", total_chunks, num_chunk); + H5D_MPIO_DEBUG_VA(mpi_rank, "num_chunk = %zu\n", num_chunk); #endif /* Set up MPI datatype for chunks selected */ if (num_chunk) { + hbool_t need_sort = FALSE; + + /* Check if sel_pieces array is sorted */ + HDassert(io_info->sel_pieces[0]->faddr != HADDR_UNDEF); + for (i = 1; i < num_chunk; i++) { + HDassert(io_info->sel_pieces[i]->faddr != HADDR_UNDEF); + + if (io_info->sel_pieces[i]->faddr < io_info->sel_pieces[i - 1]->faddr) { + need_sort = TRUE; + break; + } + } + + /* Sort sel_pieces if necessary */ + if (need_sort) + HDqsort(io_info->sel_pieces, io_info->pieces_added, sizeof(io_info->sel_pieces[0]), + H5D__cmp_piece_addr); + /* Allocate chunking information */ - if (NULL == (chunk_addr_info_array = - (H5D_chunk_addr_info_t *)H5MM_malloc(num_chunk * sizeof(H5D_chunk_addr_info_t)))) - HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk array buffer") if (NULL == (chunk_mtype = (MPI_Datatype *)H5MM_malloc(num_chunk * sizeof(MPI_Datatype)))) HGOTO_ERROR(H5E_DATASET, 
H5E_CANTALLOC, FAIL, "couldn't allocate chunk memory datatype buffer") if (NULL == (chunk_ftype = (MPI_Datatype *)H5MM_malloc(num_chunk * sizeof(MPI_Datatype)))) HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk file datatype buffer") - if (NULL == (chunk_disp_array = (MPI_Aint *)H5MM_malloc(num_chunk * sizeof(MPI_Aint)))) + if (NULL == (chunk_file_disp_array = (MPI_Aint *)H5MM_malloc(num_chunk * sizeof(MPI_Aint)))) HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk file displacement buffer") if (NULL == (chunk_mem_disp_array = (MPI_Aint *)H5MM_calloc(num_chunk * sizeof(MPI_Aint)))) @@ -1447,36 +1515,37 @@ H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *typ HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk file is derived datatype flags buffer") -#ifdef H5Dmpio_DEBUG - H5D_MPIO_DEBUG(mpi_rank, "before sorting chunk addresses"); -#endif + /* save lowest file address */ + ctg_store.contig.dset_addr = io_info->sel_pieces[0]->faddr; - /* Sort the chunk address */ - if (H5D__sort_chunk(io_info, fm, chunk_addr_info_array, sum_chunk, mpi_rank, mpi_size) < 0) - HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to sort chunk address") - ctg_store.contig.dset_addr = chunk_addr_info_array[0].chunk_addr; + /* save base mem addr of piece for read/write */ + base_buf_addr = io_info->sel_pieces[0]->dset_info->buf; #ifdef H5Dmpio_DEBUG - H5D_MPIO_DEBUG(mpi_rank, "after sorting chunk addresses"); + H5D_MPIO_DEBUG(mpi_rank, "before iterate over selected pieces\n"); #endif - /* Obtain MPI derived datatype from all individual chunks */ - for (u = 0; u < num_chunk; u++) { + /* Obtain MPI derived datatype from all individual pieces */ + /* Iterate over selected pieces for this process */ + for (i = 0; i < num_chunk; i++) { hsize_t *permute_map = NULL; /* array that holds the mapping from the old, out-of-order displacements to the in-order displacements of the MPI datatypes of the point selection of the file space */ hbool_t is_permuted = FALSE; + /* Assign convenience pointer to piece info */ + piece_info = io_info->sel_pieces[i]; + /* Obtain disk and memory MPI derived datatype */ /* NOTE: The permute_map array can be allocated within H5S_mpio_space_type * and will be fed into the next call to H5S_mpio_space_type * where it will be freed. 
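The loop here turns each piece's file selection into an MPI datatype and records its displacement from the lowest file address; the per-piece types are then stitched into one linked type. A self-contained toy version of that final stitching step (two fabricated byte regions, plain MPI calls without the HDF5 error macros):

    /* Two pieces of 512 bytes at file offsets 0 and 4096 (relative to base) */
    int          counts[2] = {512, 512};
    MPI_Aint     disps[2]  = {0, 4096};       /* piece faddr - lowest faddr */
    MPI_Datatype types[2]  = {MPI_BYTE, MPI_BYTE};
    MPI_Datatype linked_ftype;

    MPI_Type_create_struct(2, counts, disps, types, &linked_ftype);
    MPI_Type_commit(&linked_ftype);
    /* ... set as the file type for a single collective read or write ... */
    MPI_Type_free(&linked_ftype);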
*/ - if (H5S_mpio_space_type(chunk_addr_info_array[u].chunk_info.fspace, type_info->src_type_size, - &chunk_ftype[u], /* OUT: datatype created */ - &chunk_mpi_file_counts[u], /* OUT */ - &(chunk_mft_is_derived_array[u]), /* OUT */ + if (H5S_mpio_space_type(piece_info->fspace, piece_info->dset_info->type_info.src_type_size, + &chunk_ftype[i], /* OUT: datatype created */ + &chunk_mpi_file_counts[i], /* OUT */ + &(chunk_mft_is_derived_array[i]), /* OUT */ TRUE, /* this is a file space, so permute the datatype if the point @@ -1488,12 +1557,13 @@ H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *typ are out of order */ &is_permuted /* OUT */) < 0) HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI file type") + /* Sanity check */ if (is_permuted) HDassert(permute_map); - if (H5S_mpio_space_type(chunk_addr_info_array[u].chunk_info.mspace, type_info->dst_type_size, - &chunk_mtype[u], &chunk_mpi_mem_counts[u], - &(chunk_mbt_is_derived_array[u]), FALSE, /* this is a memory + if (H5S_mpio_space_type(piece_info->mspace, piece_info->dset_info->type_info.dst_type_size, + &chunk_mtype[i], &chunk_mpi_mem_counts[i], + &(chunk_mbt_is_derived_array[i]), FALSE, /* this is a memory space, so if the file space is not permuted, there is no @@ -1512,19 +1582,27 @@ H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *typ if (is_permuted) HDassert(!permute_map); - /* Chunk address relative to the first chunk */ - chunk_addr_info_array[u].chunk_addr -= ctg_store.contig.dset_addr; + /* Piece address relative to the first piece addr + * Assign piece address to MPI displacement + * (assume MPI_Aint big enough to hold it) */ + chunk_file_disp_array[i] = (MPI_Aint)piece_info->faddr - (MPI_Aint)ctg_store.contig.dset_addr; - /* Assign chunk address to MPI displacement */ - /* (assume MPI_Aint big enough to hold it) */ - chunk_disp_array[u] = (MPI_Aint)chunk_addr_info_array[u].chunk_addr; + if (io_info->op_type == H5D_IO_OP_WRITE) { + chunk_mem_disp_array[i] = + (MPI_Aint)piece_info->dset_info->buf.cvp - (MPI_Aint)base_buf_addr.cvp; + } + else if (io_info->op_type == H5D_IO_OP_READ) { + chunk_mem_disp_array[i] = + (MPI_Aint)piece_info->dset_info->buf.vp - (MPI_Aint)base_buf_addr.vp; + } } /* end for */ /* Create final MPI derived datatype for the file */ if (MPI_SUCCESS != - (mpi_code = MPI_Type_create_struct((int)num_chunk, chunk_mpi_file_counts, chunk_disp_array, - chunk_ftype, &chunk_final_ftype))) + (mpi_code = MPI_Type_create_struct((int)num_chunk, chunk_mpi_file_counts, + chunk_file_disp_array, chunk_ftype, &chunk_final_ftype))) HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code) + if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_ftype))) HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) chunk_final_ftype_is_derived = TRUE; @@ -1539,13 +1617,13 @@ H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *typ chunk_final_mtype_is_derived = TRUE; /* Free the file & memory MPI datatypes for each chunk */ - for (u = 0; u < num_chunk; u++) { - if (chunk_mbt_is_derived_array[u]) - if (MPI_SUCCESS != (mpi_code = MPI_Type_free(chunk_mtype + u))) + for (i = 0; i < num_chunk; i++) { + if (chunk_mbt_is_derived_array[i]) + if (MPI_SUCCESS != (mpi_code = MPI_Type_free(chunk_mtype + i))) HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) - if (chunk_mft_is_derived_array[u]) - if (MPI_SUCCESS != (mpi_code = MPI_Type_free(chunk_ftype + u))) + if (chunk_mft_is_derived_array[i]) + if (MPI_SUCCESS != (mpi_code = 
MPI_Type_free(chunk_ftype + i))) HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) } /* end for */ @@ -1555,6 +1633,9 @@ H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *typ else { /* no selection at all for this process */ ctg_store.contig.dset_addr = 0; + /* Just provide a valid memory address; no actual I/O occurs */ + base_buf_addr = io_info->dsets_info[0].buf; + /* Set the MPI datatype */ chunk_final_ftype = MPI_BYTE; chunk_final_mtype = MPI_BYTE; @@ -1566,15 +1647,14 @@ H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *typ #ifdef H5Dmpio_DEBUG H5D_MPIO_DEBUG(mpi_rank, "before coming to final collective I/O"); #endif - - /* Set up the base storage address for this chunk */ - io_info->store = &ctg_store; + /* Set up the base storage address for this piece */ + io_info->store_faddr = ctg_store.contig.dset_addr; + io_info->base_maddr = base_buf_addr; /* Perform final collective I/O operation */ - if (H5D__final_collective_io(io_info, type_info, mpi_buf_count, chunk_final_ftype, - chunk_final_mtype) < 0) + if (H5D__final_collective_io(io_info, mpi_buf_count, chunk_final_ftype, chunk_final_mtype) < 0) HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish MPI-IO") - } /* end else */ + } done: #ifdef H5Dmpio_DEBUG @@ -1583,14 +1663,12 @@ done: #endif /* Release resources */ - if (chunk_addr_info_array) - H5MM_xfree(chunk_addr_info_array); if (chunk_mtype) H5MM_xfree(chunk_mtype); if (chunk_ftype) H5MM_xfree(chunk_ftype); - if (chunk_disp_array) - H5MM_xfree(chunk_disp_array); + if (chunk_file_disp_array) + H5MM_xfree(chunk_file_disp_array); if (chunk_mem_disp_array) H5MM_xfree(chunk_mem_disp_array); if (chunk_mpi_mem_counts) @@ -1609,7 +1687,7 @@ done: HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D__link_chunk_collective_io */ +} /* end H5D__link_piece_collective_io */ /*------------------------------------------------------------------------- * Function: H5D__link_chunk_filtered_collective_io @@ -1680,29 +1758,27 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, - H5D_chunk_map_t *fm, int mpi_rank, int mpi_size) +H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, H5D_dset_io_info_t *dset_info, int mpi_rank, + int mpi_size) { - H5D_filtered_collective_io_info_t *chunk_list = NULL; /* The list of chunks being read/written */ - H5D_filtered_collective_io_info_t *chunk_hash_table = NULL; - unsigned char **chunk_msg_bufs = NULL; - H5D_storage_t ctg_store; /* Chunk storage information as contiguous dataset */ - MPI_Datatype mem_type = MPI_BYTE; - MPI_Datatype file_type = MPI_BYTE; - hbool_t mem_type_is_derived = FALSE; + H5D_filtered_collective_io_info_t *chunk_list = NULL; /* The list of chunks being read/written */ + H5D_filtered_collective_io_info_t *chunk_hash_table = NULL; + unsigned char **chunk_msg_bufs = NULL; + MPI_Datatype mem_type = MPI_BYTE; + MPI_Datatype file_type = MPI_BYTE; + hbool_t mem_type_is_derived = FALSE; hbool_t file_type_is_derived = FALSE; size_t *rank_chunks_assigned_map = NULL; size_t chunk_list_num_entries; size_t i; int chunk_msg_bufs_len = 0; - int mpi_code; - herr_t ret_value = SUCCEED; + char fake_buf; /* Used as a fake buffer for ranks with no chunks, thus a NULL buf pointer */ + int mpi_code; + herr_t ret_value = SUCCEED; - FUNC_ENTER_PACKAGE + FUNC_ENTER_PACKAGE_TAG(dset_info->dset->oloc.addr)
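/*
 * For reference, the linked-piece path above can be reduced to a small
 * amount of plain MPI. The sketch below is illustrative only -- the helper
 * name and its parameters are invented for this example -- but it shows the
 * same technique: one struct datatype gluing every selected piece together,
 * with byte displacements taken relative to the lowest piece address (which
 * the code above keeps in io_info->sel_pieces[0]).
 */
#include <stdlib.h>
#include <mpi.h>

static int
build_linked_file_type(int n_pieces, const MPI_Aint piece_addr[], const int piece_len[],
                       const MPI_Datatype piece_type[], MPI_Datatype *linked_type /*out*/)
{
    MPI_Aint *disp;
    int       i, mpi_code;

    if (NULL == (disp = (MPI_Aint *)malloc((size_t)n_pieces * sizeof(MPI_Aint))))
        return -1;

    /* Displacements are relative to the lowest piece address; piece_addr[]
     * is assumed to already be sorted in increasing file-address order */
    for (i = 0; i < n_pieces; i++)
        disp[i] = piece_addr[i] - piece_addr[0];

    /* Link all pieces into one derived datatype so a single collective MPI
     * call can cover every piece; this is what lets MPI-IO aggregate the
     * whole selection into one file access */
    mpi_code = MPI_Type_create_struct(n_pieces, piece_len, disp, piece_type, linked_type);
    if (MPI_SUCCESS == mpi_code)
        mpi_code = MPI_Type_commit(linked_type);

    free(disp);
    return (MPI_SUCCESS == mpi_code) ? 0 : -1;
}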
HDassert(io_info); - HDassert(type_info); - HDassert(fm); #ifdef H5Dmpio_DEBUG H5D_MPIO_TRACE_ENTER(mpi_rank); @@ -1720,12 +1796,12 @@ H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_in H5CX_set_mpio_actual_io_mode(H5D_MPIO_CHUNK_COLLECTIVE); /* Build a list of selected chunks in the collective io operation */ - if (H5D__mpio_collective_filtered_chunk_io_setup(io_info, type_info, fm, &chunk_list, - &chunk_list_num_entries, mpi_rank) < 0) + if (H5D__mpio_collective_filtered_chunk_io_setup(io_info, dset_info, &chunk_list, &chunk_list_num_entries, + mpi_rank) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "couldn't construct filtered I/O info list") if (io_info->op_type == H5D_IO_OP_READ) { /* Filtered collective read */ - if (H5D__mpio_collective_filtered_chunk_read(chunk_list, chunk_list_num_entries, io_info, type_info, + if (H5D__mpio_collective_filtered_chunk_read(chunk_list, chunk_list_num_entries, io_info, dset_info, mpi_rank, mpi_size) < 0) HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "couldn't read filtered chunks") } @@ -1733,17 +1809,17 @@ H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_in H5D_chk_idx_info_t index_info; hsize_t mpi_buf_count; - H5D_MPIO_INIT_CHUNK_IDX_INFO(index_info, io_info); + H5D_MPIO_INIT_CHUNK_IDX_INFO(index_info, dset_info->dset); if (mpi_size > 1) { /* Redistribute shared chunks being written to */ - if (H5D__mpio_redistribute_shared_chunks(chunk_list, chunk_list_num_entries, io_info, fm, - mpi_rank, mpi_size, &rank_chunks_assigned_map) < 0) + if (H5D__mpio_redistribute_shared_chunks(chunk_list, chunk_list_num_entries, io_info, mpi_rank, + mpi_size, &rank_chunks_assigned_map) < 0) HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "unable to redistribute shared chunks") /* Send any chunk modification messages for chunks this rank no longer owns */ if (H5D__mpio_share_chunk_modification_data(chunk_list, &chunk_list_num_entries, io_info, - type_info, mpi_rank, mpi_size, &chunk_hash_table, + dset_info, mpi_rank, mpi_size, &chunk_hash_table, &chunk_msg_bufs, &chunk_msg_bufs_len) < 0) HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "unable to send chunk modification data between MPI ranks") @@ -1758,7 +1834,7 @@ H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_in * must participate. 
*/ if (H5D__mpio_collective_filtered_chunk_update(chunk_list, chunk_list_num_entries, chunk_hash_table, - chunk_msg_bufs, chunk_msg_bufs_len, io_info, type_info, + chunk_msg_bufs, chunk_msg_bufs_len, io_info, dset_info, mpi_rank, mpi_size) < 0) HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't update modified chunks") @@ -1790,20 +1866,21 @@ H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_in * Override the write buffer to point to the first * chunk's data buffer */ - io_info->u.wbuf = chunk_list[0].buf; + io_info->base_maddr.cvp = chunk_list[0].buf; /* * Setup the base storage address for this operation * to be the first chunk's file address */ - ctg_store.contig.dset_addr = chunk_list[0].chunk_new.offset; + io_info->store_faddr = chunk_list[0].chunk_new.offset; + } + else { + io_info->base_maddr.cvp = &fake_buf; + io_info->store_faddr = 0; } - else - ctg_store.contig.dset_addr = 0; /* Perform I/O */ - io_info->store = &ctg_store; - if (H5D__final_collective_io(io_info, type_info, mpi_buf_count, file_type, mem_type) < 0) + if (H5D__final_collective_io(io_info, mpi_buf_count, file_type, mem_type) < 0) HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish MPI-IO") /* Free up resources in anticipation of following collective operation */ @@ -1818,8 +1895,8 @@ H5D__link_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_in * into the chunk index */ if (H5D__mpio_collective_filtered_chunk_reinsert(chunk_list, chunk_list_num_entries, - rank_chunks_assigned_map, io_info, &index_info, - mpi_rank, mpi_size) < 0) + rank_chunks_assigned_map, io_info, dset_info, + &index_info, mpi_rank, mpi_size) < 0) HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't collectively re-insert modified chunks into chunk index") } @@ -1857,7 +1934,7 @@ done: H5D_MPIO_TRACE_EXIT(mpi_rank); #endif - FUNC_LEAVE_NOAPI(ret_value) + FUNC_LEAVE_NOAPI_TAG(ret_value) } /* end H5D__link_chunk_filtered_collective_io() */ /*------------------------------------------------------------------------- @@ -1878,33 +1955,39 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, H5D_chunk_map_t *fm, - int mpi_rank, int mpi_size) +H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, H5D_dset_io_info_t *dset_info, int mpi_rank, + int mpi_size) { - H5D_io_info_t ctg_io_info; /* Contiguous I/O info object */ - H5D_storage_t ctg_store; /* Chunk storage information as contiguous dataset */ - H5D_io_info_t cpt_io_info; /* Compact I/O info object */ - H5D_storage_t cpt_store; /* Chunk storage information as compact dataset */ - hbool_t cpt_dirty; /* Temporary placeholder for compact storage "dirty" flag */ uint8_t *chunk_io_option = NULL; haddr_t *chunk_addr = NULL; H5D_storage_t store; /* union of EFL and chunk pointer in file space */ H5FD_mpio_collective_opt_t last_coll_opt_mode = H5FD_MPIO_COLLECTIVE_IO; /* Last parallel transfer with independent IO or collective IO with this mode */ + H5FD_mpio_collective_opt_t orig_coll_opt_mode = + H5FD_MPIO_COLLECTIVE_IO; /* Original parallel transfer property on entering this function */ size_t total_chunk; /* Total # of chunks in dataset */ - size_t u; /* Local index variable */ + size_t num_chunk; /* Number of chunks for this process */ + H5SL_node_t *piece_node = NULL; /* Current node in chunk skip list */ + H5D_piece_info_t *next_chunk_info = NULL; /* Chunk info for next selected chunk */ + size_t u; /* 
Local index variable */ H5D_mpio_actual_io_mode_t actual_io_mode = H5D_MPIO_NO_COLLECTIVE; /* Local variable for tracking the I/O mode used. */ herr_t ret_value = SUCCEED; - FUNC_ENTER_PACKAGE + FUNC_ENTER_PACKAGE_TAG(dset_info->dset->oloc.addr) + + HDassert(dset_info->layout->type == H5D_CHUNKED); + + /* Get the current I/O collective opt mode so we can restore it later */ + if (H5CX_get_mpio_coll_opt(&orig_coll_opt_mode) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property") /* Set the actual chunk opt mode property */ H5CX_set_mpio_actual_chunk_opt(H5D_MPIO_MULTI_CHUNK); /* Retrieve total # of chunks in dataset */ - H5_CHECKED_ASSIGN(total_chunk, size_t, fm->layout->u.chunk.nchunks, hsize_t); + H5_CHECKED_ASSIGN(total_chunk, size_t, dset_info->layout->u.chunk.nchunks, hsize_t); HDassert(total_chunk != 0); /* Allocate memories */ @@ -1916,47 +1999,62 @@ H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *ty #endif /* Obtain IO option for each chunk */ - if (H5D__obtain_mpio_mode(io_info, fm, chunk_io_option, chunk_addr, mpi_rank, mpi_size) < 0) + if (H5D__obtain_mpio_mode(io_info, dset_info, chunk_io_option, chunk_addr, mpi_rank, mpi_size) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTRECV, FAIL, "unable to obtain MPIO mode") - /* Set up contiguous I/O info object */ - H5MM_memcpy(&ctg_io_info, io_info, sizeof(ctg_io_info)); - ctg_io_info.store = &ctg_store; - ctg_io_info.layout_ops = *H5D_LOPS_CONTIG; - - /* Initialize temporary contiguous storage info */ - ctg_store.contig.dset_size = (hsize_t)io_info->dset->shared->layout.u.chunk.size; - - /* Set up compact I/O info object */ - H5MM_memcpy(&cpt_io_info, io_info, sizeof(cpt_io_info)); - cpt_io_info.store = &cpt_store; - cpt_io_info.layout_ops = *H5D_LOPS_COMPACT; - - /* Initialize temporary compact storage info */ - cpt_store.compact.dirty = &cpt_dirty; + /* Set memory buffers */ + io_info->base_maddr = dset_info->buf; /* Set dataset storage for I/O info */ - io_info->store = &store; + dset_info->store = &store; + + /* Get the number of chunks with a selection */ + num_chunk = H5SL_count(dset_info->layout_io_info.chunk_map->dset_sel_pieces); + + if (num_chunk) { + /* Start at the beginning of the chunk map skiplist. 
Since these chunks are + * stored in index order and since we're iterating in index order we can + * just check for each chunk being selected in order */ + if (NULL == (piece_node = H5SL_first(dset_info->layout_io_info.chunk_map->dset_sel_pieces))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't get piece node from skip list") + if (NULL == (next_chunk_info = (H5D_piece_info_t *)H5SL_item(piece_node))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't get piece info from skip list") + } /* Loop over _all_ the chunks */ for (u = 0; u < total_chunk; u++) { - H5D_chunk_info_t *chunk_info; /* Chunk info for current chunk */ + H5D_piece_info_t *chunk_info; /* Chunk info for current chunk */ H5S_t *fspace; /* Dataspace describing chunk & selection in it */ H5S_t *mspace; /* Dataspace describing selection in memory corresponding to this chunk */ #ifdef H5Dmpio_DEBUG H5D_MPIO_DEBUG_VA(mpi_rank, "mpi_rank = %d, chunk index = %zu", mpi_rank, u); #endif - /* Get the chunk info for this chunk, if there are elements selected */ - chunk_info = fm->select_chunk[u]; - /* Set the storage information for chunks with selections */ - if (chunk_info) { - HDassert(chunk_info->index == u); + /* Check if this chunk is the next chunk in the skip list, if there are + * selected chunks left to process */ + HDassert(!num_chunk || next_chunk_info); + HDassert(!num_chunk || next_chunk_info->index >= u); + if (num_chunk && next_chunk_info->index == u) { + /* Next chunk is this chunk */ + chunk_info = next_chunk_info; + + /* One less chunk to process */ + num_chunk--; + + /* Advance next chunk to next node in skip list, if there are more chunks selected */ + if (num_chunk) { + if (NULL == (piece_node = H5SL_next(piece_node))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "chunk skip list terminated early") + if (NULL == (next_chunk_info = (H5D_piece_info_t *)H5SL_item(piece_node))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't get piece info from skip list") + } /* Pass in chunk's coordinates in a union. 
*/ store.chunk.scaled = chunk_info->scaled; - } /* end if */ + } + else + chunk_info = NULL; /* Collective IO for this chunk, * Note: even there is no selection for this process, the process still @@ -1994,10 +2092,10 @@ H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *ty } /* end if */ /* Initialize temporary contiguous storage address */ - ctg_store.contig.dset_addr = chunk_addr[u]; + io_info->store_faddr = chunk_addr[u]; /* Perform the I/O */ - if (H5D__inter_collective_io(&ctg_io_info, type_info, fspace, mspace) < 0) + if (H5D__inter_collective_io(io_info, dset_info, fspace, mspace) < 0) HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish shared collective MPI-IO") } /* end if */ else { /* possible independent IO for this chunk */ @@ -2028,10 +2126,10 @@ H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *ty } /* end if */ /* Initialize temporary contiguous storage address */ - ctg_store.contig.dset_addr = chunk_addr[u]; + io_info->store_faddr = chunk_addr[u]; /* Perform the I/O */ - if (H5D__inter_collective_io(&ctg_io_info, type_info, fspace, mspace) < 0) + if (H5D__inter_collective_io(io_info, dset_info, fspace, mspace) < 0) HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish shared collective MPI-IO") #ifdef H5Dmpio_DEBUG H5D_MPIO_DEBUG(mpi_rank, "after inter collective IO"); @@ -2043,12 +2141,17 @@ H5D__multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *ty H5CX_set_mpio_actual_io_mode(actual_io_mode); done: + /* Reset collective opt mode */ + if (H5CX_set_mpio_coll_opt(orig_coll_opt_mode) < 0) + HDONE_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't reset MPI-I/O collective_op property") + + /* Free memory */ if (chunk_io_option) H5MM_xfree(chunk_io_option); if (chunk_addr) H5MM_xfree(chunk_addr); - FUNC_LEAVE_NOAPI(ret_value) + FUNC_LEAVE_NOAPI_TAG(ret_value) } /* end H5D__multi_chunk_collective_io */ /*------------------------------------------------------------------------- @@ -2130,14 +2233,13 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, - H5D_chunk_map_t *fm, int mpi_rank, int mpi_size) +H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, H5D_dset_io_info_t *dset_info, int mpi_rank, + int mpi_size) { H5D_filtered_collective_io_info_t *chunk_list = NULL; /* The list of chunks being read/written */ H5D_filtered_collective_io_info_t *chunk_hash_table = NULL; unsigned char **chunk_msg_bufs = NULL; H5D_io_info_t ctg_io_info; /* Contiguous I/O info object */ - H5D_storage_t ctg_store; /* Chunk storage information as contiguous dataset */ MPI_Datatype mem_type = MPI_BYTE; MPI_Datatype file_type = MPI_BYTE; hbool_t mem_type_is_derived = FALSE; @@ -2150,11 +2252,9 @@ H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_i int mpi_code; herr_t ret_value = SUCCEED; - FUNC_ENTER_PACKAGE + FUNC_ENTER_PACKAGE_TAG(dset_info->dset->oloc.addr) HDassert(io_info); - HDassert(type_info); - HDassert(fm); #ifdef H5Dmpio_DEBUG H5D_MPIO_TRACE_ENTER(mpi_rank); @@ -2172,8 +2272,8 @@ H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_i H5CX_set_mpio_actual_io_mode(H5D_MPIO_CHUNK_COLLECTIVE); /* Build a list of selected chunks in the collective IO operation */ - if (H5D__mpio_collective_filtered_chunk_io_setup(io_info, type_info, fm, &chunk_list, - &chunk_list_num_entries, mpi_rank) < 0) + if 
(H5D__mpio_collective_filtered_chunk_io_setup(io_info, dset_info, &chunk_list, &chunk_list_num_entries, + mpi_rank) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "couldn't construct filtered I/O info list") /* Retrieve the maximum number of chunks selected for any rank */ @@ -2187,11 +2287,6 @@ H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_i /* Set up contiguous I/O info object */ H5MM_memcpy(&ctg_io_info, io_info, sizeof(ctg_io_info)); - ctg_io_info.store = &ctg_store; - ctg_io_info.layout_ops = *H5D_LOPS_CONTIG; - - /* Initialize temporary contiguous storage info */ - ctg_store.contig.dset_size = (hsize_t)io_info->dset->shared->layout.u.chunk.size; if (io_info->op_type == H5D_IO_OP_READ) { /* Filtered collective read */ for (i = 0; i < max_num_chunks; i++) { @@ -2199,7 +2294,7 @@ H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_i have_chunk_to_process = (i < chunk_list_num_entries); if (H5D__mpio_collective_filtered_chunk_read(have_chunk_to_process ? &chunk_list[i] : NULL, - have_chunk_to_process ? 1 : 0, io_info, type_info, + have_chunk_to_process ? 1 : 0, io_info, dset_info, mpi_rank, mpi_size) < 0) HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "couldn't read filtered chunks") @@ -2214,17 +2309,17 @@ H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_i hsize_t mpi_buf_count; /* Construct chunked index info */ - H5D_MPIO_INIT_CHUNK_IDX_INFO(index_info, io_info); + H5D_MPIO_INIT_CHUNK_IDX_INFO(index_info, dset_info->dset); if (mpi_size > 1) { /* Redistribute shared chunks being written to */ - if (H5D__mpio_redistribute_shared_chunks(chunk_list, chunk_list_num_entries, io_info, fm, - mpi_rank, mpi_size, NULL) < 0) + if (H5D__mpio_redistribute_shared_chunks(chunk_list, chunk_list_num_entries, io_info, mpi_rank, + mpi_size, NULL) < 0) HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "unable to redistribute shared chunks") /* Send any chunk modification messages for chunks this rank no longer owns */ if (H5D__mpio_share_chunk_modification_data(chunk_list, &chunk_list_num_entries, io_info, - type_info, mpi_rank, mpi_size, &chunk_hash_table, + dset_info, mpi_rank, mpi_size, &chunk_hash_table, &chunk_msg_bufs, &chunk_msg_bufs_len) < 0) HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "unable to send chunk modification data between MPI ranks") @@ -2246,7 +2341,7 @@ H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_i if (H5D__mpio_collective_filtered_chunk_update(have_chunk_to_process ? &chunk_list[i] : NULL, have_chunk_to_process ? 
1 : 0, chunk_hash_table, chunk_msg_bufs, chunk_msg_bufs_len, io_info, - type_info, mpi_rank, mpi_size) < 0) + dset_info, mpi_rank, mpi_size) < 0) HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't update modified chunks") /* All ranks now collectively re-allocate file space for all chunks */ @@ -2276,19 +2371,21 @@ H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_i * Override the write buffer to point to the * chunk's data buffer */ - ctg_io_info.u.wbuf = chunk_list[i].buf; + ctg_io_info.base_maddr.cvp = chunk_list[i].buf; /* * Setup the base storage address for this * operation to be the chunk's file address */ - ctg_store.contig.dset_addr = chunk_list[i].chunk_new.offset; + ctg_io_info.store_faddr = chunk_list[i].chunk_new.offset; + } + else { + ctg_io_info.store_faddr = 0; + ctg_io_info.base_maddr = dset_info->buf; } - else - ctg_store.contig.dset_addr = 0; /* Perform the I/O */ - if (H5D__final_collective_io(&ctg_io_info, type_info, mpi_buf_count, file_type, mem_type) < 0) + if (H5D__final_collective_io(&ctg_io_info, mpi_buf_count, file_type, mem_type) < 0) HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish MPI-IO") /* Free up resources in anticipation of following collective operation */ @@ -2302,7 +2399,7 @@ H5D__multi_chunk_filtered_collective_io(H5D_io_info_t *io_info, const H5D_type_i */ if (H5D__mpio_collective_filtered_chunk_reinsert(have_chunk_to_process ? &chunk_list[i] : NULL, have_chunk_to_process ? 1 : 0, NULL, io_info, - &index_info, mpi_rank, mpi_size) < 0) + dset_info, &index_info, mpi_rank, mpi_size) < 0) HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't collectively re-insert modified chunks into chunk index") @@ -2346,7 +2443,7 @@ done: H5D_MPIO_TRACE_EXIT(mpi_rank); #endif - FUNC_LEAVE_NOAPI(ret_value) + FUNC_LEAVE_NOAPI_TAG(ret_value) } /* end H5D__multi_chunk_filtered_collective_io() */ /*------------------------------------------------------------------------- @@ -2363,7 +2460,7 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, H5S_t *file_space, +H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_dset_io_info_t *di, H5S_t *file_space, H5S_t *mem_space) { int mpi_buf_count; /* # of MPI types */ @@ -2379,13 +2476,15 @@ H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf FUNC_ENTER_PACKAGE #ifdef H5Dmpio_DEBUG - mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file); + mpi_rank = H5F_mpi_get_rank(di->dset->oloc.file); H5D_MPIO_TRACE_ENTER(mpi_rank); H5D_MPIO_TIME_START(mpi_rank, "Inter collective I/O"); if (mpi_rank < 0) HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain MPI rank") #endif + HDassert(io_info); + if ((file_space != NULL) && (mem_space != NULL)) { int mpi_file_count; /* Number of file "objects" to transfer */ hsize_t *permute_map = NULL; /* array that holds the mapping from the old, @@ -2394,12 +2493,14 @@ H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf point selection of the file space */ hbool_t is_permuted = FALSE; + HDassert(di); + /* Obtain disk and memory MPI derived datatype */ /* NOTE: The permute_map array can be allocated within H5S_mpio_space_type * and will be fed into the next call to H5S_mpio_space_type * where it will be freed. 
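/*
 * The file and memory datatypes assembled below are eventually handed to
 * H5D__final_collective_io(), which dispatches through the multi-dataset
 * single_read_md/single_write_md callbacks rather than calling MPI-IO
 * directly. Conceptually, though, the transfer they describe boils down to
 * the following sketch (illustrative only; fh, base_addr, and the datatype
 * arguments are hypothetical stand-ins for state the VFD layer manages):
 */
#include <mpi.h>

static int
collective_write_sketch(MPI_File fh, MPI_Offset base_addr, MPI_Datatype file_type,
                        MPI_Datatype mem_type, const void *buf, int count)
{
    MPI_Status status;

    /* The file view anchors the (possibly derived) file datatype at the
     * base address computed from the lowest piece/chunk offset */
    if (MPI_SUCCESS != MPI_File_set_view(fh, base_addr, MPI_BYTE, file_type, "native", MPI_INFO_NULL))
        return -1;

    /* Collective transfer: every rank in the file's communicator must make
     * this call, even one that contributes zero elements */
    if (MPI_SUCCESS != MPI_File_write_all(fh, buf, count, mem_type, &status))
        return -1;

    return 0;
}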
*/ - if (H5S_mpio_space_type(file_space, type_info->src_type_size, &mpi_file_type, &mpi_file_count, + if (H5S_mpio_space_type(file_space, di->type_info.src_type_size, &mpi_file_type, &mpi_file_count, &mft_is_derived, /* OUT: datatype created */ TRUE, /* this is a file space, so permute the datatype if the @@ -2415,7 +2516,7 @@ H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf /* Sanity check */ if (is_permuted) HDassert(permute_map); - if (H5S_mpio_space_type(mem_space, type_info->src_type_size, &mpi_buf_type, &mpi_buf_count, + if (H5S_mpio_space_type(mem_space, di->type_info.src_type_size, &mpi_buf_type, &mpi_buf_count, &mbt_is_derived, /* OUT: datatype created */ FALSE, /* this is a memory space, so if the file space is not @@ -2449,7 +2550,7 @@ H5D__inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf #endif /* Perform final collective I/O operation */ - if (H5D__final_collective_io(io_info, type_info, (hsize_t)mpi_buf_count, mpi_file_type, mpi_buf_type) < 0) + if (H5D__final_collective_io(io_info, (hsize_t)mpi_buf_count, mpi_file_type, mpi_buf_type) < 0) HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish collective MPI-IO") done: @@ -2481,8 +2582,8 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5D__final_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, hsize_t mpi_buf_count, - MPI_Datatype mpi_file_type, MPI_Datatype mpi_buf_type) +H5D__final_collective_io(H5D_io_info_t *io_info, hsize_t mpi_buf_count, MPI_Datatype mpi_file_type, + MPI_Datatype mpi_buf_type) { #ifdef H5Dmpio_DEBUG int mpi_rank; @@ -2492,7 +2593,7 @@ H5D__final_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf FUNC_ENTER_PACKAGE #ifdef H5Dmpio_DEBUG - mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file); + mpi_rank = H5F_mpi_get_rank(io_info->dsets_info[0].dset->oloc.file); H5D_MPIO_TRACE_ENTER(mpi_rank); H5D_MPIO_TIME_START(mpi_rank, "Final collective I/O"); if (mpi_rank < 0) @@ -2504,11 +2605,11 @@ H5D__final_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set MPI-I/O collective I/O datatypes") if (io_info->op_type == H5D_IO_OP_WRITE) { - if ((io_info->io_ops.single_write)(io_info, type_info, mpi_buf_count, NULL, NULL) < 0) + if ((io_info->md_io_ops.single_write_md)(io_info, mpi_buf_count, NULL, NULL) < 0) HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed") } /* end if */ else { - if ((io_info->io_ops.single_read)(io_info, type_info, mpi_buf_count, NULL, NULL) < 0) + if ((io_info->md_io_ops.single_read_md)(io_info, mpi_buf_count, NULL, NULL) < 0) HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed") } /* end else */ @@ -2523,28 +2624,26 @@ done: } /* end H5D__final_collective_io */ /*------------------------------------------------------------------------- - * Function: H5D__cmp_chunk_addr + * Function: H5D__cmp_piece_addr * - * Purpose: Routine to compare chunk addresses + * Purpose: Routine to compare piece addresses * - * Description: Callback for qsort() to compare chunk addresses + * Description: Callback for qsort() to compare piece addresses * * Return: -1, 0, 1 * - * Programmer: Muqun Yang - * Monday, Feb. 
13th, 2006 - * *------------------------------------------------------------------------- */ static int -H5D__cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2) +H5D__cmp_piece_addr(const void *piece_info1, const void *piece_info2) { - haddr_t addr1 = HADDR_UNDEF, addr2 = HADDR_UNDEF; + haddr_t addr1; + haddr_t addr2; FUNC_ENTER_PACKAGE_NOERR - addr1 = ((const H5D_chunk_addr_info_t *)chunk_addr_info1)->chunk_addr; - addr2 = ((const H5D_chunk_addr_info_t *)chunk_addr_info2)->chunk_addr; + addr1 = (*((const H5D_piece_info_t *const *)piece_info1))->faddr; + addr2 = (*((const H5D_piece_info_t *const *)piece_info2))->faddr; FUNC_LEAVE_NOAPI(H5F_addr_cmp(addr1, addr2)) } /* end H5D__cmp_chunk_addr() */ @@ -2705,178 +2804,6 @@ H5D__cmp_chunk_redistribute_info_orig_owner(const void *_entry1, const void *_en } /* end H5D__cmp_chunk_redistribute_info_orig_owner() */ /*------------------------------------------------------------------------- - * Function: H5D__sort_chunk - * - * Purpose: Routine to sort chunks in increasing order of chunk address - * Each chunk address is also obtained. - * - * Description: - * For most cases, the chunk address has already been sorted in increasing order. - * The special sorting flag is used to optimize this common case. - * quick sort is used for necessary sorting. - * - * Parameters: - * Input: H5D_io_info_t* io_info, - * H5D_chunk_map_t *fm(global chunk map struct) - * Input/Output: H5D_chunk_addr_info_t chunk_addr_info_array[] : array to store chunk address - *and information many_chunk_opt : flag to optimize the way to obtain chunk addresses - * for many chunks - * - * Return: Non-negative on success/Negative on failure - * - * Programmer: Muqun Yang - * Monday, Feb. 13th, 2006 - * - *------------------------------------------------------------------------- - */ -static herr_t -H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, - H5D_chunk_addr_info_t chunk_addr_info_array[], int sum_chunk, int mpi_rank, int mpi_size) -{ - H5SL_node_t *chunk_node; /* Current node in chunk skip list */ - H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */ - haddr_t chunk_addr; /* Current chunking address of this node */ - haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */ - H5P_coll_md_read_flag_t md_reads_file_flag; - hbool_t md_reads_context_flag; - hbool_t restore_md_reads_state = FALSE; - hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */ - int bsearch_coll_chunk_threshold; - int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND; - int mpi_code; /* MPI return code */ - int i; /* Local index variable */ - herr_t ret_value = SUCCEED; /* Return value */ - - FUNC_ENTER_PACKAGE - - /* Calculate the actual threshold to obtain all chunk addresses collectively - * The bigger this number is, the more possible the use of obtaining chunk - * address collectively. - */ - /* For non-optimization one-link IO, actual bsearch threshold is always - * 0, we would always want to obtain the chunk addresses individually - * for each process. 
- */ - bsearch_coll_chunk_threshold = (sum_chunk * 100) / ((int)fm->layout->u.chunk.nchunks * mpi_size); - if ((bsearch_coll_chunk_threshold > H5D_ALL_CHUNK_ADDR_THRES_COL) && - ((sum_chunk / mpi_size) >= H5D_ALL_CHUNK_ADDR_THRES_COL_NUM)) - many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_COL; - -#ifdef H5Dmpio_DEBUG - H5D_MPIO_DEBUG_VA(mpi_rank, "many_chunk_opt = %d", many_chunk_opt); -#endif - - /* If we need to optimize the way to obtain the chunk address */ - if (many_chunk_opt != H5D_OBTAIN_ONE_CHUNK_ADDR_IND) { -#ifdef H5Dmpio_DEBUG - H5D_MPIO_DEBUG(mpi_rank, "Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL"); -#endif - /* Allocate array for chunk addresses */ - if (NULL == (total_chunk_addr_array = - (haddr_t *)H5MM_malloc(sizeof(haddr_t) * (size_t)fm->layout->u.chunk.nchunks))) - HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate memory chunk address array") - - if (mpi_rank == 0) { - herr_t result; - - /* - * If enabled, disable collective metadata reads here. - * Since the chunk address mapping is done on rank 0 - * only here, it will cause problems if collective - * metadata reads are enabled. - */ - if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) { - md_reads_file_flag = H5P_FORCE_FALSE; - md_reads_context_flag = FALSE; - H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, - &md_reads_context_flag); - restore_md_reads_state = TRUE; - } - - result = H5D__chunk_addrmap(io_info, total_chunk_addr_array); - - /* Ensure that we restore the old collective metadata reads state */ - if (restore_md_reads_state) { - H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, - &md_reads_context_flag); - restore_md_reads_state = FALSE; - } - - if (result < 0) { - size_t u; - - /* Clear total chunk address array */ - for (u = 0; u < (size_t)fm->layout->u.chunk.nchunks; u++) - total_chunk_addr_array[u] = HADDR_UNDEF; - - /* Push error, but still participate in following MPI_Bcast */ - HDONE_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address") - } - } /* end if */ - - /* Broadcasting the MPI_IO option info. and chunk address info. 
*/ - if (MPI_SUCCESS != (mpi_code = MPI_Bcast(total_chunk_addr_array, - (int)(sizeof(haddr_t) * fm->layout->u.chunk.nchunks), - MPI_BYTE, (int)0, io_info->comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code) - } /* end if */ - - /* Start at first node in chunk skip list */ - i = 0; - if (NULL == (chunk_node = H5SL_first(fm->sel_chunks))) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk node from skipped list") - - /* Iterate over all chunks for this process */ - while (chunk_node) { - if (NULL == (chunk_info = (H5D_chunk_info_t *)H5SL_item(chunk_node))) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk info from skipped list") - - if (many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND) { - H5D_chunk_ud_t udata; /* User data for querying chunk info */ - - /* Get address of chunk */ - if (H5D__chunk_lookup(io_info->dset, chunk_info->scaled, &udata) < 0) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk info from skipped list") - chunk_addr = udata.chunk_block.offset; - } /* end if */ - else - chunk_addr = total_chunk_addr_array[chunk_info->index]; - - /* Check if chunk addresses are not in increasing order in the file */ - if (i > 0 && chunk_addr < chunk_addr_info_array[i - 1].chunk_addr) - do_sort = TRUE; - - /* Set the address & info for this chunk */ - chunk_addr_info_array[i].chunk_addr = chunk_addr; - chunk_addr_info_array[i].chunk_info = *chunk_info; - - /* Advance to next chunk in list */ - i++; - chunk_node = H5SL_next(chunk_node); - } /* end while */ - -#ifdef H5Dmpio_DEBUG - H5D_MPIO_DEBUG(mpi_rank, "before Qsort"); -#endif - - if (do_sort) { - size_t num_chunks = H5SL_count(fm->sel_chunks); - - HDqsort(chunk_addr_info_array, num_chunks, sizeof(chunk_addr_info_array[0]), H5D__cmp_chunk_addr); - } /* end if */ - -done: - /* Re-enable collective metadata reads if we disabled them */ - if (restore_md_reads_state) - H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag); - - if (total_chunk_addr_array) - H5MM_xfree(total_chunk_addr_array); - - FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D__sort_chunk() */ - -/*------------------------------------------------------------------------- * Function: H5D__obtain_mpio_mode * * Purpose: Routine to obtain each io mode(collective,independent or none) for each chunk; @@ -2902,7 +2829,7 @@ done: * Parameters: * * Input: H5D_io_info_t* io_info, - * H5D_chunk_map_t *fm,(global chunk map struct) + * H5D_dset_io_info_t *di,(dataset info struct) * Output: uint8_t assign_io_mode[], : IO mode, collective, independent or none * haddr_t chunk_addr[], : chunk address array for each chunk * @@ -2914,7 +2841,7 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assign_io_mode[], +H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_dset_io_info_t *di, uint8_t assign_io_mode[], haddr_t chunk_addr[], int mpi_rank, int mpi_size) { size_t total_chunks; @@ -2924,7 +2851,7 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig uint8_t *mergebuf = NULL; uint8_t *tempbuf; H5SL_node_t *chunk_node; - H5D_chunk_info_t *chunk_info; + H5D_piece_info_t *chunk_info; H5P_coll_md_read_flag_t md_reads_file_flag; hbool_t md_reads_context_flag; hbool_t restore_md_reads_state = FALSE; @@ -2936,17 +2863,19 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig FUNC_ENTER_PACKAGE + HDassert(di->layout->type == 
H5D_CHUNKED); + /* Assign the rank 0 to the root */ root = 0; comm = io_info->comm; /* Setup parameters */ - H5_CHECKED_ASSIGN(total_chunks, size_t, fm->layout->u.chunk.nchunks, hsize_t); + H5_CHECKED_ASSIGN(total_chunks, size_t, di->layout->u.chunk.nchunks, hsize_t); if (H5CX_get_mpio_chunk_opt_ratio(&percent_nproc_per_chunk) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't get percent nproc per chunk") /* if ratio is 0, perform collective io */ if (0 == percent_nproc_per_chunk) { - if (H5D__chunk_addrmap(io_info, chunk_addr) < 0) + if (H5D__chunk_addrmap(di->dset, chunk_addr) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); for (ic = 0; ic < total_chunks; ic++) assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL; @@ -2967,9 +2896,9 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate recv I/O mode info buffer") /* Obtain the regularity and selection information for all chunks in this process. */ - chunk_node = H5SL_first(fm->sel_chunks); + chunk_node = H5SL_first(di->layout_io_info.chunk_map->dset_sel_pieces); while (chunk_node) { - chunk_info = (H5D_chunk_info_t *)H5SL_item(chunk_node); + chunk_info = (H5D_piece_info_t *)H5SL_item(chunk_node); io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_REG; /* this chunk is selected and is "regular" */ chunk_node = H5SL_next(chunk_node); @@ -2992,11 +2921,10 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig * only here, it will cause problems if collective * metadata reads are enabled. */ - if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) { + if (H5F_get_coll_metadata_reads(di->dset->oloc.file)) { md_reads_file_flag = H5P_FORCE_FALSE; md_reads_context_flag = FALSE; - H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, - &md_reads_context_flag); + H5F_set_coll_metadata_reads(di->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag); restore_md_reads_state = TRUE; } @@ -3006,7 +2934,7 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate nproc_per_chunk buffer") /* calculating the chunk address */ - if (H5D__chunk_addrmap(io_info, chunk_addr) < 0) { + if (H5D__chunk_addrmap(di->dset, chunk_addr) < 0) { H5MM_free(nproc_per_chunk); HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address") } /* end if */ @@ -3068,7 +2996,7 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig done: /* Re-enable collective metadata reads if we disabled them */ if (restore_md_reads_state) - H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag); + H5F_set_coll_metadata_reads(di->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag); if (io_mode_info) H5MM_free(io_mode_info); @@ -3098,8 +3026,7 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5D__mpio_collective_filtered_chunk_io_setup(const H5D_io_info_t *io_info, const H5D_type_info_t *type_info, - const H5D_chunk_map_t *fm, +H5D__mpio_collective_filtered_chunk_io_setup(const H5D_io_info_t *io_info, const H5D_dset_io_info_t *di, H5D_filtered_collective_io_info_t **chunk_list, size_t *num_entries, int mpi_rank) { @@ -3113,36 +3040,36 @@ H5D__mpio_collective_filtered_chunk_io_setup(const H5D_io_info_t *io_info, const FUNC_ENTER_PACKAGE HDassert(io_info); - HDassert(type_info); - 
HDassert(fm); + HDassert(di); HDassert(chunk_list); HDassert(num_entries); - #ifdef H5Dmpio_DEBUG H5D_MPIO_TRACE_ENTER(mpi_rank); H5D_MPIO_TIME_START(mpi_rank, "Filtered Collective I/O Setup"); #endif + HDassert(di->layout->type == H5D_CHUNKED); + /* Each rank builds a local list of the chunks they have selected */ - if ((num_chunks_selected = H5SL_count(fm->sel_chunks))) { - H5D_chunk_info_t *chunk_info; + if ((num_chunks_selected = H5SL_count(di->layout_io_info.chunk_map->dset_sel_pieces))) { + H5D_piece_info_t *chunk_info; H5SL_node_t *chunk_node; hsize_t select_npoints; hbool_t need_sort = FALSE; /* Determine whether partial edge chunks should be filtered */ - filter_partial_edge_chunks = !(io_info->dset->shared->layout.u.chunk.flags & - H5O_LAYOUT_CHUNK_DONT_FILTER_PARTIAL_BOUND_CHUNKS); + filter_partial_edge_chunks = + !(di->dset->shared->layout.u.chunk.flags & H5O_LAYOUT_CHUNK_DONT_FILTER_PARTIAL_BOUND_CHUNKS); if (NULL == (local_info_array = H5MM_malloc(num_chunks_selected * sizeof(*local_info_array)))) HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate local io info array buffer") - chunk_node = H5SL_first(fm->sel_chunks); + chunk_node = H5SL_first(di->layout_io_info.chunk_map->dset_sel_pieces); for (i = 0; chunk_node; i++) { - chunk_info = (H5D_chunk_info_t *)H5SL_item(chunk_node); + chunk_info = (H5D_piece_info_t *)H5SL_item(chunk_node); /* Obtain this chunk's address */ - if (H5D__chunk_lookup(io_info->dset, chunk_info->scaled, &udata) < 0) + if (H5D__chunk_lookup(di->dset, chunk_info->scaled, &udata) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "error looking up chunk address") /* Initialize rank-local chunk info */ @@ -3154,7 +3081,7 @@ H5D__mpio_collective_filtered_chunk_io_setup(const H5D_io_info_t *io_info, const local_info_array[i].buf = NULL; select_npoints = H5S_GET_SELECT_NPOINTS(chunk_info->fspace); - local_info_array[i].io_size = (size_t)select_npoints * type_info->dst_type_size; + local_info_array[i].io_size = (size_t)select_npoints * di->type_info.dst_type_size; /* * Determine whether this chunk will need to be read from the file. If this is @@ -3204,7 +3131,7 @@ H5D__mpio_collective_filtered_chunk_io_setup(const H5D_io_info_t *io_info, const local_info_array[i].need_read = TRUE; else { local_info_array[i].need_read = - local_info_array[i].io_size < (size_t)io_info->dset->shared->layout.u.chunk.size; + local_info_array[i].io_size < (size_t)di->dset->shared->layout.u.chunk.size; } local_info_array[i].skip_filter_pline = FALSE; @@ -3213,9 +3140,9 @@ H5D__mpio_collective_filtered_chunk_io_setup(const H5D_io_info_t *io_info, const * If this is a partial edge chunk and the "don't filter partial edge * chunks" flag is set, make sure not to apply filters to the chunk. */ - if (H5D__chunk_is_partial_edge_chunk(io_info->dset->shared->ndims, - io_info->dset->shared->layout.u.chunk.dim, - chunk_info->scaled, io_info->dset->shared->curr_dims)) + if (H5D__chunk_is_partial_edge_chunk(di->dset->shared->ndims, + di->dset->shared->layout.u.chunk.dim, chunk_info->scaled, + di->dset->shared->curr_dims)) local_info_array[i].skip_filter_pline = TRUE; } @@ -3244,7 +3171,7 @@ H5D__mpio_collective_filtered_chunk_io_setup(const H5D_io_info_t *io_info, const * extensible array code calculated instead of what was calculated * in the chunk file mapping. 
*/ - if (io_info->dset->shared->layout.u.chunk.idx_type == H5D_CHUNK_IDX_EARRAY) + if (di->dset->shared->layout.u.chunk.idx_type == H5D_CHUNK_IDX_EARRAY) local_info_array[i].index_info.chunk_idx = udata.chunk_idx; else local_info_array[i].index_info.chunk_idx = chunk_info->index; @@ -3264,7 +3191,7 @@ H5D__mpio_collective_filtered_chunk_io_setup(const H5D_io_info_t *io_info, const H5D__mpio_dump_collective_filtered_chunk_list(local_info_array, num_chunks_selected, mpi_rank); #endif } - else if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) { + else if (H5F_get_coll_metadata_reads(di->dset->oloc.file)) { hsize_t scaled[H5O_LAYOUT_NDIMS] = {0}; /* @@ -3281,7 +3208,7 @@ H5D__mpio_collective_filtered_chunk_io_setup(const H5D_io_info_t *io_info, const * callback that can be used to ensure collectivity between ranks * in a more natural way, but this hack should suffice for now. */ - if (H5D__chunk_lookup(io_info->dset, scaled, &udata) < 0) + if (H5D__chunk_lookup(di->dset, scaled, &udata) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "error looking up chunk address") } @@ -3320,8 +3247,7 @@ done: static herr_t H5D__mpio_redistribute_shared_chunks(H5D_filtered_collective_io_info_t *chunk_list, size_t chunk_list_num_entries, const H5D_io_info_t *io_info, - const H5D_chunk_map_t *fm, int mpi_rank, int mpi_size, - size_t **rank_chunks_assigned_map) + int mpi_rank, int mpi_size, size_t **rank_chunks_assigned_map) { hbool_t redistribute_on_all_ranks; size_t *num_chunks_map = NULL; @@ -3334,7 +3260,6 @@ H5D__mpio_redistribute_shared_chunks(H5D_filtered_collective_io_info_t *chunk_li HDassert(chunk_list || 0 == chunk_list_num_entries); HDassert(io_info); - HDassert(fm); HDassert(mpi_size > 1); /* No chunk sharing is possible for MPI Comm size of 1 */ #ifdef H5Dmpio_DEBUG @@ -3368,7 +3293,7 @@ H5D__mpio_redistribute_shared_chunks(H5D_filtered_collective_io_info_t *chunk_li redistribute_on_all_ranks = coll_chunk_list_size < H5D_CHUNK_REDISTRIBUTE_THRES; if (H5D__mpio_redistribute_shared_chunks_int(chunk_list, num_chunks_map, redistribute_on_all_ranks, - io_info, fm, mpi_rank, mpi_size) < 0) + io_info, mpi_rank, mpi_size) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTREDISTRIBUTE, FAIL, "can't redistribute shared chunks") /* @@ -3458,9 +3383,7 @@ done: static herr_t H5D__mpio_redistribute_shared_chunks_int(H5D_filtered_collective_io_info_t *chunk_list, size_t *num_chunks_assigned_map, hbool_t all_ranks_involved, - const H5D_io_info_t *io_info, - const H5D_chunk_map_t H5_ATTR_NDEBUG_UNUSED *fm, int mpi_rank, - int mpi_size) + const H5D_io_info_t *io_info, int mpi_rank, int mpi_size) { MPI_Datatype struct_type; MPI_Datatype packed_type; @@ -3481,7 +3404,6 @@ H5D__mpio_redistribute_shared_chunks_int(H5D_filtered_collective_io_info_t *chun HDassert(num_chunks_assigned_map); HDassert(chunk_list || 0 == num_chunks_assigned_map[mpi_rank]); HDassert(io_info); - HDassert(fm); HDassert(mpi_size > 1); #ifdef H5Dmpio_DEBUG @@ -3789,7 +3711,7 @@ done: static herr_t H5D__mpio_share_chunk_modification_data(H5D_filtered_collective_io_info_t *chunk_list, size_t *chunk_list_num_entries, H5D_io_info_t *io_info, - const H5D_type_info_t *type_info, int mpi_rank, + H5D_dset_io_info_t *dset_info, int mpi_rank, int H5_ATTR_NDEBUG_UNUSED mpi_size, H5D_filtered_collective_io_info_t **chunk_hash_table, unsigned char ***chunk_msg_bufs, int *chunk_msg_bufs_len) @@ -3818,7 +3740,7 @@ H5D__mpio_share_chunk_modification_data(H5D_filtered_collective_io_info_t *chunk HDassert(chunk_list_num_entries); HDassert(chunk_list || 0 == 
*chunk_list_num_entries); HDassert(io_info); - HDassert(type_info); + HDassert(dset_info); HDassert(mpi_size > 1); HDassert(chunk_msg_bufs); HDassert(chunk_msg_bufs_len); @@ -3891,7 +3813,7 @@ H5D__mpio_share_chunk_modification_data(H5D_filtered_collective_io_info_t *chunk last_assigned_idx++; } else { - H5D_chunk_info_t *chunk_info = chunk_entry->chunk_info; + H5D_piece_info_t *chunk_info = chunk_entry->chunk_info; unsigned char *mod_data_p = NULL; hsize_t iter_nelmts; size_t mod_data_size = 0; @@ -3909,7 +3831,7 @@ H5D__mpio_share_chunk_modification_data(H5D_filtered_collective_io_info_t *chunk iter_nelmts = H5S_GET_SELECT_NPOINTS(chunk_info->mspace); H5_CHECK_OVERFLOW(iter_nelmts, hsize_t, size_t); - mod_data_size += (size_t)iter_nelmts * type_info->src_type_size; + mod_data_size += (size_t)iter_nelmts * dset_info->type_info.src_type_size; if (NULL == (msg_send_bufs[num_send_requests] = H5MM_malloc(mod_data_size))) HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, @@ -3926,14 +3848,14 @@ H5D__mpio_share_chunk_modification_data(H5D_filtered_collective_io_info_t *chunk HGOTO_ERROR(H5E_DATASET, H5E_CANTENCODE, FAIL, "unable to encode dataspace") /* Initialize iterator for memory selection */ - if (H5S_select_iter_init(mem_iter, chunk_info->mspace, type_info->src_type_size, + if (H5S_select_iter_init(mem_iter, chunk_info->mspace, dset_info->type_info.src_type_size, H5S_SEL_ITER_SHARE_WITH_DATASPACE) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to initialize memory selection information") mem_iter_init = TRUE; /* Collect the modification data into the buffer */ - if (0 == H5D__gather_mem(io_info->u.wbuf, mem_iter, (size_t)iter_nelmts, mod_data_p)) + if (0 == H5D__gather_mem(dset_info->buf.cvp, mem_iter, (size_t)iter_nelmts, mod_data_p)) HGOTO_ERROR(H5E_IO, H5E_CANTGATHER, FAIL, "couldn't gather from write buffer") /* @@ -4202,10 +4124,9 @@ done: static herr_t H5D__mpio_collective_filtered_chunk_common_io(H5D_filtered_collective_io_info_t *chunk_list, size_t chunk_list_num_entries, const H5D_io_info_t *io_info, - const H5D_type_info_t *type_info, int mpi_size) + int mpi_size) { H5D_io_info_t coll_io_info; - H5D_storage_t ctg_store; MPI_Datatype file_type = MPI_DATATYPE_NULL; MPI_Datatype mem_type = MPI_DATATYPE_NULL; hbool_t mem_type_is_derived = FALSE; @@ -4222,7 +4143,6 @@ H5D__mpio_collective_filtered_chunk_common_io(H5D_filtered_collective_io_info_t HDassert(chunk_list || 0 == chunk_list_num_entries); HDassert(io_info); - HDassert(type_info); /* Initialize temporary I/O info */ coll_io_info = *io_info; @@ -4258,9 +4178,7 @@ H5D__mpio_collective_filtered_chunk_common_io(H5D_filtered_collective_io_info_t /* * If this rank doesn't have a selection, it can - * skip I/O if independent I/O was requested at - * the low level, or if the MPI communicator size - * is 1. + * skip I/O if the MPI communicator size is 1. * * Otherwise, this rank has to participate in * collective I/O, but probably has a NULL buf @@ -4268,20 +4186,13 @@ H5D__mpio_collective_filtered_chunk_common_io(H5D_filtered_collective_io_info_t * write/read function expects one. */ if (num_chunks == 0) { - H5FD_mpio_collective_opt_t coll_opt_mode; - - /* Get the collective_opt property to check whether the application wants to do IO individually. 
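/*
 * The hunk here removes the independent-I/O escape hatch: only an MPI
 * communicator of size 1 may now skip the transfer. A rank with nothing
 * selected still enters the collective call, supplying a valid-but-unused
 * buffer address and a zero count. A minimal sketch of that contract, in
 * plain MPI-IO (illustrative only; it reuses the hypothetical helper style
 * of collective_write_sketch() above):
 */
#include <mpi.h>

static int
participate_with_empty_selection(MPI_File fh, MPI_Datatype file_type, MPI_Datatype mem_type)
{
    MPI_Status status;
    char       fake_buf; /* never dereferenced: the count below is zero */

    if (MPI_SUCCESS != MPI_File_set_view(fh, 0, MPI_BYTE, file_type, "native", MPI_INFO_NULL))
        return -1;

    /* A zero-count contribution satisfies the collective contract without
     * transferring any data */
    if (MPI_SUCCESS != MPI_File_write_all(fh, &fake_buf, 0, mem_type, &status))
        return -1;

    return 0;
}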
*/ - if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_opt property") - - if ((mpi_size == 1) || (H5FD_MPIO_INDIVIDUAL_IO == coll_opt_mode)) { + if (mpi_size == 1) HGOTO_DONE(SUCCEED) - } else { if (io_info->op_type == H5D_IO_OP_WRITE) - coll_io_info.u.wbuf = &fake_buf; + coll_io_info.base_maddr.cvp = &fake_buf; else - coll_io_info.u.rbuf = &fake_buf; + coll_io_info.base_maddr.vp = &fake_buf; } } @@ -4297,18 +4208,15 @@ H5D__mpio_collective_filtered_chunk_common_io(H5D_filtered_collective_io_info_t * to be the first chunk's file address */ if (io_info->op_type == H5D_IO_OP_WRITE) - ctg_store.contig.dset_addr = chunk_list[0].chunk_new.offset; + coll_io_info.store_faddr = chunk_list[0].chunk_new.offset; else - ctg_store.contig.dset_addr = base_read_offset; + coll_io_info.store_faddr = base_read_offset; } else - ctg_store.contig.dset_addr = 0; - - ctg_store.contig.dset_size = (hsize_t)io_info->dset->shared->layout.u.chunk.size; - coll_io_info.store = &ctg_store; + coll_io_info.store_faddr = 0; /* Perform I/O */ - if (H5D__final_collective_io(&coll_io_info, type_info, mpi_buf_count, file_type, mem_type) < 0) + if (H5D__final_collective_io(&coll_io_info, mpi_buf_count, file_type, mem_type) < 0) HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't finish MPI I/O") done: @@ -4335,10 +4243,10 @@ done: static herr_t H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chunk_list, size_t chunk_list_num_entries, const H5D_io_info_t *io_info, - const H5D_type_info_t *type_info, int mpi_rank, int mpi_size) + const H5D_dset_io_info_t *di, int mpi_rank, int mpi_size) { H5D_fill_buf_info_t fb_info; - H5D_chunk_info_t *chunk_info = NULL; + H5D_piece_info_t *chunk_info = NULL; H5D_io_info_t coll_io_info; H5Z_EDC_t err_detect; /* Error detection info */ H5Z_cb_t filter_cb; /* I/O filter callback function */ @@ -4356,7 +4264,7 @@ H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chun HDassert(chunk_list || 0 == chunk_list_num_entries); HDassert(io_info); - HDassert(type_info); + HDassert(di); #ifdef H5Dmpio_DEBUG H5D_MPIO_TRACE_ENTER(mpi_rank); @@ -4366,8 +4274,8 @@ H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chun #endif /* Initialize temporary I/O info */ - coll_io_info = *io_info; - coll_io_info.u.rbuf = NULL; + coll_io_info = *io_info; + coll_io_info.base_maddr.vp = NULL; if (chunk_list_num_entries) { /* Retrieve filter settings from API context */ @@ -4377,12 +4285,12 @@ H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chun HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get I/O filter callback function") /* Set size of full chunks in dataset */ - file_chunk_size = io_info->dset->shared->layout.u.chunk.size; + file_chunk_size = di->dset->shared->layout.u.chunk.size; /* Determine if fill values should be "read" for unallocated chunks */ - should_fill = (io_info->dset->shared->dcpl_cache.fill.fill_time == H5D_FILL_TIME_ALLOC) || - ((io_info->dset->shared->dcpl_cache.fill.fill_time == H5D_FILL_TIME_IFSET) && - io_info->dset->shared->dcpl_cache.fill.fill_defined); + should_fill = (di->dset->shared->dcpl_cache.fill.fill_time == H5D_FILL_TIME_ALLOC) || + ((di->dset->shared->dcpl_cache.fill.fill_time == H5D_FILL_TIME_IFSET) && + di->dset->shared->dcpl_cache.fill.fill_defined); } /* @@ -4436,22 +4344,21 @@ H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chun if (!fb_info_init) { hsize_t 
chunk_dims[H5S_MAX_RANK]; - HDassert(io_info->dset->shared->ndims == io_info->dset->shared->layout.u.chunk.ndims - 1); - for (size_t j = 0; j < io_info->dset->shared->layout.u.chunk.ndims - 1; j++) - chunk_dims[j] = (hsize_t)io_info->dset->shared->layout.u.chunk.dim[j]; + HDassert(di->dset->shared->ndims == di->dset->shared->layout.u.chunk.ndims - 1); + for (size_t j = 0; j < di->dset->shared->layout.u.chunk.ndims - 1; j++) + chunk_dims[j] = (hsize_t)di->dset->shared->layout.u.chunk.dim[j]; /* Get a dataspace for filling chunk memory buffers */ - if (NULL == (fill_space = H5S_create_simple( - io_info->dset->shared->layout.u.chunk.ndims - 1, chunk_dims, NULL))) + if (NULL == (fill_space = H5S_create_simple(di->dset->shared->layout.u.chunk.ndims - 1, + chunk_dims, NULL))) HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to create chunk fill dataspace") /* Initialize fill value buffer */ - if (H5D__fill_init(&fb_info, NULL, (H5MM_allocate_t)H5D__chunk_mem_alloc, - (void *)&io_info->dset->shared->dcpl_cache.pline, - (H5MM_free_t)H5D__chunk_mem_free, - (void *)&io_info->dset->shared->dcpl_cache.pline, - &io_info->dset->shared->dcpl_cache.fill, io_info->dset->shared->type, - io_info->dset->shared->type_id, 0, file_chunk_size) < 0) + if (H5D__fill_init( + &fb_info, NULL, (H5MM_allocate_t)H5D__chunk_mem_alloc, + (void *)&di->dset->shared->dcpl_cache.pline, (H5MM_free_t)H5D__chunk_mem_free, + (void *)&di->dset->shared->dcpl_cache.pline, &di->dset->shared->dcpl_cache.fill, + di->dset->shared->type, di->dset->shared->type_id, 0, file_chunk_size) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "can't initialize fill value buffer") fb_info_init = TRUE; @@ -4459,8 +4366,8 @@ H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chun /* Write fill value to memory buffer */ HDassert(fb_info.fill_buf); - if (H5D__fill(fb_info.fill_buf, io_info->dset->shared->type, chunk_list[i].buf, - type_info->mem_type, fill_space) < 0) + if (H5D__fill(fb_info.fill_buf, di->dset->shared->type, chunk_list[i].buf, + di->type_info.mem_type, fill_space) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "couldn't fill chunk buffer with fill value") } } @@ -4472,8 +4379,8 @@ H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chun * read of chunks is essentially a no-op, so avoid it here. 
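/*
 * When a selected chunk has no file allocation yet, the code above "reads"
 * it by seeding the chunk buffer with the dataset's fill value before the
 * selected elements are overwritten. Reduced to its core, that seeding step
 * looks like this sketch (illustrative only; fill_value and elmt_size are
 * hypothetical stand-ins for what H5D__fill_init() derives from the dataset
 * creation properties):
 */
#include <string.h>

static void
seed_chunk_with_fill_value(unsigned char *chunk_buf, size_t chunk_nelmts,
                           const unsigned char *fill_value, size_t elmt_size)
{
    size_t u;

    /* Replicate the fill value across every element so that a subsequent
     * partial write only has to overwrite the selected elements */
    for (u = 0; u < chunk_nelmts; u++)
        memcpy(chunk_buf + (u * elmt_size), fill_value, elmt_size);
}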
*/ index_empty = FALSE; - if (io_info->dset->shared->dcpl_cache.fill.alloc_time == H5D_ALLOC_TIME_INCR) - if (H5D__chunk_index_empty(io_info->dset, &index_empty) < 0) + if (di->dset->shared->dcpl_cache.fill.alloc_time == H5D_ALLOC_TIME_INCR) + if (H5D__chunk_index_empty(di->dset, &index_empty) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't determine if chunk index is empty") if (!index_empty) { @@ -4482,11 +4389,11 @@ H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chun * the first chunk data buffer being read into */ if (base_read_buf) - coll_io_info.u.rbuf = base_read_buf; + coll_io_info.base_maddr.vp = base_read_buf; /* Perform collective chunk read */ if (H5D__mpio_collective_filtered_chunk_common_io(chunk_list, chunk_list_num_entries, &coll_io_info, - type_info, mpi_size) < 0) + mpi_size) < 0) HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't finish collective filtered chunk read") } @@ -4499,7 +4406,7 @@ H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chun /* Unfilter the chunk, unless we didn't read it from the file */ if (chunk_list[i].need_read && !chunk_list[i].skip_filter_pline) { - if (H5Z_pipeline(&io_info->dset->shared->dcpl_cache.pline, H5Z_FLAG_REVERSE, + if (H5Z_pipeline(&di->dset->shared->dcpl_cache.pline, H5Z_FLAG_REVERSE, &(chunk_list[i].index_info.filter_mask), err_detect, filter_cb, (size_t *)&chunk_list[i].chunk_new.length, &chunk_list[i].chunk_buf_size, &chunk_list[i].buf) < 0) @@ -4509,8 +4416,8 @@ H5D__mpio_collective_filtered_chunk_read(H5D_filtered_collective_io_info_t *chun /* Scatter the chunk data to the read buffer */ iter_nelmts = H5S_GET_SELECT_NPOINTS(chunk_info->fspace); - if (H5D_select_io_mem(io_info->u.rbuf, chunk_info->mspace, chunk_list[i].buf, chunk_info->fspace, - type_info->src_type_size, (size_t)iter_nelmts) < 0) + if (H5D_select_io_mem(di->buf.vp, chunk_info->mspace, chunk_list[i].buf, chunk_info->fspace, + di->type_info.src_type_size, (size_t)iter_nelmts) < 0) HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "couldn't copy chunk data to read buffer") } @@ -4554,39 +4461,44 @@ H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *ch size_t chunk_list_num_entries, H5D_filtered_collective_io_info_t *chunk_hash_table, unsigned char **chunk_msg_bufs, int chunk_msg_bufs_len, - const H5D_io_info_t *io_info, const H5D_type_info_t *type_info, + const H5D_io_info_t *io_info, const H5D_dset_io_info_t *di, int H5_ATTR_NDEBUG_UNUSED mpi_rank, int mpi_size) { - H5D_fill_buf_info_t fb_info; - H5D_chunk_info_t *chunk_info = NULL; - H5S_sel_iter_t *sel_iter = NULL; /* Dataspace selection iterator for H5D__scatter_mem */ - H5D_io_info_t coll_io_info; - H5Z_EDC_t err_detect; /* Error detection info */ - H5Z_cb_t filter_cb; /* I/O filter callback function */ - hsize_t file_chunk_size = 0; - hsize_t iter_nelmts; /* Number of points to iterate over for the chunk IO operation */ - hbool_t should_fill = FALSE; - hbool_t fb_info_init = FALSE; - hbool_t sel_iter_init = FALSE; - hbool_t index_empty = FALSE; - size_t i; - H5S_t *dataspace = NULL; - H5S_t *fill_space = NULL; - void *base_read_buf = NULL; - herr_t ret_value = SUCCEED; + const H5D_type_info_t *type_info = NULL; + H5D_fill_buf_info_t fb_info; + H5D_piece_info_t *chunk_info = NULL; + H5S_sel_iter_t *sel_iter = NULL; /* Dataspace selection iterator for H5D__scatter_mem */ + H5D_io_info_t coll_io_info; + H5Z_EDC_t err_detect; /* Error detection info */ + H5Z_cb_t filter_cb; /* I/O filter callback function */ + hsize_t 
@@ -4554,39 +4461,44 @@ H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *ch
                                            size_t chunk_list_num_entries,
                                            H5D_filtered_collective_io_info_t *chunk_hash_table,
                                            unsigned char **chunk_msg_bufs, int chunk_msg_bufs_len,
-                                           const H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
+                                           const H5D_io_info_t *io_info, const H5D_dset_io_info_t *di,
                                            int H5_ATTR_NDEBUG_UNUSED mpi_rank, int mpi_size)
 {
-    H5D_fill_buf_info_t fb_info;
-    H5D_chunk_info_t   *chunk_info = NULL;
-    H5S_sel_iter_t     *sel_iter   = NULL; /* Dataspace selection iterator for H5D__scatter_mem */
-    H5D_io_info_t       coll_io_info;
-    H5Z_EDC_t           err_detect; /* Error detection info */
-    H5Z_cb_t            filter_cb;  /* I/O filter callback function */
-    hsize_t             file_chunk_size = 0;
-    hsize_t             iter_nelmts; /* Number of points to iterate over for the chunk IO operation */
-    hbool_t             should_fill   = FALSE;
-    hbool_t             fb_info_init  = FALSE;
-    hbool_t             sel_iter_init = FALSE;
-    hbool_t             index_empty   = FALSE;
-    size_t              i;
-    H5S_t              *dataspace     = NULL;
-    H5S_t              *fill_space    = NULL;
-    void               *base_read_buf = NULL;
-    herr_t              ret_value     = SUCCEED;
+    const H5D_type_info_t *type_info = NULL;
+    H5D_fill_buf_info_t    fb_info;
+    H5D_piece_info_t      *chunk_info = NULL;
+    H5S_sel_iter_t        *sel_iter   = NULL; /* Dataspace selection iterator for H5D__scatter_mem */
+    H5D_io_info_t          coll_io_info;
+    H5Z_EDC_t              err_detect; /* Error detection info */
+    H5Z_cb_t               filter_cb;  /* I/O filter callback function */
+    hsize_t                file_chunk_size = 0;
+    hsize_t                iter_nelmts; /* Number of points to iterate over for the chunk IO operation */
+    hbool_t                should_fill   = FALSE;
+    hbool_t                fb_info_init  = FALSE;
+    hbool_t                sel_iter_init = FALSE;
+    hbool_t                index_empty   = FALSE;
+    size_t                 i;
+    H5S_t                 *dataspace     = NULL;
+    H5S_t                 *fill_space    = NULL;
+    void                  *base_read_buf = NULL;
+    herr_t                 ret_value     = SUCCEED;
 
     FUNC_ENTER_PACKAGE
 
     HDassert(chunk_list || 0 == chunk_list_num_entries);
     HDassert((chunk_msg_bufs && chunk_hash_table) || 0 == chunk_msg_bufs_len);
     HDassert(io_info);
-    HDassert(type_info);
+    HDassert(di);
 
 #ifdef H5Dmpio_DEBUG
     H5D_MPIO_TRACE_ENTER(mpi_rank);
     H5D_MPIO_TIME_START(mpi_rank, "Filtered collective chunk update");
 #endif
 
+    /* Set convenience pointers */
+    type_info = &(di->type_info);
+    HDassert(type_info);
+
     if (chunk_list_num_entries) {
         /* Retrieve filter settings from API context */
         if (H5CX_get_err_detect(&err_detect) < 0)
@@ -4595,12 +4507,12 @@ H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *ch
             HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get I/O filter callback function")
 
         /* Set size of full chunks in dataset */
-        file_chunk_size = io_info->dset->shared->layout.u.chunk.size;
+        file_chunk_size = di->dset->shared->layout.u.chunk.size;
 
         /* Determine if fill values should be written to chunks */
-        should_fill = (io_info->dset->shared->dcpl_cache.fill.fill_time == H5D_FILL_TIME_ALLOC) ||
-                      ((io_info->dset->shared->dcpl_cache.fill.fill_time == H5D_FILL_TIME_IFSET) &&
-                       io_info->dset->shared->dcpl_cache.fill.fill_defined);
+        should_fill = (di->dset->shared->dcpl_cache.fill.fill_time == H5D_FILL_TIME_ALLOC) ||
+                      ((di->dset->shared->dcpl_cache.fill.fill_time == H5D_FILL_TIME_IFSET) &&
+                       di->dset->shared->dcpl_cache.fill.fill_defined);
     }
 
     /*
@@ -4672,25 +4584,23 @@ H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *ch
             if (!fb_info_init) {
                 hsize_t chunk_dims[H5S_MAX_RANK];
 
-                HDassert(io_info->dset->shared->ndims ==
-                         io_info->dset->shared->layout.u.chunk.ndims - 1);
-                for (size_t j = 0; j < io_info->dset->shared->layout.u.chunk.ndims - 1; j++)
-                    chunk_dims[j] = (hsize_t)io_info->dset->shared->layout.u.chunk.dim[j];
+                HDassert(di->dset->shared->ndims == di->dset->shared->layout.u.chunk.ndims - 1);
+                for (size_t j = 0; j < di->dset->shared->layout.u.chunk.ndims - 1; j++)
+                    chunk_dims[j] = (hsize_t)di->dset->shared->layout.u.chunk.dim[j];
 
                 /* Get a dataspace for filling chunk memory buffers */
                 if (NULL == (fill_space = H5S_create_simple(
-                                 io_info->dset->shared->layout.u.chunk.ndims - 1, chunk_dims, NULL)))
+                                 di->dset->shared->layout.u.chunk.ndims - 1, chunk_dims, NULL)))
                     HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to create chunk fill dataspace")
 
                 /* Initialize fill value buffer */
                 if (H5D__fill_init(&fb_info, NULL, (H5MM_allocate_t)H5D__chunk_mem_alloc,
-                                   (void *)&io_info->dset->shared->dcpl_cache.pline,
+                                   (void *)&di->dset->shared->dcpl_cache.pline,
                                    (H5MM_free_t)H5D__chunk_mem_free,
-                                   (void *)&io_info->dset->shared->dcpl_cache.pline,
-                                   &io_info->dset->shared->dcpl_cache.fill,
-                                   io_info->dset->shared->type, io_info->dset->shared->type_id, 0,
-                                   file_chunk_size) < 0)
+                                   (void *)&di->dset->shared->dcpl_cache.pline,
+                                   &di->dset->shared->dcpl_cache.fill, di->dset->shared->type,
+                                   di->dset->shared->type_id, 0, file_chunk_size) < 0)
                     HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "can't initialize fill value buffer")
 
                 fb_info_init = TRUE;
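The `should_fill` computation earlier in this hunk condenses HDF5's fill-time policy: chunks that will be read-modified but do not yet exist in the file are pre-filled either when the fill time is H5D_FILL_TIME_ALLOC, or when it is H5D_FILL_TIME_IFSET and the application actually defined a fill value. Restated as a standalone predicate over the same fields the diff touches; `chunk_needs_fill` is a sketch, not a library helper:

    /* Sketch: the fill decision, isolated. H5O_fill_t is the fill
     * message type behind dcpl_cache.fill in the code above. */
    static hbool_t
    chunk_needs_fill(const H5O_fill_t *fill)
    {
        return (fill->fill_time == H5D_FILL_TIME_ALLOC) ||
               ((fill->fill_time == H5D_FILL_TIME_IFSET) && fill->fill_defined);
    }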
@@ -4698,7 +4608,7 @@ H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *ch
 
                 /* Write fill value to memory buffer */
                 HDassert(fb_info.fill_buf);
-                if (H5D__fill(fb_info.fill_buf, io_info->dset->shared->type, chunk_list[i].buf,
+                if (H5D__fill(fb_info.fill_buf, di->dset->shared->type, chunk_list[i].buf,
                               type_info->mem_type, fill_space) < 0)
                     HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "couldn't fill chunk buffer with fill value")
@@ -4715,8 +4625,8 @@ H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *ch
      * read of chunks is essentially a no-op, so avoid it here.
      */
     index_empty = FALSE;
-    if (io_info->dset->shared->dcpl_cache.fill.alloc_time == H5D_ALLOC_TIME_INCR)
-        if (H5D__chunk_index_empty(io_info->dset, &index_empty) < 0)
+    if (di->dset->shared->dcpl_cache.fill.alloc_time == H5D_ALLOC_TIME_INCR)
+        if (H5D__chunk_index_empty(di->dset, &index_empty) < 0)
             HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't determine if chunk index is empty")
 
     if (!index_empty) {
@@ -4731,12 +4641,13 @@ H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *ch
         /* Override the read buffer to point to the address of the first
          * chunk data buffer being read into
         */
-        if (base_read_buf)
-            coll_io_info.u.rbuf = base_read_buf;
+        if (base_read_buf) {
+            coll_io_info.base_maddr.vp = base_read_buf;
+        }
 
         /* Read all chunks that need to be read from the file */
         if (H5D__mpio_collective_filtered_chunk_common_io(chunk_list, chunk_list_num_entries, &coll_io_info,
-                                                          type_info, mpi_size) < 0)
+                                                          mpi_size) < 0)
             HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't finish collective filtered chunk read")
     }
 
@@ -4756,7 +4667,7 @@ H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *ch
          * the file, so we need to unfilter it
          */
         if (chunk_list[i].need_read && !chunk_list[i].skip_filter_pline) {
-            if (H5Z_pipeline(&io_info->dset->shared->dcpl_cache.pline, H5Z_FLAG_REVERSE,
+            if (H5Z_pipeline(&di->dset->shared->dcpl_cache.pline, H5Z_FLAG_REVERSE,
                              &(chunk_list[i].index_info.filter_mask), err_detect, filter_cb,
                              (size_t *)&chunk_list[i].chunk_new.length, &chunk_list[i].chunk_buf_size,
                              &chunk_list[i].buf) < 0)
@@ -4765,7 +4676,7 @@ H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *ch
 
             iter_nelmts = H5S_GET_SELECT_NPOINTS(chunk_info->mspace);
 
-            if (H5D_select_io_mem(chunk_list[i].buf, chunk_info->fspace, io_info->u.wbuf, chunk_info->mspace,
+            if (H5D_select_io_mem(chunk_list[i].buf, chunk_info->fspace, di->buf.cvp, chunk_info->mspace,
                                   type_info->dst_type_size, (size_t)iter_nelmts) < 0)
                 HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "couldn't copy chunk data to write buffer")
         }
@@ -4834,10 +4745,9 @@ H5D__mpio_collective_filtered_chunk_update(H5D_filtered_collective_io_info_t *ch
     /* Finally, filter all the chunks */
     for (i = 0; i < chunk_list_num_entries; i++) {
         if (!chunk_list[i].skip_filter_pline) {
-            if (H5Z_pipeline(&io_info->dset->shared->dcpl_cache.pline, 0,
-                             &(chunk_list[i].index_info.filter_mask), err_detect, filter_cb,
-                             (size_t *)&chunk_list[i].chunk_new.length, &chunk_list[i].chunk_buf_size,
-                             &chunk_list[i].buf) < 0)
+            if (H5Z_pipeline(&di->dset->shared->dcpl_cache.pline, 0, &(chunk_list[i].index_info.filter_mask),
+                             err_detect, filter_cb, (size_t *)&chunk_list[i].chunk_new.length,
+                             &chunk_list[i].chunk_buf_size, &chunk_list[i].buf) < 0)
                 HGOTO_ERROR(H5E_PLINE, H5E_CANTFILTER, FAIL, "output pipeline failed")
         }
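The two H5Z_pipeline call sites in this function run the same filter pipeline in opposite directions: a flags value of 0 filters chunk data on its way to the file, while H5Z_FLAG_REVERSE (used after the collective read above) unfilters it. A user-defined filter observes the direction through the same flag; a skeletal filter callback using the public H5Z interface, where the name and the no-op bodies are placeholders:

    #include "hdf5.h"

    /* Sketch: shape of an H5Z filter callback. Returns the new data
     * size on success, 0 on failure. */
    static size_t
    example_filter(unsigned flags, size_t cd_nelmts, const unsigned cd_values[],
                   size_t nbytes, size_t *buf_size, void **buf)
    {
        (void)cd_nelmts;
        (void)cd_values;
        (void)buf_size;
        (void)buf;

        if (flags & H5Z_FLAG_REVERSE) {
            /* Read path: undo the filter (e.g. decompress) in *buf */
        }
        else {
            /* Write path: apply the filter (e.g. compress) in *buf,
             * reallocating *buf and updating *buf_size if it grows */
        }

        return nbytes;
    }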
@@ -5088,8 +4998,8 @@ done:
 static herr_t
 H5D__mpio_collective_filtered_chunk_reinsert(H5D_filtered_collective_io_info_t *chunk_list,
                                              size_t chunk_list_num_entries, size_t *num_chunks_assigned_map,
-                                             H5D_io_info_t *io_info, H5D_chk_idx_info_t *idx_info,
-                                             int mpi_rank, int mpi_size)
+                                             H5D_io_info_t *io_info, H5D_dset_io_info_t *di,
+                                             H5D_chk_idx_info_t *idx_info, int mpi_rank, int mpi_size)
 {
     H5D_chunk_ud_t chunk_ud;
     MPI_Datatype   send_type;
@@ -5110,6 +5020,7 @@ H5D__mpio_collective_filtered_chunk_reinsert(H5D_filtered_collective_io_info_t *
 
     HDassert(chunk_list || 0 == chunk_list_num_entries);
     HDassert(io_info);
+    HDassert(di);
     HDassert(idx_info);
 
 #ifdef H5Dmpio_DEBUG
@@ -5219,17 +5130,17 @@ H5D__mpio_collective_filtered_chunk_reinsert(H5D_filtered_collective_io_info_t *
              * callback that accepts a chunk index and provides the
              * caller with the scaled coordinates for that chunk.
              */
-            H5VM_array_calc_pre(chunk_ud.chunk_idx, io_info->dset->shared->ndims,
+            H5VM_array_calc_pre(chunk_ud.chunk_idx, di->dset->shared->ndims,
                                 idx_info->layout->u.earray.swizzled_down_chunks, scaled_coords);
 
             H5VM_unswizzle_coords(hsize_t, scaled_coords, idx_info->layout->u.earray.unlim_dim);
         }
         else {
-            H5VM_array_calc_pre(chunk_ud.chunk_idx, io_info->dset->shared->ndims,
-                                io_info->dset->shared->layout.u.chunk.down_chunks, scaled_coords);
+            H5VM_array_calc_pre(chunk_ud.chunk_idx, di->dset->shared->ndims,
+                                di->dset->shared->layout.u.chunk.down_chunks, scaled_coords);
         }
 
-        scaled_coords[io_info->dset->shared->ndims] = 0;
+        scaled_coords[di->dset->shared->ndims] = 0;
 
 #ifndef NDEBUG
         /*
@@ -5243,7 +5154,7 @@ H5D__mpio_collective_filtered_chunk_reinsert(H5D_filtered_collective_io_info_t *
             for (size_t dbg_idx = 0; dbg_idx < chunk_list_num_entries; dbg_idx++) {
                 if (coll_entry->index_info.chunk_idx == chunk_list[dbg_idx].index_info.chunk_idx) {
                     hbool_t coords_match = !HDmemcmp(scaled_coords, chunk_list[dbg_idx].chunk_info->scaled,
-                                                     io_info->dset->shared->ndims * sizeof(hsize_t));
+                                                     di->dset->shared->ndims * sizeof(hsize_t));
 
                     HDassert(coords_match && "Calculated scaled coordinates for chunk didn't match "
                                              "chunk's actual scaled coordinates!");
@@ -5252,7 +5163,7 @@ H5D__mpio_collective_filtered_chunk_reinsert(H5D_filtered_collective_io_info_t *
         }
 #endif
 
-        if ((idx_info->storage->ops->insert)(idx_info, &chunk_ud, io_info->dset) < 0)
+        if ((idx_info->storage->ops->insert)(idx_info, &chunk_ud, di->dset) < 0)
             HGOTO_ERROR(H5E_DATASET, H5E_CANTINSERT, FAIL, "unable to insert chunk address into index")
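H5VM_array_calc_pre converts a linear chunk index back into per-dimension scaled coordinates using precomputed "down" products, where down[i] is the number of chunks spanned by one step in dimension i. A standalone sketch of the arithmetic, offered as an assumption about the helper's behavior rather than the library's code:

    #include <stdint.h>

    /* Sketch: decompose a linear index into N-dimensional coordinates.
     * down[i] must hold the product of the chunk counts of all
     * dimensions after i (so down[ndims - 1] == 1). */
    static void
    index_to_scaled_coords(uint64_t idx, unsigned ndims, const uint64_t *down, uint64_t *coords)
    {
        for (unsigned u = 0; u < ndims; u++) {
            coords[u] = idx / down[u];
            idx -= coords[u] * down[u];
        }
    }

For example, a 2-D dataset with 4 chunks per row has down = {4, 1}, so chunk index 6 maps to coordinates {1, 2}. The code above then appends a trailing 0 for the element-size dimension (scaled_coords[ndims] = 0) and, for extensible-array indexes, computes against the swizzled chunk ordering first and unswizzles the result so the coordinates match the dataset's dimension order.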