diff options
-rw-r--r-- | src/H5Dio.c | 436 | ||||
-rw-r--r-- | src/H5Dmpio.c | 1646 | ||||
-rw-r--r-- | src/H5Dpkg.h | 85 | ||||
-rw-r--r-- | src/H5Dselect.c | 2 |
4 files changed, 1701 insertions, 468 deletions
diff --git a/src/H5Dio.c b/src/H5Dio.c index 21face4..6b62ca9 100644 --- a/src/H5Dio.c +++ b/src/H5Dio.c @@ -49,6 +49,7 @@ /* Information for mapping between file space and memory space */ +#if 0 /* Structure holding information about a chunk's selection for mapping */ typedef struct H5D_chunk_info_t { hsize_t index; /* "Index" of chunk in dataset */ @@ -77,8 +78,11 @@ typedef struct fm_map { hsize_t down_chunks[H5O_LAYOUT_NDIMS]; /* "down" size of number of chunks in each dimension */ H5O_layout_t *layout; /* Dataset layout information*/ H5S_sel_type msel_type; /* Selection type in memory */ + hsize_t total_chunks; /* Number of total chunks */ + hbool_t *select_chunk; /* store the information about whether this chunk is selected or not */ } fm_map; +#endif /********************/ /* Local Prototypes */ /********************/ @@ -107,12 +111,9 @@ static herr_t H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts, const H5T_t *mem_type, const H5S_t *mem_space, const H5S_t *file_space, H5T_path_t *tpath, hid_t src_id, hid_t dst_id, const void *buf); + #ifdef H5_HAVE_PARALLEL -static herr_t H5D_ioinfo_make_ind(H5D_io_info_t *io_info); -static herr_t H5D_ioinfo_make_coll(H5D_io_info_t *io_info); static herr_t H5D_ioinfo_term(H5D_io_info_t *io_info); -static herr_t H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info, - const fm_map *fm, int *min_chunkf); #endif /* H5_HAVE_PARALLEL */ /* I/O info operations */ @@ -125,7 +126,7 @@ static herr_t H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *file_space, const H5S_t *mem_space, fm_map *fm); static herr_t H5D_destroy_chunk_map(const fm_map *fm); static herr_t H5D_free_chunk_info(void *item, void *key, void *opdata); -static herr_t H5D_create_chunk_file_map_hyper(const fm_map *fm); +static herr_t H5D_create_chunk_file_map_hyper(fm_map *fm, const H5D_t *dset); static herr_t H5D_create_chunk_mem_map_hyper(const fm_map *fm); static herr_t H5D_chunk_file_cb(void *elem, hid_t type_id, unsigned ndims, const hsize_t *coords, void *fm); @@ -952,6 +953,8 @@ H5D_contig_read(H5D_io_info_t *io_info, hsize_t nelmts, hsize_t smine_start; /*strip mine start loc */ size_t n, smine_nelmts; /*elements per strip */ H5D_storage_t store; /*union of storage info for dataset */ + + herr_t ret_value = SUCCEED; /*return value */ FUNC_ENTER_NOAPI_NOINIT(H5D_contig_read) @@ -981,10 +984,21 @@ H5D_contig_read(H5D_io_info_t *io_info, hsize_t nelmts, || dataset->shared->efl.nused>0 || 0 == nelmts || dataset->shared->layout.type==H5D_COMPACT); H5_CHECK_OVERFLOW(nelmts,hsize_t,size_t); - status = (io_info->ops.read)(io_info, + +#ifdef H5_HAVE_PARALLEL + if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) { + if(H5D_contig_collective_io(io_info,file_space,mem_space,buf,FALSE)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't manipulate collective I/O"); + } + else +#endif + { + status = (io_info->ops.read)(io_info, (size_t)nelmts, H5T_get_size(dataset->shared->type), - file_space, mem_space, + file_space, mem_space,0, buf/*out*/); + } + #ifdef H5S_DEBUG H5_timer_end(&(io_info->stats->stats[1].read_timer), &timer); io_info->stats->stats[1].read_nbytes += nelmts * H5T_get_size(dataset->shared->type); @@ -1208,6 +1222,7 @@ H5D_contig_write(H5D_io_info_t *io_info, hsize_t nelmts, hsize_t smine_start; /*strip mine start loc */ size_t n, smine_nelmts; /*elements per strip */ H5D_storage_t store; /*union of storage info for dataset */ + herr_t ret_value = SUCCEED; /*return value */ FUNC_ENTER_NOAPI_NOINIT(H5D_contig_write) @@ -1232,10 +1247,20 @@ H5D_contig_write(H5D_io_info_t *io_info, hsize_t nelmts, H5_timer_begin(&timer); #endif H5_CHECK_OVERFLOW(nelmts,hsize_t,size_t); - status = (io_info->ops.write)(io_info, +#ifdef H5_HAVE_PARALLEL + if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) { + if(H5D_contig_collective_io(io_info,file_space,mem_space,buf,TRUE)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't manipulate collective I/O"); + } + else +#endif + { + status = (io_info->ops.write)(io_info, (size_t)nelmts, H5T_get_size(dataset->shared->type), - file_space, mem_space, - buf); + file_space, mem_space,0, + buf/*out*/); + } + #ifdef H5S_DEBUG H5_timer_end(&(io_info->stats->stats[0].write_timer), &timer); io_info->stats->stats[0].write_nbytes += nelmts * H5T_get_size(mem_type); @@ -1460,10 +1485,6 @@ H5D_chunk_read(H5D_io_info_t *io_info, hsize_t nelmts, uint8_t *tconv_buf = NULL; /*data type conv buffer */ uint8_t *bkg_buf = NULL; /*background buffer */ H5D_storage_t store; /*union of EFL and chunk pointer in file space */ -#ifdef H5_HAVE_PARALLEL - int count_chunk; /* Number of chunks accessed */ - int min_num_chunk; /* Number of chunks to access collectively */ -#endif herr_t ret_value = SUCCEED; /*return value */ FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_read) @@ -1489,99 +1510,48 @@ H5D_chunk_read(H5D_io_info_t *io_info, hsize_t nelmts, || dataset->shared->efl.nused>0 || 0 == nelmts || dataset->shared->layout.type==H5D_COMPACT); - /* Get first node in chunk skip list */ - chunk_node=H5SL_first(fm.fsel); - #ifdef H5_HAVE_PARALLEL if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) { - if(H5D_mpio_get_min_chunk(io_info, &fm, &min_num_chunk)<0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunk") + if(H5D_mpio_chunk_adjust_iomode(io_info,&fm)) + HGOTO_ERROR(H5E_DATASET,H5E_CANTGET,FAIL,"can't adjust collective I/O") + } + /* Temporarily shut down collective IO for chunking */ + if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) { + if(H5D_chunk_collective_io(io_info,&fm,buf,FALSE)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't manipulate collective I/O"); } - count_chunk = 0; -#endif /* H5_HAVE_PARALLEL */ + + else {/* sequential or independent read */ +#endif + /* Get first node in chunk skip list */ + chunk_node=H5SL_first(fm.fsel); - /* Iterate through chunks to be operated on */ - while(chunk_node) { + while(chunk_node) { H5D_chunk_info_t *chunk_info; /* chunk information */ -#ifdef H5_HAVE_PARALLEL - hbool_t make_ind, make_coll; /* Flags to indicate that the MPI mode should change */ -#endif /* H5_HAVE_PARALLEL */ - /* Get the actual chunk information from the skip list node */ - chunk_info=H5SL_item(chunk_node); + /* Get the actual chunk information from the skip list node */ + chunk_info=H5SL_item(chunk_node); - /* Pass in chunk's coordinates in a union. */ - store.chunk.offset = chunk_info->coords; - store.chunk.index = chunk_info->index; + /* Pass in chunk's coordinates in a union. */ + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; -#ifdef H5_HAVE_PARALLEL - /* Reset flags for changing parallel I/O mode */ - make_ind = make_coll = FALSE; - - if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) { - /* Increment chunk we are operating on */ - count_chunk++; - - /* If the number of chunk is greater than minimum number of chunk, - Do independent read */ - if(count_chunk > min_num_chunk) { - /* Switch to independent I/O (permanently) */ - make_ind = TRUE; - } -#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS - else { - int is_regular; /* If this chunk's selections are regular */ - int mpi_code; /* MPI error code */ - int all_regular = 0; /* If this chunk's selections are regular on all processes */ - - /* Determine if this process has regular selections */ - if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE && - H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE) - is_regular = 1; - else - is_regular = 0; - - /* Determine if all processes have regular selections */ - if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&is_regular, &all_regular, 1, MPI_INT, MPI_LAND, io_info->comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) - - /* For regular selection, - if MPI_COMPLEX_DERIVED_DATATYPE is not defined, - unless spaces for all processors are regular, independent read operation should be performed.*/ - if(!all_regular) { - /* Switch to independent I/O (temporarily) */ - make_ind = TRUE; - make_coll = TRUE; - } /* end if */ - } /* end else */ -#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */ - } /* end if */ + /* Perform the actual read operation */ + status = (io_info->ops.read)(io_info, + chunk_info->chunk_points, H5T_get_size(dataset->shared->type), + chunk_info->fspace, chunk_info->mspace, 0,buf); - /* Switch to independent I/O */ - if(make_ind) - if(H5D_ioinfo_make_ind(io_info) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") -#endif /* H5_HAVE_PARALLEL */ + /* Check return value from optimized read */ + if (status<0) + HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, " optimized read failed") - /* Perform the actual read operation */ - status = (io_info->ops.read)(io_info, - chunk_info->chunk_points, H5T_get_size(dataset->shared->type), - chunk_info->fspace, chunk_info->mspace, buf); + chunk_node = H5SL_next(chunk_node); + } #ifdef H5_HAVE_PARALLEL - /* Switch back to collective I/O */ - if(make_coll) - if(H5D_ioinfo_make_coll(io_info) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") -#endif /* H5_HAVE_PARALLEL */ - - /* Check return value from optimized read */ - if (status<0) - HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed") + } +#endif - /* Get the next chunk node in the skip list */ - chunk_node=H5SL_next(chunk_node); - } /* end while */ #ifdef H5S_DEBUG H5_timer_end(&(io_info->stats->stats[1].read_timer), &timer); @@ -1843,10 +1813,7 @@ H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts, uint8_t *tconv_buf = NULL; /*data type conv buffer */ uint8_t *bkg_buf = NULL; /*background buffer */ H5D_storage_t store; /*union of EFL and chunk pointer in file space */ -#ifdef H5_HAVE_PARALLEL - int count_chunk; /* Number of chunks accessed */ - int min_num_chunk; /* Number of chunks to access collectively */ -#endif + herr_t ret_value = SUCCEED; /*return value */ FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_write) @@ -1868,98 +1835,48 @@ H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts, #endif #ifdef H5_HAVE_PARALLEL + /* Check whether the collective mode can be turned off globally*/ + if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) { - if(H5D_mpio_get_min_chunk(io_info, &fm, &min_num_chunk)<0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunk") + if(H5D_mpio_chunk_adjust_iomode(io_info,&fm)) + HGOTO_ERROR(H5E_DATASET,H5E_CANTGET,FAIL,"can't adjust collective I/O") + } + if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) { + if(H5D_chunk_collective_io(io_info,&fm,buf,TRUE)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't manipulate collective I/O"); } - count_chunk = 0; -#endif /* H5_HAVE_PARALLEL */ + else {/* sequential or independent write */ + + #endif /* H5_HAVE_PARALLEL */ + /* Get first node in chunk skip list */ + chunk_node=H5SL_first(fm.fsel); - /* Get first node in chunk skip list */ - chunk_node=H5SL_first(fm.fsel); - - /* Iterate through chunks to be operated on */ - while(chunk_node) { + while(chunk_node) { H5D_chunk_info_t *chunk_info; /* Chunk information */ -#ifdef H5_HAVE_PARALLEL - hbool_t make_ind, make_coll; /* Flags to indicate that the MPI mode should change */ -#endif /* H5_HAVE_PARALLEL */ - /* Get the actual chunk information from the skip list node */ - chunk_info=H5SL_item(chunk_node); + /* Get the actual chunk information from the skip list node */ + chunk_info=H5SL_item(chunk_node); - /* Pass in chunk's coordinates in a union. */ - store.chunk.offset = chunk_info->coords; - store.chunk.index = chunk_info->index; + /* Pass in chunk's coordinates in a union. */ + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; -#ifdef H5_HAVE_PARALLEL - /* Reset flags for changing parallel I/O mode */ - make_ind = make_coll = FALSE; - - if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) { - /* Increment chunk we are operating on */ - count_chunk++; - - /* If the number of chunk is greater than minimum number of chunk, - Do independent write */ - if(count_chunk > min_num_chunk) { - /* Switch to independent I/O (permanently) */ - make_ind = TRUE; - } -#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS - else { - int is_regular; /* If this chunk's selections are regular */ - int mpi_code; /* MPI error code */ - int all_regular = 0; /* If this chunk's selections are regular on all processes */ - - /* Determine if this process has regular selections */ - if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE && - H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE) - is_regular = 1; - else - is_regular = 0; - - /* Determine if all processes have regular selections */ - if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&is_regular, &all_regular, 1, MPI_INT, MPI_LAND, io_info->comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) - - /* For regular selection, - if MPI_COMPLEX_DERIVED_DATATYPE is not defined, - unless spaces for all processors are regular, independent read operation should be performed.*/ - if(!all_regular) { - /* Switch to independent I/O (temporarily) */ - make_ind = TRUE; - make_coll = TRUE; - } /* end if */ - } /* end else */ -#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */ - } /* end if */ - - /* Switch to independent I/O */ - if(make_ind) - if(H5D_ioinfo_make_ind(io_info) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") -#endif /* H5_HAVE_PARALLEL */ + /* Perform the actual read operation */ - /* Perform the actual write operation */ - status = (io_info->ops.write)(io_info, - chunk_info->chunk_points, H5T_get_size(dataset->shared->type), - chunk_info->fspace, chunk_info->mspace, buf); + status = (io_info->ops.write)(io_info, + chunk_info->chunk_points, H5T_get_size(dataset->shared->type), + chunk_info->fspace, chunk_info->mspace, 0,buf); -#ifdef H5_HAVE_PARALLEL - /* Switch back to collective I/O */ - if(make_coll) - if(H5D_ioinfo_make_coll(io_info) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") -#endif /* H5_HAVE_PARALLEL */ + /* Check return value from optimized read */ + if (status<0) + HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, " optimized read failed") - /* Check return value from optimized write */ - if (status<0) - HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed") + chunk_node = H5SL_next(chunk_node); - /* Get the next chunk node in the skip list */ - chunk_node=H5SL_next(chunk_node); - } /* end while */ + } +#ifdef H5_HAVE_PARALLEL + } +#endif #ifdef H5S_DEBUG H5_timer_end(&(io_info->stats->stats[0].write_timer), &timer); @@ -2211,6 +2128,7 @@ H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *f H5S_sel_type fsel_type; /* Selection type on disk */ char bogus; /* "bogus" buffer to pass to selection iterator */ unsigned u; /* Local index variable */ + hbool_t sel_hyper_flag; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI_NOINIT(H5D_create_chunk_map) @@ -2218,6 +2136,7 @@ H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *f /* Get layout for dataset */ fm->layout = &(dataset->shared->layout); + /* Check if the memory space is scalar & make equivalent memory space */ if((sm_ndims = H5S_GET_EXTENT_NDIMS(mem_space))<0) HGOTO_ERROR (H5E_DATASPACE, H5E_CANTGET, FAIL, "unable to get dimension number") @@ -2272,6 +2191,14 @@ H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *f if(H5V_array_down(f_ndims,fm->chunks,fm->down_chunks)<0) HGOTO_ERROR (H5E_INTERNAL, H5E_BADVALUE, FAIL, "can't compute 'down' sizes") + /* calculate total chunk in file map*/ + fm->select_chunk = NULL; + fm->total_chunks = 1; + for(u=0; u<fm->f_ndims; u++) + fm->total_chunks= fm->total_chunks*fm->chunks[u]; + + + /* Initialize skip list for chunk selections */ if((fm->fsel=H5SL_create(H5SL_TYPE_HSIZE,0.5,H5D_DEFAULT_SKIPLIST_HEIGHT))==NULL) HGOTO_ERROR(H5E_DATASET,H5E_CANTCREATE,FAIL,"can't create skip list for chunk selections") @@ -2291,8 +2218,13 @@ H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *f if((fm->msel_type=H5S_GET_SELECT_TYPE(equiv_mspace))<H5S_SEL_NONE) HGOTO_ERROR (H5E_DATASET, H5E_BADSELECT, FAIL, "unable to convert from file to memory data space") + /* If the selection is NONE or POINTS, set the flag to FALSE */ + if(fsel_type == H5S_SEL_POINTS || fsel_type == H5S_SEL_NONE) + sel_hyper_flag = FALSE; + else + sel_hyper_flag = TRUE; /* Check if file selection is a point selection */ - if(fsel_type==H5S_SEL_POINTS) { + if(!sel_hyper_flag) { /* Create temporary datatypes for selection iteration */ if((f_tid = H5I_register(H5I_DATATYPE, H5T_copy(dataset->shared->type, H5T_COPY_ALL)))<0) HGOTO_ERROR (H5E_DATATYPE, H5E_CANTREGISTER, FAIL, "unable to register file datatype") @@ -2307,7 +2239,7 @@ H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *f } /* end if */ else { /* Build the file selection for each chunk */ - if(H5D_create_chunk_file_map_hyper(fm)<0) + if(H5D_create_chunk_file_map_hyper(fm,dataset)<0) HGOTO_ERROR (H5E_DATASET, H5E_CANTINIT, FAIL, "unable to create file chunk selections") /* Clean file chunks' hyperslab span "scratch" information */ @@ -2329,7 +2261,7 @@ H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *f } /* end else */ /* Build the memory selection for each chunk */ - if(fsel_type!=H5S_SEL_POINTS && H5S_select_shape_same(file_space,equiv_mspace)==TRUE) { + if(sel_hyper_flag && H5S_select_shape_same(file_space,equiv_mspace)==TRUE) { /* Reset chunk template information */ fm->mchunk_tmpl=NULL; @@ -2500,7 +2432,7 @@ H5D_destroy_chunk_map(const fm_map *fm) if(fm->mchunk_tmpl) if(H5S_close(fm->mchunk_tmpl)<0) HGOTO_ERROR(H5E_DATASPACE, H5E_CANTRELEASE, FAIL, "can't release memory chunk dataspace template") - + if(fm->select_chunk) H5MM_xfree(fm->select_chunk); done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5D_destroy_chunk_map() */ @@ -2519,7 +2451,7 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5D_create_chunk_file_map_hyper(const fm_map *fm) +H5D_create_chunk_file_map_hyper(fm_map *fm,const H5D_t *dset) { hssize_t ssel_points; /* Number of elements in file selection */ hsize_t sel_points; /* Number of elements in file selection */ @@ -2548,12 +2480,18 @@ H5D_create_chunk_file_map_hyper(const fm_map *fm) HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't get file selection bound info") /* Set initial chunk location & hyperslab size */ + for(u=0; u<fm->f_ndims; u++) { start_coords[u]=(sel_start[u]/fm->layout->u.chunk.dim[u])*fm->layout->u.chunk.dim[u]; coords[u]=start_coords[u]; end[u]=(coords[u]+fm->chunk_dim[u])-1; } /* end for */ + if(IS_H5FD_MPI(dset->oloc.file)) { + if(NULL == (fm->select_chunk = (hbool_t *) H5MM_calloc(fm->total_chunks*sizeof(hbool_t)))) + HGOTO_ERROR (H5E_RESOURCE, H5E_NOSPACE, FAIL, "can't allocate chunk info") + } + /* Calculate the index of this chunk */ if(H5V_chunk_index(fm->f_ndims,coords,fm->layout->u.chunk.dim,fm->down_chunks,&chunk_index)<0) HGOTO_ERROR (H5E_DATASPACE, H5E_BADRANGE, FAIL, "can't get chunk index") @@ -2608,6 +2546,10 @@ H5D_create_chunk_file_map_hyper(const fm_map *fm) /* Set the chunk index */ new_chunk_info->index=chunk_index; + /* store chunk selection information */ + if(IS_H5FD_MPI(dset->oloc.file)) + fm->select_chunk[chunk_index] = TRUE; + /* Set the file chunk dataspace */ new_chunk_info->fspace=tmp_fchunk; @@ -3140,96 +3082,6 @@ done: #ifdef H5_HAVE_PARALLEL /*------------------------------------------------------------------------- - * Function: H5D_ioinfo_make_ind - * - * Purpose: Switch to MPI independent I/O - * - * Return: Non-negative on success/Negative on failure - * - * Programmer: Quincey Koziol - * Friday, August 12, 2005 - * - *------------------------------------------------------------------------- - */ -static herr_t -H5D_ioinfo_make_ind(H5D_io_info_t *io_info) -{ - H5P_genplist_t *dx_plist; /* Data transer property list */ - herr_t ret_value = SUCCEED; /*return value */ - - FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_ind) - - /* Get the dataset transfer property list */ - if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list") - - /* Change the xfer_mode to independent, handle the request, - * then set xfer_mode before return. - */ - io_info->dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT; - if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") - - /* Set the pointers to the non-MPI-specific routines */ - io_info->ops.read = H5D_select_read; - io_info->ops.write = H5D_select_write; - - /* Indicate that the transfer mode should be restored before returning - * to user. - */ - io_info->xfer_mode_changed=TRUE; - -done: - FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D_ioinfo_make_ind() */ - - -/*------------------------------------------------------------------------- - * Function: H5D_ioinfo_make_coll - * - * Purpose: Switch to MPI collective I/O - * - * Return: Non-negative on success/Negative on failure - * - * Programmer: Quincey Koziol - * Friday, August 12, 2005 - * - *------------------------------------------------------------------------- - */ -static herr_t -H5D_ioinfo_make_coll(H5D_io_info_t *io_info) -{ - H5P_genplist_t *dx_plist; /* Data transer property list */ - herr_t ret_value = SUCCEED; /*return value */ - - FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_coll) - - /* Get the dataset transfer property list */ - if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list") - - /* Change the xfer_mode to independent, handle the request, - * then set xfer_mode before return. - */ - io_info->dxpl_cache->xfer_mode = H5FD_MPIO_COLLECTIVE; - if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") - - /* Set the pointers to the MPI-specific routines */ - io_info->ops.read = H5D_mpio_select_read; - io_info->ops.write = H5D_mpio_select_write; - - /* Indicate that the transfer mode should _NOT_ be restored before returning - * to user. - */ - io_info->xfer_mode_changed=FALSE; - -done: - FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D_ioinfo_make_coll() */ - - -/*------------------------------------------------------------------------- * Function: H5D_ioinfo_term * * Purpose: Common logic for terminating an I/O info object @@ -3267,38 +3119,4 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5D_ioinfo_term() */ - -/*------------------------------------------------------------------------- - * Function: H5D_mpio_get_min_chunk - * - * Purpose: Routine for obtaining minimum number of chunks to cover - * hyperslab selection selected by all processors. - * - * Return: Non-negative on success/Negative on failure - * - * Programmer: - * - *------------------------------------------------------------------------- - */ -static herr_t -H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info, - const fm_map *fm, int *min_chunkf) -{ - int num_chunkf; /* Number of chunks to iterate over */ - int mpi_code; /* MPI return code */ - herr_t ret_value = SUCCEED; - - FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_min_chunk); - - /* Get the number of chunks to perform I/O on */ - num_chunkf = H5SL_count(fm->fsel); - - /* Determine the minimum # of chunks for all processes */ - if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, min_chunkf, 1, MPI_INT, MPI_MIN, io_info->comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) - -done: - FUNC_LEAVE_NOAPI(ret_value); -} /* end H5D_mpio_get_min_chunk() */ -#endif /*H5_HAVE_PARALLEL*/ - +#endif diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c index 9d8821a..2bec70d 100644 --- a/src/H5Dmpio.c +++ b/src/H5Dmpio.c @@ -26,19 +26,23 @@ /****************/ #define H5D_PACKAGE /*suppress error about including H5Dpkg */ +/*#define KENT */ /***********/ /* Headers */ /***********/ #include "H5private.h" /* Generic Functions */ +#include "H5Iprivate.h" #include "H5Dpkg.h" /* Datasets */ #include "H5Eprivate.h" /* Error handling */ #include "H5Fprivate.h" /* File access */ #include "H5FDprivate.h" /* File drivers */ +#include "H5MMprivate.h" #include "H5Oprivate.h" /* Object headers */ #include "H5Pprivate.h" /* Property lists */ #include "H5Sprivate.h" /* Dataspaces */ +#include "H5Vprivate.h" /* Vector */ #ifdef H5_HAVE_PARALLEL @@ -46,20 +50,109 @@ /* Local Macros */ /****************/ +/* Macros to represent different IO options */ +#define H5D_ONE_LINK_CHUNK_IO 0 +#define H5D_MULTI_CHUNK_IO 1 + +/***** Macros for One linked collective IO case. *****/ +/* The default value to do one linked collective IO for all chunks. + If the average number of chunks per process is greater than this value, + the library will create an MPI derived datatype to link all chunks to do collective IO. + The user can set this value through an API. */ +#define H5D_ONE_LINK_CHUNK_IO_THRESHOLD 0 /* always set this option with value 0*/ + +/* Macros to represent options on how to obtain chunk address for one linked-chunk IO case */ +#define H5D_OBTAIN_ONE_CHUNK_ADDR_IND 0 +#define H5D_OBTAIN_ALL_CHUNK_ADDR_IND 1 +#define H5D_OBTAIN_ALL_CHUNK_ADDR_COL 2 + +/* Macros to define the default ratio of obtaining all chunk addresses for one linked-chunk IO case */ +#define H5D_ALL_CHUNK_ADDR_THRES_IND 10 +#define H5D_ALL_CHUNK_ADDR_THRES_COL 20 + +/***** Macros for multi-chunk collective IO case. *****/ +/* The default value of the threshold to do collective IO for this chunk. + If the average number of processes per chunk is greater than the default value, + collective IO is done for this chunk. +*/ +#define H5D_MULTI_CHUNK_IO_COL_THRESHOLD 50 + +/* Macros to represent different IO modes(NONE, Independent or collective)for multiple chunk IO case */ +#define H5D_CHUNK_IO_MODE_IND 0 +#define H5D_CHUNK_IO_MODE_COL 1 + +/* Macros to represent the regularity of the selection for multiple chunk IO case. */ +#define H5D_CHUNK_SELECT_REG 1 +#define H5D_CHUNK_SELECT_IRREG 2 +#define H5D_CHUNK_SELECT_NONE 0 + /******************/ /* Local Typedefs */ /******************/ +/* Combine chunk address and chunk info into a struct for better performance. */ +typedef struct H5D_chunk_addr_info_t { + haddr_t chunk_addr; + H5D_chunk_info_t chunk_info; +} H5D_chunk_addr_info_t; + +/* Combine all information that needs to know for collective MPI-IO of this selection. */ +typedef struct H5D_common_coll_info_t { + hbool_t mbt_is_derived; + hbool_t mft_is_derived; + size_t mpi_buf_count; + haddr_t chunk_addr; +} H5D_common_coll_info_t; + /********************/ /* Local Prototypes */ /********************/ -/* For regular hyperslab selection. */ +static herr_t +H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, + hbool_t do_write); + +static herr_t +H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, + hbool_t do_write,int sum_chunk); + +static herr_t +H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space, + const H5S_t *mem_space,haddr_t addr, + void *buf, hbool_t do_write ); + +static herr_t +H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type, + MPI_Datatype *mpi_buf_type, + H5D_common_coll_info_t* coll_info, + void *buf, hbool_t do_write); +#ifdef OLD_WAY static herr_t -H5D_mpio_spaces_xfer(H5D_io_info_t *io_info, size_t elmt_size, - const H5S_t *file_space, const H5S_t *mem_space, - void *buf/*out*/, - hbool_t do_write); +H5D_pre_sort_chunk(H5D_io_info_t *io_info,int total_chunks, + haddr_t total_chunk_addr_array[]); +#endif + +static herr_t +H5D_sort_chunk(H5D_io_info_t * io_info, + fm_map *fm, + H5D_chunk_addr_info_t chunk_addr_info_array[], + int many_chunk_opt); + +static herr_t +H5D_obtain_mpio_mode(H5D_io_info_t* io_info, + fm_map *fm, + uint8_t assign_io_mode[], + haddr_t chunk_addr[]); + +static herr_t H5D_ioinfo_make_ind(H5D_io_info_t *io_info); +static herr_t H5D_ioinfo_make_coll(H5D_io_info_t *io_info); +static herr_t H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info, + const fm_map *fm, int *min_chunkf); +static int H5D_cmp_chunk_addr(const void *addr1, const void *addr2); +static herr_t +H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info, + const fm_map *fm, int *sum_chunkf); + /*********************/ /* Package Variables */ @@ -104,7 +197,7 @@ H5D_mpio_opt_possible( const H5D_io_info_t *io_info, if (io_info->dxpl_cache->xfer_mode==H5FD_MPIO_INDEPENDENT) HGOTO_DONE(FALSE); - /* Optimized MPI types flag must be set and it is must be collective IO */ + /* Optimized MPI types flag must be set and it must be collective IO */ /* (Don't allow parallel I/O for the MPI-posix driver, since it doesn't do real collective I/O) */ if (!(H5S_mpi_opt_types_g && io_info->dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE && !IS_H5FD_MPIPOSIX(io_info->dset->oloc.file))) { local_opinion = FALSE; @@ -131,14 +224,14 @@ H5D_mpio_opt_possible( const H5D_io_info_t *io_info, goto broadcast; } /* end if */ - /*The handling of memory space is different for chunking + /* The handling of memory space is different for chunking and contiguous storage, For contigous storage, mem_space and file_space won't change when it it is doing disk IO. For chunking storage, mem_space will change for different chunks. So for chunking storage, whether we can use - collective IO will defer until the each chunk IO is reached. - For contiguous storage, if we find the MPI-IO cannot + collective IO will defer until each chunk IO is reached. + For contiguous storage, if we find MPI-IO cannot support complicated MPI derived data type, we will set use_par_opt_io = FALSE. */ @@ -184,124 +277,53 @@ done: /*------------------------------------------------------------------------- - * Function: H5D_mpio_spaces_xfer + * Function: H5D_mpio_chunk_adjust_iomode * - * Purpose: Use MPI-IO to transfer data efficiently - * directly between app buffer and file. - * - * Return: non-negative on success, negative on failure. + * Purpose: Checks if it is possible to do collective IO * - * Programmer: rky 980813 - * - * Notes: - * For collective data transfer only since this would eventually call - * H5FD_mpio_setup to do setup to eveually call MPI_File_set_view in - * H5FD_mpio_read or H5FD_mpio_write. MPI_File_set_view is a collective - * call. Letting independent data transfer use this route would result in - * hanging. + * Return: Success: Non-negative: TRUE or FALSE + * Failure: Negative * - * The preconditions for calling this routine are located in the - * H5S_mpio_opt_possible() routine, which determines whether this routine - * can be called for a given dataset transfer. + * Programmer: Muqun Yang + * Monday, Feb. 13th, 2006 * *------------------------------------------------------------------------- */ -static herr_t -H5D_mpio_spaces_xfer(H5D_io_info_t *io_info, size_t elmt_size, - const H5S_t *file_space, const H5S_t *mem_space, - void *_buf /*out*/, hbool_t do_write ) -{ - haddr_t addr; /* Address of dataset (or selection) within file */ - size_t mpi_buf_count, mpi_file_count; /* Number of "objects" to transfer */ - hsize_t mpi_buf_offset, mpi_file_offset; /* Offset within dataset where selection (ie. MPI type) begins */ - MPI_Datatype mpi_buf_type, mpi_file_type; /* MPI types for buffer (memory) and file */ - hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */ - mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */ - hbool_t plist_is_setup=0; /* Whether the dxpl has been customized */ - uint8_t *buf=(uint8_t *)_buf; /* Alias for pointer arithmetic */ - int mpi_code; /* MPI return code */ - herr_t ret_value = SUCCEED; /* Return value */ - - FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_spaces_xfer); +herr_t +H5D_mpio_chunk_adjust_iomode(H5D_io_info_t *io_info, const fm_map *fm) { - /* Check args */ - assert (io_info); - assert (io_info->dset); - assert (file_space); - assert (mem_space); - assert (buf); - assert (IS_H5FD_MPIO(io_info->dset->oloc.file)); - /* Make certain we have the correct type of property list */ - assert(TRUE==H5P_isa_class(io_info->dxpl_id,H5P_DATASET_XFER)); + int min_chunk; + herr_t ret_value = SUCCEED; - /* create the MPI buffer type */ - if (H5S_mpio_space_type( mem_space, elmt_size, - /* out: */ - &mpi_buf_type, - &mpi_buf_count, - &mpi_buf_offset, - &mbt_is_derived )<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buf type"); - - /* create the MPI file type */ - if ( H5S_mpio_space_type( file_space, elmt_size, - /* out: */ - &mpi_file_type, - &mpi_file_count, - &mpi_file_offset, - &mft_is_derived )<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type"); - - /* Get the base address of the contiguous dataset or the chunk */ - if(io_info->dset->shared->layout.type == H5D_CONTIGUOUS) - addr = H5D_contig_get_addr(io_info->dset) + mpi_file_offset; - else { - haddr_t chunk_addr; /* for collective chunk IO */ + FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_chunk_adjust_iomode) - assert(io_info->dset->shared->layout.type == H5D_CHUNKED); - chunk_addr=H5D_istore_get_addr(io_info,NULL); - addr = H5F_BASE_ADDR(io_info->dset->oloc.file) + chunk_addr + mpi_file_offset; - } +#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS + if(H5D_mpio_get_min_chunk(io_info,fm,&min_chunk)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes"); + if(min_chunk == 0) { + H5P_genplist_t *dx_plist; /* Data transer property list */ - /* - * Pass buf type, file type to the file driver. Request an MPI type - * transfer (instead of an elementary byteblock transfer). - */ - if(H5FD_mpi_setup_collective(io_info->dxpl_id, mpi_buf_type, mpi_file_type)<0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties"); - plist_is_setup=1; - - /* Adjust the buffer pointer to the beginning of the selection */ - buf+=mpi_buf_offset; - - /* transfer the data */ - if (do_write) { - if (H5F_block_write(io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) <0) - HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,"MPI write failed"); - } else { - if (H5F_block_read (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) <0) - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL,"MPI read failed"); - } + /* Get the dataset transfer property list */ + if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset creation property list") -done: - /* Reset the dxpl settings */ - if(plist_is_setup) { - if(H5FD_mpi_teardown_collective(io_info->dxpl_id)<0) - HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "unable to reset dxpl values"); - } /* end if */ + /* Change the xfer_mode to independent for handling the I/O */ + io_info->dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT; + if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") - /* free the MPI buf and file types */ - if (mbt_is_derived) { - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &mpi_buf_type ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - } - if (mft_is_derived) { - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &mpi_file_type ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - } + /* Indicate that the transfer mode should be restored before returning + * to user. + */ + io_info->xfer_mode_changed = TRUE; + io_info->ops.read = H5D_select_read; + io_info->ops.write = H5D_select_write; + } /* end if */ +#endif +done: + FUNC_LEAVE_NOAPI(ret_value) +} - FUNC_LEAVE_NOAPI(ret_value); -} /* end H5D_mpio_spaces_xfer() */ /*------------------------------------------------------------------------- @@ -317,17 +339,20 @@ done: */ herr_t H5D_mpio_select_read(H5D_io_info_t *io_info, - size_t UNUSED nelmts, size_t elmt_size, - const H5S_t *file_space, const H5S_t *mem_space, - void *buf/*out*/) + size_t mpi_buf_count, + size_t elmt_size, + const H5S_t UNUSED *file_space, + const H5S_t UNUSED *mem_space, + haddr_t addr, + void *buf/*out*/) { - herr_t ret_value; + herr_t ret_value = SUCCEED; - FUNC_ENTER_NOAPI_NOFUNC(H5D_mpio_select_read); - - ret_value = H5D_mpio_spaces_xfer(io_info, elmt_size, file_space, - mem_space, buf, 0/*read*/); + FUNC_ENTER_NOAPI(H5D_mpio_select_read,FAIL); + if(H5F_block_read (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) < 0) + HGOTO_ERROR(H5E_IO,H5E_READERROR,FAIL,"can't finish collective parallel read"); +done: FUNC_LEAVE_NOAPI(ret_value); } /* end H5D_mpio_select_read() */ @@ -345,19 +370,1382 @@ H5D_mpio_select_read(H5D_io_info_t *io_info, */ herr_t H5D_mpio_select_write(H5D_io_info_t *io_info, - size_t UNUSED nelmts, size_t elmt_size, - const H5S_t *file_space, const H5S_t *mem_space, - const void *buf) + size_t mpi_buf_count, + size_t elmt_size, + const H5S_t UNUSED *file_space, + const H5S_t UNUSED *mem_space, + haddr_t addr, + const void *buf) { herr_t ret_value; - FUNC_ENTER_NOAPI_NOFUNC(H5D_mpio_select_write); + FUNC_ENTER_NOAPI(H5D_mpio_select_write,FAIL); /*OKAY: CAST DISCARDS CONST QUALIFIER*/ - ret_value = H5D_mpio_spaces_xfer(io_info, elmt_size, file_space, - mem_space, (void*)buf, 1/*write*/); + if(H5F_block_write (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf)<0) + HGOTO_ERROR(H5E_IO,H5E_WRITEERROR,FAIL,"can't finish collective parallel write"); +done: + FUNC_LEAVE_NOAPI(ret_value); +} /* end H5D_mpio_select_write() */ + +/*------------------------------------------------------------------------- + * Function: H5D_ioinfo_make_ind + * + * Purpose: Switch to MPI independent I/O + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Quincey Koziol + * Friday, August 12, 2005 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_ioinfo_make_ind(H5D_io_info_t *io_info) +{ + H5P_genplist_t *dx_plist; /* Data transer property list */ + herr_t ret_value = SUCCEED; /*return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_ind) + + /* Get the dataset transfer property list */ + if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list") + + /* Change the xfer_mode to independent, handle the request, + * then set xfer_mode before return. + */ + io_info->dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT; + if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") + + /* Set the pointers to the non-MPI-specific routines */ + io_info->ops.read = H5D_select_read; + io_info->ops.write = H5D_select_write; + + /* Indicate that the transfer mode should be restored before returning + * to user. + */ + io_info->xfer_mode_changed=TRUE; + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_ioinfo_make_ind() */ + + +/*------------------------------------------------------------------------- + * Function: H5D_ioinfo_make_coll + * + * Purpose: Switch to MPI collective I/O + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Quincey Koziol + * Friday, August 12, 2005 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_ioinfo_make_coll(H5D_io_info_t *io_info) +{ + H5P_genplist_t *dx_plist; /* Data transer property list */ + herr_t ret_value = SUCCEED; /*return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_coll) + + /* Get the dataset transfer property list */ + if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list") + + /* Change the xfer_mode to independent, handle the request, + * then set xfer_mode before return. + */ + io_info->dxpl_cache->xfer_mode = H5FD_MPIO_COLLECTIVE; + if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") + + /* Set the pointers to the MPI-specific routines */ + io_info->ops.read = H5D_mpio_select_read; + io_info->ops.write = H5D_mpio_select_write; + + /* Indicate that the transfer mode should _NOT_ be restored before returning + * to user. + */ + io_info->xfer_mode_changed=FALSE; + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_ioinfo_make_coll() */ + + + + +/*------------------------------------------------------------------------- + * Function: H5D_mpio_get_min_chunk + * + * Purpose: Routine for obtaining minimum number of chunks to cover + * hyperslab selection selected by all processors. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info, + const fm_map *fm, int *min_chunkf) +{ + int num_chunkf; /* Number of chunks to iterate over */ + int mpi_code; /* MPI return code */ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_min_chunk); + + /* Get the number of chunks to perform I/O on */ + num_chunkf = H5SL_count(fm->fsel); + + /* Determine the minimum # of chunks for all processes */ + if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, min_chunkf, 1, MPI_INT, MPI_MIN, io_info->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) + +done: FUNC_LEAVE_NOAPI(ret_value); -} /* end H5D_mpio_spaces_write() */ +} /* end H5D_mpio_get_min_chunk() */ +/*------------------------------------------------------------------------- + * Function: H5D_mpio_get_sum_chunk + * + * Purpose: Routine for obtaining total number of chunks to cover + * hyperslab selection selected by all processors. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info, + const fm_map *fm, int *sum_chunkf) +{ + int num_chunkf; /* Number of chunks to iterate over */ + size_t ori_num_chunkf; + int mpi_code; /* MPI return code */ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_sum_chunk); + + /* Get the number of chunks to perform I/O on */ + num_chunkf = 0; + ori_num_chunkf = H5SL_count(fm->fsel); + H5_ASSIGN_OVERFLOW(num_chunkf,ori_num_chunkf,size_t,int); + + /* Determine the minimum # of chunks for all processes */ + if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, sum_chunkf, 1, MPI_INT, MPI_SUM, io_info->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) + +done: + FUNC_LEAVE_NOAPI(ret_value); +} /* end H5D_mpio_get_sum_chunk() */ + + +/*------------------------------------------------------------------------- + * Function: H5D_contig_collective_io + * + * Purpose: Routine for + * 1) building up MPI derived datatype + * 2) setting up collective IO property list + * 3) Do IO + * + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +herr_t +H5D_contig_collective_io(H5D_io_info_t *io_info, + const H5S_t *file_space, + const H5S_t *mem_space, + void *buf, + hbool_t do_write) +{ + + + haddr_t addr = HADDR_UNDEF; /* Address of dataset (or selection) within file */ + hbool_t select_valid; + herr_t ret_value = SUCCEED; /* return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_contig_collective_io) + assert (IS_H5FD_MPIO(io_info->dset->oloc.file)); + + /* Make certain we have the correct type of property list */ + assert(TRUE==H5P_isa_class(io_info->dxpl_id,H5P_DATASET_XFER)); + + /* Get the base address of the contiguous dataset */ + if(io_info->dset->shared->layout.type == H5D_CONTIGUOUS) + addr = H5D_contig_get_addr(io_info->dset); + + if(H5D_inter_collective_io(io_info,file_space,mem_space,addr,buf,do_write)<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); + + done: + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_contig_collective_io */ + +/*------------------------------------------------------------------------- + * Function: H5D_chunk_collective_io + * + * Purpose: Routine for + 1) choose an IO option: + a) One collective IO defined by one MPI derived datatype to link through all chunks + or b) multiple chunk IOs,to do MPI-IO for each chunk, the IO mode may be adjusted + due to the selection pattern for each chunk. + * For option a) + 1. Sort the chunk address, obtain chunk info according to the sorted chunk address + 2. Build up MPI derived datatype for each chunk + 3. Build up the final MPI derived datatype + 4. Set up collective IO property list + 5. Do IO + * For option b) + 1. Use MPI_gather and MPI_Bcast to obtain information of *collective/independent/none* + IO mode for each chunk of the data space + 2. Depending on whether the IO mode is collective or independent or none, + Create either MPI derived datatype for each chunk to do collective IO or just do independent IO + 3. Set up collective IO property list for collective mode + 4. DO IO + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +herr_t +H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, hbool_t do_write) +{ + + int io_option = H5D_MULTI_CHUNK_IO; + int min_chunk,sum_chunk,mpi_size; + int one_link_chunk_io_threshold; + herr_t ret_value = SUCCEED; + + + FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_collective_io) + + assert (IS_H5FD_MPIO(io_info->dset->oloc.file)); + + if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes"); + if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0) + HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size"); + one_link_chunk_io_threshold = H5D_ONE_LINK_CHUNK_IO_THRESHOLD;/*This should be replaced by the user inputting value from API. */ + + /* step 1: choose an IO option */ + /* If the average number of chunk per process is greater than a threshold, we will do one link chunked IO. */ + if(sum_chunk/mpi_size >= one_link_chunk_io_threshold) io_option = H5D_ONE_LINK_CHUNK_IO; + +/* If this MPI-IO package doesn't support collective IO when no IO is done for one or more processes, + use MULTIPLE CHUNK IO */ +/* +#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS + if(H5D_mpio_get_min_chunk(io_info,fm,&min_chunk)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes"); + if(min_chunk == 0) io_option = H5D_MULTI_CHUNK_IO; +#endif +*/ + +#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + if(io_option == H5D_ONE_LINK_CHUNK_IO ) io_option = H5D_MULTI_CHUNK_IO ;/* We can not do this with one chunk IO. */ +#endif + + /* step 2: Go ahead to do IO.*/ + if(io_option == H5D_ONE_LINK_CHUNK_IO) { + if(H5D_link_chunk_collective_io(io_info,fm,buf,do_write,sum_chunk)<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish linked chunk MPI-IO"); + } + else {/* multiple chunk IOs */ + printf("coming into multiple chunk \n"); + if(H5D_multi_chunk_collective_io(io_info,fm,buf,do_write)<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish multiple chunk MPI-IO"); + } + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_chunk_collective_io */ + +/*------------------------------------------------------------------------- + * Function: H5D_link_chunk_collective_io + * + * Purpose: Routine for one collective IO with one MPI derived datatype to link with all chunks + + 1. Sort the chunk address and chunk info + 2. Build up MPI derived datatype for each chunk + 3. Build up the final MPI derived datatype + 4. Use common collective IO routine to do MPI-IO + + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +static herr_t +H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, hbool_t do_write,int sum_chunk) +{ + + + size_t src_type_size; /*size of source type */ + size_t dst_type_size; /*size of destination type*/ + hsize_t mpi_buf_extra_offset; + hsize_t mpi_file_extra_offset; + size_t mpi_buf_count; + size_t mpi_file_count; + hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */ + mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */ + + int mpi_size,mpi_code; /* MPI return code */ + + int i,num_chunk,total_chunks; + size_t ori_num_chunk; + hsize_t ori_total_chunks; + haddr_t chunk_base_addr; + haddr_t* total_chunk_addr_array; + MPI_Datatype *chunk_mtype; + MPI_Datatype *chunk_ftype; + MPI_Datatype chunk_final_mtype; + MPI_Datatype chunk_final_ftype; + MPI_Aint *chunk_disp_array; + MPI_Aint *chunk_mem_disp_array; + int *blocklen; + int blocklen_value; + int actual_bsearch_coll_chunk_threshold; + int bsearch_coll_chunk_threshold; + int bsearch_chunk_ratio; + int bsearch_chunk_threshold; + int many_chunk_opt = 0; + + H5D_common_coll_info_t coll_info; + H5D_chunk_addr_info_t* chunk_addr_info_array; + +#ifdef CC_PERF + char *bc_percent = NULL; + char *bcc_percent = NULL; +#endif + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_link_chunk_collective_io) + ori_total_chunks = fm->total_chunks; + H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int); + + /* Handle with a special case when only one chunk is covered by all processes */ + if(total_chunks == 1){ + H5SL_node_t *chunk_node; + H5D_chunk_info_t *chunk_info; + H5D_storage_t store; + + chunk_node = H5SL_first(fm->fsel); + if(chunk_node == NULL) { + if(H5D_istore_chunkmap(io_info,total_chunks,&chunk_base_addr,fm->down_chunks)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + if(H5D_inter_collective_io(io_info,NULL,NULL,chunk_base_addr,buf,do_write)<0) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); + } + else { + if(NULL ==(chunk_info = H5SL_item(chunk_node))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + io_info->store = &store; + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + + if(HADDR_UNDEF==(chunk_base_addr = H5D_istore_get_addr(io_info,NULL))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + +#ifdef KENT +printf("before inter_collective_io for total chunk = 1 \n"); +#endif + if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,chunk_base_addr,buf,do_write)<0) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); + } + goto done; + } + + /* Allocate chunking information */ + + ori_num_chunk = H5SL_count(fm->fsel); + H5_ASSIGN_OVERFLOW(num_chunk,ori_num_chunk,size_t,int); + + + if(num_chunk == 0) total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks); + else + { + chunk_addr_info_array= H5MM_malloc(num_chunk*sizeof(H5D_chunk_addr_info_t)); + chunk_mtype = H5MM_malloc(num_chunk*sizeof(MPI_Datatype)); + chunk_ftype = H5MM_malloc(num_chunk*sizeof(MPI_Datatype)); + chunk_disp_array = H5MM_malloc(num_chunk*sizeof(MPI_Aint)); + chunk_mem_disp_array = H5MM_calloc(num_chunk*sizeof(MPI_Aint)); + blocklen = H5MM_malloc(num_chunk*sizeof(int)); + } + + /* Obtain information to do collective IO, + in order to do collective IO, no datatype conversion should happen. */ + if((src_type_size = H5T_get_size(io_info->dset->shared->type))==0) + HGOTO_ERROR(H5E_DATATYPE, H5E_BADSIZE, FAIL, "datatype size invalid"); + dst_type_size = src_type_size; + + +#ifdef CC_PERF + /* "bcc" means 'b-tree iterately obtain all chunk addresses collectively', + "bc" means 'b-tree iterately obtain all chunk addresses individually', + the default one means 'obtaining the chunk address individually', + */ + + if(bcc_percent=getenv("BCC_PERCENT")){ + bsearch_coll_chunk_threshold = atoi(bcc_percent); + assert((bsearch_coll_chunk_threshold >=0) &&(bsearch_coll_chunk_threshold <=100)); + } + else + bsearch_coll_chunk_threshold = H5D_ALL_CHUNK_ADDR_THRES_COL; +#else + bsearch_coll_chunk_threshold = H5D_ALL_CHUNK_ADDR_THRES_COL; /*This number may be changed according to the performance study */ +#endif + + if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0) + HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size"); + + /* Calculate the actual threshold to obtain all chunk addresses collectively + The bigger this number is, the more possible the use of obtaining chunk address collectively. */ + actual_bsearch_coll_chunk_threshold = sum_chunk*100/(total_chunks*mpi_size); + + if(actual_bsearch_coll_chunk_threshold >= bsearch_coll_chunk_threshold) + many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_COL; + + else { + +#ifdef CC_PERF + if(bc_percent=getenv("BC_PERCENT")){ + bsearch_chunk_ratio = atoi(bc_percent); + assert((bsearch_chunk_ratio<=100)&&(bsearch_chunk_ratio>=0)); + } + else + bsearch_chunk_ratio = H5D_ALL_CHUNK_ADDR_THRES_IND; +#else + bsearch_chunk_ratio = H5D_ALL_CHUNK_ADDR_THRES_IND; /*This number may be changed according to the performance study */ +#endif + + /* This threshold is to check whether we can use iterator to obtain all chunk addresses. + The unit of the threshold is the number of chunks. The value should be at least 1. + It can be calculated as follows: + + if(total_chunks*bsearch_chunk_ratio/100 <=1) + bsearch_chunk_threahold = 1; + else + bsearch_chunk_threshold = total_chunks*bsearch_chunk_ratio/100; + In order to make the caluculation more efficient, + we use the following approximate formula to calculate the threshold. + + bsearch_chunk_threshold = 1+ (total_chunks*bsearch_chunk_ratio-99)/100; + + The only difference is when total_chunks* besearch_chunk_ratio == 100n+99; + the approximate formula will give value (n+1) instead of n for threshold. + That shouldn't matter much from our persective. + */ + + bsearch_chunk_threshold = 1 +(total_chunks*bsearch_chunk_ratio-99)/100; + if(num_chunk > bsearch_chunk_threshold) many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_IND; + } + + /* Sort the chunk address + when chunk optimization selection is either H5D_OBTAIN_*/ + if(num_chunk == 0){ + if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + chunk_base_addr = total_chunk_addr_array[0]; + } + + else { + if(H5D_sort_chunk(io_info,fm,chunk_addr_info_array,many_chunk_opt)<0) + HGOTO_ERROR (H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to sort chunk address"); + chunk_base_addr = chunk_addr_info_array[0].chunk_addr; + } + + /* Obtain MPI derived datatype from all individual chunks */ + for ( i = 0; i < num_chunk; i++) { + /* Disk MPI derived datatype */ + if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.fspace,src_type_size,&chunk_ftype[i], + &mpi_file_count,&mpi_file_extra_offset,&mft_is_derived)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type"); + + /* Buffer MPI derived datatype */ + if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.mspace,dst_type_size,&chunk_mtype[i], + &mpi_buf_count,&mpi_buf_extra_offset,&mbt_is_derived)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buf type"); + + /* Chunk address relative to the first chunk */ + chunk_addr_info_array[i].chunk_addr -= chunk_base_addr; + H5_ASSIGN_OVERFLOW(chunk_disp_array[i],chunk_addr_info_array[i].chunk_addr,haddr_t,MPI_Aint); + /*chunk_disp_array[i] = (MPI_Aint)chunk_addr_array[i];*/ + } + + blocklen_value = 1; + if(num_chunk){ + + /* initialize the buffer with the constant value 1 */ + H5V_array_fill(blocklen,&blocklen_value,sizeof(int),(size_t)num_chunk); + + /* Create final MPI derived datatype */ + if(MPI_SUCCESS != (mpi_code = MPI_Type_struct(num_chunk,blocklen,chunk_disp_array,chunk_ftype,&chunk_final_ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code); + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + if(MPI_SUCCESS != (mpi_code = MPI_Type_struct(num_chunk,blocklen,chunk_mem_disp_array,chunk_mtype,&chunk_final_mtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code); + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_mtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + for ( i = 0; i< num_chunk;i++){ + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( chunk_mtype+i ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( chunk_ftype+i ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + } + + /* buffer, file derived datatypes should be true */ + coll_info.mbt_is_derived = 1; + coll_info.mft_is_derived = 1; + coll_info.mpi_buf_count = 1; + coll_info.chunk_addr = chunk_base_addr; + + } + + else {/* no selection at all for this process */ + chunk_final_ftype = MPI_BYTE; + chunk_final_mtype = MPI_BYTE; + + /* buffer, file derived datatypes should be true */ + coll_info.mbt_is_derived = 0; + coll_info.mft_is_derived = 0; + coll_info.mpi_buf_count = 0; + coll_info.chunk_addr = chunk_base_addr; + } + if(H5D_final_collective_io(io_info,&chunk_final_ftype,&chunk_final_mtype,&coll_info,buf,do_write)<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish MPI-IO"); + + done: + if(num_chunk == 0) HDfree(total_chunk_addr_array); + else { + if (fm->total_chunks != 1) { + HDfree(chunk_addr_info_array); + HDfree(chunk_mtype); + HDfree(chunk_ftype); + HDfree(chunk_disp_array); + HDfree(chunk_mem_disp_array); + HDfree(blocklen); + } + } + + FUNC_LEAVE_NOAPI(ret_value) +} +/* end H5D_link_chunk_collective_io */ + + +/*------------------------------------------------------------------------- + * Function: H5D_multi_chunk_collective_io + * + * Purpose: To do IO per chunk according to IO mode(collective/independent/none) + + 1. Use MPI_gather and MPI_Bcast to obtain IO mode in each chunk(collective/independent/none) + 2. Depending on whether the IO mode is collective or independent or none, + Create either MPI derived datatype for each chunk or just do independent IO + 3. Use common collective IO routine to do MPI-IO + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, hbool_t do_write) +{ + + int i,total_chunk; + int mpi_rank; + hsize_t ori_total_chunk; + uint8_t *chunk_io_option; + + H5SL_node_t *chunk_node; /* Current node in chunk skip list */ + H5D_chunk_info_t *chunk_info; + haddr_t *chunk_addr; + H5D_storage_t store; /* union of EFL and chunk pointer in file space */ + hbool_t select_chunk; + hbool_t last_io_mode_coll = TRUE; + herr_t ret_value = SUCCEED; + + + FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io) +#ifdef KENT + mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file); +#endif + /* Allocate memories */ + ori_total_chunk = fm->total_chunks; + H5_ASSIGN_OVERFLOW(total_chunk,ori_total_chunk,hsize_t,int); + HDassert(total_chunk!=0); + chunk_io_option = (uint8_t *)H5MM_calloc(total_chunk*sizeof(MPI_BYTE)); + chunk_addr = (haddr_t *)H5MM_calloc(total_chunk*sizeof(haddr_t)); +#ifdef KENT + printf("total_chunk %d\n",total_chunk); +#endif + + /* obtain IO option for each chunk */ + if(H5D_obtain_mpio_mode(io_info,fm,chunk_io_option,chunk_addr)<0) + HGOTO_ERROR (H5E_DATASET, H5E_CANTRECV, FAIL, "unable to obtain MPIO mode"); + + for( i = 0; i<total_chunk;i++){ + +#ifdef KENT +printf("mpi_rank = %d, chunk index = %d\n",mpi_rank,i); +#endif + select_chunk = fm->select_chunk[i]; + if(select_chunk == 1){/* Have selection elements in this chunk. Find the chunk info. */ +#ifdef NEW_WAY + if(NULL ==(chunk_info = H5SL_item(chunk_node))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + HDassert(chunk_info->index == i); + + /* Set dataset storage for I/O info */ + io_info->store=&store; + /* Pass in chunk's coordinates in a union. */ + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + if(NULL ==(chunk_node = H5SL_first(fm->fsel))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list"); +#else + + if(NULL ==(chunk_node = H5SL_first(fm->fsel))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list"); + while(chunk_node){ + + if(NULL ==(chunk_info = H5SL_item(chunk_node))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + if(chunk_info->index == i) { + /* Set dataset storage for I/O info */ + io_info->store=&store; + /* Pass in chunk's coordinates in a union. */ + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + break; + } + + chunk_node = H5SL_next(chunk_node); + } +#endif + } + + if(chunk_io_option[i] == 1){ /*collective IO for this chunk, + note: even there is no selection for this process, + the process still needs to contribute MPI NONE TYPE.*/ +#ifdef KENT +printf("inside collective chunk IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i); +#endif + + if(!last_io_mode_coll) + /* Switch back to collective I/O */ + if(H5D_ioinfo_make_coll(io_info) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O") + + if(select_chunk){ + if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace, + chunk_addr[i],buf,do_write )<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); + + } + else{ + if(H5D_inter_collective_io(io_info,NULL,NULL, + chunk_addr[i],buf,do_write )<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); + + } + + + last_io_mode_coll = TRUE; + + } + else {/*possible independent IO for this chunk*/ +#ifdef KENT +printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i); +#endif + + HDassert(chunk_io_option[i] == 0); + if(!select_chunk) continue; /* this process has nothing to do with this chunk, continue! */ + if(last_io_mode_coll) + /* Switch to independent I/O */ + if(H5D_ioinfo_make_ind(io_info) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") + + if(do_write) { + ret_value = (io_info->ops.write)(io_info, + chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type), + chunk_info->fspace,chunk_info->mspace,0, + buf); + /* Check return value of the write */ + if (ret_value<0) + HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed") + } + else { + ret_value = (io_info->ops.read)(io_info, + chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type), + chunk_info->fspace,chunk_info->mspace,0, + buf); + /* Check return value from optimized write */ + if (ret_value<0) + HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed") + } + + last_io_mode_coll = FALSE; + } + } + + if(!last_io_mode_coll) + /* Switch back to collective I/O */ + if(H5D_ioinfo_make_coll(io_info) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O") + + done: + HDfree(chunk_io_option); + HDfree(chunk_addr); + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_multi_chunk_collective_io */ + +/*------------------------------------------------------------------------- + * Function: H5D_inter_collective_io + * + * Purpose: Routine for the shared part of collective IO between multiple chunk + collective IO and contiguous collective IO + + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,const H5S_t *mem_space, + haddr_t addr, void *buf, hbool_t do_write ) +{ + + size_t mpi_buf_count, mpi_file_count; /* Number of "objects" to transfer */ + MPI_Datatype mpi_file_type,mpi_buf_type; + hsize_t mpi_buf_offset, mpi_file_offset; /* Offset within dataset where selection (ie. MPI type) begins */ + hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */ + mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */ + H5D_common_coll_info_t coll_info; + herr_t ret_value = SUCCEED; /* return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_inter_collective_io) + if((file_space!=NULL) && (mem_space != NULL)) { + /*Obtain disk and memory MPI derived datatype */ + if(H5S_mpio_space_type(file_space,H5T_get_size(io_info->dset->shared->type), + &mpi_file_type,&mpi_file_count,&mpi_file_offset,&mft_is_derived)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type"); + + if(H5S_mpio_space_type(mem_space,H5T_get_size(io_info->dset->shared->type), + &mpi_buf_type,&mpi_buf_count,&mpi_buf_offset,&mbt_is_derived)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buffer type"); + + } + else { + /* For non-selection, participate with a none MPI derived datatype, the count is 0. */ + mpi_buf_type = MPI_BYTE; + mpi_file_type = MPI_BYTE; + mpi_file_count = 0; + mpi_buf_count = 0; + } + + coll_info.mbt_is_derived = mbt_is_derived; + coll_info.mft_is_derived = mft_is_derived; + coll_info.mpi_buf_count = mpi_buf_count; + coll_info.chunk_addr = addr; + +#ifdef KENT +printf("before final collective IO\n"); +#endif + if(H5D_final_collective_io(io_info,&mpi_file_type,&mpi_buf_type,&coll_info,buf,do_write)<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish collective MPI-IO"); + done: + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_inter_collective_io */ +/*------------------------------------------------------------------------- + * Function: H5D_final_collective_io + * + * Purpose: Routine for the common part of collective IO with different storages. + + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Datatype *mpi_buf_type, + H5D_common_coll_info_t* coll_info, void *buf, hbool_t do_write) +{ + + + int mpi_code; /* MPI return code */ + hbool_t plist_is_setup=0; /* Whether the dxpl has been customized */ + herr_t ret_value = SUCCEED; + + + FUNC_ENTER_NOAPI_NOINIT(H5D_final_collective_io) + + /* + * Pass buf type, file type to the file driver. + */ + + if(H5FD_mpi_setup_collective(io_info->dxpl_id, *mpi_buf_type, *mpi_file_type)<0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties"); + + plist_is_setup=1; + /*HDfprintf(stdout,"chunk addr %Hu\n",coll_info->chunk_addr); + printf("mpi_buf_count %d\n",coll_info->mpi_buf_count); */ + if(do_write) { + ret_value = (io_info->ops.write)(io_info, + coll_info->mpi_buf_count,0,NULL,NULL,coll_info->chunk_addr, + buf); + /* Check return value from optimized write */ + if (ret_value<0) + HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed") + } + else { + ret_value = (io_info->ops.read)(io_info, + coll_info->mpi_buf_count,0,NULL,NULL,coll_info->chunk_addr, + buf); + /* Check return value from optimized write */ + if (ret_value<0) + HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed") + } + done: + + /* Reset the dxpl settings */ + if(plist_is_setup) { + if(H5FD_mpi_teardown_collective(io_info->dxpl_id)<0) + HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "unable to reset dxpl values"); + } /* end if */ + + /* free the MPI buf and file types */ + if (coll_info->mbt_is_derived) { + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_buf_type ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + + } + if (coll_info->mft_is_derived) { + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_file_type ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + } + + FUNC_LEAVE_NOAPI(ret_value) +}/* end H5D_final_collective_io */ + +#ifdef OLD_WAY +/*------------------------------------------------------------------------- + * Function: H5D_pre_sort_chunk + * + * Purpose: Routine to obtain addresses of all chunks for all processes + + Description: + root will collective all chunk addresses and broadcast towards other processes. + + Parameters: + + Input: H5D_io_info_t* io_info, + int total_chunks, + Output: haddr_t total_chunk_addr_array[], : array to store sorted total chunk addresses + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_pre_sort_chunk(H5D_io_info_t *io_info,int total_chunks,haddr_t total_chunk_addr_array[]){ + + int root, mpi_rank; + MPI_Comm comm; + int mpi_code; + MPI_Datatype chunk_addrtype; + int mpi_type_cleanup = 0; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_pre_sort_chunk) + + root = 0; + comm = io_info->comm; + if(MPI_SUCCESS !=(mpi_code = MPI_Comm_rank(comm,&mpi_rank))) + HMPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + + /*Create received MPI derived datatype */ + if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t)*total_chunks,MPI_BYTE,&chunk_addrtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code); + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_addrtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + + mpi_type_cleanup = 1; + + if(mpi_rank == root) { + if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + + } + /* Broadcasting the MPI_IO option info. and chunk address info. */ + if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(total_chunk_addr_array,1,chunk_addrtype,root,comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code); + +done: + + if(mpi_type_cleanup){ + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + } + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_pre_sort_chunk() */ + +#endif +/*------------------------------------------------------------------------- + * Function: H5D_sort_chunk + * + * Purpose: Routine to sort chunks in increasing order of chunk address + Each chunk address is also obtained. + + Description: + For most cases, the chunk address has already been sorted in increasing order. + The special sorting flag is used to optimize this common case. + quick sort is used for necessary sorting. + + Parameters: + Input: H5D_io_info_t* io_info, + fm_map *fm(global chunk map struct) + Input/Output: H5D_chunk_addr_info_t chunk_addr_info_array[] : array to store chunk address and information + many_chunk_opt : flag to optimize the way to obtain chunk addresses + for many chunks + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +static herr_t +H5D_sort_chunk(H5D_io_info_t * io_info, + fm_map *fm, + H5D_chunk_addr_info_t chunk_addr_info_array[], + int many_chunk_opt) +{ + + + H5SL_node_t *chunk_node; /* Current node in chunk skip list */ + H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */ + haddr_t chunk_addr; /* Current chunking address of this node */ + haddr_t *total_chunk_addr_array; /* The array of chunk address for the total number of chunk */ + int i,j,k,mpi_code; + int total_chunks; + size_t num_chunks; + int mpi_type_cleanup = 0; + int tchunk_addr_cleanup = 0; + MPI_Datatype chunk_addrtype; + H5D_storage_t store; /*union of EFL and chunk pointer in file space */ + hbool_t do_sort = FALSE; + herr_t ret_value = SUCCEED; /*return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_sort_chunk) + + num_chunks = H5SL_count(fm->fsel); + /* If we need to optimize the way to obtain the chunk address */ + if(many_chunk_opt != H5D_OBTAIN_ONE_CHUNK_ADDR_IND){ + + total_chunks = (int)fm->total_chunks; + total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks); + tchunk_addr_cleanup = 1; + + if(many_chunk_opt == H5D_OBTAIN_ALL_CHUNK_ADDR_COL) {/* We will broadcast the array from the root process */ + + int mpi_rank, root,mpi_code; + root = 0; + if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file))<0) + HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank"); + + /*Create received MPI derived datatype */ + if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t)*total_chunks,MPI_BYTE,&chunk_addrtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code); + if(MPI_SUCCESS !=(mpi_code = MPI_Type_commit(&chunk_addrtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + mpi_type_cleanup = 1; + + if(mpi_rank == root) { + if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + } + /* Broadcasting the MPI_IO option info. and chunk address info. */ + if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(total_chunk_addr_array,1,chunk_addrtype,root,io_info->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code); + } + + else { /* Obtain all chunk addresses independently */ + if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + } + } + + /* Get first node in chunk skip list */ + if(NULL ==(chunk_node = H5SL_first(fm->fsel))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list"); + /* Set dataset storage for I/O info */ + io_info->store = &store; + if(NULL ==(chunk_info = H5SL_item(chunk_node))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + i = 0; + if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){ + if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + } + else + chunk_addr = total_chunk_addr_array[chunk_info->index]; + chunk_addr_info_array[i].chunk_addr = chunk_addr; + chunk_addr_info_array[i].chunk_info = *chunk_info; + + chunk_node = H5SL_next(chunk_node); + while(chunk_node) { + + chunk_info = H5SL_item(chunk_node); + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + + if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){ + if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + } + else + chunk_addr = total_chunk_addr_array[chunk_info->index]; + + if(chunk_addr < chunk_addr_info_array[i].chunk_addr) do_sort = TRUE; + chunk_addr_info_array[i+1].chunk_addr = chunk_addr; + chunk_addr_info_array[i+1].chunk_info =*chunk_info; + i++; + chunk_node = H5SL_next(chunk_node); + } + + if(do_sort) + HDqsort(chunk_addr_info_array,num_chunks,sizeof(chunk_addr_info_array),H5D_cmp_chunk_addr); + +done: + + if(tchunk_addr_cleanup) + HDfree(total_chunk_addr_array); + if(mpi_type_cleanup) { + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + } + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_sort_chunk() */ + + +/*------------------------------------------------------------------------- + * Function: H5D_obtain_mpio_mode + * + * Purpose: Routine to obtain each io mode(collective,independent or none) for each chunk; + Each chunk address is also obtained. + + Description: + + 1) Each process provides two piece of information for all chunks with selection + a) chunk index + b) wheather this chunk is regular(for MPI derived datatype not working case) + + 2) Gather all the information to the root process + + 3) Root process will do the following: + a) Obtain chunk address for all chunks in this data space + b) With the consideration of the user option, calculate IO mode for each chunk + c) Build MPI derived datatype to combine "chunk address" and "assign_io" information + in order to do MPI Bcast only once + d) MPI Bcast the IO mode and chunk address information for each chunk. + 4) Each process then retrieves IO mode and chunk address information to assign_io_mode and chunk_addr. + + Parameters: + + Input: H5D_io_info_t* io_info, + fm_map *fm,(global chunk map struct) + Output: uint8_t assign_io_mode[], : IO mode, collective, independent or none + haddr_t chunk_addr[], : chunk address array for each chunk + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +static herr_t +H5D_obtain_mpio_mode(H5D_io_info_t* io_info, + fm_map *fm, + uint8_t assign_io_mode[], + haddr_t chunk_addr[]) +{ + + int total_chunks; + hsize_t ori_total_chunks; + int percent_nproc_per_chunk,threshold_nproc_per_chunk; + uint8_t* io_mode_info; + uint8_t* recv_io_mode_info; + uint8_t* mergebuf; + uint8_t* tempbuf; + + H5SL_node_t* chunk_node; + H5D_chunk_info_t* chunk_info; + + MPI_Datatype bastype[2]; + MPI_Datatype chunk_addrtype; + int bascount; + int basblock[2]; + MPI_Aint basdisp[2]; + MPI_Datatype rtype; + MPI_Datatype stype; + int mpi_size,mpi_rank; + MPI_Comm comm; + int root; + int mpi_code; + int multi_chunk_io_col_threshold; + int mem_cleanup = 0, + mpi_type_cleanup = 0; + + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_obtain_mpio_mode) + + /* Assign the rank 0 to the root */ + root = 0; + comm = io_info->comm; + + /* Obtain the number of process and the current rank of the process */ + if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file))<0) + HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank"); + if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0) + HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size"); + multi_chunk_io_col_threshold = H5D_MULTI_CHUNK_IO_COL_THRESHOLD; /* May replace by user-input */ + percent_nproc_per_chunk = multi_chunk_io_col_threshold;/* For example, above 50%, do collective IO */ + threshold_nproc_per_chunk = mpi_size * percent_nproc_per_chunk/100; + + /* Allocate memory */ + ori_total_chunks = fm->total_chunks; + H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int); + + io_mode_info = (uint8_t *)H5MM_calloc(total_chunks*sizeof(MPI_BYTE)); + mergebuf = H5MM_malloc((sizeof(haddr_t)+sizeof(MPI_BYTE))*total_chunks); + tempbuf = mergebuf + sizeof(MPI_BYTE)*total_chunks; + if(mpi_rank == root) + recv_io_mode_info = (uint8_t *)H5MM_malloc(total_chunks*sizeof(MPI_BYTE)*mpi_size); + + + mem_cleanup = 1; + + chunk_node = H5SL_first(fm->fsel); + + /*Obtain the regularity and selection information for all chunks in this process. */ + while(chunk_node){ + + chunk_info = H5SL_item(chunk_node); + +#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + /* regularity information: 1, selection information: 2 */ + if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE && + H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE) +#endif + io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_REG; /* this chunk is selected and is "regular" without defining H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS. */ +#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + else + io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_IRREG; /* this chunk is selected and is irregular*/ +#endif + + chunk_node = H5SL_next(chunk_node); + } + + /*Create sent MPI derived datatype */ + if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(total_chunks,MPI_BYTE,&stype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&stype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + /*Create received basic MPI derived datatype */ + bascount = 2; + basblock[0] = total_chunks; + basblock[1] = total_chunks; + basdisp[0] = 0; + basdisp[1] = (MPI_Aint)(sizeof(MPI_BYTE)*total_chunks);/* may need to check overflow */ + bastype[0] = MPI_BYTE; + + if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t),MPI_BYTE,&chunk_addrtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code); + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_addrtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + bastype[1] = chunk_addrtype; + + if(MPI_SUCCESS !=(mpi_code = MPI_Type_struct(bascount,basblock,basdisp,bastype,&rtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code); + if(MPI_SUCCESS !=(mpi_code = MPI_Type_commit(&rtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + /* Set up a flag to clean up the MPI derived datatype later */ + mpi_type_cleanup = 1; + + /*Gather all the information */ + if(MPI_SUCCESS !=(mpi_code = MPI_Gather(io_mode_info,1,stype,recv_io_mode_info,1,stype,root,comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code); + + /* Calculate the mode for IO(collective, independent or none) at root process */ + if(mpi_rank == root) { + + int ic,nproc; + int* nproc_per_chunk; +#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) + int* ind_this_chunk; +#endif + + /* pre-computing: calculate number of processes and + regularity of the selection occupied in each chunk */ + nproc_per_chunk = (int*)H5MM_calloc(total_chunks*sizeof(int)); +#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) + ind_this_chunk = (int*)H5MM_calloc(total_chunks*sizeof(int)); +#endif + + /* calculating the chunk address */ + if(H5D_istore_chunkmap(io_info,total_chunks,chunk_addr,fm->down_chunks)<0){ + HDfree(nproc_per_chunk); +#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) + HDfree(ind_this_chunk); +#endif + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + } + + /* checking for number of process per chunk and regularity of the selection*/ + for (nproc = 0;nproc <mpi_size;nproc++){ + uint8_t *tmp_recv_io_mode_info = recv_io_mode_info + nproc*total_chunks; + /* calculate the number of process per chunk and adding irregular selection option */ + for(ic = 0; ic < total_chunks; ic++, tmp_recv_io_mode_info++){ + if(*tmp_recv_io_mode_info != 0) { + nproc_per_chunk[ic]++; +#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + if(*tmp_recv_io_mode_info == H5D_CHUNK_SELECT_IRREG) + ind_this_chunk[ic] = 1; +#endif + } +#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS + else { + /*checking whether we have a selection in this chunk */ + ind_this_chunk[ic] = 1; + } +#endif + } + + } + + /* Calculating MPIO mode for each chunk (collective, independent, none) */ + for(ic = 0; ic < total_chunks; ic++){ + if(nproc_per_chunk[ic]>=MAX(2,threshold_nproc_per_chunk)){ +#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) + if(!ind_this_chunk[ic]) assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL; +#else + assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL; +#endif + } + + } + + /* merge buffer io_mode info and chunk addr into one */ + HDmemcpy(mergebuf,assign_io_mode,sizeof(MPI_BYTE)*total_chunks); + HDmemcpy(tempbuf,chunk_addr,sizeof(haddr_t)*total_chunks); + + HDfree(nproc_per_chunk); +#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) + HDfree(ind_this_chunk); +#endif + } + + /* Broadcasting the MPI_IO option info. and chunk address info. */ + if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(mergebuf,1,rtype,root,comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code); + + HDmemcpy(assign_io_mode,mergebuf,sizeof(MPI_BYTE)*total_chunks); + HDmemcpy(chunk_addr,tempbuf,sizeof(haddr_t)*total_chunks); + +done: + + if(mpi_type_cleanup) { + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &stype ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &rtype ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + } + + if(mem_cleanup){ + HDfree(io_mode_info); + HDfree(mergebuf); + if(mpi_rank == root) + HDfree(recv_io_mode_info); + } + + FUNC_LEAVE_NOAPI(ret_value) + }/* end H5D_obtain_mpio_mode*/ + +static int +H5D_cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2) +{ + haddr_t addr1, addr2; + + FUNC_ENTER_NOAPI_NOINIT(H5D_cmp_chunk_addr) + + addr1 = ((const H5D_chunk_addr_info_t *)chunk_addr_info1)->chunk_addr; + addr2 = ((const H5D_chunk_addr_info_t *)chunk_addr_info2)->chunk_addr; + + FUNC_LEAVE_NOAPI(H5F_addr_cmp(addr1, addr2)) + + } #endif /* H5_HAVE_PARALLEL */ diff --git a/src/H5Dpkg.h b/src/H5Dpkg.h index 97e7763..df30800 100644 --- a/src/H5Dpkg.h +++ b/src/H5Dpkg.h @@ -34,7 +34,8 @@ #include "H5Gprivate.h" /* Groups */ #include "H5Oprivate.h" /* Object headers */ #include "H5Sprivate.h" /* Dataspaces */ -#include "H5Tprivate.h" /* Datatypes */ +#include "H5Tprivate.h" /* Datatype functions */ +#include "H5SLprivate.h" /* Skip lists */ /**************************/ /* Package Private Macros */ @@ -67,14 +68,14 @@ struct H5D_io_info_t; typedef herr_t (*H5D_io_read_func_t)(struct H5D_io_info_t *io_info, size_t nelmts, size_t elmt_size, - const H5S_t *file_space, const H5S_t *mem_space, + const H5S_t *file_space, const H5S_t *mem_space,haddr_t addr, void *buf/*out*/); /* Write directly from app buffer to file */ typedef herr_t (*H5D_io_write_func_t)(struct H5D_io_info_t *io_info, size_t nelmts, size_t elmt_size, - const H5S_t *file_space, const H5S_t *mem_space, + const H5S_t *file_space, const H5S_t *mem_space,haddr_t addr, const void *buf); /* Function pointers for I/O on particular types of dataset layouts */ @@ -187,6 +188,39 @@ typedef enum { H5D_ALLOC_WRITE /* Dataset is being extended */ } H5D_time_alloc_t; + +/* Structure holding information about a chunk's selection for mapping */ +typedef struct H5D_chunk_info_t { + hsize_t index; /* "Index" of chunk in dataset */ + size_t chunk_points; /* Number of elements selected in chunk */ + H5S_t *fspace; /* Dataspace describing chunk & selection in it */ + hsize_t coords[H5O_LAYOUT_NDIMS]; /* Coordinates of chunk in file dataset's dataspace */ + H5S_t *mspace; /* Dataspace describing selection in memory corresponding to this chunk */ + unsigned mspace_shared; /* Indicate that the memory space for a chunk is shared and shouldn't be freed */ +} H5D_chunk_info_t; + +/* Main structure holding the mapping between file chunks and memory */ +typedef struct fm_map { + H5SL_t *fsel; /* Skip list containing file dataspaces for all chunks */ + hsize_t last_index; /* Index of last chunk operated on */ + H5D_chunk_info_t *last_chunk_info; /* Pointer to last chunk's info */ + const H5S_t *file_space; /* Pointer to the file dataspace */ + const H5S_t *mem_space; /* Pointer to the memory dataspace */ + unsigned mem_space_copy; /* Flag to indicate that the memory dataspace must be copied */ + hsize_t f_dims[H5O_LAYOUT_NDIMS]; /* File dataspace dimensions */ + H5S_t *mchunk_tmpl; /* Dataspace template for new memory chunks */ + unsigned f_ndims; /* Number of dimensions for file dataspace */ + H5S_sel_iter_t mem_iter; /* Iterator for elements in memory selection */ + unsigned m_ndims; /* Number of dimensions for memory dataspace */ + hsize_t chunks[H5O_LAYOUT_NDIMS]; /* Number of chunks in each dimension */ + hsize_t chunk_dim[H5O_LAYOUT_NDIMS]; /* Size of chunk in each dimension */ + hsize_t down_chunks[H5O_LAYOUT_NDIMS]; /* "down" size of number of chunks in each dimension */ + H5O_layout_t *layout; /* Dataset layout information*/ + H5S_sel_type msel_type; /* Selection type in memory */ + hsize_t total_chunks; /* Number of total chunks */ + hbool_t *select_chunk; /* store the information about whether this chunk is selected or not */ +} fm_map; + /*****************************/ /* Package Private Variables */ /*****************************/ @@ -215,10 +249,12 @@ H5_DLL size_t H5D_select_mgath (const void *_buf, H5_DLL herr_t H5D_select_read(H5D_io_info_t *io_info, size_t nelmts, size_t elmt_size, const H5S_t *file_space, const H5S_t *mem_space, + haddr_t addr, void *buf/*out*/); H5_DLL herr_t H5D_select_write(H5D_io_info_t *io_info, size_t nelmts, size_t elmt_size, const H5S_t *file_space, const H5S_t *mem_space, + haddr_t addr, const void *buf/*out*/); /* Functions that operate on contiguous storage */ @@ -296,42 +332,31 @@ H5_DLL ssize_t H5D_efl_writevv(const H5D_io_info_t *io_info, H5_DLL herr_t H5D_mpio_select_read(H5D_io_info_t *io_info, size_t nelmts, size_t elmt_size, const struct H5S_t *file_space, const struct H5S_t *mem_space, - void *buf/*out*/); + haddr_t addr,void *buf/*out*/); /* MPI-IO function to read , it will select either regular or irregular read */ H5_DLL herr_t H5D_mpio_select_write(H5D_io_info_t *io_info, size_t nelmts, size_t elmt_size, const struct H5S_t *file_space, const struct H5S_t *mem_space, - const void *buf); - -/* MPI-IO function to read directly from app buffer to file rky980813 */ -H5_DLL herr_t H5D_mpio_spaces_read(H5D_io_info_t *io_info, - size_t nelmts, size_t elmt_size, - const struct H5S_t *file_space, const struct H5S_t *mem_space, - void *buf/*out*/); - -/* MPI-IO function to write directly from app buffer to file rky980813 */ -H5_DLL herr_t H5D_mpio_spaces_write(H5D_io_info_t *io_info, - size_t nelmts, size_t elmt_size, - const struct H5S_t *file_space, const struct H5S_t *mem_space, - const void *buf); - -/* MPI-IO function to read directly from app buffer to file rky980813 */ -H5_DLL herr_t H5D_mpio_spaces_span_read(H5D_io_info_t *io_info, - size_t nelmts, size_t elmt_size, - const struct H5S_t *file_space, const struct H5S_t *mem_space, - void *buf/*out*/); - -/* MPI-IO function to write directly from app buffer to file rky980813 */ -H5_DLL herr_t H5D_mpio_spaces_span_write(H5D_io_info_t *io_info, - size_t nelmts, size_t elmt_size, - const struct H5S_t *file_space, const struct H5S_t *mem_space, - const void *buf); - + haddr_t addr,const void *buf); + +/* MPI-IO function to handle contiguous collective IO */ +H5_DLL herr_t +H5D_contig_collective_io(H5D_io_info_t *io_info, + const H5S_t *file_space,const H5S_t *mem_space, + void *_buf,hbool_t do_write); + +/* MPI-IO function to handle chunked collective IO */ +H5_DLL herr_t +H5D_chunk_collective_io(H5D_io_info_t * io_info,fm_map *fm, void*buf, + hbool_t do_write); /* MPI-IO function to check if a direct I/O transfer is possible between * memory and the file */ H5_DLL htri_t H5D_mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *mem_space, const H5S_t *file_space, const H5T_path_t *tpath); + +H5_DLL herr_t H5D_mpio_chunk_adjust_iomode(H5D_io_info_t *io_info,const fm_map *fm); + #endif /* H5_HAVE_PARALLEL */ /* Testing functions */ diff --git a/src/H5Dselect.c b/src/H5Dselect.c index 9f98d9e..df9d32f 100644 --- a/src/H5Dselect.c +++ b/src/H5Dselect.c @@ -432,6 +432,7 @@ herr_t H5D_select_read(H5D_io_info_t *io_info, size_t nelmts, size_t elmt_size, const H5S_t *file_space, const H5S_t *mem_space, + const haddr_t UNUSED addr, void *buf/*out*/) { H5S_sel_iter_t mem_iter; /* Memory selection iteration info */ @@ -575,6 +576,7 @@ herr_t H5D_select_write(H5D_io_info_t *io_info, size_t nelmts, size_t elmt_size, const H5S_t *file_space, const H5S_t *mem_space, + const haddr_t UNUSED addr, const void *buf/*out*/) { H5S_sel_iter_t mem_iter; /* Memory selection iteration info */ |