author    | MuQun Yang <ymuqun@hdfgroup.org> | 2004-07-20 21:35:41 (GMT)
committer | MuQun Yang <ymuqun@hdfgroup.org> | 2004-07-20 21:35:41 (GMT)
commit    | 8933c596946b9ba88fb2ca66a3f46f1f1d209aed (patch)
tree      | 84e3d9a39e134160a92b648c0b29b7696c8603ca /src/H5Smpio.c
parent    | a86779f3f79905ce923d4689c1fb614969370031 (diff)
[svn-r8906] Purpose:
Add the first round of patches supporting collective chunk I/O in HDF5.
Description:
The current HDF5 library does not support collective MPI-IO with chunked storage. When users select the collective option for a data transfer on a chunked dataset, the library silently converts it to independent I/O, which carries a tremendous performance penalty. Some applications, such as the WRF parallel HDF5 I/O module, have had to use contiguous storage for this reason. However, chunked storage has its own advantages (it supports compression filters and extensible datasets), so making collective MPI-IO work with chunked storage inside HDF5 is an important task.
This check-in makes collective chunk I/O possible for some special cases. Collective chunk I/O is used when either of the following conditions holds (a usage sketch follows the list):
1. For each process, the hyperslab selection in the file dataspace of each dataset is regular and fits entirely within one chunk.
2. For each process, the hyperslab selection in the file dataspace of each dataset is a single block, and the number of chunks covered by that selection is the same on every process.
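For illustration, here is a minimal sketch (not part of this check-in; the helper name, element type, and surrounding setup are hypothetical) of how an application requests collective I/O on a chunked dataset whose selection satisfies one of the two conditions above:

```c
#include "hdf5.h"

/* Hypothetical helper: the dataset, dataspaces, and hyperslab selection are
 * assumed to be created elsewhere on a file opened with the MPI-IO driver. */
static herr_t
write_chunked_collectively(hid_t dset, hid_t mem_space, hid_t file_space,
                           const double *buf)
{
    hid_t  dxpl = H5Pcreate(H5P_DATASET_XFER);
    herr_t status;

    /* Request collective MPI-IO for this transfer.  Before this patch the
     * request was silently downgraded to independent I/O for chunked
     * datasets; with this patch it is honored in the two cases above. */
    H5Pset_dxpl_mpio(dxpl, H5FD_MPIO_COLLECTIVE);

    status = H5Dwrite(dset, H5T_NATIVE_DOUBLE, mem_space, file_space,
                      dxpl, buf);

    H5Pclose(dxpl);
    return status;
}
```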
Solution:
Lift the contiguous-storage requirement for collective I/O.
Use H5D_istore_get_addr to obtain the address of the selected chunk. The existing library routines then take that address and ensure the MPI file type is built correctly for collective I/O (sketched below).
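In condensed form (paraphrased from the H5S_mpio_spaces_xfer hunk in the patch below; it uses HDF5-internal types and is not a standalone unit), the base-address selection becomes:

```c
/* Condensed from the patch below: pick the base file address depending on
 * the dataset layout; store->chunk.offset identifies the chunk selected
 * by this process. */
if (dset->layout.type == H5D_CONTIGUOUS) {
    /* Contiguous storage: dataset base address, as before */
    addr = H5D_contig_get_addr(dset) + mpi_file_offset;
}
else {
    /* Chunked storage: look up the file address of the selected chunk */
    assert(dset->layout.type == H5D_CHUNKED);
    chunk_addr = H5D_istore_get_addr(f, dxpl_id, &(dset->layout),
                                     store->chunk.offset, NULL);
    addr = f->shared->base_addr + chunk_addr + mpi_file_offset;
}
```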
Platforms tested:
arabica(sol), copper(AIX), eirene(Linux)
The parallel tests were run on copper.
Misc. update:
Diffstat (limited to 'src/H5Smpio.c')
-rw-r--r-- | src/H5Smpio.c | 177 |
1 file changed, 165 insertions, 12 deletions
```diff
diff --git a/src/H5Smpio.c b/src/H5Smpio.c
index 0207080..5701b9c 100644
--- a/src/H5Smpio.c
+++ b/src/H5Smpio.c
@@ -23,6 +23,7 @@
 #define H5F_PACKAGE     /*suppress error about including H5Fpkg */
 #define H5S_PACKAGE     /*suppress error about including H5Spkg */
+#define H5D_PACKAGE
 
 /* Pablo information */
 /* (Put before include files to avoid problems with inline functions) */
@@ -31,11 +32,15 @@
 #include "H5private.h"      /* Generic Functions */
 #include "H5Eprivate.h"     /* Error handling */
 #include "H5Fpkg.h"         /* Files */
+#include "H5Dpkg.h"
 #include "H5FDprivate.h"    /* File drivers */
 #include "H5Iprivate.h"     /* IDs */
 #include "H5Pprivate.h"     /* Property lists */
 #include "H5Spkg.h"         /* Dataspaces */
+#include "H5Oprivate.h"
+#include "H5Dprivate.h"
+
 #ifdef H5_HAVE_PARALLEL
 
 static herr_t
@@ -69,7 +74,9 @@ H5S_mpio_space_type( const H5S_t *space, size_t elmt_size,
 static herr_t
 H5S_mpio_spaces_xfer(H5F_t *f, const H5D_t *dset, size_t elmt_size,
                      const H5S_t *file_space, const H5S_t *mem_space,
-                     hid_t dxpl_id, void *buf/*out*/, hbool_t do_write);
+                     hid_t dxpl_id, void *buf/*out*/,
+                     const H5D_storage_t *store,
+                     hbool_t do_write);
 
 /*-------------------------------------------------------------------------
@@ -628,7 +635,8 @@ done:
 static herr_t
 H5S_mpio_spaces_xfer(H5F_t *f, const H5D_t *dset, size_t elmt_size,
                      const H5S_t *file_space, const H5S_t *mem_space,
-                     hid_t dxpl_id, void *_buf /*out*/,
+                     hid_t dxpl_id, void *_buf /*out*/,
+                     const H5D_storage_t *store,
                      hbool_t do_write )
 {
     haddr_t     addr;                   /* Address of dataset (or selection) within file */
@@ -642,6 +650,9 @@ H5S_mpio_spaces_xfer(H5F_t *f, const H5D_t *dset, size_t elmt_size,
     int         mpi_code;               /* MPI return code */
     herr_t      ret_value = SUCCEED;    /* Return value */
 
+    haddr_t     chunk_addr;             /* for collective chunk IO */
+
+
     FUNC_ENTER_NOAPI_NOINIT(H5S_mpio_spaces_xfer);
 
     /* Check args */
@@ -672,7 +683,17 @@ H5S_mpio_spaces_xfer(H5F_t *f, const H5D_t *dset, size_t elmt_size,
                            &mft_is_derived )<0)
        HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type");
 
-    addr = H5D_contig_get_addr(dset) + mpi_file_offset;
+
+/* Adding comments for chunk collective IO */
+    if(dset->layout.type == H5D_CONTIGUOUS) {
+       addr = H5D_contig_get_addr(dset) + mpi_file_offset;
+    }
+    else {
+       assert(dset->layout.type == H5D_CHUNKED);
+       chunk_addr=H5D_istore_get_addr(f,dxpl_id,&(dset->layout),store->chunk.offset,NULL);
+       addr = f->shared->base_addr + chunk_addr + mpi_file_offset;
+    }
+
 #ifdef H5Smpi_DEBUG
     HDfprintf(stderr, "spaces_xfer: relative addr=%a\n", addr );
 #endif
@@ -740,7 +761,7 @@ done:
  */
 herr_t
 H5S_mpio_spaces_read(H5F_t *f, const H5D_dxpl_cache_t UNUSED *dxpl_cache, hid_t dxpl_id,
-    H5D_t *dset, const H5D_storage_t UNUSED *store,
+    H5D_t *dset, const H5D_storage_t *store,
     size_t UNUSED nelmts, size_t elmt_size,
     const H5S_t *file_space, const H5S_t *mem_space,
     void *buf/*out*/)
@@ -750,7 +771,7 @@ H5S_mpio_spaces_read(H5F_t *f, const H5D_dxpl_cache_t UNUSED *dxpl_cache, hid_t
     FUNC_ENTER_NOAPI_NOFUNC(H5S_mpio_spaces_read);
 
     ret_value = H5S_mpio_spaces_xfer(f, dset, elmt_size, file_space,
-                                     mem_space, dxpl_id, buf, 0/*read*/);
+                                     mem_space, dxpl_id, buf,store, 0/*read*/);
 
     FUNC_LEAVE_NOAPI(ret_value);
 } /* end H5S_mpio_spaces_read() */
@@ -778,7 +799,7 @@ H5S_mpio_spaces_read(H5F_t *f, const H5D_dxpl_cache_t UNUSED *dxpl_cache, hid_t
  */
 herr_t
 H5S_mpio_spaces_write(H5F_t *f, const H5D_dxpl_cache_t UNUSED *dxpl_cache, hid_t dxpl_id,
-    H5D_t *dset, const H5D_storage_t UNUSED *store,
+    H5D_t *dset, const H5D_storage_t *store,
     size_t UNUSED nelmts, size_t elmt_size,
     const H5S_t *file_space, const H5S_t *mem_space,
     const void *buf)
@@ -789,7 +810,7 @@ H5S_mpio_spaces_write(H5F_t *f, const H5D_dxpl_cache_t UNUSED *dxpl_cache, hid_t
     /*OKAY: CAST DISCARDS CONST QUALIFIER*/
     ret_value = H5S_mpio_spaces_xfer(f, dset, elmt_size, file_space,
-                                     mem_space, dxpl_id, (void*)buf, 1/*write*/);
+                                     mem_space, dxpl_id, (void*)buf, store,1/*write*/);
 
     FUNC_LEAVE_NOAPI(ret_value);
 } /* end H5S_mpio_spaces_write() */
@@ -812,10 +833,21 @@ H5S_mpio_spaces_write(H5F_t *f, const H5D_dxpl_cache_t UNUSED *dxpl_cache, hid_t
 *-------------------------------------------------------------------------
 */
 htri_t
-H5S_mpio_opt_possible( const H5S_t *mem_space, const H5S_t *file_space, const unsigned flags)
+H5S_mpio_opt_possible( const H5F_t *file, const H5S_t *mem_space, const H5S_t *file_space, const unsigned flags,const H5O_layout_t *layout)
 {
     htri_t c1,c2;               /* Flags whether a selection is optimizable */
     htri_t ret_value=TRUE;
+    hsize_t chunk_dim[H5S_MAX_RANK+1];
+    hssize_t startf[H5S_MAX_RANK],endf[H5S_MAX_RANK],startm[H5S_MAX_RANK],endm[H5S_MAX_RANK];
+    int fnum_chunk[H5S_MAX_RANK],mnum_chunk[H5S_MAX_RANK];
+    int rank,i,dim_rankm,dim_rankf;
+    int pcheck_hyper,check_hyper,check_num_chunkm,check_num_chunkf;
+    int tnum_chunkf,manum_chunkf,minum_chunkf;
+    int tnum_chunkm,manum_chunkm,minum_chunkm;
+    H5S_sel_type fsel_type,msel_type;
+    MPI_Comm comm;
+
+
     FUNC_ENTER_NOAPI(H5S_mpio_opt_possible, FAIL);
@@ -823,11 +855,34 @@ H5S_mpio_opt_possible( const H5S_t *mem_space, const H5S_t *file_space, const un
     assert(mem_space);
     assert(file_space);
 
+    /* Parallel I/O conversion flag must be set, if it is not collective IO, go to false. */
+    if(!(flags&H5S_CONV_PAR_IO_POSSIBLE))
+        HGOTO_DONE(FALSE);
+
+    /*getting MPI communicator and rank */
+
+    comm = H5F_mpi_get_comm(file);
+    rank = H5F_mpi_get_rank(file);
+
+#if 0
+    for (i =0;i<H5S_MAX_RANK;i++){
+        chunk_dim[i]  = 1;
+        startf[i]     = 1;
+        endf[i]       = 1;
+        startm[i]     = 1;
+        endm[i]       = 1;
+        fnum_chunk[i] = 1;
+        mnum_chunk[i] = 1;
+    }
+#endif
+
     /* Check whether these are both simple or scalar dataspaces */
     if (!((H5S_SIMPLE==H5S_GET_EXTENT_TYPE(mem_space) || H5S_SCALAR==H5S_GET_EXTENT_TYPE(mem_space))
             && (H5S_SIMPLE==H5S_GET_EXTENT_TYPE(file_space) || H5S_SCALAR==H5S_GET_EXTENT_TYPE(file_space))))
         HGOTO_DONE(FALSE);
+
+
     /* Check whether both selections are "regular" */
     c1=H5S_SELECT_IS_REGULAR(file_space);
     c2=H5S_SELECT_IS_REGULAR(mem_space);
@@ -840,14 +895,112 @@ H5S_mpio_opt_possible( const H5S_t *mem_space, const H5S_t *file_space, const un
     if (H5S_SEL_POINTS==H5S_GET_SELECT_TYPE(mem_space) || H5S_SEL_POINTS==H5S_GET_SELECT_TYPE(file_space))
         HGOTO_DONE(FALSE);
 
-    /* Dataset storage must be contiguous currently */
-    if ((flags&H5S_CONV_STORAGE_MASK)!=H5S_CONV_STORAGE_CONTIGUOUS)
+
+/* Dataset storage must be contiguous or special chunk storage */
+    /* KMY Adding conditions for chunk storage */
+    if ((flags&H5S_CONV_STORAGE_MASK)!=H5S_CONV_STORAGE_CONTIGUOUS &&
+        (flags&H5S_CONV_STORAGE_MASK)!=H5S_CONV_STORAGE_CHUNKED)
        HGOTO_DONE(FALSE);
 
-    /* Parallel I/O conversion flag must be set */
-    if(!(flags&H5S_CONV_PAR_IO_POSSIBLE))
+    if ((flags&H5S_CONV_STORAGE_MASK)==H5S_CONV_STORAGE_CHUNKED) {
+
+      /* Currently collective chunking storage
+         inside HDF5 is supported for either one of the following two cases:
+         1. All the hyperslabs for one process is inside one chunk.
+         2. For single hyperslab selection, the number of chunks that covered
+            the single selection for all processes should be equal.
+         KY, 2004/7/14
+      */
+
+      /* Quincey, please read.
+         This is maybe redundent, I think only when both memory and file space be SCALAR
+         space, the collective IO can work. Otherwise, SELECT_POINT will be reached,collective
+         IO shouldn't work.
+         Please clarify and correct the code on the following,
+         Quincey said that it was probably okay if only one data space is SCALAR,
+         Still keep the code here until we added more tests later.
+         Kent */
+      if(H5S_SCALAR==mem_space->extent.type || H5S_SCALAR ==file_space->extent.type) {
+          if(!(H5S_SCALAR==mem_space->extent.type && H5S_SCALAR ==file_space->extent.type)){
+              HGOTO_DONE(FALSE);
+          }
+          else{
+              HGOTO_DONE(TRUE);
+          }
+      }
+
+      dim_rankf = file_space->extent.rank;
+      fsel_type = file_space->select.type->type;
+
+      /* Assure that selection type of either data space is not H5S_SEL_NONE */
+/* Not necessary according to Quincey, commented out for the time being.
+      if(fsel_type == H5S_SEL_NONE || msel_type == H5S_SEL_NONE)
+          HGOTO_DONE(FALSE);
+*/
+
+      if(H5S_SELECT_BOUNDS(file_space,startf,endf)==FAIL)
+          HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE,FAIL, "invalid check for single selection blocks");
+
+
+      for(i = 0; i < layout->u.chunk.ndims;i++)
+          chunk_dim[i] = layout->u.chunk.dim[i];
+
+      /* Case 1: check whether all hyperslab in this process is inside one chunk.
+         Note: we don't handle when starting point is less than zero since that may cover
+         two chunks. */
+
+      /*for file space checking*/
+      pcheck_hyper = 1;
+      for (i=0; i<dim_rankf; i++){
+          if(endf[i]/chunk_dim[i]!=startf[i]/chunk_dim[i]) {
+              pcheck_hyper = 0;
+              break;
+          }
+      }
+
+
+      MPI_Reduce(&pcheck_hyper,&check_hyper,1,MPI_INT,MPI_LAND,0,comm);
+      MPI_Bcast(&check_hyper,1,MPI_INT,0,comm);
+
+      /*if check_hyper is true, condition for collective IO case is fulfilled, no
+        need to do further test. */
+      if(check_hyper) HGOTO_DONE(TRUE);
+
+      /* Case 2:Check whether the number of chunks that covered the single hyperslab is the same.
+         If not,no collective chunk IO. We need to check both file and memeory space
+         KY, 2004/7/14
+      */
+
+      c1 = H5S_SELECT_IS_SINGLE(file_space);
+      c2 = H5S_SELECT_IS_SINGLE(mem_space);
+
+      if(c1==FAIL || c2 ==FAIL)
+          HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE, FAIL, "invalid check for single selection blocks");
+      if(c1==FALSE || c2 ==FALSE) HGOTO_DONE(FALSE);
+      tnum_chunkf = 1;
+      for (i = 0; i<dim_rankf;i++){
+          fnum_chunk[i] = endf[i]/chunk_dim[i]-startf[i]/chunk_dim[i]+1;
+          tnum_chunkf = fnum_chunk[i]*tnum_chunkf;
+      }
+
+      MPI_Reduce(&tnum_chunkf,&manum_chunkf,1,MPI_INT,MPI_MAX,0,comm);
+      MPI_Reduce(&tnum_chunkf,&minum_chunkf,1,MPI_INT,MPI_MIN,0,comm);
+
+      if(rank == 0) {
+          if(manum_chunkf!=minum_chunkf)
+              check_num_chunkf = 0;
+          else
+              check_num_chunkf = 1;
+      }
+
+      MPI_Bcast(&check_num_chunkf,1,MPI_INT,0,comm);
+
+      if(!check_num_chunkf) HGOTO_DONE(FALSE);
+
+    }
+
 done:
     FUNC_LEAVE_NOAPI(ret_value);
 } /* H5S_mpio_opt_possible() */
```