From 8933c596946b9ba88fb2ca66a3f46f1f1d209aed Mon Sep 17 00:00:00 2001 From: MuQun Yang Date: Tue, 20 Jul 2004 16:35:41 -0500 Subject: [svn-r8906] Purpose: Adding the first round of patches about supporting collective chunk IO in HDF5 Description: The current HDF5 library doesn't support collective MPIO with chunk storage. When users set the collective option in their data transfer with chunk storage, the library silently converted the option to INDEPENDENT and that caused a tremendous performance penalty. Some applications like the WRF-parallel HDF5 IO module have to use contiguous storage for this reason. However, chunking storage has its own advantages (supporting compression filters and extensible datasets), so making collective MPIO possible inside HDF5 with chunking storage is a very important task. This check-in makes collective chunk IO possible for some special cases. The condition is as follows (either case is fine with using collective chunk IO): 1. for each process, the hyperslab selection of the file data space of each dataset is regular and it fits in one chunk. 2. for each process, the hyperslab selection of the file data space of each dataset is single and the number of chunks for the hyperslab selection should be equal. Solution: Lift the contiguous storage requirement for collective IO. Use H5D_istore_get_addr to get the corresponding chunk address. Then the original library routines will take care of getting the correct address to make sure that the MPI FILE TYPE is built correctly for collective IO. Platforms tested: arabica(sol), copper(AIX), eirene(Linux) parallel test is checked at copper. Misc. 
update: --- src/H5Dio.c | 4 +- src/H5Distore.c | 4 +- src/H5Dprivate.h | 7 +++ src/H5S.c | 23 ++++++-- src/H5Smpio.c | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++---- src/H5Spkg.h | 4 +- src/H5Sprivate.h | 4 +- 7 files changed, 196 insertions(+), 27 deletions(-) diff --git a/src/H5Dio.c b/src/H5Dio.c index fb07831..1c00c79 100644 --- a/src/H5Dio.c +++ b/src/H5Dio.c @@ -745,7 +745,7 @@ H5D_read(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space, } /* end switch */ /* Get dataspace functions */ - if (NULL==(sconv=H5S_find(mem_space, file_space, sconv_flags, &use_par_opt_io))) + if (NULL==(sconv=H5S_find(dataset->ent.file,mem_space, file_space, sconv_flags, &use_par_opt_io,&dataset->layout))) HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "unable to convert from file to memory data space") #ifdef H5_HAVE_PARALLEL @@ -961,7 +961,7 @@ H5D_write(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space, } /* end switch */ /* Get dataspace functions */ - if (NULL==(sconv=H5S_find(mem_space, file_space, sconv_flags, &use_par_opt_io))) + if (NULL==(sconv=H5S_find(dataset->ent.file,mem_space, file_space, sconv_flags, &use_par_opt_io,&dataset->layout))) HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "unable to convert from memory to file data space") #ifdef H5_HAVE_PARALLEL diff --git a/src/H5Distore.c b/src/H5Distore.c index 1720a09..da9776a 100644 --- a/src/H5Distore.c +++ b/src/H5Distore.c @@ -142,8 +142,6 @@ typedef struct H5D_istore_ud1_t { #define H5D_HASH(D,ADDR) H5F_addr_hash(ADDR,(D)->cache.chunk.nslots) /* Private prototypes */ -static haddr_t H5D_istore_get_addr(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, - const hssize_t offset[], H5D_istore_ud1_t *_udata); static void *H5D_istore_chunk_alloc(size_t size, const H5O_pline_t *pline); static void *H5D_istore_chunk_xfree(void *chk, const H5O_pline_t *pline); static herr_t H5D_istore_shared_free (void *page); @@ -2153,7 +2151,7 @@ done: * 
*------------------------------------------------------------------------- */ -static haddr_t +haddr_t H5D_istore_get_addr(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, const hssize_t offset[], H5D_istore_ud1_t *_udata) { diff --git a/src/H5Dprivate.h b/src/H5Dprivate.h index 42f5eb5..6ca207e 100644 --- a/src/H5Dprivate.h +++ b/src/H5Dprivate.h @@ -193,6 +193,8 @@ typedef struct H5D_dcpl_cache_t { H5D_fill_time_t fill_time; /* Fill time (H5D_CRT_FILL_TIME_NAME) */ } H5D_dcpl_cache_t; +/* forward reference for collective-chunk IO use */ +struct H5D_istore_ud1_t; /*define at H5Distore.c*/ /* Library-private functions defined in H5D package */ H5_DLL herr_t H5D_init(void); H5_DLL hid_t H5D_open(H5G_entry_t *ent, hid_t dxpl_id); @@ -263,4 +265,9 @@ H5_DLL ssize_t H5D_istore_writevv(H5F_t *f, const H5D_dxpl_cache_t *dxpl_cache, H5_DLL herr_t H5D_istore_debug(H5F_t *f, hid_t dxpl_id, haddr_t addr, FILE * stream, int indent, int fwidth, int ndims); +/* Functions that obtain the dataset address */ +H5_DLL haddr_t H5D_istore_get_addr(H5F_t *f, hid_t dxpl_id, + const H5O_layout_t *layout,const hssize_t offset[], + struct H5D_istore_ud1_t *_udata); + #endif diff --git a/src/H5S.c b/src/H5S.c index ebdbbfb..f6b95fb 100644 --- a/src/H5S.c +++ b/src/H5S.c @@ -1419,7 +1419,12 @@ done: *------------------------------------------------------------------------- */ H5S_conv_t * -H5S_find (const H5S_t *mem_space, const H5S_t *file_space, unsigned +H5S_find (const H5F_t +#ifndef H5_HAVE_PARALLEL +UNUSED +#endif/* H5_HAVE_PARALLEL*/ +*file, +const H5S_t *mem_space, const H5S_t *file_space, unsigned #ifndef H5_HAVE_PARALLEL UNUSED #endif /* H5_HAVE_PARALLEL */ @@ -1427,7 +1432,13 @@ flags, hbool_t #ifndef H5_HAVE_PARALLEL UNUSED #endif /* H5_HAVE_PARALLEL */ -*use_par_opt_io) +*use_par_opt_io, +#ifndef H5_HAVE_PARALLEL +UNUSED +#endif +const H5O_layout_t *layout + +) { H5S_conv_t *path=NULL; /* Space conversion path */ #ifdef H5_HAVE_PARALLEL @@ -1459,9 +1470,9 @@ UNUSED /* * 
Check if we can set direct MPI-IO read/write functions */ - opt=H5S_mpio_opt_possible(mem_space,file_space,flags); + opt=H5S_mpio_opt_possible(file,mem_space,file_space,flags,layout); if(opt==FAIL) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE, NULL, "invalid check for contiguous dataspace "); + HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE, NULL, "invalid check for direct IO dataspace "); /* Check if we can use the optimized parallel I/O routines */ if(opt==TRUE) { @@ -1501,9 +1512,9 @@ UNUSED /* * Check if we can set direct MPI-IO read/write functions */ - opt=H5S_mpio_opt_possible(mem_space,file_space,flags); + opt=H5S_mpio_opt_possible(file,mem_space,file_space,flags,layout); if(opt==FAIL) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE, NULL, "invalid check for contiguous dataspace "); + HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE, NULL, "invalid check for direct IO dataspace "); /* Check if we can use the optimized parallel I/O routines */ if(opt==TRUE) { diff --git a/src/H5Smpio.c b/src/H5Smpio.c index 0207080..5701b9c 100644 --- a/src/H5Smpio.c +++ b/src/H5Smpio.c @@ -23,6 +23,7 @@ #define H5F_PACKAGE /*suppress error about including H5Fpkg */ #define H5S_PACKAGE /*suppress error about including H5Spkg */ +#define H5D_PACKAGE /* Pablo information */ /* (Put before include files to avoid problems with inline functions) */ @@ -31,11 +32,15 @@ #include "H5private.h" /* Generic Functions */ #include "H5Eprivate.h" /* Error handling */ #include "H5Fpkg.h" /* Files */ +#include "H5Dpkg.h" #include "H5FDprivate.h" /* File drivers */ #include "H5Iprivate.h" /* IDs */ #include "H5Pprivate.h" /* Property lists */ #include "H5Spkg.h" /* Dataspaces */ +#include "H5Oprivate.h" +#include "H5Dprivate.h" + #ifdef H5_HAVE_PARALLEL static herr_t @@ -69,7 +74,9 @@ H5S_mpio_space_type( const H5S_t *space, size_t elmt_size, static herr_t H5S_mpio_spaces_xfer(H5F_t *f, const H5D_t *dset, size_t elmt_size, const H5S_t *file_space, const H5S_t *mem_space, - hid_t dxpl_id, void *buf/*out*/, hbool_t 
do_write); + hid_t dxpl_id, void *buf/*out*/, + const H5D_storage_t *store, + hbool_t do_write); /*------------------------------------------------------------------------- @@ -628,7 +635,8 @@ done: static herr_t H5S_mpio_spaces_xfer(H5F_t *f, const H5D_t *dset, size_t elmt_size, const H5S_t *file_space, const H5S_t *mem_space, - hid_t dxpl_id, void *_buf /*out*/, + hid_t dxpl_id, void *_buf /*out*/, + const H5D_storage_t *store, hbool_t do_write ) { haddr_t addr; /* Address of dataset (or selection) within file */ @@ -642,6 +650,9 @@ H5S_mpio_spaces_xfer(H5F_t *f, const H5D_t *dset, size_t elmt_size, int mpi_code; /* MPI return code */ herr_t ret_value = SUCCEED; /* Return value */ + haddr_t chunk_addr; /* for collective chunk IO */ + + FUNC_ENTER_NOAPI_NOINIT(H5S_mpio_spaces_xfer); /* Check args */ @@ -672,7 +683,17 @@ H5S_mpio_spaces_xfer(H5F_t *f, const H5D_t *dset, size_t elmt_size, &mft_is_derived )<0) HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type"); - addr = H5D_contig_get_addr(dset) + mpi_file_offset; + +/* Adding comments for chunk collective IO */ + if(dset->layout.type == H5D_CONTIGUOUS) { + addr = H5D_contig_get_addr(dset) + mpi_file_offset; + } + else { + assert(dset->layout.type == H5D_CHUNKED); + chunk_addr=H5D_istore_get_addr(f,dxpl_id,&(dset->layout),store->chunk.offset,NULL); + addr = f->shared->base_addr + chunk_addr + mpi_file_offset; + } + #ifdef H5Smpi_DEBUG HDfprintf(stderr, "spaces_xfer: relative addr=%a\n", addr ); #endif @@ -740,7 +761,7 @@ done: */ herr_t H5S_mpio_spaces_read(H5F_t *f, const H5D_dxpl_cache_t UNUSED *dxpl_cache, hid_t dxpl_id, - H5D_t *dset, const H5D_storage_t UNUSED *store, + H5D_t *dset, const H5D_storage_t *store, size_t UNUSED nelmts, size_t elmt_size, const H5S_t *file_space, const H5S_t *mem_space, void *buf/*out*/) @@ -750,7 +771,7 @@ H5S_mpio_spaces_read(H5F_t *f, const H5D_dxpl_cache_t UNUSED *dxpl_cache, hid_t FUNC_ENTER_NOAPI_NOFUNC(H5S_mpio_spaces_read); ret_value = 
H5S_mpio_spaces_xfer(f, dset, elmt_size, file_space, - mem_space, dxpl_id, buf, 0/*read*/); + mem_space, dxpl_id, buf,store, 0/*read*/); FUNC_LEAVE_NOAPI(ret_value); } /* end H5S_mpio_spaces_read() */ @@ -778,7 +799,7 @@ H5S_mpio_spaces_read(H5F_t *f, const H5D_dxpl_cache_t UNUSED *dxpl_cache, hid_t */ herr_t H5S_mpio_spaces_write(H5F_t *f, const H5D_dxpl_cache_t UNUSED *dxpl_cache, hid_t dxpl_id, - H5D_t *dset, const H5D_storage_t UNUSED *store, + H5D_t *dset, const H5D_storage_t *store, size_t UNUSED nelmts, size_t elmt_size, const H5S_t *file_space, const H5S_t *mem_space, const void *buf) @@ -789,7 +810,7 @@ H5S_mpio_spaces_write(H5F_t *f, const H5D_dxpl_cache_t UNUSED *dxpl_cache, hid_t /*OKAY: CAST DISCARDS CONST QUALIFIER*/ ret_value = H5S_mpio_spaces_xfer(f, dset, elmt_size, file_space, - mem_space, dxpl_id, (void*)buf, 1/*write*/); + mem_space, dxpl_id, (void*)buf, store,1/*write*/); FUNC_LEAVE_NOAPI(ret_value); } /* end H5S_mpio_spaces_write() */ @@ -812,10 +833,21 @@ H5S_mpio_spaces_write(H5F_t *f, const H5D_dxpl_cache_t UNUSED *dxpl_cache, hid_t *------------------------------------------------------------------------- */ htri_t -H5S_mpio_opt_possible( const H5S_t *mem_space, const H5S_t *file_space, const unsigned flags) +H5S_mpio_opt_possible( const H5F_t *file, const H5S_t *mem_space, const H5S_t *file_space, const unsigned flags,const H5O_layout_t *layout) { htri_t c1,c2; /* Flags whether a selection is optimizable */ htri_t ret_value=TRUE; + hsize_t chunk_dim[H5S_MAX_RANK+1]; + hssize_t startf[H5S_MAX_RANK],endf[H5S_MAX_RANK],startm[H5S_MAX_RANK],endm[H5S_MAX_RANK]; + int fnum_chunk[H5S_MAX_RANK],mnum_chunk[H5S_MAX_RANK]; + int rank,i,dim_rankm,dim_rankf; + int pcheck_hyper,check_hyper,check_num_chunkm,check_num_chunkf; + int tnum_chunkf,manum_chunkf,minum_chunkf; + int tnum_chunkm,manum_chunkm,minum_chunkm; + H5S_sel_type fsel_type,msel_type; + MPI_Comm comm; + + FUNC_ENTER_NOAPI(H5S_mpio_opt_possible, FAIL); @@ -823,11 +855,34 @@ 
H5S_mpio_opt_possible( const H5S_t *mem_space, const H5S_t *file_space, const un assert(mem_space); assert(file_space); + /* Parallel I/O conversion flag must be set, if it is not collective IO, go to false. */ + if(!(flags&H5S_CONV_PAR_IO_POSSIBLE)) + HGOTO_DONE(FALSE); + + /*getting MPI communicator and rank */ + + comm = H5F_mpi_get_comm(file); + rank = H5F_mpi_get_rank(file); + +#if 0 + for (i =0;iextent.type || H5S_SCALAR ==file_space->extent.type) { + if(!(H5S_SCALAR==mem_space->extent.type && H5S_SCALAR ==file_space->extent.type)){ + HGOTO_DONE(FALSE); + } + else{ + HGOTO_DONE(TRUE); + } + } + + dim_rankf = file_space->extent.rank; + fsel_type = file_space->select.type->type; + + /* Assure that selection type of either data space is not H5S_SEL_NONE */ +/* Not necessary according to Quincey, commented out for the time being. + if(fsel_type == H5S_SEL_NONE || msel_type == H5S_SEL_NONE) + HGOTO_DONE(FALSE); +*/ + + if(H5S_SELECT_BOUNDS(file_space,startf,endf)==FAIL) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE,FAIL, "invalid check for single selection blocks"); + + + for(i = 0; i < layout->u.chunk.ndims;i++) + chunk_dim[i] = layout->u.chunk.dim[i]; + + /* Case 1: check whether all hyperslab in this process is inside one chunk. + Note: we don't handle when starting point is less than zero since that may cover + two chunks. */ + + /*for file space checking*/ + pcheck_hyper = 1; + for (i=0; i