| author | MuQun Yang <ymuqun@hdfgroup.org> | 2006-05-11 00:07:28 (GMT) |
| --- | --- | --- |
| committer | MuQun Yang <ymuqun@hdfgroup.org> | 2006-05-11 00:07:28 (GMT) |
| commit | f857bb3d755e753409319c9b9ffa02c4bdc316b4 (patch) | |
| tree | db9018f9b793a9ef356b476ce47a185083c9a274 | |
| parent | b415a49c9e0978f75abbe6e8916dbaa376d2f6c5 (diff) | |
[svn-r12343] Purpose:
Bug fix for collective chunk IO
Description:
The HDF5 library provides several options for obtaining chunk addresses
when doing collective chunk IO.
One option is to obtain the chunk addresses on one process and broadcast
them to the other processes. This option requires all processes to
participate in the broadcast. When using linked-chunk IO without any
optimizations, some processes can fail to participate because one variable
is left uninitialized and picks up a random value (observed with mpich 1.2.7).
This is a bug inside the collective chunk IO code.
Solution:
1. Initialize all the variables to safe values.
2. Avoid using MPI broadcast to obtain the chunk addresses where possible,
until more performance studies have been done.
3. It appears safe to obtain the chunk addresses individually on each
process; this option should cover most cases. A minimal sketch of the
failure mode and the fix follows this list.
Platforms tested:
h5committest (copper is not usable)
NCSA teragrid (mpich 1.2.5)
mir 64-bit linux (mpich 1.2.6)
Misc. update:
-rw-r--r-- | src/H5Dmpio.c | 64 |
1 file changed, 48 insertions(+), 16 deletions(-)
```diff
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index 8c7778a..c290f9c 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -69,7 +69,9 @@
 /* Macros to define the default ratio of obtaining all chunk addresses for one linked-chunk IO case */
 #define H5D_ALL_CHUNK_ADDR_THRES_IND 10
+#define H5D_ALL_CHUNK_ADDR_THRES_IND_NUM 4
 #define H5D_ALL_CHUNK_ADDR_THRES_COL 20
+#define H5D_ALL_CHUNK_ADDR_THRES_COL_NUM 10000
 
 /***** Macros for multi-chunk collective IO case. *****/
 /* The default value of the threshold to do collective IO for this chunk.
@@ -646,7 +648,7 @@ H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool
 {
     int io_option = H5D_MULTI_CHUNK_IO_MORE_OPT;
-    int sum_chunk,mpi_size;
+    int sum_chunk = 0,mpi_size;
     unsigned one_link_chunk_io_threshold;
     H5P_genplist_t *plist;
     H5FD_mpio_chunk_opt_t chunk_opt_mode;
@@ -790,18 +792,18 @@ H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
     int mpi_size,mpi_code;              /* MPI return code */
-    int i,num_chunk,total_chunks;
+    int i,num_chunk=0,total_chunks;
     size_t ori_num_chunk;
     hsize_t ori_total_chunks;
     haddr_t chunk_base_addr;
-    haddr_t* total_chunk_addr_array;
-    MPI_Datatype *chunk_mtype;
-    MPI_Datatype *chunk_ftype;
+    haddr_t* total_chunk_addr_array=NULL;
+    MPI_Datatype *chunk_mtype=NULL;
+    MPI_Datatype *chunk_ftype=NULL;
     MPI_Datatype chunk_final_mtype;
     MPI_Datatype chunk_final_ftype;
-    MPI_Aint *chunk_disp_array;
-    MPI_Aint *chunk_mem_disp_array;
-    int *blocklen;
+    MPI_Aint *chunk_disp_array=NULL;
+    MPI_Aint *chunk_mem_disp_array=NULL;
+    int *blocklen=NULL;
     int blocklen_value;
     int actual_bsearch_coll_chunk_threshold;
     int bsearch_coll_chunk_threshold;
@@ -810,7 +812,7 @@ H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
     int many_chunk_opt = 0;
     H5D_common_coll_info_t coll_info;
-    H5D_chunk_addr_info_t* chunk_addr_info_array;
+    H5D_chunk_addr_info_t* chunk_addr_info_array=NULL;
 
 #ifdef CC_PERF
     char *bc_percent = NULL;
@@ -858,6 +860,9 @@ printf("before inter_collective_io for total chunk = 1 \n");
     ori_num_chunk = H5SL_count(fm->fsel);
     H5_ASSIGN_OVERFLOW(num_chunk,ori_num_chunk,size_t,int);
+#ifdef KENT
+printf("total_chunks = %d\n",(int)total_chunks);
+#endif
 
     if(num_chunk == 0)
         total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks);
@@ -899,9 +904,14 @@ printf("before inter_collective_io for total chunk = 1 \n");
     /* Calculate the actual threshold to obtain all chunk addresses collectively
        The bigger this number is, the more possible the use of obtaining chunk address collectively. */
+    /* For non-optimization one-link IO,
+       actual bsearch threshold is always 0,
+       we would always want to obtain the chunk addresses individually
+       for each process. */
     actual_bsearch_coll_chunk_threshold = sum_chunk*100/(total_chunks*mpi_size);
-    if(actual_bsearch_coll_chunk_threshold >= bsearch_coll_chunk_threshold)
+    if((actual_bsearch_coll_chunk_threshold > bsearch_coll_chunk_threshold)
+       &&(sum_chunk/mpi_size >= H5D_ALL_CHUNK_ADDR_THRES_COL_NUM))
         many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_COL;
     else {
@@ -937,8 +947,12 @@ printf("before inter_collective_io for total chunk = 1 \n");
         bsearch_chunk_threshold = 1 +(total_chunks*bsearch_chunk_ratio-99)/100;
         if(num_chunk > bsearch_chunk_threshold)
             many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_IND;
+        if((sum_chunk == 0) && (total_chunks >= H5D_ALL_CHUNK_ADDR_THRES_IND_NUM))
+            many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_IND;
     }
-
+#ifdef KENT
+printf("before sorting the chunk address \n");
+#endif
 
 /* Sort the chunk address when chunk optimization selection is either H5D_OBTAIN_*/
     if(num_chunk == 0){ /* special case: this process doesn't select anything */
@@ -952,6 +966,9 @@ printf("before inter_collective_io for total chunk = 1 \n");
             HGOTO_ERROR (H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to sort chunk address");
         chunk_base_addr = chunk_addr_info_array[0].chunk_addr;
     }
+#ifdef KENT
+printf("after sorting the chunk address \n");
+#endif
 
 /* Obtain MPI derived datatype from all individual chunks */
     for ( i = 0; i < num_chunk; i++) {
@@ -1012,6 +1029,10 @@ printf("before inter_collective_io for total chunk = 1 \n");
         coll_info.mpi_buf_count = 0;
         coll_info.chunk_addr = chunk_base_addr;
     }
+#ifdef KENT
+printf("before coming to final collective IO\n");
+#endif
+
     if(H5D_final_collective_io(io_info,&chunk_final_ftype,&chunk_final_mtype,&coll_info,buf,do_write)<0)
         HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish MPI-IO");
@@ -1070,7 +1091,7 @@ H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf,
     uint8_t *chunk_io_option;
     H5SL_node_t *chunk_node; /* Current node in chunk skip list */
-    H5D_chunk_info_t *chunk_info;
+    H5D_chunk_info_t *chunk_info=NULL;
     haddr_t *chunk_addr;
     H5D_storage_t store; /* union of EFL and chunk pointer in file space */
     hbool_t select_chunk;
@@ -1469,7 +1490,7 @@ H5D_sort_chunk(H5D_io_info_t * io_info,
     H5SL_node_t *chunk_node; /* Current node in chunk skip list */
     H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */
     haddr_t chunk_addr; /* Current chunking address of this node */
-    haddr_t *total_chunk_addr_array; /* The array of chunk address for the total number of chunk */
+    haddr_t *total_chunk_addr_array=NULL; /* The array of chunk address for the total number of chunk */
     int i,mpi_code;
     int total_chunks;
     size_t num_chunks;
@@ -1483,6 +1504,10 @@ H5D_sort_chunk(H5D_io_info_t * io_info,
     FUNC_ENTER_NOAPI_NOINIT(H5D_sort_chunk)
 
     num_chunks = H5SL_count(fm->fsel);
+#ifdef KENT
+printf("many_chunk_opt= %d\n",many_chunk_opt);
+#endif
+
     /* If we need to optimize the way to obtain the chunk address */
     if(many_chunk_opt != H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
@@ -1493,6 +1518,10 @@ H5D_sort_chunk(H5D_io_info_t * io_info,
         if(many_chunk_opt == H5D_OBTAIN_ALL_CHUNK_ADDR_COL) {/* We will broadcast the array from the root process */
             int mpi_rank, root;
+
+#ifdef KENT
+printf("Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n");
+#endif
             root = 0;
             if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file))<0)
                 HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank");
@@ -1559,6 +1588,9 @@ H5D_sort_chunk(H5D_io_info_t * io_info,
         i++;
         chunk_node = H5SL_next(chunk_node);
     }
+#ifdef KENT
+printf("before Qsort\n");
+#endif
     if(do_sort)
         HDqsort(chunk_addr_info_array,num_chunks,sizeof(chunk_addr_info_array),H5D_cmp_chunk_addr);
@@ -1624,9 +1656,9 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
     hsize_t ori_total_chunks;
     unsigned percent_nproc_per_chunk,threshold_nproc_per_chunk;
     H5FD_mpio_chunk_opt_t chunk_opt_mode;
-    uint8_t* io_mode_info;
-    uint8_t* recv_io_mode_info;
-    uint8_t* mergebuf;
+    uint8_t* io_mode_info=NULL;
+    uint8_t* recv_io_mode_info=NULL;
+    uint8_t* mergebuf=NULL;
     uint8_t* tempbuf;
     H5SL_node_t* chunk_node;
```
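The patched selection logic can be restated on its own. The sketch below is a simplification under stated assumptions, not the library code: `choose_strategy` is a made-up name, and the default macro `H5D_ALL_CHUNK_ADDR_THRES_COL` stands in for `bsearch_coll_chunk_threshold`, which the real code reads from a property list. Note the comparison tightened from `>=` to `>`: per the patch's own comment, the threshold is 0 for non-optimized one-link IO, so the old test `0 >= 0` could still select the broadcast path; the new test `0 > 0` cannot.

```c
#include <stdio.h>

/* Default thresholds, copied from the patch above. */
#define H5D_ALL_CHUNK_ADDR_THRES_COL     20
#define H5D_ALL_CHUNK_ADDR_THRES_COL_NUM 10000

enum chunk_addr_opt {
    OBTAIN_ALL_CHUNK_ADDR_IND,   /* each process reads all chunk addresses */
    OBTAIN_ALL_CHUNK_ADDR_COL    /* root reads them, then broadcasts       */
};

/* Mirrors the patched test: broadcast from the root only when the
 * average share of selected chunks clears the percentage threshold AND
 * the per-process chunk count clears the new absolute threshold. */
static enum chunk_addr_opt
choose_strategy(int sum_chunk, int total_chunks, int mpi_size)
{
    int actual = sum_chunk * 100 / (total_chunks * mpi_size);

    if (actual > H5D_ALL_CHUNK_ADDR_THRES_COL &&
            sum_chunk / mpi_size >= H5D_ALL_CHUNK_ADDR_THRES_COL_NUM)
        return OBTAIN_ALL_CHUNK_ADDR_COL;
    return OBTAIN_ALL_CHUNK_ADDR_IND;
}

int main(void)
{
    /* sum_chunk == 0 is the non-optimized linked-chunk path: the
     * percentage is 0, so the broadcast branch can never be taken. */
    printf("%d\n", choose_strategy(0, 8, 4));          /* 0 -> individual */
    printf("%d\n", choose_strategy(200000, 50000, 4)); /* 1 -> broadcast  */
    return 0;
}
```

With `sum_chunk` safely initialized to 0 rather than left uninitialized, every process falls back to obtaining the chunk addresses itself, matching points 2 and 3 of the solution.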