diff options
author | Quincey Koziol <koziol@hdfgroup.org> | 2008-04-24 15:03:41 (GMT) |
---|---|---|
committer | Quincey Koziol <koziol@hdfgroup.org> | 2008-04-24 15:03:41 (GMT) |
commit | 495ca9c7bb19553d2c87ce68013f1de2dff3d54b (patch) | |
tree | 792e1a9ecc8aa314dfa3d0538464e4f87ad55cf5 /src/H5Dmpio.c | |
parent | 16d4cae5b16ffb91298d8d232214afeb5112da6d (diff) | |
download | hdf5-495ca9c7bb19553d2c87ce68013f1de2dff3d54b.zip hdf5-495ca9c7bb19553d2c87ce68013f1de2dff3d54b.tar.gz hdf5-495ca9c7bb19553d2c87ce68013f1de2dff3d54b.tar.bz2 |
[svn-r14860] Description:
Omnibus raw data I/O revisions, with wide-ranging changes and
refactoring, in order to prepare for implementing "fast append" feature.
These changes remove the majority of the code duplication for raw data
I/O which has crept in over the last ten years and introduce a more object-
oriented design for operating on different types of dataset storage.
Chunked storage no longer has its own I/O routines; it is now handled
as either contiguous (if chunk is not pulled into the cache) or compact (if the
chunk is cached in memory).
No bug or feature changes, at least intentionally... :-)
Tested on:
FreeBSD/32 6.2 (duty) in debug mode
FreeBSD/64 6.2 (liberty) w/C++ & FORTRAN, in debug mode
Linux/32 2.6 (kagiso) w/PGI compilers, w/C++ & FORTRAN, w/threadsafe,
in debug mode
Linux/64-amd64 2.6 (smirom) w/default API=1.6.x, w/C++ & FORTRAN,
in production mode
Linux/64-ia64 2.6 (cobalt) w/Intel compilers, w/C++ & FORTRAN,
in production mode
Solaris/32 2.10 (linew) w/deprecated symbols disabled, w/C++ & FORTRAN,
w/szip filter, in production mode
Mac OS X/32 10.5.2 (amazon) in debug mode
Linux/64-ia64 2.4 (tg-login3) w/parallel, w/FORTRAN, in production mode
Diffstat (limited to 'src/H5Dmpio.c')
-rw-r--r-- | src/H5Dmpio.c | 2658 |
1 files changed, 1317 insertions, 1341 deletions
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c index 22f32ab..889fbce 100644 --- a/src/H5Dmpio.c +++ b/src/H5Dmpio.c @@ -34,12 +34,12 @@ /* Headers */ /***********/ #include "H5private.h" /* Generic Functions */ -#include "H5Iprivate.h" #include "H5Dpkg.h" /* Datasets */ #include "H5Eprivate.h" /* Error handling */ #include "H5Fprivate.h" /* File access */ #include "H5FDprivate.h" /* File drivers */ -#include "H5MMprivate.h" +#include "H5Iprivate.h" /* IDs */ +#include "H5MMprivate.h" /* Memory management */ #include "H5Oprivate.h" /* Object headers */ #include "H5Pprivate.h" /* Property lists */ #include "H5Sprivate.h" /* Dataspaces */ @@ -86,6 +86,7 @@ #define H5D_CHUNK_SELECT_IRREG 2 #define H5D_CHUNK_SELECT_NONE 0 + /******************/ /* Local Typedefs */ /******************/ @@ -95,66 +96,47 @@ typedef struct H5D_chunk_addr_info_t { H5D_chunk_info_t chunk_info; } H5D_chunk_addr_info_t; -/* Combine all information that needs to know for collective MPI-IO of this selection. */ -typedef struct H5D_common_coll_info_t { - hbool_t mbt_is_derived; - hbool_t mft_is_derived; - size_t mpi_buf_count; - haddr_t chunk_addr; -} H5D_common_coll_info_t; - /********************/ /* Local Prototypes */ /********************/ - -static herr_t -H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf, - hbool_t do_write); -static herr_t -H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf, - hbool_t do_write); - -static herr_t -H5D_link_chunk_collective_io(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf, - hbool_t do_write,int sum_chunk); - -static herr_t -H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space, - const H5S_t *mem_space,haddr_t addr, - const void *buf, hbool_t do_write ); - -static herr_t -H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type, - MPI_Datatype *mpi_buf_type, - H5D_common_coll_info_t* coll_info, - const void *buf, hbool_t 
do_write); -static herr_t -H5D_sort_chunk(H5D_io_info_t * io_info, - H5D_chunk_map_t *fm, - H5D_chunk_addr_info_t chunk_addr_info_array[], - int many_chunk_opt); - -static herr_t -H5D_obtain_mpio_mode(H5D_io_info_t* io_info, - H5D_chunk_map_t *fm, - uint8_t assign_io_mode[], - haddr_t chunk_addr[]); - -static herr_t H5D_ioinfo_make_ind(H5D_io_info_t *io_info); -static herr_t H5D_ioinfo_make_coll_opt(H5D_io_info_t *io_info); -static herr_t H5D_ioinfo_make_coll(H5D_io_info_t *io_info); +static herr_t H5D_chunk_collective_io(H5D_io_info_t *io_info, + const H5D_type_info_t *type_info, H5D_chunk_map_t *fm); +static herr_t H5D_multi_chunk_collective_io(H5D_io_info_t *io_info, + const H5D_type_info_t *type_info, H5D_chunk_map_t *fm, + H5P_genplist_t *dx_plist); +static herr_t H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info, + const H5D_type_info_t *type_info, H5D_chunk_map_t *fm, H5P_genplist_t *dx_plist); +#ifdef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS +static herr_t H5D_link_chunk_collective_io(H5D_io_info_t *io_info, + const H5D_type_info_t *type_info, H5D_chunk_map_t *fm, int sum_chunk); +#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */ +static herr_t H5D_inter_collective_io(H5D_io_info_t *io_info, + const H5D_type_info_t *type_info, const H5S_t *file_space, + const H5S_t *mem_space); +static herr_t H5D_final_collective_io(H5D_io_info_t *io_info, + const H5D_type_info_t *type_info, size_t nelmts, MPI_Datatype *mpi_file_type, + MPI_Datatype *mpi_buf_type); +static herr_t H5D_sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, + H5D_chunk_addr_info_t chunk_addr_info_array[], int many_chunk_opt); +static herr_t H5D_obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, + H5P_genplist_t *dx_plist, uint8_t assign_io_mode[], haddr_t chunk_addr[]); +static herr_t H5D_ioinfo_xfer_mode(H5D_io_info_t *io_info, H5P_genplist_t *dx_plist, + H5FD_mpio_xfer_t xfer_mode); +static herr_t H5D_ioinfo_coll_opt_mode(H5D_io_info_t *io_info, H5P_genplist_t 
*dx_plist, + H5FD_mpio_collective_opt_t coll_opt_mode); static herr_t H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, int *min_chunkf); static int H5D_cmp_chunk_addr(const void *addr1, const void *addr2); static herr_t H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info, - const H5D_chunk_map_t *fm, int *sum_chunkf); + const H5D_chunk_map_t *fm, int *sum_chunkf); /*********************/ /* Package Variables */ /*********************/ + /*******************/ /* Local Variables */ /*******************/ @@ -175,64 +157,77 @@ static herr_t H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info, *------------------------------------------------------------------------- */ htri_t -H5D_mpio_opt_possible( const H5D_io_info_t *io_info, - const H5S_t *mem_space, const H5S_t *file_space, const H5T_path_t *tpath) +H5D_mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space, + const H5S_t *mem_space, const H5D_type_info_t *type_info, + const H5D_chunk_map_t *fm) { - int local_opinion = TRUE; /* This process's idea of whether to perform collective I/O or not */ - int consensus; /* Consensus opinion of all processes */ - int mpi_code; /* MPI error code */ - htri_t ret_value=TRUE; + int local_opinion = TRUE; /* This process's idea of whether to perform collective I/O or not */ + int consensus; /* Consensus opinion of all processes */ + int mpi_code; /* MPI error code */ + htri_t ret_value = TRUE; - FUNC_ENTER_NOAPI(H5D_mpio_opt_possible, FAIL); + FUNC_ENTER_NOAPI(H5D_mpio_opt_possible, FAIL) /* Check args */ - assert(io_info); - assert(mem_space); - assert(file_space); + HDassert(io_info); + HDassert(mem_space); + HDassert(file_space); + HDassert(type_info); /* For independent I/O, get out quickly and don't try to form consensus */ - if (io_info->dxpl_cache->xfer_mode==H5FD_MPIO_INDEPENDENT) + if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_INDEPENDENT) HGOTO_DONE(FALSE); + /* Don't allow collective operations if datatype conversions 
need to happen */ + if(!type_info->is_conv_noop) { + local_opinion = FALSE; + goto broadcast; + } /* end if */ + + /* Don't allow collective operations if data transform operations should occur */ + if(!type_info->is_xform_noop) { + local_opinion = FALSE; + goto broadcast; + } /* end if */ + /* Optimized MPI types flag must be set and it must be collective IO */ /* (Don't allow parallel I/O for the MPI-posix driver, since it doesn't do real collective I/O) */ - if (!(H5S_mpi_opt_types_g && io_info->dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE && !IS_H5FD_MPIPOSIX(io_info->dset->oloc.file))) { + if(!(H5S_mpi_opt_types_g && io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE + && !IS_H5FD_MPIPOSIX(io_info->dset->oloc.file))) { local_opinion = FALSE; goto broadcast; } /* end if */ /* Check whether these are both simple or scalar dataspaces */ - if (!((H5S_SIMPLE==H5S_GET_EXTENT_TYPE(mem_space) || H5S_SCALAR==H5S_GET_EXTENT_TYPE(mem_space)) - && (H5S_SIMPLE==H5S_GET_EXTENT_TYPE(file_space) || H5S_SCALAR==H5S_GET_EXTENT_TYPE(file_space)))) { + if(!((H5S_SIMPLE == H5S_GET_EXTENT_TYPE(mem_space) || H5S_SCALAR == H5S_GET_EXTENT_TYPE(mem_space)) + && (H5S_SIMPLE == H5S_GET_EXTENT_TYPE(file_space) || H5S_SCALAR == H5S_GET_EXTENT_TYPE(file_space)))) { local_opinion = FALSE; goto broadcast; } /* end if */ /* Can't currently handle point selections */ - if (H5S_SEL_POINTS==H5S_GET_SELECT_TYPE(mem_space) || H5S_SEL_POINTS==H5S_GET_SELECT_TYPE(file_space)) { + if(H5S_SEL_POINTS == H5S_GET_SELECT_TYPE(mem_space) + || H5S_SEL_POINTS == H5S_GET_SELECT_TYPE(file_space)) { local_opinion = FALSE; goto broadcast; } /* end if */ /* Dataset storage must be contiguous or chunked */ - if (!(io_info->dset->shared->layout.type == H5D_CONTIGUOUS || + if(!(io_info->dset->shared->layout.type == H5D_CONTIGUOUS || io_info->dset->shared->layout.type == H5D_CHUNKED)) { local_opinion = FALSE; goto broadcast; } /* end if */ - /* The handling of memory space is different for chunking - and contiguous 
storage, - For contigous storage, mem_space and file_space won't - change when it it is doing disk IO. - For chunking storage, mem_space will change for different - chunks. So for chunking storage, whether we can use - collective IO will defer until each chunk IO is reached. - For contiguous storage, if we find MPI-IO cannot - support complicated MPI derived data type and the shape - of data space is not regular, we will - set use_par_opt_io = FALSE. - */ + /* The handling of memory space is different for chunking and contiguous + * storage. For contiguous storage, mem_space and file_space won't change + * when it it is doing disk IO. For chunking storage, mem_space will + * change for different chunks. So for chunking storage, whether we can + * use collective IO will defer until each chunk IO is reached. For + * contiguous storage, if we find MPI-IO cannot support complicated MPI + * derived data type and the shape of data space is not regular, we will + * set use_par_opt_io = FALSE. + */ #ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS if(io_info->dset->shared->layout.type == H5D_CONTIGUOUS) if((H5S_SELECT_IS_REGULAR(file_space) != TRUE) || @@ -243,81 +238,43 @@ H5D_mpio_opt_possible( const H5D_io_info_t *io_info, #endif /* Don't allow collective operations if filters need to be applied */ - if(io_info->dset->shared->layout.type == H5D_CHUNKED) - if(io_info->dset->shared->dcpl_cache.pline.nused>0) { + if(io_info->dset->shared->layout.type == H5D_CHUNKED) { + if(io_info->dset->shared->dcpl_cache.pline.nused > 0) { local_opinion = FALSE; goto broadcast; } /* end if */ - /* Don't allow collective operations if datatype conversions need to happen */ - if(!H5T_path_noop(tpath)) { - local_opinion = FALSE; - goto broadcast; - } /* end if */ - - /* Don't allow collective operations if data transform operations should occur */ - if(!H5Z_xform_noop(io_info->dxpl_cache->data_xform_prop)) { - local_opinion = FALSE; - goto broadcast; +/* If H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS 
and H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + * are defined, the HDF5 library will do collective IO if the application + * asks for it. + * + * If H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS is not defined and one or more + * processes are not participating in the IO, then collective IO is not + * assured. The library will check each process for the number of chunks + * it involves. If any process involves zero chunks, the library will use + * independent IO mode instead. + */ +#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS + /* Check the number of chunks to perform I/O on */ + if(0 == H5SL_count(fm->sel_chunks)) { + local_opinion = FALSE; + goto broadcast; + } /* end if */ +#endif /* H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS */ } /* end if */ broadcast: /* Form consensus opinion among all processes about whether to perform - * collective I/O */ - if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_opinion, &consensus, 1, MPI_INT, MPI_LAND, io_info->comm))) + * collective I/O + */ + if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_opinion, &consensus, 1, MPI_INT, MPI_LAND, io_info->comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) ret_value = consensus > 0 ? TRUE : FALSE; done: - FUNC_LEAVE_NOAPI(ret_value); -} /* H5D_mpio_opt_possible() */ - - -/*------------------------------------------------------------------------- - * Function: H5D_mpio_chunk_adjust_iomode - * - * Decription: If H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS and - H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS are defined, - the HDF5 library will do collective IO if the application asks for it. - - If H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS is not defined - and one or more processes are not participating in the IO, - then collective IO is not assured. The library will check - each process for the - number of chunks it involves. If any process involves zero chunks, - the library will use independent IO mode instead. - This function is only used for linked chunk IO. 
- * Purpose: Checks if it is possible to do collective IO - * - * Return: Success: Non-negative: TRUE or FALSE - * Failure: Negative - * - * Programmer: Muqun Yang - * Monday, Feb. 13th, 2006 - * - *------------------------------------------------------------------------- - */ -#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS -herr_t -H5D_mpio_chunk_adjust_iomode(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm) -{ - int min_chunk; - herr_t ret_value = SUCCEED; - - FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_chunk_adjust_iomode) - - if(H5D_mpio_get_min_chunk(io_info,fm,&min_chunk) < 0) - HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes"); - if(min_chunk == 0) { - /* Switch to independent I/O */ - if(H5D_ioinfo_make_ind(io_info) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") - } /* end if */ -done: FUNC_LEAVE_NOAPI(ret_value) -} -#endif +} /* H5D_mpio_opt_possible() */ /*------------------------------------------------------------------------- @@ -332,23 +289,20 @@ done: *------------------------------------------------------------------------- */ herr_t -H5D_mpio_select_read(H5D_io_info_t *io_info, - size_t mpi_buf_count, - const size_t UNUSED elmt_size, - const H5S_t UNUSED *file_space, - const H5S_t UNUSED *mem_space, - haddr_t addr, - void UNUSED *pointer, - void *buf/*out*/) +H5D_mpio_select_read(const H5D_io_info_t *io_info, const H5D_type_info_t UNUSED *type_info, + hsize_t mpi_buf_count, const H5S_t UNUSED *file_space, const H5S_t UNUSED *mem_space) { + const H5D_contig_storage_t *store_contig = &(io_info->store->contig); /* Contiguous storage info for this I/O operation */ herr_t ret_value = SUCCEED; - FUNC_ENTER_NOAPI(H5D_mpio_select_read,FAIL); + FUNC_ENTER_NOAPI(H5D_mpio_select_read, FAIL) + + H5_CHECK_OVERFLOW(mpi_buf_count, hsize_t, size_t); + if(H5F_block_read(io_info->dset->oloc.file, H5FD_MEM_DRAW, store_contig->dset_addr, (size_t)mpi_buf_count, io_info->dxpl_id, 
io_info->u.rbuf) < 0) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "can't finish collective parallel read") - if(H5F_block_read (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) < 0) - HGOTO_ERROR(H5E_IO,H5E_READERROR,FAIL,"can't finish collective parallel read"); done: - FUNC_LEAVE_NOAPI(ret_value); + FUNC_LEAVE_NOAPI(ret_value) } /* end H5D_mpio_select_read() */ @@ -364,32 +318,28 @@ done: *------------------------------------------------------------------------- */ herr_t -H5D_mpio_select_write(H5D_io_info_t *io_info, - size_t mpi_buf_count, - const size_t UNUSED elmt_size, - const H5S_t UNUSED *file_space, - const H5S_t UNUSED *mem_space, - haddr_t addr, - void UNUSED *pointer, - const void *buf) +H5D_mpio_select_write(const H5D_io_info_t *io_info, const H5D_type_info_t UNUSED *type_info, + hsize_t mpi_buf_count, const H5S_t UNUSED *file_space, const H5S_t UNUSED *mem_space) { + const H5D_contig_storage_t *store_contig = &(io_info->store->contig); /* Contiguous storage info for this I/O operation */ herr_t ret_value = SUCCEED; - FUNC_ENTER_NOAPI(H5D_mpio_select_write,FAIL); + FUNC_ENTER_NOAPI(H5D_mpio_select_write, FAIL) /*OKAY: CAST DISCARDS CONST QUALIFIER*/ - if(H5F_block_write (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf)<0) - HGOTO_ERROR(H5E_IO,H5E_WRITEERROR,FAIL,"can't finish collective parallel write"); + H5_CHECK_OVERFLOW(mpi_buf_count, hsize_t, size_t); + if(H5F_block_write(io_info->dset->oloc.file, H5FD_MEM_DRAW, store_contig->dset_addr, (size_t)mpi_buf_count, io_info->dxpl_id, io_info->u.wbuf) < 0) + HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "can't finish collective parallel write") done: - FUNC_LEAVE_NOAPI(ret_value); + FUNC_LEAVE_NOAPI(ret_value) } /* end H5D_mpio_select_write() */ /*------------------------------------------------------------------------- - * Function: H5D_ioinfo_make_ind + * Function: H5D_ioinfo_xfer_mode * - * Purpose: Switch to MPI independent I/O + * 
Purpose: Switch to between collective & independent MPI I/O * * Return: Non-negative on success/Negative on failure * @@ -399,42 +349,42 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5D_ioinfo_make_ind(H5D_io_info_t *io_info) +H5D_ioinfo_xfer_mode(H5D_io_info_t *io_info, H5P_genplist_t *dx_plist, + H5FD_mpio_xfer_t xfer_mode) { - H5P_genplist_t *dx_plist; /* Data transer property list */ herr_t ret_value = SUCCEED; /*return value */ - FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_ind) - - /* Get the dataset transfer property list */ - if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list") + FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_xfer_mode) - /* Change the xfer_mode to independent, handle the request, - * then set xfer_mode before return. - */ - io_info->dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT; - if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) + /* Change the xfer_mode */ + io_info->dxpl_cache->xfer_mode = xfer_mode; + if(H5P_set(dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") - /* Set the pointers to the non-MPI-specific routines */ - io_info->ops.read = H5D_select_read; - io_info->ops.write = H5D_select_write; + /* Change the "single I/O" function pointers */ + if(xfer_mode == H5FD_MPIO_INDEPENDENT) { + /* Set the pointers to the original, non-MPI-specific routines */ + io_info->io_ops.single_read = io_info->orig.io_ops.single_read; + io_info->io_ops.single_write = io_info->orig.io_ops.single_write; + } /* end if */ + else { + HDassert(xfer_mode == H5FD_MPIO_COLLECTIVE); - /* Indicate that the transfer mode should be restored before returning - * to user. 
- */ - io_info->xfer_mode_changed=TRUE; + /* Set the pointers to the MPI-specific routines */ + io_info->io_ops.single_read = H5D_mpio_select_read; + io_info->io_ops.single_write = H5D_mpio_select_write; + } /* end else */ done: FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D_ioinfo_make_ind() */ +} /* end H5D_ioinfo_xfer_mode() */ /*------------------------------------------------------------------------- - * Function: H5D_ioinfo_make_coll_opt + * Function: H5D_ioinfo_coll_opt_mode * - * Purpose: Switch to MPI independent I/O with file set view + * Purpose: Switch between using collective & independent MPI I/O w/file + * set view * * Return: Non-negative on success/Negative on failure * @@ -444,87 +394,21 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5D_ioinfo_make_coll_opt(H5D_io_info_t *io_info) +H5D_ioinfo_coll_opt_mode(H5D_io_info_t *io_info, H5P_genplist_t *dx_plist, + H5FD_mpio_collective_opt_t coll_opt_mode) { - H5P_genplist_t *dx_plist; /* Data transer property list */ herr_t ret_value = SUCCEED; /*return value */ - FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_coll_opt) + FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_coll_opt_mode) - /* Get the dataset transfer property list */ - if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list") - - /* Change the optimial xfer_mode to independent, handle the request, - * then set xfer_mode before return. 
- */ - io_info->dxpl_cache->xfer_opt_mode = H5FD_MPIO_INDIVIDUAL_IO; - if(H5P_set (dx_plist, H5D_XFER_IO_XFER_OPT_MODE_NAME, &io_info->dxpl_cache->xfer_opt_mode) < 0) + /* Change the optimal xfer_mode */ + io_info->dxpl_cache->coll_opt_mode = coll_opt_mode; + if(H5P_set(dx_plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME, &io_info->dxpl_cache->coll_opt_mode) < 0) HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") - /* Set the pointers to the non-MPI-specific routines */ - io_info->ops.read = H5D_mpio_select_read; - io_info->ops.write = H5D_mpio_select_write; - - /* Indicate that the transfer mode should be restored before returning - * to user. - */ - io_info->xfer_opt_mode_changed = TRUE; - done: FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D_ioinfo_make_coll_opt() */ - - -/*------------------------------------------------------------------------- - * Function: H5D_ioinfo_make_coll - * - * Purpose: Switch to MPI collective I/O - * - * Return: Non-negative on success/Negative on failure - * - * Programmer: Quincey Koziol - * Friday, August 12, 2005 - * - *------------------------------------------------------------------------- - */ -static herr_t -H5D_ioinfo_make_coll(H5D_io_info_t *io_info) -{ - H5P_genplist_t *dx_plist; /* Data transer property list */ - herr_t ret_value = SUCCEED; /*return value */ - - FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_coll) - - /* Get the dataset transfer property list */ - if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list") - - /* Change the xfer_mode to independent, handle the request, - * then set xfer_mode before return. 
- */ - io_info->dxpl_cache->xfer_mode = H5FD_MPIO_COLLECTIVE; - if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") - - io_info->dxpl_cache->xfer_opt_mode = H5FD_MPIO_COLLECTIVE_IO; - if(H5P_set (dx_plist, H5D_XFER_IO_XFER_OPT_MODE_NAME, &io_info->dxpl_cache->xfer_opt_mode) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") - - - /* Set the pointers to the MPI-specific routines */ - io_info->ops.read = H5D_mpio_select_read; - io_info->ops.write = H5D_mpio_select_write; - - /* Indicate that the transfer mode should _NOT_ be restored before returning - * to user. - */ - io_info->xfer_mode_changed=FALSE; - io_info->xfer_opt_mode_changed=FALSE; - -done: - FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D_ioinfo_make_coll() */ +} /* end H5D_ioinfo_coll_opt_mode() */ /*------------------------------------------------------------------------- @@ -535,29 +419,30 @@ done: * * Return: Non-negative on success/Negative on failure * - * Programmer: + * Programmer: Muqun Yang + * Monday, Feb. 
13th, 2006 * *------------------------------------------------------------------------- */ static herr_t -H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info, - const H5D_chunk_map_t *fm, int *min_chunkf) +H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, + int *min_chunkf) { int num_chunkf; /* Number of chunks to iterate over */ int mpi_code; /* MPI return code */ herr_t ret_value = SUCCEED; - FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_min_chunk); + FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_min_chunk) /* Get the number of chunks to perform I/O on */ num_chunkf = H5SL_count(fm->sel_chunks); /* Determine the minimum # of chunks for all processes */ - if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, min_chunkf, 1, MPI_INT, MPI_MIN, io_info->comm))) + if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, min_chunkf, 1, MPI_INT, MPI_MIN, io_info->comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) done: - FUNC_LEAVE_NOAPI(ret_value); + FUNC_LEAVE_NOAPI(ret_value) } /* end H5D_mpio_get_min_chunk() */ @@ -569,221 +454,252 @@ done: * * Return: Non-negative on success/Negative on failure * - * Programmer: + * Programmer: Muqun Yang + * Monday, Feb. 
13th, 2006 * *------------------------------------------------------------------------- */ static herr_t -H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info, - const H5D_chunk_map_t *fm, int *sum_chunkf) +H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, + int *sum_chunkf) { int num_chunkf; /* Number of chunks to iterate over */ size_t ori_num_chunkf; int mpi_code; /* MPI return code */ herr_t ret_value = SUCCEED; - FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_sum_chunk); + FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_sum_chunk) /* Get the number of chunks to perform I/O on */ num_chunkf = 0; ori_num_chunkf = H5SL_count(fm->sel_chunks); - H5_ASSIGN_OVERFLOW(num_chunkf,ori_num_chunkf,size_t,int); + H5_ASSIGN_OVERFLOW(num_chunkf, ori_num_chunkf, size_t, int); /* Determine the summation of number of chunks for all processes */ - if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, sum_chunkf, 1, MPI_INT, MPI_SUM, io_info->comm))) + if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, sum_chunkf, 1, MPI_INT, MPI_SUM, io_info->comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) done: - FUNC_LEAVE_NOAPI(ret_value); + FUNC_LEAVE_NOAPI(ret_value) } /* end H5D_mpio_get_sum_chunk() */ /*------------------------------------------------------------------------- - * Function: H5D_contig_collective_io + * Function: H5D_contig_collective_read * - * Purpose: Wrapper Routine for H5D_inter_collective_io - The starting file address of contiguous layout - will be calculated and passed to H5D_inter_collective_io routine. - * + * Purpose: Reads directly from contiguous data in file into application + * memory using collective I/O. 
* * Return: Non-negative on success/Negative on failure * - * Programmer: - * - * Modifications: + * Programmer: Quincey Koziol + * Tuesday, March 4, 2008 * *------------------------------------------------------------------------- */ herr_t -H5D_contig_collective_io(H5D_io_info_t *io_info, - const H5S_t *file_space, - const H5S_t *mem_space, - const void *buf, - hbool_t do_write) +H5D_contig_collective_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, + hsize_t UNUSED nelmts, const H5S_t *file_space, const H5S_t *mem_space, + H5D_chunk_map_t UNUSED *fm) { + herr_t ret_value = SUCCEED; /* Return value */ + FUNC_ENTER_NOAPI(H5D_contig_collective_read, FAIL) - haddr_t addr = HADDR_UNDEF; /* Address of dataset (or selection) within file */ - herr_t ret_value = SUCCEED; /* return value */ + /* Sanity check */ + HDassert(IS_H5FD_MPIO(io_info->dset->oloc.file)); + HDassert(TRUE == H5P_isa_class(io_info->dxpl_id, H5P_DATASET_XFER)); + + /* Call generic internal collective I/O routine */ + if(H5D_inter_collective_io(io_info, type_info, file_space, mem_space) < 0) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't finish shared collective MPI-IO") - FUNC_ENTER_NOAPI_NOINIT(H5D_contig_collective_io) - assert (IS_H5FD_MPIO(io_info->dset->oloc.file)); +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_contig_collective_read() */ - /* Make certain we have the correct type of property list */ - assert(TRUE==H5P_isa_class(io_info->dxpl_id,H5P_DATASET_XFER)); + +/*------------------------------------------------------------------------- + * Function: H5D_contig_collective_write + * + * Purpose: Write directly to contiguous data in file from application + * memory using collective I/O. 
+ * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Quincey Koziol + * Tuesday, March 4, 2008 + * + *------------------------------------------------------------------------- + */ +herr_t +H5D_contig_collective_write(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, + hsize_t UNUSED nelmts, const H5S_t *file_space, const H5S_t *mem_space, + H5D_chunk_map_t UNUSED *fm) +{ + herr_t ret_value = SUCCEED; /* Return value */ - /* Get the base address of the contiguous dataset */ - if(io_info->dset->shared->layout.type == H5D_CONTIGUOUS) - addr = H5D_contig_get_addr(io_info->dset); + FUNC_ENTER_NOAPI(H5D_contig_collective_write, FAIL) - if(H5D_inter_collective_io(io_info,file_space,mem_space,addr,buf,do_write)<0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); - - done: + /* Sanity check */ + HDassert(IS_H5FD_MPIO(io_info->dset->oloc.file)); + HDassert(TRUE == H5P_isa_class(io_info->dxpl_id, H5P_DATASET_XFER)); + /* Call generic internal collective I/O routine */ + if(H5D_inter_collective_io(io_info, type_info, file_space, mem_space) < 0) + HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "couldn't finish shared collective MPI-IO") + +done: FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D_contig_collective_io */ +} /* end H5D_contig_collective_write() */ /*------------------------------------------------------------------------- * Function: H5D_chunk_collective_io * * Purpose: Routine for - 1) choose an IO option: - a) One collective IO defined by one MPI derived datatype to link through all chunks - or b) multiple chunk IOs,to do MPI-IO for each chunk, the IO mode may be adjusted - due to the selection pattern for each chunk. + * 1) choose an IO option: + * a) One collective IO defined by one MPI derived datatype to link through all chunks + * or b) multiple chunk IOs,to do MPI-IO for each chunk, the IO mode may be adjusted + * due to the selection pattern for each chunk. * For option a) - 1. 
Sort the chunk address, obtain chunk info according to the sorted chunk address - 2. Build up MPI derived datatype for each chunk - 3. Build up the final MPI derived datatype - 4. Set up collective IO property list - 5. Do IO + * 1. Sort the chunk address, obtain chunk info according to the sorted chunk address + * 2. Build up MPI derived datatype for each chunk + * 3. Build up the final MPI derived datatype + * 4. Set up collective IO property list + * 5. Do IO * For option b) - 1. Use MPI_gather and MPI_Bcast to obtain information of *collective/independent/none* - IO mode for each chunk of the selection - 2. Depending on whether the IO mode is collective or independent or none, - Create either MPI derived datatype for each chunk to do collective IO or - just do independent IO or independent IO with file set view - 3. Set up collective IO property list for collective mode - 4. DO IO + * 1. Use MPI_gather and MPI_Bcast to obtain information of *collective/independent/none* + * IO mode for each chunk of the selection + * 2. Depending on whether the IO mode is collective or independent or none, + * Create either MPI derived datatype for each chunk to do collective IO or + * just do independent IO or independent IO with file set view + * 3. Set up collective IO property list for collective mode + * 4. DO IO * * Return: Non-negative on success/Negative on failure * - * Programmer: - * - * Modifications: + * Programmer: Muqun Yang + * Monday, Feb. 
13th, 2006 * *------------------------------------------------------------------------- */ -herr_t -H5D_chunk_collective_io(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf, hbool_t do_write) +static herr_t +H5D_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, + H5D_chunk_map_t *fm) { - - int io_option = H5D_MULTI_CHUNK_IO_MORE_OPT; - int sum_chunk = 0,mpi_size; - unsigned one_link_chunk_io_threshold; - H5P_genplist_t *plist; + H5P_genplist_t *dx_plist; /* Pointer to DXPL */ H5FD_mpio_chunk_opt_t chunk_opt_mode; - + int io_option = H5D_MULTI_CHUNK_IO_MORE_OPT; + int sum_chunk = -1; #ifdef H5_HAVE_INSTRUMENTED_LIBRARY - htri_t check_prop,temp_not_link_io = FALSE; - int new_value; + htri_t temp_not_link_io = FALSE; #endif - herr_t ret_value = SUCCEED; + herr_t ret_value = SUCCEED; FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_collective_io) - assert (IS_H5FD_MPIO(io_info->dset->oloc.file)); + /* Sanity checks */ + HDassert(io_info); + HDassert(io_info->using_mpi_vfd); + HDassert(type_info); + HDassert(fm); /* Obtain the data transfer properties */ - if(NULL == (plist = H5I_object(io_info->dxpl_id))) + if(NULL == (dx_plist = H5I_object(io_info->dxpl_id))) HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list") - + /* Check the optional property list on what to do with collective chunk IO. 
*/ - chunk_opt_mode=(H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME); - + chunk_opt_mode = (H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(dx_plist, H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME); if(chunk_opt_mode == H5FD_MPIO_CHUNK_ONE_IO) - io_option = H5D_ONE_LINK_CHUNK_IO;/*no opt*/ + io_option = H5D_ONE_LINK_CHUNK_IO; /*no opt*/ else if(chunk_opt_mode == H5FD_MPIO_CHUNK_MULTI_IO) - io_option = H5D_MULTI_CHUNK_IO;/*no opt */ + io_option = H5D_MULTI_CHUNK_IO; /*no opt */ else { - if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0) + unsigned one_link_chunk_io_threshold; /* Threshhold to use single collective I/O for all chunks */ + int mpi_size; /* Number of processes in MPI job */ + + if(H5D_mpio_get_sum_chunk(io_info, fm, &sum_chunk) < 0) HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes"); - if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0) - HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size"); + if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0) + HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size") - if(NULL == (plist = H5I_object(io_info->dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list") - - one_link_chunk_io_threshold =H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME); + one_link_chunk_io_threshold = H5P_peek_unsigned(dx_plist, H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME); - /* step 1: choose an IO option */ - /* If the average number of chunk per process is greater than a threshold, we will do one link chunked IO. */ - if((unsigned)sum_chunk/mpi_size >= one_link_chunk_io_threshold) + /* step 1: choose an IO option */ + /* If the average number of chunk per process is greater than a threshold, we will do one link chunked IO. 
*/ + if((unsigned)sum_chunk / mpi_size >= one_link_chunk_io_threshold) io_option = H5D_ONE_LINK_CHUNK_IO_MORE_OPT; #ifdef H5_HAVE_INSTRUMENTED_LIBRARY - else - temp_not_link_io = TRUE; + else + temp_not_link_io = TRUE; +#endif + } /* end else */ + +#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + if(io_option == H5D_ONE_LINK_CHUNK_IO) + io_option = H5D_MULTI_CHUNK_IO; /* We can not do this with one chunk IO. */ + if(io_option == H5D_ONE_LINK_CHUNK_IO_MORE_OPT) + io_option = H5D_MULTI_CHUNK_IO_MORE_OPT; #endif - } #ifdef H5_HAVE_INSTRUMENTED_LIBRARY +{ + htri_t check_prop; + int new_value; + /*** Test collective chunk user-input optimization APIs. ***/ - check_prop = H5Pexist(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_LINK_HARD_NAME); + check_prop = H5Pexist(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_LINK_HARD_NAME); if(check_prop > 0) { if(io_option == H5D_ONE_LINK_CHUNK_IO) { - new_value = 0; - if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_LINK_HARD_NAME,&new_value)<0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to get property value"); - } - } - check_prop = H5Pexist(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_HARD_NAME); + new_value = 0; + if(H5Pset(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_LINK_HARD_NAME, &new_value) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTSET, FAIL, "unable to set property value") + } /* end if */ + } /* end if */ + check_prop = H5Pexist(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_MULTI_HARD_NAME); if(check_prop > 0) { - if(io_option == H5D_MULTI_CHUNK_IO) { - new_value = 0; - if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_HARD_NAME,&new_value)<0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to get property value"); - } - } - check_prop = H5Pexist(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_LINK_NUM_TRUE_NAME); + if(io_option == H5D_MULTI_CHUNK_IO) { + new_value = 0; + if(H5Pset(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_MULTI_HARD_NAME, &new_value) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTSET, FAIL, "unable to set property value") + } /* end if */ + } /* end 
if */ + check_prop = H5Pexist(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_LINK_NUM_TRUE_NAME); if(check_prop > 0) { - if(io_option == H5D_ONE_LINK_CHUNK_IO_MORE_OPT) { - new_value = 0; - if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_LINK_NUM_TRUE_NAME,&new_value)<0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to get property value"); - } - } - check_prop = H5Pexist(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_LINK_NUM_FALSE_NAME); + if(io_option == H5D_ONE_LINK_CHUNK_IO_MORE_OPT) { + new_value = 0; + if(H5Pset(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_LINK_NUM_TRUE_NAME, &new_value) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTSET, FAIL, "unable to set property value") + } /* end if */ + } /* end if */ + check_prop = H5Pexist(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_LINK_NUM_FALSE_NAME); if(check_prop > 0) { - if(temp_not_link_io){ - new_value = 0; - if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_LINK_NUM_FALSE_NAME,&new_value)<0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to get property value"); - } - } -#endif - -#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS - if(io_option == H5D_ONE_LINK_CHUNK_IO ) - io_option = H5D_MULTI_CHUNK_IO ;/* We can not do this with one chunk IO. 
*/ - if(io_option == H5D_ONE_LINK_CHUNK_IO_MORE_OPT) - io_option = H5D_MULTI_CHUNK_IO_MORE_OPT; + if(temp_not_link_io) { + new_value = 0; + if(H5Pset(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_LINK_NUM_FALSE_NAME, &new_value) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTSET, FAIL, "unable to set property value") + } /* end if */ + } /* end if */ +} #endif /* step 2: Go ahead to do IO.*/ +#ifdef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS if(io_option == H5D_ONE_LINK_CHUNK_IO || io_option == H5D_ONE_LINK_CHUNK_IO_MORE_OPT) { - if(H5D_link_chunk_collective_io(io_info,fm,buf,do_write,sum_chunk)<0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish linked chunk MPI-IO"); - } - else if(io_option == H5D_MULTI_CHUNK_IO) { - if(H5D_multi_chunk_collective_io_no_opt(io_info,fm,buf,do_write)<0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish multiple chunk MPI-IO"); - } + if(H5D_link_chunk_collective_io(io_info, type_info, fm, sum_chunk) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish linked chunk MPI-IO") + } /* end if */ + else +#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */ + if(io_option == H5D_MULTI_CHUNK_IO) { + if(H5D_multi_chunk_collective_io_no_opt(io_info, type_info, fm, dx_plist) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish multiple chunk MPI-IO") + } /* end if */ else { /*multiple chunk IOs with opt */ - if(H5D_multi_chunk_collective_io(io_info,fm,buf,do_write)<0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish multiple chunk MPI-IO"); - } + if(H5D_multi_chunk_collective_io(io_info, type_info, fm, dx_plist) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish optimized multiple chunk MPI-IO") + } /* end else */ done: FUNC_LEAVE_NOAPI(ret_value) @@ -791,459 +707,583 @@ done: /*------------------------------------------------------------------------- - * Function: H5D_link_chunk_collective_io + * Function: H5D_chunk_collective_read * - * Purpose: Routine for one collective IO with one MPI derived datatype to link 
with all chunks + * Purpose: Reads directly from chunks in file into application memory + * using collective I/O. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Quincey Koziol + * Tuesday, March 4, 2008 + * + *------------------------------------------------------------------------- + */ +herr_t +H5D_chunk_collective_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, + hsize_t UNUSED nelmts, const H5S_t UNUSED *file_space, const H5S_t UNUSED *mem_space, + H5D_chunk_map_t *fm) +{ + herr_t ret_value = SUCCEED; /* Return value */ - 1. Sort the chunk address and chunk info - 2. Build up MPI derived datatype for each chunk - 3. Build up the final MPI derived datatype - 4. Use common collective IO routine to do MPI-IO + FUNC_ENTER_NOAPI(H5D_chunk_collective_read, FAIL) + /* Call generic selection operation */ + if(H5D_chunk_collective_io(io_info, type_info, fm) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_READERROR, FAIL, "read error") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_chunk_collective_read() */ + + +/*------------------------------------------------------------------------- + * Function: H5D_chunk_collective_write * - * Return: Non-negative on success/Negative on failure + * Purpose: Write directly to chunks in file from application memory + * using collective I/O. 
* - * Programmer: + * Return: Non-negative on success/Negative on failure * - * Modifications: + * Programmer: Quincey Koziol + * Tuesday, March 4, 2008 * *------------------------------------------------------------------------- */ +herr_t +H5D_chunk_collective_write(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, + hsize_t UNUSED nelmts, const H5S_t UNUSED *file_space, const H5S_t UNUSED *mem_space, + H5D_chunk_map_t *fm) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(H5D_chunk_collective_write, FAIL) + + /* Call generic selection operation */ + if(H5D_chunk_collective_io(io_info, type_info, fm) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_WRITEERROR, FAIL, "write error") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_chunk_collective_write() */ +#ifdef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + +/*------------------------------------------------------------------------- + * Function: H5D_link_chunk_collective_io + * + * Purpose: Routine for one collective IO with one MPI derived datatype to link with all chunks + * + * 1. Sort the chunk address and chunk info + * 2. Build up MPI derived datatype for each chunk + * 3. Build up the final MPI derived datatype + * 4. Use common collective IO routine to do MPI-IO + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Muqun Yang + * Monday, Feb. 
13th, 2006 + * + *------------------------------------------------------------------------- + */ static herr_t -H5D_link_chunk_collective_io(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf, hbool_t do_write,int sum_chunk) +H5D_link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, + H5D_chunk_map_t *fm, int sum_chunk) { - size_t src_type_size; /*size of source type */ - size_t dst_type_size; /*size of destination type*/ - hsize_t mpi_buf_extra_offset; - hsize_t mpi_file_extra_offset; - size_t mpi_buf_count; - size_t mpi_file_count; - hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */ - mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */ - - int mpi_size,mpi_code; /* MPI return code */ - - int i,num_chunk=0,total_chunks; - size_t ori_num_chunk; - hsize_t ori_total_chunks; - haddr_t chunk_base_addr; - haddr_t* total_chunk_addr_array=NULL; - MPI_Datatype *chunk_mtype=NULL; - MPI_Datatype *chunk_ftype=NULL; - MPI_Datatype chunk_final_mtype; - MPI_Datatype chunk_final_ftype; - MPI_Aint *chunk_disp_array=NULL; - MPI_Aint *chunk_mem_disp_array=NULL; - int *blocklen=NULL; - int blocklen_value; - int actual_bsearch_coll_chunk_threshold; - int bsearch_coll_chunk_threshold; - int many_chunk_opt = 0; - - H5D_common_coll_info_t coll_info; - H5D_chunk_addr_info_t* chunk_addr_info_array=NULL; - - herr_t ret_value = SUCCEED; - - FUNC_ENTER_NOAPI_NOINIT(H5D_link_chunk_collective_io) - ori_total_chunks = fm->total_chunks; - H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int); - - /* Handle with a special case when only one chunk is covered by all processes */ - if(total_chunks == 1){ - H5SL_node_t *chunk_node; - H5D_chunk_info_t *chunk_info; - H5D_storage_t store; + H5D_chunk_addr_info_t *chunk_addr_info_array = NULL; + hbool_t mbt_is_derived = FALSE; + hbool_t mft_is_derived = FALSE; + MPI_Datatype chunk_final_mtype; /* Final memory MPI datatype for all 
chunks with seletion */ + MPI_Datatype chunk_final_ftype; /* Final file MPI datatype for all chunks with seletion */ + H5D_storage_t ctg_store; /* Storage info for "fake" contiguous dataset */ + size_t total_chunks; + haddr_t *total_chunk_addr_array = NULL; + MPI_Datatype *chunk_mtype = NULL; + MPI_Datatype *chunk_ftype = NULL; + MPI_Aint *chunk_disp_array = NULL; + MPI_Aint *chunk_mem_disp_array = NULL; + int *blocklen = NULL; + int mpi_code; /* MPI return code */ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_link_chunk_collective_io) + + /* Get the sum # of chunks, if not already available */ + if(sum_chunk < 0) { + if(H5D_mpio_get_sum_chunk(io_info, fm, &sum_chunk) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes"); + } /* end if */ + + /* Retrieve total # of chunks in dataset */ + H5_ASSIGN_OVERFLOW(total_chunks, fm->total_chunks, hsize_t, size_t); + + /* Handle special case when dataspace dimensions only allow one chunk in + * the dataset. 
[This sometimes is used by developers who want the + * equivalent of compressed contiguous datasets - QAK] + */ + if(total_chunks == 1) { + H5D_storage_t chk_store; /* Temporary storage info for chunk address lookup */ + hsize_t coords[H5O_LAYOUT_NDIMS]; /* Coordinates of chunk in file dataset's dataspace */ + H5SL_node_t *chunk_node; /* Pointer to chunk node for selection */ + H5S_t *fspace; /* Dataspace describing chunk & selection in it */ + H5S_t *mspace; /* Dataspace describing selection in memory corresponding to this chunk */ + + /* Initialize the chunk coordinates */ + /* (must be all zero, since there's only one chunk) */ + HDmemset(coords, 0, sizeof(coords)); + + /* Look up address of chunk */ + io_info->store = &chk_store; + chk_store.chunk.offset = coords; + chk_store.chunk.index = 0; + if(HADDR_UNDEF == (ctg_store.contig.dset_addr = H5D_istore_get_addr(io_info, NULL))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk info from skipped list") + /* Check for this process having selection in this chunk */ chunk_node = H5SL_first(fm->sel_chunks); if(chunk_node == NULL) { - if(H5D_istore_chunkmap(io_info, &chunk_base_addr, fm->down_chunks) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); - if(H5D_inter_collective_io(io_info,NULL,NULL,chunk_base_addr,buf,do_write)<0) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); - } + /* Set the dataspace info for I/O to NULL, this process doesn't have any I/O to perform */ + fspace = mspace = NULL; + } /* end if */ else { - if(NULL ==(chunk_info = H5SL_item(chunk_node))) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); - io_info->store = &store; - store.chunk.offset = chunk_info->coords; - store.chunk.index = chunk_info->index; + H5D_chunk_info_t *chunk_info; + + /* Get the chunk info, for the selection in the chunk */ + if(NULL == (chunk_info = H5SL_item(chunk_node))) + 
HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk info from skipped list") + + /* Set the dataspace info for I/O */ + fspace = chunk_info->fspace; + mspace = chunk_info->mspace; + } /* end else */ + + /* Set up the base storage address for this chunk */ + io_info->store = &ctg_store; - if(HADDR_UNDEF==(chunk_base_addr = H5D_istore_get_addr(io_info,NULL))) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); - #ifdef H5D_DEBUG - if(H5DEBUG(D)) - HDfprintf(H5DEBUG(D),"before inter_collective_io for total chunk = 1 \n"); +if(H5DEBUG(D)) + HDfprintf(H5DEBUG(D),"before inter_collective_io for total chunk = 1 \n"); #endif - if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,chunk_base_addr,buf,do_write)<0) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); - } - goto done; - } - /* Allocate chunking information */ - ori_num_chunk = H5SL_count(fm->sel_chunks); - H5_ASSIGN_OVERFLOW(num_chunk,ori_num_chunk,size_t,int); + /* Perform I/O */ + if(H5D_inter_collective_io(io_info, type_info, fspace, mspace) < 0) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't finish shared collective MPI-IO") + } /* end if */ + else { + size_t mpi_buf_count; /* Number of MPI types */ + size_t num_chunk; /* Number of chunks for this process */ + size_t u; /* Local index variable */ + + /* Get the number of chunks with a selection */ + num_chunk = H5SL_count(fm->sel_chunks); + H5_CHECK_OVERFLOW(num_chunk, size_t, int); #ifdef H5D_DEBUG - if(H5DEBUG(D)) - HDfprintf(H5DEBUG(D),"total_chunks = %d\n",(int)total_chunks); +if(H5DEBUG(D)) + HDfprintf(H5DEBUG(D),"total_chunks = %Zu, num_chunk = %Zu\n", total_chunks, num_chunk); #endif - - if(num_chunk == 0) - total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks); - else - { - chunk_addr_info_array= H5MM_malloc(num_chunk*sizeof(H5D_chunk_addr_info_t)); - chunk_mtype = H5MM_malloc(num_chunk*sizeof(MPI_Datatype)); - 
chunk_ftype = H5MM_malloc(num_chunk*sizeof(MPI_Datatype)); - chunk_disp_array = H5MM_malloc(num_chunk*sizeof(MPI_Aint)); - chunk_mem_disp_array = H5MM_calloc(num_chunk*sizeof(MPI_Aint)); - blocklen = H5MM_malloc(num_chunk*sizeof(int)); - } - - /* Obtain information to do collective IO, - in order to do collective IO, no datatype conversion should happen. */ - if((src_type_size = H5T_get_size(io_info->dset->shared->type))==0) - HGOTO_ERROR(H5E_DATATYPE, H5E_BADSIZE, FAIL, "datatype size invalid"); - dst_type_size = src_type_size; - - bsearch_coll_chunk_threshold = H5D_ALL_CHUNK_ADDR_THRES_COL; - - if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0) - HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size"); - - /* Calculate the actual threshold to obtain all chunk addresses collectively - The bigger this number is, the more possible the use of obtaining chunk address collectively. */ - /* For non-optimization one-link IO, - actual bsearch threshold is always 0, - we would always want to obtain the chunk addresses individually - for each process. 
*/ - actual_bsearch_coll_chunk_threshold = sum_chunk*100/(total_chunks*mpi_size); - - if((actual_bsearch_coll_chunk_threshold > bsearch_coll_chunk_threshold) - &&(sum_chunk/mpi_size >= H5D_ALL_CHUNK_ADDR_THRES_COL_NUM)) - many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_COL; + + /* Set up MPI datatype for chunks selected */ + if(num_chunk) { + hsize_t mpi_mem_extra_offset; /* Extra offset for memory MPI datatype */ + hsize_t mpi_file_extra_offset; /* Extra offset for file MPI datatype */ + size_t mpi_mem_count; /* Memory MPI datatype count */ + size_t mpi_file_count; /* File MPI datatype count */ + hbool_t locl_mbt_is_derived = FALSE, /* Whether the buffer (memory) type is derived and needs to be free'd */ + local_mft_is_derived = FALSE; /* Whether the file type is derived and needs to be free'd */ + int blocklen_value; /* Placeholder for array fill */ + + /* Allocate chunking information */ + chunk_addr_info_array= H5MM_malloc(num_chunk * sizeof(H5D_chunk_addr_info_t)); + chunk_mtype = H5MM_malloc(num_chunk * sizeof(MPI_Datatype)); + chunk_ftype = H5MM_malloc(num_chunk * sizeof(MPI_Datatype)); + chunk_disp_array = H5MM_malloc(num_chunk * sizeof(MPI_Aint)); + chunk_mem_disp_array = H5MM_calloc(num_chunk * sizeof(MPI_Aint)); + blocklen = H5MM_malloc(num_chunk * sizeof(int)); #ifdef H5D_DEBUG - if(H5DEBUG(D)) +if(H5DEBUG(D)) HDfprintf(H5DEBUG(D),"before sorting the chunk address \n"); #endif - - /* Sort the chunk address - when chunk optimization selection is either H5D_OBTAIN_*/ - - if(num_chunk == 0){ /* special case: this process doesn't select anything */ - if(H5D_istore_chunkmap(io_info, total_chunk_addr_array, fm->down_chunks)<0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); - chunk_base_addr = total_chunk_addr_array[0]; - } - - else { - if(H5D_sort_chunk(io_info,fm,chunk_addr_info_array,many_chunk_opt)<0) - HGOTO_ERROR (H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to sort chunk address"); - chunk_base_addr = 
chunk_addr_info_array[0].chunk_addr; - } + /* Sort the chunk address */ + if(H5D_sort_chunk(io_info, fm, chunk_addr_info_array, sum_chunk) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to sort chunk address") + ctg_store.contig.dset_addr = chunk_addr_info_array[0].chunk_addr; #ifdef H5D_DEBUG - if(H5DEBUG(D)) +if(H5DEBUG(D)) HDfprintf(H5DEBUG(D),"after sorting the chunk address \n"); #endif - - /* Obtain MPI derived datatype from all individual chunks */ - for ( i = 0; i < num_chunk; i++) { - /* Disk MPI derived datatype */ - if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.fspace,src_type_size,&chunk_ftype[i], - &mpi_file_count,&mpi_file_extra_offset,&mft_is_derived)<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type"); - - /* Buffer MPI derived datatype */ - if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.mspace,dst_type_size,&chunk_mtype[i], - &mpi_buf_count,&mpi_buf_extra_offset,&mbt_is_derived)<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buf type"); - - /* Chunk address relative to the first chunk */ - chunk_addr_info_array[i].chunk_addr -= chunk_base_addr; - H5_ASSIGN_OVERFLOW(chunk_disp_array[i],chunk_addr_info_array[i].chunk_addr,haddr_t,MPI_Aint); - } - - blocklen_value = 1; - if(num_chunk){ - - /* initialize the buffer with the constant value 1; this algo. is very fast. 
*/ - H5V_array_fill(blocklen,&blocklen_value,sizeof(int),(size_t)num_chunk); - - /* Create final MPI derived datatype */ - if(MPI_SUCCESS != (mpi_code = MPI_Type_struct(num_chunk,blocklen,chunk_disp_array,chunk_ftype,&chunk_final_ftype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code); - if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_ftype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); - - if(MPI_SUCCESS != (mpi_code = MPI_Type_struct(num_chunk,blocklen,chunk_mem_disp_array,chunk_mtype,&chunk_final_mtype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code); - if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_mtype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); - - for ( i = 0; i< num_chunk;i++){ - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( chunk_mtype+i ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( chunk_ftype+i ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - } - - /* buffer, file derived datatypes should be true */ - coll_info.mbt_is_derived = 1; - coll_info.mft_is_derived = 1; - coll_info.mpi_buf_count = 1; - coll_info.chunk_addr = chunk_base_addr; - - } - - else {/* no selection at all for this process */ - chunk_final_ftype = MPI_BYTE; - chunk_final_mtype = MPI_BYTE; - - /* buffer, file derived datatypes should be true */ - coll_info.mbt_is_derived = 0; - coll_info.mft_is_derived = 0; - coll_info.mpi_buf_count = 0; - coll_info.chunk_addr = chunk_base_addr; - } + + /* Obtain MPI derived datatype from all individual chunks */ + for(u = 0; u < num_chunk; u++) { + /* Disk MPI derived datatype */ + if(H5S_mpio_space_type(chunk_addr_info_array[u].chunk_info.fspace, + type_info->src_type_size, &chunk_ftype[u], &mpi_file_count, + &mpi_file_extra_offset, &local_mft_is_derived) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI file type") + + /* Buffer MPI derived datatype */ + 
if(H5S_mpio_space_type(chunk_addr_info_array[u].chunk_info.mspace, + type_info->dst_type_size, &chunk_mtype[u], &mpi_mem_count, + &mpi_mem_extra_offset, &locl_mbt_is_derived) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI buf type") + + /* Chunk address relative to the first chunk */ + chunk_addr_info_array[u].chunk_addr -= ctg_store.contig.dset_addr; + H5_ASSIGN_OVERFLOW(chunk_disp_array[u], chunk_addr_info_array[u].chunk_addr, haddr_t, MPI_Aint); + } /* end for */ + + /* Initialize the buffer with the constant value 1 */ + blocklen_value = 1; + H5V_array_fill(blocklen, &blocklen_value, sizeof(int), num_chunk); + + /* Create final MPI derived datatype for the file */ + if(MPI_SUCCESS != (mpi_code = MPI_Type_struct((int)num_chunk, blocklen, chunk_disp_array, chunk_ftype, &chunk_final_ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code) + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) + + /* Create final MPI derived datatype for memory */ + if(MPI_SUCCESS != (mpi_code = MPI_Type_struct(num_chunk, blocklen, chunk_mem_disp_array, chunk_mtype, &chunk_final_mtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code) + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_mtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) + + /* Free the file & memory MPI datatypes for each chunk */ + for(u = 0; u < num_chunk; u++) { + if(MPI_SUCCESS != (mpi_code = MPI_Type_free(chunk_mtype + u))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + + if(MPI_SUCCESS != (mpi_code = MPI_Type_free(chunk_ftype + u))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + } /* end for */ + + /* buffer, file derived datatypes should be true */ + mbt_is_derived = TRUE; + mft_is_derived = TRUE; + mpi_buf_count = (size_t)1; + } /* end if */ + else { /* no selection at all for this process */ + /* Allocate chunking information */ + 
total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t) * total_chunks); + + /* Retrieve chunk address map */ + if(H5D_istore_chunkmap(io_info, total_chunk_addr_array, fm->down_chunks) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address") + + /* Get chunk with lowest address */ + ctg_store.contig.dset_addr = HADDR_MAX; + for(u = 0; u < total_chunks; u++) + if(total_chunk_addr_array[u] < ctg_store.contig.dset_addr) + ctg_store.contig.dset_addr = total_chunk_addr_array[u]; + HDassert(ctg_store.contig.dset_addr != HADDR_MAX); + + /* Set the MPI datatype */ + chunk_final_ftype = MPI_BYTE; + chunk_final_mtype = MPI_BYTE; + + /* buffer, file derived datatypes should be true */ + mpi_buf_count = (size_t)0; + } /* end else */ #ifdef H5D_DEBUG - if(H5DEBUG(D)) +if(H5DEBUG(D)) HDfprintf(H5DEBUG(D),"before coming to final collective IO\n"); #endif - if(H5D_final_collective_io(io_info,&chunk_final_ftype,&chunk_final_mtype,&coll_info,buf,do_write)<0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish MPI-IO"); + /* Set up the base storage address for this chunk */ + io_info->store = &ctg_store; + + /* Perform final collective I/O operation */ + if(H5D_final_collective_io(io_info, type_info, mpi_buf_count, &chunk_final_ftype, &chunk_final_mtype) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish MPI-IO") + } /* end else */ done: #ifdef H5D_DEBUG - if(H5DEBUG(D)) - HDfprintf(H5DEBUG(D),"before freeing memory inside H5D_link_collective_io ret_value = %d\n",ret_value); +if(H5DEBUG(D)) + HDfprintf(H5DEBUG(D),"before freeing memory inside H5D_link_collective_io ret_value = %d\n", ret_value); #endif - if (fm->total_chunks != 1) { - if(num_chunk == 0) HDfree(total_chunk_addr_array); - else { - HDfree(chunk_addr_info_array); - HDfree(chunk_mtype); - HDfree(chunk_ftype); - HDfree(chunk_disp_array); - HDfree(chunk_mem_disp_array); - HDfree(blocklen); - } - } - FUNC_LEAVE_NOAPI(ret_value) + if(total_chunk_addr_array) + 
H5MM_xfree(total_chunk_addr_array); + if(chunk_addr_info_array) + H5MM_xfree(chunk_addr_info_array); + if(chunk_mtype) + H5MM_xfree(chunk_mtype); + if(chunk_ftype) + H5MM_xfree(chunk_ftype); + if(chunk_disp_array) + H5MM_xfree(chunk_disp_array); + if(chunk_mem_disp_array) + H5MM_xfree(chunk_mem_disp_array); + if(blocklen) + H5MM_xfree(blocklen); + + /* Free the MPI buf and file types, if they were derived */ + if(mbt_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&chunk_final_mtype))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + if(mft_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&chunk_final_ftype))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + + FUNC_LEAVE_NOAPI(ret_value) } /* end H5D_link_chunk_collective_io */ +#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */ /*------------------------------------------------------------------------- * Function: H5D_multi_chunk_collective_io * * Purpose: To do IO per chunk according to IO mode(collective/independent/none) - - 1. Use MPI_gather and MPI_Bcast to obtain IO mode in each chunk(collective/independent/none) - 2. Depending on whether the IO mode is collective or independent or none, - Create either MPI derived datatype for each chunk or just do independent IO - 3. Use common collective IO routine to do MPI-IO * - * Return: Non-negative on success/Negative on failure + * 1. Use MPI_gather and MPI_Bcast to obtain IO mode in each chunk(collective/independent/none) + * 2. Depending on whether the IO mode is collective or independent or none, + * Create either MPI derived datatype for each chunk or just do independent IO + * 3. Use common collective IO routine to do MPI-IO * - * Programmer: + * Return: Non-negative on success/Negative on failure * - * Modifications: + * Programmer: Muqun Yang + * Monday, Feb. 
13th, 2006 * *------------------------------------------------------------------------- */ static herr_t -H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf, hbool_t do_write) +H5D_multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, + H5D_chunk_map_t *fm, H5P_genplist_t *dx_plist) { - unsigned i, total_chunk; - hsize_t ori_total_chunk; - uint8_t *chunk_io_option; - - H5SL_node_t *chunk_node; /* Current node in chunk skip list */ - H5D_chunk_info_t *chunk_info=NULL; - haddr_t *chunk_addr; - H5D_storage_t store; /* union of EFL and chunk pointer in file space */ - hbool_t select_chunk; - hbool_t last_io_mode_coll = TRUE; - - void *chunk = NULL; /* Pointer to the data chunk in cache */ - H5D_t *dataset=io_info->dset;/* Local pointer to dataset info */ - H5D_istore_ud1_t udata; /*B-tree pass-through */ - haddr_t caddr; /* Address of the cached chunk */ - size_t accessed_bytes; /*total accessed size in a chunk */ - unsigned idx_hint=0; /* Cache index hint */ - hbool_t dirty = TRUE; /* Flag for cache flushing */ - hbool_t relax=TRUE; /* Whether whole chunk is selected */ - - herr_t ret_value = SUCCEED; + H5D_t *dataset = io_info->dset;/* Local pointer to dataset info */ + H5D_io_info_t ctg_io_info; /* Contiguous I/O info object */ + H5D_storage_t ctg_store; /* Chunk storage information as contiguous dataset */ + H5D_io_info_t cpt_io_info; /* Compact I/O info object */ + H5D_storage_t cpt_store; /* Chunk storage information as compact dataset */ + hbool_t cpt_dirty; /* Temporary placeholder for compact storage "dirty" flag */ + uint8_t *chunk_io_option = NULL; + haddr_t *chunk_addr = NULL; + H5D_storage_t store; /* union of EFL and chunk pointer in file space */ + H5FD_mpio_xfer_t last_xfer_mode = H5FD_MPIO_COLLECTIVE; /* Last parallel transfer for this request (H5D_XFER_IO_XFER_MODE_NAME) */ + H5FD_mpio_collective_opt_t last_coll_opt_mode = H5FD_MPIO_COLLECTIVE_IO; /* Last parallel transfer with 
independent IO or collective IO with this mode */ + size_t total_chunk; /* Total # of chunks in dataset */ #ifdef H5Dmpio_DEBUG - int mpi_rank; + int mpi_rank; #endif + size_t u; /* Local index variable */ + herr_t ret_value = SUCCEED; - - FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io) + FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io) #ifdef H5Dmpio_DEBUG - mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file); + mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file); #endif - /* Allocate memories */ - ori_total_chunk = fm->total_chunks; - H5_ASSIGN_OVERFLOW(total_chunk,ori_total_chunk,hsize_t,unsigned); - HDassert(total_chunk!=0); - chunk_io_option = (uint8_t *)H5MM_calloc(total_chunk*sizeof(MPI_BYTE)); - chunk_addr = (haddr_t *)H5MM_calloc(total_chunk*sizeof(haddr_t)); + /* Retrieve total # of chunks in dataset */ + H5_ASSIGN_OVERFLOW(total_chunk, fm->total_chunks, hsize_t, size_t); + HDassert(total_chunk != 0); + + /* Allocate memories */ + chunk_io_option = (uint8_t *)H5MM_calloc(total_chunk); + chunk_addr = (haddr_t *)H5MM_calloc(total_chunk * sizeof(haddr_t)); #ifdef H5D_DEBUG - if(H5DEBUG(D)) - HDfprintf(H5DEBUG(D),"total_chunk %u\n",total_chunk); +if(H5DEBUG(D)) + HDfprintf(H5DEBUG(D), "total_chunk %Zu\n", total_chunk); #endif - /* obtain IO option for each chunk */ - if(H5D_obtain_mpio_mode(io_info,fm,chunk_io_option,chunk_addr)<0) - HGOTO_ERROR (H5E_DATASET, H5E_CANTRECV, FAIL, "unable to obtain MPIO mode"); + /* Obtain IO option for each chunk */ + if(H5D_obtain_mpio_mode(io_info, fm, dx_plist, chunk_io_option, chunk_addr) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTRECV, FAIL, "unable to obtain MPIO mode") + + /* Set up contiguous I/O info object */ + HDmemcpy(&ctg_io_info, io_info, sizeof(ctg_io_info)); + ctg_io_info.store = &ctg_store; + ctg_io_info.layout_ops = *H5D_LOPS_CONTIG; + + /* Initialize temporary contiguous storage info */ + ctg_store.contig.dset_size = (hsize_t)io_info->dset->shared->layout.u.chunk.size; + + /* Set up compact I/O 
info object */ + HDmemcpy(&cpt_io_info, io_info, sizeof(cpt_io_info)); + cpt_io_info.store = &cpt_store; + cpt_io_info.layout_ops = *H5D_LOPS_COMPACT; + + /* Initialize temporary compact storage info */ + cpt_store.compact.dirty = &cpt_dirty; + + /* Set dataset storage for I/O info */ + io_info->store = &store; + + /* Loop over _all_ the chunks */ + for(u = 0; u < total_chunk; u++) { + H5D_chunk_info_t *chunk_info; /* Chunk info for current chunk */ + H5S_t *fspace; /* Dataspace describing chunk & selection in it */ + H5S_t *mspace; /* Dataspace describing selection in memory corresponding to this chunk */ - for(i = 0; i < total_chunk; i++) { #ifdef H5D_DEBUG - if(H5DEBUG(D)) - HDfprintf(H5DEBUG(D),"mpi_rank = %d, chunk index = %u\n",mpi_rank,i); +if(H5DEBUG(D)) + HDfprintf(H5DEBUG(D),"mpi_rank = %d, chunk index = %Zu\n", mpi_rank, u); #endif - select_chunk = fm->select_chunk[i]; - if(select_chunk == 1){/* Have selection elements in this chunk. Find the chunk info. */ - if(NULL ==(chunk_node = H5SL_first(fm->sel_chunks))) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list"); - - while(chunk_node){ - if(NULL ==(chunk_info = H5SL_item(chunk_node))) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); - if(chunk_info->index == i) { - /* Set dataset storage for I/O info */ - io_info->store=&store; - /* Pass in chunk's coordinates in a union. 
*/ - store.chunk.offset = chunk_info->coords; - store.chunk.index = chunk_info->index; - break; - } - - chunk_node = H5SL_next(chunk_node); - } - } - - if(chunk_io_option[i] == 1){ /*collective IO for this chunk, - note: even there is no selection for this process, - the process still needs to contribute MPI NONE TYPE.*/ + /* Get the chunk info for this chunk, if there are elements selected */ + chunk_info = fm->select_chunk[u]; + + /* Set the storage information for chunks with selections */ + if(chunk_info) { + HDassert(chunk_info->index == u); + + /* Pass in chunk's coordinates in a union. */ + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + } /* end if */ + + /* Collective IO for this chunk, + * Note: even there is no selection for this process, the process still + * needs to contribute MPI NONE TYPE. + */ + if(chunk_io_option[u] == 1) { #ifdef H5D_DEBUG - if(H5DEBUG(D)) - HDfprintf(H5DEBUG(D),"inside collective chunk IO mpi_rank = %d, chunk index = %u\n",mpi_rank,i); +if(H5DEBUG(D)) + HDfprintf(H5DEBUG(D),"inside collective chunk IO mpi_rank = %d, chunk index = %Zu\n", mpi_rank, u); #endif - if(!last_io_mode_coll) - /* Switch back to collective I/O */ - if(H5D_ioinfo_make_coll(io_info) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O") + /* Set the file & memory dataspaces */ + if(chunk_info) { + fspace = chunk_info->fspace; + mspace = chunk_info->mspace; + } /* end if */ + else { + fspace = mspace = NULL; + } /* end else */ + + /* Switch back to collective I/O */ + if(last_xfer_mode != H5FD_MPIO_COLLECTIVE) { + if(H5D_ioinfo_xfer_mode(io_info, dx_plist, H5FD_MPIO_COLLECTIVE) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O") + last_xfer_mode = H5FD_MPIO_COLLECTIVE; + } /* end if */ + if(last_coll_opt_mode != H5FD_MPIO_COLLECTIVE_IO) { + if(H5D_ioinfo_coll_opt_mode(io_info, dx_plist, H5FD_MPIO_COLLECTIVE_IO) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, 
"can't switch to collective I/O") + last_coll_opt_mode = H5FD_MPIO_COLLECTIVE_IO; + } /* end if */ - if(select_chunk){ - if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace, - chunk_addr[i],buf,do_write )<0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); - - } - else{ - if(H5D_inter_collective_io(io_info,NULL,NULL, - chunk_addr[i],buf,do_write )<0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); - - } - last_io_mode_coll = TRUE; - - } - else {/*possible independent IO for this chunk*/ + /* Initialize temporary contiguous storage address */ + ctg_store.contig.dset_addr = chunk_addr[u]; + + /* Perform the I/O */ + if(H5D_inter_collective_io(&ctg_io_info, type_info, fspace, mspace) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish shared collective MPI-IO") + } /* end if */ + else { /* possible independent IO for this chunk */ #ifdef H5D_DEBUG - if(H5DEBUG(D)) - HDfprintf(H5DEBUG(D),"inside independent IO mpi_rank = %d, chunk index = %u\n",mpi_rank,i); +if(H5DEBUG(D)) + HDfprintf(H5DEBUG(D),"inside independent IO mpi_rank = %d, chunk index = %Zu\n", mpi_rank, u); #endif - HDassert(chunk_io_option[i] == 0); + HDassert(chunk_io_option[u] == 0); #if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) - if(!select_chunk) - continue; /* this process has nothing to do with this chunk, continue! 
*/ - if(last_io_mode_coll) + /* Check if this process has somethign to do with this chunk */ + if(chunk_info) { + H5D_io_info_t *chk_io_info; /* Pointer to I/O info object for this chunk */ + H5D_istore_ud1_t udata; /* B-tree pass-through */ + void *chunk; /* Pointer to the data chunk in cache */ + size_t accessed_bytes; /* Total accessed size in a chunk */ + unsigned idx_hint = 0; /* Cache index hint */ + haddr_t caddr; /* Address of the cached chunk */ + /* Switch to independent I/O */ - if(H5D_ioinfo_make_ind(io_info) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") - - /* Load the chunk into cache. But if the whole chunk is written, - * simply allocate space instead of load the chunk. */ - if(HADDR_UNDEF==(caddr = H5D_istore_get_addr(io_info, &udata))) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); - - if(H5D_istore_if_load(io_info, caddr)) { - accessed_bytes = chunk_info->chunk_points * H5T_get_size(dataset->shared->type); - if((do_write && (accessed_bytes != dataset->shared->layout.u.chunk.size)) || !do_write) - relax=FALSE; - - if(NULL == (chunk = H5D_istore_lock(io_info, &udata, relax, &idx_hint))) - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to read raw data chunk") - } else - chunk = NULL; - - if(do_write) { - if((io_info->ops.write)(io_info, - chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type), - chunk_info->fspace,chunk_info->mspace,caddr,chunk, buf) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed") - } - else { - if((io_info->ops.read)(io_info, - chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type), - chunk_info->fspace,chunk_info->mspace,caddr,chunk, buf) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed") - } + if(last_xfer_mode != H5FD_MPIO_INDEPENDENT) { + if(H5D_ioinfo_xfer_mode(io_info, dx_plist, H5FD_MPIO_INDEPENDENT) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't 
switch to independent I/O") + last_xfer_mode = H5FD_MPIO_INDEPENDENT; + } /* end if */ - /* Release the cache lock on the chunk. */ - if(chunk) { - if(!do_write) - dirty = FALSE; + /* Load the chunk into cache. But if the whole chunk is written, + * simply allocate space instead of load the chunk. + */ + if(HADDR_UNDEF == (caddr = H5D_istore_get_addr(io_info, &udata))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk info from skipped list") - if(H5D_istore_unlock(io_info, dirty, idx_hint, chunk, accessed_bytes) < 0) - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to unlock raw data chunk") - } /* end if */ -#else - if(!last_io_mode_coll) - /* using independent I/O with file setview.*/ - if(H5D_ioinfo_make_coll_opt(io_info) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O") - if(select_chunk){ - if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace, - chunk_addr[i],buf,do_write )<0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); - } + /* Load the chunk into cache and lock it. 
*/ + if(H5D_chunk_cacheable(io_info, caddr)) { + hbool_t entire_chunk = TRUE; /* Whether whole chunk is selected */ + + /* Compute # of bytes accessed in chunk */ + accessed_bytes = chunk_info->chunk_points * type_info->src_type_size; + + /* Determine if we will access all the data in the chunk */ + if(((io_info->op_type == H5D_IO_OP_WRITE) && (accessed_bytes != ctg_store.contig.dset_size)) + || (io_info->op_type != H5D_IO_OP_WRITE)) + entire_chunk = FALSE; + + /* Lock the chunk into the cache */ + if(NULL == (chunk = H5D_istore_lock(io_info, &udata, entire_chunk, &idx_hint))) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to read raw data chunk") + + /* Set up the storage buffer information for this chunk */ + cpt_store.compact.buf = chunk; + + /* Point I/O info at contiguous I/O info for this chunk */ + chk_io_info = &cpt_io_info; + } /* end if */ + else { + /* Set up the storage address information for this chunk */ + ctg_store.contig.dset_addr = caddr; + + /* No chunk cached */ + chunk = NULL; + + /* Point I/O info at temporary I/O info for this chunk */ + chk_io_info = &ctg_io_info; + } /* end else */ + + if(io_info->op_type == H5D_IO_OP_WRITE) { + if((io_info->io_ops.single_write)(chk_io_info, type_info, + (hsize_t)chunk_info->chunk_points, chunk_info->fspace, chunk_info->mspace) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed") + } /* end if */ + else { + if((io_info->io_ops.single_read)(chk_io_info, type_info, + (hsize_t)chunk_info->chunk_points, chunk_info->fspace, chunk_info->mspace) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed") + } /* end else */ + + /* Release the cache lock on the chunk. 
*/ + if(chunk && H5D_istore_unlock(io_info, (io_info->op_type == H5D_IO_OP_WRITE), idx_hint, chunk, accessed_bytes) < 0) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to unlock raw data chunk") + } /* end if */ +#else /* !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) */ + /* Set the file & memory dataspaces */ + if(chunk_info) { + fspace = chunk_info->fspace; + mspace = chunk_info->mspace; + } /* end if */ else { - if(H5D_inter_collective_io(io_info,NULL,NULL, - chunk_addr[i],buf,do_write )<0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); - } + fspace = mspace = NULL; + } /* end else */ + + /* Using independent I/O with file setview.*/ + if(last_coll_opt_mode != H5FD_MPIO_INDIVIDUAL_IO) { + if(H5D_ioinfo_coll_opt_mode(io_info, dx_plist, H5FD_MPIO_INDIVIDUAL_IO) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to individual I/O") + last_coll_opt_mode = H5FD_MPIO_INDIVIDUAL_IO; + } /* end if */ + /* Initialize temporary contiguous storage address */ + ctg_store.contig.dset_addr = chunk_addr[u]; + + /* Perform the I/O */ + if(H5D_inter_collective_io(&ctg_io_info, type_info, fspace, mspace) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish shared collective MPI-IO") #ifdef H5D_DEBUG if(H5DEBUG(D)) HDfprintf(H5DEBUG(D),"after inter collective IO\n"); #endif -#endif - last_io_mode_coll = FALSE; - } - } - if(!last_io_mode_coll) - /* Switch back to collective I/O */ - if(H5D_ioinfo_make_coll(io_info) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O") +#endif /* !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) */ + } /* end else */ + } /* end for */ + done: - HDfree(chunk_io_option); - HDfree(chunk_addr); + if(chunk_io_option) + H5MM_xfree(chunk_io_option); + if(chunk_addr) + H5MM_xfree(chunk_addr); FUNC_LEAVE_NOAPI(ret_value) } /* end H5D_multi_chunk_collective_io */ @@ 
-1257,90 +1297,105 @@ done: * non-contiguous(or with holes) storage efficiently. * Under this case, the one independent IO call may consist of * many small disk IOs. So we may use independent IO with derived datatype - to replace the independent IO when we find this chunk is not good to - do collective IO. However, according to our performance study, - this approach may not overcome the overhead caused by MPI gather/scatter. - So we decide to leave the original collective IO per chunk approach as - an option for users. NO MPI gather/scatter calls are used. - HDF5 will try to collective IO if possible. - If users choose to use - H5Pset_dxpl_mpio_chunk_opt(dxpl_id,H5FD_MPIO_OPT_MULTI_IO), - this function will be called. - The HDF5 library won't do any IO management but leave it to MPI-IO to figure - out. + * to replace the independent IO when we find this chunk is not good to + * do collective IO. However, according to our performance study, + * this approach may not overcome the overhead caused by MPI gather/scatter. + * So we decide to leave the original collective IO per chunk approach as + * an option for users. NO MPI gather/scatter calls are used. + * HDF5 will try to collective IO if possible. + * If users choose to use + * H5Pset_dxpl_mpio_chunk_opt(dxpl_id,H5FD_MPIO_OPT_MULTI_IO), + * this function will be called. + * The HDF5 library won't do any IO management but leave it to MPI-IO to figure + * out. * * Return: Non-negative on success/Negative on failure * - * Programmer: - * - * Modifications: + * Programmer: Muqun Yang + * Monday, Feb. 
13th, 2006 * *------------------------------------------------------------------------- */ static herr_t -H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf, hbool_t do_write) +H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info, + const H5D_type_info_t *type_info, H5D_chunk_map_t *fm, H5P_genplist_t *dx_plist) { - int count_chunk,min_num_chunk; - haddr_t chunk_addr; - H5SL_node_t *chunk_node; /* Current node in chunk skip list */ - H5D_storage_t store; /* union of EFL and chunk pointer in file space */ - H5D_chunk_info_t *chunk_info; /* chunk information */ - hbool_t make_ind, make_coll; /* Flags to indicate that the MPI mode should change */ - - void *chunk = NULL; /* Pointer to the data chunk in cache */ - H5D_t *dataset=io_info->dset;/* Local pointer to dataset info */ - H5D_istore_ud1_t udata; /*B-tree pass-through */ - size_t accessed_bytes; /*total accessed size in a chunk */ - unsigned idx_hint=0; /* Cache index hint */ - hbool_t dirty = TRUE; /* Flag for cache flushing */ - hbool_t relax=TRUE; /* Whether whole chunk is selected */ - herr_t ret_value = SUCCEED; - -#ifdef H5Dmpio_DEBUG - int mpi_rank; -#endif + H5D_t *dataset = io_info->dset;/* Local pointer to dataset info */ + H5SL_node_t *chunk_node; /* Current node in chunk skip list */ + H5D_io_info_t ctg_io_info; /* Contiguous I/O info object */ + H5D_storage_t ctg_store; /* Chunk storage information as contiguous dataset */ + H5D_io_info_t cpt_io_info; /* Compact I/O info object */ + H5D_storage_t cpt_store; /* Chunk storage information as compact dataset */ + hbool_t cpt_dirty; /* Temporary placeholder for compact storage "dirty" flag */ + int min_chunk = -1; /* Minimum # of chunks all processes will operate on */ + int count_chunk; /* How many chunks have we operated on? 
*/ + H5D_storage_t store; /* union of EFL and chunk pointer in file space */ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io_no_opt) - FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io_no_opt) #ifdef H5D_DEBUG - if(H5DEBUG(D)){ +if(H5DEBUG(D)) { + int mpi_rank; + mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file); - HDfprintf(H5DEBUG(D),"coming to multi_chunk_collective_io_no_opt\n"); - } + HDfprintf(H5DEBUG(D), "coming to multi_chunk_collective_io_no_opt\n"); +} #endif - if(H5D_mpio_get_min_chunk(io_info,fm,&min_num_chunk)<0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunk"); - count_chunk = 0; - - /* Get first node in chunk skip list */ - chunk_node=H5SL_first(fm->sel_chunks); - - /* Iterate through chunks to be operated on */ - while(chunk_node) { - H5D_chunk_info_t *chunk_info; /* chunk information */ - hbool_t make_ind, make_coll; /* Flags to indicate that the MPI mode should change */ - - /* Get the actual chunk information from the skip list node */ - chunk_info=H5SL_item(chunk_node); - - /* Set dataset storage for I/O info */ - io_info->store=&store; - - /* Pass in chunk's coordinates in a union. 
*/ - store.chunk.offset = chunk_info->coords; - store.chunk.index = chunk_info->index; - - /* Reset flags for changing parallel I/O mode */ - make_ind = make_coll = FALSE; - - count_chunk++; - /* If the number of chunk is greater than minimum number of chunk, - Do independent read */ - if(count_chunk > min_num_chunk) { - /* Switch to independent I/O (permanently) */ - make_ind = TRUE; - } + /* Set up contiguous I/O info object */ + HDmemcpy(&ctg_io_info, io_info, sizeof(ctg_io_info)); + ctg_io_info.store = &ctg_store; + ctg_io_info.layout_ops = *H5D_LOPS_CONTIG; + + /* Initialize temporary contiguous storage info */ + ctg_store.contig.dset_size = (hsize_t)io_info->dset->shared->layout.u.chunk.size; + + /* Set up compact I/O info object */ + HDmemcpy(&cpt_io_info, io_info, sizeof(cpt_io_info)); + cpt_io_info.store = &cpt_store; + cpt_io_info.layout_ops = *H5D_LOPS_COMPACT; + + /* Initialize temporary compact storage info */ + cpt_store.compact.dirty = &cpt_dirty; + + /* Set dataset storage for I/O info */ + io_info->store = &store; + + /* Get the min. # of chunks */ + if(H5D_mpio_get_min_chunk(io_info, fm, &min_chunk) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunk") + HDassert(min_chunk >= 0); + + /* Get first node in chunk skip list */ + chunk_node = H5SL_first(fm->sel_chunks); + count_chunk = 0; + + /* Iterate through chunks to be operated on */ + while(chunk_node) { + H5D_chunk_info_t *chunk_info; /* chunk information */ + haddr_t chunk_addr; /* Address of chunk in file */ + H5D_istore_ud1_t udata; /* B-tree pass-through */ + hbool_t make_ind, make_coll; /* Flags to indicate that the MPI mode should change */ + /* Get the actual chunk information from the skip list node */ + chunk_info = H5SL_item(chunk_node); + + /* Pass in chunk's coordinates in a union. 
*/ + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + + /* Reset flags for changing parallel I/O mode */ + make_ind = make_coll = FALSE; + + count_chunk++; + + /* If the number of chunk is greater than minimum number of chunk, + * Do independent read. + */ + if(count_chunk > min_chunk) + /* Switch to independent I/O (permanently) */ + make_ind = TRUE; #ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS /* This case needs to be improved to check if the selected space is regular. If all selections are regular, collective IO can still be done. @@ -1349,67 +1404,92 @@ H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info,H5D_chunk_map_t *fm, we turn off this optimization but leave the following code for future optimization. Otherwise, the following else {} doesn't make sense. KY 2006/8/4/ */ - else { - /* Switch to independent I/O (temporarily) */ - make_ind = TRUE; - make_coll = TRUE; - } /* end else */ + else { + /* Switch to independent I/O (temporarily) */ + make_ind = TRUE; + make_coll = TRUE; + } /* end else */ #endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */ + /* Retrieve the chunk's address */ + if(HADDR_UNDEF == (chunk_addr = H5D_istore_get_addr(io_info, &udata))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list") + + /* Independent I/O */ + if(make_ind) { + void *chunk; /* Pointer to the data chunk in cache */ + H5D_io_info_t *chk_io_info; /* Pointer to I/O info object for this chunk */ + size_t accessed_bytes = 0; /* Total accessed size in a chunk */ + unsigned idx_hint = 0; /* Cache index hint */ + /* Switch to independent I/O */ - if(make_ind) - if(H5D_ioinfo_make_ind(io_info) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") + if(H5D_ioinfo_xfer_mode(io_info, dx_plist, H5FD_MPIO_INDEPENDENT) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") - if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info, &udata))) 
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + /* Load the chunk into cache and lock it. */ + if(H5D_chunk_cacheable(io_info, chunk_addr)) { + hbool_t entire_chunk = TRUE; /* Whether whole chunk is selected */ - if(make_ind) {/*independent I/O */ - /* Load the chunk into cache. But if the whole chunk is written, - * simply allocate space instead of load the chunk. */ - if(H5D_istore_if_load(io_info, chunk_addr)) { - accessed_bytes = chunk_info->chunk_points * H5T_get_size(dataset->shared->type); - if((do_write && (accessed_bytes != dataset->shared->layout.u.chunk.size)) || !do_write) - relax=FALSE; - - if(NULL == (chunk = H5D_istore_lock(io_info, &udata, relax, &idx_hint))) - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to read raw data chunk") - } else - chunk = NULL; - - if(do_write) { - if((io_info->ops.write)(io_info, - chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type), - chunk_info->fspace,chunk_info->mspace, chunk_addr, chunk, buf) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed") - } else { - if((io_info->ops.read)(io_info, - chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type), - chunk_info->fspace,chunk_info->mspace, chunk_addr, chunk, buf) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed") - } + /* Compute # of bytes accessed in chunk */ + accessed_bytes = chunk_info->chunk_points * type_info->src_type_size; - /* Release the cache lock on the chunk. 
*/ - if(chunk) { - if(!do_write) - dirty = FALSE; + /* Determine if we will access all the data in the chunk */ + if(((io_info->op_type == H5D_IO_OP_WRITE) && (accessed_bytes != ctg_store.contig.dset_size)) + || (io_info->op_type != H5D_IO_OP_WRITE)) + entire_chunk = FALSE; + + /* Lock the chunk into the cache */ + if(NULL == (chunk = H5D_istore_lock(io_info, &udata, entire_chunk, &idx_hint))) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to read raw data chunk") - if(H5D_istore_unlock(io_info, dirty, idx_hint, chunk, accessed_bytes) < 0) - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to unlock raw data chunk") - } /* end if */ + /* Set up the storage buffer information for this chunk */ + cpt_store.compact.buf = chunk; + + /* Point I/O info at contiguous I/O info for this chunk */ + chk_io_info = &cpt_io_info; } /* end if */ - else { /*collective I/O */ - if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace, - chunk_addr,buf,do_write ) < 0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); - } - - if(make_coll) - if(H5D_ioinfo_make_coll(io_info) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") - /* Get the next chunk node in the skip list */ - chunk_node=H5SL_next(chunk_node); - } /* end while */ + else { + /* Set up the storage address information for this chunk */ + ctg_store.contig.dset_addr = chunk_addr; + + /* No chunk cached */ + chunk = NULL; + + /* Point I/O info at temporary I/O info for this chunk */ + chk_io_info = &ctg_io_info; + } /* end else */ + + if(io_info->op_type == H5D_IO_OP_WRITE) { + if((io_info->io_ops.single_write)(chk_io_info, type_info, + (hsize_t)chunk_info->chunk_points, chunk_info->fspace, chunk_info->mspace) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed") + } /* end if */ + else { + if((io_info->io_ops.single_read)(chk_io_info, type_info, + (hsize_t)chunk_info->chunk_points, chunk_info->fspace, 
chunk_info->mspace) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed") + } /* end ese */ + + /* Release the cache lock on the chunk. */ + if(chunk) + if(H5D_istore_unlock(io_info, (io_info->op_type == H5D_IO_OP_WRITE), idx_hint, chunk, accessed_bytes) < 0) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to unlock raw data chunk") + } /* end if */ + else { /*collective I/O */ + /* Set up the storage address information for this chunk */ + ctg_store.contig.dset_addr = chunk_addr; + + if(H5D_inter_collective_io(&ctg_io_info, type_info, chunk_info->fspace, chunk_info->mspace) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO") + } /* end else */ + + if(make_coll) + if(H5D_ioinfo_xfer_mode(io_info, dx_plist, H5FD_MPIO_COLLECTIVE) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") + + /* Get the next chunk node in the skip list */ + chunk_node = H5SL_next(chunk_node); + } /* end while */ done: FUNC_LEAVE_NOAPI(ret_value) @@ -1420,303 +1500,266 @@ done: * Function: H5D_inter_collective_io * * Purpose: Routine for the shared part of collective IO between multiple chunk - collective IO and contiguous collective IO - + * collective IO and contiguous collective IO * * Return: Non-negative on success/Negative on failure * - * Programmer: - * - * Modifications: + * Programmer: Muqun Yang + * Monday, Feb. 
13th, 2006 * *------------------------------------------------------------------------- */ static herr_t -H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,const H5S_t *mem_space, - haddr_t addr, const void *buf, hbool_t do_write ) +H5D_inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, + const H5S_t *file_space, const H5S_t *mem_space) { + size_t mpi_buf_count; /* # of MPI types */ + hbool_t mbt_is_derived = FALSE; + hbool_t mft_is_derived = FALSE; + MPI_Datatype mpi_file_type, mpi_buf_type; + int mpi_code; /* MPI return code */ + herr_t ret_value = SUCCEED; /* return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_inter_collective_io) + + if((file_space != NULL) && (mem_space != NULL)) { + hsize_t mpi_buf_offset, mpi_file_offset; /* Offset within dataset where selection (ie. MPI type) begins */ + size_t mpi_file_count; /* Number of file "objects" to transfer */ - size_t mpi_buf_count, mpi_file_count; /* Number of "objects" to transfer */ - MPI_Datatype mpi_file_type,mpi_buf_type; - hsize_t mpi_buf_offset, mpi_file_offset; /* Offset within dataset where selection (ie. 
MPI type) begins */ - hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */ - mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */ - H5D_common_coll_info_t coll_info; - herr_t ret_value = SUCCEED; /* return value */ - - FUNC_ENTER_NOAPI_NOINIT(H5D_inter_collective_io) - if((file_space!=NULL) && (mem_space != NULL)) { - /*Obtain disk and memory MPI derived datatype */ - if(H5S_mpio_space_type(file_space,H5T_get_size(io_info->dset->shared->type), - &mpi_file_type,&mpi_file_count,&mpi_file_offset,&mft_is_derived)<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type"); - - if(H5S_mpio_space_type(mem_space,H5T_get_size(io_info->dset->shared->type), - &mpi_buf_type,&mpi_buf_count,&mpi_buf_offset,&mbt_is_derived)<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buffer type"); - - } - else { - /* For non-selection, participate with a none MPI derived datatype, the count is 0. */ - mpi_buf_type = MPI_BYTE; - mpi_file_type = MPI_BYTE; - mpi_file_count = 0; - mpi_buf_count = 0; - } - - coll_info.mbt_is_derived = mbt_is_derived; - coll_info.mft_is_derived = mft_is_derived; - coll_info.mpi_buf_count = mpi_buf_count; - coll_info.chunk_addr = addr; + /* Obtain disk and memory MPI derived datatype */ + if(H5S_mpio_space_type(file_space, type_info->src_type_size, + &mpi_file_type, &mpi_file_count, &mpi_file_offset, &mft_is_derived) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI file type") + if(H5S_mpio_space_type(mem_space, type_info->src_type_size, + &mpi_buf_type, &mpi_buf_count, &mpi_buf_offset, &mbt_is_derived) < 0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI buffer type") + } /* end if */ + else { + /* For non-selection, participate with a none MPI derived datatype, the count is 0. 
*/ + mpi_buf_type = MPI_BYTE; + mpi_file_type = MPI_BYTE; + mpi_buf_count = (size_t)0; + mbt_is_derived = FALSE; + mft_is_derived = FALSE; + } /* end else */ #ifdef H5D_DEBUG - if(H5DEBUG(D)) - HDfprintf(H5DEBUG(D),"before final collective IO \n"); +if(H5DEBUG(D)) + HDfprintf(H5DEBUG(D),"before final collective IO \n"); #endif - if(H5D_final_collective_io(io_info,&mpi_file_type,&mpi_buf_type,&coll_info,buf,do_write)<0) - HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish collective MPI-IO"); - done: + /* Perform final collective I/O operation */ + if(H5D_final_collective_io(io_info, type_info, mpi_buf_count, &mpi_file_type, &mpi_buf_type) < 0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish collective MPI-IO") + +done: + /* Free the MPI buf and file types, if they were derived */ + if(mbt_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&mpi_buf_type))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + if(mft_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&mpi_file_type))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + #ifdef H5D_DEBUG - if(H5DEBUG(D)) +if(H5DEBUG(D)) HDfprintf(H5DEBUG(D),"before leaving inter_collective_io ret_value = %d\n",ret_value); #endif - FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D_inter_collective_io */ + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_inter_collective_io() */ /*------------------------------------------------------------------------- * Function: H5D_final_collective_io * * Purpose: Routine for the common part of collective IO with different storages. - * * Return: Non-negative on success/Negative on failure * - * Programmer: - * - * Modifications: + * Programmer: Muqun Yang + * Monday, Feb. 
13th, 2006 * *------------------------------------------------------------------------- */ static herr_t -H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Datatype *mpi_buf_type, - H5D_common_coll_info_t* coll_info, const void *buf, hbool_t do_write) +H5D_final_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info, + size_t mpi_buf_count, MPI_Datatype *mpi_file_type, MPI_Datatype *mpi_buf_type) { - - - int mpi_code; /* MPI return code */ - hbool_t plist_is_setup=0; /* Whether the dxpl has been customized */ - herr_t ret_value = SUCCEED; - + int mpi_code; /* MPI return code */ + hbool_t plist_is_setup = FALSE; /* Whether the dxpl has been customized */ + herr_t ret_value = SUCCEED; FUNC_ENTER_NOAPI_NOINIT(H5D_final_collective_io) - /* - * Pass buf type, file type to the file driver. - */ - - if(H5FD_mpi_setup_collective(io_info->dxpl_id, *mpi_buf_type, *mpi_file_type)<0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties"); + /* Pass buf type, file type to the file driver. 
*/ + if(H5FD_mpi_setup_collective(io_info->dxpl_id, *mpi_buf_type, *mpi_file_type) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties") + plist_is_setup = TRUE; - plist_is_setup=1; -#ifdef H5D_DEBUG - if(H5DEBUG(D)){ - HDfprintf(H5DEBUG(D),"chunk addr %Hu\n",coll_info->chunk_addr); - HDfprintf(H5DEBUG(D),"mpi_buf_count %d\n",coll_info->mpi_buf_count); - } - -#endif - - if(do_write) { - if((io_info->ops.write)(io_info, - coll_info->mpi_buf_count,0,NULL,NULL,coll_info->chunk_addr, - NULL, buf) < 0) + if(io_info->op_type == H5D_IO_OP_WRITE) { + if((io_info->io_ops.single_write)(io_info, type_info, + (hsize_t)mpi_buf_count, NULL, NULL) < 0) HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed") - } + } /* end if */ else { - if((io_info->ops.read)(io_info, - coll_info->mpi_buf_count,0,NULL,NULL,coll_info->chunk_addr, - NULL, buf) < 0) + if((io_info->io_ops.single_read)(io_info, type_info, + (hsize_t)mpi_buf_count, NULL, NULL) < 0) HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed") - } + } /* end else */ done: - /* Reset the dxpl settings */ - if(plist_is_setup) { - if(H5FD_mpi_teardown_collective(io_info->dxpl_id)<0) - HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "unable to reset dxpl values"); - } /* end if */ - - /* free the MPI buf and file types */ - if (coll_info->mbt_is_derived) { - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_buf_type ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - } - if (coll_info->mft_is_derived) { - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_file_type ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - } + /* Reset the dxpl settings */ + if(plist_is_setup) + if(H5FD_mpi_teardown_collective(io_info->dxpl_id) < 0) + HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "unable to reset dxpl values") + #ifdef H5D_DEBUG - if(H5DEBUG(D)) +if(H5DEBUG(D)) HDfprintf(H5DEBUG(D),"ret_value before leaving final_collective_io=%d\n",ret_value); #endif - 
FUNC_LEAVE_NOAPI(ret_value) -}/* end H5D_final_collective_io */ +} /* end H5D_final_collective_io */ /*------------------------------------------------------------------------- * Function: H5D_sort_chunk * * Purpose: Routine to sort chunks in increasing order of chunk address - Each chunk address is also obtained. - - Description: - For most cases, the chunk address has already been sorted in increasing order. - The special sorting flag is used to optimize this common case. - quick sort is used for necessary sorting. - - Parameters: - Input: H5D_io_info_t* io_info, - H5D_chunk_map_t *fm(global chunk map struct) - Input/Output: H5D_chunk_addr_info_t chunk_addr_info_array[] : array to store chunk address and information - many_chunk_opt : flag to optimize the way to obtain chunk addresses - for many chunks + * Each chunk address is also obtained. * - * Return: Non-negative on success/Negative on failure + * Description: + * For most cases, the chunk address has already been sorted in increasing order. + * The special sorting flag is used to optimize this common case. + * quick sort is used for necessary sorting. * - * Programmer: + * Parameters: + * Input: H5D_io_info_t* io_info, + * H5D_chunk_map_t *fm(global chunk map struct) + * Input/Output: H5D_chunk_addr_info_t chunk_addr_info_array[] : array to store chunk address and information + * many_chunk_opt : flag to optimize the way to obtain chunk addresses + * for many chunks * - * Modifications: + * Return: Non-negative on success/Negative on failure + * + * Programmer: Muqun Yang + * Monday, Feb. 
13th, 2006 * *------------------------------------------------------------------------- */ - static herr_t -H5D_sort_chunk(H5D_io_info_t * io_info, - H5D_chunk_map_t *fm, - H5D_chunk_addr_info_t chunk_addr_info_array[], - int many_chunk_opt) +H5D_sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, + H5D_chunk_addr_info_t chunk_addr_info_array[], int sum_chunk) { - - - H5SL_node_t *chunk_node; /* Current node in chunk skip list */ + H5SL_node_t *chunk_node; /* Current node in chunk skip list */ H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */ - haddr_t chunk_addr; /* Current chunking address of this node */ - haddr_t *total_chunk_addr_array=NULL; /* The array of chunk address for the total number of chunk */ - int i,mpi_code; - int total_chunks; - size_t num_chunks; - int mpi_type_cleanup = 0; - int tchunk_addr_cleanup = 0; - MPI_Datatype chunk_addrtype; - H5D_storage_t store; /*union of EFL and chunk pointer in file space */ - hbool_t do_sort = FALSE; - herr_t ret_value = SUCCEED; /*return value */ + haddr_t chunk_addr; /* Current chunking address of this node */ + haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */ + H5D_storage_t store; /*union of EFL and chunk pointer in file space */ + hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */ + int bsearch_coll_chunk_threshold; + int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND; + int mpi_size; /* Number of MPI processes */ + int mpi_code; /* MPI return code */ + int i; /* Local index variable */ + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI_NOINIT(H5D_sort_chunk) - num_chunks = H5SL_count(fm->sel_chunks); + /* Retrieve # of MPI processes */ + if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0) + HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size") + + /* Calculate the actual threshold to obtain all chunk addresses collectively + * The bigger this number is, the more 
possible the use of obtaining chunk + * address collectively. + */ + /* For non-optimization one-link IO, actual bsearch threshold is always + * 0, we would always want to obtain the chunk addresses individually + * for each process. + */ + bsearch_coll_chunk_threshold = (sum_chunk * 100) / ((int)fm->total_chunks * mpi_size); + if((bsearch_coll_chunk_threshold > H5D_ALL_CHUNK_ADDR_THRES_COL) + && ((sum_chunk / mpi_size) >= H5D_ALL_CHUNK_ADDR_THRES_COL_NUM)) + many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_COL; + #ifdef H5D_DEBUG - if(H5DEBUG(D)) - HDfprintf(H5DEBUG(D),"many_chunk_opt= %d\n",many_chunk_opt); +if(H5DEBUG(D)) + HDfprintf(H5DEBUG(D), "many_chunk_opt= %d\n", many_chunk_opt); #endif /* If we need to optimize the way to obtain the chunk address */ - if(many_chunk_opt != H5D_OBTAIN_ONE_CHUNK_ADDR_IND){ + if(many_chunk_opt != H5D_OBTAIN_ONE_CHUNK_ADDR_IND) { + int mpi_rank; - int mpi_rank, root; - total_chunks = (int)fm->total_chunks; - total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks); - tchunk_addr_cleanup = 1; #ifdef H5D_DEBUG - if(H5DEBUG(D)) - HDfprintf(H5DEBUG(D),"Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n"); +if(H5DEBUG(D)) + HDfprintf(H5DEBUG(D), "Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n"); #endif - root = 0; - if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file))<0) - HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank"); - - /*Create received MPI derived datatype */ - if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous((int)(sizeof(haddr_t)*total_chunks), MPI_BYTE, &chunk_addrtype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code); - if(MPI_SUCCESS !=(mpi_code = MPI_Type_commit(&chunk_addrtype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); - - mpi_type_cleanup = 1; - - if(mpi_rank == root) { - if(H5D_istore_chunkmap(io_info, total_chunk_addr_array, fm->down_chunks)<0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); - } + /* Allocate array for chunk 
addresses */ + if(NULL == (total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t) * (size_t)fm->total_chunks))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate memory chunk address array") + + /* Retrieve all the chunk addresses with process 0 */ + if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0) + HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank") + if(mpi_rank == 0) { + if(H5D_istore_chunkmap(io_info, total_chunk_addr_array, fm->down_chunks) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address") + } /* end if */ + /* Broadcasting the MPI_IO option info. and chunk address info. */ - if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(total_chunk_addr_array,1,chunk_addrtype,root,io_info->comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code); + if(MPI_SUCCESS != (mpi_code = MPI_Bcast(total_chunk_addr_array, (int)(sizeof(haddr_t) * fm->total_chunks), MPI_BYTE, (int)0, io_info->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code) } /* end if */ - /* Get first node in chunk skip list */ - if(NULL ==(chunk_node = H5SL_first(fm->sel_chunks))) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list"); /* Set dataset storage for I/O info */ io_info->store = &store; - if(NULL ==(chunk_info = H5SL_item(chunk_node))) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); - store.chunk.offset = chunk_info->coords; - store.chunk.index = chunk_info->index; + + /* Start at first node in chunk skip list */ i = 0; - if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){ - if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL))) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + if(NULL == (chunk_node = H5SL_first(fm->sel_chunks))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list") -#ifdef H5D_DEBUG - if(H5DEBUG(D)) - HDfprintf(H5DEBUG(D),"coming to obtain 
each chunk address individually \n"); -#endif - } - else - chunk_addr = total_chunk_addr_array[chunk_info->index]; + /* Iterate over all chunks for this process */ + while(chunk_node) { + if(NULL == (chunk_info = H5SL_item(chunk_node))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list") + + if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND) { + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + if(HADDR_UNDEF == (chunk_addr = H5D_istore_get_addr(io_info, NULL))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk info from skipped list") + } /* end if */ + else + chunk_addr = total_chunk_addr_array[chunk_info->index]; - chunk_addr_info_array[i].chunk_addr = chunk_addr; - chunk_addr_info_array[i].chunk_info = *chunk_info; + /* Check if chunk addresses are not in increasing order in the file */ + if(i > 0 && chunk_addr < chunk_addr_info_array[i - 1].chunk_addr) + do_sort = TRUE; - chunk_node = H5SL_next(chunk_node); + /* Set the address & info for this chunk */ + chunk_addr_info_array[i].chunk_addr = chunk_addr; + chunk_addr_info_array[i].chunk_info = *chunk_info; - while(chunk_node) { + /* Advance to next chunk in list */ + i++; + chunk_node = H5SL_next(chunk_node); + } /* end while */ - chunk_info = H5SL_item(chunk_node); - store.chunk.offset = chunk_info->coords; - store.chunk.index = chunk_info->index; - - if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){ - if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL))) - HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); - } - else - chunk_addr = total_chunk_addr_array[chunk_info->index]; - - if(chunk_addr < chunk_addr_info_array[i].chunk_addr) do_sort = TRUE; - chunk_addr_info_array[i+1].chunk_addr = chunk_addr; - chunk_addr_info_array[i+1].chunk_info =*chunk_info; - i++; - chunk_node = H5SL_next(chunk_node); - } #ifdef H5D_DEBUG - if(H5DEBUG(D)) - HDfprintf(H5DEBUG(D),"before 
Qsort\n"); +if(H5DEBUG(D)) + HDfprintf(H5DEBUG(D), "before Qsort\n"); #endif - if(do_sort) - HDqsort(chunk_addr_info_array,num_chunks,sizeof(chunk_addr_info_array),H5D_cmp_chunk_addr); + if(do_sort) { + size_t num_chunks = H5SL_count(fm->sel_chunks); + + HDqsort(chunk_addr_info_array, num_chunks, sizeof(chunk_addr_info_array[0]), H5D_cmp_chunk_addr); + } /* end if */ done: + if(total_chunk_addr_array) + H5MM_xfree(total_chunk_addr_array); - if(tchunk_addr_cleanup) - HDfree(total_chunk_addr_array); - if(mpi_type_cleanup) { - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - } FUNC_LEAVE_NOAPI(ret_value) } /* end H5D_sort_chunk() */ @@ -1725,310 +1768,244 @@ done: * Function: H5D_obtain_mpio_mode * * Purpose: Routine to obtain each io mode(collective,independent or none) for each chunk; - Each chunk address is also obtained. - - Description: - - 1) Each process provides two piece of information for all chunks having selection - a) chunk index - b) wheather this chunk is regular(for MPI derived datatype not working case) - - 2) Gather all the information to the root process - - 3) Root process will do the following: - a) Obtain chunk addresses for all chunks in this data space - b) With the consideration of the user option, calculate IO mode for each chunk - c) Build MPI derived datatype to combine "chunk address" and "assign_io" information - in order to do MPI Bcast only once - d) MPI Bcast the IO mode and chunk address information for each chunk. - 4) Each process then retrieves IO mode and chunk address information to assign_io_mode and chunk_addr. - - Parameters: - - Input: H5D_io_info_t* io_info, - H5D_chunk_map_t *fm,(global chunk map struct) - Output: uint8_t assign_io_mode[], : IO mode, collective, independent or none - haddr_t chunk_addr[], : chunk address array for each chunk + * Each chunk address is also obtained. 
* - * Return: Non-negative on success/Negative on failure + * Description: * - * Programmer: + * 1) Each process provides two piece of information for all chunks having selection + * a) chunk index + * b) wheather this chunk is regular(for MPI derived datatype not working case) + * + * 2) Gather all the information to the root process * - * Modifications: + * 3) Root process will do the following: + * a) Obtain chunk addresses for all chunks in this data space + * b) With the consideration of the user option, calculate IO mode for each chunk + * c) Build MPI derived datatype to combine "chunk address" and "assign_io" information + * in order to do MPI Bcast only once + * d) MPI Bcast the IO mode and chunk address information for each chunk. + * 4) Each process then retrieves IO mode and chunk address information to assign_io_mode and chunk_addr. + * + * Parameters: + * + * Input: H5D_io_info_t* io_info, + * H5D_chunk_map_t *fm,(global chunk map struct) + * Output: uint8_t assign_io_mode[], : IO mode, collective, independent or none + * haddr_t chunk_addr[], : chunk address array for each chunk + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Muqun Yang + * Monday, Feb. 
13th, 2006 * *------------------------------------------------------------------------- */ - static herr_t -H5D_obtain_mpio_mode(H5D_io_info_t* io_info, - H5D_chunk_map_t *fm, - uint8_t assign_io_mode[], - haddr_t chunk_addr[]) +H5D_obtain_mpio_mode(H5D_io_info_t* io_info, H5D_chunk_map_t *fm, + H5P_genplist_t *dx_plist, uint8_t assign_io_mode[], haddr_t chunk_addr[]) { - - int total_chunks; - hsize_t ori_total_chunks; - unsigned percent_nproc_per_chunk,threshold_nproc_per_chunk; - H5FD_mpio_chunk_opt_t chunk_opt_mode; - uint8_t* io_mode_info=NULL; - uint8_t* recv_io_mode_info=NULL; - uint8_t* mergebuf=NULL; - uint8_t* tempbuf; - - H5SL_node_t* chunk_node; - H5D_chunk_info_t* chunk_info; - - MPI_Datatype bastype[2]; - MPI_Datatype chunk_addrtype; - int bascount; - int basblock[2]; - MPI_Aint basdisp[2]; - MPI_Datatype rtype; - MPI_Datatype stype; - int mpi_size,mpi_rank; - MPI_Comm comm; - int ic,root; - int mpi_code; - H5P_genplist_t *plist; - int mem_cleanup = 0, - mpi_type_cleanup = 0; - + int total_chunks; + unsigned percent_nproc_per_chunk,threshold_nproc_per_chunk; + H5FD_mpio_chunk_opt_t chunk_opt_mode; + uint8_t* io_mode_info=NULL; + uint8_t* recv_io_mode_info=NULL; + uint8_t* mergebuf=NULL; + uint8_t* tempbuf; + H5SL_node_t* chunk_node; + H5D_chunk_info_t* chunk_info; + int mpi_size,mpi_rank; + MPI_Comm comm; + int ic,root; + int mpi_code; + int mem_cleanup = 0; #ifdef H5_HAVE_INSTRUMENTED_LIBRARY - int new_value; - htri_t check_prop; + int new_value; + htri_t check_prop; #endif + herr_t ret_value = SUCCEED; - herr_t ret_value = SUCCEED; - - FUNC_ENTER_NOAPI_NOINIT(H5D_obtain_mpio_mode) + FUNC_ENTER_NOAPI_NOINIT(H5D_obtain_mpio_mode) - /* Assign the rank 0 to the root */ - root = 0; - comm = io_info->comm; + /* Assign the rank 0 to the root */ + root = 0; + comm = io_info->comm; - /* Obtain the number of process and the current rank of the process */ - if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file))<0) - HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, 
"unable to obtain mpi rank"); - if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0) - HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size"); + /* Obtain the number of process and the current rank of the process */ + if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0) + HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank") + if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0) + HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size") - /* Allocate memory */ - ori_total_chunks = fm->total_chunks; - H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int); - - /* Obtain the data transfer properties */ - if(NULL == (plist = H5I_object(io_info->dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list") - - percent_nproc_per_chunk=H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME); + /* Setup parameters */ + H5_ASSIGN_OVERFLOW(total_chunks, fm->total_chunks, hsize_t, int); + percent_nproc_per_chunk = H5P_peek_unsigned(dx_plist, H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME); #if defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) && defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) - - chunk_opt_mode=(H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME); - - if((chunk_opt_mode == H5FD_MPIO_CHUNK_MULTI_IO) || (percent_nproc_per_chunk == 0)){ - if(H5D_istore_chunkmap(io_info, chunk_addr, fm->down_chunks) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); - for(ic = 0; ic<total_chunks;ic++) - assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL; - goto done; - } + chunk_opt_mode = (H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(dx_plist, H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME); + if((chunk_opt_mode == H5FD_MPIO_CHUNK_MULTI_IO) || (percent_nproc_per_chunk == 0)) { + if(H5D_istore_chunkmap(io_info, chunk_addr, fm->down_chunks) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + for(ic = 0; ic < total_chunks; ic++) + 
assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL; + + HGOTO_DONE(SUCCEED) + } /* end if */ #endif - threshold_nproc_per_chunk = mpi_size * percent_nproc_per_chunk/100; - - io_mode_info = (uint8_t *)H5MM_calloc(total_chunks*sizeof(MPI_BYTE)); - mergebuf = H5MM_malloc((sizeof(haddr_t)+sizeof(MPI_BYTE))*total_chunks); - tempbuf = mergebuf + sizeof(MPI_BYTE)*total_chunks; - if(mpi_rank == root) - recv_io_mode_info = (uint8_t *)H5MM_malloc(total_chunks*sizeof(MPI_BYTE)*mpi_size); + threshold_nproc_per_chunk = mpi_size * percent_nproc_per_chunk/100; + + /* Allocate memory */ + io_mode_info = (uint8_t *)H5MM_calloc(total_chunks); + mergebuf = H5MM_malloc((sizeof(haddr_t) + 1) * total_chunks); + tempbuf = mergebuf + total_chunks; + if(mpi_rank == root) + recv_io_mode_info = (uint8_t *)H5MM_malloc(total_chunks * mpi_size); - mem_cleanup = 1; + mem_cleanup = 1; - chunk_node = H5SL_first(fm->sel_chunks); - - /*Obtain the regularity and selection information for all chunks in this process. */ - while(chunk_node){ - - chunk_info = H5SL_item(chunk_node); + /* Obtain the regularity and selection information for all chunks in this process. */ + chunk_node = H5SL_first(fm->sel_chunks); + while(chunk_node) { + chunk_info = H5SL_item(chunk_node); #ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS - /* regularity information: 1, selection information: 2 */ - if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE && - H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE) + /* regularity information: 1, selection information: 2 */ + if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE && + H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE) #endif - io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_REG; /* this chunk is selected and is "regular" without defining H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS. */ + io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_REG; /* this chunk is selected and is "regular" without defining H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS. 
*/ #ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS - else - io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_IRREG; /* this chunk is selected and is irregular*/ + else + io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_IRREG; /* this chunk is selected and is irregular*/ #endif - - chunk_node = H5SL_next(chunk_node); - } + chunk_node = H5SL_next(chunk_node); + } /* end while */ - /*Create sent MPI derived datatype */ - if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(total_chunks,MPI_BYTE,&stype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); - if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&stype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); - - /*Create received basic MPI derived datatype */ - bascount = 2; - basblock[0] = total_chunks; - basblock[1] = total_chunks; - basdisp[0] = 0; - basdisp[1] = (MPI_Aint)(sizeof(MPI_BYTE)*total_chunks);/* may need to check overflow */ - bastype[0] = MPI_BYTE; - - if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t),MPI_BYTE,&chunk_addrtype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code); - if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_addrtype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); - bastype[1] = chunk_addrtype; - - if(MPI_SUCCESS !=(mpi_code = MPI_Type_struct(bascount,basblock,basdisp,bastype,&rtype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code); - if(MPI_SUCCESS !=(mpi_code = MPI_Type_commit(&rtype))) - HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); - - /* Set up a flag to clean up the MPI derived datatype later */ - mpi_type_cleanup = 1; - - /*Gather all the information */ - if(MPI_SUCCESS !=(mpi_code = MPI_Gather(io_mode_info,1,stype,recv_io_mode_info,1,stype,root,comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code); - - /* Calculate the mode for IO(collective, independent or none) at root process */ - if(mpi_rank == root) { - - int nproc; - int* nproc_per_chunk; + /*Gather all the 
information */ + if(MPI_SUCCESS != (mpi_code = MPI_Gather(io_mode_info, total_chunks, MPI_BYTE, recv_io_mode_info, total_chunks, MPI_BYTE, root, comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code) + + /* Calculate the mode for IO(collective, independent or none) at root process */ + if(mpi_rank == root) { + int nproc; + int* nproc_per_chunk; #if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) - int* ind_this_chunk; + int* ind_this_chunk; #endif - /* pre-computing: calculate number of processes and - regularity of the selection occupied in each chunk */ - nproc_per_chunk = (int*)H5MM_calloc(total_chunks*sizeof(int)); + /* pre-computing: calculate number of processes and + regularity of the selection occupied in each chunk */ + nproc_per_chunk = (int*)H5MM_calloc(total_chunks * sizeof(int)); #if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) - ind_this_chunk = (int*)H5MM_calloc(total_chunks*sizeof(int)); + ind_this_chunk = (int*)H5MM_calloc(total_chunks * sizeof(int)); #endif - /* calculating the chunk address */ - if(H5D_istore_chunkmap(io_info, chunk_addr, fm->down_chunks)<0){ - HDfree(nproc_per_chunk); + /* calculating the chunk address */ + if(H5D_istore_chunkmap(io_info, chunk_addr, fm->down_chunks) < 0) { + HDfree(nproc_per_chunk); #if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) - HDfree(ind_this_chunk); + HDfree(ind_this_chunk); #endif - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); - } - - /* checking for number of process per chunk and regularity of the selection*/ - for (nproc = 0;nproc <mpi_size;nproc++){ - uint8_t *tmp_recv_io_mode_info = recv_io_mode_info + nproc*total_chunks; - /* calculate the number of process per chunk and adding irregular selection option */ - for(ic = 0; ic < total_chunks; ic++, tmp_recv_io_mode_info++){ - if(*tmp_recv_io_mode_info != 0) { - 
nproc_per_chunk[ic]++; + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address") + } /* end if */ + + /* checking for number of process per chunk and regularity of the selection*/ + for(nproc = 0; nproc < mpi_size; nproc++) { + uint8_t *tmp_recv_io_mode_info = recv_io_mode_info + (nproc * total_chunks); + + /* Calculate the number of process per chunk and adding irregular selection option */ + for(ic = 0; ic < total_chunks; ic++, tmp_recv_io_mode_info++) { + if(*tmp_recv_io_mode_info != 0) { + nproc_per_chunk[ic]++; #ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS - if(*tmp_recv_io_mode_info == H5D_CHUNK_SELECT_IRREG) - ind_this_chunk[ic] = 1; + if(*tmp_recv_io_mode_info == H5D_CHUNK_SELECT_IRREG) + ind_this_chunk[ic] = 1; #endif - } + } /* end if */ #ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS else { - /*checking whether we have a selection in this chunk */ - ind_this_chunk[ic] = 1; - } + /*checking whether we have a selection in this chunk */ + ind_this_chunk[ic] = 1; + } /* end else */ #endif - } + } /* end for */ + } /* end for */ - } - - /* Calculating MPIO mode for each chunk (collective, independent, none) */ - for(ic = 0; ic < total_chunks; ic++){ - if(nproc_per_chunk[ic]>MAX(1,threshold_nproc_per_chunk)){ + /* Calculating MPIO mode for each chunk (collective, independent, none) */ + for(ic = 0; ic < total_chunks; ic++) { + if(nproc_per_chunk[ic] > MAX(1, threshold_nproc_per_chunk)) { #if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) - if(!ind_this_chunk[ic]) assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL; + if(!ind_this_chunk[ic]) + assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL; #else - assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL; + assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL; #endif - } - } + } /* end if */ + } /* end for */ - /* merge buffer io_mode info and chunk addr into one */ - HDmemcpy(mergebuf,assign_io_mode,sizeof(MPI_BYTE)*total_chunks); - 
HDmemcpy(tempbuf,chunk_addr,sizeof(haddr_t)*total_chunks); + /* merge buffer io_mode info and chunk addr into one */ + HDmemcpy(mergebuf, assign_io_mode, total_chunks); + HDmemcpy(tempbuf, chunk_addr, sizeof(haddr_t) * total_chunks); - HDfree(nproc_per_chunk); + HDfree(nproc_per_chunk); #if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) - HDfree(ind_this_chunk); + HDfree(ind_this_chunk); #endif - } + } /* end if */ - /* Broadcasting the MPI_IO option info. and chunk address info. */ - if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(mergebuf,1,rtype,root,comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code); + /* Broadcasting the MPI_IO option info. and chunk address info. */ + if(MPI_SUCCESS != (mpi_code = MPI_Bcast(mergebuf, ((sizeof(haddr_t) + 1) * total_chunks), MPI_BYTE, root, comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code) - HDmemcpy(assign_io_mode,mergebuf,sizeof(MPI_BYTE)*total_chunks); - HDmemcpy(chunk_addr,tempbuf,sizeof(haddr_t)*total_chunks); + HDmemcpy(assign_io_mode, mergebuf, total_chunks); + HDmemcpy(chunk_addr, tempbuf, sizeof(haddr_t) * total_chunks); #ifdef H5_HAVE_INSTRUMENTED_LIBRARY - check_prop = H5Pexist(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME); + check_prop = H5Pexist(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME); if(check_prop > 0) { #if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) - new_value = 0; - if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME,&new_value)<0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value"); + new_value = 0; + if(H5Pset(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME, &new_value) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value") #else - for(ic = 0; ic < total_chunks; ic++){ - if(assign_io_mode[ic] == H5D_CHUNK_IO_MODE_COL) { - new_value = 0; - 
if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME,&new_value)<0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value"); - break; - } - } + for(ic = 0; ic < total_chunks; ic++) { + if(assign_io_mode[ic] == H5D_CHUNK_IO_MODE_COL) { + new_value = 0; + if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME,&new_value) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value") + break; + } /* end if */ + } /* end for */ #endif - } - check_prop = H5Pexist(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME); - if(check_prop > 0) { - int temp_count = 0; - for(ic = 0; ic < total_chunks; ic++){ - if(assign_io_mode[ic] == H5D_CHUNK_IO_MODE_COL) { - temp_count++; - break; - } - } - if(temp_count==0){ - new_value = 0; - if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME,&new_value)<0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value"); - } - } + } /* end if */ + + check_prop = H5Pexist(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME); + if(check_prop > 0) { + int temp_count = 0; + + for(ic = 0; ic < total_chunks; ic++) { + if(assign_io_mode[ic] == H5D_CHUNK_IO_MODE_COL) { + temp_count++; + break; + } /* end if */ + } /* end for */ + if(temp_count == 0) { + new_value = 0; + if(H5Pset(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME, &new_value) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value") + } /* end if */ + } /* end if */ #endif done: - - if(mpi_type_cleanup) { - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &stype ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &rtype ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - } - - if(mem_cleanup){ - HDfree(io_mode_info); - 
HDfree(mergebuf); - if(mpi_rank == root) - HDfree(recv_io_mode_info); - } + if(mem_cleanup) { + HDfree(io_mode_info); + HDfree(mergebuf); + if(mpi_rank == root) + HDfree(recv_io_mode_info); + } /* end if */ FUNC_LEAVE_NOAPI(ret_value) -}/* end H5D_obtain_mpio_mode*/ +} /* end H5D_obtain_mpio_mode() */ static int H5D_cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2) @@ -2041,7 +2018,6 @@ H5D_cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2) addr2 = ((const H5D_chunk_addr_info_t *)chunk_addr_info2)->chunk_addr; FUNC_LEAVE_NOAPI(H5F_addr_cmp(addr1, addr2)) - } #endif /* H5_HAVE_PARALLEL */ |