diff options
Diffstat (limited to 'src/H5Dmpio.c')
-rw-r--r-- | src/H5Dmpio.c | 1646 |
1 files changed, 1517 insertions, 129 deletions
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c index 9d8821a..2bec70d 100644 --- a/src/H5Dmpio.c +++ b/src/H5Dmpio.c @@ -26,19 +26,23 @@ /****************/ #define H5D_PACKAGE /*suppress error about including H5Dpkg */ +/*#define KENT */ /***********/ /* Headers */ /***********/ #include "H5private.h" /* Generic Functions */ +#include "H5Iprivate.h" #include "H5Dpkg.h" /* Datasets */ #include "H5Eprivate.h" /* Error handling */ #include "H5Fprivate.h" /* File access */ #include "H5FDprivate.h" /* File drivers */ +#include "H5MMprivate.h" #include "H5Oprivate.h" /* Object headers */ #include "H5Pprivate.h" /* Property lists */ #include "H5Sprivate.h" /* Dataspaces */ +#include "H5Vprivate.h" /* Vector */ #ifdef H5_HAVE_PARALLEL @@ -46,20 +50,109 @@ /* Local Macros */ /****************/ +/* Macros to represent different IO options */ +#define H5D_ONE_LINK_CHUNK_IO 0 +#define H5D_MULTI_CHUNK_IO 1 + +/***** Macros for One linked collective IO case. *****/ +/* The default value to do one linked collective IO for all chunks. + If the average number of chunks per process is greater than this value, + the library will create an MPI derived datatype to link all chunks to do collective IO. + The user can set this value through an API. */ +#define H5D_ONE_LINK_CHUNK_IO_THRESHOLD 0 /* always set this option with value 0*/ + +/* Macros to represent options on how to obtain chunk address for one linked-chunk IO case */ +#define H5D_OBTAIN_ONE_CHUNK_ADDR_IND 0 +#define H5D_OBTAIN_ALL_CHUNK_ADDR_IND 1 +#define H5D_OBTAIN_ALL_CHUNK_ADDR_COL 2 + +/* Macros to define the default ratio of obtaining all chunk addresses for one linked-chunk IO case */ +#define H5D_ALL_CHUNK_ADDR_THRES_IND 10 +#define H5D_ALL_CHUNK_ADDR_THRES_COL 20 + +/***** Macros for multi-chunk collective IO case. *****/ +/* The default value of the threshold to do collective IO for this chunk. + If the average number of processes per chunk is greater than the default value, + collective IO is done for this chunk. +*/ +#define H5D_MULTI_CHUNK_IO_COL_THRESHOLD 50 + +/* Macros to represent different IO modes(NONE, Independent or collective)for multiple chunk IO case */ +#define H5D_CHUNK_IO_MODE_IND 0 +#define H5D_CHUNK_IO_MODE_COL 1 + +/* Macros to represent the regularity of the selection for multiple chunk IO case. */ +#define H5D_CHUNK_SELECT_REG 1 +#define H5D_CHUNK_SELECT_IRREG 2 +#define H5D_CHUNK_SELECT_NONE 0 + /******************/ /* Local Typedefs */ /******************/ +/* Combine chunk address and chunk info into a struct for better performance. */ +typedef struct H5D_chunk_addr_info_t { + haddr_t chunk_addr; + H5D_chunk_info_t chunk_info; +} H5D_chunk_addr_info_t; + +/* Combine all information that needs to know for collective MPI-IO of this selection. */ +typedef struct H5D_common_coll_info_t { + hbool_t mbt_is_derived; + hbool_t mft_is_derived; + size_t mpi_buf_count; + haddr_t chunk_addr; +} H5D_common_coll_info_t; + /********************/ /* Local Prototypes */ /********************/ -/* For regular hyperslab selection. */ +static herr_t +H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, + hbool_t do_write); + +static herr_t +H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, + hbool_t do_write,int sum_chunk); + +static herr_t +H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space, + const H5S_t *mem_space,haddr_t addr, + void *buf, hbool_t do_write ); + +static herr_t +H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type, + MPI_Datatype *mpi_buf_type, + H5D_common_coll_info_t* coll_info, + void *buf, hbool_t do_write); +#ifdef OLD_WAY static herr_t -H5D_mpio_spaces_xfer(H5D_io_info_t *io_info, size_t elmt_size, - const H5S_t *file_space, const H5S_t *mem_space, - void *buf/*out*/, - hbool_t do_write); +H5D_pre_sort_chunk(H5D_io_info_t *io_info,int total_chunks, + haddr_t total_chunk_addr_array[]); +#endif + +static herr_t +H5D_sort_chunk(H5D_io_info_t * io_info, + fm_map *fm, + H5D_chunk_addr_info_t chunk_addr_info_array[], + int many_chunk_opt); + +static herr_t +H5D_obtain_mpio_mode(H5D_io_info_t* io_info, + fm_map *fm, + uint8_t assign_io_mode[], + haddr_t chunk_addr[]); + +static herr_t H5D_ioinfo_make_ind(H5D_io_info_t *io_info); +static herr_t H5D_ioinfo_make_coll(H5D_io_info_t *io_info); +static herr_t H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info, + const fm_map *fm, int *min_chunkf); +static int H5D_cmp_chunk_addr(const void *addr1, const void *addr2); +static herr_t +H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info, + const fm_map *fm, int *sum_chunkf); + /*********************/ /* Package Variables */ @@ -104,7 +197,7 @@ H5D_mpio_opt_possible( const H5D_io_info_t *io_info, if (io_info->dxpl_cache->xfer_mode==H5FD_MPIO_INDEPENDENT) HGOTO_DONE(FALSE); - /* Optimized MPI types flag must be set and it is must be collective IO */ + /* Optimized MPI types flag must be set and it must be collective IO */ /* (Don't allow parallel I/O for the MPI-posix driver, since it doesn't do real collective I/O) */ if (!(H5S_mpi_opt_types_g && io_info->dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE && !IS_H5FD_MPIPOSIX(io_info->dset->oloc.file))) { local_opinion = FALSE; @@ -131,14 +224,14 @@ H5D_mpio_opt_possible( const H5D_io_info_t *io_info, goto broadcast; } /* end if */ - /*The handling of memory space is different for chunking + /* The handling of memory space is different for chunking and contiguous storage, For contigous storage, mem_space and file_space won't change when it it is doing disk IO. For chunking storage, mem_space will change for different chunks. So for chunking storage, whether we can use - collective IO will defer until the each chunk IO is reached. - For contiguous storage, if we find the MPI-IO cannot + collective IO will defer until each chunk IO is reached. + For contiguous storage, if we find MPI-IO cannot support complicated MPI derived data type, we will set use_par_opt_io = FALSE. */ @@ -184,124 +277,53 @@ done: /*------------------------------------------------------------------------- - * Function: H5D_mpio_spaces_xfer + * Function: H5D_mpio_chunk_adjust_iomode * - * Purpose: Use MPI-IO to transfer data efficiently - * directly between app buffer and file. - * - * Return: non-negative on success, negative on failure. + * Purpose: Checks if it is possible to do collective IO * - * Programmer: rky 980813 - * - * Notes: - * For collective data transfer only since this would eventually call - * H5FD_mpio_setup to do setup to eveually call MPI_File_set_view in - * H5FD_mpio_read or H5FD_mpio_write. MPI_File_set_view is a collective - * call. Letting independent data transfer use this route would result in - * hanging. + * Return: Success: Non-negative: TRUE or FALSE + * Failure: Negative * - * The preconditions for calling this routine are located in the - * H5S_mpio_opt_possible() routine, which determines whether this routine - * can be called for a given dataset transfer. + * Programmer: Muqun Yang + * Monday, Feb. 13th, 2006 * *------------------------------------------------------------------------- */ -static herr_t -H5D_mpio_spaces_xfer(H5D_io_info_t *io_info, size_t elmt_size, - const H5S_t *file_space, const H5S_t *mem_space, - void *_buf /*out*/, hbool_t do_write ) -{ - haddr_t addr; /* Address of dataset (or selection) within file */ - size_t mpi_buf_count, mpi_file_count; /* Number of "objects" to transfer */ - hsize_t mpi_buf_offset, mpi_file_offset; /* Offset within dataset where selection (ie. MPI type) begins */ - MPI_Datatype mpi_buf_type, mpi_file_type; /* MPI types for buffer (memory) and file */ - hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */ - mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */ - hbool_t plist_is_setup=0; /* Whether the dxpl has been customized */ - uint8_t *buf=(uint8_t *)_buf; /* Alias for pointer arithmetic */ - int mpi_code; /* MPI return code */ - herr_t ret_value = SUCCEED; /* Return value */ - - FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_spaces_xfer); +herr_t +H5D_mpio_chunk_adjust_iomode(H5D_io_info_t *io_info, const fm_map *fm) { - /* Check args */ - assert (io_info); - assert (io_info->dset); - assert (file_space); - assert (mem_space); - assert (buf); - assert (IS_H5FD_MPIO(io_info->dset->oloc.file)); - /* Make certain we have the correct type of property list */ - assert(TRUE==H5P_isa_class(io_info->dxpl_id,H5P_DATASET_XFER)); + int min_chunk; + herr_t ret_value = SUCCEED; - /* create the MPI buffer type */ - if (H5S_mpio_space_type( mem_space, elmt_size, - /* out: */ - &mpi_buf_type, - &mpi_buf_count, - &mpi_buf_offset, - &mbt_is_derived )<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buf type"); - - /* create the MPI file type */ - if ( H5S_mpio_space_type( file_space, elmt_size, - /* out: */ - &mpi_file_type, - &mpi_file_count, - &mpi_file_offset, - &mft_is_derived )<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type"); - - /* Get the base address of the contiguous dataset or the chunk */ - if(io_info->dset->shared->layout.type == H5D_CONTIGUOUS) - addr = H5D_contig_get_addr(io_info->dset) + mpi_file_offset; - else { - haddr_t chunk_addr; /* for collective chunk IO */ + FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_chunk_adjust_iomode) - assert(io_info->dset->shared->layout.type == H5D_CHUNKED); - chunk_addr=H5D_istore_get_addr(io_info,NULL); - addr = H5F_BASE_ADDR(io_info->dset->oloc.file) + chunk_addr + mpi_file_offset; - } +#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS + if(H5D_mpio_get_min_chunk(io_info,fm,&min_chunk)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes"); + if(min_chunk == 0) { + H5P_genplist_t *dx_plist; /* Data transer property list */ - /* - * Pass buf type, file type to the file driver. Request an MPI type - * transfer (instead of an elementary byteblock transfer). - */ - if(H5FD_mpi_setup_collective(io_info->dxpl_id, mpi_buf_type, mpi_file_type)<0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties"); - plist_is_setup=1; - - /* Adjust the buffer pointer to the beginning of the selection */ - buf+=mpi_buf_offset; - - /* transfer the data */ - if (do_write) { - if (H5F_block_write(io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) <0) - HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,"MPI write failed"); - } else { - if (H5F_block_read (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) <0) - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL,"MPI read failed"); - } + /* Get the dataset transfer property list */ + if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset creation property list") -done: - /* Reset the dxpl settings */ - if(plist_is_setup) { - if(H5FD_mpi_teardown_collective(io_info->dxpl_id)<0) - HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "unable to reset dxpl values"); - } /* end if */ + /* Change the xfer_mode to independent for handling the I/O */ + io_info->dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT; + if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") - /* free the MPI buf and file types */ - if (mbt_is_derived) { - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &mpi_buf_type ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - } - if (mft_is_derived) { - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &mpi_file_type ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - } + /* Indicate that the transfer mode should be restored before returning + * to user. + */ + io_info->xfer_mode_changed = TRUE; + io_info->ops.read = H5D_select_read; + io_info->ops.write = H5D_select_write; + } /* end if */ +#endif +done: + FUNC_LEAVE_NOAPI(ret_value) +} - FUNC_LEAVE_NOAPI(ret_value); -} /* end H5D_mpio_spaces_xfer() */ /*------------------------------------------------------------------------- @@ -317,17 +339,20 @@ done: */ herr_t H5D_mpio_select_read(H5D_io_info_t *io_info, - size_t UNUSED nelmts, size_t elmt_size, - const H5S_t *file_space, const H5S_t *mem_space, - void *buf/*out*/) + size_t mpi_buf_count, + size_t elmt_size, + const H5S_t UNUSED *file_space, + const H5S_t UNUSED *mem_space, + haddr_t addr, + void *buf/*out*/) { - herr_t ret_value; + herr_t ret_value = SUCCEED; - FUNC_ENTER_NOAPI_NOFUNC(H5D_mpio_select_read); - - ret_value = H5D_mpio_spaces_xfer(io_info, elmt_size, file_space, - mem_space, buf, 0/*read*/); + FUNC_ENTER_NOAPI(H5D_mpio_select_read,FAIL); + if(H5F_block_read (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) < 0) + HGOTO_ERROR(H5E_IO,H5E_READERROR,FAIL,"can't finish collective parallel read"); +done: FUNC_LEAVE_NOAPI(ret_value); } /* end H5D_mpio_select_read() */ @@ -345,19 +370,1382 @@ H5D_mpio_select_read(H5D_io_info_t *io_info, */ herr_t H5D_mpio_select_write(H5D_io_info_t *io_info, - size_t UNUSED nelmts, size_t elmt_size, - const H5S_t *file_space, const H5S_t *mem_space, - const void *buf) + size_t mpi_buf_count, + size_t elmt_size, + const H5S_t UNUSED *file_space, + const H5S_t UNUSED *mem_space, + haddr_t addr, + const void *buf) { herr_t ret_value; - FUNC_ENTER_NOAPI_NOFUNC(H5D_mpio_select_write); + FUNC_ENTER_NOAPI(H5D_mpio_select_write,FAIL); /*OKAY: CAST DISCARDS CONST QUALIFIER*/ - ret_value = H5D_mpio_spaces_xfer(io_info, elmt_size, file_space, - mem_space, (void*)buf, 1/*write*/); + if(H5F_block_write (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf)<0) + HGOTO_ERROR(H5E_IO,H5E_WRITEERROR,FAIL,"can't finish collective parallel write"); +done: + FUNC_LEAVE_NOAPI(ret_value); +} /* end H5D_mpio_select_write() */ + +/*------------------------------------------------------------------------- + * Function: H5D_ioinfo_make_ind + * + * Purpose: Switch to MPI independent I/O + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Quincey Koziol + * Friday, August 12, 2005 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_ioinfo_make_ind(H5D_io_info_t *io_info) +{ + H5P_genplist_t *dx_plist; /* Data transer property list */ + herr_t ret_value = SUCCEED; /*return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_ind) + + /* Get the dataset transfer property list */ + if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list") + + /* Change the xfer_mode to independent, handle the request, + * then set xfer_mode before return. + */ + io_info->dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT; + if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") + + /* Set the pointers to the non-MPI-specific routines */ + io_info->ops.read = H5D_select_read; + io_info->ops.write = H5D_select_write; + + /* Indicate that the transfer mode should be restored before returning + * to user. + */ + io_info->xfer_mode_changed=TRUE; + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_ioinfo_make_ind() */ + + +/*------------------------------------------------------------------------- + * Function: H5D_ioinfo_make_coll + * + * Purpose: Switch to MPI collective I/O + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Quincey Koziol + * Friday, August 12, 2005 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_ioinfo_make_coll(H5D_io_info_t *io_info) +{ + H5P_genplist_t *dx_plist; /* Data transer property list */ + herr_t ret_value = SUCCEED; /*return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_coll) + + /* Get the dataset transfer property list */ + if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list") + + /* Change the xfer_mode to independent, handle the request, + * then set xfer_mode before return. + */ + io_info->dxpl_cache->xfer_mode = H5FD_MPIO_COLLECTIVE; + if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") + + /* Set the pointers to the MPI-specific routines */ + io_info->ops.read = H5D_mpio_select_read; + io_info->ops.write = H5D_mpio_select_write; + + /* Indicate that the transfer mode should _NOT_ be restored before returning + * to user. + */ + io_info->xfer_mode_changed=FALSE; + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_ioinfo_make_coll() */ + + + + +/*------------------------------------------------------------------------- + * Function: H5D_mpio_get_min_chunk + * + * Purpose: Routine for obtaining minimum number of chunks to cover + * hyperslab selection selected by all processors. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info, + const fm_map *fm, int *min_chunkf) +{ + int num_chunkf; /* Number of chunks to iterate over */ + int mpi_code; /* MPI return code */ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_min_chunk); + + /* Get the number of chunks to perform I/O on */ + num_chunkf = H5SL_count(fm->fsel); + + /* Determine the minimum # of chunks for all processes */ + if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, min_chunkf, 1, MPI_INT, MPI_MIN, io_info->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) + +done: FUNC_LEAVE_NOAPI(ret_value); -} /* end H5D_mpio_spaces_write() */ +} /* end H5D_mpio_get_min_chunk() */ +/*------------------------------------------------------------------------- + * Function: H5D_mpio_get_sum_chunk + * + * Purpose: Routine for obtaining total number of chunks to cover + * hyperslab selection selected by all processors. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info, + const fm_map *fm, int *sum_chunkf) +{ + int num_chunkf; /* Number of chunks to iterate over */ + size_t ori_num_chunkf; + int mpi_code; /* MPI return code */ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_sum_chunk); + + /* Get the number of chunks to perform I/O on */ + num_chunkf = 0; + ori_num_chunkf = H5SL_count(fm->fsel); + H5_ASSIGN_OVERFLOW(num_chunkf,ori_num_chunkf,size_t,int); + + /* Determine the minimum # of chunks for all processes */ + if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, sum_chunkf, 1, MPI_INT, MPI_SUM, io_info->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) + +done: + FUNC_LEAVE_NOAPI(ret_value); +} /* end H5D_mpio_get_sum_chunk() */ + + +/*------------------------------------------------------------------------- + * Function: H5D_contig_collective_io + * + * Purpose: Routine for + * 1) building up MPI derived datatype + * 2) setting up collective IO property list + * 3) Do IO + * + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +herr_t +H5D_contig_collective_io(H5D_io_info_t *io_info, + const H5S_t *file_space, + const H5S_t *mem_space, + void *buf, + hbool_t do_write) +{ + + + haddr_t addr = HADDR_UNDEF; /* Address of dataset (or selection) within file */ + hbool_t select_valid; + herr_t ret_value = SUCCEED; /* return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_contig_collective_io) + assert (IS_H5FD_MPIO(io_info->dset->oloc.file)); + + /* Make certain we have the correct type of property list */ + assert(TRUE==H5P_isa_class(io_info->dxpl_id,H5P_DATASET_XFER)); + + /* Get the base address of the contiguous dataset */ + if(io_info->dset->shared->layout.type == H5D_CONTIGUOUS) + addr = H5D_contig_get_addr(io_info->dset); + + if(H5D_inter_collective_io(io_info,file_space,mem_space,addr,buf,do_write)<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); + + done: + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_contig_collective_io */ + +/*------------------------------------------------------------------------- + * Function: H5D_chunk_collective_io + * + * Purpose: Routine for + 1) choose an IO option: + a) One collective IO defined by one MPI derived datatype to link through all chunks + or b) multiple chunk IOs,to do MPI-IO for each chunk, the IO mode may be adjusted + due to the selection pattern for each chunk. + * For option a) + 1. Sort the chunk address, obtain chunk info according to the sorted chunk address + 2. Build up MPI derived datatype for each chunk + 3. Build up the final MPI derived datatype + 4. Set up collective IO property list + 5. Do IO + * For option b) + 1. Use MPI_gather and MPI_Bcast to obtain information of *collective/independent/none* + IO mode for each chunk of the data space + 2. Depending on whether the IO mode is collective or independent or none, + Create either MPI derived datatype for each chunk to do collective IO or just do independent IO + 3. Set up collective IO property list for collective mode + 4. DO IO + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +herr_t +H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, hbool_t do_write) +{ + + int io_option = H5D_MULTI_CHUNK_IO; + int min_chunk,sum_chunk,mpi_size; + int one_link_chunk_io_threshold; + herr_t ret_value = SUCCEED; + + + FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_collective_io) + + assert (IS_H5FD_MPIO(io_info->dset->oloc.file)); + + if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes"); + if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0) + HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size"); + one_link_chunk_io_threshold = H5D_ONE_LINK_CHUNK_IO_THRESHOLD;/*This should be replaced by the user inputting value from API. */ + + /* step 1: choose an IO option */ + /* If the average number of chunk per process is greater than a threshold, we will do one link chunked IO. */ + if(sum_chunk/mpi_size >= one_link_chunk_io_threshold) io_option = H5D_ONE_LINK_CHUNK_IO; + +/* If this MPI-IO package doesn't support collective IO when no IO is done for one or more processes, + use MULTIPLE CHUNK IO */ +/* +#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS + if(H5D_mpio_get_min_chunk(io_info,fm,&min_chunk)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes"); + if(min_chunk == 0) io_option = H5D_MULTI_CHUNK_IO; +#endif +*/ + +#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + if(io_option == H5D_ONE_LINK_CHUNK_IO ) io_option = H5D_MULTI_CHUNK_IO ;/* We can not do this with one chunk IO. */ +#endif + + /* step 2: Go ahead to do IO.*/ + if(io_option == H5D_ONE_LINK_CHUNK_IO) { + if(H5D_link_chunk_collective_io(io_info,fm,buf,do_write,sum_chunk)<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish linked chunk MPI-IO"); + } + else {/* multiple chunk IOs */ + printf("coming into multiple chunk \n"); + if(H5D_multi_chunk_collective_io(io_info,fm,buf,do_write)<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish multiple chunk MPI-IO"); + } + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_chunk_collective_io */ + +/*------------------------------------------------------------------------- + * Function: H5D_link_chunk_collective_io + * + * Purpose: Routine for one collective IO with one MPI derived datatype to link with all chunks + + 1. Sort the chunk address and chunk info + 2. Build up MPI derived datatype for each chunk + 3. Build up the final MPI derived datatype + 4. Use common collective IO routine to do MPI-IO + + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +static herr_t +H5D_link_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, hbool_t do_write,int sum_chunk) +{ + + + size_t src_type_size; /*size of source type */ + size_t dst_type_size; /*size of destination type*/ + hsize_t mpi_buf_extra_offset; + hsize_t mpi_file_extra_offset; + size_t mpi_buf_count; + size_t mpi_file_count; + hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */ + mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */ + + int mpi_size,mpi_code; /* MPI return code */ + + int i,num_chunk,total_chunks; + size_t ori_num_chunk; + hsize_t ori_total_chunks; + haddr_t chunk_base_addr; + haddr_t* total_chunk_addr_array; + MPI_Datatype *chunk_mtype; + MPI_Datatype *chunk_ftype; + MPI_Datatype chunk_final_mtype; + MPI_Datatype chunk_final_ftype; + MPI_Aint *chunk_disp_array; + MPI_Aint *chunk_mem_disp_array; + int *blocklen; + int blocklen_value; + int actual_bsearch_coll_chunk_threshold; + int bsearch_coll_chunk_threshold; + int bsearch_chunk_ratio; + int bsearch_chunk_threshold; + int many_chunk_opt = 0; + + H5D_common_coll_info_t coll_info; + H5D_chunk_addr_info_t* chunk_addr_info_array; + +#ifdef CC_PERF + char *bc_percent = NULL; + char *bcc_percent = NULL; +#endif + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_link_chunk_collective_io) + ori_total_chunks = fm->total_chunks; + H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int); + + /* Handle with a special case when only one chunk is covered by all processes */ + if(total_chunks == 1){ + H5SL_node_t *chunk_node; + H5D_chunk_info_t *chunk_info; + H5D_storage_t store; + + chunk_node = H5SL_first(fm->fsel); + if(chunk_node == NULL) { + if(H5D_istore_chunkmap(io_info,total_chunks,&chunk_base_addr,fm->down_chunks)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + if(H5D_inter_collective_io(io_info,NULL,NULL,chunk_base_addr,buf,do_write)<0) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); + } + else { + if(NULL ==(chunk_info = H5SL_item(chunk_node))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + io_info->store = &store; + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + + if(HADDR_UNDEF==(chunk_base_addr = H5D_istore_get_addr(io_info,NULL))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + +#ifdef KENT +printf("before inter_collective_io for total chunk = 1 \n"); +#endif + if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,chunk_base_addr,buf,do_write)<0) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); + } + goto done; + } + + /* Allocate chunking information */ + + ori_num_chunk = H5SL_count(fm->fsel); + H5_ASSIGN_OVERFLOW(num_chunk,ori_num_chunk,size_t,int); + + + if(num_chunk == 0) total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks); + else + { + chunk_addr_info_array= H5MM_malloc(num_chunk*sizeof(H5D_chunk_addr_info_t)); + chunk_mtype = H5MM_malloc(num_chunk*sizeof(MPI_Datatype)); + chunk_ftype = H5MM_malloc(num_chunk*sizeof(MPI_Datatype)); + chunk_disp_array = H5MM_malloc(num_chunk*sizeof(MPI_Aint)); + chunk_mem_disp_array = H5MM_calloc(num_chunk*sizeof(MPI_Aint)); + blocklen = H5MM_malloc(num_chunk*sizeof(int)); + } + + /* Obtain information to do collective IO, + in order to do collective IO, no datatype conversion should happen. */ + if((src_type_size = H5T_get_size(io_info->dset->shared->type))==0) + HGOTO_ERROR(H5E_DATATYPE, H5E_BADSIZE, FAIL, "datatype size invalid"); + dst_type_size = src_type_size; + + +#ifdef CC_PERF + /* "bcc" means 'b-tree iterately obtain all chunk addresses collectively', + "bc" means 'b-tree iterately obtain all chunk addresses individually', + the default one means 'obtaining the chunk address individually', + */ + + if(bcc_percent=getenv("BCC_PERCENT")){ + bsearch_coll_chunk_threshold = atoi(bcc_percent); + assert((bsearch_coll_chunk_threshold >=0) &&(bsearch_coll_chunk_threshold <=100)); + } + else + bsearch_coll_chunk_threshold = H5D_ALL_CHUNK_ADDR_THRES_COL; +#else + bsearch_coll_chunk_threshold = H5D_ALL_CHUNK_ADDR_THRES_COL; /*This number may be changed according to the performance study */ +#endif + + if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0) + HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size"); + + /* Calculate the actual threshold to obtain all chunk addresses collectively + The bigger this number is, the more possible the use of obtaining chunk address collectively. */ + actual_bsearch_coll_chunk_threshold = sum_chunk*100/(total_chunks*mpi_size); + + if(actual_bsearch_coll_chunk_threshold >= bsearch_coll_chunk_threshold) + many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_COL; + + else { + +#ifdef CC_PERF + if(bc_percent=getenv("BC_PERCENT")){ + bsearch_chunk_ratio = atoi(bc_percent); + assert((bsearch_chunk_ratio<=100)&&(bsearch_chunk_ratio>=0)); + } + else + bsearch_chunk_ratio = H5D_ALL_CHUNK_ADDR_THRES_IND; +#else + bsearch_chunk_ratio = H5D_ALL_CHUNK_ADDR_THRES_IND; /*This number may be changed according to the performance study */ +#endif + + /* This threshold is to check whether we can use iterator to obtain all chunk addresses. + The unit of the threshold is the number of chunks. The value should be at least 1. + It can be calculated as follows: + + if(total_chunks*bsearch_chunk_ratio/100 <=1) + bsearch_chunk_threahold = 1; + else + bsearch_chunk_threshold = total_chunks*bsearch_chunk_ratio/100; + In order to make the caluculation more efficient, + we use the following approximate formula to calculate the threshold. + + bsearch_chunk_threshold = 1+ (total_chunks*bsearch_chunk_ratio-99)/100; + + The only difference is when total_chunks* besearch_chunk_ratio == 100n+99; + the approximate formula will give value (n+1) instead of n for threshold. + That shouldn't matter much from our persective. + */ + + bsearch_chunk_threshold = 1 +(total_chunks*bsearch_chunk_ratio-99)/100; + if(num_chunk > bsearch_chunk_threshold) many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_IND; + } + + /* Sort the chunk address + when chunk optimization selection is either H5D_OBTAIN_*/ + if(num_chunk == 0){ + if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + chunk_base_addr = total_chunk_addr_array[0]; + } + + else { + if(H5D_sort_chunk(io_info,fm,chunk_addr_info_array,many_chunk_opt)<0) + HGOTO_ERROR (H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to sort chunk address"); + chunk_base_addr = chunk_addr_info_array[0].chunk_addr; + } + + /* Obtain MPI derived datatype from all individual chunks */ + for ( i = 0; i < num_chunk; i++) { + /* Disk MPI derived datatype */ + if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.fspace,src_type_size,&chunk_ftype[i], + &mpi_file_count,&mpi_file_extra_offset,&mft_is_derived)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type"); + + /* Buffer MPI derived datatype */ + if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.mspace,dst_type_size,&chunk_mtype[i], + &mpi_buf_count,&mpi_buf_extra_offset,&mbt_is_derived)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buf type"); + + /* Chunk address relative to the first chunk */ + chunk_addr_info_array[i].chunk_addr -= chunk_base_addr; + H5_ASSIGN_OVERFLOW(chunk_disp_array[i],chunk_addr_info_array[i].chunk_addr,haddr_t,MPI_Aint); + /*chunk_disp_array[i] = (MPI_Aint)chunk_addr_array[i];*/ + } + + blocklen_value = 1; + if(num_chunk){ + + /* initialize the buffer with the constant value 1 */ + H5V_array_fill(blocklen,&blocklen_value,sizeof(int),(size_t)num_chunk); + + /* Create final MPI derived datatype */ + if(MPI_SUCCESS != (mpi_code = MPI_Type_struct(num_chunk,blocklen,chunk_disp_array,chunk_ftype,&chunk_final_ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code); + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + if(MPI_SUCCESS != (mpi_code = MPI_Type_struct(num_chunk,blocklen,chunk_mem_disp_array,chunk_mtype,&chunk_final_mtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code); + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_mtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + for ( i = 0; i< num_chunk;i++){ + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( chunk_mtype+i ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( chunk_ftype+i ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + } + + /* buffer, file derived datatypes should be true */ + coll_info.mbt_is_derived = 1; + coll_info.mft_is_derived = 1; + coll_info.mpi_buf_count = 1; + coll_info.chunk_addr = chunk_base_addr; + + } + + else {/* no selection at all for this process */ + chunk_final_ftype = MPI_BYTE; + chunk_final_mtype = MPI_BYTE; + + /* buffer, file derived datatypes should be true */ + coll_info.mbt_is_derived = 0; + coll_info.mft_is_derived = 0; + coll_info.mpi_buf_count = 0; + coll_info.chunk_addr = chunk_base_addr; + } + if(H5D_final_collective_io(io_info,&chunk_final_ftype,&chunk_final_mtype,&coll_info,buf,do_write)<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish MPI-IO"); + + done: + if(num_chunk == 0) HDfree(total_chunk_addr_array); + else { + if (fm->total_chunks != 1) { + HDfree(chunk_addr_info_array); + HDfree(chunk_mtype); + HDfree(chunk_ftype); + HDfree(chunk_disp_array); + HDfree(chunk_mem_disp_array); + HDfree(blocklen); + } + } + + FUNC_LEAVE_NOAPI(ret_value) +} +/* end H5D_link_chunk_collective_io */ + + +/*------------------------------------------------------------------------- + * Function: H5D_multi_chunk_collective_io + * + * Purpose: To do IO per chunk according to IO mode(collective/independent/none) + + 1. Use MPI_gather and MPI_Bcast to obtain IO mode in each chunk(collective/independent/none) + 2. Depending on whether the IO mode is collective or independent or none, + Create either MPI derived datatype for each chunk or just do independent IO + 3. Use common collective IO routine to do MPI-IO + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,void *buf, hbool_t do_write) +{ + + int i,total_chunk; + int mpi_rank; + hsize_t ori_total_chunk; + uint8_t *chunk_io_option; + + H5SL_node_t *chunk_node; /* Current node in chunk skip list */ + H5D_chunk_info_t *chunk_info; + haddr_t *chunk_addr; + H5D_storage_t store; /* union of EFL and chunk pointer in file space */ + hbool_t select_chunk; + hbool_t last_io_mode_coll = TRUE; + herr_t ret_value = SUCCEED; + + + FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io) +#ifdef KENT + mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file); +#endif + /* Allocate memories */ + ori_total_chunk = fm->total_chunks; + H5_ASSIGN_OVERFLOW(total_chunk,ori_total_chunk,hsize_t,int); + HDassert(total_chunk!=0); + chunk_io_option = (uint8_t *)H5MM_calloc(total_chunk*sizeof(MPI_BYTE)); + chunk_addr = (haddr_t *)H5MM_calloc(total_chunk*sizeof(haddr_t)); +#ifdef KENT + printf("total_chunk %d\n",total_chunk); +#endif + + /* obtain IO option for each chunk */ + if(H5D_obtain_mpio_mode(io_info,fm,chunk_io_option,chunk_addr)<0) + HGOTO_ERROR (H5E_DATASET, H5E_CANTRECV, FAIL, "unable to obtain MPIO mode"); + + for( i = 0; i<total_chunk;i++){ + +#ifdef KENT +printf("mpi_rank = %d, chunk index = %d\n",mpi_rank,i); +#endif + select_chunk = fm->select_chunk[i]; + if(select_chunk == 1){/* Have selection elements in this chunk. Find the chunk info. */ +#ifdef NEW_WAY + if(NULL ==(chunk_info = H5SL_item(chunk_node))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + HDassert(chunk_info->index == i); + + /* Set dataset storage for I/O info */ + io_info->store=&store; + /* Pass in chunk's coordinates in a union. */ + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + if(NULL ==(chunk_node = H5SL_first(fm->fsel))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list"); +#else + + if(NULL ==(chunk_node = H5SL_first(fm->fsel))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list"); + while(chunk_node){ + + if(NULL ==(chunk_info = H5SL_item(chunk_node))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + if(chunk_info->index == i) { + /* Set dataset storage for I/O info */ + io_info->store=&store; + /* Pass in chunk's coordinates in a union. */ + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + break; + } + + chunk_node = H5SL_next(chunk_node); + } +#endif + } + + if(chunk_io_option[i] == 1){ /*collective IO for this chunk, + note: even there is no selection for this process, + the process still needs to contribute MPI NONE TYPE.*/ +#ifdef KENT +printf("inside collective chunk IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i); +#endif + + if(!last_io_mode_coll) + /* Switch back to collective I/O */ + if(H5D_ioinfo_make_coll(io_info) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O") + + if(select_chunk){ + if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace, + chunk_addr[i],buf,do_write )<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); + + } + else{ + if(H5D_inter_collective_io(io_info,NULL,NULL, + chunk_addr[i],buf,do_write )<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO"); + + } + + + last_io_mode_coll = TRUE; + + } + else {/*possible independent IO for this chunk*/ +#ifdef KENT +printf("inside independent IO mpi_rank = %d, chunk index = %d\n",mpi_rank,i); +#endif + + HDassert(chunk_io_option[i] == 0); + if(!select_chunk) continue; /* this process has nothing to do with this chunk, continue! */ + if(last_io_mode_coll) + /* Switch to independent I/O */ + if(H5D_ioinfo_make_ind(io_info) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") + + if(do_write) { + ret_value = (io_info->ops.write)(io_info, + chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type), + chunk_info->fspace,chunk_info->mspace,0, + buf); + /* Check return value of the write */ + if (ret_value<0) + HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed") + } + else { + ret_value = (io_info->ops.read)(io_info, + chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type), + chunk_info->fspace,chunk_info->mspace,0, + buf); + /* Check return value from optimized write */ + if (ret_value<0) + HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed") + } + + last_io_mode_coll = FALSE; + } + } + + if(!last_io_mode_coll) + /* Switch back to collective I/O */ + if(H5D_ioinfo_make_coll(io_info) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O") + + done: + HDfree(chunk_io_option); + HDfree(chunk_addr); + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_multi_chunk_collective_io */ + +/*------------------------------------------------------------------------- + * Function: H5D_inter_collective_io + * + * Purpose: Routine for the shared part of collective IO between multiple chunk + collective IO and contiguous collective IO + + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,const H5S_t *mem_space, + haddr_t addr, void *buf, hbool_t do_write ) +{ + + size_t mpi_buf_count, mpi_file_count; /* Number of "objects" to transfer */ + MPI_Datatype mpi_file_type,mpi_buf_type; + hsize_t mpi_buf_offset, mpi_file_offset; /* Offset within dataset where selection (ie. MPI type) begins */ + hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */ + mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */ + H5D_common_coll_info_t coll_info; + herr_t ret_value = SUCCEED; /* return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_inter_collective_io) + if((file_space!=NULL) && (mem_space != NULL)) { + /*Obtain disk and memory MPI derived datatype */ + if(H5S_mpio_space_type(file_space,H5T_get_size(io_info->dset->shared->type), + &mpi_file_type,&mpi_file_count,&mpi_file_offset,&mft_is_derived)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type"); + + if(H5S_mpio_space_type(mem_space,H5T_get_size(io_info->dset->shared->type), + &mpi_buf_type,&mpi_buf_count,&mpi_buf_offset,&mbt_is_derived)<0) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buffer type"); + + } + else { + /* For non-selection, participate with a none MPI derived datatype, the count is 0. */ + mpi_buf_type = MPI_BYTE; + mpi_file_type = MPI_BYTE; + mpi_file_count = 0; + mpi_buf_count = 0; + } + + coll_info.mbt_is_derived = mbt_is_derived; + coll_info.mft_is_derived = mft_is_derived; + coll_info.mpi_buf_count = mpi_buf_count; + coll_info.chunk_addr = addr; + +#ifdef KENT +printf("before final collective IO\n"); +#endif + if(H5D_final_collective_io(io_info,&mpi_file_type,&mpi_buf_type,&coll_info,buf,do_write)<0) + HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish collective MPI-IO"); + done: + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_inter_collective_io */ +/*------------------------------------------------------------------------- + * Function: H5D_final_collective_io + * + * Purpose: Routine for the common part of collective IO with different storages. + + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Datatype *mpi_buf_type, + H5D_common_coll_info_t* coll_info, void *buf, hbool_t do_write) +{ + + + int mpi_code; /* MPI return code */ + hbool_t plist_is_setup=0; /* Whether the dxpl has been customized */ + herr_t ret_value = SUCCEED; + + + FUNC_ENTER_NOAPI_NOINIT(H5D_final_collective_io) + + /* + * Pass buf type, file type to the file driver. + */ + + if(H5FD_mpi_setup_collective(io_info->dxpl_id, *mpi_buf_type, *mpi_file_type)<0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties"); + + plist_is_setup=1; + /*HDfprintf(stdout,"chunk addr %Hu\n",coll_info->chunk_addr); + printf("mpi_buf_count %d\n",coll_info->mpi_buf_count); */ + if(do_write) { + ret_value = (io_info->ops.write)(io_info, + coll_info->mpi_buf_count,0,NULL,NULL,coll_info->chunk_addr, + buf); + /* Check return value from optimized write */ + if (ret_value<0) + HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed") + } + else { + ret_value = (io_info->ops.read)(io_info, + coll_info->mpi_buf_count,0,NULL,NULL,coll_info->chunk_addr, + buf); + /* Check return value from optimized write */ + if (ret_value<0) + HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed") + } + done: + + /* Reset the dxpl settings */ + if(plist_is_setup) { + if(H5FD_mpi_teardown_collective(io_info->dxpl_id)<0) + HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "unable to reset dxpl values"); + } /* end if */ + + /* free the MPI buf and file types */ + if (coll_info->mbt_is_derived) { + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_buf_type ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + + } + if (coll_info->mft_is_derived) { + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_file_type ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + } + + FUNC_LEAVE_NOAPI(ret_value) +}/* end H5D_final_collective_io */ + +#ifdef OLD_WAY +/*------------------------------------------------------------------------- + * Function: H5D_pre_sort_chunk + * + * Purpose: Routine to obtain addresses of all chunks for all processes + + Description: + root will collective all chunk addresses and broadcast towards other processes. + + Parameters: + + Input: H5D_io_info_t* io_info, + int total_chunks, + Output: haddr_t total_chunk_addr_array[], : array to store sorted total chunk addresses + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_pre_sort_chunk(H5D_io_info_t *io_info,int total_chunks,haddr_t total_chunk_addr_array[]){ + + int root, mpi_rank; + MPI_Comm comm; + int mpi_code; + MPI_Datatype chunk_addrtype; + int mpi_type_cleanup = 0; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_pre_sort_chunk) + + root = 0; + comm = io_info->comm; + if(MPI_SUCCESS !=(mpi_code = MPI_Comm_rank(comm,&mpi_rank))) + HMPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + + /*Create received MPI derived datatype */ + if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t)*total_chunks,MPI_BYTE,&chunk_addrtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code); + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_addrtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + + mpi_type_cleanup = 1; + + if(mpi_rank == root) { + if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + + } + /* Broadcasting the MPI_IO option info. and chunk address info. */ + if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(total_chunk_addr_array,1,chunk_addrtype,root,comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code); + +done: + + if(mpi_type_cleanup){ + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + } + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_pre_sort_chunk() */ + +#endif +/*------------------------------------------------------------------------- + * Function: H5D_sort_chunk + * + * Purpose: Routine to sort chunks in increasing order of chunk address + Each chunk address is also obtained. + + Description: + For most cases, the chunk address has already been sorted in increasing order. + The special sorting flag is used to optimize this common case. + quick sort is used for necessary sorting. + + Parameters: + Input: H5D_io_info_t* io_info, + fm_map *fm(global chunk map struct) + Input/Output: H5D_chunk_addr_info_t chunk_addr_info_array[] : array to store chunk address and information + many_chunk_opt : flag to optimize the way to obtain chunk addresses + for many chunks + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +static herr_t +H5D_sort_chunk(H5D_io_info_t * io_info, + fm_map *fm, + H5D_chunk_addr_info_t chunk_addr_info_array[], + int many_chunk_opt) +{ + + + H5SL_node_t *chunk_node; /* Current node in chunk skip list */ + H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */ + haddr_t chunk_addr; /* Current chunking address of this node */ + haddr_t *total_chunk_addr_array; /* The array of chunk address for the total number of chunk */ + int i,j,k,mpi_code; + int total_chunks; + size_t num_chunks; + int mpi_type_cleanup = 0; + int tchunk_addr_cleanup = 0; + MPI_Datatype chunk_addrtype; + H5D_storage_t store; /*union of EFL and chunk pointer in file space */ + hbool_t do_sort = FALSE; + herr_t ret_value = SUCCEED; /*return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_sort_chunk) + + num_chunks = H5SL_count(fm->fsel); + /* If we need to optimize the way to obtain the chunk address */ + if(many_chunk_opt != H5D_OBTAIN_ONE_CHUNK_ADDR_IND){ + + total_chunks = (int)fm->total_chunks; + total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks); + tchunk_addr_cleanup = 1; + + if(many_chunk_opt == H5D_OBTAIN_ALL_CHUNK_ADDR_COL) {/* We will broadcast the array from the root process */ + + int mpi_rank, root,mpi_code; + root = 0; + if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file))<0) + HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank"); + + /*Create received MPI derived datatype */ + if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t)*total_chunks,MPI_BYTE,&chunk_addrtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code); + if(MPI_SUCCESS !=(mpi_code = MPI_Type_commit(&chunk_addrtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + mpi_type_cleanup = 1; + + if(mpi_rank == root) { + if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + } + /* Broadcasting the MPI_IO option info. and chunk address info. */ + if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(total_chunk_addr_array,1,chunk_addrtype,root,io_info->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code); + } + + else { /* Obtain all chunk addresses independently */ + if(H5D_istore_chunkmap(io_info,total_chunks,total_chunk_addr_array,fm->down_chunks)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + } + } + + /* Get first node in chunk skip list */ + if(NULL ==(chunk_node = H5SL_first(fm->fsel))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list"); + /* Set dataset storage for I/O info */ + io_info->store = &store; + if(NULL ==(chunk_info = H5SL_item(chunk_node))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + i = 0; + if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){ + if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + } + else + chunk_addr = total_chunk_addr_array[chunk_info->index]; + chunk_addr_info_array[i].chunk_addr = chunk_addr; + chunk_addr_info_array[i].chunk_info = *chunk_info; + + chunk_node = H5SL_next(chunk_node); + while(chunk_node) { + + chunk_info = H5SL_item(chunk_node); + store.chunk.offset = chunk_info->coords; + store.chunk.index = chunk_info->index; + + if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){ + if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL))) + HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list"); + } + else + chunk_addr = total_chunk_addr_array[chunk_info->index]; + + if(chunk_addr < chunk_addr_info_array[i].chunk_addr) do_sort = TRUE; + chunk_addr_info_array[i+1].chunk_addr = chunk_addr; + chunk_addr_info_array[i+1].chunk_info =*chunk_info; + i++; + chunk_node = H5SL_next(chunk_node); + } + + if(do_sort) + HDqsort(chunk_addr_info_array,num_chunks,sizeof(chunk_addr_info_array),H5D_cmp_chunk_addr); + +done: + + if(tchunk_addr_cleanup) + HDfree(total_chunk_addr_array); + if(mpi_type_cleanup) { + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + } + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_sort_chunk() */ + + +/*------------------------------------------------------------------------- + * Function: H5D_obtain_mpio_mode + * + * Purpose: Routine to obtain each io mode(collective,independent or none) for each chunk; + Each chunk address is also obtained. + + Description: + + 1) Each process provides two piece of information for all chunks with selection + a) chunk index + b) wheather this chunk is regular(for MPI derived datatype not working case) + + 2) Gather all the information to the root process + + 3) Root process will do the following: + a) Obtain chunk address for all chunks in this data space + b) With the consideration of the user option, calculate IO mode for each chunk + c) Build MPI derived datatype to combine "chunk address" and "assign_io" information + in order to do MPI Bcast only once + d) MPI Bcast the IO mode and chunk address information for each chunk. + 4) Each process then retrieves IO mode and chunk address information to assign_io_mode and chunk_addr. + + Parameters: + + Input: H5D_io_info_t* io_info, + fm_map *fm,(global chunk map struct) + Output: uint8_t assign_io_mode[], : IO mode, collective, independent or none + haddr_t chunk_addr[], : chunk address array for each chunk + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +static herr_t +H5D_obtain_mpio_mode(H5D_io_info_t* io_info, + fm_map *fm, + uint8_t assign_io_mode[], + haddr_t chunk_addr[]) +{ + + int total_chunks; + hsize_t ori_total_chunks; + int percent_nproc_per_chunk,threshold_nproc_per_chunk; + uint8_t* io_mode_info; + uint8_t* recv_io_mode_info; + uint8_t* mergebuf; + uint8_t* tempbuf; + + H5SL_node_t* chunk_node; + H5D_chunk_info_t* chunk_info; + + MPI_Datatype bastype[2]; + MPI_Datatype chunk_addrtype; + int bascount; + int basblock[2]; + MPI_Aint basdisp[2]; + MPI_Datatype rtype; + MPI_Datatype stype; + int mpi_size,mpi_rank; + MPI_Comm comm; + int root; + int mpi_code; + int multi_chunk_io_col_threshold; + int mem_cleanup = 0, + mpi_type_cleanup = 0; + + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_obtain_mpio_mode) + + /* Assign the rank 0 to the root */ + root = 0; + comm = io_info->comm; + + /* Obtain the number of process and the current rank of the process */ + if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file))<0) + HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank"); + if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0) + HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size"); + multi_chunk_io_col_threshold = H5D_MULTI_CHUNK_IO_COL_THRESHOLD; /* May replace by user-input */ + percent_nproc_per_chunk = multi_chunk_io_col_threshold;/* For example, above 50%, do collective IO */ + threshold_nproc_per_chunk = mpi_size * percent_nproc_per_chunk/100; + + /* Allocate memory */ + ori_total_chunks = fm->total_chunks; + H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int); + + io_mode_info = (uint8_t *)H5MM_calloc(total_chunks*sizeof(MPI_BYTE)); + mergebuf = H5MM_malloc((sizeof(haddr_t)+sizeof(MPI_BYTE))*total_chunks); + tempbuf = mergebuf + sizeof(MPI_BYTE)*total_chunks; + if(mpi_rank == root) + recv_io_mode_info = (uint8_t *)H5MM_malloc(total_chunks*sizeof(MPI_BYTE)*mpi_size); + + + mem_cleanup = 1; + + chunk_node = H5SL_first(fm->fsel); + + /*Obtain the regularity and selection information for all chunks in this process. */ + while(chunk_node){ + + chunk_info = H5SL_item(chunk_node); + +#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + /* regularity information: 1, selection information: 2 */ + if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE && + H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE) +#endif + io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_REG; /* this chunk is selected and is "regular" without defining H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS. */ +#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + else + io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_IRREG; /* this chunk is selected and is irregular*/ +#endif + + chunk_node = H5SL_next(chunk_node); + } + + /*Create sent MPI derived datatype */ + if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(total_chunks,MPI_BYTE,&stype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&stype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + /*Create received basic MPI derived datatype */ + bascount = 2; + basblock[0] = total_chunks; + basblock[1] = total_chunks; + basdisp[0] = 0; + basdisp[1] = (MPI_Aint)(sizeof(MPI_BYTE)*total_chunks);/* may need to check overflow */ + bastype[0] = MPI_BYTE; + + if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t),MPI_BYTE,&chunk_addrtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code); + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_addrtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + bastype[1] = chunk_addrtype; + + if(MPI_SUCCESS !=(mpi_code = MPI_Type_struct(bascount,basblock,basdisp,bastype,&rtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code); + if(MPI_SUCCESS !=(mpi_code = MPI_Type_commit(&rtype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + /* Set up a flag to clean up the MPI derived datatype later */ + mpi_type_cleanup = 1; + + /*Gather all the information */ + if(MPI_SUCCESS !=(mpi_code = MPI_Gather(io_mode_info,1,stype,recv_io_mode_info,1,stype,root,comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code); + + /* Calculate the mode for IO(collective, independent or none) at root process */ + if(mpi_rank == root) { + + int ic,nproc; + int* nproc_per_chunk; +#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) + int* ind_this_chunk; +#endif + + /* pre-computing: calculate number of processes and + regularity of the selection occupied in each chunk */ + nproc_per_chunk = (int*)H5MM_calloc(total_chunks*sizeof(int)); +#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) + ind_this_chunk = (int*)H5MM_calloc(total_chunks*sizeof(int)); +#endif + + /* calculating the chunk address */ + if(H5D_istore_chunkmap(io_info,total_chunks,chunk_addr,fm->down_chunks)<0){ + HDfree(nproc_per_chunk); +#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) + HDfree(ind_this_chunk); +#endif + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address"); + } + + /* checking for number of process per chunk and regularity of the selection*/ + for (nproc = 0;nproc <mpi_size;nproc++){ + uint8_t *tmp_recv_io_mode_info = recv_io_mode_info + nproc*total_chunks; + /* calculate the number of process per chunk and adding irregular selection option */ + for(ic = 0; ic < total_chunks; ic++, tmp_recv_io_mode_info++){ + if(*tmp_recv_io_mode_info != 0) { + nproc_per_chunk[ic]++; +#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + if(*tmp_recv_io_mode_info == H5D_CHUNK_SELECT_IRREG) + ind_this_chunk[ic] = 1; +#endif + } +#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS + else { + /*checking whether we have a selection in this chunk */ + ind_this_chunk[ic] = 1; + } +#endif + } + + } + + /* Calculating MPIO mode for each chunk (collective, independent, none) */ + for(ic = 0; ic < total_chunks; ic++){ + if(nproc_per_chunk[ic]>=MAX(2,threshold_nproc_per_chunk)){ +#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) + if(!ind_this_chunk[ic]) assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL; +#else + assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL; +#endif + } + + } + + /* merge buffer io_mode info and chunk addr into one */ + HDmemcpy(mergebuf,assign_io_mode,sizeof(MPI_BYTE)*total_chunks); + HDmemcpy(tempbuf,chunk_addr,sizeof(haddr_t)*total_chunks); + + HDfree(nproc_per_chunk); +#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) + HDfree(ind_this_chunk); +#endif + } + + /* Broadcasting the MPI_IO option info. and chunk address info. */ + if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(mergebuf,1,rtype,root,comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code); + + HDmemcpy(assign_io_mode,mergebuf,sizeof(MPI_BYTE)*total_chunks); + HDmemcpy(chunk_addr,tempbuf,sizeof(haddr_t)*total_chunks); + +done: + + if(mpi_type_cleanup) { + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &stype ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + + if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &rtype ))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + } + + if(mem_cleanup){ + HDfree(io_mode_info); + HDfree(mergebuf); + if(mpi_rank == root) + HDfree(recv_io_mode_info); + } + + FUNC_LEAVE_NOAPI(ret_value) + }/* end H5D_obtain_mpio_mode*/ + +static int +H5D_cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2) +{ + haddr_t addr1, addr2; + + FUNC_ENTER_NOAPI_NOINIT(H5D_cmp_chunk_addr) + + addr1 = ((const H5D_chunk_addr_info_t *)chunk_addr_info1)->chunk_addr; + addr2 = ((const H5D_chunk_addr_info_t *)chunk_addr_info2)->chunk_addr; + + FUNC_LEAVE_NOAPI(H5F_addr_cmp(addr1, addr2)) + + } #endif /* H5_HAVE_PARALLEL */ |