From 853ce47ac0341ec3baa28f2e29888fce7be3147a Mon Sep 17 00:00:00 2001 From: MuQun Yang Date: Thu, 8 Sep 2005 17:23:48 -0500 Subject: [svn-r11379] Purpose: To support collective IO for irregular selection in the upcoming 1.6.5 release Description: Support collective chunk/contiguous IO for both regular and irregular selections. Make collective IO features in 1.6 the same as 1.7. Solution: Platforms tested: AIX 5.1(32bit and 64bit) Linux 2.4(heping)-mpich 1.2.6 Linux 2.4(NCSA teragrid) -mpich 1.2.5 Altrix(cobalt) IRIX 6.5- c compiler version 7.4.3(NCAR SGI) Linux 2.4(tune) - CMPI 2.1.0 Misc. update: --- src/H5Dio.c | 848 +++++++++++++++++++++++++++++++--------------------------- src/H5Dmpio.c | 459 ++++++------------------------- src/H5Dpkg.h | 25 +- src/H5Smpio.c | 503 +++++++++++++++------------------- 4 files changed, 776 insertions(+), 1059 deletions(-) diff --git a/src/H5Dio.c b/src/H5Dio.c index 495467a..b030f95 100644 --- a/src/H5Dio.c +++ b/src/H5Dio.c @@ -98,7 +98,7 @@ H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts, const H5S_t *file_space, H5T_path_t *tpath, hid_t src_id, hid_t dst_id, const void *buf); #ifdef H5_HAVE_PARALLEL -static herr_t +/*static herr_t H5D_io_assist_mpio(hid_t dxpl_id, H5D_dxpl_cache_t *dxpl_cache, hbool_t *xfer_mode_changed); static herr_t @@ -107,19 +107,24 @@ static htri_t H5D_get_collective_io_consensus(const H5F_t *file, const htri_t local_opinion, const unsigned flags); +*/ +static herr_t H5D_ioinfo_make_ind(H5D_io_info_t *io_info); +static herr_t H5D_ioinfo_make_coll(H5D_io_info_t *io_info); +static herr_t H5D_ioinfo_term(H5D_io_info_t *io_info); +static herr_t H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info, + const fm_map *fm, int *min_chunkf); #endif /* H5_HAVE_PARALLEL */ /* I/O info operations */ -static herr_t -H5D_ioinfo_init(H5D_t *dset, const H5D_dxpl_cache_t *dxpl_cache, hid_t dxpl_id, - const H5S_t *mem_space, const H5S_t *file_space, - unsigned flags, hbool_t *use_par_opt_io, H5D_io_info_t *io_info); 
+static herr_t H5D_ioinfo_init(H5D_t *dset, const H5D_dxpl_cache_t *dxpl_cache, + hid_t dxpl_id, const H5S_t *mem_space, const H5S_t *file_space, + H5T_path_t *tpath, H5D_io_info_t *io_info); /* Chunk operations */ static herr_t H5D_create_chunk_map(const H5D_t *dataset, const H5T_t *mem_type, const H5S_t *file_space, const H5S_t *mem_space, fm_map *fm); static herr_t H5D_destroy_chunk_map(const fm_map *fm); -static herr_t H5D_free_chunk_info(void *item, void UNUSED *key, void UNUSED *opdata); +static herr_t H5D_free_chunk_info(void *item, void *key, void *opdata); static herr_t H5D_create_chunk_file_map_hyper(const fm_map *fm); static herr_t H5D_create_chunk_mem_map_hyper(const fm_map *fm); static herr_t H5D_chunk_file_cb(void *elem, hid_t type_id, unsigned ndims, @@ -642,17 +647,10 @@ H5D_read(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space, H5T_path_t *tpath = NULL; /*type conversion info */ const H5T_t *mem_type = NULL; /* Memory datatype */ H5D_io_info_t io_info; /* Dataset I/O info */ - hbool_t use_par_opt_io=FALSE; /* Whether the 'optimized' I/O routines with be parallel */ -#ifdef H5_HAVE_PARALLEL - hbool_t xfer_mode_changed=FALSE; /* Whether the transfer mode was changed */ -#ifdef H5_HAVE_INSTRUMENTED_LIBRARY - int prop_value,new_value; - htri_t check_prop; -#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */ -#endif /*H5_HAVE_PARALLEL*/ + hbool_t io_info_init = FALSE; /* Whether the I/O info has been initialized */ H5D_dxpl_cache_t _dxpl_cache; /* Data transfer property cache buffer */ H5D_dxpl_cache_t *dxpl_cache=&_dxpl_cache; /* Data transfer property cache */ - unsigned sconv_flags=0; /* Flags for the space conversion */ + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI_NOINIT(H5D_read) @@ -681,10 +679,7 @@ H5D_read(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space, if (dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE && !IS_H5FD_MPI(dataset->ent.file)) HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "collective access for 
MPI-based drivers only") - /* Set the "parallel I/O possible" flag, for H5S_find(), if we are doing collective I/O */ - /* (Don't set the parallel I/O possible flag for the MPI-posix driver, since it doesn't do real collective I/O) */ - if (H5S_mpi_opt_types_g && dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE && !IS_H5FD_MPIPOSIX(dataset->ent.file)) - sconv_flags |= H5S_CONV_PAR_IO_POSSIBLE; + #endif /*H5_HAVE_PARALLEL*/ /* Make certain that the number of elements in each selection is the same */ @@ -742,56 +737,12 @@ H5D_read(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space, if (NULL==(tpath=H5T_path_find(dataset->shared->type, mem_type, NULL, NULL, dxpl_id))) HGOTO_ERROR(H5E_DATASET, H5E_UNSUPPORTED, FAIL, "unable to convert between src and dest data types") - /* Set the storage flags for the space conversion check */ - switch(dataset->shared->layout.type) { - case H5D_COMPACT: - sconv_flags |= H5S_CONV_STORAGE_COMPACT; - break; - - case H5D_CONTIGUOUS: - sconv_flags |= H5S_CONV_STORAGE_CONTIGUOUS; - break; - - case H5D_CHUNKED: - sconv_flags |= H5S_CONV_STORAGE_CHUNKED; - break; - - default: - assert(0 && "Unhandled layout type!"); - } /* end switch */ + /* Set up I/O operation */ - if(H5D_ioinfo_init(dataset,dxpl_cache,dxpl_id,mem_space,file_space,sconv_flags,&use_par_opt_io,&io_info)<0) + if(H5D_ioinfo_init(dataset,dxpl_cache,dxpl_id,mem_space,file_space,tpath,&io_info)<0) HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "unable to set up I/O operation") - -#ifdef H5_HAVE_PARALLEL -#ifdef H5_HAVE_INSTRUMENTED_LIBRARY - /**** Test for collective chunk IO - notice the following code should be removed after - a more general collective chunk IO algorithm is applied. 
- */ - - if(dataset->shared->layout.type == H5D_CHUNKED) { /*only check for chunking storage */ - check_prop = H5Pexist(dxpl_id,H5D_XFER_COLL_CHUNK_NAME); - if(check_prop < 0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to check property list"); - if(check_prop > 0) { - if(H5Pget(dxpl_id,H5D_XFER_COLL_CHUNK_NAME,&prop_value)<0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to get property value"); - if(!use_par_opt_io) { - new_value = 0; - if(H5Pset(dxpl_id,H5D_XFER_COLL_CHUNK_NAME,&new_value)<0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value"); - } - } - } - /* end Test for collective chunk IO */ -#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */ - - /* Don't reset the transfer mode if we can't or won't use it */ - if(!use_par_opt_io || !H5T_path_noop(tpath)) - H5D_io_assist_mpio(dxpl_id, dxpl_cache, &xfer_mode_changed); -#endif /*H5_HAVE_PARALLEL*/ + io_info_init = TRUE; /* Determine correct I/O routine to invoke */ if(dataset->shared->layout.type!=H5D_CHUNKED) { @@ -807,9 +758,10 @@ H5D_read(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space, done: #ifdef H5_HAVE_PARALLEL - /* Restore xfer_mode due to the kludge */ - if (xfer_mode_changed) - H5D_io_restore_mpio(dxpl_id); + /* Shut down io_info struct */ + if (io_info_init) + if(H5D_ioinfo_term(&io_info) < 0) + HDONE_ERROR(H5E_DATASET, H5E_CANTCLOSEOBJ, FAIL, "can't shut down io_info") #endif /*H5_HAVE_PARALLEL*/ FUNC_LEAVE_NOAPI(ret_value) @@ -871,17 +823,9 @@ H5D_write(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space, H5T_path_t *tpath = NULL; /*type conversion info */ const H5T_t *mem_type = NULL; /* Memory datatype */ H5D_io_info_t io_info; /* Dataset I/O info */ - hbool_t use_par_opt_io=FALSE; /* Whether the 'optimized' I/O routines with be parallel */ -#ifdef H5_HAVE_PARALLEL - hbool_t xfer_mode_changed=FALSE; /* Whether the transfer mode was changed */ -#ifdef H5_HAVE_INSTRUMENTED_LIBRARY - int prop_value,new_value; - htri_t check_prop; -#endif 
/* H5_HAVE_INSTRUMENTED_LIBRARY */ -#endif /*H5_HAVE_PARALLEL*/ + hbool_t io_info_init = FALSE; /* Whether the I/O info has been initialized */ H5D_dxpl_cache_t _dxpl_cache; /* Data transfer property cache buffer */ H5D_dxpl_cache_t *dxpl_cache=&_dxpl_cache; /* Data transfer property cache */ - unsigned sconv_flags=0; /* Flags for the space conversion */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI_NOINIT(H5D_write) @@ -901,23 +845,6 @@ H5D_write(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space, dataset->shared->checked_filters = TRUE; } - - /* If MPI based VFD is used, no VL datatype support yet. */ - /* This is because they use the global heap in the file and we don't */ - /* support parallel access of that yet */ - if (IS_H5FD_MPI(dataset->ent.file) && H5T_detect_class(mem_type, H5T_VLEN)>0) - HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing VL datatypes yet") - /* If MPI based VFD is used, no VL datatype support yet. */ - /* This is because they use the global heap in the file and we don't */ - /* support parallel access of that yet */ - /* We should really use H5T_detect_class() here, but it will be difficult - * to detect the type of the reference if it is nested... 
-QAK - */ - if (IS_H5FD_MPI(dataset->ent.file) && - H5T_get_class(mem_type, TRUE)==H5T_REFERENCE && - H5T_get_ref_type(mem_type)==H5R_DATASET_REGION) - HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing region reference datatypes yet") - /* Check if we are allowed to write to this file */ if (0==(H5F_get_intent(dataset->ent.file) & H5F_ACC_RDWR)) HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "no write intent on file") @@ -926,6 +853,32 @@ H5D_write(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space, if (H5D_get_dxpl_cache(dxpl_id,&dxpl_cache)<0) HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't fill dxpl cache") + /* Various MPI based checks */ + if (IS_H5FD_MPI(dataset->ent.file)) { + /* If MPI based VFD is used, no VL datatype support yet. */ + /* This is because they use the global heap in the file and we don't */ + /* support parallel access of that yet */ + if(H5T_detect_class(mem_type, H5T_VLEN)>0) + HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing VL datatypes yet") + + /* If MPI based VFD is used, no VL datatype support yet. */ + /* This is because they use the global heap in the file and we don't */ + /* support parallel access of that yet */ + /* We should really use H5T_detect_class() here, but it will be difficult + * to detect the type of the reference if it is nested... 
-QAK + */ + if (H5T_get_class(mem_type, TRUE)==H5T_REFERENCE && + H5T_get_ref_type(mem_type)==H5R_DATASET_REGION) + HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing region reference datatypes yet") + } /* end if */ +#ifdef H5_HAVE_PARALLEL + else { + /* Collective access is not permissible without a MPI based VFD */ + if (dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE) + HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "collective access for MPI-based driver only") + } /* end else */ +#endif /*H5_HAVE_PARALLEL*/ + if (!file_space) file_space = dataset->shared->space; if (!mem_space) @@ -939,10 +892,6 @@ H5D_write(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space, if (dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE && !IS_H5FD_MPI(dataset->ent.file)) HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "collective access for MPI-based driver only") - /* Set the "parallel I/O possible" flag, for H5S_find(), if we are doing collective I/O */ - /* (Don't set the parallel I/O possible flag for the MPI-posix driver, since it doesn't do real collective I/O) */ - if (H5S_mpi_opt_types_g && dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE && !IS_H5FD_MPIPOSIX(dataset->ent.file)) - sconv_flags |= H5S_CONV_PAR_IO_POSSIBLE; #endif /*H5_HAVE_PARALLEL*/ /* Make certain that the number of elements in each selection is the same */ @@ -991,56 +940,11 @@ H5D_write(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space, if (NULL==(tpath=H5T_path_find(mem_type, dataset->shared->type, NULL, NULL, dxpl_id))) HGOTO_ERROR(H5E_DATASET, H5E_UNSUPPORTED, FAIL, "unable to convert between src and dest data types") - /* Set the storage flags for the space conversion check */ - switch(dataset->shared->layout.type) { - case H5D_COMPACT: - sconv_flags |= H5S_CONV_STORAGE_COMPACT; - break; - - case H5D_CONTIGUOUS: - sconv_flags |= H5S_CONV_STORAGE_CONTIGUOUS; - break; - - case H5D_CHUNKED: - sconv_flags |= H5S_CONV_STORAGE_CHUNKED; - break; - - default: - assert(0 
&& "Unhandled layout type!"); - } /* end switch */ /* Set up I/O operation */ - if(H5D_ioinfo_init(dataset,dxpl_cache,dxpl_id,mem_space,file_space,sconv_flags,&use_par_opt_io,&io_info)<0) + if(H5D_ioinfo_init(dataset,dxpl_cache,dxpl_id,mem_space,file_space,tpath,&io_info)<0) HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "unable to set up I/O operation") - -#ifdef H5_HAVE_PARALLEL -#ifdef H5_HAVE_INSTRUMENTED_LIBRARY - /**** Test for collective chunk IO - notice the following code should be removed after - a more general collective chunk IO algorithm is applied. - */ - - if(dataset->shared->layout.type == H5D_CHUNKED) { /*only check for chunking storage */ - - check_prop = H5Pexist(dxpl_id,H5D_XFER_COLL_CHUNK_NAME); - if(check_prop < 0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to check property list"); - if(check_prop > 0) { - if(H5Pget(dxpl_id,H5D_XFER_COLL_CHUNK_NAME,&prop_value)<0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to get property value"); - if(!use_par_opt_io) { - new_value = 0; - if(H5Pset(dxpl_id,H5D_XFER_COLL_CHUNK_NAME,&new_value)<0) - HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value"); - } - } - } -#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */ - - /* Don't reset the transfer mode if we can't or won't use it */ - if(!use_par_opt_io || !H5T_path_noop(tpath)) - H5D_io_assist_mpio(dxpl_id, dxpl_cache, &xfer_mode_changed); -#endif /*H5_HAVE_PARALLEL*/ + io_info_init = TRUE; /* Determine correct I/O routine to invoke */ if(dataset->shared->layout.type!=H5D_CHUNKED) { @@ -1071,9 +975,10 @@ H5D_write(H5D_t *dataset, hid_t mem_type_id, const H5S_t *mem_space, done: #ifdef H5_HAVE_PARALLEL - /* Restore xfer_mode due to the kludge */ - if (xfer_mode_changed) - H5D_io_restore_mpio(dxpl_id); + /* Shut down io_info struct */ + if (io_info_init) + if(H5D_ioinfo_term(&io_info) < 0) + HDONE_ERROR(H5E_DATASET, H5E_CANTCLOSEOBJ, FAIL, "can't shut down io_info") #endif /*H5_HAVE_PARALLEL*/ 
FUNC_LEAVE_NOAPI(ret_value) @@ -1631,6 +1536,10 @@ H5D_chunk_read(H5D_io_info_t *io_info, hsize_t nelmts, uint8_t *tconv_buf = NULL; /*data type conv buffer */ uint8_t *bkg_buf = NULL; /*background buffer */ H5D_storage_t store; /*union of EFL and chunk pointer in file space */ +#ifdef H5_HAVE_PARALLEL + int count_chunk; /* Number of chunks accessed */ + int min_num_chunk; /* Number of chunks to access collectively */ +#endif herr_t ret_value = SUCCEED; /*return value */ FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_read) @@ -1659,9 +1568,20 @@ H5D_chunk_read(H5D_io_info_t *io_info, hsize_t nelmts, /* Get first node in chunk skip list */ chunk_node=H5SL_first(fm.fsel); +#ifdef H5_HAVE_PARALLEL + if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) { + if(H5D_mpio_get_min_chunk(io_info, &fm, &min_num_chunk)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunk") + } + count_chunk = 0; +#endif /* H5_HAVE_PARALLEL */ + /* Iterate through chunks to be operated on */ while(chunk_node) { H5D_chunk_info_t *chunk_info; /* chunk information */ +#ifdef H5_HAVE_PARALLEL + hbool_t make_ind, make_coll; /* Flags to indicate that the MPI mode should change */ +#endif /* H5_HAVE_PARALLEL */ /* Get the actual chunk information from the skip list node */ chunk_info=H5SL_item(chunk_node); @@ -1670,11 +1590,66 @@ H5D_chunk_read(H5D_io_info_t *io_info, hsize_t nelmts, store.chunk.offset = chunk_info->coords; store.chunk.index = chunk_info->index; +#ifdef H5_HAVE_PARALLEL + /* Reset flags for changing parallel I/O mode */ + make_ind = make_coll = FALSE; + + if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) { + /* Increment chunk we are operating on */ + count_chunk++; + + /* If the number of chunk is greater than minimum number of chunk, + Do independent read */ + if(count_chunk > min_num_chunk) { + /* Switch to independent I/O (permanently) */ + make_ind = TRUE; + } +#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + else { + int is_regular; /* If this 
chunk's selections are regular */ + int mpi_code; /* MPI error code */ + int all_regular = 0; /* If this chunk's selections are regular on all processes */ + + /* Determine if this process has regular selections */ + if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE && + H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE) + is_regular = 1; + else + is_regular = 0; + + /* Determine if all processes have regular selections */ + if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&is_regular, &all_regular, 1, MPI_INT, MPI_LAND, io_info->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) + + /* For regular selection, + if MPI_COMPLEX_DERIVED_DATATYPE is not defined, + unless spaces for all processors are regular, independent read operation should be performed.*/ + if(!all_regular) { + /* Switch to independent I/O (temporarily) */ + make_ind = TRUE; + make_coll = TRUE; + } /* end if */ + } /* end else */ +#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */ + } /* end if */ + + /* Switch to independent I/O */ + if(make_ind) + if(H5D_ioinfo_make_ind(io_info) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") +#endif /* H5_HAVE_PARALLEL */ + /* Perform the actual read operation */ status = (io_info->ops.read)(io_info, chunk_info->chunk_points, H5T_get_size(dataset->shared->type), - chunk_info->fspace, chunk_info->mspace, - buf); + chunk_info->fspace, chunk_info->mspace, buf); + +#ifdef H5_HAVE_PARALLEL + /* Switch back to collective I/O */ + if(make_coll) + if(H5D_ioinfo_make_coll(io_info) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") +#endif /* H5_HAVE_PARALLEL */ /* Check return value from optimized read */ if (status<0) @@ -1909,6 +1884,9 @@ done: * Modifications: * QAK - 2003/04/17 * Hacked on it a lot. :-) + + * Kent Yang: 8/10/04 + * Added support for collective chunk IO. 
* *------------------------------------------------------------------------- */ @@ -1943,6 +1921,10 @@ H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts, uint8_t *tconv_buf = NULL; /*data type conv buffer */ uint8_t *bkg_buf = NULL; /*background buffer */ H5D_storage_t store; /*union of EFL and chunk pointer in file space */ +#ifdef H5_HAVE_PARALLEL + int count_chunk; /* Number of chunks accessed */ + int min_num_chunk; /* Number of chunks to access collectively */ +#endif herr_t ret_value = SUCCEED; /*return value */ FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_write) @@ -1962,12 +1944,24 @@ H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts, #ifdef H5S_DEBUG H5_timer_begin(&timer); #endif + +#ifdef H5_HAVE_PARALLEL + if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) { + if(H5D_mpio_get_min_chunk(io_info, &fm, &min_num_chunk)<0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunk") + } + count_chunk = 0; +#endif /* H5_HAVE_PARALLEL */ + /* Get first node in chunk skip list */ chunk_node=H5SL_first(fm.fsel); /* Iterate through chunks to be operated on */ while(chunk_node) { - H5D_chunk_info_t *chunk_info; /* chunk information */ + H5D_chunk_info_t *chunk_info; /* Chunk information */ +#ifdef H5_HAVE_PARALLEL + hbool_t make_ind, make_coll; /* Flags to indicate that the MPI mode should change */ +#endif /* H5_HAVE_PARALLEL */ /* Get the actual chunk information from the skip list node */ chunk_info=H5SL_item(chunk_node); @@ -1976,11 +1970,66 @@ H5D_chunk_write(H5D_io_info_t *io_info, hsize_t nelmts, store.chunk.offset = chunk_info->coords; store.chunk.index = chunk_info->index; +#ifdef H5_HAVE_PARALLEL + /* Reset flags for changing parallel I/O mode */ + make_ind = make_coll = FALSE; + + if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE) { + /* Increment chunk we are operating on */ + count_chunk++; + + /* If the number of chunk is greater than minimum number of chunk, + Do independent write */ + if(count_chunk > 
min_num_chunk) { + /* Switch to independent I/O (permanently) */ + make_ind = TRUE; + } +#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + else { + int is_regular; /* If this chunk's selections are regular */ + int mpi_code; /* MPI error code */ + int all_regular = 0; /* If this chunk's selections are regular on all processes */ + + /* Determine if this process has regular selections */ + if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE && + H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE) + is_regular = 1; + else + is_regular = 0; + + /* Determine if all processes have regular selections */ + if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&is_regular, &all_regular, 1, MPI_INT, MPI_LAND, io_info->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) + + /* For regular selection, + if MPI_COMPLEX_DERIVED_DATATYPE is not defined, + unless spaces for all processors are regular, independent read operation should be performed.*/ + if(!all_regular) { + /* Switch to independent I/O (temporarily) */ + make_ind = TRUE; + make_coll = TRUE; + } /* end if */ + } /* end else */ +#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */ + } /* end if */ + + /* Switch to independent I/O */ + if(make_ind) + if(H5D_ioinfo_make_ind(io_info) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") +#endif /* H5_HAVE_PARALLEL */ + /* Perform the actual write operation */ status = (io_info->ops.write)(io_info, chunk_info->chunk_points, H5T_get_size(dataset->shared->type), - chunk_info->fspace, chunk_info->mspace, - buf); + chunk_info->fspace, chunk_info->mspace, buf); + +#ifdef H5_HAVE_PARALLEL + /* Switch back to collective I/O */ + if(make_coll) + if(H5D_ioinfo_make_coll(io_info) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O") +#endif /* H5_HAVE_PARALLEL */ /* Check return value from optimized write */ if (status<0) @@ -2204,102 +2253,6 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* H5D_chunk_write() */ -#ifdef 
H5_HAVE_PARALLEL - -/*------------------------------------------------------------------------- - * Function: H5D_io_assist_mpio - * - * Purpose: Common logic for determining if the MPI transfer mode should - * be changed. - * - * Return: Non-negative on success/Negative on failure - * - * Programmer: Raymond Lu - * Thursday, April 10, 2003 - * - * Modifications: - * QAK - 2003/04/17 - * Hacked on it a lot. :-) - * - *------------------------------------------------------------------------- - */ -static herr_t -H5D_io_assist_mpio(hid_t dxpl_id, H5D_dxpl_cache_t *dxpl_cache, - hbool_t *xfer_mode_changed) -{ - herr_t ret_value = SUCCEED; /*return value */ - - FUNC_ENTER_NOAPI_NOINIT(H5D_io_assist_mpio) - - /* The following may not handle a collective call correctly - * since it does not ensure all processes can handle the write - * request according to the MPI collective specification. - * Do the collective request via independent mode. - */ - if (dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE) { - H5P_genplist_t *dx_plist; /* Data transer property list */ - - /* Get the dataset transfer property list */ - if (NULL == (dx_plist = H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset creation property list") - - /* Kludge: change the xfer_mode to independent, handle the request, - * then xfer_mode before return. - * Better way is to get a temporary data_xfer property with - * INDEPENDENT xfer_mode and pass it downwards. 
- */ - dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT; - if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &dxpl_cache->xfer_mode) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") - *xfer_mode_changed=TRUE; /* restore it before return */ - } /* end if */ - -done: - FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D_io_assist_mpio() */ - - -/*------------------------------------------------------------------------- - * Function: H5D_io_restore_mpio - * - * Purpose: Common logic for restoring MPI transfer mode - * - * Return: Non-negative on success/Negative on failure - * - * Programmer: Quincey Koziol - * Friday, February 6, 2004 - * - * Modifications: - * - *------------------------------------------------------------------------- - */ -static herr_t -H5D_io_restore_mpio(hid_t dxpl_id) -{ - H5P_genplist_t *dx_plist; /* Data transer property list */ - H5FD_mpio_xfer_t xfer_mode; /*xfer_mode for this request */ - herr_t ret_value = SUCCEED; /*return value */ - - FUNC_ENTER_NOAPI_NOINIT(H5D_io_restore_mpio) - - /* Get the dataset transfer property list */ - if (NULL == (dx_plist = H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset creation property list") - - /* Kludge: change the xfer_mode to independent, handle the request, - * then xfer_mode before return. - * Better way is to get a temporary data_xfer property with - * INDEPENDENT xfer_mode and pass it downwards. 
- */ - xfer_mode = H5FD_MPIO_COLLECTIVE; - if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") - -done: - FUNC_LEAVE_NOAPI(ret_value) -} /* end H5D_io_restore_mpio() */ -#endif /*H5_HAVE_PARALLEL*/ - /*------------------------------------------------------------------------- * Function: H5D_create_chunk_map @@ -3124,103 +3077,6 @@ done: /*------------------------------------------------------------------------- - * Function: H5D_get_collective_io_consensus - * - * Purpose: Compare notes with all other processes involved in this I/O - * and see if all are go for collective I/O. - * - * If all are, return TRUE. - * - * If any process can't manage collective I/O, then collective - * I/O is impossible, and we return FALSE. - * - * If the flags indicate that collective I/O is impossible, - * skip the interprocess communication and just return FALSE. - * - * In any error is detected, return FAIL. - * - * Return: Success: TRUE or FALSE - * - * Failure: FAIL - * - * Programmer: JRM -- 8/30/04 - * - * Modifications: - * - * None. - * - *------------------------------------------------------------------------- - */ - -#ifdef H5_HAVE_PARALLEL -static htri_t -H5D_get_collective_io_consensus(const H5F_t *file, - const htri_t local_opinion, - const unsigned flags) -{ - htri_t ret_value = FAIL; /* will update if successful */ - MPI_Comm comm; - int int_local_opinion; - int consensus; - int mpi_result; - - FUNC_ENTER_NOAPI_NOINIT(H5D_get_collective_io_consensus); - - HDassert ( ( local_opinion == TRUE ) || ( local_opinion == FALSE ) ); - - /* Don't do the interprocess communication unless the Parallel I/O - * conversion flag is set -- there may not be other processes to - * talk to. - */ - if ( ! 
( flags & flags&H5S_CONV_PAR_IO_POSSIBLE ) ) { - - HGOTO_DONE(FALSE); - } - - comm = H5F_mpi_get_comm(file); - - if ( comm == MPI_COMM_NULL ) - HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, \ - "can't retrieve MPI communicator") - - if ( local_opinion == TRUE ) { - - int_local_opinion = 1; - - } else { - - int_local_opinion = 0; - } - - mpi_result = MPI_Allreduce((void *)(&int_local_opinion), - (void *)(&consensus), - 1, - MPI_INT, - MPI_LAND, - comm); - - if ( mpi_result != MPI_SUCCESS ) - HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_result) - - if ( consensus ) { - - ret_value = TRUE; - - } else { - - ret_value = FALSE; - } - -done: - - FUNC_LEAVE_NOAPI(ret_value); - -} /* H5D_get_collective_io_consensus() */ - -#endif /* H5_HAVE_PARALLEL */ - - -/*------------------------------------------------------------------------- * Function: H5D_ioinfo_init * * Purpose: Routine for determining correct I/O operations for @@ -3245,19 +3101,14 @@ H5D_ioinfo_init(H5D_t *dset, const H5D_dxpl_cache_t *dxpl_cache, hid_t dxpl_id, #if !(defined H5_HAVE_PARALLEL || defined H5S_DEBUG) UNUSED #endif /* H5_HAVE_PARALLEL */ - *file_space, unsigned + *file_space, H5T_path_t #ifndef H5_HAVE_PARALLEL UNUSED #endif /* H5_HAVE_PARALLEL */ - flags, hbool_t -#ifndef H5_HAVE_PARALLEL - UNUSED -#endif /* H5_HAVE_PARALLEL */ - *use_par_opt_io, H5D_io_info_t *io_info) + *tpath, + H5D_io_info_t *io_info) { -#ifdef H5_HAVE_PARALLEL - htri_t opt; /* Flag whether a selection is optimizable */ -#endif /* H5_HAVE_PARALLEL */ + herr_t ret_value = SUCCEED; /* Return value */ #if defined H5_HAVE_PARALLEL || defined H5S_DEBUG @@ -3271,7 +3122,7 @@ H5D_ioinfo_init(H5D_t *dset, const H5D_dxpl_cache_t *dxpl_cache, hid_t dxpl_id, HDassert(dset->ent.file); HDassert(mem_space); HDassert(file_space); - HDassert(use_par_opt_io); + HDassert(tpath); HDassert(io_info); /* Set up "normal" I/O fields */ @@ -3284,48 +3135,88 @@ H5D_ioinfo_init(H5D_t *dset, const H5D_dxpl_cache_t *dxpl_cache, hid_t dxpl_id, 
io_info->ops=dset->shared->io_ops; #ifdef H5_HAVE_PARALLEL - /* - * Check if we can set direct MPI-IO read/write functions - */ - opt=H5D_mpio_opt_possible(dset,mem_space,file_space,flags); - if(opt==FAIL) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE, FAIL, "invalid check for direct IO dataspace "); - - opt = H5D_get_collective_io_consensus(dset->ent.file, opt, flags); - - if ( opt == FAIL ) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE, FAIL, \ - "check for collective I/O consensus failed."); - - /* Check if we can use the optimized parallel I/O routines */ - if(opt==TRUE) { - /* Set the pointers to the MPI-specific routines */ - if((H5S_SELECT_IS_REGULAR(file_space) == TRUE) && - (H5S_SELECT_IS_REGULAR(mem_space) == TRUE)){ - io_info->ops.read = H5D_mpio_spaces_read; - io_info->ops.write = H5D_mpio_spaces_write; - } - - #ifdef KYANG - else { - io_info->ops.read = H5D_mpio_spaces_span_read; - io_info->ops.write = H5D_mpio_spaces_span_write; - } - #endif - /* Indicate that the I/O will be parallel */ - *use_par_opt_io=TRUE; + /* Start in the "not modified" xfer_mode state */ + io_info->xfer_mode_changed = FALSE; + + if(IS_H5FD_MPI(dset->ent.file)) { + htri_t opt; /* Flag whether a selection is optimizable */ + + /* Get MPI communicator */ + if((io_info->comm = H5F_mpi_get_comm(dset->ent.file)) == MPI_COMM_NULL) + HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't retrieve MPI communicator") + + /* + * Check if we can set direct MPI-IO read/write functions + */ + opt=H5D_mpio_opt_possible(io_info, mem_space, file_space, tpath); + if(opt==FAIL) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE, FAIL, "invalid check for direct IO dataspace "); + + /* Check if we can use the optimized parallel I/O routines */ + if(opt==TRUE) { + /* Set the pointers to the MPI-specific routines */ + io_info->ops.read = H5D_mpio_select_read; + io_info->ops.write = H5D_mpio_select_write; + } /* end if */ + else { + /* Set the pointers to the non-MPI-specific routines */ + io_info->ops.read = 
H5D_select_read; + io_info->ops.write = H5D_select_write; + + /* If we won't be doing collective I/O, but the user asked for + * collective I/O, change the request to use independent I/O, but + * mark it so that we remember to revert the change. + */ + if(io_info->dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE) { + H5P_genplist_t *dx_plist; /* Data transer property list */ + + /* Get the dataset transfer property list */ + if (NULL == (dx_plist = H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset creation property list") + + /* Change the xfer_mode to independent for handling the I/O */ + io_info->dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT; + if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") + + /* Indicate that the transfer mode should be restored before returning + * to user. + */ + io_info->xfer_mode_changed = TRUE; + } /* end if */ + +#ifdef H5_HAVE_INSTRUMENTED_LIBRARY + /**** Test for collective chunk IO + notice the following code should be removed after + a more general collective chunk IO algorithm is applied. 
+ (This property is only reset for independent I/O) + */ + if(dset->shared->layout.type == H5D_CHUNKED) { /*only check for chunking storage */ + htri_t check_prop; + + check_prop = H5Pexist(dxpl_id,H5D_XFER_COLL_CHUNK_NAME); + if(check_prop < 0) + HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to check property list"); + if(check_prop > 0) { + int prop_value = 0; + + if(H5Pset(dxpl_id,H5D_XFER_COLL_CHUNK_NAME,&prop_value)<0) + HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value"); + } /* end if */ + } /* end if */ +#endif /* H5_HAVE_INSTRUMENTED_LIBRARY */ + } /* end else */ } /* end if */ else { - /* Indicate that the I/O will _NOT_ be parallel */ - *use_par_opt_io=FALSE; + /* Set the pointers to the non-MPI-specific routines */ io_info->ops.read = H5D_select_read; io_info->ops.write = H5D_select_write; } /* end else */ -#else - io_info->ops.read = H5D_select_read; - io_info->ops.write = H5D_select_write; +#else /* H5_HAVE_PARALLEL */ + io_info->ops.read = H5D_select_read; + io_info->ops.write = H5D_select_write; #endif /* H5_HAVE_PARALLEL */ #ifdef H5S_DEBUG @@ -3339,3 +3230,176 @@ done: #endif /* H5_HAVE_PARALLEL || H5S_DEBUG */ FUNC_LEAVE_NOAPI(ret_value) } /* end H5D_ioinfo_init() */ +#ifdef H5_HAVE_PARALLEL + +/*------------------------------------------------------------------------- + * Function: H5D_ioinfo_make_ind + * + * Purpose: Switch to MPI independent I/O + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Quincey Koziol + * Friday, August 12, 2005 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_ioinfo_make_ind(H5D_io_info_t *io_info) +{ + H5P_genplist_t *dx_plist; /* Data transfer property list */ + herr_t ret_value = SUCCEED; /*return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_ind) + + /* Get the dataset transfer property list */ + if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) +
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list") + + /* Change the xfer_mode to independent, handle the request, + * then set xfer_mode before return. + */ + io_info->dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT; + if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") + + /* Set the pointers to the non-MPI-specific routines */ + io_info->ops.read = H5D_select_read; + io_info->ops.write = H5D_select_write; + + /* Indicate that the transfer mode should be restored before returning + * to user. + */ + io_info->xfer_mode_changed=TRUE; + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_ioinfo_make_ind() */ + + +/*------------------------------------------------------------------------- + * Function: H5D_ioinfo_make_coll + * + * Purpose: Switch to MPI collective I/O + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Quincey Koziol + * Friday, August 12, 2005 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_ioinfo_make_coll(H5D_io_info_t *io_info) +{ + H5P_genplist_t *dx_plist; /* Data transfer property list */ + herr_t ret_value = SUCCEED; /*return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_coll) + + /* Get the dataset transfer property list */ + if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list") + + /* Change the xfer_mode to collective, handle the request, + * then set xfer_mode before return.
+ */ + io_info->dxpl_cache->xfer_mode = H5FD_MPIO_COLLECTIVE; + if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") + + /* Set the pointers to the MPI-specific routines */ + io_info->ops.read = H5D_mpio_select_read; + io_info->ops.write = H5D_mpio_select_write; + + /* Indicate that the transfer mode should _NOT_ be restored before returning + * to user. + */ + io_info->xfer_mode_changed=FALSE; + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_ioinfo_make_coll() */ + + +/*------------------------------------------------------------------------- + * Function: H5D_ioinfo_term + * + * Purpose: Common logic for terminating an I/O info object + * (Only used for restoring MPI transfer mode currently) + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Quincey Koziol + * Friday, February 6, 2004 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_ioinfo_term(H5D_io_info_t *io_info) +{ + herr_t ret_value = SUCCEED; /*return value */ + + FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_term) + + /* Check if we need to revert the change to the xfer mode */ + if (io_info->xfer_mode_changed) { + H5P_genplist_t *dx_plist; /* Data transfer property list */ + + /* Get the dataset transfer property list */ + if (NULL == (dx_plist = H5I_object(io_info->dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list") + + /* Restore the original parallel I/O mode */ + io_info->dxpl_cache->xfer_mode = H5FD_MPIO_COLLECTIVE; + if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode") + } /* end if */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5D_ioinfo_term() */ + + +/*------------------------------------------------------------------------- + * Function:
H5D_mpio_get_min_chunk + * + * Purpose: Routine for obtaining minimum number of chunks to cover + * hyperslab selection selected by all processors. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info, + const fm_map *fm, int *min_chunkf) +{ + int num_chunkf; /* Number of chunks to iterate over */ + int mpi_code; /* MPI return code */ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_min_chunk); + + /* Get the number of chunks to perform I/O on */ + num_chunkf = H5SL_count(fm->fsel); + + /* Determine the minimum # of chunks for all processes */ + if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, min_chunkf, 1, MPI_INT, MPI_MIN, io_info->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) + +done: + FUNC_LEAVE_NOAPI(ret_value); +} /* end H5D_mpio_get_min_chunk() */ +#endif /*H5_HAVE_PARALLEL*/ + diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c index febfcdc..858b325 100644 --- a/src/H5Dmpio.c +++ b/src/H5Dmpio.c @@ -24,7 +24,6 @@ #define H5D_PACKAGE /*suppress error about including H5Dpkg */ - #include "H5private.h" /* Generic Functions */ #include "H5Dpkg.h" /* Datasets */ #include "H5Eprivate.h" /* Error handling */ @@ -43,12 +42,6 @@ H5D_mpio_spaces_xfer(H5D_io_info_t *io_info, size_t elmt_size, void *buf/*out*/, hbool_t do_write); -/* For irregular hyperslab selection. 
*/ -static herr_t -H5D_mpio_spaces_span_xfer(H5D_io_info_t *io_info, size_t elmt_size, - const H5S_t *file_space, const H5S_t *mem_space, - void *buf/*out*/, - hbool_t do_write); /*------------------------------------------------------------------------- * Function: H5D_mpio_opt_possible @@ -67,163 +60,94 @@ H5D_mpio_spaces_span_xfer(H5D_io_info_t *io_info, size_t elmt_size, *------------------------------------------------------------------------- */ htri_t -H5D_mpio_opt_possible( const H5D_t *dset, const H5S_t *mem_space, const H5S_t *file_space, const unsigned flags) +H5D_mpio_opt_possible( const H5D_io_info_t *io_info, + const H5S_t *mem_space, const H5S_t *file_space, const H5T_path_t *tpath) { - htri_t c1,c2; /* Flags whether a selection is optimizable */ + int local_opinion = TRUE; /* This process's idea of whether to perform collective I/O or not */ + int consensus; /* Consensus opinion of all processes */ + int mpi_code; /* MPI error code */ htri_t ret_value=TRUE; FUNC_ENTER_NOAPI(H5D_mpio_opt_possible, FAIL); /* Check args */ - assert(dset); + assert(io_info); assert(mem_space); assert(file_space); - /* Parallel I/O conversion flag must be set, if it is not collective IO, go to false. 
*/ - if(!(flags&H5S_CONV_PAR_IO_POSSIBLE)) + /* For independent I/O, get out quickly and don't try to form consensus */ + if (io_info->dxpl_cache->xfer_mode==H5FD_MPIO_INDEPENDENT) HGOTO_DONE(FALSE); + /* Optimized MPI types flag must be set and it must be collective IO */ + /* (Don't allow parallel I/O for the MPI-posix driver, since it doesn't do real collective I/O) */ + if (!(H5S_mpi_opt_types_g && io_info->dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE && !IS_H5FD_MPIPOSIX(io_info->dset->ent.file))) { + local_opinion = FALSE; + goto broadcast; + } /* end if */ + /* Check whether these are both simple or scalar dataspaces */ if (!((H5S_SIMPLE==H5S_GET_EXTENT_TYPE(mem_space) || H5S_SCALAR==H5S_GET_EXTENT_TYPE(mem_space)) - && (H5S_SIMPLE==H5S_GET_EXTENT_TYPE(file_space) || H5S_SCALAR==H5S_GET_EXTENT_TYPE(file_space)))) - HGOTO_DONE(FALSE); + && (H5S_SIMPLE==H5S_GET_EXTENT_TYPE(file_space) || H5S_SCALAR==H5S_GET_EXTENT_TYPE(file_space)))) { + local_opinion = FALSE; + goto broadcast; + } /* end if */ - /* Check whether both selections are "regular" */ -#ifndef KYANG - c1=H5S_SELECT_IS_REGULAR(file_space); - c2=H5S_SELECT_IS_REGULAR(mem_space); - if(c1==FAIL || c2==FAIL) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE, FAIL, "invalid check for single selection blocks"); - if(c1==FALSE || c2==FALSE) - HGOTO_DONE(FALSE); -#endif /* Can't currently handle point selections */ - if (H5S_SEL_POINTS==H5S_GET_SELECT_TYPE(mem_space) || H5S_SEL_POINTS==H5S_GET_SELECT_TYPE(file_space)) - HGOTO_DONE(FALSE); + if (H5S_SEL_POINTS==H5S_GET_SELECT_TYPE(mem_space) || H5S_SEL_POINTS==H5S_GET_SELECT_TYPE(file_space)) { + local_opinion = FALSE; + goto broadcast; + } /* end if */ /* Dataset storage must be contiguous or chunked */ - if ((flags&H5S_CONV_STORAGE_MASK)!=H5S_CONV_STORAGE_CONTIGUOUS && - (flags&H5S_CONV_STORAGE_MASK)!=H5S_CONV_STORAGE_CHUNKED) - HGOTO_DONE(FALSE); + if (!(io_info->dset->shared->layout.type == H5D_CONTIGUOUS || + io_info->dset->shared->layout.type == H5D_CHUNKED))
{ + local_opinion = FALSE; + goto broadcast; + } /* end if */ - if ((flags&H5S_CONV_STORAGE_MASK)==H5S_CONV_STORAGE_CHUNKED) { - hsize_t chunk_dim[H5O_LAYOUT_NDIMS]; /* Chunk dimensions */ - hsize_t startf[H5S_MAX_RANK], /* Selection start bounds */ - endf[H5S_MAX_RANK]; /* Selection end bounds */ - unsigned dim_rankf; /* Number of dimensions of file dataspace */ - int pcheck_hyper,check_hyper, /* Flags for checking if selection is in one chunk */ - tnum_chunkf, /* Number of chunks selection overlaps */ - max_chunkf, /* Maximum number of chunks selection overlaps */ - min_chunkf, /* Minimum number of chunks selection overlaps */ - num_chunks_same; /* Flag indicating whether all processes have the same # of chunks to operate on */ - unsigned dim_chunks; /* Temporary number of chunks in a dimension */ - MPI_Comm comm; /* MPI communicator for file */ - int mpi_rank; /* Rank in MPI communicator */ - int mpi_code; /* MPI return code */ - unsigned u; /* Local index variable */ - - /* Disallow collective I/O if there are any I/O filters on chunks */ - if(dset->shared->dcpl_cache.pline.nused>0) - HGOTO_DONE(FALSE) - - /* Getting MPI communicator and rank */ - if((comm = H5F_mpi_get_comm(dset->ent.file))==MPI_COMM_NULL) - HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't retrieve MPI communicator") - if((mpi_rank = H5F_mpi_get_rank(dset->ent.file))<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_CANTGET, FAIL, "can't retrieve MPI rank") - - /* Currently collective chunking storage - inside HDF5 is supported for either one of the following two cases: - 1. All the hyperslabs for one process is inside one chunk. - 2. For single hyperslab selection, the number of chunks that covered - the single selection for all processes should be equal. - KY, 2004/7/14 - */ - - /* Quincey, please read. - This is maybe redundant, I think only when both memory and file space be SCALAR - space, the collective IO can work. Otherwise, SELECT_POINT will be reached,collective - IO shouldn't work. 
- Please clarify and correct the code on the following, - Quincey said that it was probably okay if only one data space is SCALAR, - Still keep the code here until we added more tests later. - Kent */ - if(H5S_SCALAR==H5S_GET_EXTENT_TYPE(mem_space) || H5S_SCALAR ==H5S_GET_EXTENT_TYPE(file_space)) { - if(!(H5S_SCALAR==H5S_GET_EXTENT_TYPE(mem_space) && H5S_SCALAR ==H5S_GET_EXTENT_TYPE(file_space))) - HGOTO_DONE(FALSE) - else - HGOTO_DONE(TRUE) + /*The handling of memory space is different for chunking + and contiguous storage, + For contiguous storage, mem_space and file_space won't + change when it is doing disk IO. + For chunking storage, mem_space will change for different + chunks. So for chunking storage, whether we can use + collective IO is deferred until each chunk IO is reached. + For contiguous storage, if we find the MPI-IO cannot + support complicated MPI derived data type, we will + set local_opinion = FALSE. + */ +#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS + if(io_info->dset->shared->layout.type == H5D_CONTIGUOUS) + if((H5S_SELECT_IS_REGULAR(file_space) != TRUE) || + (H5S_SELECT_IS_REGULAR(mem_space) != TRUE)) { + local_opinion = FALSE; + goto broadcast; } /* end if */ +#endif - dim_rankf = H5S_GET_EXTENT_NDIMS(file_space); - - if(H5S_SELECT_BOUNDS(file_space,startf,endf)==FAIL) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE,FAIL, "invalid check for single selection blocks"); - - for(u=0; u < dset->shared->layout.u.chunk.ndims; u++) - chunk_dim[u] = dset->shared->layout.u.chunk.dim[u]; - - /* Case 1: check whether all hyperslab in this process is inside one chunk. - Note: we don't handle when starting point is less than zero since that may cover - two chunks.
*/ - - /*for file space checking*/ - pcheck_hyper = 1; - for (u=0; udset->shared->layout.type == H5D_CHUNKED) + if(io_info->dset->shared->dcpl_cache.pline.nused>0) { + local_opinion = FALSE; + goto broadcast; + } /* end if */ + + /* Don't allow collective operations if datatype conversions need to happen */ + if(!H5T_path_noop(tpath)) { + local_opinion = FALSE; + goto broadcast; } /* end if */ + +broadcast: + /* Form consensus opinion among all processes about whether to perform + * collective I/O */ + if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_opinion, &consensus, 1, MPI_INT, MPI_LAND, io_info->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code) + + ret_value = consensus > 0 ? TRUE : FALSE; + done: FUNC_LEAVE_NOAPI(ret_value); } /* H5D_mpio_opt_possible() */ @@ -369,295 +293,66 @@ done: FUNC_LEAVE_NOAPI(ret_value); } /* end H5D_mpio_spaces_xfer() */ - -/** The following function has been tested, don't call this - function until you don't see this line. Nov. 11,2004, KY**/ - -static herr_t -H5D_mpio_spaces_span_xfer(H5D_io_info_t *io_info, - size_t elmt_size, - const H5S_t *file_space, - const H5S_t *mem_space, - void *_buf /*out*/, - hbool_t do_write ) -{ - haddr_t addr; /* Address of dataset (or selection) within file */ - size_t mpi_buf_count, mpi_file_count; /* Number of "objects" to transfer */ - hsize_t mpi_buf_offset, mpi_file_offset; /* Offset within dataset where selection (ie. 
MPI type) begins */ - MPI_Datatype mpi_buf_type, mpi_file_type; /* MPI types for buffer (memory) and file */ - hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */ - mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */ - hbool_t plist_is_setup=0; /* Whether the dxpl has been customized */ - uint8_t *buf=(uint8_t *)_buf; /* Alias for pointer arithmetic */ - int mpi_code; /* MPI return code */ - herr_t ret_value = SUCCEED; /* Return value */ - - - FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_spaces_span_xfer); - - /* Check args */ - assert (io_info); - assert (io_info->dset); - assert (file_space); - assert (mem_space); - assert (buf); - assert (IS_H5FD_MPIO(io_info->dset->ent.file)); - /* Make certain we have the correct type of property list */ - assert(TRUE==H5P_isa_class(io_info->dxpl_id,H5P_DATASET_XFER)); - - printf("coming to span tree xfer \n"); - /* create the MPI buffer type */ - if(H5S_SELECT_IS_REGULAR(mem_space)==TRUE){ - if (H5S_mpio_space_type( mem_space, elmt_size, - /* out: */ - &mpi_buf_type, - &mpi_buf_count, - &mpi_buf_offset, - &mbt_is_derived )<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buf type");} - else { - if (H5S_mpio_space_span_type( mem_space, elmt_size, - /* out: */ - &mpi_buf_type, - &mpi_buf_count, - &mpi_buf_offset, - &mbt_is_derived )<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buf type"); - } - printf("mpi_buf_count %d\n",mpi_buf_count); - /* create the MPI file type */ - - if(H5S_SELECT_IS_REGULAR(file_space)== TRUE){ - if ( H5S_mpio_space_type( file_space, elmt_size, - /* out: */ - &mpi_file_type, - &mpi_file_count, - &mpi_file_offset, - &mft_is_derived )<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type"); - } - else { - if ( H5S_mpio_space_span_type( file_space, elmt_size, - /* out: */ - &mpi_file_type, - &mpi_file_count, - &mpi_file_offset, - &mft_is_derived )<0) - 
HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type"); - } - - /* Get the base address of the contiguous dataset or the chunk */ - if(io_info->dset->shared->layout.type == H5D_CONTIGUOUS) - addr = H5D_contig_get_addr(io_info->dset) + mpi_file_offset; - else { - haddr_t chunk_addr; /* for collective chunk IO */ - - assert(io_info->dset->shared->layout.type == H5D_CHUNKED); - chunk_addr=H5D_istore_get_addr(io_info,NULL); - addr = H5F_BASE_ADDR(io_info->dset->ent.file) + chunk_addr + mpi_file_offset; - } - - /* - * Pass buf type, file type to the file driver. Request an MPI type - * transfer (instead of an elementary byteblock transfer). - */ - if(H5FD_mpi_setup_collective(io_info->dxpl_id, mpi_buf_type, mpi_file_type)<0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties"); - plist_is_setup=1; - - /* Adjust the buffer pointer to the beginning of the selection */ - buf+=mpi_buf_offset; - - /* transfer the data */ - if (do_write) { - if (H5F_block_write(io_info->dset->ent.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) <0) - HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,"MPI write failed"); - } - else { - if (H5F_block_read (io_info->dset->ent.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) <0) - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL,"MPI read failed"); - } - -done: - /* Reset the dxpl settings */ - if(plist_is_setup) { - if(H5FD_mpi_teardown_collective(io_info->dxpl_id)<0) - HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "unable to reset dxpl values"); - } /* end if */ - - /* free the MPI buf and file types */ - if (mbt_is_derived) { - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &mpi_buf_type ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - } - if (mft_is_derived) { - if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &mpi_file_type ))) - HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code); - } - - FUNC_LEAVE_NOAPI(ret_value); -} /* end H5D_mpio_spaces_span_xfer() */ - 
/*------------------------------------------------------------------------- - * Function: H5D_mpio_spaces_read + * Function: H5D_mpio_select_read * * Purpose: MPI-IO function to read directly from app buffer to file. * * Return: non-negative on success, negative on failure. * - * Programmer: rky 980813 + * Programmer: * * Modifications: * - * rky 980918 - * Added must_convert parameter to let caller know we can't optimize the xfer. - * - * QAK - 2002/04/02 - * Removed the must_convert parameter and move preconditions to - * H5S_mpio_opt_possible() routine - * *------------------------------------------------------------------------- */ herr_t -H5D_mpio_spaces_read(H5D_io_info_t *io_info, +H5D_mpio_select_read(H5D_io_info_t *io_info, size_t UNUSED nelmts, size_t elmt_size, const H5S_t *file_space, const H5S_t *mem_space, void *buf/*out*/) { herr_t ret_value; - FUNC_ENTER_NOAPI_NOFUNC(H5D_mpio_spaces_read); + FUNC_ENTER_NOAPI_NOFUNC(H5D_mpio_select_read); ret_value = H5D_mpio_spaces_xfer(io_info, elmt_size, file_space, mem_space, buf, 0/*read*/); FUNC_LEAVE_NOAPI(ret_value); -} /* end H5D_mpio_spaces_read() */ +} /* end H5D_mpio_select_read() */ /*------------------------------------------------------------------------- - * Function: H5D_mpio_spaces_write + * Function: H5D_mpio_select_write * * Purpose: MPI-IO function to write directly from app buffer to file. * * Return: non-negative on success, negative on failure. * - * Programmer: rky 980813 + * Programmer: * * Modifications: * - * rky 980918 - * Added must_convert parameter to let caller know we can't optimize the xfer. 
- * - * QAK - 2002/04/02 - * Removed the must_convert parameter and move preconditions to - * H5S_mpio_opt_possible() routine * *------------------------------------------------------------------------- */ herr_t -H5D_mpio_spaces_write(H5D_io_info_t *io_info, +H5D_mpio_select_write(H5D_io_info_t *io_info, size_t UNUSED nelmts, size_t elmt_size, const H5S_t *file_space, const H5S_t *mem_space, const void *buf) { herr_t ret_value; - FUNC_ENTER_NOAPI_NOFUNC(H5D_mpio_spaces_write); + FUNC_ENTER_NOAPI_NOFUNC(H5D_mpio_select_write); /*OKAY: CAST DISCARDS CONST QUALIFIER*/ ret_value = H5D_mpio_spaces_xfer(io_info, elmt_size, file_space, - mem_space, (void*)buf, 1/*write*/); + mem_space, (void*)buf, 1/*write*/); FUNC_LEAVE_NOAPI(ret_value); } /* end H5D_mpio_spaces_write() */ - - - -/*------------------------------------------------------------------------- - * Function: H5D_mpio_spaces_span_read - * - * Purpose: MPI-IO function to read directly from app buffer to file for - span-tree - * - * Return: non-negative on success, negative on failure. - * - * Programmer: KY - * Note : Don't call this routine - * until you don't see this line. 11/11/2004, KY - * - * Modifications: - * - * rky 980918 - * Added must_convert parameter to let caller know we can't optimize the xfer. 
- * - * QAK - 2002/04/02 - * Removed the must_convert parameter and move preconditions to - * H5S_mpio_opt_possible() routine - * - *------------------------------------------------------------------------- - */ -herr_t -H5D_mpio_spaces_span_read(H5D_io_info_t *io_info, - size_t UNUSED nelmts, - size_t elmt_size, - const H5S_t *file_space, - const H5S_t *mem_space, - void *buf/*out*/) -{ - herr_t ret_value; - - FUNC_ENTER_NOAPI_NOFUNC(H5D_mpio_spaces_span_read); - - ret_value = H5D_mpio_spaces_span_xfer(io_info, elmt_size, file_space, - mem_space, buf, 0/*read*/); - - FUNC_LEAVE_NOAPI(ret_value); -} /* end H5D_mpio_spaces_read() */ - - -/*------------------------------------------------------------------------- - * Function: H5D_mpio_spaces_span_write - * - * Purpose: MPI-IO function to write directly from app buffer to file. - * - * Return: non-negative on success, negative on failure. - * - * Programmer: KY - * Note: Don't call this funtion until you don't see this line. - * KY, 11/11/04 - * - * Modifications: - * - * rky 980918 - * Added must_convert parameter to let caller know we can't optimize the xfer. 
- * - * QAK - 2002/04/02 - * Removed the must_convert parameter and move preconditions to - * H5S_mpio_opt_possible() routine - * - *------------------------------------------------------------------------- - */ -herr_t -H5D_mpio_spaces_span_write(H5D_io_info_t *io_info, - size_t UNUSED nelmts, - size_t elmt_size, - const H5S_t *file_space, - const H5S_t *mem_space, - const void *buf) -{ - herr_t ret_value; - - FUNC_ENTER_NOAPI_NOFUNC(H5D_mpio_spaces_span_write); - - printf(" coming to spaces_span_write function\n"); - fflush(stdout); - /*OKAY: CAST DISCARDS CONST QUALIFIER*/ - printf("element size %d\n",elmt_size); - ret_value = H5D_mpio_spaces_span_xfer(io_info, elmt_size, file_space, - mem_space, (void*)buf, 1/*write*/); - - FUNC_LEAVE_NOAPI(ret_value); -} /* end H5D_mpio_spaces_span_write() */ #endif /* H5_HAVE_PARALLEL */ + diff --git a/src/H5Dpkg.h b/src/H5Dpkg.h index 14b97c9..e98c2ae 100644 --- a/src/H5Dpkg.h +++ b/src/H5Dpkg.h @@ -98,8 +98,15 @@ typedef struct H5D_io_ops_t { /* Typedef for raw data I/O operation info */ typedef struct H5D_io_info_t { H5D_t *dset; /* Pointer to dataset being operated on */ - const H5D_dxpl_cache_t *dxpl_cache; /* Pointer to cache DXPL info */ +#ifndef H5_HAVE_PARALLEL + const +#endif /* H5_HAVE_PARALLEL */ + H5D_dxpl_cache_t *dxpl_cache; /* Pointer to cache DXPL info */ hid_t dxpl_id; /* Original DXPL ID */ +#ifdef H5_HAVE_PARALLEL + MPI_Comm comm; /* MPI communicator for file */ + hbool_t xfer_mode_changed; /* Whether the transfer mode was changed */ +#endif /* H5_HAVE_PARALLEL */ const H5D_storage_t *store; /* Dataset storage info */ H5D_io_ops_t ops; /* I/O operation function pointers */ #ifdef H5S_DEBUG @@ -277,6 +284,18 @@ H5_DLL ssize_t H5D_efl_writevv(const H5D_io_info_t *io_info, #ifdef H5_HAVE_PARALLEL /* MPI-IO function to read directly from app buffer to file rky980813 */ +H5_DLL herr_t H5D_mpio_select_read(H5D_io_info_t *io_info, + size_t nelmts, size_t elmt_size, + const struct H5S_t *file_space, const struct 
H5S_t *mem_space, + void *buf/*out*/); + +/* MPI-IO function to write; it will select either regular or irregular write */ +H5_DLL herr_t H5D_mpio_select_write(H5D_io_info_t *io_info, + size_t nelmts, size_t elmt_size, + const struct H5S_t *file_space, const struct H5S_t *mem_space, + const void *buf); + +/* MPI-IO function to read directly from app buffer to file rky980813 */ H5_DLL herr_t H5D_mpio_spaces_read(H5D_io_info_t *io_info, size_t nelmts, size_t elmt_size, const struct H5S_t *file_space, const struct H5S_t *mem_space, @@ -302,8 +321,8 @@ H5_DLL herr_t H5D_mpio_spaces_span_write(H5D_io_info_t *io_info, /* MPI-IO function to check if a direct I/O transfer is possible between * memory and the file */ -H5_DLL htri_t H5D_mpio_opt_possible(const H5D_t *dset, const H5S_t *mem_space, - const H5S_t *file_space, const unsigned flags); +H5_DLL htri_t H5D_mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *mem_space, + const H5S_t *file_space, const H5T_path_t *tpath); #endif /* H5_HAVE_PARALLEL */ /* Testing functions */ diff --git a/src/H5Smpio.c b/src/H5Smpio.c index 86d16e2..1f82618 100644 --- a/src/H5Smpio.c +++ b/src/H5Smpio.c @@ -66,7 +66,7 @@ H5S_mpio_span_hyper_type( const H5S_t *space, size_t elmt_size, hsize_t *extra_offset, hbool_t *is_derived_type ); -static herr_t obtain_datatype(const hsize_t size[], +static herr_t H5S_obtain_datatype(const hsize_t size[], H5S_hyper_span_t* span,MPI_Datatype *span_type, size_t elmt_size,int dimindex); @@ -462,101 +462,15 @@ done: FUNC_LEAVE_NOAPI(ret_value); } - -/*------------------------------------------------------------------------- - * Function: H5S_mpio_space_type - * - * Purpose: Translate an HDF5 dataspace selection into an MPI type. - * Currently handle only hyperslab and "all" selections. - * - * Return: non-negative on success, negative on failure.
- * - * Outputs: *new_type the MPI type corresponding to the selection - * *count how many objects of the new_type in selection - * (useful if this is the buffer type for xfer) - * *extra_offset Number of bytes of offset within dataset - * *is_derived_type 0 if MPI primitive type, 1 if derived - * - * Programmer: rky 980813 - * - * Modifications: - * - * Quincey Koziol, June 18, 2002 - * Added 'extra_offset' parameter - * - *------------------------------------------------------------------------- - */ -herr_t -H5S_mpio_space_type( const H5S_t *space, size_t elmt_size, - /* out: */ - MPI_Datatype *new_type, - size_t *count, - hsize_t *extra_offset, - hbool_t *is_derived_type ) -{ - herr_t ret_value = SUCCEED; - - FUNC_ENTER_NOAPI_NOINIT(H5S_mpio_space_type); - - /* Check args */ - assert (space); - - /* Creat MPI type based on the kind of selection */ - switch (H5S_GET_EXTENT_TYPE(space)) { - case H5S_SCALAR: - case H5S_SIMPLE: - switch(H5S_GET_SELECT_TYPE(space)) { - case H5S_SEL_NONE: - if ( H5S_mpio_none_type( space, elmt_size, - /* out: */ new_type, count, extra_offset, is_derived_type ) <0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't convert \"all\" selection to MPI type"); - break; - - case H5S_SEL_ALL: - if ( H5S_mpio_all_type( space, elmt_size, - /* out: */ new_type, count, extra_offset, is_derived_type ) <0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't convert \"all\" selection to MPI type"); - break; - - case H5S_SEL_POINTS: - /* not yet implemented */ - ret_value = FAIL; - break; - - case H5S_SEL_HYPERSLABS: - if(H5S_mpio_hyper_type( space, elmt_size, - /* out: */ new_type, count, extra_offset, is_derived_type )<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't convert \"all\" selection to MPI type"); - break; - - default: - assert("unknown selection type" && 0); - break; - } /* end switch */ - break; - - case H5S_COMPLEX: - /* not yet implemented */ - HGOTO_ERROR(H5E_DATASPACE, H5E_UNSUPPORTED, FAIL, "complex data 
spaces are not supported yet"); - default: - assert("unknown data space type" && 0); - break; - } - -done: - FUNC_LEAVE_NOAPI(ret_value); -} /*------------------------------------------------------------------------- - * Function: H5S_mpio_space_span_type + * Function: H5S_mpio_span_hyper_type * - * Purpose: Translate an HDF5 dataspace selection into a general - * MPI derived datatype built with span-tree. - * - * Currently handle only hyperslab and "all" selections. + * Purpose: Translate an HDF5 irregular hyperslab selection into an + MPI type. * * Return: non-negative on success, negative on failure. * @@ -566,119 +480,43 @@ done: * *extra_offset Number of bytes of offset within dataset * *is_derived_type 0 if MPI primitive type, 1 if derived * - * Programmer: KY + * Programmer: kyang * - * Modifications: - * - * Quincey Koziol, June 18, 2002 - * Added 'extra_offset' parameter - * - *------------------------------------------------------------------------- */ -herr_t -H5S_mpio_space_span_type( const H5S_t *space, size_t elmt_size, - /* out: */ - MPI_Datatype *new_type, - size_t *count, - hsize_t *extra_offset, - hbool_t *is_derived_type ) -{ - herr_t ret_value = SUCCEED; - - FUNC_ENTER_NOAPI_NOINIT(H5S_mpio_space_span_type); - - /* Check args */ - assert (space); - - /* Creat MPI type based on the kind of selection */ - switch (H5S_GET_EXTENT_TYPE(space)) { - case H5S_SCALAR: - case H5S_SIMPLE: - switch(H5S_GET_SELECT_TYPE(space)) { - case H5S_SEL_NONE: - if ( H5S_mpio_none_type( space, elmt_size, - /* out: */ new_type, count, extra_offset, is_derived_type ) <0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't convert \"all\" selection to MPI type"); - break; - - case H5S_SEL_ALL: - if ( H5S_mpio_all_type( space, elmt_size, - /* out: */ new_type, count, extra_offset, is_derived_type ) <0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't convert \"all\" selection to MPI type"); - break; - - case H5S_SEL_POINTS: - /* not yet implemented */ - 
ret_value = FAIL; - break; - - case H5S_SEL_HYPERSLABS: - if(H5S_mpio_span_hyper_type( space, elmt_size, - /* out: */ new_type, count, extra_offset, is_derived_type )<0) - HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't convert \"all\" selection to MPI type"); - break; - - default: - assert("unknown selection type" && 0); - break; - } /* end switch */ - break; - - case H5S_COMPLEX: - /* not yet implemented */ - HGOTO_ERROR(H5E_DATASPACE, H5E_UNSUPPORTED, FAIL, "complex data spaces are not supported yet"); - - default: - assert("unknown data space type" && 0); - break; - } - -done: - FUNC_LEAVE_NOAPI(ret_value); -} - - -/* The following codes have been used by Kent to test - general collective derived datatype functionality. - It should NOT be called by other routines except with - macro #ifdef KENT #endif - Nov. 11th, 2004 */ - - static herr_t -H5S_mpio_span_hyper_type( const H5S_t *space, size_t elmt_size, - /* out: */ - MPI_Datatype *new_type, - size_t *count, - hsize_t *extra_offset, - hbool_t *is_derived_type ){ +H5S_mpio_span_hyper_type( const H5S_t *space, + size_t elmt_size, + MPI_Datatype *new_type,/* out: */ + size_t *count, + hsize_t *extra_offset, + hbool_t *is_derived_type ){ MPI_Datatype span_type; H5S_hyper_span_t *ospan; H5S_hyper_span_info_t *odown; hsize_t *size; int rank; + int mpi_code; herr_t ret_value = SUCCEED; + MPI_Aint extent,lb; - MPI_Aint extent,lb; - FUNC_ENTER_NOAPI_NOINIT_NOFUNC(H5S_mpio_span_hyper_type); + FUNC_ENTER_NOAPI_NOINIT(H5S_mpio_span_hyper_type); - printf("coming to hyper type \n"); /* Check args */ assert (space); - /* assert(sizeof(MPI_Aint) >= sizeof(elmt_size));?? 
*/ - /* Only for simple extent - rank = space->extent.u.simple.rank; - */ + /* assert(sizeof(MPI_Aint) >= sizeof(elmt_size)); not sure the reason*/ + + rank = space->extent.rank; /* size = HDcalloc((size_t)rank,sizeof(hsize_t)); */ if (0==elmt_size) goto empty; size = space->extent.size; - + if(size == 0) + goto empty; odown = space->select.sel_info.hslab->span_lst; if(odown == NULL) @@ -687,37 +525,52 @@ H5S_mpio_span_hyper_type( const H5S_t *space, size_t elmt_size, if(ospan == NULL) goto empty; - obtain_datatype(space->extent.size,ospan,&span_type,elmt_size,rank); - MPI_Type_commit(&span_type); + /* obtain derived data type */ + if(FAIL == H5S_obtain_datatype(space->extent.size,ospan,&span_type,elmt_size,rank)) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't obtain MPI derived data type"); + + if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&span_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); - MPI_Type_lb(span_type,&lb); - printf("lb %d\n",lb); - MPI_Type_extent(span_type,&extent); - printf("extent %d\n",extent); *new_type = span_type; - /* fill in the remaining return values */ + /* fill in the remaining return values */ *count = 1; *extra_offset = 0; *is_derived_type = 1; - printf("before freeing size\n"); - /* HDfree(size);*/ - printf("after freeing size\n"); + HGOTO_DONE(SUCCEED); empty: /* special case: empty hyperslab */ - *new_type = MPI_BYTE; - *count = 0; - *extra_offset = 0; + *new_type = MPI_BYTE; + *count = 0; + *extra_offset = 0; *is_derived_type = 0; done: FUNC_LEAVE_NOAPI(ret_value); } - -static herr_t obtain_datatype(const hsize_t size[], H5S_hyper_span_t* span,MPI_Datatype *span_type, - size_t elmt_size,int dimindex) { + +/*------------------------------------------------------------------------- + * Function: H5S_obtain datatype + * + * Purpose: Obtain an MPI derived datatype based on span-tree + implementation + * + * Return: non-negative on success, negative on failure. 
+ * + * Outputs: *span_type the MPI type corresponding to the selection + * + * Programmer: kyang + * + */ +static herr_t H5S_obtain_datatype(const hsize_t size[], + H5S_hyper_span_t* span, + MPI_Datatype *span_type, + size_t elmt_size, + int dimindex) +{ int innercount,outercount; MPI_Datatype bas_type; @@ -730,6 +583,9 @@ static herr_t obtain_datatype(const hsize_t size[], H5S_hyper_span_t* span,MPI_D MPI_Aint extent,lb; H5S_hyper_span_info_t *down; H5S_hyper_span_t *tspan; + int mpi_code; + herr_t ret_value = SUCCEED; + #ifdef H5_HAVE_MPI2 MPI_Aint sizeaint,sizedtype; #endif /* H5_HAVE_MPI2 */ @@ -737,21 +593,19 @@ static herr_t obtain_datatype(const hsize_t size[], H5S_hyper_span_t* span,MPI_D int i; int ret; + FUNC_ENTER_NOAPI_NOINIT(H5S_obtain_datatype); assert(span); inner_type = NULL; - down = NULL; - tspan= NULL; - down = span->down; - tspan = span; + down = NULL; + tspan = NULL; + down = span->down; + tspan = span; outercount = 0; + +/* obtain the number of span tree for this dimension */ while(tspan) { - if(tspan) { - HDfprintf(stdout, " span->low %Hd\n",tspan->low); - HDfprintf(stdout, " span->high %Hd\n",tspan->high); - HDfprintf(stdout, " span->nelm %Hu\n",tspan->nelem); - } tspan = tspan->next; outercount ++; } @@ -760,53 +614,55 @@ static herr_t obtain_datatype(const hsize_t size[], H5S_hyper_span_t* span,MPI_D span_type = NULL; return 0; } - printf("outercount %d\n",outercount); + +/* MPI2 hasn't been widely acccepted, adding H5_HAVE_MPI2 for the future use */ #ifdef H5_HAVE_MPI2 - printf("coming into HAVE_MPI2\n"); - fflush(stdout); MPI_Type_extent(MPI_Aint,&sizeaint); MPI_Type_extent(MPI_Datatype,&sizedtype); - printf("coming into HAVE_MPI2 type extent\n"); - fflush(stdout); blocklen = (int *)HDcalloc((size_t)outercount,sizeof(int)); disp = (MPI_Aint *)HDcalloc((size_t)outercount,sizeaint); inner_type = (MPI_Datatype *)HDcalloc((size_t)outercount,sizedtype); + #else - printf("coming to MPI2 else \n"); - blocklen = (int 
*)HDcalloc((size_t)outercount,sizeof(int)); - disp = (MPI_Aint *)HDcalloc((size_t)outercount,sizeof(int)); - inner_type = (MPI_Datatype *)HDcalloc((size_t)outercount,sizeof(int)); - printf("end of calloc \n"); + + blocklen = (int *)HDcalloc((size_t)outercount,sizeof(int)); + disp = (MPI_Aint *)HDcalloc((size_t)outercount,sizeof(MPI_Aint)); + inner_type = (MPI_Datatype *)HDcalloc((size_t)outercount,sizeof(MPI_Datatype)); + #endif - tspan = span; + tspan = span; outercount = 0; + /* if this is the fastest changing dimension, it is the base case for derived datatype. */ if(down == NULL){ - printf("coming to down = NULL \n"); - if(dimindex > 1) printf("wrong area \n"); - MPI_Type_contiguous((int)elmt_size,MPI_BYTE,&bas_type); - MPI_Type_commit(&bas_type); - printf("after type commit \n"); + + assert(dimindex <= 1); + if(MPI_SUCCESS != (mpi_code = MPI_Type_contiguous((int)elmt_size, MPI_BYTE,&bas_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code); + + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&bas_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + while(tspan){ + disp[outercount] = (MPI_Aint)elmt_size * tspan->low; blocklen[outercount] = tspan->nelem; tspan = tspan->next; outercount ++; } - /* printf("outercount %d\n",outercount); - printf("after while loop \n");*/ - ret = MPI_Type_hindexed(outercount,blocklen,disp,bas_type,span_type); - if(ret < 0) printf("type hindexed doesn't work\n"); - printf("after hindexed \n"); + + if(MPI_SUCCESS != (mpi_code = MPI_Type_hindexed(outercount,blocklen, + disp,bas_type,span_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_hindexed failed", mpi_code); } else {/* dimindex is the rank of the dimension */ - if(dimindex <2) printf("something is wrong \n"); + assert(dimindex >1); /* Calculate the total bytes of the lower dimension */ total_lowd = 1; /* one dimension down */ total_lowd1 = 1; /* two dimensions down */ @@ -815,77 +671,160 @@ static herr_t obtain_datatype(const hsize_t size[], 
H5S_hyper_span_t* span,MPI_D total_lowd = total_lowd * size[i]; for ( i = dimindex-1; i > 1; i--) - total_lowd1 = total_lowd1 * size[i]; - - HDfprintf(stdout, " one dimension down size %Hu\n",total_lowd); - HDfprintf(stdout, " two dimension down size %Hu\n",total_lowd1); + total_lowd1 = total_lowd1 * size[i]; while(tspan){ -/* Displacement should be in byte and should have dimension information */ -/* First using MPI Type vector to build derived data type for this span only */ -/* Need to calculate the disp in byte for this dimension. */ - /* Calculate the total bytes of the lower dimension */ - - disp[outercount] = tspan->low*total_lowd*elmt_size; - blocklen[outercount] = 1; - printf("displacement the 0 rank %d\n",disp[0]); - /* generating inner derived datatype by using MPI_Type_hvector */ - obtain_datatype(size,tspan->down->head,&temp_type,elmt_size,dimindex-1); - ret= MPI_Type_commit(&temp_type); - if(ret < 0) printf("cannot commit temp_type\n"); -/* inner_type[count] = temp_type; */ -#ifdef H5_HAVE_MPI2 - MPI_Type_get_extent(temp_type,&lb,&extent); -#else - MPI_Type_lb(temp_type,&lb); - printf("lb %d\n",lb); - MPI_Type_extent(temp_type,&extent); - printf("extent %d\n",extent); -#endif - /* building the inner vector datatype */ - /* The following calculation of stride is wrong since stride is calculated - from the first element of the block to the first element of the next - block. */ - /*stride = total_lowd1 * (size[dimindex-1]*elmt_size-extent-lb);*/ + /* Displacement should be in byte and should have dimension information */ + /* First using MPI Type vector to build derived data type for this span only */ + /* Need to calculate the disp in byte for this dimension. 
*/ + /* Calculate the total bytes of the lower dimension */ + + disp[outercount] = tspan->low*total_lowd*elmt_size; + blocklen[outercount] = 1; + + /* generating inner derived datatype by using MPI_Type_hvector */ + if(FAIL == H5S_obtain_datatype(size,tspan->down->head,&temp_type,elmt_size,dimindex-1)) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't obtain MPI derived data type"); + + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&temp_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + /* building the inner vector datatype */ stride = total_lowd*elmt_size; - innercount = tspan->nelem; - printf("stride %d\n",stride); - printf("innercount %d\n",innercount); - fflush(stdout); - ret = MPI_Type_hvector(innercount,1,stride,temp_type,&tempinner_type); - - /* MPI_Type_contiguous(2,temp_type,&tempinner_type);*/ - if(ret < 0) printf("wrong vector \n"); - MPI_Type_commit(&tempinner_type); - printf("after tempinner_type commit\n"); - MPI_Type_free(&temp_type); - printf("after free \n"); - inner_type[outercount] = tempinner_type; - outercount ++; - tspan = tspan->next; - } + innercount = tspan->nelem; + + if(MPI_SUCCESS != (mpi_code = MPI_Type_hvector(innercount,1,stride,temp_type,&tempinner_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_hvector failed", mpi_code); + + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&tempinner_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code); + + if(MPI_SUCCESS != (mpi_code =MPI_Type_free(&temp_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_free failed",mpi_code); + inner_type[outercount] = tempinner_type; + outercount ++; + tspan = tspan->next; + + } /* building the whole vector datatype */ - MPI_Type_struct(outercount,blocklen,disp,inner_type,span_type); - printf("after type struct \n"); + if(MPI_SUCCESS != (mpi_code = + MPI_Type_struct(outercount,blocklen,disp,inner_type,span_type))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code); + } if(inner_type != NULL){ if(down != NULL) { - for(i=0;i