summaryrefslogtreecommitdiffstats
path: root/src/H5Dmpio.c
diff options
context:
space:
mode:
authorQuincey Koziol <koziol@hdfgroup.org>2008-04-24 15:03:41 (GMT)
committerQuincey Koziol <koziol@hdfgroup.org>2008-04-24 15:03:41 (GMT)
commit495ca9c7bb19553d2c87ce68013f1de2dff3d54b (patch)
tree792e1a9ecc8aa314dfa3d0538464e4f87ad55cf5 /src/H5Dmpio.c
parent16d4cae5b16ffb91298d8d232214afeb5112da6d (diff)
downloadhdf5-495ca9c7bb19553d2c87ce68013f1de2dff3d54b.zip
hdf5-495ca9c7bb19553d2c87ce68013f1de2dff3d54b.tar.gz
hdf5-495ca9c7bb19553d2c87ce68013f1de2dff3d54b.tar.bz2
[svn-r14860] Description:
Omnibus raw data I/O revisions, with wide-ranging changes and refactoring, in order to prepare for implementing "fast append" feature. These changes remove the majority of the code duplication for raw data I/O which has crept in over the last ten years and introduces a more object- oriented design for operating on different types of dataset storage. Chunked storage no longer has it's own I/O routines, it is now handled as either contiguous (if chunk is not pulled into the cache) or compact (if the chunk is cached in memory). No bug or feature changes, at least intentionally... :-) Tested on: FreeBSD/32 6.2 (duty) in debug mode FreeBSD/64 6.2 (liberty) w/C++ & FORTRAN, in debug mode Linux/32 2.6 (kagiso) w/PGI compilers, w/C++ & FORTRAN, w/threadsafe, in debug mode Linux/64-amd64 2.6 (smirom) w/default API=1.6.x, w/C++ & FORTRAN, in production mode Linux/64-ia64 2.6 (cobalt) w/Intel compilers, w/C++ & FORTRAN, in production mode Solaris/32 2.10 (linew) w/deprecated symbols disabled, w/C++ & FORTRAN, w/szip filter, in production mode Mac OS X/32 10.5.2 (amazon) in debug mode Linux/64-ia64 2.4 (tg-login3) w/parallel, w/FORTRAN, in production mode
Diffstat (limited to 'src/H5Dmpio.c')
-rw-r--r--src/H5Dmpio.c2658
1 files changed, 1317 insertions, 1341 deletions
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index 22f32ab..889fbce 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -34,12 +34,12 @@
/* Headers */
/***********/
#include "H5private.h" /* Generic Functions */
-#include "H5Iprivate.h"
#include "H5Dpkg.h" /* Datasets */
#include "H5Eprivate.h" /* Error handling */
#include "H5Fprivate.h" /* File access */
#include "H5FDprivate.h" /* File drivers */
-#include "H5MMprivate.h"
+#include "H5Iprivate.h" /* IDs */
+#include "H5MMprivate.h" /* Memory management */
#include "H5Oprivate.h" /* Object headers */
#include "H5Pprivate.h" /* Property lists */
#include "H5Sprivate.h" /* Dataspaces */
@@ -86,6 +86,7 @@
#define H5D_CHUNK_SELECT_IRREG 2
#define H5D_CHUNK_SELECT_NONE 0
+
/******************/
/* Local Typedefs */
/******************/
@@ -95,66 +96,47 @@ typedef struct H5D_chunk_addr_info_t {
H5D_chunk_info_t chunk_info;
} H5D_chunk_addr_info_t;
-/* Combine all information that needs to know for collective MPI-IO of this selection. */
-typedef struct H5D_common_coll_info_t {
- hbool_t mbt_is_derived;
- hbool_t mft_is_derived;
- size_t mpi_buf_count;
- haddr_t chunk_addr;
-} H5D_common_coll_info_t;
-
/********************/
/* Local Prototypes */
/********************/
-
-static herr_t
-H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf,
- hbool_t do_write);
-static herr_t
-H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf,
- hbool_t do_write);
-
-static herr_t
-H5D_link_chunk_collective_io(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf,
- hbool_t do_write,int sum_chunk);
-
-static herr_t
-H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,
- const H5S_t *mem_space,haddr_t addr,
- const void *buf, hbool_t do_write );
-
-static herr_t
-H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,
- MPI_Datatype *mpi_buf_type,
- H5D_common_coll_info_t* coll_info,
- const void *buf, hbool_t do_write);
-static herr_t
-H5D_sort_chunk(H5D_io_info_t * io_info,
- H5D_chunk_map_t *fm,
- H5D_chunk_addr_info_t chunk_addr_info_array[],
- int many_chunk_opt);
-
-static herr_t
-H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
- H5D_chunk_map_t *fm,
- uint8_t assign_io_mode[],
- haddr_t chunk_addr[]);
-
-static herr_t H5D_ioinfo_make_ind(H5D_io_info_t *io_info);
-static herr_t H5D_ioinfo_make_coll_opt(H5D_io_info_t *io_info);
-static herr_t H5D_ioinfo_make_coll(H5D_io_info_t *io_info);
+static herr_t H5D_chunk_collective_io(H5D_io_info_t *io_info,
+ const H5D_type_info_t *type_info, H5D_chunk_map_t *fm);
+static herr_t H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,
+ const H5D_type_info_t *type_info, H5D_chunk_map_t *fm,
+ H5P_genplist_t *dx_plist);
+static herr_t H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info,
+ const H5D_type_info_t *type_info, H5D_chunk_map_t *fm, H5P_genplist_t *dx_plist);
+#ifdef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+static herr_t H5D_link_chunk_collective_io(H5D_io_info_t *io_info,
+ const H5D_type_info_t *type_info, H5D_chunk_map_t *fm, int sum_chunk);
+#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */
+static herr_t H5D_inter_collective_io(H5D_io_info_t *io_info,
+ const H5D_type_info_t *type_info, const H5S_t *file_space,
+ const H5S_t *mem_space);
+static herr_t H5D_final_collective_io(H5D_io_info_t *io_info,
+ const H5D_type_info_t *type_info, size_t nelmts, MPI_Datatype *mpi_file_type,
+ MPI_Datatype *mpi_buf_type);
+static herr_t H5D_sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
+ H5D_chunk_addr_info_t chunk_addr_info_array[], int many_chunk_opt);
+static herr_t H5D_obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm,
+ H5P_genplist_t *dx_plist, uint8_t assign_io_mode[], haddr_t chunk_addr[]);
+static herr_t H5D_ioinfo_xfer_mode(H5D_io_info_t *io_info, H5P_genplist_t *dx_plist,
+ H5FD_mpio_xfer_t xfer_mode);
+static herr_t H5D_ioinfo_coll_opt_mode(H5D_io_info_t *io_info, H5P_genplist_t *dx_plist,
+ H5FD_mpio_collective_opt_t coll_opt_mode);
static herr_t H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info,
const H5D_chunk_map_t *fm, int *min_chunkf);
static int H5D_cmp_chunk_addr(const void *addr1, const void *addr2);
static herr_t H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info,
- const H5D_chunk_map_t *fm, int *sum_chunkf);
+ const H5D_chunk_map_t *fm, int *sum_chunkf);
/*********************/
/* Package Variables */
/*********************/
+
/*******************/
/* Local Variables */
/*******************/
@@ -175,64 +157,77 @@ static herr_t H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info,
*-------------------------------------------------------------------------
*/
htri_t
-H5D_mpio_opt_possible( const H5D_io_info_t *io_info,
- const H5S_t *mem_space, const H5S_t *file_space, const H5T_path_t *tpath)
+H5D_mpio_opt_possible(const H5D_io_info_t *io_info, const H5S_t *file_space,
+ const H5S_t *mem_space, const H5D_type_info_t *type_info,
+ const H5D_chunk_map_t *fm)
{
- int local_opinion = TRUE; /* This process's idea of whether to perform collective I/O or not */
- int consensus; /* Consensus opinion of all processes */
- int mpi_code; /* MPI error code */
- htri_t ret_value=TRUE;
+ int local_opinion = TRUE; /* This process's idea of whether to perform collective I/O or not */
+ int consensus; /* Consensus opinion of all processes */
+ int mpi_code; /* MPI error code */
+ htri_t ret_value = TRUE;
- FUNC_ENTER_NOAPI(H5D_mpio_opt_possible, FAIL);
+ FUNC_ENTER_NOAPI(H5D_mpio_opt_possible, FAIL)
/* Check args */
- assert(io_info);
- assert(mem_space);
- assert(file_space);
+ HDassert(io_info);
+ HDassert(mem_space);
+ HDassert(file_space);
+ HDassert(type_info);
/* For independent I/O, get out quickly and don't try to form consensus */
- if (io_info->dxpl_cache->xfer_mode==H5FD_MPIO_INDEPENDENT)
+ if(io_info->dxpl_cache->xfer_mode == H5FD_MPIO_INDEPENDENT)
HGOTO_DONE(FALSE);
+ /* Don't allow collective operations if datatype conversions need to happen */
+ if(!type_info->is_conv_noop) {
+ local_opinion = FALSE;
+ goto broadcast;
+ } /* end if */
+
+ /* Don't allow collective operations if data transform operations should occur */
+ if(!type_info->is_xform_noop) {
+ local_opinion = FALSE;
+ goto broadcast;
+ } /* end if */
+
/* Optimized MPI types flag must be set and it must be collective IO */
/* (Don't allow parallel I/O for the MPI-posix driver, since it doesn't do real collective I/O) */
- if (!(H5S_mpi_opt_types_g && io_info->dxpl_cache->xfer_mode==H5FD_MPIO_COLLECTIVE && !IS_H5FD_MPIPOSIX(io_info->dset->oloc.file))) {
+ if(!(H5S_mpi_opt_types_g && io_info->dxpl_cache->xfer_mode == H5FD_MPIO_COLLECTIVE
+ && !IS_H5FD_MPIPOSIX(io_info->dset->oloc.file))) {
local_opinion = FALSE;
goto broadcast;
} /* end if */
/* Check whether these are both simple or scalar dataspaces */
- if (!((H5S_SIMPLE==H5S_GET_EXTENT_TYPE(mem_space) || H5S_SCALAR==H5S_GET_EXTENT_TYPE(mem_space))
- && (H5S_SIMPLE==H5S_GET_EXTENT_TYPE(file_space) || H5S_SCALAR==H5S_GET_EXTENT_TYPE(file_space)))) {
+ if(!((H5S_SIMPLE == H5S_GET_EXTENT_TYPE(mem_space) || H5S_SCALAR == H5S_GET_EXTENT_TYPE(mem_space))
+ && (H5S_SIMPLE == H5S_GET_EXTENT_TYPE(file_space) || H5S_SCALAR == H5S_GET_EXTENT_TYPE(file_space)))) {
local_opinion = FALSE;
goto broadcast;
} /* end if */
/* Can't currently handle point selections */
- if (H5S_SEL_POINTS==H5S_GET_SELECT_TYPE(mem_space) || H5S_SEL_POINTS==H5S_GET_SELECT_TYPE(file_space)) {
+ if(H5S_SEL_POINTS == H5S_GET_SELECT_TYPE(mem_space)
+ || H5S_SEL_POINTS == H5S_GET_SELECT_TYPE(file_space)) {
local_opinion = FALSE;
goto broadcast;
} /* end if */
/* Dataset storage must be contiguous or chunked */
- if (!(io_info->dset->shared->layout.type == H5D_CONTIGUOUS ||
+ if(!(io_info->dset->shared->layout.type == H5D_CONTIGUOUS ||
io_info->dset->shared->layout.type == H5D_CHUNKED)) {
local_opinion = FALSE;
goto broadcast;
} /* end if */
- /* The handling of memory space is different for chunking
- and contiguous storage,
- For contigous storage, mem_space and file_space won't
- change when it it is doing disk IO.
- For chunking storage, mem_space will change for different
- chunks. So for chunking storage, whether we can use
- collective IO will defer until each chunk IO is reached.
- For contiguous storage, if we find MPI-IO cannot
- support complicated MPI derived data type and the shape
- of data space is not regular, we will
- set use_par_opt_io = FALSE.
- */
+ /* The handling of memory space is different for chunking and contiguous
+ * storage. For contiguous storage, mem_space and file_space won't change
+ * when it it is doing disk IO. For chunking storage, mem_space will
+ * change for different chunks. So for chunking storage, whether we can
+ * use collective IO will defer until each chunk IO is reached. For
+ * contiguous storage, if we find MPI-IO cannot support complicated MPI
+ * derived data type and the shape of data space is not regular, we will
+ * set use_par_opt_io = FALSE.
+ */
#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
if(io_info->dset->shared->layout.type == H5D_CONTIGUOUS)
if((H5S_SELECT_IS_REGULAR(file_space) != TRUE) ||
@@ -243,81 +238,43 @@ H5D_mpio_opt_possible( const H5D_io_info_t *io_info,
#endif
/* Don't allow collective operations if filters need to be applied */
- if(io_info->dset->shared->layout.type == H5D_CHUNKED)
- if(io_info->dset->shared->dcpl_cache.pline.nused>0) {
+ if(io_info->dset->shared->layout.type == H5D_CHUNKED) {
+ if(io_info->dset->shared->dcpl_cache.pline.nused > 0) {
local_opinion = FALSE;
goto broadcast;
} /* end if */
- /* Don't allow collective operations if datatype conversions need to happen */
- if(!H5T_path_noop(tpath)) {
- local_opinion = FALSE;
- goto broadcast;
- } /* end if */
-
- /* Don't allow collective operations if data transform operations should occur */
- if(!H5Z_xform_noop(io_info->dxpl_cache->data_xform_prop)) {
- local_opinion = FALSE;
- goto broadcast;
+/* If H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS and H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+ * are defined, the HDF5 library will do collective IO if the application
+ * asks for it.
+ *
+ * If H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS is not defined and one or more
+ * processes are not participating in the IO, then collective IO is not
+ * assured. The library will check each process for the number of chunks
+ * it involves. If any process involves zero chunks, the library will use
+ * independent IO mode instead.
+ */
+#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS
+ /* Check the number of chunks to perform I/O on */
+ if(0 == H5SL_count(fm->sel_chunks)) {
+ local_opinion = FALSE;
+ goto broadcast;
+ } /* end if */
+#endif /* H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS */
} /* end if */
broadcast:
/* Form consensus opinion among all processes about whether to perform
- * collective I/O */
- if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_opinion, &consensus, 1, MPI_INT, MPI_LAND, io_info->comm)))
+ * collective I/O
+ */
+ if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&local_opinion, &consensus, 1, MPI_INT, MPI_LAND, io_info->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
ret_value = consensus > 0 ? TRUE : FALSE;
done:
- FUNC_LEAVE_NOAPI(ret_value);
-} /* H5D_mpio_opt_possible() */
-
-
-/*-------------------------------------------------------------------------
- * Function: H5D_mpio_chunk_adjust_iomode
- *
- * Decription: If H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS and
- H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS are defined,
- the HDF5 library will do collective IO if the application asks for it.
-
- If H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS is not defined
- and one or more processes are not participating in the IO,
- then collective IO is not assured. The library will check
- each process for the
- number of chunks it involves. If any process involves zero chunks,
- the library will use independent IO mode instead.
- This function is only used for linked chunk IO.
- * Purpose: Checks if it is possible to do collective IO
- *
- * Return: Success: Non-negative: TRUE or FALSE
- * Failure: Negative
- *
- * Programmer: Muqun Yang
- * Monday, Feb. 13th, 2006
- *
- *-------------------------------------------------------------------------
- */
-#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS
-herr_t
-H5D_mpio_chunk_adjust_iomode(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm)
-{
- int min_chunk;
- herr_t ret_value = SUCCEED;
-
- FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_chunk_adjust_iomode)
-
- if(H5D_mpio_get_min_chunk(io_info,fm,&min_chunk) < 0)
- HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes");
- if(min_chunk == 0) {
- /* Switch to independent I/O */
- if(H5D_ioinfo_make_ind(io_info) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
- } /* end if */
-done:
FUNC_LEAVE_NOAPI(ret_value)
-}
-#endif
+} /* H5D_mpio_opt_possible() */
/*-------------------------------------------------------------------------
@@ -332,23 +289,20 @@ done:
*-------------------------------------------------------------------------
*/
herr_t
-H5D_mpio_select_read(H5D_io_info_t *io_info,
- size_t mpi_buf_count,
- const size_t UNUSED elmt_size,
- const H5S_t UNUSED *file_space,
- const H5S_t UNUSED *mem_space,
- haddr_t addr,
- void UNUSED *pointer,
- void *buf/*out*/)
+H5D_mpio_select_read(const H5D_io_info_t *io_info, const H5D_type_info_t UNUSED *type_info,
+ hsize_t mpi_buf_count, const H5S_t UNUSED *file_space, const H5S_t UNUSED *mem_space)
{
+ const H5D_contig_storage_t *store_contig = &(io_info->store->contig); /* Contiguous storage info for this I/O operation */
herr_t ret_value = SUCCEED;
- FUNC_ENTER_NOAPI(H5D_mpio_select_read,FAIL);
+ FUNC_ENTER_NOAPI(H5D_mpio_select_read, FAIL)
+
+ H5_CHECK_OVERFLOW(mpi_buf_count, hsize_t, size_t);
+ if(H5F_block_read(io_info->dset->oloc.file, H5FD_MEM_DRAW, store_contig->dset_addr, (size_t)mpi_buf_count, io_info->dxpl_id, io_info->u.rbuf) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "can't finish collective parallel read")
- if(H5F_block_read (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf) < 0)
- HGOTO_ERROR(H5E_IO,H5E_READERROR,FAIL,"can't finish collective parallel read");
done:
- FUNC_LEAVE_NOAPI(ret_value);
+ FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D_mpio_select_read() */
@@ -364,32 +318,28 @@ done:
*-------------------------------------------------------------------------
*/
herr_t
-H5D_mpio_select_write(H5D_io_info_t *io_info,
- size_t mpi_buf_count,
- const size_t UNUSED elmt_size,
- const H5S_t UNUSED *file_space,
- const H5S_t UNUSED *mem_space,
- haddr_t addr,
- void UNUSED *pointer,
- const void *buf)
+H5D_mpio_select_write(const H5D_io_info_t *io_info, const H5D_type_info_t UNUSED *type_info,
+ hsize_t mpi_buf_count, const H5S_t UNUSED *file_space, const H5S_t UNUSED *mem_space)
{
+ const H5D_contig_storage_t *store_contig = &(io_info->store->contig); /* Contiguous storage info for this I/O operation */
herr_t ret_value = SUCCEED;
- FUNC_ENTER_NOAPI(H5D_mpio_select_write,FAIL);
+ FUNC_ENTER_NOAPI(H5D_mpio_select_write, FAIL)
/*OKAY: CAST DISCARDS CONST QUALIFIER*/
- if(H5F_block_write (io_info->dset->oloc.file, H5FD_MEM_DRAW, addr, mpi_buf_count, io_info->dxpl_id, buf)<0)
- HGOTO_ERROR(H5E_IO,H5E_WRITEERROR,FAIL,"can't finish collective parallel write");
+ H5_CHECK_OVERFLOW(mpi_buf_count, hsize_t, size_t);
+ if(H5F_block_write(io_info->dset->oloc.file, H5FD_MEM_DRAW, store_contig->dset_addr, (size_t)mpi_buf_count, io_info->dxpl_id, io_info->u.wbuf) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "can't finish collective parallel write")
done:
- FUNC_LEAVE_NOAPI(ret_value);
+ FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D_mpio_select_write() */
/*-------------------------------------------------------------------------
- * Function: H5D_ioinfo_make_ind
+ * Function: H5D_ioinfo_xfer_mode
*
- * Purpose: Switch to MPI independent I/O
+ * Purpose: Switch to between collective & independent MPI I/O
*
* Return: Non-negative on success/Negative on failure
*
@@ -399,42 +349,42 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5D_ioinfo_make_ind(H5D_io_info_t *io_info)
+H5D_ioinfo_xfer_mode(H5D_io_info_t *io_info, H5P_genplist_t *dx_plist,
+ H5FD_mpio_xfer_t xfer_mode)
{
- H5P_genplist_t *dx_plist; /* Data transer property list */
herr_t ret_value = SUCCEED; /*return value */
- FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_ind)
-
- /* Get the dataset transfer property list */
- if (NULL == (dx_plist = H5I_object(io_info->dxpl_id)))
- HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list")
+ FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_xfer_mode)
- /* Change the xfer_mode to independent, handle the request,
- * then set xfer_mode before return.
- */
- io_info->dxpl_cache->xfer_mode = H5FD_MPIO_INDEPENDENT;
- if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0)
+ /* Change the xfer_mode */
+ io_info->dxpl_cache->xfer_mode = xfer_mode;
+ if(H5P_set(dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
- /* Set the pointers to the non-MPI-specific routines */
- io_info->ops.read = H5D_select_read;
- io_info->ops.write = H5D_select_write;
+ /* Change the "single I/O" function pointers */
+ if(xfer_mode == H5FD_MPIO_INDEPENDENT) {
+ /* Set the pointers to the original, non-MPI-specific routines */
+ io_info->io_ops.single_read = io_info->orig.io_ops.single_read;
+ io_info->io_ops.single_write = io_info->orig.io_ops.single_write;
+ } /* end if */
+ else {
+ HDassert(xfer_mode == H5FD_MPIO_COLLECTIVE);
- /* Indicate that the transfer mode should be restored before returning
- * to user.
- */
- io_info->xfer_mode_changed=TRUE;
+ /* Set the pointers to the MPI-specific routines */
+ io_info->io_ops.single_read = H5D_mpio_select_read;
+ io_info->io_ops.single_write = H5D_mpio_select_write;
+ } /* end else */
done:
FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D_ioinfo_make_ind() */
+} /* end H5D_ioinfo_xfer_mode() */
/*-------------------------------------------------------------------------
- * Function: H5D_ioinfo_make_coll_opt
+ * Function: H5D_ioinfo_coll_opt_mode
*
- * Purpose: Switch to MPI independent I/O with file set view
+ * Purpose: Switch between using collective & independent MPI I/O w/file
+ * set view
*
* Return: Non-negative on success/Negative on failure
*
@@ -444,87 +394,21 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5D_ioinfo_make_coll_opt(H5D_io_info_t *io_info)
+H5D_ioinfo_coll_opt_mode(H5D_io_info_t *io_info, H5P_genplist_t *dx_plist,
+ H5FD_mpio_collective_opt_t coll_opt_mode)
{
- H5P_genplist_t *dx_plist; /* Data transer property list */
herr_t ret_value = SUCCEED; /*return value */
- FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_coll_opt)
+ FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_coll_opt_mode)
- /* Get the dataset transfer property list */
- if (NULL == (dx_plist = H5I_object(io_info->dxpl_id)))
- HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list")
-
- /* Change the optimial xfer_mode to independent, handle the request,
- * then set xfer_mode before return.
- */
- io_info->dxpl_cache->xfer_opt_mode = H5FD_MPIO_INDIVIDUAL_IO;
- if(H5P_set (dx_plist, H5D_XFER_IO_XFER_OPT_MODE_NAME, &io_info->dxpl_cache->xfer_opt_mode) < 0)
+ /* Change the optimal xfer_mode */
+ io_info->dxpl_cache->coll_opt_mode = coll_opt_mode;
+ if(H5P_set(dx_plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME, &io_info->dxpl_cache->coll_opt_mode) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
- /* Set the pointers to the non-MPI-specific routines */
- io_info->ops.read = H5D_mpio_select_read;
- io_info->ops.write = H5D_mpio_select_write;
-
- /* Indicate that the transfer mode should be restored before returning
- * to user.
- */
- io_info->xfer_opt_mode_changed = TRUE;
-
done:
FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D_ioinfo_make_coll_opt() */
-
-
-/*-------------------------------------------------------------------------
- * Function: H5D_ioinfo_make_coll
- *
- * Purpose: Switch to MPI collective I/O
- *
- * Return: Non-negative on success/Negative on failure
- *
- * Programmer: Quincey Koziol
- * Friday, August 12, 2005
- *
- *-------------------------------------------------------------------------
- */
-static herr_t
-H5D_ioinfo_make_coll(H5D_io_info_t *io_info)
-{
- H5P_genplist_t *dx_plist; /* Data transer property list */
- herr_t ret_value = SUCCEED; /*return value */
-
- FUNC_ENTER_NOAPI_NOINIT(H5D_ioinfo_make_coll)
-
- /* Get the dataset transfer property list */
- if (NULL == (dx_plist = H5I_object(io_info->dxpl_id)))
- HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list")
-
- /* Change the xfer_mode to independent, handle the request,
- * then set xfer_mode before return.
- */
- io_info->dxpl_cache->xfer_mode = H5FD_MPIO_COLLECTIVE;
- if(H5P_set (dx_plist, H5D_XFER_IO_XFER_MODE_NAME, &io_info->dxpl_cache->xfer_mode) < 0)
- HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
-
- io_info->dxpl_cache->xfer_opt_mode = H5FD_MPIO_COLLECTIVE_IO;
- if(H5P_set (dx_plist, H5D_XFER_IO_XFER_OPT_MODE_NAME, &io_info->dxpl_cache->xfer_opt_mode) < 0)
- HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set transfer mode")
-
-
- /* Set the pointers to the MPI-specific routines */
- io_info->ops.read = H5D_mpio_select_read;
- io_info->ops.write = H5D_mpio_select_write;
-
- /* Indicate that the transfer mode should _NOT_ be restored before returning
- * to user.
- */
- io_info->xfer_mode_changed=FALSE;
- io_info->xfer_opt_mode_changed=FALSE;
-
-done:
- FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D_ioinfo_make_coll() */
+} /* end H5D_ioinfo_coll_opt_mode() */
/*-------------------------------------------------------------------------
@@ -535,29 +419,30 @@ done:
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer:
+ * Programmer: Muqun Yang
+ * Monday, Feb. 13th, 2006
*
*-------------------------------------------------------------------------
*/
static herr_t
-H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info,
- const H5D_chunk_map_t *fm, int *min_chunkf)
+H5D_mpio_get_min_chunk(const H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
+ int *min_chunkf)
{
int num_chunkf; /* Number of chunks to iterate over */
int mpi_code; /* MPI return code */
herr_t ret_value = SUCCEED;
- FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_min_chunk);
+ FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_min_chunk)
/* Get the number of chunks to perform I/O on */
num_chunkf = H5SL_count(fm->sel_chunks);
/* Determine the minimum # of chunks for all processes */
- if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, min_chunkf, 1, MPI_INT, MPI_MIN, io_info->comm)))
+ if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, min_chunkf, 1, MPI_INT, MPI_MIN, io_info->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
done:
- FUNC_LEAVE_NOAPI(ret_value);
+ FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D_mpio_get_min_chunk() */
@@ -569,221 +454,252 @@ done:
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer:
+ * Programmer: Muqun Yang
+ * Monday, Feb. 13th, 2006
*
*-------------------------------------------------------------------------
*/
static herr_t
-H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info,
- const H5D_chunk_map_t *fm, int *sum_chunkf)
+H5D_mpio_get_sum_chunk(const H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
+ int *sum_chunkf)
{
int num_chunkf; /* Number of chunks to iterate over */
size_t ori_num_chunkf;
int mpi_code; /* MPI return code */
herr_t ret_value = SUCCEED;
- FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_sum_chunk);
+ FUNC_ENTER_NOAPI_NOINIT(H5D_mpio_get_sum_chunk)
/* Get the number of chunks to perform I/O on */
num_chunkf = 0;
ori_num_chunkf = H5SL_count(fm->sel_chunks);
- H5_ASSIGN_OVERFLOW(num_chunkf,ori_num_chunkf,size_t,int);
+ H5_ASSIGN_OVERFLOW(num_chunkf, ori_num_chunkf, size_t, int);
/* Determine the summation of number of chunks for all processes */
- if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, sum_chunkf, 1, MPI_INT, MPI_SUM, io_info->comm)))
+ if(MPI_SUCCESS != (mpi_code = MPI_Allreduce(&num_chunkf, sum_chunkf, 1, MPI_INT, MPI_SUM, io_info->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Allreduce failed", mpi_code)
done:
- FUNC_LEAVE_NOAPI(ret_value);
+ FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D_mpio_get_sum_chunk() */
/*-------------------------------------------------------------------------
- * Function: H5D_contig_collective_io
+ * Function: H5D_contig_collective_read
*
- * Purpose: Wrapper Routine for H5D_inter_collective_io
- The starting file address of contiguous layout
- will be calculated and passed to H5D_inter_collective_io routine.
- *
+ * Purpose: Reads directly from contiguous data in file into application
+ * memory using collective I/O.
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer:
- *
- * Modifications:
+ * Programmer: Quincey Koziol
+ * Tuesday, March 4, 2008
*
*-------------------------------------------------------------------------
*/
herr_t
-H5D_contig_collective_io(H5D_io_info_t *io_info,
- const H5S_t *file_space,
- const H5S_t *mem_space,
- const void *buf,
- hbool_t do_write)
+H5D_contig_collective_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
+ hsize_t UNUSED nelmts, const H5S_t *file_space, const H5S_t *mem_space,
+ H5D_chunk_map_t UNUSED *fm)
{
+ herr_t ret_value = SUCCEED; /* Return value */
+ FUNC_ENTER_NOAPI(H5D_contig_collective_read, FAIL)
- haddr_t addr = HADDR_UNDEF; /* Address of dataset (or selection) within file */
- herr_t ret_value = SUCCEED; /* return value */
+ /* Sanity check */
+ HDassert(IS_H5FD_MPIO(io_info->dset->oloc.file));
+ HDassert(TRUE == H5P_isa_class(io_info->dxpl_id, H5P_DATASET_XFER));
+
+ /* Call generic internal collective I/O routine */
+ if(H5D_inter_collective_io(io_info, type_info, file_space, mem_space) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't finish shared collective MPI-IO")
- FUNC_ENTER_NOAPI_NOINIT(H5D_contig_collective_io)
- assert (IS_H5FD_MPIO(io_info->dset->oloc.file));
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_contig_collective_read() */
- /* Make certain we have the correct type of property list */
- assert(TRUE==H5P_isa_class(io_info->dxpl_id,H5P_DATASET_XFER));
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_contig_collective_write
+ *
+ * Purpose: Write directly to contiguous data in file from application
+ * memory using collective I/O.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Quincey Koziol
+ * Tuesday, March 4, 2008
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5D_contig_collective_write(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
+ hsize_t UNUSED nelmts, const H5S_t *file_space, const H5S_t *mem_space,
+ H5D_chunk_map_t UNUSED *fm)
+{
+ herr_t ret_value = SUCCEED; /* Return value */
- /* Get the base address of the contiguous dataset */
- if(io_info->dset->shared->layout.type == H5D_CONTIGUOUS)
- addr = H5D_contig_get_addr(io_info->dset);
+ FUNC_ENTER_NOAPI(H5D_contig_collective_write, FAIL)
- if(H5D_inter_collective_io(io_info,file_space,mem_space,addr,buf,do_write)<0)
- HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
-
- done:
+ /* Sanity check */
+ HDassert(IS_H5FD_MPIO(io_info->dset->oloc.file));
+ HDassert(TRUE == H5P_isa_class(io_info->dxpl_id, H5P_DATASET_XFER));
+ /* Call generic internal collective I/O routine */
+ if(H5D_inter_collective_io(io_info, type_info, file_space, mem_space) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "couldn't finish shared collective MPI-IO")
+
+done:
FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D_contig_collective_io */
+} /* end H5D_contig_collective_write() */
/*-------------------------------------------------------------------------
* Function: H5D_chunk_collective_io
*
* Purpose: Routine for
- 1) choose an IO option:
- a) One collective IO defined by one MPI derived datatype to link through all chunks
- or b) multiple chunk IOs,to do MPI-IO for each chunk, the IO mode may be adjusted
- due to the selection pattern for each chunk.
+ * 1) choose an IO option:
+ * a) One collective IO defined by one MPI derived datatype to link through all chunks
+ * or b) multiple chunk IOs,to do MPI-IO for each chunk, the IO mode may be adjusted
+ * due to the selection pattern for each chunk.
* For option a)
- 1. Sort the chunk address, obtain chunk info according to the sorted chunk address
- 2. Build up MPI derived datatype for each chunk
- 3. Build up the final MPI derived datatype
- 4. Set up collective IO property list
- 5. Do IO
+ * 1. Sort the chunk address, obtain chunk info according to the sorted chunk address
+ * 2. Build up MPI derived datatype for each chunk
+ * 3. Build up the final MPI derived datatype
+ * 4. Set up collective IO property list
+ * 5. Do IO
* For option b)
- 1. Use MPI_gather and MPI_Bcast to obtain information of *collective/independent/none*
- IO mode for each chunk of the selection
- 2. Depending on whether the IO mode is collective or independent or none,
- Create either MPI derived datatype for each chunk to do collective IO or
- just do independent IO or independent IO with file set view
- 3. Set up collective IO property list for collective mode
- 4. DO IO
+ * 1. Use MPI_gather and MPI_Bcast to obtain information of *collective/independent/none*
+ * IO mode for each chunk of the selection
+ * 2. Depending on whether the IO mode is collective or independent or none,
+ * Create either MPI derived datatype for each chunk to do collective IO or
+ * just do independent IO or independent IO with file set view
+ * 3. Set up collective IO property list for collective mode
+ * 4. DO IO
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer:
- *
- * Modifications:
+ * Programmer: Muqun Yang
+ * Monday, Feb. 13th, 2006
*
*-------------------------------------------------------------------------
*/
-herr_t
-H5D_chunk_collective_io(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf, hbool_t do_write)
+static herr_t
+H5D_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
+ H5D_chunk_map_t *fm)
{
-
- int io_option = H5D_MULTI_CHUNK_IO_MORE_OPT;
- int sum_chunk = 0,mpi_size;
- unsigned one_link_chunk_io_threshold;
- H5P_genplist_t *plist;
+ H5P_genplist_t *dx_plist; /* Pointer to DXPL */
H5FD_mpio_chunk_opt_t chunk_opt_mode;
-
+ int io_option = H5D_MULTI_CHUNK_IO_MORE_OPT;
+ int sum_chunk = -1;
#ifdef H5_HAVE_INSTRUMENTED_LIBRARY
- htri_t check_prop,temp_not_link_io = FALSE;
- int new_value;
+ htri_t temp_not_link_io = FALSE;
#endif
- herr_t ret_value = SUCCEED;
+ herr_t ret_value = SUCCEED;
FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_collective_io)
- assert (IS_H5FD_MPIO(io_info->dset->oloc.file));
+ /* Sanity checks */
+ HDassert(io_info);
+ HDassert(io_info->using_mpi_vfd);
+ HDassert(type_info);
+ HDassert(fm);
/* Obtain the data transfer properties */
- if(NULL == (plist = H5I_object(io_info->dxpl_id)))
+ if(NULL == (dx_plist = H5I_object(io_info->dxpl_id)))
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
-
+
/* Check the optional property list on what to do with collective chunk IO. */
- chunk_opt_mode=(H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME);
-
+ chunk_opt_mode = (H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(dx_plist, H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME);
if(chunk_opt_mode == H5FD_MPIO_CHUNK_ONE_IO)
- io_option = H5D_ONE_LINK_CHUNK_IO;/*no opt*/
+ io_option = H5D_ONE_LINK_CHUNK_IO; /*no opt*/
else if(chunk_opt_mode == H5FD_MPIO_CHUNK_MULTI_IO)
- io_option = H5D_MULTI_CHUNK_IO;/*no opt */
+ io_option = H5D_MULTI_CHUNK_IO; /*no opt */
else {
- if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0)
+ unsigned one_link_chunk_io_threshold; /* Threshhold to use single collective I/O for all chunks */
+ int mpi_size; /* Number of processes in MPI job */
+
+ if(H5D_mpio_get_sum_chunk(io_info, fm, &sum_chunk) < 0)
HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes");
- if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
- HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
+ if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size")
- if(NULL == (plist = H5I_object(io_info->dxpl_id)))
- HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
-
- one_link_chunk_io_threshold =H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME);
+ one_link_chunk_io_threshold = H5P_peek_unsigned(dx_plist, H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME);
- /* step 1: choose an IO option */
- /* If the average number of chunk per process is greater than a threshold, we will do one link chunked IO. */
- if((unsigned)sum_chunk/mpi_size >= one_link_chunk_io_threshold)
+ /* step 1: choose an IO option */
+ /* If the average number of chunk per process is greater than a threshold, we will do one link chunked IO. */
+ if((unsigned)sum_chunk / mpi_size >= one_link_chunk_io_threshold)
io_option = H5D_ONE_LINK_CHUNK_IO_MORE_OPT;
#ifdef H5_HAVE_INSTRUMENTED_LIBRARY
- else
- temp_not_link_io = TRUE;
+ else
+ temp_not_link_io = TRUE;
+#endif
+ } /* end else */
+
+#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+ if(io_option == H5D_ONE_LINK_CHUNK_IO)
+ io_option = H5D_MULTI_CHUNK_IO; /* We can not do this with one chunk IO. */
+ if(io_option == H5D_ONE_LINK_CHUNK_IO_MORE_OPT)
+ io_option = H5D_MULTI_CHUNK_IO_MORE_OPT;
#endif
- }
#ifdef H5_HAVE_INSTRUMENTED_LIBRARY
+{
+ htri_t check_prop;
+ int new_value;
+
/*** Test collective chunk user-input optimization APIs. ***/
- check_prop = H5Pexist(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_LINK_HARD_NAME);
+ check_prop = H5Pexist(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_LINK_HARD_NAME);
if(check_prop > 0) {
if(io_option == H5D_ONE_LINK_CHUNK_IO) {
- new_value = 0;
- if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_LINK_HARD_NAME,&new_value)<0)
- HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to get property value");
- }
- }
- check_prop = H5Pexist(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_HARD_NAME);
+ new_value = 0;
+ if(H5Pset(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_LINK_HARD_NAME, &new_value) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTSET, FAIL, "unable to set property value")
+ } /* end if */
+ } /* end if */
+ check_prop = H5Pexist(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_MULTI_HARD_NAME);
if(check_prop > 0) {
- if(io_option == H5D_MULTI_CHUNK_IO) {
- new_value = 0;
- if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_HARD_NAME,&new_value)<0)
- HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to get property value");
- }
- }
- check_prop = H5Pexist(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_LINK_NUM_TRUE_NAME);
+ if(io_option == H5D_MULTI_CHUNK_IO) {
+ new_value = 0;
+ if(H5Pset(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_MULTI_HARD_NAME, &new_value) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTSET, FAIL, "unable to set property value")
+ } /* end if */
+ } /* end if */
+ check_prop = H5Pexist(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_LINK_NUM_TRUE_NAME);
if(check_prop > 0) {
- if(io_option == H5D_ONE_LINK_CHUNK_IO_MORE_OPT) {
- new_value = 0;
- if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_LINK_NUM_TRUE_NAME,&new_value)<0)
- HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to get property value");
- }
- }
- check_prop = H5Pexist(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_LINK_NUM_FALSE_NAME);
+ if(io_option == H5D_ONE_LINK_CHUNK_IO_MORE_OPT) {
+ new_value = 0;
+ if(H5Pset(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_LINK_NUM_TRUE_NAME, &new_value) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTSET, FAIL, "unable to set property value")
+ } /* end if */
+ } /* end if */
+ check_prop = H5Pexist(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_LINK_NUM_FALSE_NAME);
if(check_prop > 0) {
- if(temp_not_link_io){
- new_value = 0;
- if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_LINK_NUM_FALSE_NAME,&new_value)<0)
- HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to get property value");
- }
- }
-#endif
-
-#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
- if(io_option == H5D_ONE_LINK_CHUNK_IO )
- io_option = H5D_MULTI_CHUNK_IO ;/* We can not do this with one chunk IO. */
- if(io_option == H5D_ONE_LINK_CHUNK_IO_MORE_OPT)
- io_option = H5D_MULTI_CHUNK_IO_MORE_OPT;
+ if(temp_not_link_io) {
+ new_value = 0;
+ if(H5Pset(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_LINK_NUM_FALSE_NAME, &new_value) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTSET, FAIL, "unable to set property value")
+ } /* end if */
+ } /* end if */
+}
#endif
/* step 2: Go ahead to do IO.*/
+#ifdef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
if(io_option == H5D_ONE_LINK_CHUNK_IO || io_option == H5D_ONE_LINK_CHUNK_IO_MORE_OPT) {
- if(H5D_link_chunk_collective_io(io_info,fm,buf,do_write,sum_chunk)<0)
- HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish linked chunk MPI-IO");
- }
- else if(io_option == H5D_MULTI_CHUNK_IO) {
- if(H5D_multi_chunk_collective_io_no_opt(io_info,fm,buf,do_write)<0)
- HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish multiple chunk MPI-IO");
- }
+ if(H5D_link_chunk_collective_io(io_info, type_info, fm, sum_chunk) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish linked chunk MPI-IO")
+ } /* end if */
+ else
+#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */
+ if(io_option == H5D_MULTI_CHUNK_IO) {
+ if(H5D_multi_chunk_collective_io_no_opt(io_info, type_info, fm, dx_plist) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish multiple chunk MPI-IO")
+ } /* end if */
else { /*multiple chunk IOs with opt */
- if(H5D_multi_chunk_collective_io(io_info,fm,buf,do_write)<0)
- HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish multiple chunk MPI-IO");
- }
+ if(H5D_multi_chunk_collective_io(io_info, type_info, fm, dx_plist) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish optimized multiple chunk MPI-IO")
+ } /* end else */
done:
FUNC_LEAVE_NOAPI(ret_value)
@@ -791,459 +707,583 @@ done:
/*-------------------------------------------------------------------------
- * Function: H5D_link_chunk_collective_io
+ * Function: H5D_chunk_collective_read
*
- * Purpose: Routine for one collective IO with one MPI derived datatype to link with all chunks
+ * Purpose: Reads directly from chunks in file into application memory
+ * using collective I/O.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Quincey Koziol
+ * Tuesday, March 4, 2008
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5D_chunk_collective_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
+ hsize_t UNUSED nelmts, const H5S_t UNUSED *file_space, const H5S_t UNUSED *mem_space,
+ H5D_chunk_map_t *fm)
+{
+ herr_t ret_value = SUCCEED; /* Return value */
- 1. Sort the chunk address and chunk info
- 2. Build up MPI derived datatype for each chunk
- 3. Build up the final MPI derived datatype
- 4. Use common collective IO routine to do MPI-IO
+ FUNC_ENTER_NOAPI(H5D_chunk_collective_read, FAIL)
+ /* Call generic selection operation */
+ if(H5D_chunk_collective_io(io_info, type_info, fm) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_READERROR, FAIL, "read error")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_chunk_collective_read() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_chunk_collective_write
*
- * Return: Non-negative on success/Negative on failure
+ * Purpose: Write directly to chunks in file from application memory
+ * using collective I/O.
*
- * Programmer:
+ * Return: Non-negative on success/Negative on failure
*
- * Modifications:
+ * Programmer: Quincey Koziol
+ * Tuesday, March 4, 2008
*
*-------------------------------------------------------------------------
*/
+herr_t
+H5D_chunk_collective_write(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
+ hsize_t UNUSED nelmts, const H5S_t UNUSED *file_space, const H5S_t UNUSED *mem_space,
+ H5D_chunk_map_t *fm)
+{
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(H5D_chunk_collective_write, FAIL)
+
+ /* Call generic selection operation */
+ if(H5D_chunk_collective_io(io_info, type_info, fm) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_WRITEERROR, FAIL, "write error")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_chunk_collective_write() */
+#ifdef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
+
+/*-------------------------------------------------------------------------
+ * Function: H5D_link_chunk_collective_io
+ *
+ * Purpose: Routine for one collective IO with one MPI derived datatype to link with all chunks
+ *
+ * 1. Sort the chunk address and chunk info
+ * 2. Build up MPI derived datatype for each chunk
+ * 3. Build up the final MPI derived datatype
+ * 4. Use common collective IO routine to do MPI-IO
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Muqun Yang
+ * Monday, Feb. 13th, 2006
+ *
+ *-------------------------------------------------------------------------
+ */
static herr_t
-H5D_link_chunk_collective_io(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf, hbool_t do_write,int sum_chunk)
+H5D_link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
+ H5D_chunk_map_t *fm, int sum_chunk)
{
- size_t src_type_size; /*size of source type */
- size_t dst_type_size; /*size of destination type*/
- hsize_t mpi_buf_extra_offset;
- hsize_t mpi_file_extra_offset;
- size_t mpi_buf_count;
- size_t mpi_file_count;
- hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */
- mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */
-
- int mpi_size,mpi_code; /* MPI return code */
-
- int i,num_chunk=0,total_chunks;
- size_t ori_num_chunk;
- hsize_t ori_total_chunks;
- haddr_t chunk_base_addr;
- haddr_t* total_chunk_addr_array=NULL;
- MPI_Datatype *chunk_mtype=NULL;
- MPI_Datatype *chunk_ftype=NULL;
- MPI_Datatype chunk_final_mtype;
- MPI_Datatype chunk_final_ftype;
- MPI_Aint *chunk_disp_array=NULL;
- MPI_Aint *chunk_mem_disp_array=NULL;
- int *blocklen=NULL;
- int blocklen_value;
- int actual_bsearch_coll_chunk_threshold;
- int bsearch_coll_chunk_threshold;
- int many_chunk_opt = 0;
-
- H5D_common_coll_info_t coll_info;
- H5D_chunk_addr_info_t* chunk_addr_info_array=NULL;
-
- herr_t ret_value = SUCCEED;
-
- FUNC_ENTER_NOAPI_NOINIT(H5D_link_chunk_collective_io)
- ori_total_chunks = fm->total_chunks;
- H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int);
-
- /* Handle with a special case when only one chunk is covered by all processes */
- if(total_chunks == 1){
- H5SL_node_t *chunk_node;
- H5D_chunk_info_t *chunk_info;
- H5D_storage_t store;
+ H5D_chunk_addr_info_t *chunk_addr_info_array = NULL;
+ hbool_t mbt_is_derived = FALSE;
+ hbool_t mft_is_derived = FALSE;
+ MPI_Datatype chunk_final_mtype; /* Final memory MPI datatype for all chunks with seletion */
+ MPI_Datatype chunk_final_ftype; /* Final file MPI datatype for all chunks with seletion */
+ H5D_storage_t ctg_store; /* Storage info for "fake" contiguous dataset */
+ size_t total_chunks;
+ haddr_t *total_chunk_addr_array = NULL;
+ MPI_Datatype *chunk_mtype = NULL;
+ MPI_Datatype *chunk_ftype = NULL;
+ MPI_Aint *chunk_disp_array = NULL;
+ MPI_Aint *chunk_mem_disp_array = NULL;
+ int *blocklen = NULL;
+ int mpi_code; /* MPI return code */
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_link_chunk_collective_io)
+
+ /* Get the sum # of chunks, if not already available */
+ if(sum_chunk < 0) {
+ if(H5D_mpio_get_sum_chunk(io_info, fm, &sum_chunk) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes");
+ } /* end if */
+
+ /* Retrieve total # of chunks in dataset */
+ H5_ASSIGN_OVERFLOW(total_chunks, fm->total_chunks, hsize_t, size_t);
+
+ /* Handle special case when dataspace dimensions only allow one chunk in
+ * the dataset. [This sometimes is used by developers who want the
+ * equivalent of compressed contiguous datasets - QAK]
+ */
+ if(total_chunks == 1) {
+ H5D_storage_t chk_store; /* Temporary storage info for chunk address lookup */
+ hsize_t coords[H5O_LAYOUT_NDIMS]; /* Coordinates of chunk in file dataset's dataspace */
+ H5SL_node_t *chunk_node; /* Pointer to chunk node for selection */
+ H5S_t *fspace; /* Dataspace describing chunk & selection in it */
+ H5S_t *mspace; /* Dataspace describing selection in memory corresponding to this chunk */
+
+ /* Initialize the chunk coordinates */
+ /* (must be all zero, since there's only one chunk) */
+ HDmemset(coords, 0, sizeof(coords));
+
+ /* Look up address of chunk */
+ io_info->store = &chk_store;
+ chk_store.chunk.offset = coords;
+ chk_store.chunk.index = 0;
+ if(HADDR_UNDEF == (ctg_store.contig.dset_addr = H5D_istore_get_addr(io_info, NULL)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk info from skipped list")
+ /* Check for this process having selection in this chunk */
chunk_node = H5SL_first(fm->sel_chunks);
if(chunk_node == NULL) {
- if(H5D_istore_chunkmap(io_info, &chunk_base_addr, fm->down_chunks) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
- if(H5D_inter_collective_io(io_info,NULL,NULL,chunk_base_addr,buf,do_write)<0)
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
- }
+ /* Set the dataspace info for I/O to NULL, this process doesn't have any I/O to perform */
+ fspace = mspace = NULL;
+ } /* end if */
else {
- if(NULL ==(chunk_info = H5SL_item(chunk_node)))
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
- io_info->store = &store;
- store.chunk.offset = chunk_info->coords;
- store.chunk.index = chunk_info->index;
+ H5D_chunk_info_t *chunk_info;
+
+ /* Get the chunk info, for the selection in the chunk */
+ if(NULL == (chunk_info = H5SL_item(chunk_node)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk info from skipped list")
+
+ /* Set the dataspace info for I/O */
+ fspace = chunk_info->fspace;
+ mspace = chunk_info->mspace;
+ } /* end else */
+
+ /* Set up the base storage address for this chunk */
+ io_info->store = &ctg_store;
- if(HADDR_UNDEF==(chunk_base_addr = H5D_istore_get_addr(io_info,NULL)))
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
-
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
- HDfprintf(H5DEBUG(D),"before inter_collective_io for total chunk = 1 \n");
+if(H5DEBUG(D))
+ HDfprintf(H5DEBUG(D),"before inter_collective_io for total chunk = 1 \n");
#endif
- if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,chunk_base_addr,buf,do_write)<0)
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
- }
- goto done;
- }
- /* Allocate chunking information */
- ori_num_chunk = H5SL_count(fm->sel_chunks);
- H5_ASSIGN_OVERFLOW(num_chunk,ori_num_chunk,size_t,int);
+ /* Perform I/O */
+ if(H5D_inter_collective_io(io_info, type_info, fspace, mspace) < 0)
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't finish shared collective MPI-IO")
+ } /* end if */
+ else {
+ size_t mpi_buf_count; /* Number of MPI types */
+ size_t num_chunk; /* Number of chunks for this process */
+ size_t u; /* Local index variable */
+
+ /* Get the number of chunks with a selection */
+ num_chunk = H5SL_count(fm->sel_chunks);
+ H5_CHECK_OVERFLOW(num_chunk, size_t, int);
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
- HDfprintf(H5DEBUG(D),"total_chunks = %d\n",(int)total_chunks);
+if(H5DEBUG(D))
+ HDfprintf(H5DEBUG(D),"total_chunks = %Zu, num_chunk = %Zu\n", total_chunks, num_chunk);
#endif
-
- if(num_chunk == 0)
- total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks);
- else
- {
- chunk_addr_info_array= H5MM_malloc(num_chunk*sizeof(H5D_chunk_addr_info_t));
- chunk_mtype = H5MM_malloc(num_chunk*sizeof(MPI_Datatype));
- chunk_ftype = H5MM_malloc(num_chunk*sizeof(MPI_Datatype));
- chunk_disp_array = H5MM_malloc(num_chunk*sizeof(MPI_Aint));
- chunk_mem_disp_array = H5MM_calloc(num_chunk*sizeof(MPI_Aint));
- blocklen = H5MM_malloc(num_chunk*sizeof(int));
- }
-
- /* Obtain information to do collective IO,
- in order to do collective IO, no datatype conversion should happen. */
- if((src_type_size = H5T_get_size(io_info->dset->shared->type))==0)
- HGOTO_ERROR(H5E_DATATYPE, H5E_BADSIZE, FAIL, "datatype size invalid");
- dst_type_size = src_type_size;
-
- bsearch_coll_chunk_threshold = H5D_ALL_CHUNK_ADDR_THRES_COL;
-
- if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
- HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
-
- /* Calculate the actual threshold to obtain all chunk addresses collectively
- The bigger this number is, the more possible the use of obtaining chunk address collectively. */
- /* For non-optimization one-link IO,
- actual bsearch threshold is always 0,
- we would always want to obtain the chunk addresses individually
- for each process. */
- actual_bsearch_coll_chunk_threshold = sum_chunk*100/(total_chunks*mpi_size);
-
- if((actual_bsearch_coll_chunk_threshold > bsearch_coll_chunk_threshold)
- &&(sum_chunk/mpi_size >= H5D_ALL_CHUNK_ADDR_THRES_COL_NUM))
- many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_COL;
+
+ /* Set up MPI datatype for chunks selected */
+ if(num_chunk) {
+ hsize_t mpi_mem_extra_offset; /* Extra offset for memory MPI datatype */
+ hsize_t mpi_file_extra_offset; /* Extra offset for file MPI datatype */
+ size_t mpi_mem_count; /* Memory MPI datatype count */
+ size_t mpi_file_count; /* File MPI datatype count */
+ hbool_t locl_mbt_is_derived = FALSE, /* Whether the buffer (memory) type is derived and needs to be free'd */
+ local_mft_is_derived = FALSE; /* Whether the file type is derived and needs to be free'd */
+ int blocklen_value; /* Placeholder for array fill */
+
+ /* Allocate chunking information */
+ chunk_addr_info_array= H5MM_malloc(num_chunk * sizeof(H5D_chunk_addr_info_t));
+ chunk_mtype = H5MM_malloc(num_chunk * sizeof(MPI_Datatype));
+ chunk_ftype = H5MM_malloc(num_chunk * sizeof(MPI_Datatype));
+ chunk_disp_array = H5MM_malloc(num_chunk * sizeof(MPI_Aint));
+ chunk_mem_disp_array = H5MM_calloc(num_chunk * sizeof(MPI_Aint));
+ blocklen = H5MM_malloc(num_chunk * sizeof(int));
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
+if(H5DEBUG(D))
HDfprintf(H5DEBUG(D),"before sorting the chunk address \n");
#endif
-
- /* Sort the chunk address
- when chunk optimization selection is either H5D_OBTAIN_*/
-
- if(num_chunk == 0){ /* special case: this process doesn't select anything */
- if(H5D_istore_chunkmap(io_info, total_chunk_addr_array, fm->down_chunks)<0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
- chunk_base_addr = total_chunk_addr_array[0];
- }
-
- else {
- if(H5D_sort_chunk(io_info,fm,chunk_addr_info_array,many_chunk_opt)<0)
- HGOTO_ERROR (H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to sort chunk address");
- chunk_base_addr = chunk_addr_info_array[0].chunk_addr;
- }
+ /* Sort the chunk address */
+ if(H5D_sort_chunk(io_info, fm, chunk_addr_info_array, sum_chunk) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to sort chunk address")
+ ctg_store.contig.dset_addr = chunk_addr_info_array[0].chunk_addr;
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
+if(H5DEBUG(D))
HDfprintf(H5DEBUG(D),"after sorting the chunk address \n");
#endif
-
- /* Obtain MPI derived datatype from all individual chunks */
- for ( i = 0; i < num_chunk; i++) {
- /* Disk MPI derived datatype */
- if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.fspace,src_type_size,&chunk_ftype[i],
- &mpi_file_count,&mpi_file_extra_offset,&mft_is_derived)<0)
- HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type");
-
- /* Buffer MPI derived datatype */
- if(H5S_mpio_space_type(chunk_addr_info_array[i].chunk_info.mspace,dst_type_size,&chunk_mtype[i],
- &mpi_buf_count,&mpi_buf_extra_offset,&mbt_is_derived)<0)
- HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buf type");
-
- /* Chunk address relative to the first chunk */
- chunk_addr_info_array[i].chunk_addr -= chunk_base_addr;
- H5_ASSIGN_OVERFLOW(chunk_disp_array[i],chunk_addr_info_array[i].chunk_addr,haddr_t,MPI_Aint);
- }
-
- blocklen_value = 1;
- if(num_chunk){
-
- /* initialize the buffer with the constant value 1; this algo. is very fast. */
- H5V_array_fill(blocklen,&blocklen_value,sizeof(int),(size_t)num_chunk);
-
- /* Create final MPI derived datatype */
- if(MPI_SUCCESS != (mpi_code = MPI_Type_struct(num_chunk,blocklen,chunk_disp_array,chunk_ftype,&chunk_final_ftype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code);
- if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_ftype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
-
- if(MPI_SUCCESS != (mpi_code = MPI_Type_struct(num_chunk,blocklen,chunk_mem_disp_array,chunk_mtype,&chunk_final_mtype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code);
- if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_mtype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
-
- for ( i = 0; i< num_chunk;i++){
- if (MPI_SUCCESS != (mpi_code= MPI_Type_free( chunk_mtype+i )))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
-
- if (MPI_SUCCESS != (mpi_code= MPI_Type_free( chunk_ftype+i )))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
- }
-
- /* buffer, file derived datatypes should be true */
- coll_info.mbt_is_derived = 1;
- coll_info.mft_is_derived = 1;
- coll_info.mpi_buf_count = 1;
- coll_info.chunk_addr = chunk_base_addr;
-
- }
-
- else {/* no selection at all for this process */
- chunk_final_ftype = MPI_BYTE;
- chunk_final_mtype = MPI_BYTE;
-
- /* buffer, file derived datatypes should be true */
- coll_info.mbt_is_derived = 0;
- coll_info.mft_is_derived = 0;
- coll_info.mpi_buf_count = 0;
- coll_info.chunk_addr = chunk_base_addr;
- }
+
+ /* Obtain MPI derived datatype from all individual chunks */
+ for(u = 0; u < num_chunk; u++) {
+ /* Disk MPI derived datatype */
+ if(H5S_mpio_space_type(chunk_addr_info_array[u].chunk_info.fspace,
+ type_info->src_type_size, &chunk_ftype[u], &mpi_file_count,
+ &mpi_file_extra_offset, &local_mft_is_derived) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI file type")
+
+ /* Buffer MPI derived datatype */
+ if(H5S_mpio_space_type(chunk_addr_info_array[u].chunk_info.mspace,
+ type_info->dst_type_size, &chunk_mtype[u], &mpi_mem_count,
+ &mpi_mem_extra_offset, &locl_mbt_is_derived) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI buf type")
+
+ /* Chunk address relative to the first chunk */
+ chunk_addr_info_array[u].chunk_addr -= ctg_store.contig.dset_addr;
+ H5_ASSIGN_OVERFLOW(chunk_disp_array[u], chunk_addr_info_array[u].chunk_addr, haddr_t, MPI_Aint);
+ } /* end for */
+
+ /* Initialize the buffer with the constant value 1 */
+ blocklen_value = 1;
+ H5V_array_fill(blocklen, &blocklen_value, sizeof(int), num_chunk);
+
+ /* Create final MPI derived datatype for the file */
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_struct((int)num_chunk, blocklen, chunk_disp_array, chunk_ftype, &chunk_final_ftype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code)
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_ftype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+ /* Create final MPI derived datatype for memory */
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_struct(num_chunk, blocklen, chunk_mem_disp_array, chunk_mtype, &chunk_final_mtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code)
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_final_mtype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+ /* Free the file & memory MPI datatypes for each chunk */
+ for(u = 0; u < num_chunk; u++) {
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_free(chunk_mtype + u)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_free(chunk_ftype + u)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ } /* end for */
+
+ /* buffer, file derived datatypes should be true */
+ mbt_is_derived = TRUE;
+ mft_is_derived = TRUE;
+ mpi_buf_count = (size_t)1;
+ } /* end if */
+ else { /* no selection at all for this process */
+ /* Allocate chunking information */
+ total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t) * total_chunks);
+
+ /* Retrieve chunk address map */
+ if(H5D_istore_chunkmap(io_info, total_chunk_addr_array, fm->down_chunks) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address")
+
+ /* Get chunk with lowest address */
+ ctg_store.contig.dset_addr = HADDR_MAX;
+ for(u = 0; u < total_chunks; u++)
+ if(total_chunk_addr_array[u] < ctg_store.contig.dset_addr)
+ ctg_store.contig.dset_addr = total_chunk_addr_array[u];
+ HDassert(ctg_store.contig.dset_addr != HADDR_MAX);
+
+ /* Set the MPI datatype */
+ chunk_final_ftype = MPI_BYTE;
+ chunk_final_mtype = MPI_BYTE;
+
+ /* buffer, file derived datatypes should be true */
+ mpi_buf_count = (size_t)0;
+ } /* end else */
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
+if(H5DEBUG(D))
HDfprintf(H5DEBUG(D),"before coming to final collective IO\n");
#endif
- if(H5D_final_collective_io(io_info,&chunk_final_ftype,&chunk_final_mtype,&coll_info,buf,do_write)<0)
- HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish MPI-IO");
+ /* Set up the base storage address for this chunk */
+ io_info->store = &ctg_store;
+
+ /* Perform final collective I/O operation */
+ if(H5D_final_collective_io(io_info, type_info, mpi_buf_count, &chunk_final_ftype, &chunk_final_mtype) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish MPI-IO")
+ } /* end else */
done:
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
- HDfprintf(H5DEBUG(D),"before freeing memory inside H5D_link_collective_io ret_value = %d\n",ret_value);
+if(H5DEBUG(D))
+ HDfprintf(H5DEBUG(D),"before freeing memory inside H5D_link_collective_io ret_value = %d\n", ret_value);
#endif
- if (fm->total_chunks != 1) {
- if(num_chunk == 0) HDfree(total_chunk_addr_array);
- else {
- HDfree(chunk_addr_info_array);
- HDfree(chunk_mtype);
- HDfree(chunk_ftype);
- HDfree(chunk_disp_array);
- HDfree(chunk_mem_disp_array);
- HDfree(blocklen);
- }
- }
- FUNC_LEAVE_NOAPI(ret_value)
+ if(total_chunk_addr_array)
+ H5MM_xfree(total_chunk_addr_array);
+ if(chunk_addr_info_array)
+ H5MM_xfree(chunk_addr_info_array);
+ if(chunk_mtype)
+ H5MM_xfree(chunk_mtype);
+ if(chunk_ftype)
+ H5MM_xfree(chunk_ftype);
+ if(chunk_disp_array)
+ H5MM_xfree(chunk_disp_array);
+ if(chunk_mem_disp_array)
+ H5MM_xfree(chunk_mem_disp_array);
+ if(blocklen)
+ H5MM_xfree(blocklen);
+
+ /* Free the MPI buf and file types, if they were derived */
+ if(mbt_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&chunk_final_mtype)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ if(mft_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&chunk_final_ftype)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+
+ FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D_link_chunk_collective_io */
+#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */
/*-------------------------------------------------------------------------
* Function: H5D_multi_chunk_collective_io
*
* Purpose: To do IO per chunk according to IO mode(collective/independent/none)
-
- 1. Use MPI_gather and MPI_Bcast to obtain IO mode in each chunk(collective/independent/none)
- 2. Depending on whether the IO mode is collective or independent or none,
- Create either MPI derived datatype for each chunk or just do independent IO
- 3. Use common collective IO routine to do MPI-IO
*
- * Return: Non-negative on success/Negative on failure
+ * 1. Use MPI_gather and MPI_Bcast to obtain IO mode in each chunk(collective/independent/none)
+ * 2. Depending on whether the IO mode is collective or independent or none,
+ * Create either MPI derived datatype for each chunk or just do independent IO
+ * 3. Use common collective IO routine to do MPI-IO
*
- * Programmer:
+ * Return: Non-negative on success/Negative on failure
*
- * Modifications:
+ * Programmer: Muqun Yang
+ * Monday, Feb. 13th, 2006
*
*-------------------------------------------------------------------------
*/
static herr_t
-H5D_multi_chunk_collective_io(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf, hbool_t do_write)
+H5D_multi_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
+ H5D_chunk_map_t *fm, H5P_genplist_t *dx_plist)
{
- unsigned i, total_chunk;
- hsize_t ori_total_chunk;
- uint8_t *chunk_io_option;
-
- H5SL_node_t *chunk_node; /* Current node in chunk skip list */
- H5D_chunk_info_t *chunk_info=NULL;
- haddr_t *chunk_addr;
- H5D_storage_t store; /* union of EFL and chunk pointer in file space */
- hbool_t select_chunk;
- hbool_t last_io_mode_coll = TRUE;
-
- void *chunk = NULL; /* Pointer to the data chunk in cache */
- H5D_t *dataset=io_info->dset;/* Local pointer to dataset info */
- H5D_istore_ud1_t udata; /*B-tree pass-through */
- haddr_t caddr; /* Address of the cached chunk */
- size_t accessed_bytes; /*total accessed size in a chunk */
- unsigned idx_hint=0; /* Cache index hint */
- hbool_t dirty = TRUE; /* Flag for cache flushing */
- hbool_t relax=TRUE; /* Whether whole chunk is selected */
-
- herr_t ret_value = SUCCEED;
+ H5D_t *dataset = io_info->dset;/* Local pointer to dataset info */
+ H5D_io_info_t ctg_io_info; /* Contiguous I/O info object */
+ H5D_storage_t ctg_store; /* Chunk storage information as contiguous dataset */
+ H5D_io_info_t cpt_io_info; /* Compact I/O info object */
+ H5D_storage_t cpt_store; /* Chunk storage information as compact dataset */
+ hbool_t cpt_dirty; /* Temporary placeholder for compact storage "dirty" flag */
+ uint8_t *chunk_io_option = NULL;
+ haddr_t *chunk_addr = NULL;
+ H5D_storage_t store; /* union of EFL and chunk pointer in file space */
+ H5FD_mpio_xfer_t last_xfer_mode = H5FD_MPIO_COLLECTIVE; /* Last parallel transfer for this request (H5D_XFER_IO_XFER_MODE_NAME) */
+ H5FD_mpio_collective_opt_t last_coll_opt_mode = H5FD_MPIO_COLLECTIVE_IO; /* Last parallel transfer with independent IO or collective IO with this mode */
+ size_t total_chunk; /* Total # of chunks in dataset */
#ifdef H5Dmpio_DEBUG
- int mpi_rank;
+ int mpi_rank;
#endif
+ size_t u; /* Local index variable */
+ herr_t ret_value = SUCCEED;
-
- FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io)
+ FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io)
#ifdef H5Dmpio_DEBUG
- mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file);
+ mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file);
#endif
- /* Allocate memories */
- ori_total_chunk = fm->total_chunks;
- H5_ASSIGN_OVERFLOW(total_chunk,ori_total_chunk,hsize_t,unsigned);
- HDassert(total_chunk!=0);
- chunk_io_option = (uint8_t *)H5MM_calloc(total_chunk*sizeof(MPI_BYTE));
- chunk_addr = (haddr_t *)H5MM_calloc(total_chunk*sizeof(haddr_t));
+ /* Retrieve total # of chunks in dataset */
+ H5_ASSIGN_OVERFLOW(total_chunk, fm->total_chunks, hsize_t, size_t);
+ HDassert(total_chunk != 0);
+
+ /* Allocate memories */
+ chunk_io_option = (uint8_t *)H5MM_calloc(total_chunk);
+ chunk_addr = (haddr_t *)H5MM_calloc(total_chunk * sizeof(haddr_t));
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
- HDfprintf(H5DEBUG(D),"total_chunk %u\n",total_chunk);
+if(H5DEBUG(D))
+ HDfprintf(H5DEBUG(D), "total_chunk %Zu\n", total_chunk);
#endif
- /* obtain IO option for each chunk */
- if(H5D_obtain_mpio_mode(io_info,fm,chunk_io_option,chunk_addr)<0)
- HGOTO_ERROR (H5E_DATASET, H5E_CANTRECV, FAIL, "unable to obtain MPIO mode");
+ /* Obtain IO option for each chunk */
+ if(H5D_obtain_mpio_mode(io_info, fm, dx_plist, chunk_io_option, chunk_addr) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTRECV, FAIL, "unable to obtain MPIO mode")
+
+ /* Set up contiguous I/O info object */
+ HDmemcpy(&ctg_io_info, io_info, sizeof(ctg_io_info));
+ ctg_io_info.store = &ctg_store;
+ ctg_io_info.layout_ops = *H5D_LOPS_CONTIG;
+
+ /* Initialize temporary contiguous storage info */
+ ctg_store.contig.dset_size = (hsize_t)io_info->dset->shared->layout.u.chunk.size;
+
+ /* Set up compact I/O info object */
+ HDmemcpy(&cpt_io_info, io_info, sizeof(cpt_io_info));
+ cpt_io_info.store = &cpt_store;
+ cpt_io_info.layout_ops = *H5D_LOPS_COMPACT;
+
+ /* Initialize temporary compact storage info */
+ cpt_store.compact.dirty = &cpt_dirty;
+
+ /* Set dataset storage for I/O info */
+ io_info->store = &store;
+
+ /* Loop over _all_ the chunks */
+ for(u = 0; u < total_chunk; u++) {
+ H5D_chunk_info_t *chunk_info; /* Chunk info for current chunk */
+ H5S_t *fspace; /* Dataspace describing chunk & selection in it */
+ H5S_t *mspace; /* Dataspace describing selection in memory corresponding to this chunk */
- for(i = 0; i < total_chunk; i++) {
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
- HDfprintf(H5DEBUG(D),"mpi_rank = %d, chunk index = %u\n",mpi_rank,i);
+if(H5DEBUG(D))
+ HDfprintf(H5DEBUG(D),"mpi_rank = %d, chunk index = %Zu\n", mpi_rank, u);
#endif
- select_chunk = fm->select_chunk[i];
- if(select_chunk == 1){/* Have selection elements in this chunk. Find the chunk info. */
- if(NULL ==(chunk_node = H5SL_first(fm->sel_chunks)))
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list");
-
- while(chunk_node){
- if(NULL ==(chunk_info = H5SL_item(chunk_node)))
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
- if(chunk_info->index == i) {
- /* Set dataset storage for I/O info */
- io_info->store=&store;
- /* Pass in chunk's coordinates in a union. */
- store.chunk.offset = chunk_info->coords;
- store.chunk.index = chunk_info->index;
- break;
- }
-
- chunk_node = H5SL_next(chunk_node);
- }
- }
-
- if(chunk_io_option[i] == 1){ /*collective IO for this chunk,
- note: even there is no selection for this process,
- the process still needs to contribute MPI NONE TYPE.*/
+ /* Get the chunk info for this chunk, if there are elements selected */
+ chunk_info = fm->select_chunk[u];
+
+ /* Set the storage information for chunks with selections */
+ if(chunk_info) {
+ HDassert(chunk_info->index == u);
+
+ /* Pass in chunk's coordinates in a union. */
+ store.chunk.offset = chunk_info->coords;
+ store.chunk.index = chunk_info->index;
+ } /* end if */
+
+ /* Collective IO for this chunk,
+ * Note: even there is no selection for this process, the process still
+ * needs to contribute MPI NONE TYPE.
+ */
+ if(chunk_io_option[u] == 1) {
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
- HDfprintf(H5DEBUG(D),"inside collective chunk IO mpi_rank = %d, chunk index = %u\n",mpi_rank,i);
+if(H5DEBUG(D))
+ HDfprintf(H5DEBUG(D),"inside collective chunk IO mpi_rank = %d, chunk index = %Zu\n", mpi_rank, u);
#endif
- if(!last_io_mode_coll)
- /* Switch back to collective I/O */
- if(H5D_ioinfo_make_coll(io_info) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
+ /* Set the file & memory dataspaces */
+ if(chunk_info) {
+ fspace = chunk_info->fspace;
+ mspace = chunk_info->mspace;
+ } /* end if */
+ else {
+ fspace = mspace = NULL;
+ } /* end else */
+
+ /* Switch back to collective I/O */
+ if(last_xfer_mode != H5FD_MPIO_COLLECTIVE) {
+ if(H5D_ioinfo_xfer_mode(io_info, dx_plist, H5FD_MPIO_COLLECTIVE) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
+ last_xfer_mode = H5FD_MPIO_COLLECTIVE;
+ } /* end if */
+ if(last_coll_opt_mode != H5FD_MPIO_COLLECTIVE_IO) {
+ if(H5D_ioinfo_coll_opt_mode(io_info, dx_plist, H5FD_MPIO_COLLECTIVE_IO) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
+ last_coll_opt_mode = H5FD_MPIO_COLLECTIVE_IO;
+ } /* end if */
- if(select_chunk){
- if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,
- chunk_addr[i],buf,do_write )<0)
- HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
-
- }
- else{
- if(H5D_inter_collective_io(io_info,NULL,NULL,
- chunk_addr[i],buf,do_write )<0)
- HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
-
- }
- last_io_mode_coll = TRUE;
-
- }
- else {/*possible independent IO for this chunk*/
+ /* Initialize temporary contiguous storage address */
+ ctg_store.contig.dset_addr = chunk_addr[u];
+
+ /* Perform the I/O */
+ if(H5D_inter_collective_io(&ctg_io_info, type_info, fspace, mspace) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish shared collective MPI-IO")
+ } /* end if */
+ else { /* possible independent IO for this chunk */
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
- HDfprintf(H5DEBUG(D),"inside independent IO mpi_rank = %d, chunk index = %u\n",mpi_rank,i);
+if(H5DEBUG(D))
+ HDfprintf(H5DEBUG(D),"inside independent IO mpi_rank = %d, chunk index = %Zu\n", mpi_rank, u);
#endif
- HDassert(chunk_io_option[i] == 0);
+ HDassert(chunk_io_option[u] == 0);
#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
- if(!select_chunk)
- continue; /* this process has nothing to do with this chunk, continue! */
- if(last_io_mode_coll)
+ /* Check if this process has somethign to do with this chunk */
+ if(chunk_info) {
+ H5D_io_info_t *chk_io_info; /* Pointer to I/O info object for this chunk */
+ H5D_istore_ud1_t udata; /* B-tree pass-through */
+ void *chunk; /* Pointer to the data chunk in cache */
+ size_t accessed_bytes; /* Total accessed size in a chunk */
+ unsigned idx_hint = 0; /* Cache index hint */
+ haddr_t caddr; /* Address of the cached chunk */
+
/* Switch to independent I/O */
- if(H5D_ioinfo_make_ind(io_info) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
-
- /* Load the chunk into cache. But if the whole chunk is written,
- * simply allocate space instead of load the chunk. */
- if(HADDR_UNDEF==(caddr = H5D_istore_get_addr(io_info, &udata)))
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
-
- if(H5D_istore_if_load(io_info, caddr)) {
- accessed_bytes = chunk_info->chunk_points * H5T_get_size(dataset->shared->type);
- if((do_write && (accessed_bytes != dataset->shared->layout.u.chunk.size)) || !do_write)
- relax=FALSE;
-
- if(NULL == (chunk = H5D_istore_lock(io_info, &udata, relax, &idx_hint)))
- HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to read raw data chunk")
- } else
- chunk = NULL;
-
- if(do_write) {
- if((io_info->ops.write)(io_info,
- chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type),
- chunk_info->fspace,chunk_info->mspace,caddr,chunk, buf) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
- }
- else {
- if((io_info->ops.read)(io_info,
- chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type),
- chunk_info->fspace,chunk_info->mspace,caddr,chunk, buf) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
- }
+ if(last_xfer_mode != H5FD_MPIO_INDEPENDENT) {
+ if(H5D_ioinfo_xfer_mode(io_info, dx_plist, H5FD_MPIO_INDEPENDENT) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
+ last_xfer_mode = H5FD_MPIO_INDEPENDENT;
+ } /* end if */
- /* Release the cache lock on the chunk. */
- if(chunk) {
- if(!do_write)
- dirty = FALSE;
+ /* Load the chunk into cache. But if the whole chunk is written,
+ * simply allocate space instead of load the chunk.
+ */
+ if(HADDR_UNDEF == (caddr = H5D_istore_get_addr(io_info, &udata)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk info from skipped list")
- if(H5D_istore_unlock(io_info, dirty, idx_hint, chunk, accessed_bytes) < 0)
- HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to unlock raw data chunk")
- } /* end if */
-#else
- if(!last_io_mode_coll)
- /* using independent I/O with file setview.*/
- if(H5D_ioinfo_make_coll_opt(io_info) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
- if(select_chunk){
- if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,
- chunk_addr[i],buf,do_write )<0)
- HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
- }
+ /* Load the chunk into cache and lock it. */
+ if(H5D_chunk_cacheable(io_info, caddr)) {
+ hbool_t entire_chunk = TRUE; /* Whether whole chunk is selected */
+
+ /* Compute # of bytes accessed in chunk */
+ accessed_bytes = chunk_info->chunk_points * type_info->src_type_size;
+
+ /* Determine if we will access all the data in the chunk */
+ if(((io_info->op_type == H5D_IO_OP_WRITE) && (accessed_bytes != ctg_store.contig.dset_size))
+ || (io_info->op_type != H5D_IO_OP_WRITE))
+ entire_chunk = FALSE;
+
+ /* Lock the chunk into the cache */
+ if(NULL == (chunk = H5D_istore_lock(io_info, &udata, entire_chunk, &idx_hint)))
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to read raw data chunk")
+
+ /* Set up the storage buffer information for this chunk */
+ cpt_store.compact.buf = chunk;
+
+ /* Point I/O info at contiguous I/O info for this chunk */
+ chk_io_info = &cpt_io_info;
+ } /* end if */
+ else {
+ /* Set up the storage address information for this chunk */
+ ctg_store.contig.dset_addr = caddr;
+
+ /* No chunk cached */
+ chunk = NULL;
+
+ /* Point I/O info at temporary I/O info for this chunk */
+ chk_io_info = &ctg_io_info;
+ } /* end else */
+
+ if(io_info->op_type == H5D_IO_OP_WRITE) {
+ if((io_info->io_ops.single_write)(chk_io_info, type_info,
+ (hsize_t)chunk_info->chunk_points, chunk_info->fspace, chunk_info->mspace) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+ } /* end if */
+ else {
+ if((io_info->io_ops.single_read)(chk_io_info, type_info,
+ (hsize_t)chunk_info->chunk_points, chunk_info->fspace, chunk_info->mspace) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+ } /* end else */
+
+ /* Release the cache lock on the chunk. */
+ if(chunk && H5D_istore_unlock(io_info, (io_info->op_type == H5D_IO_OP_WRITE), idx_hint, chunk, accessed_bytes) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to unlock raw data chunk")
+ } /* end if */
+#else /* !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) */
+ /* Set the file & memory dataspaces */
+ if(chunk_info) {
+ fspace = chunk_info->fspace;
+ mspace = chunk_info->mspace;
+ } /* end if */
else {
- if(H5D_inter_collective_io(io_info,NULL,NULL,
- chunk_addr[i],buf,do_write )<0)
- HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
- }
+ fspace = mspace = NULL;
+ } /* end else */
+
+ /* Using independent I/O with file setview.*/
+ if(last_coll_opt_mode != H5FD_MPIO_INDIVIDUAL_IO) {
+ if(H5D_ioinfo_coll_opt_mode(io_info, dx_plist, H5FD_MPIO_INDIVIDUAL_IO) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to individual I/O")
+ last_coll_opt_mode = H5FD_MPIO_INDIVIDUAL_IO;
+ } /* end if */
+ /* Initialize temporary contiguous storage address */
+ ctg_store.contig.dset_addr = chunk_addr[u];
+
+ /* Perform the I/O */
+ if(H5D_inter_collective_io(&ctg_io_info, type_info, fspace, mspace) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish shared collective MPI-IO")
#ifdef H5D_DEBUG
if(H5DEBUG(D))
HDfprintf(H5DEBUG(D),"after inter collective IO\n");
#endif
-#endif
- last_io_mode_coll = FALSE;
- }
- }
- if(!last_io_mode_coll)
- /* Switch back to collective I/O */
- if(H5D_ioinfo_make_coll(io_info) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to collective I/O")
+#endif /* !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS) */
+ } /* end else */
+ } /* end for */
+
done:
- HDfree(chunk_io_option);
- HDfree(chunk_addr);
+ if(chunk_io_option)
+ H5MM_xfree(chunk_io_option);
+ if(chunk_addr)
+ H5MM_xfree(chunk_addr);
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D_multi_chunk_collective_io */
@@ -1257,90 +1297,105 @@ done:
* non-contiguous(or with holes) storage efficiently.
* Under this case, the one independent IO call may consist of
* many small disk IOs. So we may use independent IO with derived datatype
- to replace the independent IO when we find this chunk is not good to
- do collective IO. However, according to our performance study,
- this approach may not overcome the overhead caused by MPI gather/scatter.
- So we decide to leave the original collective IO per chunk approach as
- an option for users. NO MPI gather/scatter calls are used.
- HDF5 will try to collective IO if possible.
- If users choose to use
- H5Pset_dxpl_mpio_chunk_opt(dxpl_id,H5FD_MPIO_OPT_MULTI_IO),
- this function will be called.
- The HDF5 library won't do any IO management but leave it to MPI-IO to figure
- out.
+ * to replace the independent IO when we find this chunk is not good to
+ * do collective IO. However, according to our performance study,
+ * this approach may not overcome the overhead caused by MPI gather/scatter.
+ * So we decide to leave the original collective IO per chunk approach as
+ * an option for users. NO MPI gather/scatter calls are used.
+ * HDF5 will try to collective IO if possible.
+ * If users choose to use
+ * H5Pset_dxpl_mpio_chunk_opt(dxpl_id,H5FD_MPIO_OPT_MULTI_IO),
+ * this function will be called.
+ * The HDF5 library won't do any IO management but leave it to MPI-IO to figure
+ * out.
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer:
- *
- * Modifications:
+ * Programmer: Muqun Yang
+ * Monday, Feb. 13th, 2006
*
*-------------------------------------------------------------------------
*/
static herr_t
-H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,const void *buf, hbool_t do_write)
+H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info,
+ const H5D_type_info_t *type_info, H5D_chunk_map_t *fm, H5P_genplist_t *dx_plist)
{
- int count_chunk,min_num_chunk;
- haddr_t chunk_addr;
- H5SL_node_t *chunk_node; /* Current node in chunk skip list */
- H5D_storage_t store; /* union of EFL and chunk pointer in file space */
- H5D_chunk_info_t *chunk_info; /* chunk information */
- hbool_t make_ind, make_coll; /* Flags to indicate that the MPI mode should change */
-
- void *chunk = NULL; /* Pointer to the data chunk in cache */
- H5D_t *dataset=io_info->dset;/* Local pointer to dataset info */
- H5D_istore_ud1_t udata; /*B-tree pass-through */
- size_t accessed_bytes; /*total accessed size in a chunk */
- unsigned idx_hint=0; /* Cache index hint */
- hbool_t dirty = TRUE; /* Flag for cache flushing */
- hbool_t relax=TRUE; /* Whether whole chunk is selected */
- herr_t ret_value = SUCCEED;
-
-#ifdef H5Dmpio_DEBUG
- int mpi_rank;
-#endif
+ H5D_t *dataset = io_info->dset;/* Local pointer to dataset info */
+ H5SL_node_t *chunk_node; /* Current node in chunk skip list */
+ H5D_io_info_t ctg_io_info; /* Contiguous I/O info object */
+ H5D_storage_t ctg_store; /* Chunk storage information as contiguous dataset */
+ H5D_io_info_t cpt_io_info; /* Compact I/O info object */
+ H5D_storage_t cpt_store; /* Chunk storage information as compact dataset */
+ hbool_t cpt_dirty; /* Temporary placeholder for compact storage "dirty" flag */
+ int min_chunk = -1; /* Minimum # of chunks all processes will operate on */
+ int count_chunk; /* How many chunks have we operated on? */
+ H5D_storage_t store; /* union of EFL and chunk pointer in file space */
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io_no_opt)
- FUNC_ENTER_NOAPI_NOINIT(H5D_multi_chunk_collective_io_no_opt)
#ifdef H5D_DEBUG
- if(H5DEBUG(D)){
+if(H5DEBUG(D)) {
+ int mpi_rank;
+
mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file);
- HDfprintf(H5DEBUG(D),"coming to multi_chunk_collective_io_no_opt\n");
- }
+ HDfprintf(H5DEBUG(D), "coming to multi_chunk_collective_io_no_opt\n");
+}
#endif
- if(H5D_mpio_get_min_chunk(io_info,fm,&min_num_chunk)<0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunk");
- count_chunk = 0;
-
- /* Get first node in chunk skip list */
- chunk_node=H5SL_first(fm->sel_chunks);
-
- /* Iterate through chunks to be operated on */
- while(chunk_node) {
- H5D_chunk_info_t *chunk_info; /* chunk information */
- hbool_t make_ind, make_coll; /* Flags to indicate that the MPI mode should change */
-
- /* Get the actual chunk information from the skip list node */
- chunk_info=H5SL_item(chunk_node);
-
- /* Set dataset storage for I/O info */
- io_info->store=&store;
-
- /* Pass in chunk's coordinates in a union. */
- store.chunk.offset = chunk_info->coords;
- store.chunk.index = chunk_info->index;
-
- /* Reset flags for changing parallel I/O mode */
- make_ind = make_coll = FALSE;
-
- count_chunk++;
- /* If the number of chunk is greater than minimum number of chunk,
- Do independent read */
- if(count_chunk > min_num_chunk) {
- /* Switch to independent I/O (permanently) */
- make_ind = TRUE;
- }
+ /* Set up contiguous I/O info object */
+ HDmemcpy(&ctg_io_info, io_info, sizeof(ctg_io_info));
+ ctg_io_info.store = &ctg_store;
+ ctg_io_info.layout_ops = *H5D_LOPS_CONTIG;
+
+ /* Initialize temporary contiguous storage info */
+ ctg_store.contig.dset_size = (hsize_t)io_info->dset->shared->layout.u.chunk.size;
+
+ /* Set up compact I/O info object */
+ HDmemcpy(&cpt_io_info, io_info, sizeof(cpt_io_info));
+ cpt_io_info.store = &cpt_store;
+ cpt_io_info.layout_ops = *H5D_LOPS_COMPACT;
+
+ /* Initialize temporary compact storage info */
+ cpt_store.compact.dirty = &cpt_dirty;
+
+ /* Set dataset storage for I/O info */
+ io_info->store = &store;
+
+ /* Get the min. # of chunks */
+ if(H5D_mpio_get_min_chunk(io_info, fm, &min_chunk) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get minimum number of chunk")
+ HDassert(min_chunk >= 0);
+
+ /* Get first node in chunk skip list */
+ chunk_node = H5SL_first(fm->sel_chunks);
+ count_chunk = 0;
+
+ /* Iterate through chunks to be operated on */
+ while(chunk_node) {
+ H5D_chunk_info_t *chunk_info; /* chunk information */
+ haddr_t chunk_addr; /* Address of chunk in file */
+ H5D_istore_ud1_t udata; /* B-tree pass-through */
+ hbool_t make_ind, make_coll; /* Flags to indicate that the MPI mode should change */
+ /* Get the actual chunk information from the skip list node */
+ chunk_info = H5SL_item(chunk_node);
+
+ /* Pass in chunk's coordinates in a union. */
+ store.chunk.offset = chunk_info->coords;
+ store.chunk.index = chunk_info->index;
+
+ /* Reset flags for changing parallel I/O mode */
+ make_ind = make_coll = FALSE;
+
+ count_chunk++;
+
+ /* If the number of chunk is greater than minimum number of chunk,
+ * Do independent read.
+ */
+ if(count_chunk > min_chunk)
+ /* Switch to independent I/O (permanently) */
+ make_ind = TRUE;
#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
/* This case needs to be improved to check if the selected space
is regular. If all selections are regular, collective IO can still be done.
@@ -1349,67 +1404,92 @@ H5D_multi_chunk_collective_io_no_opt(H5D_io_info_t *io_info,H5D_chunk_map_t *fm,
we turn off this optimization but leave the following code
for future optimization. Otherwise, the following else {} doesn't make sense.
KY 2006/8/4/ */
- else {
- /* Switch to independent I/O (temporarily) */
- make_ind = TRUE;
- make_coll = TRUE;
- } /* end else */
+ else {
+ /* Switch to independent I/O (temporarily) */
+ make_ind = TRUE;
+ make_coll = TRUE;
+ } /* end else */
#endif /* H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS */
+ /* Retrieve the chunk's address */
+ if(HADDR_UNDEF == (chunk_addr = H5D_istore_get_addr(io_info, &udata)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list")
+
+ /* Independent I/O */
+ if(make_ind) {
+ void *chunk; /* Pointer to the data chunk in cache */
+ H5D_io_info_t *chk_io_info; /* Pointer to I/O info object for this chunk */
+ size_t accessed_bytes = 0; /* Total accessed size in a chunk */
+ unsigned idx_hint = 0; /* Cache index hint */
+
/* Switch to independent I/O */
- if(make_ind)
- if(H5D_ioinfo_make_ind(io_info) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
+ if(H5D_ioinfo_xfer_mode(io_info, dx_plist, H5FD_MPIO_INDEPENDENT) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
- if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info, &udata)))
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+ /* Load the chunk into cache and lock it. */
+ if(H5D_chunk_cacheable(io_info, chunk_addr)) {
+ hbool_t entire_chunk = TRUE; /* Whether whole chunk is selected */
- if(make_ind) {/*independent I/O */
- /* Load the chunk into cache. But if the whole chunk is written,
- * simply allocate space instead of load the chunk. */
- if(H5D_istore_if_load(io_info, chunk_addr)) {
- accessed_bytes = chunk_info->chunk_points * H5T_get_size(dataset->shared->type);
- if((do_write && (accessed_bytes != dataset->shared->layout.u.chunk.size)) || !do_write)
- relax=FALSE;
-
- if(NULL == (chunk = H5D_istore_lock(io_info, &udata, relax, &idx_hint)))
- HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to read raw data chunk")
- } else
- chunk = NULL;
-
- if(do_write) {
- if((io_info->ops.write)(io_info,
- chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type),
- chunk_info->fspace,chunk_info->mspace, chunk_addr, chunk, buf) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
- } else {
- if((io_info->ops.read)(io_info,
- chunk_info->chunk_points,H5T_get_size(io_info->dset->shared->type),
- chunk_info->fspace,chunk_info->mspace, chunk_addr, chunk, buf) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
- }
+ /* Compute # of bytes accessed in chunk */
+ accessed_bytes = chunk_info->chunk_points * type_info->src_type_size;
- /* Release the cache lock on the chunk. */
- if(chunk) {
- if(!do_write)
- dirty = FALSE;
+ /* Determine if we will access all the data in the chunk */
+ if(((io_info->op_type == H5D_IO_OP_WRITE) && (accessed_bytes != ctg_store.contig.dset_size))
+ || (io_info->op_type != H5D_IO_OP_WRITE))
+ entire_chunk = FALSE;
+
+ /* Lock the chunk into the cache */
+ if(NULL == (chunk = H5D_istore_lock(io_info, &udata, entire_chunk, &idx_hint)))
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to read raw data chunk")
- if(H5D_istore_unlock(io_info, dirty, idx_hint, chunk, accessed_bytes) < 0)
- HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to unlock raw data chunk")
- } /* end if */
+ /* Set up the storage buffer information for this chunk */
+ cpt_store.compact.buf = chunk;
+
+ /* Point I/O info at contiguous I/O info for this chunk */
+ chk_io_info = &cpt_io_info;
} /* end if */
- else { /*collective I/O */
- if(H5D_inter_collective_io(io_info,chunk_info->fspace,chunk_info->mspace,
- chunk_addr,buf,do_write ) < 0)
- HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO");
- }
-
- if(make_coll)
- if(H5D_ioinfo_make_coll(io_info) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
- /* Get the next chunk node in the skip list */
- chunk_node=H5SL_next(chunk_node);
- } /* end while */
+ else {
+ /* Set up the storage address information for this chunk */
+ ctg_store.contig.dset_addr = chunk_addr;
+
+ /* No chunk cached */
+ chunk = NULL;
+
+ /* Point I/O info at temporary I/O info for this chunk */
+ chk_io_info = &ctg_io_info;
+ } /* end else */
+
+ if(io_info->op_type == H5D_IO_OP_WRITE) {
+ if((io_info->io_ops.single_write)(chk_io_info, type_info,
+ (hsize_t)chunk_info->chunk_points, chunk_info->fspace, chunk_info->mspace) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
+ } /* end if */
+ else {
+ if((io_info->io_ops.single_read)(chk_io_info, type_info,
+ (hsize_t)chunk_info->chunk_points, chunk_info->fspace, chunk_info->mspace) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
+ } /* end ese */
+
+ /* Release the cache lock on the chunk. */
+ if(chunk)
+ if(H5D_istore_unlock(io_info, (io_info->op_type == H5D_IO_OP_WRITE), idx_hint, chunk, accessed_bytes) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "unable to unlock raw data chunk")
+ } /* end if */
+ else { /*collective I/O */
+ /* Set up the storage address information for this chunk */
+ ctg_store.contig.dset_addr = chunk_addr;
+
+ if(H5D_inter_collective_io(&ctg_io_info, type_info, chunk_info->fspace, chunk_info->mspace) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish shared collective MPI-IO")
+ } /* end else */
+
+ if(make_coll)
+ if(H5D_ioinfo_xfer_mode(io_info, dx_plist, H5FD_MPIO_COLLECTIVE) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't switch to independent I/O")
+
+ /* Get the next chunk node in the skip list */
+ chunk_node = H5SL_next(chunk_node);
+ } /* end while */
done:
FUNC_LEAVE_NOAPI(ret_value)
@@ -1420,303 +1500,266 @@ done:
* Function: H5D_inter_collective_io
*
* Purpose: Routine for the shared part of collective IO between multiple chunk
- collective IO and contiguous collective IO
-
+ * collective IO and contiguous collective IO
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer:
- *
- * Modifications:
+ * Programmer: Muqun Yang
+ * Monday, Feb. 13th, 2006
*
*-------------------------------------------------------------------------
*/
static herr_t
-H5D_inter_collective_io(H5D_io_info_t *io_info,const H5S_t *file_space,const H5S_t *mem_space,
- haddr_t addr, const void *buf, hbool_t do_write )
+H5D_inter_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
+ const H5S_t *file_space, const H5S_t *mem_space)
{
+ size_t mpi_buf_count; /* # of MPI types */
+ hbool_t mbt_is_derived = FALSE;
+ hbool_t mft_is_derived = FALSE;
+ MPI_Datatype mpi_file_type, mpi_buf_type;
+ int mpi_code; /* MPI return code */
+ herr_t ret_value = SUCCEED; /* return value */
+
+ FUNC_ENTER_NOAPI_NOINIT(H5D_inter_collective_io)
+
+ if((file_space != NULL) && (mem_space != NULL)) {
+ hsize_t mpi_buf_offset, mpi_file_offset; /* Offset within dataset where selection (ie. MPI type) begins */
+ size_t mpi_file_count; /* Number of file "objects" to transfer */
- size_t mpi_buf_count, mpi_file_count; /* Number of "objects" to transfer */
- MPI_Datatype mpi_file_type,mpi_buf_type;
- hsize_t mpi_buf_offset, mpi_file_offset; /* Offset within dataset where selection (ie. MPI type) begins */
- hbool_t mbt_is_derived=0, /* Whether the buffer (memory) type is derived and needs to be free'd */
- mft_is_derived=0; /* Whether the file type is derived and needs to be free'd */
- H5D_common_coll_info_t coll_info;
- herr_t ret_value = SUCCEED; /* return value */
-
- FUNC_ENTER_NOAPI_NOINIT(H5D_inter_collective_io)
- if((file_space!=NULL) && (mem_space != NULL)) {
- /*Obtain disk and memory MPI derived datatype */
- if(H5S_mpio_space_type(file_space,H5T_get_size(io_info->dset->shared->type),
- &mpi_file_type,&mpi_file_count,&mpi_file_offset,&mft_is_derived)<0)
- HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI file type");
-
- if(H5S_mpio_space_type(mem_space,H5T_get_size(io_info->dset->shared->type),
- &mpi_buf_type,&mpi_buf_count,&mpi_buf_offset,&mbt_is_derived)<0)
- HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL,"couldn't create MPI buffer type");
-
- }
- else {
- /* For non-selection, participate with a none MPI derived datatype, the count is 0. */
- mpi_buf_type = MPI_BYTE;
- mpi_file_type = MPI_BYTE;
- mpi_file_count = 0;
- mpi_buf_count = 0;
- }
-
- coll_info.mbt_is_derived = mbt_is_derived;
- coll_info.mft_is_derived = mft_is_derived;
- coll_info.mpi_buf_count = mpi_buf_count;
- coll_info.chunk_addr = addr;
+ /* Obtain disk and memory MPI derived datatype */
+ if(H5S_mpio_space_type(file_space, type_info->src_type_size,
+ &mpi_file_type, &mpi_file_count, &mpi_file_offset, &mft_is_derived) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI file type")
+ if(H5S_mpio_space_type(mem_space, type_info->src_type_size,
+ &mpi_buf_type, &mpi_buf_count, &mpi_buf_offset, &mbt_is_derived) < 0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI buffer type")
+ } /* end if */
+ else {
+ /* For non-selection, participate with a none MPI derived datatype, the count is 0. */
+ mpi_buf_type = MPI_BYTE;
+ mpi_file_type = MPI_BYTE;
+ mpi_buf_count = (size_t)0;
+ mbt_is_derived = FALSE;
+ mft_is_derived = FALSE;
+ } /* end else */
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
- HDfprintf(H5DEBUG(D),"before final collective IO \n");
+if(H5DEBUG(D))
+ HDfprintf(H5DEBUG(D),"before final collective IO \n");
#endif
- if(H5D_final_collective_io(io_info,&mpi_file_type,&mpi_buf_type,&coll_info,buf,do_write)<0)
- HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL,"couldn't finish collective MPI-IO");
- done:
+ /* Perform final collective I/O operation */
+ if(H5D_final_collective_io(io_info, type_info, mpi_buf_count, &mpi_file_type, &mpi_buf_type) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "couldn't finish collective MPI-IO")
+
+done:
+ /* Free the MPI buf and file types, if they were derived */
+ if(mbt_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&mpi_buf_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ if(mft_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&mpi_file_type)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
+if(H5DEBUG(D))
HDfprintf(H5DEBUG(D),"before leaving inter_collective_io ret_value = %d\n",ret_value);
#endif
- FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5D_inter_collective_io */
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5D_inter_collective_io() */
/*-------------------------------------------------------------------------
* Function: H5D_final_collective_io
*
* Purpose: Routine for the common part of collective IO with different storages.
-
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer:
- *
- * Modifications:
+ * Programmer: Muqun Yang
+ * Monday, Feb. 13th, 2006
*
*-------------------------------------------------------------------------
*/
static herr_t
-H5D_final_collective_io(H5D_io_info_t *io_info,MPI_Datatype*mpi_file_type,MPI_Datatype *mpi_buf_type,
- H5D_common_coll_info_t* coll_info, const void *buf, hbool_t do_write)
+H5D_final_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
+ size_t mpi_buf_count, MPI_Datatype *mpi_file_type, MPI_Datatype *mpi_buf_type)
{
-
-
- int mpi_code; /* MPI return code */
- hbool_t plist_is_setup=0; /* Whether the dxpl has been customized */
- herr_t ret_value = SUCCEED;
-
+ int mpi_code; /* MPI return code */
+ hbool_t plist_is_setup = FALSE; /* Whether the dxpl has been customized */
+ herr_t ret_value = SUCCEED;
FUNC_ENTER_NOAPI_NOINIT(H5D_final_collective_io)
- /*
- * Pass buf type, file type to the file driver.
- */
-
- if(H5FD_mpi_setup_collective(io_info->dxpl_id, *mpi_buf_type, *mpi_file_type)<0)
- HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties");
+ /* Pass buf type, file type to the file driver. */
+ if(H5FD_mpi_setup_collective(io_info->dxpl_id, *mpi_buf_type, *mpi_file_type) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties")
+ plist_is_setup = TRUE;
- plist_is_setup=1;
-#ifdef H5D_DEBUG
- if(H5DEBUG(D)){
- HDfprintf(H5DEBUG(D),"chunk addr %Hu\n",coll_info->chunk_addr);
- HDfprintf(H5DEBUG(D),"mpi_buf_count %d\n",coll_info->mpi_buf_count);
- }
-
-#endif
-
- if(do_write) {
- if((io_info->ops.write)(io_info,
- coll_info->mpi_buf_count,0,NULL,NULL,coll_info->chunk_addr,
- NULL, buf) < 0)
+ if(io_info->op_type == H5D_IO_OP_WRITE) {
+ if((io_info->io_ops.single_write)(io_info, type_info,
+ (hsize_t)mpi_buf_count, NULL, NULL) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "optimized write failed")
- }
+ } /* end if */
else {
- if((io_info->ops.read)(io_info,
- coll_info->mpi_buf_count,0,NULL,NULL,coll_info->chunk_addr,
- NULL, buf) < 0)
+ if((io_info->io_ops.single_read)(io_info, type_info,
+ (hsize_t)mpi_buf_count, NULL, NULL) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_READERROR, FAIL, "optimized read failed")
- }
+ } /* end else */
done:
- /* Reset the dxpl settings */
- if(plist_is_setup) {
- if(H5FD_mpi_teardown_collective(io_info->dxpl_id)<0)
- HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "unable to reset dxpl values");
- } /* end if */
-
- /* free the MPI buf and file types */
- if (coll_info->mbt_is_derived) {
- if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_buf_type )))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
- }
- if (coll_info->mft_is_derived) {
- if (MPI_SUCCESS != (mpi_code= MPI_Type_free( mpi_file_type )))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
- }
+ /* Reset the dxpl settings */
+ if(plist_is_setup)
+ if(H5FD_mpi_teardown_collective(io_info->dxpl_id) < 0)
+ HDONE_ERROR(H5E_DATASPACE, H5E_CANTFREE, FAIL, "unable to reset dxpl values")
+
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
+if(H5DEBUG(D))
HDfprintf(H5DEBUG(D),"ret_value before leaving final_collective_io=%d\n",ret_value);
#endif
-
FUNC_LEAVE_NOAPI(ret_value)
-}/* end H5D_final_collective_io */
+} /* end H5D_final_collective_io */
/*-------------------------------------------------------------------------
* Function: H5D_sort_chunk
*
* Purpose: Routine to sort chunks in increasing order of chunk address
- Each chunk address is also obtained.
-
- Description:
- For most cases, the chunk address has already been sorted in increasing order.
- The special sorting flag is used to optimize this common case.
- quick sort is used for necessary sorting.
-
- Parameters:
- Input: H5D_io_info_t* io_info,
- H5D_chunk_map_t *fm(global chunk map struct)
- Input/Output: H5D_chunk_addr_info_t chunk_addr_info_array[] : array to store chunk address and information
- many_chunk_opt : flag to optimize the way to obtain chunk addresses
- for many chunks
+ * Each chunk address is also obtained.
*
- * Return: Non-negative on success/Negative on failure
+ * Description:
+ * For most cases, the chunk address has already been sorted in increasing order.
+ * The special sorting flag is used to optimize this common case.
+ * quick sort is used for necessary sorting.
*
- * Programmer:
+ * Parameters:
+ * Input: H5D_io_info_t* io_info,
+ * H5D_chunk_map_t *fm(global chunk map struct)
+ * Input/Output: H5D_chunk_addr_info_t chunk_addr_info_array[] : array to store chunk address and information
+ * many_chunk_opt : flag to optimize the way to obtain chunk addresses
+ * for many chunks
*
- * Modifications:
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Muqun Yang
+ * Monday, Feb. 13th, 2006
*
*-------------------------------------------------------------------------
*/
-
static herr_t
-H5D_sort_chunk(H5D_io_info_t * io_info,
- H5D_chunk_map_t *fm,
- H5D_chunk_addr_info_t chunk_addr_info_array[],
- int many_chunk_opt)
+H5D_sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
+ H5D_chunk_addr_info_t chunk_addr_info_array[], int sum_chunk)
{
-
-
- H5SL_node_t *chunk_node; /* Current node in chunk skip list */
+ H5SL_node_t *chunk_node; /* Current node in chunk skip list */
H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */
- haddr_t chunk_addr; /* Current chunking address of this node */
- haddr_t *total_chunk_addr_array=NULL; /* The array of chunk address for the total number of chunk */
- int i,mpi_code;
- int total_chunks;
- size_t num_chunks;
- int mpi_type_cleanup = 0;
- int tchunk_addr_cleanup = 0;
- MPI_Datatype chunk_addrtype;
- H5D_storage_t store; /*union of EFL and chunk pointer in file space */
- hbool_t do_sort = FALSE;
- herr_t ret_value = SUCCEED; /*return value */
+ haddr_t chunk_addr; /* Current chunking address of this node */
+ haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */
+ H5D_storage_t store; /*union of EFL and chunk pointer in file space */
+ hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */
+ int bsearch_coll_chunk_threshold;
+ int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND;
+ int mpi_size; /* Number of MPI processes */
+ int mpi_code; /* MPI return code */
+ int i; /* Local index variable */
+ herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI_NOINIT(H5D_sort_chunk)
- num_chunks = H5SL_count(fm->sel_chunks);
+ /* Retrieve # of MPI processes */
+ if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size")
+
+ /* Calculate the actual threshold to obtain all chunk addresses collectively
+ * The bigger this number is, the more possible the use of obtaining chunk
+ * address collectively.
+ */
+ /* For non-optimization one-link IO, actual bsearch threshold is always
+ * 0, we would always want to obtain the chunk addresses individually
+ * for each process.
+ */
+ bsearch_coll_chunk_threshold = (sum_chunk * 100) / ((int)fm->total_chunks * mpi_size);
+ if((bsearch_coll_chunk_threshold > H5D_ALL_CHUNK_ADDR_THRES_COL)
+ && ((sum_chunk / mpi_size) >= H5D_ALL_CHUNK_ADDR_THRES_COL_NUM))
+ many_chunk_opt = H5D_OBTAIN_ALL_CHUNK_ADDR_COL;
+
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
- HDfprintf(H5DEBUG(D),"many_chunk_opt= %d\n",many_chunk_opt);
+if(H5DEBUG(D))
+ HDfprintf(H5DEBUG(D), "many_chunk_opt= %d\n", many_chunk_opt);
#endif
/* If we need to optimize the way to obtain the chunk address */
- if(many_chunk_opt != H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
+ if(many_chunk_opt != H5D_OBTAIN_ONE_CHUNK_ADDR_IND) {
+ int mpi_rank;
- int mpi_rank, root;
- total_chunks = (int)fm->total_chunks;
- total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t)*total_chunks);
- tchunk_addr_cleanup = 1;
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
- HDfprintf(H5DEBUG(D),"Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n");
+if(H5DEBUG(D))
+ HDfprintf(H5DEBUG(D), "Coming inside H5D_OBTAIN_ALL_CHUNK_ADDR_COL\n");
#endif
- root = 0;
- if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file))<0)
- HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank");
-
- /*Create received MPI derived datatype */
- if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous((int)(sizeof(haddr_t)*total_chunks), MPI_BYTE, &chunk_addrtype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code);
- if(MPI_SUCCESS !=(mpi_code = MPI_Type_commit(&chunk_addrtype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
-
- mpi_type_cleanup = 1;
-
- if(mpi_rank == root) {
- if(H5D_istore_chunkmap(io_info, total_chunk_addr_array, fm->down_chunks)<0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
- }
+ /* Allocate array for chunk addresses */
+ if(NULL == (total_chunk_addr_array = H5MM_malloc(sizeof(haddr_t) * (size_t)fm->total_chunks)))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "unable to allocate memory chunk address array")
+
+ /* Retrieve all the chunk addresses with process 0 */
+ if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
+ if(mpi_rank == 0) {
+ if(H5D_istore_chunkmap(io_info, total_chunk_addr_array, fm->down_chunks) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address")
+ } /* end if */
+
/* Broadcasting the MPI_IO option info. and chunk address info. */
- if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(total_chunk_addr_array,1,chunk_addrtype,root,io_info->comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code);
+ if(MPI_SUCCESS != (mpi_code = MPI_Bcast(total_chunk_addr_array, (int)(sizeof(haddr_t) * fm->total_chunks), MPI_BYTE, (int)0, io_info->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code)
} /* end if */
- /* Get first node in chunk skip list */
- if(NULL ==(chunk_node = H5SL_first(fm->sel_chunks)))
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list");
/* Set dataset storage for I/O info */
io_info->store = &store;
- if(NULL ==(chunk_info = H5SL_item(chunk_node)))
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
- store.chunk.offset = chunk_info->coords;
- store.chunk.index = chunk_info->index;
+
+ /* Start at first node in chunk skip list */
i = 0;
- if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
- if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL)))
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
+ if(NULL == (chunk_node = H5SL_first(fm->sel_chunks)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk node from skipped list")
-#ifdef H5D_DEBUG
- if(H5DEBUG(D))
- HDfprintf(H5DEBUG(D),"coming to obtain each chunk address individually \n");
-#endif
- }
- else
- chunk_addr = total_chunk_addr_array[chunk_info->index];
+ /* Iterate over all chunks for this process */
+ while(chunk_node) {
+ if(NULL == (chunk_info = H5SL_item(chunk_node)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list")
+
+ if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND) {
+ store.chunk.offset = chunk_info->coords;
+ store.chunk.index = chunk_info->index;
+ if(HADDR_UNDEF == (chunk_addr = H5D_istore_get_addr(io_info, NULL)))
+ HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL, "couldn't get chunk info from skipped list")
+ } /* end if */
+ else
+ chunk_addr = total_chunk_addr_array[chunk_info->index];
- chunk_addr_info_array[i].chunk_addr = chunk_addr;
- chunk_addr_info_array[i].chunk_info = *chunk_info;
+ /* Check if chunk addresses are not in increasing order in the file */
+ if(i > 0 && chunk_addr < chunk_addr_info_array[i - 1].chunk_addr)
+ do_sort = TRUE;
- chunk_node = H5SL_next(chunk_node);
+ /* Set the address & info for this chunk */
+ chunk_addr_info_array[i].chunk_addr = chunk_addr;
+ chunk_addr_info_array[i].chunk_info = *chunk_info;
- while(chunk_node) {
+ /* Advance to next chunk in list */
+ i++;
+ chunk_node = H5SL_next(chunk_node);
+ } /* end while */
- chunk_info = H5SL_item(chunk_node);
- store.chunk.offset = chunk_info->coords;
- store.chunk.index = chunk_info->index;
-
- if(many_chunk_opt == H5D_OBTAIN_ONE_CHUNK_ADDR_IND){
- if(HADDR_UNDEF==(chunk_addr = H5D_istore_get_addr(io_info,NULL)))
- HGOTO_ERROR(H5E_STORAGE, H5E_CANTGET, FAIL,"couldn't get chunk info from skipped list");
- }
- else
- chunk_addr = total_chunk_addr_array[chunk_info->index];
-
- if(chunk_addr < chunk_addr_info_array[i].chunk_addr) do_sort = TRUE;
- chunk_addr_info_array[i+1].chunk_addr = chunk_addr;
- chunk_addr_info_array[i+1].chunk_info =*chunk_info;
- i++;
- chunk_node = H5SL_next(chunk_node);
- }
#ifdef H5D_DEBUG
- if(H5DEBUG(D))
- HDfprintf(H5DEBUG(D),"before Qsort\n");
+if(H5DEBUG(D))
+ HDfprintf(H5DEBUG(D), "before Qsort\n");
#endif
- if(do_sort)
- HDqsort(chunk_addr_info_array,num_chunks,sizeof(chunk_addr_info_array),H5D_cmp_chunk_addr);
+ if(do_sort) {
+ size_t num_chunks = H5SL_count(fm->sel_chunks);
+
+ HDqsort(chunk_addr_info_array, num_chunks, sizeof(chunk_addr_info_array[0]), H5D_cmp_chunk_addr);
+ } /* end if */
done:
+ if(total_chunk_addr_array)
+ H5MM_xfree(total_chunk_addr_array);
- if(tchunk_addr_cleanup)
- HDfree(total_chunk_addr_array);
- if(mpi_type_cleanup) {
- if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype )))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
- }
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D_sort_chunk() */
@@ -1725,310 +1768,244 @@ done:
* Function: H5D_obtain_mpio_mode
*
* Purpose: Routine to obtain each io mode(collective,independent or none) for each chunk;
- Each chunk address is also obtained.
-
- Description:
-
- 1) Each process provides two piece of information for all chunks having selection
- a) chunk index
- b) wheather this chunk is regular(for MPI derived datatype not working case)
-
- 2) Gather all the information to the root process
-
- 3) Root process will do the following:
- a) Obtain chunk addresses for all chunks in this data space
- b) With the consideration of the user option, calculate IO mode for each chunk
- c) Build MPI derived datatype to combine "chunk address" and "assign_io" information
- in order to do MPI Bcast only once
- d) MPI Bcast the IO mode and chunk address information for each chunk.
- 4) Each process then retrieves IO mode and chunk address information to assign_io_mode and chunk_addr.
-
- Parameters:
-
- Input: H5D_io_info_t* io_info,
- H5D_chunk_map_t *fm,(global chunk map struct)
- Output: uint8_t assign_io_mode[], : IO mode, collective, independent or none
- haddr_t chunk_addr[], : chunk address array for each chunk
+ * Each chunk address is also obtained.
*
- * Return: Non-negative on success/Negative on failure
+ * Description:
*
- * Programmer:
+ * 1) Each process provides two piece of information for all chunks having selection
+ * a) chunk index
+ * b) wheather this chunk is regular(for MPI derived datatype not working case)
+ *
+ * 2) Gather all the information to the root process
*
- * Modifications:
+ * 3) Root process will do the following:
+ * a) Obtain chunk addresses for all chunks in this data space
+ * b) With the consideration of the user option, calculate IO mode for each chunk
+ * c) Build MPI derived datatype to combine "chunk address" and "assign_io" information
+ * in order to do MPI Bcast only once
+ * d) MPI Bcast the IO mode and chunk address information for each chunk.
+ * 4) Each process then retrieves IO mode and chunk address information to assign_io_mode and chunk_addr.
+ *
+ * Parameters:
+ *
+ * Input: H5D_io_info_t* io_info,
+ * H5D_chunk_map_t *fm,(global chunk map struct)
+ * Output: uint8_t assign_io_mode[], : IO mode, collective, independent or none
+ * haddr_t chunk_addr[], : chunk address array for each chunk
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Muqun Yang
+ * Monday, Feb. 13th, 2006
*
*-------------------------------------------------------------------------
*/
-
static herr_t
-H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
- H5D_chunk_map_t *fm,
- uint8_t assign_io_mode[],
- haddr_t chunk_addr[])
+H5D_obtain_mpio_mode(H5D_io_info_t* io_info, H5D_chunk_map_t *fm,
+ H5P_genplist_t *dx_plist, uint8_t assign_io_mode[], haddr_t chunk_addr[])
{
-
- int total_chunks;
- hsize_t ori_total_chunks;
- unsigned percent_nproc_per_chunk,threshold_nproc_per_chunk;
- H5FD_mpio_chunk_opt_t chunk_opt_mode;
- uint8_t* io_mode_info=NULL;
- uint8_t* recv_io_mode_info=NULL;
- uint8_t* mergebuf=NULL;
- uint8_t* tempbuf;
-
- H5SL_node_t* chunk_node;
- H5D_chunk_info_t* chunk_info;
-
- MPI_Datatype bastype[2];
- MPI_Datatype chunk_addrtype;
- int bascount;
- int basblock[2];
- MPI_Aint basdisp[2];
- MPI_Datatype rtype;
- MPI_Datatype stype;
- int mpi_size,mpi_rank;
- MPI_Comm comm;
- int ic,root;
- int mpi_code;
- H5P_genplist_t *plist;
- int mem_cleanup = 0,
- mpi_type_cleanup = 0;
-
+ int total_chunks;
+ unsigned percent_nproc_per_chunk,threshold_nproc_per_chunk;
+ H5FD_mpio_chunk_opt_t chunk_opt_mode;
+ uint8_t* io_mode_info=NULL;
+ uint8_t* recv_io_mode_info=NULL;
+ uint8_t* mergebuf=NULL;
+ uint8_t* tempbuf;
+ H5SL_node_t* chunk_node;
+ H5D_chunk_info_t* chunk_info;
+ int mpi_size,mpi_rank;
+ MPI_Comm comm;
+ int ic,root;
+ int mpi_code;
+ int mem_cleanup = 0;
#ifdef H5_HAVE_INSTRUMENTED_LIBRARY
- int new_value;
- htri_t check_prop;
+ int new_value;
+ htri_t check_prop;
#endif
+ herr_t ret_value = SUCCEED;
- herr_t ret_value = SUCCEED;
-
- FUNC_ENTER_NOAPI_NOINIT(H5D_obtain_mpio_mode)
+ FUNC_ENTER_NOAPI_NOINIT(H5D_obtain_mpio_mode)
- /* Assign the rank 0 to the root */
- root = 0;
- comm = io_info->comm;
+ /* Assign the rank 0 to the root */
+ root = 0;
+ comm = io_info->comm;
- /* Obtain the number of process and the current rank of the process */
- if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file))<0)
- HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank");
- if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
- HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
+ /* Obtain the number of process and the current rank of the process */
+ if((mpi_rank = H5F_mpi_get_rank(io_info->dset->oloc.file)) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
+ if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file)) < 0)
+ HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size")
- /* Allocate memory */
- ori_total_chunks = fm->total_chunks;
- H5_ASSIGN_OVERFLOW(total_chunks,ori_total_chunks,hsize_t,int);
-
- /* Obtain the data transfer properties */
- if(NULL == (plist = H5I_object(io_info->dxpl_id)))
- HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
-
- percent_nproc_per_chunk=H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME);
+ /* Setup parameters */
+ H5_ASSIGN_OVERFLOW(total_chunks, fm->total_chunks, hsize_t, int);
+ percent_nproc_per_chunk = H5P_peek_unsigned(dx_plist, H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME);
#if defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) && defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
-
- chunk_opt_mode=(H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME);
-
- if((chunk_opt_mode == H5FD_MPIO_CHUNK_MULTI_IO) || (percent_nproc_per_chunk == 0)){
- if(H5D_istore_chunkmap(io_info, chunk_addr, fm->down_chunks) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
- for(ic = 0; ic<total_chunks;ic++)
- assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL;
- goto done;
- }
+ chunk_opt_mode = (H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(dx_plist, H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME);
+ if((chunk_opt_mode == H5FD_MPIO_CHUNK_MULTI_IO) || (percent_nproc_per_chunk == 0)) {
+ if(H5D_istore_chunkmap(io_info, chunk_addr, fm->down_chunks) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
+ for(ic = 0; ic < total_chunks; ic++)
+ assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL;
+
+ HGOTO_DONE(SUCCEED)
+ } /* end if */
#endif
- threshold_nproc_per_chunk = mpi_size * percent_nproc_per_chunk/100;
-
- io_mode_info = (uint8_t *)H5MM_calloc(total_chunks*sizeof(MPI_BYTE));
- mergebuf = H5MM_malloc((sizeof(haddr_t)+sizeof(MPI_BYTE))*total_chunks);
- tempbuf = mergebuf + sizeof(MPI_BYTE)*total_chunks;
- if(mpi_rank == root)
- recv_io_mode_info = (uint8_t *)H5MM_malloc(total_chunks*sizeof(MPI_BYTE)*mpi_size);
+ threshold_nproc_per_chunk = mpi_size * percent_nproc_per_chunk/100;
+
+ /* Allocate memory */
+ io_mode_info = (uint8_t *)H5MM_calloc(total_chunks);
+ mergebuf = H5MM_malloc((sizeof(haddr_t) + 1) * total_chunks);
+ tempbuf = mergebuf + total_chunks;
+ if(mpi_rank == root)
+ recv_io_mode_info = (uint8_t *)H5MM_malloc(total_chunks * mpi_size);
- mem_cleanup = 1;
+ mem_cleanup = 1;
- chunk_node = H5SL_first(fm->sel_chunks);
-
- /*Obtain the regularity and selection information for all chunks in this process. */
- while(chunk_node){
-
- chunk_info = H5SL_item(chunk_node);
+ /* Obtain the regularity and selection information for all chunks in this process. */
+ chunk_node = H5SL_first(fm->sel_chunks);
+ while(chunk_node) {
+ chunk_info = H5SL_item(chunk_node);
#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
- /* regularity information: 1, selection information: 2 */
- if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE &&
- H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE)
+ /* regularity information: 1, selection information: 2 */
+ if(H5S_SELECT_IS_REGULAR(chunk_info->fspace) == TRUE &&
+ H5S_SELECT_IS_REGULAR(chunk_info->mspace) == TRUE)
#endif
- io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_REG; /* this chunk is selected and is "regular" without defining H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS. */
+ io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_REG; /* this chunk is selected and is "regular" without defining H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS. */
#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
- else
- io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_IRREG; /* this chunk is selected and is irregular*/
+ else
+ io_mode_info[chunk_info->index] = H5D_CHUNK_SELECT_IRREG; /* this chunk is selected and is irregular*/
#endif
-
- chunk_node = H5SL_next(chunk_node);
- }
+ chunk_node = H5SL_next(chunk_node);
+ } /* end while */
- /*Create sent MPI derived datatype */
- if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(total_chunks,MPI_BYTE,&stype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
- if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&stype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
-
- /*Create received basic MPI derived datatype */
- bascount = 2;
- basblock[0] = total_chunks;
- basblock[1] = total_chunks;
- basdisp[0] = 0;
- basdisp[1] = (MPI_Aint)(sizeof(MPI_BYTE)*total_chunks);/* may need to check overflow */
- bastype[0] = MPI_BYTE;
-
- if(MPI_SUCCESS !=(mpi_code = MPI_Type_contiguous(sizeof(haddr_t),MPI_BYTE,&chunk_addrtype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_contiguous failed", mpi_code);
- if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&chunk_addrtype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
- bastype[1] = chunk_addrtype;
-
- if(MPI_SUCCESS !=(mpi_code = MPI_Type_struct(bascount,basblock,basdisp,bastype,&rtype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_struct failed", mpi_code);
- if(MPI_SUCCESS !=(mpi_code = MPI_Type_commit(&rtype)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
-
- /* Set up a flag to clean up the MPI derived datatype later */
- mpi_type_cleanup = 1;
-
- /*Gather all the information */
- if(MPI_SUCCESS !=(mpi_code = MPI_Gather(io_mode_info,1,stype,recv_io_mode_info,1,stype,root,comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code);
-
- /* Calculate the mode for IO(collective, independent or none) at root process */
- if(mpi_rank == root) {
-
- int nproc;
- int* nproc_per_chunk;
+ /*Gather all the information */
+ if(MPI_SUCCESS != (mpi_code = MPI_Gather(io_mode_info, total_chunks, MPI_BYTE, recv_io_mode_info, total_chunks, MPI_BYTE, root, comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code)
+
+ /* Calculate the mode for IO(collective, independent or none) at root process */
+ if(mpi_rank == root) {
+ int nproc;
+ int* nproc_per_chunk;
#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
- int* ind_this_chunk;
+ int* ind_this_chunk;
#endif
- /* pre-computing: calculate number of processes and
- regularity of the selection occupied in each chunk */
- nproc_per_chunk = (int*)H5MM_calloc(total_chunks*sizeof(int));
+ /* pre-computing: calculate number of processes and
+ regularity of the selection occupied in each chunk */
+ nproc_per_chunk = (int*)H5MM_calloc(total_chunks * sizeof(int));
#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
- ind_this_chunk = (int*)H5MM_calloc(total_chunks*sizeof(int));
+ ind_this_chunk = (int*)H5MM_calloc(total_chunks * sizeof(int));
#endif
- /* calculating the chunk address */
- if(H5D_istore_chunkmap(io_info, chunk_addr, fm->down_chunks)<0){
- HDfree(nproc_per_chunk);
+ /* calculating the chunk address */
+ if(H5D_istore_chunkmap(io_info, chunk_addr, fm->down_chunks) < 0) {
+ HDfree(nproc_per_chunk);
#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
- HDfree(ind_this_chunk);
+ HDfree(ind_this_chunk);
#endif
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address");
- }
-
- /* checking for number of process per chunk and regularity of the selection*/
- for (nproc = 0;nproc <mpi_size;nproc++){
- uint8_t *tmp_recv_io_mode_info = recv_io_mode_info + nproc*total_chunks;
- /* calculate the number of process per chunk and adding irregular selection option */
- for(ic = 0; ic < total_chunks; ic++, tmp_recv_io_mode_info++){
- if(*tmp_recv_io_mode_info != 0) {
- nproc_per_chunk[ic]++;
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address")
+ } /* end if */
+
+ /* checking for number of process per chunk and regularity of the selection*/
+ for(nproc = 0; nproc < mpi_size; nproc++) {
+ uint8_t *tmp_recv_io_mode_info = recv_io_mode_info + (nproc * total_chunks);
+
+ /* Calculate the number of process per chunk and adding irregular selection option */
+ for(ic = 0; ic < total_chunks; ic++, tmp_recv_io_mode_info++) {
+ if(*tmp_recv_io_mode_info != 0) {
+ nproc_per_chunk[ic]++;
#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
- if(*tmp_recv_io_mode_info == H5D_CHUNK_SELECT_IRREG)
- ind_this_chunk[ic] = 1;
+ if(*tmp_recv_io_mode_info == H5D_CHUNK_SELECT_IRREG)
+ ind_this_chunk[ic] = 1;
#endif
- }
+ } /* end if */
#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS
else {
- /*checking whether we have a selection in this chunk */
- ind_this_chunk[ic] = 1;
- }
+ /*checking whether we have a selection in this chunk */
+ ind_this_chunk[ic] = 1;
+ } /* end else */
#endif
- }
+ } /* end for */
+ } /* end for */
- }
-
- /* Calculating MPIO mode for each chunk (collective, independent, none) */
- for(ic = 0; ic < total_chunks; ic++){
- if(nproc_per_chunk[ic]>MAX(1,threshold_nproc_per_chunk)){
+ /* Calculating MPIO mode for each chunk (collective, independent, none) */
+ for(ic = 0; ic < total_chunks; ic++) {
+ if(nproc_per_chunk[ic] > MAX(1, threshold_nproc_per_chunk)) {
#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
- if(!ind_this_chunk[ic]) assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL;
+ if(!ind_this_chunk[ic])
+ assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL;
#else
- assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL;
+ assign_io_mode[ic] = H5D_CHUNK_IO_MODE_COL;
#endif
- }
- }
+ } /* end if */
+ } /* end for */
- /* merge buffer io_mode info and chunk addr into one */
- HDmemcpy(mergebuf,assign_io_mode,sizeof(MPI_BYTE)*total_chunks);
- HDmemcpy(tempbuf,chunk_addr,sizeof(haddr_t)*total_chunks);
+ /* merge buffer io_mode info and chunk addr into one */
+ HDmemcpy(mergebuf, assign_io_mode, total_chunks);
+ HDmemcpy(tempbuf, chunk_addr, sizeof(haddr_t) * total_chunks);
- HDfree(nproc_per_chunk);
+ HDfree(nproc_per_chunk);
#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
- HDfree(ind_this_chunk);
+ HDfree(ind_this_chunk);
#endif
- }
+ } /* end if */
- /* Broadcasting the MPI_IO option info. and chunk address info. */
- if(MPI_SUCCESS !=(mpi_code = MPI_Bcast(mergebuf,1,rtype,root,comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code);
+ /* Broadcasting the MPI_IO option info. and chunk address info. */
+ if(MPI_SUCCESS != (mpi_code = MPI_Bcast(mergebuf, ((sizeof(haddr_t) + 1) * total_chunks), MPI_BYTE, root, comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code)
- HDmemcpy(assign_io_mode,mergebuf,sizeof(MPI_BYTE)*total_chunks);
- HDmemcpy(chunk_addr,tempbuf,sizeof(haddr_t)*total_chunks);
+ HDmemcpy(assign_io_mode, mergebuf, total_chunks);
+ HDmemcpy(chunk_addr, tempbuf, sizeof(haddr_t) * total_chunks);
#ifdef H5_HAVE_INSTRUMENTED_LIBRARY
- check_prop = H5Pexist(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME);
+ check_prop = H5Pexist(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME);
if(check_prop > 0) {
#if !defined(H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS) || !defined(H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS)
- new_value = 0;
- if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME,&new_value)<0)
- HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value");
+ new_value = 0;
+ if(H5Pset(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME, &new_value) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value")
#else
- for(ic = 0; ic < total_chunks; ic++){
- if(assign_io_mode[ic] == H5D_CHUNK_IO_MODE_COL) {
- new_value = 0;
- if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME,&new_value)<0)
- HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value");
- break;
- }
- }
+ for(ic = 0; ic < total_chunks; ic++) {
+ if(assign_io_mode[ic] == H5D_CHUNK_IO_MODE_COL) {
+ new_value = 0;
+ if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_COLL_NAME,&new_value) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value")
+ break;
+ } /* end if */
+ } /* end for */
#endif
- }
- check_prop = H5Pexist(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME);
- if(check_prop > 0) {
- int temp_count = 0;
- for(ic = 0; ic < total_chunks; ic++){
- if(assign_io_mode[ic] == H5D_CHUNK_IO_MODE_COL) {
- temp_count++;
- break;
- }
- }
- if(temp_count==0){
- new_value = 0;
- if(H5Pset(io_info->dxpl_id,H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME,&new_value)<0)
- HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value");
- }
- }
+ } /* end if */
+
+ check_prop = H5Pexist(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME);
+ if(check_prop > 0) {
+ int temp_count = 0;
+
+ for(ic = 0; ic < total_chunks; ic++) {
+ if(assign_io_mode[ic] == H5D_CHUNK_IO_MODE_COL) {
+ temp_count++;
+ break;
+ } /* end if */
+ } /* end for */
+ if(temp_count == 0) {
+ new_value = 0;
+ if(H5Pset(io_info->dxpl_id, H5D_XFER_COLL_CHUNK_MULTI_RATIO_IND_NAME, &new_value) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_UNSUPPORTED, FAIL, "unable to set property value")
+ } /* end if */
+ } /* end if */
#endif
done:
-
- if(mpi_type_cleanup) {
- if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &chunk_addrtype )))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
-
- if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &stype )))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
-
- if (MPI_SUCCESS != (mpi_code= MPI_Type_free( &rtype )))
- HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
- }
-
- if(mem_cleanup){
- HDfree(io_mode_info);
- HDfree(mergebuf);
- if(mpi_rank == root)
- HDfree(recv_io_mode_info);
- }
+ if(mem_cleanup) {
+ HDfree(io_mode_info);
+ HDfree(mergebuf);
+ if(mpi_rank == root)
+ HDfree(recv_io_mode_info);
+ } /* end if */
FUNC_LEAVE_NOAPI(ret_value)
-}/* end H5D_obtain_mpio_mode*/
+} /* end H5D_obtain_mpio_mode() */
static int
H5D_cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2)
@@ -2041,7 +2018,6 @@ H5D_cmp_chunk_addr(const void *chunk_addr_info1, const void *chunk_addr_info2)
addr2 = ((const H5D_chunk_addr_info_t *)chunk_addr_info2)->chunk_addr;
FUNC_LEAVE_NOAPI(H5F_addr_cmp(addr1, addr2))
-
}
#endif /* H5_HAVE_PARALLEL */