summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMuQun Yang <ymuqun@hdfgroup.org>2006-03-14 18:29:35 (GMT)
committerMuQun Yang <ymuqun@hdfgroup.org>2006-03-14 18:29:35 (GMT)
commita4c816eb75c20de14a5e557ffd76c33190152ce2 (patch)
treed486af4248f19dca54b25c525af5e79766a37959
parent838e79971076a29aea1fbad1c1b8d218d6da8644 (diff)
downloadhdf5-a4c816eb75c20de14a5e557ffd76c33190152ce2.zip
hdf5-a4c816eb75c20de14a5e557ffd76c33190152ce2.tar.gz
hdf5-a4c816eb75c20de14a5e557ffd76c33190152ce2.tar.bz2
[svn-r12090] Purpose:
New APIs to add for collective chunk IO Description: Three new APIs H5Pset_dxpl_mpio_chunk_opt_ratio H5Pset_dxpl_mpio_chunk_opt_num H5Pset_dxpl_mpio_chunk_opt for optional optimization choices from users. Solution: Haven't added tests yet, won't affect other parts of the library. Will add tests after urgent investigations of memory leaking problems from NASA Aura team. Platforms tested: heping: both parallel and sequential shanti Misc. update:
-rw-r--r--src/H5D.c11
-rw-r--r--src/H5Dmpio.c54
-rw-r--r--src/H5Dprivate.h12
-rw-r--r--src/H5FDmpi.h22
-rw-r--r--src/H5FDmpio.c146
-rw-r--r--src/H5FDmpio.h3
6 files changed, 223 insertions, 25 deletions
diff --git a/src/H5D.c b/src/H5D.c
index 3510f28..202e4ec 100644
--- a/src/H5D.c
+++ b/src/H5D.c
@@ -195,7 +195,10 @@ H5D_init_interface(void)
void *def_vfl_info = H5D_XFER_VFL_INFO_DEF;
size_t def_hyp_vec_size = H5D_XFER_HYPER_VECTOR_SIZE_DEF;
#ifdef H5_HAVE_PARALLEL
- H5FD_mpio_xfer_t def_io_xfer_mode = H5D_XFER_IO_XFER_MODE_DEF;
+ H5FD_mpio_xfer_t def_io_xfer_mode = H5D_XFER_IO_XFER_MODE_DEF;
+ H5FD_mpio_chunk_opt_t def_mpio_chunk_opt_mode = H5D_XFER_MPIO_CHUNK_OPT_HARD_DEF;
+ unsigned def_mpio_chunk_opt_num = H5D_XFER_MPIO_CHUNK_OPT_NUM_DEF;
+ unsigned def_mpio_chunk_opt_ratio = H5D_XFER_MPIO_CHUNK_OPT_RATIO_DEF;
#endif /* H5_HAVE_PARALLEL */
H5Z_EDC_t enable_edc = H5D_XFER_EDC_DEF;
H5Z_cb_t filter_cb = H5D_XFER_FILTER_CB_DEF;
@@ -298,6 +301,12 @@ H5D_init_interface(void)
/* Register the I/O transfer mode property */
if(H5P_register(xfer_pclass,H5D_XFER_IO_XFER_MODE_NAME,H5D_XFER_IO_XFER_MODE_SIZE,&def_io_xfer_mode,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
+ if(H5P_register(xfer_pclass,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME,H5D_XFER_MPIO_CHUNK_OPT_HARD_SIZE,&def_mpio_chunk_opt_mode,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
+ if(H5P_register(xfer_pclass,H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME,H5D_XFER_MPIO_CHUNK_OPT_NUM_SIZE,&def_mpio_chunk_opt_num,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
+ if(H5P_register(xfer_pclass,H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME,H5D_XFER_MPIO_CHUNK_OPT_RATIO_SIZE,&def_mpio_chunk_opt_ratio,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
#endif /* H5_HAVE_PARALLEL */
/* Register the EDC property */
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index 3e95d37..958c760 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -59,7 +59,6 @@
If the average number of chunks per process is greater than this value,
the library will create an MPI derived datatype to link all chunks to do collective IO.
The user can set this value through an API. */
-#define H5D_ONE_LINK_CHUNK_IO_THRESHOLD 0 /* always set this option with value 0*/
/* Macros to represent options on how to obtain chunk address for one linked-chunk IO case */
#define H5D_OBTAIN_ONE_CHUNK_ADDR_IND 0
@@ -75,7 +74,6 @@
If the average number of processes per chunk is greater than the default value,
collective IO is done for this chunk.
*/
-#define H5D_MULTI_CHUNK_IO_COL_THRESHOLD 50
/* Macros to represent different IO modes(NONE, Independent or collective)for multiple chunk IO case */
#define H5D_CHUNK_IO_MODE_IND 0
@@ -640,34 +638,39 @@ H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool
int io_option = H5D_MULTI_CHUNK_IO;
int sum_chunk,mpi_size;
- int one_link_chunk_io_threshold;
+ unsigned one_link_chunk_io_threshold;
+ H5P_genplist_t *plist;
+ H5FD_mpio_chunk_opt_t chunk_opt_mode;
herr_t ret_value = SUCCEED;
FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_collective_io)
assert (IS_H5FD_MPIO(io_info->dset->oloc.file));
-
- if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0)
- HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes");
- if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
+
+ /* Obtain the data transfer properties */
+ if(NULL == (plist = H5I_object(io_info->dxpl_id)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
+ chunk_opt_mode=(H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME);
+ if(chunk_opt_mode != H5FD_MPIO_OPT_ONE_IO) {
+
+ if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0)
+ HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes");
+ if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
- one_link_chunk_io_threshold = H5D_ONE_LINK_CHUNK_IO_THRESHOLD;/*This should be replaced by the user inputting value from API. */
+
+ if(NULL == (plist = H5I_object(io_info->dxpl_id)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
- /* step 1: choose an IO option */
- /* If the average number of chunk per process is greater than a threshold, we will do one link chunked IO. */
- if(sum_chunk/mpi_size >= one_link_chunk_io_threshold) io_option = H5D_ONE_LINK_CHUNK_IO;
+ one_link_chunk_io_threshold =H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME);
-/* If this MPI-IO package doesn't support collective IO when no IO is done for one or more processes,
- use MULTIPLE CHUNK IO */
-/*
-#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS
- if(H5D_mpio_get_min_chunk(io_info,fm,&min_chunk)<0)
- HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes");
- if(min_chunk == 0) io_option = H5D_MULTI_CHUNK_IO;
-#endif
-*/
+ /* step 1: choose an IO option */
+ /* If the average number of chunk per process is greater than a threshold, we will do one link chunked IO. */
+ if(sum_chunk/mpi_size >= one_link_chunk_io_threshold) io_option = H5D_ONE_LINK_CHUNK_IO;
+ }
+ else
+ io_option = H5D_ONE_LINK_CHUNK_IO;
#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
if(io_option == H5D_ONE_LINK_CHUNK_IO ) io_option = H5D_MULTI_CHUNK_IO ;/* We can not do this with one chunk IO. */
#endif
@@ -1557,7 +1560,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
int total_chunks;
hsize_t ori_total_chunks;
- int percent_nproc_per_chunk,threshold_nproc_per_chunk;
+ unsigned percent_nproc_per_chunk,threshold_nproc_per_chunk;
uint8_t* io_mode_info;
uint8_t* recv_io_mode_info;
uint8_t* mergebuf;
@@ -1577,7 +1580,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
MPI_Comm comm;
int root;
int mpi_code;
- int multi_chunk_io_col_threshold;
+ H5P_genplist_t *plist;
int mem_cleanup = 0,
mpi_type_cleanup = 0;
@@ -1594,8 +1597,11 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank");
if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
- multi_chunk_io_col_threshold = H5D_MULTI_CHUNK_IO_COL_THRESHOLD; /* May replace by user-input */
- percent_nproc_per_chunk = multi_chunk_io_col_threshold;/* For example, above 50%, do collective IO */
+ /* Obtain the data transfer properties */
+ if(NULL == (plist = H5I_object(io_info->dxpl_id)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
+ percent_nproc_per_chunk=H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME);
+
threshold_nproc_per_chunk = mpi_size * percent_nproc_per_chunk/100;
/* Allocate memory */
diff --git a/src/H5Dprivate.h b/src/H5Dprivate.h
index 2117d30..42be608 100644
--- a/src/H5Dprivate.h
+++ b/src/H5Dprivate.h
@@ -141,6 +141,18 @@
#define H5D_XFER_IO_XFER_MODE_NAME "io_xfer_mode"
#define H5D_XFER_IO_XFER_MODE_SIZE sizeof(H5FD_mpio_xfer_t)
#define H5D_XFER_IO_XFER_MODE_DEF H5FD_MPIO_INDEPENDENT
+/* Definitions for the chunk IO optimization mode property (an H5FD_mpio_chunk_opt_t; default: ignore) */
+#define H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME "mpio_chunk_opt_hard"
+#define H5D_XFER_MPIO_CHUNK_OPT_HARD_SIZE sizeof(H5FD_mpio_chunk_opt_t)
+#define H5D_XFER_MPIO_CHUNK_OPT_HARD_DEF H5FD_MPIO_OPT_IGNORE
+/* Definitions for the linked-chunk IO threshold property (unsigned; average number of chunks per process) */
+#define H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME "mpio_chunk_opt_num"
+#define H5D_XFER_MPIO_CHUNK_OPT_NUM_SIZE sizeof(unsigned)
+#define H5D_XFER_MPIO_CHUNK_OPT_NUM_DEF H5D_ONE_LINK_CHUNK_IO_THRESHOLD
+/* Definitions for the per-chunk collective IO threshold property (unsigned; percentage of processes) */
+#define H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME "mpio_chunk_opt_ratio"
+#define H5D_XFER_MPIO_CHUNK_OPT_RATIO_SIZE sizeof(unsigned)
+#define H5D_XFER_MPIO_CHUNK_OPT_RATIO_DEF H5D_MULTI_CHUNK_IO_COL_THRESHOLD
/* Definitions for EDC property */
#define H5D_XFER_EDC_NAME "err_detect"
#define H5D_XFER_EDC_SIZE sizeof(H5Z_EDC_t)
diff --git a/src/H5FDmpi.h b/src/H5FDmpi.h
index aac7bf6..c9da439 100644
--- a/src/H5FDmpi.h
+++ b/src/H5FDmpi.h
@@ -21,12 +21,34 @@
#ifndef H5FDmpi_H
#define H5FDmpi_H
+/***** Macros for one linked-chunk collective IO case. *****/
+/* Default threshold for doing one linked collective IO for all chunks.
+ If the average number of chunks per process is greater than or equal to this
+ value, the library creates an MPI derived datatype linking all chunks and does one collective IO.
+ The user can override this value with H5Pset_dxpl_mpio_chunk_opt_num(). */
+
+#define H5D_ONE_LINK_CHUNK_IO_THRESHOLD 0
+/***** Macros for multi-chunk collective IO case. *****/
+/* Default threshold, as a percentage of all processes, for doing collective IO on a chunk.
+ If the percentage of processes holding selections in a chunk is greater than this value,
+ collective IO is done for that chunk. Override with H5Pset_dxpl_mpio_chunk_opt_ratio().
+*/
+
+#define H5D_MULTI_CHUNK_IO_COL_THRESHOLD 50
/* Type of I/O for data transfer properties */
typedef enum H5FD_mpio_xfer_t {
H5FD_MPIO_INDEPENDENT = 0, /*zero is the default*/
H5FD_MPIO_COLLECTIVE
} H5FD_mpio_xfer_t;
+/* Chunk IO optimization mode stored in data transfer property lists */
+typedef enum H5FD_mpio_chunk_opt_t {
+ H5FD_MPIO_OPT_IGNORE = 0, /*zero is the default: the library chooses the chunk IO strategy*/
+ H5FD_MPIO_OPT_ONE_IO, /*request one linked-chunk IO*/
+ H5FD_MPIO_OPT_MULTI_IO /*request multi-chunk IO*/
+} H5FD_mpio_chunk_opt_t;
+
+
#ifdef H5_HAVE_PARALLEL
/* Sub-class the H5FD_class_t to add more specific functions for MPI-based VFDs */
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index 3cf1968..e296094 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -529,6 +529,152 @@ H5Pget_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t *xfer_mode/*out*/)
done:
FUNC_LEAVE_API(ret_value)
}
+
+/*-------------------------------------------------------------------------
+ * Function: H5Pset_dxpl_mpio_chunk_opt
+ *
+ * Purpose:
+ * Set a flag that chooses linked-chunk IO or multi-chunk IO without
+ * involving the decision-making step inside HDF5.
+ *
+ * Description:
+ * Stores opt_mode in the H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME property of the
+ * transfer property list, then calls H5P_set_driver() with H5FD_MPIO. The
+ * request is not honored only when the low-level MPI-IO package doesn't
+ * support it.
+ * NOTE(review): H5D_chunk_collective_io() appears to test this property only
+ * against H5FD_MPIO_OPT_ONE_IO -- confirm H5FD_MPIO_OPT_MULTI_IO is honored.
+ *
+ * Parameters:
+ * hid_t dxpl_id in: Data transfer property list identifier
+ * H5FD_mpio_chunk_opt_t opt_mode in: Flag selecting linked-chunk or multi-chunk IO
+ *
+ * Returns:
+ * A non-negative value if successful; otherwise a negative value.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Pset_dxpl_mpio_chunk_opt(hid_t dxpl_id, H5FD_mpio_chunk_opt_t opt_mode)
+{
+ H5P_genplist_t *plist; /* Property list pointer */
+ herr_t ret_value; /* Return value */
+
+ FUNC_ENTER_API(H5Pset_dxpl_mpio_chunk_opt, FAIL)
+ /* TODO: no H5TRACE call has been written for this API yet */
+
+ if(dxpl_id==H5P_DEFAULT)
+ HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
+
+ /* Check arguments: the ID must refer to a dataset transfer property list */
+ if(NULL == (plist = H5P_object_verify(dxpl_id,H5P_DATASET_XFER)))
+ HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
+
+ /* Set the chunk optimization mode */
+ if (H5P_set(plist,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME,&opt_mode)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
+
+ /* Initialize driver-specific properties; on this path its status is the return value */
+ ret_value= H5P_set_driver(plist, H5FD_MPIO, NULL);
+
+done:
+ FUNC_LEAVE_API(ret_value)
+}
+
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5Pset_dxpl_mpio_chunk_opt_num
+ *
+ * Purpose:
+ * Set the threshold for doing linked-chunk IO.
+ *
+ * Description:
+ * Stores the value in the H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME property. Linked
+ * chunk IO is chosen when sum_chunk/mpi_size (average chunks per process) is
+ * >= this threshold; otherwise IO is done for every chunk.
+ *
+ * Parameters:
+ * hid_t dxpl_id in: Data transfer property list identifier
+ * unsigned num_chunk_per_proc in: threshold of the average number of chunks selected by each process
+ *
+ * Returns:
+ * A non-negative value if successful; otherwise a negative value.
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Pset_dxpl_mpio_chunk_opt_num(hid_t dxpl_id, unsigned num_chunk_per_proc)
+{
+ H5P_genplist_t *plist; /* Property list pointer */
+ herr_t ret_value; /* Return value */
+
+ FUNC_ENTER_API(H5Pset_dxpl_mpio_chunk_opt_num, FAIL)
+ /* TODO: no H5TRACE call has been written for this API yet */
+
+ if(dxpl_id==H5P_DEFAULT)
+ HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
+
+ /* Check arguments: the ID must refer to a dataset transfer property list */
+ if(NULL == (plist = H5P_object_verify(dxpl_id,H5P_DATASET_XFER)))
+ HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
+
+ /* Set the linked-chunk IO threshold */
+ if (H5P_set(plist,H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME,&num_chunk_per_proc)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
+
+ /* Initialize driver-specific properties; on this path its status is the return value */
+ ret_value= H5P_set_driver(plist, H5FD_MPIO, NULL);
+
+done:
+ FUNC_LEAVE_API(ret_value)
+}
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5Pset_dxpl_mpio_chunk_opt_ratio
+ *
+ * Purpose:
+ * Set the threshold for doing collective IO on each chunk.
+ *
+ * Description:
+ * The library calculates the percentage of processes holding selections in
+ * each chunk. If that percentage is greater than the threshold, collective
+ * IO is done for that chunk; otherwise independent IO is done for it.
+ *
+ * Parameters:
+ * hid_t dxpl_id in: Data transfer property list identifier
+ * unsigned percent_num_proc_per_chunk in: threshold percentage of processes holding selections per chunk (not range-checked here)
+ *
+ * Returns:
+ * A non-negative value if successful; otherwise a negative value.
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Pset_dxpl_mpio_chunk_opt_ratio(hid_t dxpl_id, unsigned percent_num_proc_per_chunk)
+{
+ H5P_genplist_t *plist; /* Property list pointer */
+ herr_t ret_value; /* Return value */
+
+ FUNC_ENTER_API(H5Pset_dxpl_mpio_chunk_opt_ratio, FAIL)
+ /* TODO: no H5TRACE call has been written for this API yet */
+
+ if(dxpl_id==H5P_DEFAULT)
+ HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
+
+ /* Check arguments: the ID must refer to a dataset transfer property list */
+ if(NULL == (plist = H5P_object_verify(dxpl_id,H5P_DATASET_XFER)))
+ HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
+
+ /* Set the per-chunk collective IO threshold (stored as given) */
+ if (H5P_set(plist,H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME,&percent_num_proc_per_chunk)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
+
+ /* Initialize driver-specific properties; on this path its status is the return value */
+ ret_value= H5P_set_driver(plist, H5FD_MPIO, NULL);
+
+done:
+ FUNC_LEAVE_API(ret_value)
+}
/*-------------------------------------------------------------------------
diff --git a/src/H5FDmpio.h b/src/H5FDmpio.h
index 912cbd8..d2ddd0e 100644
--- a/src/H5FDmpio.h
+++ b/src/H5FDmpio.h
@@ -51,6 +51,9 @@ H5_DLL herr_t H5Pget_fapl_mpio(hid_t fapl_id, MPI_Comm *comm/*out*/,
MPI_Info *info/*out*/);
H5_DLL herr_t H5Pset_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t xfer_mode);
H5_DLL herr_t H5Pget_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t *xfer_mode/*out*/);
+H5_DLL herr_t H5Pset_dxpl_mpio_chunk_opt(hid_t dxpl_id, H5FD_mpio_chunk_opt_t opt_mode); /* chunk IO optimization mode */
+H5_DLL herr_t H5Pset_dxpl_mpio_chunk_opt_num(hid_t dxpl_id, unsigned num_chunk_per_proc); /* linked-chunk IO threshold */
+H5_DLL herr_t H5Pset_dxpl_mpio_chunk_opt_ratio(hid_t dxpl_id, unsigned percent_num_proc_per_chunk); /* per-chunk collective IO threshold (percent) */
#ifdef __cplusplus
}
#endif