author    MuQun Yang <ymuqun@hdfgroup.org>    2006-08-09 03:00:11 (GMT)
committer MuQun Yang <ymuqun@hdfgroup.org>    2006-08-09 03:00:11 (GMT)
commit    6916816a563532fddc3699a6d5e4adb57212968d (patch)
tree      70121257e539ec369455ebd43119873fd96c7489    /src/H5FDmpio.c
parent    d17d42acd0fbba4b3433937f448c99930553b038 (diff)
download  hdf5-6916816a563532fddc3699a6d5e4adb57212968d.zip
          hdf5-6916816a563532fddc3699a6d5e4adb57212968d.tar.gz
          hdf5-6916816a563532fddc3699a6d5e4adb57212968d.tar.bz2
[svn-r12553] This check-in includes the following part of the parallel optimization code:

1. Provide another option for users to do independent IO with the MPI file set view (set collectively).
2. When the user requests collective IO, use independent IO with the MPI file set view if we find that collective IO is not good for the application in the per-chunk IO (multi-chunk IO) case. Previously we used pure independent IO, which actually performed small IO (one IO per row) in this case. A recent performance study suggested that independent IO with the file set view can achieve significantly better performance than collective IO when not many processes participate in the IO.
3. For applications that explicitly choose to do collective IO in the per-chunk case, the library won't do any optimization (gather/broadcast) operations; it simply passes the collective IO request on to MPI-IO.

Tested at copper, kagiso, heping, mir and tungsten (cmpi and mpich). Kagiso is using LAM; the t_mpi test was already broken there. The cchunk10 test failed at heping and mir; I suspect it is an MPICH problem and will investigate later. Everything passed at copper. At tungsten the old cmpi bug (failure at esetw) is still there; other tests passed. Some sequential fheap tests failed at kagiso.
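For illustration, here is a minimal usage sketch (not part of this check-in) of how an application might combine the new H5Pset_dxpl_mpio_collective_opt() call with a collective transfer request, so the MPI-IO driver does the low-level transfers independently while keeping the collectively set MPI file view. Only H5FD_MPIO_COLLECTIVE_IO appears in the diff below; the H5FD_MPIO_INDIVIDUAL_IO value and the write_with_individual_io() helper are assumptions made for the example.

#include "hdf5.h"

/*
 * Usage sketch (assumption, not from this check-in): request a collective
 * transfer, but tell the MPI-IO driver to issue the actual writes with
 * independent MPI-IO calls while keeping the MPI file set view.
 * H5FD_MPIO_INDIVIDUAL_IO is assumed to be the counterpart of
 * H5FD_MPIO_COLLECTIVE_IO in H5FD_mpio_collective_opt_t.
 */
static herr_t
write_with_individual_io(hid_t dset_id, hid_t mem_space, hid_t file_space,
                         const double *buf)
{
    hid_t  dxpl_id = H5Pcreate(H5P_DATASET_XFER);
    herr_t ret     = -1;

    if (dxpl_id < 0)
        return -1;

    /* Ask for the collective code path so the file view is set collectively */
    if (H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE) < 0)
        goto done;

    /* ...but do the actual data transfer with independent MPI-IO calls */
    if (H5Pset_dxpl_mpio_collective_opt(dxpl_id, H5FD_MPIO_INDIVIDUAL_IO) < 0)
        goto done;

    ret = H5Dwrite(dset_id, H5T_NATIVE_DOUBLE, mem_space, file_space,
                   dxpl_id, buf);

done:
    H5Pclose(dxpl_id);
    return ret;
}

Without the second property call the driver keeps the default collective path (MPI_File_write_at_all / MPI_File_read_at_all), as shown in the H5FD_mpio_read() and H5FD_mpio_write() hunks below.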
Diffstat (limited to 'src/H5FDmpio.c')
-rw-r--r--  src/H5FDmpio.c  147
1 file changed, 127 insertions, 20 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index 0be55e3..73bc2dc 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -531,7 +531,7 @@ done:
}
/*-------------------------------------------------------------------------
- * Function: H5Pset_dxpl_mpio_chunk_opt
+ * Function: H5Pset_dxpl_mpio_collective_opt
Purpose:
To set a flag to choose linked chunk IO or multi-chunk IO without
@@ -543,14 +543,66 @@ Description:
The library won't behave as it asks for only when we find
that the low-level MPI-IO package doesn't support this.
-Parameters:
+Parameters:
hid_t dxpl_id in: Data transfer property list identifier
H5FD_mpio_chunk_opt_t in: The optimization flag for linked chunk IO
or multi-chunk IO.
+
+
+Returns:
+Returns a non-negative value if successful. Otherwise returns a negative value.
+*
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Pset_dxpl_mpio_collective_opt(hid_t dxpl_id, H5FD_mpio_collective_opt_t opt_mode)
+{
+ H5P_genplist_t *plist; /* Property list pointer */
+ herr_t ret_value;
+
+ FUNC_ENTER_API(H5Pset_dxpl_mpio_collective_opt, FAIL)
+/* H5TRACE2("e","iDt",dxpl_id,xfer_mode);*/
+
+ if(dxpl_id==H5P_DEFAULT)
+ HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
+
+ /* Check arguments */
+ if(NULL == (plist = H5P_object_verify(dxpl_id,H5P_DATASET_XFER)))
+ HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
+ /* Set the transfer mode */
+ if (H5P_set(plist,H5D_XFER_MPIO_COLLECTIVE_OPT_NAME,&opt_mode)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
+
+ /* Initialize driver-specific properties */
+ ret_value= H5P_set_driver(plist, H5FD_MPIO, NULL);
+
+done:
+ FUNC_LEAVE_API(ret_value)
+}
-Returns:
-Returns a non-negative value if successful. Otherwise returns a negative value.
+
+/*-------------------------------------------------------------------------
+ * Function: H5Pset_dxpl_mpio_chunk_opt
+
+Purpose:
+ To set a flag to choose linked chunk IO or multi-chunk IO without
+ involving decision-making inside HDF5
+
+Description:
+ The library will do linked chunk IO or multi-chunk IO without
+ involving communications for decision-making process.
+ The library won't behave as it asks for only when we find
+ that the low-level MPI-IO package doesn't support this.
+
+Parameters:
+ hid_t dxpl_id in: Data transfer property list identifier
+ H5FD_mpio_chunk_opt_t in: The optimization flag for linked chunk IO
+ or multi-chunk IO.
+
+
+Returns:
+Returns a non-negative value if successful. Otherwise returns a negative value.
*
*-------------------------------------------------------------------------
*/
@@ -590,15 +642,15 @@ Purpose:
To set a threshold for doing linked chunk IO
Description:
- If the number is greater than the threshold set by the user,
+ If the number is greater than the threshold set by the user,
the library will do linked chunk IO; otherwise, IO will be done for every chunk.
-Parameters:
+Parameters:
hid_t dxpl_id in: Data transfer property list identifier
- unsigned num_proc_per_chunk in: the threshold of the average number of chunks selected by each process
+ unsigned num_proc_per_chunk in: the threshold of the average number of chunks selected by each process
-Returns:
-Returns a non-negative value if successful. Otherwise returns a negative value.
+Returns:
+Returns a non-negative value if successful. Otherwise returns a negative value.
*
*-------------------------------------------------------------------------
*/
@@ -637,13 +689,13 @@ Purpose:
To set a threshold for doing collective IO for each chunk
Description:
The library will calculate the percentage of the number of process holding selections at each chunk. If that percentage of number of process in the individual chunk is greater than the threshold set by the user, the library will do collective chunk IO for this chunk; otherwise, independent IO will be done for this chunk.
-Parameters:
- hid_t dxpl_id
+Parameters:
+ hid_t dxpl_id
in: Data transfer property list identifier
- unsigned percent_num_proc_per_chunk
+ unsigned percent_num_proc_per_chunk
in: the threshold of the percentage of the number of process holding selections per chunk
-Returns:
-Returns a non-negative value if successful. Otherwise returns a negative value.
+Returns:
+Returns a non-negative value if successful. Otherwise returns a negative value.
*
@@ -1343,6 +1395,7 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add
int n;
int type_size; /* MPI datatype used for I/O's size */
int io_size; /* Actual number of bytes requested */
+ H5P_genplist_t *plist; /* Property list pointer */
unsigned use_view_this_time=0;
herr_t ret_value=SUCCEED;
@@ -1377,7 +1430,6 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add
/* Only look for MPI views for raw data transfers */
if(type==H5FD_MEM_DRAW) {
- H5P_genplist_t *plist; /* Property list pointer */
H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
/* Obtain the data transfer properties */
@@ -1419,13 +1471,38 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add
/* Read the data. */
if (use_view_this_time) {
+ H5FD_mpio_collective_opt_t coll_opt_mode;
+ H5FD_mpio_collective_opt_t xfer_opt_mode;
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_Debug[(int)'t'])
fprintf(stdout, "H5FD_mpio_read: using MPIO collective mode\n");
#endif
+ /* Peek the collective_opt property to check whether the application wants to do IO individually. */
+ coll_opt_mode=(H5FD_mpio_collective_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_COLLECTIVE_OPT_NAME);
+
+ /* Peek the xfer_opt_mode property to check whether the application wants to do IO individually. */
+ xfer_opt_mode=(H5FD_mpio_collective_opt_t)H5P_peek_unsigned(plist,H5D_XFER_IO_XFER_OPT_MODE_NAME);
+
+ if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO && xfer_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_Debug[(int)'t'])
+ fprintf(stdout, "H5FD_mpio_read: doing MPI collective IO\n");
+#endif
+/* Temporarily change to read_at_all
+ if (MPI_SUCCESS!= (mpi_code=MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat )))*/
if (MPI_SUCCESS!= (mpi_code=MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat )))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
+ }
+ else {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_Debug[(int)'t'])
+ fprintf(stdout, "H5FD_mpio_read: doing MPI independent IO\n");
+#endif
+ if (MPI_SUCCESS!= (mpi_code=MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat )))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+ }
+
/*
* Reset the file view when we used MPI derived types
*/
@@ -1701,7 +1778,7 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
/* The metadata cache now only writes from process 0, which makes
* this synchronization incorrect. I'm leaving this code commented
* out instead of deleting it to remind us that we should re-write
- * this function so that a metadata write from any other process
+ * this function so that a metadata write from any other process
* should flag an error.
* -- JRM 9/1/05
*/
@@ -1725,15 +1802,45 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
/* Write the data. */
if (use_view_this_time) {
+ H5FD_mpio_collective_opt_t coll_opt_mode;
+ H5FD_mpio_collective_opt_t xfer_opt_mode;
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_Debug[(int)'t'])
fprintf(stdout, "H5FD_mpio_write: using MPIO collective mode\n");
#endif
+ /* Peek the collective_opt property to check whether the application wants to do IO individually. */
+ coll_opt_mode=(H5FD_mpio_collective_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_COLLECTIVE_OPT_NAME);
+
+ /* Peek the xfer_opt_mode property to check whether the application wants to do IO individually. */
+ xfer_opt_mode=(H5FD_mpio_collective_opt_t)H5P_peek_unsigned(plist,H5D_XFER_IO_XFER_OPT_MODE_NAME);
+
+
+
/*OKAY: CAST DISCARDS CONST QUALIFIER*/
+
+ if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO && xfer_opt_mode == H5FD_MPIO_COLLECTIVE_IO ) {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_Debug[(int)'t'])
+ fprintf(stdout, "H5FD_mpio_write: doing MPI collective IO\n");
+#endif
+ /* Temporarily change to _at
+if (MPI_SUCCESS != (mpi_code=MPI_File_write_at_all(file->f, mpi_off, (void*)buf, size_i, buf_type, &mpi_stat)))
+*/
if (MPI_SUCCESS != (mpi_code=MPI_File_write_at_all(file->f, mpi_off, (void*)buf, size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code)
+ }
+ else {
+#ifdef H5FDmpio_DEBUG
+ if (H5FD_mpio_Debug[(int)'t'])
+ fprintf(stdout, "H5FD_mpio_write: doing MPI independent IO\n");
+#endif
+
+ if (MPI_SUCCESS != (mpi_code=MPI_File_write_at(file->f, mpi_off, (void*)buf, size_i, buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
+ }
- /*
+
+ /*
* Reset the file view when we used MPI derived types
*/
/*OKAY: CAST DISCARDS CONST QUALIFIER*/
@@ -1772,13 +1879,13 @@ done:
#if 0 /* JRM */
/* Since metadata writes are now done by process 0 only, this broadcast
- * is no longer needed. I leave it in and commented out to remind us
+ * is no longer needed. I leave it in and commented out to remind us
 * that we need to re-work this function to reflect this reality.
*
* -- JRM 9/1/05
*/
- /* if only one process writes, need to broadcast the ret_value to
- * other processes
+ /* if only one process writes, need to broadcast the ret_value to
+ * other processes
*/
if (type!=H5FD_MEM_DRAW) {
if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, H5_PAR_META_WRITE, file->comm)))