summaryrefslogtreecommitdiffstats
path: root/src/H5FDmpio.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/H5FDmpio.c')
-rw-r--r--src/H5FDmpio.c126
1 files changed, 79 insertions, 47 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index 87f8b6a..efbdafe 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -1414,32 +1414,35 @@ static herr_t
H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size, void *buf/*out*/)
{
- H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
- MPI_Offset mpi_off;
- MPI_Status mpi_stat; /* Status from I/O operation */
- int mpi_code; /* mpi return code */
- MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
- int size_i; /* Integer copy of 'size' to read */
+ H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
+ MPI_Offset mpi_off;
+ MPI_Status mpi_stat; /* Status from I/O operation */
+ int mpi_code; /* mpi return code */
+ MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
+ int size_i; /* Integer copy of 'size' to read */
#if MPI_VERSION >= 3
- MPI_Count bytes_read; /* Number of bytes read in */
- MPI_Count type_size; /* MPI datatype used for I/O's size */
- MPI_Count io_size; /* Actual number of bytes requested */
- MPI_Count n;
+ MPI_Count bytes_read = 0; /* Number of bytes read in */
+ MPI_Count type_size; /* MPI datatype used for I/O's size */
+ MPI_Count io_size; /* Actual number of bytes requested */
+ MPI_Count n;
#else
- int bytes_read; /* Number of bytes read in */
- int type_size; /* MPI datatype used for I/O's size */
- int io_size; /* Actual number of bytes requested */
- int n;
+ int bytes_read = 0; /* Number of bytes read in */
+ int type_size; /* MPI datatype used for I/O's size */
+ int io_size; /* Actual number of bytes requested */
+ int n;
#endif
- hbool_t use_view_this_time = FALSE;
- herr_t ret_value = SUCCEED;
+ hbool_t use_view_this_time = FALSE;
+ hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */
+ herr_t ret_value = SUCCEED;
FUNC_ENTER_NOAPI_NOINIT
#ifdef H5FDmpio_DEBUG
- if (H5FD_mpio_Debug[(int)'t'])
- fprintf(stdout, "Entering H5FD_mpio_read\n" );
+ if(H5FD_mpio_Debug[(int)'t'])
+ HDfprintf(stdout, "%s: Entering\n", FUNC);
#endif
+
+ /* Sanity checks */
HDassert(file);
HDassert(H5FD_MPIO==file->pub.driver_id);
HDassert(buf);
@@ -1448,21 +1451,20 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
HDmemset(&mpi_stat,0,sizeof(MPI_Status));
/* some numeric conversions */
- if (H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off/*out*/)<0)
+ if(H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off/*out*/) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
size_i = (int)size;
- if ((hsize_t)size_i != size)
+ if((hsize_t)size_i != size)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from size to size_i")
#ifdef H5FDmpio_DEBUG
- if (H5FD_mpio_Debug[(int)'r'])
- fprintf(stdout, "in H5FD_mpio_read mpi_off=%ld size_i=%d\n",
- (long)mpi_off, size_i );
+ if(H5FD_mpio_Debug[(int)'r'])
+ HDfprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", FUNC, (long)mpi_off, size_i);
#endif
/* Only look for MPI views for raw data transfers */
if(type == H5FD_MEM_DRAW) {
- H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
+ H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
/* Get the transfer mode from the API context */
if(H5CX_get_io_xfer_mode(&xfer_mode) < 0)
@@ -1474,8 +1476,8 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
* us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which
* could mean "use MPI_BYTE" by convention).
*/
- if(xfer_mode==H5FD_MPIO_COLLECTIVE) {
- MPI_Datatype file_type;
+ if(xfer_mode == H5FD_MPIO_COLLECTIVE) {
+ MPI_Datatype file_type;
/* Remember that views are used */
use_view_this_time = TRUE;
@@ -1502,8 +1504,8 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
H5FD_mpio_collective_opt_t coll_opt_mode;
#ifdef H5FDmpio_DEBUG
- if (H5FD_mpio_Debug[(int)'t'])
- fprintf(stdout, "H5FD_mpio_read: using MPIO collective mode\n");
+ if(H5FD_mpio_Debug[(int)'r'])
+ HDfprintf(stdout, "%s: using MPIO collective mode\n", FUNC);
#endif
/* Get the collective_opt property to check whether the application wants to do IO individually. */
if(H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
@@ -1511,16 +1513,33 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
#ifdef H5FDmpio_DEBUG
- if(H5FD_mpio_Debug[(int)'t'])
- fprintf(stdout, "H5FD_mpio_read: doing MPI collective IO\n");
+ if(H5FD_mpio_Debug[(int)'r'])
+ HDfprintf(stdout, "%s: doing MPI collective IO\n", FUNC);
#endif
- if(MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
- HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
+ /* Check whether we should read from rank 0 and broadcast to other ranks */
+ if(H5CX_get_mpio_rank0_bcast()) {
+#ifdef H5FDmpio_DEBUG
+ if(H5FD_mpio_Debug[(int)'r'])
+ HDfprintf(stdout, "%s: doing read-rank0-and-MPI_Bcast\n", FUNC);
+#endif
+ /* Indicate path we've taken */
+ rank0_bcast = TRUE;
+
+ /* Read on rank 0 Bcast to other ranks */
+ if(file->mpi_rank == 0)
+ if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+ if(MPI_SUCCESS != (mpi_code = MPI_Bcast(buf, size_i, buf_type, 0, file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+ } /* end if */
+ else
+ if(MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
} /* end if */
else {
#ifdef H5FDmpio_DEBUG
- if(H5FD_mpio_Debug[(int)'t'])
- fprintf(stdout, "H5FD_mpio_read: doing MPI independent IO\n");
+ if(H5FD_mpio_Debug[(int)'r'])
+ HDfprintf(stdout, "%s: doing MPI independent IO\n", FUNC);
#endif
if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
@@ -1532,44 +1551,57 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
*/
if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
- } else {
+ } /* end if */
+ else
if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
- }
- /* How many bytes were actually read? */
+ /* Only retrieve bytes read if this rank _actually_ participated in I/O */
+ if(!rank0_bcast || (rank0_bcast && file->mpi_rank == 0) ) {
+ /* How many bytes were actually read? */
#if MPI_VERSION >= 3
- if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
+ if(MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
#else
- if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+ if(MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
#endif
- HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ } /* end if */
+
+ /* If the rank0-bcast feature was used, broadcast the # of bytes read to
+ * other ranks, which didn't perform any I/O.
+ */
+ /* NOTE: This could be optimized further to be combined with the broadcast
+ * of the data. (QAK - 2019/1/2)
+ */
+ if(rank0_bcast)
+ if(MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_LONG_LONG, 0, file->comm))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0)
/* Get the type's size */
#if MPI_VERSION >= 3
- if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
#else
- if (MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
#endif
HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
/* Compute the actual number of bytes requested */
- io_size=type_size*size_i;
+ io_size = type_size * size_i;
/* Check for read failure */
- if (bytes_read<0 || bytes_read>io_size)
+ if(bytes_read < 0 || bytes_read > io_size)
HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed")
/*
* This gives us zeroes beyond end of physical MPI file.
*/
- if ((n=(io_size-bytes_read)) > 0)
+ if((n = (io_size - bytes_read)) > 0)
HDmemset((char*)buf+bytes_read, 0, (size_t)n);
done:
#ifdef H5FDmpio_DEBUG
- if (H5FD_mpio_Debug[(int)'t'])
- fprintf(stdout, "Leaving H5FD_mpio_read\n" );
+ if(H5FD_mpio_Debug[(int)'t'])
+ HDfprintf(stdout, "%s: Leaving\n", FUNC);
#endif
FUNC_LEAVE_NOAPI(ret_value)