summaryrefslogtreecommitdiffstats
path: root/src/H5FDmpio.c
diff options
context:
space:
mode:
authorQuincey Koziol <koziol@hdfgroup.org>2002-07-19 19:27:09 (GMT)
committerQuincey Koziol <koziol@hdfgroup.org>2002-07-19 19:27:09 (GMT)
commit99eee6dff9a38a1bab3d74b4b77dd8169e57409c (patch)
tree5db3dd6e5ed35a694c655337750dc741e30181fb /src/H5FDmpio.c
parent814ea8b962fec329f6a440d1f52f6b682de47524 (diff)
downloadhdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.zip
hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.tar.gz
hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.tar.bz2
[svn-r5814] Purpose:
Bug Fix Description: It was possible to create corrupted metadata information (either in memory or in the file or both) with a parallel I/O program because of the way metadata writes were being handled for writes out of the metadata cache. Solution: Added a dataset transfer property called "block before metadata write" which is used by the MPI-I/O and MPI-posix drivers to sync up all the processes before attempting a metadata write. This property is currently only for metadata writes from the metadata cache. Platforms tested: IRIX64 6.5 (modi4) w/parallel
Diffstat (limited to 'src/H5FDmpio.c')
-rw-r--r--src/H5FDmpio.c69
1 files changed, 40 insertions, 29 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index df4b95c..023057f 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -8,11 +8,6 @@
* Purpose: This is the MPI-2 I/O driver.
*
* Limitations:
- * H5FD_mpio_read & H5FD_mpio_write
- * Eventually these should choose collective or independent i/o
- * based on a parameter that is passed down to it from H5Dwrite,
- * rather than the access_parms (which are fixed at the open).
- *
* H5FD_mpio_read
* One implementation of MPI/MPI-IO causes MPI_Get_count
* to return (incorrectly) a negative count. I (who?) added code
@@ -21,6 +16,7 @@
* kluge is activated by #ifdef MPI_KLUGE0202.
*/
#include "H5private.h" /*library functions */
+#include "H5ACprivate.h" /* Metadata cache */
#include "H5Eprivate.h" /*error handling */
#include "H5Fprivate.h" /*files */
#include "H5FDprivate.h" /*file driver */
@@ -71,9 +67,9 @@ static herr_t H5FD_mpio_query(const H5FD_t *_f1, unsigned long *flags);
static haddr_t H5FD_mpio_get_eoa(H5FD_t *_file);
static herr_t H5FD_mpio_set_eoa(H5FD_t *_file, haddr_t addr);
static haddr_t H5FD_mpio_get_eof(H5FD_t *_file);
-static herr_t H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr,
+static herr_t H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
size_t size, void *buf);
-static herr_t H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr,
+static herr_t H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
size_t size, const void *buf);
static herr_t H5FD_mpio_flush(H5FD_t *_file, unsigned closing);
@@ -1210,6 +1206,7 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add
/* Make certain we have the correct type of property list */
assert(H5I_GENPROP_LST==H5I_get_type(dxpl_id));
assert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
+ assert(buf);
/* Portably initialize MPI status variable */
HDmemset(&mpi_stat,0,sizeof(MPI_Status));
@@ -1484,6 +1481,12 @@ done:
* if the first I/O was a collective I/O using MPI derived types
* and the next I/O was an independent I/O.
*
+ * Quincey Koziol - 2002/07/18
+ * Added "block_before_meta_write" dataset transfer flag, which
+ * is set during writes from a metadata cache flush and indicates
+ * that all the processes must sync up before (one of them)
+ * writing metadata.
+ *
*-------------------------------------------------------------------------
*/
static herr_t
@@ -1496,8 +1499,10 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
MPI_Offset mpi_off, mpi_disp;
MPI_Status mpi_stat;
MPI_Datatype buf_type, file_type;
+ int mpi_code; /* MPI return code */
int size_i, bytes_written;
unsigned use_view_this_time=0;
+ unsigned block_before_meta_write=0; /* Whether to block before a metadata write */
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value=SUCCEED;
@@ -1512,6 +1517,7 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
/* Make certain we have the correct type of property list */
assert(H5I_GENPROP_LST==H5I_get_type(dxpl_id));
assert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
+ assert(buf);
/* Portably initialize MPI status variable */
HDmemset(&mpi_stat,0,sizeof(MPI_Status));
@@ -1582,19 +1588,36 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_File_set_view failed");
} /* end if */
- /* Only p<round> will do the actual write if all procs in comm write same data */
- if ((type!=H5FD_MEM_DRAW) && H5_mpi_1_metawrite_g) {
- if (file->mpi_rank != file->mpi_round) {
+ /* Metadata specific actions */
+ if(type!=H5FD_MEM_DRAW) {
+ /* Check if we need to syncronize all processes before attempting metadata write
+ * (Prevents race condition where the process writing the metadata goes ahead
+ * and writes the metadata to the file before all the processes have
+ * read the data, "transmitting" data from the "future" to the reading
+ * process. -QAK )
+ */
+ if(H5P_exist_plist(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME)>0)
+ if(H5P_get(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME,&block_before_meta_write)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get H5AC property");
+
+ if(block_before_meta_write)
+ if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+
+ /* Only p<round> will do the actual write if all procs in comm write same metadata */
+ if (H5_mpi_1_metawrite_g) {
+ if (file->mpi_rank != file->mpi_round) {
#ifdef H5FDmpio_DEBUG
- if (H5FD_mpio_Debug[(int)'w']) {
- fprintf(stdout,
- " proc %d: in H5FD_mpio_write (write omitted)\n",
- file->mpi_rank );
- }
+ if (H5FD_mpio_Debug[(int)'w']) {
+ fprintf(stdout,
+ " proc %d: in H5FD_mpio_write (write omitted)\n",
+ file->mpi_rank );
+ }
#endif
- HGOTO_DONE(SUCCEED) /* skip the actual write */
+ HGOTO_DONE(SUCCEED) /* skip the actual write */
+ }
}
- }
+ } /* end if */
/* Write the data. */
assert(H5FD_MPIO_INDEPENDENT==dx->xfer_mode || H5FD_MPIO_COLLECTIVE==dx->xfer_mode);
@@ -1681,18 +1704,6 @@ done:
/* Round-robin rotate to the next process */
file->mpi_round = (++file->mpi_round)%file->mpi_size;
-#ifdef QAK
- {
- int max,min;
-
- MPI_Allreduce(&file->mpi_round, &max, 1, MPI_INT, MPI_MAX, file->comm);
- MPI_Allreduce(&file->mpi_round, &min, 1, MPI_INT, MPI_MIN, file->comm);
- if(max!=file->mpi_round)
- printf("%s: rank=%d, round=%d, max=%d\n",FUNC,file->mpi_rank,file->mpi_round,max);
- if(min!=file->mpi_round)
- printf("%s: rank=%d, round=%d, min=%d\n",FUNC,file->mpi_rank,file->mpi_round,min);
- }
-#endif /* QAK */
} /* end if */
} /* end if */