summaryrefslogtreecommitdiffstats
path: root/src/H5FDmpiposix.c
diff options
context:
space:
mode:
authorQuincey Koziol <koziol@hdfgroup.org>2002-07-19 19:27:09 (GMT)
committerQuincey Koziol <koziol@hdfgroup.org>2002-07-19 19:27:09 (GMT)
commit99eee6dff9a38a1bab3d74b4b77dd8169e57409c (patch)
tree5db3dd6e5ed35a694c655337750dc741e30181fb /src/H5FDmpiposix.c
parent814ea8b962fec329f6a440d1f52f6b682de47524 (diff)
downloadhdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.zip
hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.tar.gz
hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.tar.bz2
[svn-r5814] Purpose:
Bug Fix
Description: It was possible to create corrupted metadata information (either in memory or in the file or both) with a parallel I/O program because of the way metadata writes were being handled for writes out of the metadata cache.
Solution: Added a dataset transfer property called "block before metadata write" which is used by the MPI-I/O and MPI-posix drivers to sync up all the processes before attempting a metadata write. This property is currently only for metadata writes from the metadata cache.
Platforms tested: IRIX64 6.5 (modi4) w/parallel
Diffstat (limited to 'src/H5FDmpiposix.c')
-rw-r--r--src/H5FDmpiposix.c56
1 files changed, 37 insertions, 19 deletions
diff --git a/src/H5FDmpiposix.c b/src/H5FDmpiposix.c
index 82533d0..32e5535 100644
--- a/src/H5FDmpiposix.c
+++ b/src/H5FDmpiposix.c
@@ -32,6 +32,7 @@
*
*/
#include "H5private.h" /*library functions */
+#include "H5ACprivate.h" /* Metadata cache */
#include "H5Eprivate.h" /*error handling */
#include "H5Fprivate.h" /*files */
#include "H5FDprivate.h" /*file driver */
@@ -965,21 +966,31 @@ done:
*
* Modifications:
*
+ * Quincey Koziol - 2002/07/18
+ * Added "block_before_meta_write" dataset transfer flag, which
+ * is set during writes from a metadata cache flush and indicates
+ * that all the processes must sync up before (one of them)
+ * writing metadata.
+ *
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t UNUSED dxpl_id, haddr_t addr,
+H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
size_t size, const void *buf)
{
H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
int mpi_code; /* MPI return code */
ssize_t nbytes; /* Number of bytes written each I/O call */
+ H5P_genplist_t *plist; /* Property list pointer */
+ unsigned block_before_meta_write=0; /* Whether to block before a metadata write */
herr_t ret_value=SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(H5FD_mpiposix_write, FAIL);
assert(file);
assert(H5FD_MPIPOSIX==file->pub.driver_id);
+ assert(H5I_GENPROP_LST==H5I_get_type(dxpl_id));
+ assert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
assert(buf);
/* Check for overflow conditions */
@@ -990,17 +1001,36 @@ H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t UNUSED dxpl_id, haddr_
if (addr+size>file->eoa)
HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow");
- /* Only p<round> will do the actual write if all procs in comm write same data */
- if ((type!=H5FD_MEM_DRAW) && H5_mpiposix_1_metawrite_g) {
- if (file->mpi_rank != file->mpi_round)
- HGOTO_DONE(SUCCEED) /* skip the actual write */
+ /* Obtain the data transfer properties */
+ if(NULL == (plist = H5I_object(dxpl_id)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list");
+
+ /* Metadata specific actions */
+ if(type!=H5FD_MEM_DRAW) {
+ /* Check if we need to synchronize all processes before attempting metadata write
+ * (Prevents race condition where the process writing the metadata goes ahead
+ * and writes the metadata to the file before all the processes have
+ * read the data, "transmitting" data from the "future" to the reading
+ * process. -QAK )
+ */
+ if(H5P_exist_plist(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME)>0)
+ if(H5P_get(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME,&block_before_meta_write)<0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get H5AC property");
+
+ if(block_before_meta_write)
+ if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+
+ /* Only p<round> will do the actual write if all procs in comm write same metadata */
+ if (H5_mpiposix_1_metawrite_g)
+ if (file->mpi_rank != file->mpi_round)
+ HGOTO_DONE(SUCCEED) /* skip the actual write */
} /* end if */
/* Seek to the correct location */
if ((addr!=file->pos || OP_WRITE!=file->op) &&
file_seek(file->fd, (file_offset_t)addr, SEEK_SET)<0)
- HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL,
- "unable to seek to proper position");
+ HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position");
/*
* Write the data, being careful of interrupted system calls and partial
@@ -1039,18 +1069,6 @@ done:
/* Round-robin rotate to the next process */
file->mpi_round = (++file->mpi_round)%file->mpi_size;
-#ifdef QAK
- {
- int max,min;
-
- MPI_Allreduce(&file->mpi_round, &max, 1, MPI_INT, MPI_MAX, file->comm);
- MPI_Allreduce(&file->mpi_round, &min, 1, MPI_INT, MPI_MIN, file->comm);
- if(max!=file->mpi_round)
- printf("%s: rank=%d, round=%d, max=%d\n",FUNC,file->mpi_rank,file->mpi_round,max);
- if(min!=file->mpi_round)
- printf("%s: rank=%d, round=%d, min=%d\n",FUNC,file->mpi_rank,file->mpi_round,min);
- }
-#endif /* QAK */
} /* end if */
} /* end else */