diff options
author | Quincey Koziol <koziol@hdfgroup.org> | 2002-07-19 19:27:09 (GMT) |
---|---|---|
committer | Quincey Koziol <koziol@hdfgroup.org> | 2002-07-19 19:27:09 (GMT) |
commit | 99eee6dff9a38a1bab3d74b4b77dd8169e57409c (patch) | |
tree | 5db3dd6e5ed35a694c655337750dc741e30181fb /src/H5FDmpiposix.c | |
parent | 814ea8b962fec329f6a440d1f52f6b682de47524 (diff) | |
download | hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.zip hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.tar.gz hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.tar.bz2 |
[svn-r5814] Purpose:
Bug Fix
Description:
A parallel I/O program could create corrupted metadata information (either
in memory or in the file or both) because of the way writes out of the
metadata cache were being handled.
Solution:
Added a dataset transfer property called "block before metadata write"
which is used by the MPI-I/O and MPI-posix drivers to sync up all the
processes before attempting a metadata write. This property is currently
set only for metadata writes from the metadata cache.
Platforms tested:
IRIX64 6.5 (modi4) w/parallel
Diffstat (limited to 'src/H5FDmpiposix.c')
-rw-r--r-- | src/H5FDmpiposix.c | 56 |
1 file changed, 37 insertions, 19 deletions
diff --git a/src/H5FDmpiposix.c b/src/H5FDmpiposix.c index 82533d0..32e5535 100644 --- a/src/H5FDmpiposix.c +++ b/src/H5FDmpiposix.c @@ -32,6 +32,7 @@ * */ #include "H5private.h" /*library functions */ +#include "H5ACprivate.h" /* Metadata cache */ #include "H5Eprivate.h" /*error handling */ #include "H5Fprivate.h" /*files */ #include "H5FDprivate.h" /*file driver */ @@ -965,21 +966,31 @@ done: * * Modifications: * + * Quincey Koziol - 2002/07/18 + * Added "block_before_meta_write" dataset transfer flag, which + * is set during writes from a metadata cache flush and indicates + * that all the processes must sync up before (one of them) + * writing metadata. + * *------------------------------------------------------------------------- */ static herr_t -H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t UNUSED dxpl_id, haddr_t addr, +H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, const void *buf) { H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; int mpi_code; /* MPI return code */ ssize_t nbytes; /* Number of bytes written each I/O call */ + H5P_genplist_t *plist; /* Property list pointer */ + unsigned block_before_meta_write=0; /* Whether to block before a metadata write */ herr_t ret_value=SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(H5FD_mpiposix_write, FAIL); assert(file); assert(H5FD_MPIPOSIX==file->pub.driver_id); + assert(H5I_GENPROP_LST==H5I_get_type(dxpl_id)); + assert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER)); assert(buf); /* Check for overflow conditions */ @@ -990,17 +1001,36 @@ H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t UNUSED dxpl_id, haddr_ if (addr+size>file->eoa) HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow"); - /* Only p<round> will do the actual write if all procs in comm write same data */ - if ((type!=H5FD_MEM_DRAW) && H5_mpiposix_1_metawrite_g) { - if (file->mpi_rank != file->mpi_round) - HGOTO_DONE(SUCCEED) /* skip the actual write */ + /* Obtain the data 
transfer properties */ + if(NULL == (plist = H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list"); + + /* Metadata specific actions */ + if(type!=H5FD_MEM_DRAW) { + /* Check if we need to syncronize all processes before attempting metadata write + * (Prevents race condition where the process writing the metadata goes ahead + * and writes the metadata to the file before all the processes have + * read the data, "transmitting" data from the "future" to the reading + * process. -QAK ) + */ + if(H5P_exist_plist(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME)>0) + if(H5P_get(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME,&block_before_meta_write)<0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get H5AC property"); + + if(block_before_meta_write) + if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + + /* Only p<round> will do the actual write if all procs in comm write same metadata */ + if (H5_mpiposix_1_metawrite_g) + if (file->mpi_rank != file->mpi_round) + HGOTO_DONE(SUCCEED) /* skip the actual write */ } /* end if */ /* Seek to the correct location */ if ((addr!=file->pos || OP_WRITE!=file->op) && file_seek(file->fd, (file_offset_t)addr, SEEK_SET)<0) - HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, - "unable to seek to proper position"); + HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position"); /* * Write the data, being careful of interrupted system calls and partial @@ -1039,18 +1069,6 @@ done: /* Round-robin rotate to the next process */ file->mpi_round = (++file->mpi_round)%file->mpi_size; -#ifdef QAK - { - int max,min; - - MPI_Allreduce(&file->mpi_round, &max, 1, MPI_INT, MPI_MAX, file->comm); - MPI_Allreduce(&file->mpi_round, &min, 1, MPI_INT, MPI_MIN, file->comm); - if(max!=file->mpi_round) - printf("%s: rank=%d, round=%d, max=%d\n",FUNC,file->mpi_rank,file->mpi_round,max); - if(min!=file->mpi_round) - printf("%s: rank=%d, round=%d, 
min=%d\n",FUNC,file->mpi_rank,file->mpi_round,min); - } -#endif /* QAK */ } /* end if */ } /* end else */ |