diff options
author | Quincey Koziol <koziol@hdfgroup.org> | 2002-07-19 19:27:09 (GMT) |
---|---|---|
committer | Quincey Koziol <koziol@hdfgroup.org> | 2002-07-19 19:27:09 (GMT) |
commit | 99eee6dff9a38a1bab3d74b4b77dd8169e57409c (patch) | |
tree | 5db3dd6e5ed35a694c655337750dc741e30181fb /src/H5FDmpio.c | |
parent | 814ea8b962fec329f6a440d1f52f6b682de47524 (diff) | |
download | hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.zip hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.tar.gz hdf5-99eee6dff9a38a1bab3d74b4b77dd8169e57409c.tar.bz2 |
[svn-r5814] Purpose:
Bug Fix
Description:
It was possible to create corrupted metadata information (either in memory
or in the file or both) with a parallel I/O program because of the way
metadata writes were being handled for writes out of the metadata cache.
Solution:
Added a dataset transfer property called "block before metadata write"
which is used by the MPI-I/O and MPI-posix drivers to sync up all the
processes before attempting a metadata write. This property is currently
only for metadata writes from the metadata cache.
Platforms tested:
IRIX64 6.5 (modi4) w/parallel
Diffstat (limited to 'src/H5FDmpio.c')
-rw-r--r-- | src/H5FDmpio.c | 69 |
1 files changed, 40 insertions, 29 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index df4b95c..023057f 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -8,11 +8,6 @@ * Purpose: This is the MPI-2 I/O driver. * * Limitations: - * H5FD_mpio_read & H5FD_mpio_write - * Eventually these should choose collective or independent i/o - * based on a parameter that is passed down to it from H5Dwrite, - * rather than the access_parms (which are fixed at the open). - * * H5FD_mpio_read * One implementation of MPI/MPI-IO causes MPI_Get_count * to return (incorrectly) a negative count. I (who?) added code @@ -21,6 +16,7 @@ * kluge is activated by #ifdef MPI_KLUGE0202. */ #include "H5private.h" /*library functions */ +#include "H5ACprivate.h" /* Metadata cache */ #include "H5Eprivate.h" /*error handling */ #include "H5Fprivate.h" /*files */ #include "H5FDprivate.h" /*file driver */ @@ -71,9 +67,9 @@ static herr_t H5FD_mpio_query(const H5FD_t *_f1, unsigned long *flags); static haddr_t H5FD_mpio_get_eoa(H5FD_t *_file); static herr_t H5FD_mpio_set_eoa(H5FD_t *_file, haddr_t addr); static haddr_t H5FD_mpio_get_eof(H5FD_t *_file); -static herr_t H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr, +static herr_t H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, void *buf); -static herr_t H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr, +static herr_t H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, const void *buf); static herr_t H5FD_mpio_flush(H5FD_t *_file, unsigned closing); @@ -1210,6 +1206,7 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add /* Make certain we have the correct type of property list */ assert(H5I_GENPROP_LST==H5I_get_type(dxpl_id)); assert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER)); + assert(buf); /* Portably initialize MPI status variable */ HDmemset(&mpi_stat,0,sizeof(MPI_Status)); @@ -1484,6 +1481,12 @@ done: * if the first I/O was a collective I/O using MPI derived types * and the next I/O was an independent I/O. * + * Quincey Koziol - 2002/07/18 + * Added "block_before_meta_write" dataset transfer flag, which + * is set during writes from a metadata cache flush and indicates + * that all the processes must sync up before (one of them) + * writing metadata. + * *------------------------------------------------------------------------- */ static herr_t @@ -1496,8 +1499,10 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, MPI_Offset mpi_off, mpi_disp; MPI_Status mpi_stat; MPI_Datatype buf_type, file_type; + int mpi_code; /* MPI return code */ int size_i, bytes_written; unsigned use_view_this_time=0; + unsigned block_before_meta_write=0; /* Whether to block before a metadata write */ H5P_genplist_t *plist; /* Property list pointer */ herr_t ret_value=SUCCEED; @@ -1512,6 +1517,7 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, /* Make certain we have the correct type of property list */ assert(H5I_GENPROP_LST==H5I_get_type(dxpl_id)); assert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER)); + assert(buf); /* Portably initialize MPI status variable */ HDmemset(&mpi_stat,0,sizeof(MPI_Status)); @@ -1582,19 +1588,36 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_File_set_view failed"); } /* end if */ - /* Only p<round> will do the actual write if all procs in comm write same data */ - if ((type!=H5FD_MEM_DRAW) && H5_mpi_1_metawrite_g) { - if (file->mpi_rank != file->mpi_round) { + /* Metadata specific actions */ + if(type!=H5FD_MEM_DRAW) { + /* Check if we need to syncronize all processes before attempting metadata write + * (Prevents race condition where the process writing the metadata goes ahead + * and writes the metadata to the file before all the processes have + * read the data, "transmitting" data from the "future" to the reading + * process. -QAK ) + */ + if(H5P_exist_plist(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME)>0) + if(H5P_get(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME,&block_before_meta_write)<0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get H5AC property"); + + if(block_before_meta_write) + if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + + /* Only p<round> will do the actual write if all procs in comm write same metadata */ + if (H5_mpi_1_metawrite_g) { + if (file->mpi_rank != file->mpi_round) { #ifdef H5FDmpio_DEBUG - if (H5FD_mpio_Debug[(int)'w']) { - fprintf(stdout, - " proc %d: in H5FD_mpio_write (write omitted)\n", - file->mpi_rank ); - } + if (H5FD_mpio_Debug[(int)'w']) { + fprintf(stdout, + " proc %d: in H5FD_mpio_write (write omitted)\n", + file->mpi_rank ); + } #endif - HGOTO_DONE(SUCCEED) /* skip the actual write */ + HGOTO_DONE(SUCCEED) /* skip the actual write */ + } } - } + } /* end if */ /* Write the data. */ assert(H5FD_MPIO_INDEPENDENT==dx->xfer_mode || H5FD_MPIO_COLLECTIVE==dx->xfer_mode); @@ -1681,18 +1704,6 @@ done: /* Round-robin rotate to the next process */ file->mpi_round = (++file->mpi_round)%file->mpi_size; -#ifdef QAK - { - int max,min; - - MPI_Allreduce(&file->mpi_round, &max, 1, MPI_INT, MPI_MAX, file->comm); - MPI_Allreduce(&file->mpi_round, &min, 1, MPI_INT, MPI_MIN, file->comm); - if(max!=file->mpi_round) - printf("%s: rank=%d, round=%d, max=%d\n",FUNC,file->mpi_rank,file->mpi_round,max); - if(min!=file->mpi_round) - printf("%s: rank=%d, round=%d, min=%d\n",FUNC,file->mpi_rank,file->mpi_round,min); - } -#endif /* QAK */ } /* end if */ } /* end if */ |