diff options
author | John Mainzer <mainzer@hdfgroup.org> | 2005-09-27 05:20:11 (GMT) |
---|---|---|
committer | John Mainzer <mainzer@hdfgroup.org> | 2005-09-27 05:20:11 (GMT) |
commit | c100b0bf2639c03579ce1b2c4013b36c6f40350b (patch) | |
tree | 5c3b2834af1206110243886f357857900a8b108e /src/H5FDmpiposix.c | |
parent | f9fc749ca218a878dbea4022ba1c2fb527f7822c (diff) | |
download | hdf5-c100b0bf2639c03579ce1b2c4013b36c6f40350b.zip hdf5-c100b0bf2639c03579ce1b2c4013b36c6f40350b.tar.gz hdf5-c100b0bf2639c03579ce1b2c4013b36c6f40350b.tar.bz2 |
[svn-r11470] Purpose:
Repair synchronization bug in the metadata cache in PHDF5
Also repair numerous other bugs that surfaced in testing the
bug fix.
Description:
While operations modifying metadata must be collective, we allow
independant reads. This allows metadata caches on different processes
to adjust to different sizes, and to place the entries on their dirty
lists in different orders. Since only process 0 actually writes
metadata to disk (all other processes thought they did, but the writes
were discarded on the theory that they had to be collective), this made
it possible for another process to modify metadata, flush it, and then
read it back in in its original form (pre-modification) form. The
possibilities for file corruption should be obvious.
Solution:
Make the policy that only process 0 can write to file explicit, and
visible to the metadata caches. Thus only process 0 may flush dirty
entries -- all other caches must retain dirty entries until they are
informed by process 0 that the entries are clean.
Synchronization is handled by counting the bytes of dirty cache entries
created, and then synching up between the caches whenever the sum
exceeds an (eventually user specified) limit. Dirty metadata creation
is consistent across all processes because all operations modifying
metadata must be collective.
This change uncovered may bugs which are repaired in this checkin.
It also required modification of H5HL and H5O to allocate file space
on insertion rather than on flush from cache.
Platforms tested:
H5committest, heping(parallel & serial)
Misc. update:
Diffstat (limited to 'src/H5FDmpiposix.c')
-rw-r--r-- | src/H5FDmpiposix.c | 34 |
1 files changed, 25 insertions, 9 deletions
diff --git a/src/H5FDmpiposix.c b/src/H5FDmpiposix.c index 11e7849..de491f0 100644 --- a/src/H5FDmpiposix.c +++ b/src/H5FDmpiposix.c @@ -910,6 +910,12 @@ done: * * Modifications: * + * John Mainzer -- 9/21/05 + * Modified code to turn off the + * H5FD_FEAT_ACCUMULATE_METADATA_WRITE flag. + * With the movement of all cache writes to process 0, + * this flag has become problematic in PHDF5. + * *------------------------------------------------------------------------- */ static herr_t @@ -923,15 +929,6 @@ H5FD_mpiposix_query(const H5FD_t UNUSED *_file, unsigned long *flags /* out */) if(flags) { *flags=0; *flags|=H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */ - - /* Distinguish between updating the metadata accumulator on writes and - * reads. This is particularly (perhaps only, even) important for MPI-I/O - * where we guarantee that writes are collective, but reads may not be. - * If we were to allow the metadata accumulator to be written during a - * read operation, the application would hang. - */ - *flags|=H5FD_FEAT_ACCUMULATE_METADATA_WRITE; /* OK to accumulate metadata for faster writes */ - *flags|=H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */ } /* end if */ @@ -1235,6 +1232,14 @@ H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list") /* Metadata specific actions */ + /* All metadata is now written from process 0 -- thus this function + * needs to be re-written to reflect this. For now I have simply + * commented out the code that attempts to synchronize metadata + * writes between processes, but we should really just flag an error + * whenever any process other than process 0 attempts to write + * metadata. + * -- JRM 9/1/05 + */ if(type!=H5FD_MEM_DRAW) { unsigned block_before_meta_write=0; /* Whether to block before a metadata write */ @@ -1252,9 +1257,11 @@ H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, if(H5P_get(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME,&block_before_meta_write)<0) HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get H5AC property") +#if 0 /* JRM */ if(block_before_meta_write) if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) +#endif /* JRM */ /* Only one process will do the actual write if all procs in comm write same metadata */ if (file->mpi_rank != H5_PAR_META_WRITE) @@ -1328,6 +1335,14 @@ done: file->pos = HADDR_UNDEF; file->op = OP_UNKNOWN; } /* end if */ +#if 0 /* JRM */ + /* Since metadata writes are now done by process 0 only, this broadcast + * is no longer needed. I leave it in and commented out to remind us + * that we need to re-work this function to reflect this reallity. + * + * -- JRM 9/1/05 + */ + /* Guard against getting into metadata broadcast in failure cases */ else { /* when only one process writes, need to broadcast the ret_value to other processes */ @@ -1336,6 +1351,7 @@ done: HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code) } /* end if */ } /* end else */ +#endif /* JRM */ FUNC_LEAVE_NOAPI(ret_value) } /* end H5FD_mpiposix_write() */ |