summaryrefslogtreecommitdiffstats
path: root/src/H5FDmpiposix.c
diff options
context:
space:
mode:
authorJohn Mainzer <mainzer@hdfgroup.org>2005-09-27 05:20:11 (GMT)
committerJohn Mainzer <mainzer@hdfgroup.org>2005-09-27 05:20:11 (GMT)
commitc100b0bf2639c03579ce1b2c4013b36c6f40350b (patch)
tree5c3b2834af1206110243886f357857900a8b108e /src/H5FDmpiposix.c
parentf9fc749ca218a878dbea4022ba1c2fb527f7822c (diff)
downloadhdf5-c100b0bf2639c03579ce1b2c4013b36c6f40350b.zip
hdf5-c100b0bf2639c03579ce1b2c4013b36c6f40350b.tar.gz
hdf5-c100b0bf2639c03579ce1b2c4013b36c6f40350b.tar.bz2
[svn-r11470] Purpose:
Repair synchronization bug in the metadata cache in PHDF5 Also repair numerous other bugs that surfaced in testing the bug fix. Description: While operations modifying metadata must be collective, we allow independant reads. This allows metadata caches on different processes to adjust to different sizes, and to place the entries on their dirty lists in different orders. Since only process 0 actually writes metadata to disk (all other processes thought they did, but the writes were discarded on the theory that they had to be collective), this made it possible for another process to modify metadata, flush it, and then read it back in in its original form (pre-modification) form. The possibilities for file corruption should be obvious. Solution: Make the policy that only process 0 can write to file explicit, and visible to the metadata caches. Thus only process 0 may flush dirty entries -- all other caches must retain dirty entries until they are informed by process 0 that the entries are clean. Synchronization is handled by counting the bytes of dirty cache entries created, and then synching up between the caches whenever the sum exceeds an (eventually user specified) limit. Dirty metadata creation is consistent across all processes because all operations modifying metadata must be collective. This change uncovered may bugs which are repaired in this checkin. It also required modification of H5HL and H5O to allocate file space on insertion rather than on flush from cache. Platforms tested: H5committest, heping(parallel & serial) Misc. update:
Diffstat (limited to 'src/H5FDmpiposix.c')
-rw-r--r--src/H5FDmpiposix.c34
1 files changed, 25 insertions, 9 deletions
diff --git a/src/H5FDmpiposix.c b/src/H5FDmpiposix.c
index 11e7849..de491f0 100644
--- a/src/H5FDmpiposix.c
+++ b/src/H5FDmpiposix.c
@@ -910,6 +910,12 @@ done:
*
* Modifications:
*
+ * John Mainzer -- 9/21/05
+ * Modified code to turn off the
+ * H5FD_FEAT_ACCUMULATE_METADATA_WRITE flag.
+ * With the movement of all cache writes to process 0,
+ * this flag has become problematic in PHDF5.
+ *
*-------------------------------------------------------------------------
*/
static herr_t
@@ -923,15 +929,6 @@ H5FD_mpiposix_query(const H5FD_t UNUSED *_file, unsigned long *flags /* out */)
if(flags) {
*flags=0;
*flags|=H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */
-
- /* Distinguish between updating the metadata accumulator on writes and
- * reads. This is particularly (perhaps only, even) important for MPI-I/O
- * where we guarantee that writes are collective, but reads may not be.
- * If we were to allow the metadata accumulator to be written during a
- * read operation, the application would hang.
- */
- *flags|=H5FD_FEAT_ACCUMULATE_METADATA_WRITE; /* OK to accumulate metadata for faster writes */
-
*flags|=H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
} /* end if */
@@ -1235,6 +1232,14 @@ H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
/* Metadata specific actions */
+ /* All metadata is now written from process 0 -- thus this function
+ * needs to be re-written to reflect this. For now I have simply
+ * commented out the code that attempts to synchronize metadata
+ * writes between processes, but we should really just flag an error
+ * whenever any process other than process 0 attempts to write
+ * metadata.
+ * -- JRM 9/1/05
+ */
if(type!=H5FD_MEM_DRAW) {
unsigned block_before_meta_write=0; /* Whether to block before a metadata write */
@@ -1252,9 +1257,11 @@ H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
if(H5P_get(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME,&block_before_meta_write)<0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get H5AC property")
+#if 0 /* JRM */
if(block_before_meta_write)
if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
+#endif /* JRM */
/* Only one process will do the actual write if all procs in comm write same metadata */
if (file->mpi_rank != H5_PAR_META_WRITE)
@@ -1328,6 +1335,14 @@ done:
file->pos = HADDR_UNDEF;
file->op = OP_UNKNOWN;
} /* end if */
+#if 0 /* JRM */
+ /* Since metadata writes are now done by process 0 only, this broadcast
+ * is no longer needed. I leave it in and commented out to remind us
+ * that we need to re-work this function to reflect this reallity.
+ *
+ * -- JRM 9/1/05
+ */
+
/* Guard against getting into metadata broadcast in failure cases */
else {
/* when only one process writes, need to broadcast the ret_value to other processes */
@@ -1336,6 +1351,7 @@ done:
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
} /* end if */
} /* end else */
+#endif /* JRM */
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpiposix_write() */