| author | John Mainzer <mainzer@hdfgroup.org> | 2005-09-27 05:20:11 (GMT) |
|---|---|---|
| committer | John Mainzer <mainzer@hdfgroup.org> | 2005-09-27 05:20:11 (GMT) |
| commit | c100b0bf2639c03579ce1b2c4013b36c6f40350b (patch) | |
| tree | 5c3b2834af1206110243886f357857900a8b108e /src/H5FDmpio.c | |
| parent | f9fc749ca218a878dbea4022ba1c2fb527f7822c (diff) | |
| download | hdf5-c100b0bf2639c03579ce1b2c4013b36c6f40350b.zip hdf5-c100b0bf2639c03579ce1b2c4013b36c6f40350b.tar.gz hdf5-c100b0bf2639c03579ce1b2c4013b36c6f40350b.tar.bz2 | |
[svn-r11470] Purpose:
Repair synchronization bug in the metadata cache in PHDF5
Also repair numerous other bugs that surfaced in testing the
bug fix.
Description:
While operations modifying metadata must be collective, we allow
independent reads. This allows the metadata caches on different processes
to adjust to different sizes and to place entries on their dirty
lists in different orders. Since only process 0 actually writes
metadata to disk (the other processes thought they did as well, but their
writes were discarded on the theory that the writes had to be collective),
it was possible for another process to modify metadata, flush it, and then
read it back in its original (pre-modification) form. The possibilities
for file corruption should be obvious.
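As an illustration of this failure mode, here is a minimal stand-alone MPI sketch, not HDF5 code: a toy driver that discards metadata writes from every rank other than 0, so a non-zero rank that modifies, flushes, and re-reads an entry gets the stale data back. All names in it (driver_write, driver_read, fake_file_entry) are hypothetical.

```c
/* Toy model of the old driver behaviour: metadata writes from ranks
 * other than 0 are silently discarded.  Not HDF5 code.
 */
#include <mpi.h>
#include <stdio.h>
#include <string.h>

static char fake_file_entry[16] = "old";   /* on-disk metadata as seen by this rank */

static void driver_write(int rank, const char *buf)
{
    if (rank == 0)                         /* only rank 0's write reaches the file  */
        strncpy(fake_file_entry, buf, sizeof(fake_file_entry));
}

static void driver_read(char *buf)
{
    strncpy(buf, fake_file_entry, 16);
}

int main(int argc, char **argv)
{
    int  rank;
    char cached[16];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    strncpy(cached, "new", sizeof(cached)); /* collective metadata modification      */

    if (rank != 0) {
        driver_write(rank, cached);         /* "flush" -- silently discarded          */
        driver_read(cached);                /* evict + re-read -> stale "old" again   */
        printf("rank %d re-read \"%s\" after writing \"new\"\n", rank, cached);
    }

    MPI_Finalize();
    return 0;
}
```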
Solution:
Make the policy that only process 0 can write to the file explicit, and
visible to the metadata caches. Thus only process 0 may flush dirty
entries -- all other caches must retain dirty entries until they are
informed by process 0 that the entries are clean.
Synchronization is handled by counting the bytes of dirty cache entries
created, and then syncing up between the caches whenever the sum
exceeds an (eventually user specified) limit; see the sketch after this
section. Dirty metadata creation is consistent across all processes
because all operations modifying metadata must be collective.
This change uncovered many bugs, which are repaired in this checkin.
It also required modifying H5HL and H5O to allocate file space
on insertion rather than on flush from cache.
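The threshold-based synchronization described above might look roughly like the following stand-alone sketch. The names used here (note_dirty_metadata, DIRTY_BYTES_THRESHOLD, the flushed-address list) are illustrative assumptions, not the actual H5AC/H5C API.

```c
/* Rough sketch of the rank-0-writes sync protocol described above.
 * All names are illustrative; this is not the actual H5AC/H5C code.
 */
#include <mpi.h>
#include <stdio.h>
#include <stddef.h>

#define DIRTY_BYTES_THRESHOLD (256 * 1024)  /* eventually user specified     */
#define MAX_SYNC_ENTRIES      1024

static size_t        dirty_bytes = 0;       /* dirty metadata created so far */
static unsigned long dirty_addrs[MAX_SYNC_ENTRIES];
static int           n_dirty = 0;

/* Record one collective metadata modification.  Because the modification
 * is collective, every rank makes the same call with the same size, so
 * all ranks cross the threshold on the same operation and enter the
 * sync point together.
 */
static void note_dirty_metadata(unsigned long addr, size_t nbytes,
                                MPI_Comm comm, int mpi_rank)
{
    if (n_dirty < MAX_SYNC_ENTRIES)
        dirty_addrs[n_dirty++] = addr;
    dirty_bytes += nbytes;

    if (dirty_bytes < DIRTY_BYTES_THRESHOLD)
        return;

    if (mpi_rank == 0) {
        /* Only process 0 actually writes dirty entries to the file. */
        for (int i = 0; i < n_dirty; i++)
            printf("rank 0 flushes entry at addr 0x%lx\n", dirty_addrs[i]);
    }

    /* Process 0 then tells the other caches which entries are now clean;
     * they mark those entries clean locally without writing anything.
     */
    MPI_Bcast(&n_dirty, 1, MPI_INT, 0, comm);
    MPI_Bcast(dirty_addrs, n_dirty, MPI_UNSIGNED_LONG, 0, comm);

    n_dirty     = 0;
    dirty_bytes = 0;
}
```

The broadcast step reflects the point made in the description above: since the caches may reach different sizes and order their dirty lists differently, the set of entries process 0 flushes need not match what another rank's cache would have chosen, so the other ranks must be told which entries are clean rather than clearing their own lists.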
Platforms tested:
H5committest, heping (parallel & serial)
Misc. update:
Diffstat (limited to 'src/H5FDmpio.c')
| -rw-r--r-- | src/H5FDmpio.c | 38 |
1 file changed, 28 insertions, 10 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index f285f1a..3cf1968 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -934,6 +934,13 @@ done:
  *
  * Modifications:
  *
+ *      John Mainzer -- 9/21/05
+ *      Modified code to turn off the
+ *      H5FD_FEAT_ACCUMULATE_METADATA_WRITE flag.
+ *      With the movement of
+ *      all cache writes to process 0, this flag has become
+ *      problematic in PHDF5.
+ *
  *-------------------------------------------------------------------------
  */
 static herr_t
@@ -947,15 +954,6 @@ H5FD_mpio_query(const H5FD_t UNUSED *_file, unsigned long *flags /* out */)
     if(flags) {
         *flags=0;
         *flags|=H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */
-
-        /* Distinguish between updating the metadata accumulator on writes and
-         * reads. This is particularly (perhaps only, even) important for MPI-I/O
-         * where we guarantee that writes are collective, but reads may not be.
-         * If we were to allow the metadata accumulator to be written during a
-         * read operation, the application would hang.
-         */
-        *flags|=H5FD_FEAT_ACCUMULATE_METADATA_WRITE; /* OK to accumulate metadata for faster writes */
-
         *flags|=H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
     } /* end if */
 
@@ -1553,9 +1551,18 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
         if(H5P_get(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME,&block_before_meta_write)<0)
             HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get H5AC property")
 
+#if 0 /* JRM */
+    /* The metadata cache now only writes from process 0, which makes
+     * this synchronization incorrect.  I'm leaving this code commented
+     * out instead of deleting it to remind us that we should re-write
+     * this function so that a metadata write from any other process
+     * should flag an error.
+     *                                          -- JRM 9/1/05
+     */
         if(block_before_meta_write)
             if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm)))
                 HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
+#endif /* JRM */
 
         /* Only one process will do the actual write if all procs in comm write same metadata */
         if (file->mpi_rank != H5_PAR_META_WRITE) {
@@ -1616,11 +1623,22 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
             file->eof = HADDR_UNDEF;
 
 done:
-    /* if only one process writes, need to broadcast the ret_value to other processes */
+
+#if 0 /* JRM */
+    /* Since metadata writes are now done by process 0 only, this broadcast
+     * is no longer needed.  I leave it in and commented out to remind us
+     * that we need to re-work this function to reflect this reallity.
+     *
+     *                                        -- JRM 9/1/05
+     */
+    /* if only one process writes, need to broadcast the ret_value to
+     * other processes
+     */
     if (type!=H5FD_MEM_DRAW) {
 	if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, H5_PAR_META_WRITE, file->comm)))
 	    HMPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
     } /* end if */
+#endif /* JRM */
 
 #ifdef H5FDmpio_DEBUG
     if (H5FD_mpio_Debug[(int)'t'])