diff options
author | John Mainzer <mainzer@hdfgroup.org> | 2010-10-17 07:52:19 (GMT) |
---|---|---|
committer | John Mainzer <mainzer@hdfgroup.org> | 2010-10-17 07:52:19 (GMT) |
commit | 93b2b7cf0782943085e2e9cb87459ed60d10c97b (patch) | |
tree | c279ed3cf00e85662cbc201c51050aebd2e4f479 /src | |
parent | 1c14d3e96d8e4ab0b504ca98fdd3958c9cc451ea (diff) | |
download | hdf5-93b2b7cf0782943085e2e9cb87459ed60d10c97b.zip hdf5-93b2b7cf0782943085e2e9cb87459ed60d10c97b.tar.gz hdf5-93b2b7cf0782943085e2e9cb87459ed60d10c97b.tar.bz2 |
[svn-r19621] Port fo fix for the round robin parallel flush bug caused by the failure
of the H5Ocache.c code to update its image of the on disk representation
of the object header on a call to clear callback.
This wasn't an issue as long as all flushes of the object header were
made from the same process, but if an object header is modified, and
then flushed on one process and cleared on the rest, the changes were
not be reflected in the images of the on disk representation on all
processes where the object header was cleared rather than flushed.
If one of these processes did the next flush, the changes were lost in
the on disk representation.
Fixed this by causing all dirty messages and to be written to the copy
of the on disk image maintained by the object header code on both flush
and clear.
Also added associated test code in t_mdset.c.
Also checking in some cache debug code developed while chasing this bug.
Commit tested and tested (parallel) on phoenix.
Diffstat (limited to 'src')
-rw-r--r-- | src/H5AC.c | 34 | ||||
-rw-r--r-- | src/H5ACprivate.h | 2 | ||||
-rw-r--r-- | src/H5C.c | 131 | ||||
-rw-r--r-- | src/H5Cprivate.h | 3 | ||||
-rw-r--r-- | src/H5Ocache.c | 74 |
5 files changed, 244 insertions, 0 deletions
@@ -1710,6 +1710,40 @@ done: /*------------------------------------------------------------------------- + * Function: H5AC_dump_cache + * + * Purpose: Dumps a summary of the contents of the metadata cache + * to stdout. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer + * Sunday, October 10, 2010 + * + *------------------------------------------------------------------------- + */ +herr_t +H5AC_dump_cache(const H5F_t *f) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(H5AC_dump_cache, FAIL) + + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->cache); + + if ( H5C_dump_cache(f->shared->cache, H5F_OPEN_NAME(f)) < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_dump_cache() failed.") + } + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC_dump_cache() */ + + +/*------------------------------------------------------------------------- * Function: H5AC_get_cache_auto_resize_config * * Purpose: Wrapper function for H5C_get_cache_auto_resize_config(). diff --git a/src/H5ACprivate.h b/src/H5ACprivate.h index 1c37d41..359b576 100644 --- a/src/H5ACprivate.h +++ b/src/H5ACprivate.h @@ -339,6 +339,8 @@ H5_DLL herr_t H5AC_set_write_done_callback(H5C_t * cache_ptr, void (* write_done)(void)); H5_DLL herr_t H5AC_stats(const H5F_t *f); +H5_DLL herr_t H5AC_dump_cache(const H5F_t *f); + H5_DLL herr_t H5AC_get_cache_auto_resize_config(const H5AC_t * cache_ptr, H5AC_cache_config_t *config_ptr); @@ -4878,6 +4878,137 @@ H5C_stats__reset(H5C_t UNUSED * cache_ptr) /*------------------------------------------------------------------------- + * Function: H5C_dump_cache + * + * Purpose: Print a summary of the contents of the metadata cache for + * debugging purposes. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer + * 10/10/10 + * + *------------------------------------------------------------------------- + */ +herr_t +H5C_dump_cache(H5C_t * cache_ptr, + const char * cache_name) +{ + herr_t ret_value = SUCCEED; /* Return value */ + int i; + H5C_cache_entry_t * entry_ptr = NULL; + H5SL_t * slist_ptr = NULL; + H5SL_node_t * node_ptr = NULL; + + FUNC_ENTER_NOAPI(H5C_dump_cache, FAIL) + + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + HDassert(cache_name != NULL ); + + /* First, create a skip list */ + slist_ptr = H5SL_create(H5SL_TYPE_HADDR); + + if ( slist_ptr == NULL ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTCREATE, FAIL, "can't create skip list.") + } + + /* Next, scan the index, and insert all entries in the skip list. + * Do this, as we want to display cache entries in increasing address + * order. + */ + for ( i = 0; i < H5C__HASH_TABLE_LEN; i++ ) { + + entry_ptr = cache_ptr->index[i]; + + while ( entry_ptr != NULL ) { + + HDassert( entry_ptr->magic == H5C__H5C_CACHE_ENTRY_T_MAGIC ); + + if ( H5SL_insert(slist_ptr, entry_ptr, &(entry_ptr->addr)) < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_BADVALUE, FAIL, \ + "Can't insert entry in skip list") + } + + entry_ptr = entry_ptr->ht_next; + } + } + + /* If we get this far, all entries in the cache are listed in the + * skip list -- scan the skip list generating the desired output. + */ + + HDfprintf(stdout, "\n\nDump of metadata cache \"%s\".\n", cache_name); + HDfprintf(stdout, + "Num: Addr: Len: Type: Prot: Pinned: Dirty:\n"); + + i = 0; + + node_ptr = H5SL_first(slist_ptr); + + if ( node_ptr != NULL ) { + + entry_ptr = (H5C_cache_entry_t *)H5SL_item(node_ptr); + + } else { + + entry_ptr = NULL; + } + + while ( entry_ptr != NULL ) { + + HDassert( entry_ptr->magic == H5C__H5C_CACHE_ENTRY_T_MAGIC ); + + HDfprintf(stdout, + "%s%d 0x%08llx 0x%3llx %2d %d %d %d\n", + cache_ptr->prefix, i, + (long long)(entry_ptr->addr), + (long long)(entry_ptr->size), + (int)(entry_ptr->type->id), + (int)(entry_ptr->is_protected), + (int)(entry_ptr->is_pinned), + (int)(entry_ptr->is_dirty)); + + /* increment node_ptr before we delete its target */ + node_ptr = H5SL_next(node_ptr); + + /* remove the first item in the skip list */ + if ( H5SL_remove(slist_ptr, &(entry_ptr->addr)) != entry_ptr ) { + + HGOTO_ERROR(H5E_CACHE, H5E_BADVALUE, FAIL, \ + "Can't delete entry from skip list.") + } + + if ( node_ptr != NULL ) { + + entry_ptr = (H5C_cache_entry_t *)H5SL_item(node_ptr); + + } else { + + entry_ptr = NULL; + } + + i++; + } + + HDfprintf(stdout, "\n\n"); + + /* Finally, discard the skip list */ + + HDassert( H5SL_count(slist_ptr) == 0 ); + + H5SL_close(slist_ptr); + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5C_dump_cache() */ + + +/*------------------------------------------------------------------------- * Function: H5C_unpin_entry() * * Purpose: Unpin a cache entry. The entry must be unprotected at diff --git a/src/H5Cprivate.h b/src/H5Cprivate.h index 2ad6909..389cf5d 100644 --- a/src/H5Cprivate.h +++ b/src/H5Cprivate.h @@ -1121,6 +1121,9 @@ H5_DLL herr_t H5C_stats(H5C_t * cache_ptr, H5_DLL void H5C_stats__reset(H5C_t * cache_ptr); +H5_DLL herr_t H5C_dump_cache(H5C_t * cache_ptr, + const char * cache_name); + H5_DLL herr_t H5C_unpin_entry(void *thing); H5_DLL herr_t H5C_unprotect(H5F_t * f, diff --git a/src/H5Ocache.c b/src/H5Ocache.c index fd8be18..a89da1f 100644 --- a/src/H5Ocache.c +++ b/src/H5Ocache.c @@ -558,6 +558,21 @@ done: * koziol@ncsa.uiuc.edu * Mar 20 2003 * + * Changes: In the parallel case, there is the possibility that the + * the object header may be flushed by different processes + * over the life of the computation. Thus we must ensure + * that the chunk images are up to date before we mark the + * messages clean -- as otherwise we may overwrite valid + * data with a blank section of a chunk image. + * + * To deal with this, I have added code to call + * H5O_chunk_serialize() for all chunks before we + * mark all messages as clean if we are not destroying the + * object. Do this in the parallel case only, as the problem + * can only occur in this context. + * + * JRM -- 10/12/10 + * *------------------------------------------------------------------------- */ static herr_t @@ -571,6 +586,30 @@ H5O_clear(H5F_t *f, H5O_t *oh, hbool_t destroy) /* check args */ HDassert(oh); +#ifdef H5_HAVE_PARALLEL + if ( ( oh->cache_info.is_dirty ) && ( ! destroy ) ) { + + size_t i; + + /* scan through all chunks associated with the object header, + * and cause them to update their images for all entries currently + * marked dirty. Must do this in the parallel case, as it is possible + * that this processor may clear this object header several times + * before flushing it -- thus causing undefined sections of the image + * to be written to disk overwriting valid data. + */ + + for ( i = 0; i < oh->nchunks; i++ ) { + + if ( H5O_chunk_serialize(f, oh, i) < 0 ) { + + HGOTO_ERROR(H5E_OHDR, H5E_CANTSERIALIZE, FAIL, + "unable to serialize object header chunk") + } + } + } +#endif /* H5_HAVE_PARALLEL */ + /* Mark messages as clean */ for(u = 0; u < oh->nmesgs; u++) oh->mesg[u].dirty = FALSE; @@ -828,6 +867,30 @@ done: * koziol@hdfgroup.org * July 12, 2008 * + * Changes: In the parallel case, there is the possibility that the + * the object header chunk may be flushed by different + * processes over the life of the computation. Thus we must + * ensure that the chunk image is up to date before we mark its + * messages clean -- as otherwise we may overwrite valid + * data with a blank section of a chunk image. + * + * To deal with this, I have added code to call + * H5O_chunk_serialize() for this chunk before we + * mark all messages as clean if we are not destroying the + * chunk. + * + * Do this in the parallel case only, as the problem + * can only occur in this context. + * + * Note that at present at least, it seems that this fix + * is not necessary, as we don't seem to be able to + * generate a dirty chunk without creating a dirty object + * header. However, the object header code will be changing + * a lot in the near future, so I'll leave this fix in + * for now, unless Quincey requests otherwise. + * + * JRM -- 10/12/10 + * *------------------------------------------------------------------------- */ static herr_t @@ -841,6 +904,17 @@ H5O_cache_chk_clear(H5F_t *f, H5O_chunk_proxy_t *chk_proxy, hbool_t destroy) /* check args */ HDassert(chk_proxy); +#ifdef H5_HAVE_PARALLEL + if ( ( chk_proxy->oh->cache_info.is_dirty ) && ( ! destroy ) ) { + + if ( H5O_chunk_serialize(f, chk_proxy->oh, chk_proxy->chunkno) < 0 ) { + + HGOTO_ERROR(H5E_OHDR, H5E_CANTSERIALIZE, FAIL, + "unable to serialize object header chunk") + } + } +#endif /* H5_HAVE_PARALLEL */ + /* Mark messages in chunk as clean */ for(u = 0; u < chk_proxy->oh->nmesgs; u++) if(chk_proxy->oh->mesg[u].chunkno == chk_proxy->chunkno) |