diff options
author | John Mainzer <mainzer@hdfgroup.org> | 2009-02-06 18:20:13 (GMT) |
---|---|---|
committer | John Mainzer <mainzer@hdfgroup.org> | 2009-02-06 18:20:13 (GMT) |
commit | 2933d2f80b01c5e46c0a8161b3bd82249ab0b289 (patch) | |
tree | 30d6d611afc55328787e08efd7a4a17fbe0c85f7 | |
parent | 7418c063568d22fab2d34bd07664da60f44cdf98 (diff) | |
download | hdf5-2933d2f80b01c5e46c0a8161b3bd82249ab0b289.zip hdf5-2933d2f80b01c5e46c0a8161b3bd82249ab0b289.tar.gz hdf5-2933d2f80b01c5e46c0a8161b3bd82249ab0b289.tar.bz2 |
[svn-r16451] Repaired intermittent failure of the t_cache test in testpar.
The failure was caused by some overactive sanity checking code in
unlock_entry(). In essence the code did not consider the possibility
that under certain, very unusual circumstances, an entry could be flushed
to disk during the H5AC_unprotect() call. Instead, it simply failed
if a dirty entry was marked clean after the call to H5AC_unprotect().
This bug in the test code was exposed by recent changes to the default
cache configuration made as part of the "metadata blizzard" bug fix.
Fixed the bug by adding code to detect when an entry is flushed during
the call to H5AC_unprotect(), and not trigger a failure if a dirty entry
is marked clean after a call to H5AC_unprotect() if the entry has been
flushed.
In passing also found and fixed another test bug in which expunged
entries were erroneously marked as dirty in the test code's independent
register of entry status.
Tested parallel on Phoenix (AMD64 Linux) and Jam. Also ran t_cache
manually hundreds of times looking for intermittent failures.
Larry kindly tested (parallel) on Mercury.
-rw-r--r-- | testpar/t_cache.c | 154 |
1 files changed, 147 insertions, 7 deletions
diff --git a/testpar/t_cache.c b/testpar/t_cache.c index c858e9b..0fdd78a 100644 --- a/testpar/t_cache.c +++ b/testpar/t_cache.c @@ -129,6 +129,12 @@ long local_pins = 0; * processes, and thus cannot be marked as dirty unless they * happen to overlap some collective operation. * + * cleared: Boolean flag that is set to true whenever the entry is + * dirty, and is cleared via a call to clear_datum(). + * + * flushed: Boolean flag that is set to true whenever the entry is + * dirty, and is flushed via a call to flush_datum(). + * * index: Index of this instance of datum in the data_index[] array * discussed below. * @@ -146,6 +152,8 @@ struct datum hbool_t locked; hbool_t global_pinned; hbool_t local_pinned; + hbool_t cleared; + hbool_t flushed; int index; }; @@ -691,6 +699,9 @@ addr_to_datum_index(haddr_t base_addr) * JRM -- 7/11/06 * Added support for the local_len field. * + * JRM -- 2/4/09 + * Added initialization for the cleared and flushed fields. + * *****************************************************************************/ void @@ -728,6 +739,8 @@ init_data(void) data[i].locked = FALSE; data[i].global_pinned = FALSE; data[i].local_pinned = FALSE; + data[i].cleared = FALSE; + data[i].flushed = FALSE; data[i].index = i; data_index[i] = i; @@ -1662,13 +1675,17 @@ serve_write_request(struct mssg_t * mssg_ptr) * length of the entry, while retaining the original * value for communications with the server. * + * JRM -- 2/4/09 + * Added code to set the cleared flag when a dirty entry is + * cleared. 
+ * *------------------------------------------------------------------------- */ herr_t clear_datum(H5F_t * f, void * thing, - hbool_t dest) + hbool_t dest) { int idx; struct datum * entry_ptr; @@ -1688,6 +1705,13 @@ clear_datum(H5F_t * f, HDassert( ( entry_ptr->header.size == entry_ptr->len ) || ( entry_ptr->header.size == entry_ptr->local_len ) ); + HDassert( entry_ptr->header.is_dirty == entry_ptr->dirty ); + + if ( entry_ptr->header.is_dirty ) { + + entry_ptr->cleared = TRUE; + } + entry_ptr->header.is_dirty = FALSE; entry_ptr->dirty = FALSE; @@ -1791,6 +1815,10 @@ destroy_datum(H5F_t UNUSED * f, * length of the entry, while retaining the original * value for communications with the server. * + * JRM -- 2/4/09 + * Added code to set the flushed flag when a dirty entry + * is flushed. + * *------------------------------------------------------------------------- */ @@ -1859,6 +1887,7 @@ flush_datum(H5F_t *f, { entry_ptr->header.is_dirty = FALSE; entry_ptr->dirty = FALSE; + entry_ptr->flushed = TRUE; } } } @@ -2175,8 +2204,6 @@ expunge_entry(H5C_t * cache_ptr, HDassert( !(entry_ptr->global_pinned) ); HDassert( !(entry_ptr->local_pinned) ); - entry_ptr->dirty = TRUE; - if ( nerrors == 0 ) { result = H5AC_expunge_entry(file_ptr, -1, &(types[0]), @@ -3492,7 +3519,7 @@ setup_rand(void) const char * fcn_name = "setup_rand()"; hbool_t use_predefined_seeds = FALSE; int num_predefined_seeds = 3; - unsigned predefined_seeds[3] = {18669, 89925, 12577}; + unsigned predefined_seeds[3] = {33402, 33505, 33422}; unsigned seed; struct timeval tv; struct timezone tz; @@ -3663,9 +3690,14 @@ unlock_entry(H5C_t * cache_ptr, HDassert( ((entry_ptr->header).type)->id == DATUM_ENTRY_TYPE ); - if ( ( flags & H5AC__DIRTIED_FLAG ) != 0 - && ( (flags & H5C__DELETED_FLAG) == 0 ) ) { - + if ( ( (flags & H5AC__DIRTIED_FLAG) != 0 ) && + ( (flags & H5C__DELETED_FLAG) == 0 ) && + ( ! 
( ( ( world_mpi_rank == 0 ) && ( entry_ptr->flushed ) ) + || + ( ( world_mpi_rank != 0 ) && ( entry_ptr->cleared ) ) + ) + ) + ) { HDassert( entry_ptr->header.is_dirty ); HDassert( entry_ptr->dirty ); } @@ -4412,6 +4444,8 @@ smoke_check_3(void) { const char * fcn_name = "smoke_check_3()"; hbool_t success = TRUE; + hbool_t verbose = FALSE; + int cp = 0; int i; int max_nerrors; int min_count; @@ -4428,12 +4462,18 @@ smoke_check_3(void) TESTING("smoke check #3"); } + /* 0 */ + if ( verbose ) { HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); } + nerrors = 0; init_data(); reset_stats(); if ( world_mpi_rank == world_server_mpi_rank ) { + /* 1 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} + if ( ! server_main() ) { /* some error occured in the server -- report failure */ @@ -4443,9 +4483,15 @@ smoke_check_3(void) world_mpi_rank, fcn_name); } } + + /* 2 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} } else /* run the clients */ { + /* 1 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} + if ( ! 
setup_cache_for_test(&fid, &file_ptr, &cache_ptr) ) { nerrors++; @@ -4457,6 +4503,9 @@ smoke_check_3(void) } } + /* 2 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} + min_count = 100 / ((file_mpi_rank + 1) * (file_mpi_rank + 1)); max_count = min_count + 50; @@ -4472,6 +4521,9 @@ smoke_check_3(void) } } + /* 3 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} + min_count = 100 / ((file_mpi_rank + 2) * (file_mpi_rank + 2)); max_count = min_count + 50; @@ -4512,6 +4564,9 @@ smoke_check_3(void) } + /* 4 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} + /* flush the file to be sure that we have no problems flushing * pinned entries @@ -4524,6 +4579,9 @@ smoke_check_3(void) } } + /* 5 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} + min_idx = 0; max_idx = ((virt_num_data_entries / 10) / @@ -4558,6 +4616,9 @@ smoke_check_3(void) } } + /* 6 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} + min_idx = 0; max_idx = ((virt_num_data_entries / 10) / ((file_mpi_rank + 3) * (file_mpi_rank + 3))) - 1; @@ -4574,6 +4635,9 @@ smoke_check_3(void) min_idx, max_idx, 0, 100); } + /* 7 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} + /* we can't rename pinned entries, so release any local pins now. */ local_unpin_all_entries(cache_ptr, file_ptr, FALSE); @@ -4592,6 +4656,9 @@ smoke_check_3(void) min_count, max_count); } + /* 8 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} + /* ...and then rename them back. */ for ( i = (virt_num_data_entries / 2) - 1; i >= 0; i-- ) { @@ -4604,6 +4671,9 @@ smoke_check_3(void) min_count, max_count); } + /* 9 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} + /* finally, do some dirty lock/unlocks while we give the cache * a chance t reduce its size. 
*/ @@ -4627,6 +4697,9 @@ smoke_check_3(void) } } + /* 10 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} + /* release any local pins before we take down the cache. */ local_unpin_all_entries(cache_ptr, file_ptr, FALSE); @@ -4642,6 +4715,9 @@ smoke_check_3(void) } } + /* 11 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} + /* verify that all instances of datum are back where the started * and are clean. */ @@ -4676,6 +4752,9 @@ smoke_check_3(void) } } } + + /* 12 */ + if ( verbose ) {HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++);} } max_nerrors = get_max_nerrors(); @@ -5043,6 +5122,8 @@ smoke_check_5(void) { const char * fcn_name = "smoke_check_5()"; hbool_t success = TRUE; + hbool_t verbose = FALSE; + int cp = 0; int i; int max_nerrors; hid_t fid = -1; @@ -5055,12 +5136,20 @@ smoke_check_5(void) TESTING("smoke check #5"); } + /* 0 */ + if ( verbose ) { HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); } + nerrors = 0; init_data(); reset_stats(); if ( world_mpi_rank == world_server_mpi_rank ) { + /* 1 */ + if ( verbose ) { + HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); + } + if ( ! server_main() ) { /* some error occured in the server -- report failure */ @@ -5070,9 +5159,20 @@ smoke_check_5(void) world_mpi_rank, fcn_name); } } + + /* 2 */ + if ( verbose ) { + HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); + } } else /* run the clients */ { + + /* 1 */ + if ( verbose ) { + HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); + } + if ( ! 
setup_cache_for_test(&fid, &file_ptr, &cache_ptr) ) { nerrors++; @@ -5084,11 +5184,21 @@ smoke_check_5(void) } } + /* 2 */ + if ( verbose ) { + HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); + } + for ( i = 0; i < (virt_num_data_entries / 2); i++ ) { insert_entry(cache_ptr, file_ptr, i, H5AC__NO_FLAGS_SET); } + /* 3 */ + if ( verbose ) { + HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); + } + /* flush the file so we can lock known clean entries. */ if ( H5Fflush(fid, H5F_SCOPE_GLOBAL) < 0 ) { nerrors++; @@ -5098,6 +5208,11 @@ smoke_check_5(void) } } + /* 4 */ + if ( verbose ) { + HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); + } + for ( i = 0; i < (virt_num_data_entries / 4); i++ ) { lock_entry(cache_ptr, file_ptr, i); @@ -5121,6 +5236,11 @@ smoke_check_5(void) } } + /* 5 */ + if ( verbose ) { + HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); + } + for ( i = (virt_num_data_entries / 2) - 1; i >= (virt_num_data_entries / 4); i-- ) @@ -5154,6 +5274,11 @@ smoke_check_5(void) unpin_entry(cache_ptr, file_ptr, i, TRUE, FALSE, FALSE); } + /* 6 */ + if ( verbose ) { + HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); + } + if ( fid >= 0 ) { if ( ! take_down_cache(fid) ) { @@ -5166,6 +5291,11 @@ smoke_check_5(void) } } + /* 7 */ + if ( verbose ) { + HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); + } + /* verify that all instance of datum are back where the started * and are clean. */ @@ -5176,6 +5306,11 @@ smoke_check_5(void) HDassert( ! (data[i].dirty) ); } + /* 8 */ + if ( verbose ) { + HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); + } + /* compose the done message */ mssg.req = DONE_REQ_CODE; mssg.src = world_mpi_rank; @@ -5199,6 +5334,11 @@ smoke_check_5(void) } } } + + /* 9 */ + if ( verbose ) { + HDfprintf(stderr, "%d: cp = %d\n", world_mpi_rank, cp++); + } } max_nerrors = get_max_nerrors(); |