From 8e0b427bf2a22871318e1c3ca76717344ec0019f Mon Sep 17 00:00:00 2001 From: jhendersonHDF Date: Tue, 1 Mar 2022 12:30:49 -0600 Subject: Fix metadata cache bug when resizing a pinned/protected entry (v2) (#1463) --- release_docs/RELEASE.txt | 26 +++++++++++------ src/H5AC.c | 73 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 84 insertions(+), 15 deletions(-) diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index 2346e2f..1fd6f2d 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -922,15 +922,6 @@ New Features * Improved coverage of regression testing for the feature - NOTE: we are aware of and working on a fix for an assertion failure that - occurs during regression testing of the feature. It looks similar to: - - hdf5/src/H5ACmpio.c:1089: H5AC__log_moved_entry: Assertion `!entry_dirty' failed. - - so far, this assertion failure has only occurred during testing and has - not been seen in any production uses, but until this bug is fixed the - feature should be considered unstable. - (JTH - 2022/2/23) Fortran Library: @@ -1138,6 +1129,23 @@ Bug Fixes since HDF5-1.12.0 release =================================== Library ------- + - Fixed a metadata cache bug when resizing a pinned/protected cache entry + + When resizing a pinned/protected cache entry, the metadata + cache code previously would wait until after resizing the + entry to attempt to log the newly-dirtied entry. This would + cause H5C_resize_entry to mark the entry as dirty and make + H5AC_resize_entry think that it doesn't need to add the + newly-dirtied entry to the dirty entries skiplist. + + Thus, a subsequent H5AC__log_moved_entry would think it + needs to allocate a new entry for insertion into the dirty + entry skip list, since the entry doesn't exist on that list. + This causes an assertion failure, as the code to allocate a + new entry assumes that the entry is not dirty. + + (JRM - 2022/02/28) + - Issue #1436 identified a problem with the H5_VERS_RELEASE check in the H5check_version function. diff --git a/src/H5AC.c b/src/H5AC.c index 47d3a65..ac28a8c 100644 --- a/src/H5AC.c +++ b/src/H5AC.c @@ -1440,21 +1440,82 @@ H5AC_resize_entry(void *thing, size_t new_size) cache_ptr = entry_ptr->cache_ptr; HDassert(cache_ptr); - /* Resize the entry */ - if (H5C_resize_entry(thing, new_size) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTRESIZE, FAIL, "can't resize entry") - #ifdef H5_HAVE_PARALLEL - { + /* Log the generation of dirty bytes of metadata iff: + * + * 1) The entry is clean on entry, and this resize will dirty it + * (i.e. the current and new sizes are different), and + * + * 2) This is a parallel computation -- which it is if the aux_ptr + * is non-null. + * + * A few points to note about this section of the code: + * + * 1) This call must occur before the call to H5C_resize_entry() since + * H5AC__log_dirtied_entry() expects the target entry to be clean + * on entry. + * + * 2) This code has some basic issues in terms of the number of bytes + * added to the dirty bytes count. + * + * First, it adds the initial entry size to aux_ptr->dirty_bytes, + * not the final size. Note that this code used to use the final + * size, but code to support this has been removed from + * H5AC__log_dirtied_entry() for reasons unknown since I wrote this + * code. + * + * As long as all ranks do the same thing here, this probably doesn't + * matter much, although it will delay initiation of sync points. + * + * A more interesting point is that this code will not increment + * aux_ptr->dirty_bytes if a dirty entry is resized. At first glance + * this seems major, as particularly with the older file formats, + * resizes can be quite large. However, this is probably not an + * issue either, since such resizes will be accompanied by large + * amounts of dirty metadata creation in other areas -- which will + * cause aux_ptr->dirty_bytes to be incremented. + * + * The bottom line is that this code is probably OK, but the above + * points should be kept in mind. + * + * One final observation: This comment is occasioned by a bug caused + * by moving the call to H5AC__log_dirtied_entry() after the call to + * H5C_resize_entry(), and then only calling H5AC__log_dirtied_entry() + * if entry_ptr->is_dirty was false. + * + * Since H5C_resize_entry() marks the target entry dirty unless there + * is not change in size, this had the effect of not calling + * H5AC__log_dirtied_entry() when it should be, and corrupting + * the cleaned and dirtied lists used by rank 0 in the parallel + * version of the metadata cache. + * + * The point here is that you should be very careful when working with + * this code, and not modify it unless you fully understand it. + * + * JRM -- 2/28/22 + */ + + if ((!entry_ptr->is_dirty) && (entry_ptr->size != new_size)) { + + /* the entry is clean, and will be marked dirty in the resize + * operation. + */ H5AC_aux_t *aux_ptr; aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr); - if ((!entry_ptr->is_dirty) && (NULL != aux_ptr)) + + if (NULL != aux_ptr) { + if (H5AC__log_dirtied_entry(entry_ptr) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTMARKDIRTY, FAIL, "can't log dirtied entry") + } } #endif /* H5_HAVE_PARALLEL */ + /* Resize the entry */ + if (H5C_resize_entry(thing, new_size) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTRESIZE, FAIL, "can't resize entry") + done: /* If currently logging, generate a message */ if (cache_ptr != NULL && cache_ptr->log_info != NULL) -- cgit v0.12