From 18dab4e5767fcf9cdad2220d1a89f1ae7b002dd1 Mon Sep 17 00:00:00 2001 From: mainzer Date: Wed, 29 Apr 2020 11:34:46 -0500 Subject: Modified page buffer to split entries only where necessary -- specifically when handling an I/O request on a metadata entry that has been sub-allocated from a larger file space allocation (i.e. fixed and extensible array), and that crosses at least one page boundary. . This required modifying the metadata cache to provide the type of the metadata cache entry in the current I/O request. For now, this is done with a function call. Once we are sure this works, it may be appropriate to convert this to a macro, or to add a flags parameter to the H5F block read/write calls. Also updated the metadata cache to report whether a read request is speculative -- again via a function call. This allowed me to remove the last address static variable in the H5PB_read() call, which is necessary to support multiple files opened in VFD SWMR mode. Also re-wrote the H5PB_remove_entries() call to handle release of large metadata file space allocations that have been sub-allocated into multiple metadata entries. Also modified the call to H5PB_remove_entries() in H5MF__xfree_impl() to invoke it whenever the page buffer is enabled and the size of the space to be freed is of page size or larger. Tested serial / debug on charis and Jelly. Found a bug in H5MF_xfree_impl(), in which the call to H5PB_remove_entries() is skipped due to HGOTO_DONE calls earlier in the function. While the obvious action is to move the call earlier in the function, best to consult with Vailin first, as there is much going on and it would be best to avoid making the situation worse. If nothing else, there are some error management issues. --- src/H5C.c | 75 ++- src/H5Cimage.c | 29 + src/H5Cmpio.c | 13 + src/H5Cpkg.h | 165 ++++++ src/H5Cprivate.h | 2 + src/H5Cquery.c | 108 ++++ src/H5Ctest.c | 56 +- src/H5PB.c | 1219 ++++++++++++++++++++++++++++++--------- src/H5PBpkg.h | 26 +- src/H5PBprivate.h | 23 +- test/page_buffer.c | 1622 ++++++++++++++++++++++++++++++++++++++++++++++++---- 11 files changed, 2957 insertions(+), 381 deletions(-) diff --git a/src/H5C.c b/src/H5C.c index abea0d4..aa3428b 100644 --- a/src/H5C.c +++ b/src/H5C.c @@ -477,6 +477,10 @@ H5C_create(size_t max_cache_size, cache_ptr->rdfsm_settled = FALSE; cache_ptr->mdfsm_settled = FALSE; + /* fields supporting page buffer hints */ + cache_ptr->curr_io_type = NULL; + cache_ptr->curr_read_speculative = FALSE; + if(H5C_reset_cache_hit_rate_stats(cache_ptr) < 0) /* this should be impossible... */ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, NULL, "H5C_reset_cache_hit_rate_stats failed") @@ -487,6 +491,7 @@ H5C_create(size_t max_cache_size, #ifndef NDEBUG cache_ptr->get_entry_ptr_from_addr_counter = 0; + cache_ptr->curr_io_type = NULL; #endif /* NDEBUG */ /* Set return value */ @@ -974,10 +979,13 @@ done: * * Programmer: John Mainzer -- 12/16/18 * - * Changes: None. + * Changes: Added macro calls to maintain the page buffer hints. 
+ * + * JRM -- 3/20/20 * *------------------------------------------------------------------------- */ + herr_t H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, uint32_t length, uint64_t tick) @@ -994,7 +1002,7 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, H5C_cache_entry_t * entry_ptr; H5C_cache_entry_t * follow_ptr = NULL; herr_t ret_value = SUCCEED; /* Return value */ - bool found = false; + hbool_t found = FALSE; FUNC_ENTER_NOAPI(FAIL) @@ -1036,7 +1044,7 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, page * cache_ptr->page_size + length <= entry_ptr->addr + entry_ptr->size); - found = true; + found = TRUE; /* since end of tick occurs only on API call entry in * the VFD SWMR reader case, the entry must not be protected. @@ -1134,12 +1142,17 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, H5C_IMAGE_EXTRA_SPACE); #endif /* H5C_DO_MEMORY_SANITY_CHECKS */ + H5C__SET_PB_READ_HINTS(cache_ptr, entry_ptr->type, TRUE) + if ( H5F_block_read(f, entry_ptr->type->mem_type, entry_ptr->addr, - image_len, image_ptr) < 0 ) + image_len, image_ptr) < 0 ) { + H5C__RESET_PB_READ_HINTS(cache_ptr) HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, \ "Can't read image (1)") + } + H5C__RESET_PB_READ_HINTS(cache_ptr) /* 3) Call the refresh callback. If it doesn't * request a different image size, goto 6) @@ -1171,12 +1184,18 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, H5C_IMAGE_EXTRA_SPACE); #endif /* H5C_DO_MEMORY_SANITY_CHECKS */ + H5C__SET_PB_READ_HINTS(cache_ptr, entry_ptr->type, TRUE) + if ( H5F_block_read(f, entry_ptr->type->mem_type, entry_ptr->addr, - image_len, image_ptr) < 0 ) + image_len, image_ptr) < 0 ) { + + H5C__RESET_PB_READ_HINTS(cache_ptr) HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, \ "Can't read image (2)") + } + H5C__RESET_PB_READ_HINTS(cache_ptr) /* 5) Call the refresh callback again. Requesting * a different buffer size again is an error. @@ -6494,6 +6513,14 @@ done: * * Programmer: John Mainzer, 5/5/04 * + * Changes: Please maintain the changes list, and do not delete it + * unless you have merged it into the header comment + * proper. + * + * Added macro calls to maintain page buffer hints. + * + * JRM -- 3/20/20 + * *------------------------------------------------------------------------- */ herr_t @@ -6679,8 +6706,18 @@ H5C__flush_single_entry(H5F_t *f, H5C_cache_entry_t *entry_ptr, unsigned flags) else mem_type = entry_ptr->type->mem_type; - if(H5F_block_write(f, mem_type, entry_ptr->addr, entry_ptr->size, entry_ptr->image_ptr) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't write image to file") + H5C__SET_PB_WRITE_HINTS(cache_ptr, entry_ptr->type) + + if ( H5F_block_write(f, mem_type, entry_ptr->addr, + entry_ptr->size, + entry_ptr->image_ptr) < 0 ) { + + H5C__RESET_PB_WRITE_HINTS(cache_ptr) + + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \ + "Can't write image to file") + } + H5C__RESET_PB_WRITE_HINTS(cache_ptr) #ifdef H5_HAVE_PARALLEL } #endif /* H5_HAVE_PARALLEL */ @@ -7082,6 +7119,10 @@ done: * small. * JRM -- 3/25/20 * + * Added macro calls to maintain the page buffer read hints. 
+ * + * JRM -- 3/20/20 + * *------------------------------------------------------------------------- */ static void * @@ -7233,10 +7274,18 @@ H5C_load_entry(H5F_t * f, if ( !coll_access || 0 == mpi_rank ) { #endif /* H5_HAVE_PARALLEL */ - if ( H5F_block_read(f, type->mem_type, addr, len, image) < 0 ) + H5C__SET_PB_READ_HINTS(f->shared->cache, type, TRUE) + + if ( H5F_block_read(f, type->mem_type, addr, len, image) < 0 ) { + + H5C__RESET_PB_READ_HINTS(f->shared->cache) HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, \ "Can't read image*") + } + + H5C__RESET_PB_READ_HINTS(f->shared->cache) + #ifdef H5_HAVE_PARALLEL } /* end if */ /* if the collective metadata read optimization is turned on, @@ -7345,11 +7394,19 @@ H5C_load_entry(H5F_t * f, * * JRM -- 3/24/20 */ + + H5C__SET_PB_READ_HINTS(f->shared->cache, type, \ + FALSE); + if ( H5F_block_read(f, type->mem_type, addr, - actual_len, image) < 0) + actual_len, image) < 0 ) { + + H5C__RESET_PB_READ_HINTS(f->shared->cache) HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, \ "can't read image") + } + H5C__RESET_PB_READ_HINTS(f->shared->cache) #endif /* JRM */ #ifdef H5_HAVE_PARALLEL } diff --git a/src/H5Cimage.c b/src/H5Cimage.c index ee286d9..9a6d667 100644 --- a/src/H5Cimage.c +++ b/src/H5Cimage.c @@ -1058,6 +1058,22 @@ H5C__read_cache_image(H5F_t *f, H5C_t *cache_ptr) #endif /* H5_HAVE_PARALLEL */ /* Read the buffer (if serial access, or rank 0 of parallel access) */ + + /* No need to set the page buffer hints here, as if paged + * allocation is in use, we know that the cache image was allocated + * directly from the free space manager, and thus either doesn't + * cross page boundaries, or is page aligned. Between this, + * and the fact that the cache image is never read speculatively, + * the page buffer should never request hints in this context. + * + * If for some reason it does, the NULL curr_io_type will trigger + * an assertion failure. + * + * Note that we will have to revisit this if we ever use + * cache_ptr->curr_io_type for something other than sanity + * checking + * JRM -- 3/30/20 + */ if(H5F_block_read(f, H5FD_MEM_SUPER, cache_ptr->image_addr, cache_ptr->image_len, cache_ptr->image_buffer) < 0) HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, "Can't read metadata cache image block") @@ -3554,6 +3570,19 @@ H5C__write_cache_image(H5F_t *f, const H5C_t *cache_ptr) #endif /* H5_HAVE_PARALLEL */ /* Write the buffer (if serial access, or rank 0 for parallel access) */ + + /* No need to set the page buffer hints here. + * + * If paged allocation is in use, we know that the cache image + * was allocated directly from the free space manager, and thus + * either doesn't cross page boundaries, or is page aligned. + * Thus it should never trigger the sanity checks in the page buffer. + * + * If for some reason it does, the NULL curr_io_type will trigger + * an assertion failure. + * + * JRM -- 3/30/20 + */ if(H5F_block_write(f, H5FD_MEM_SUPER, cache_ptr->image_addr, cache_ptr->image_len, cache_ptr->image_buffer) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't write metadata cache image block to file") #ifdef H5_HAVE_PARALLEL diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c index 0ac4c4f..e3c60a6 100644 --- a/src/H5Cmpio.c +++ b/src/H5Cmpio.c @@ -1018,6 +1018,19 @@ H5C__collective_write(H5F_t *f) HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O properties") /* Write data */ + /* + * At present the page buffer is disabled in the parallel case, and + * thus VFD SWMR can't be used either. 
Thus, for now, there is
+     * no point in setting the page buffer hints.
+     *
+     * More to the point, since we are actually writing a derived type
+     * containing multiple metadata cache entries, we couldn't set the
+     * hints to a meaningful value anyway.
+     *
+     * When we enable the page buffer in parallel, we will have to
+     * revisit this.
+     *                                            JRM -- 3/30/20
+     */
     if(H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0, (size_t)1, base_buf) < 0)
         HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to write entries collectively")

diff --git a/src/H5Cpkg.h b/src/H5Cpkg.h
index d9a1641..a5eafd6 100644
--- a/src/H5Cpkg.h
+++ b/src/H5Cpkg.h
@@ -3480,6 +3480,102 @@ if ( ( (entry_ptr) == NULL ) || \
 } /* H5C__MOVE_TO_TOP_IN_COLL_LIST */
 #endif /* H5_HAVE_PARALLEL */

+
+/***************************************/
+/* page buffer hint maintenance macros */
+/***************************************/
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5C__SET/RESET_PB_READ_HINTS
+ *
+ * Purpose:     Set or reset the fields needed to provide hints to the
+ *              page buffer so that it can disambiguate between speculative
+ *              reads that cross page boundaries and reads of metadata
+ *              entries that cross page boundaries without starting on
+ *              a page boundary.  The latter shouldn't happen, and the
+ *              hints allow the page buffer to detect such reads by
+ *              unexpected cache clients.
+ *
+ *              See the discussion of the PB hint fields in the header
+ *              comment for H5C_t for further details.
+ *
+ * Return:      N/A
+ *
+ * Programmer:  John Mainzer, 3/30/20
+ *
+ * Modifications:
+ *
+ *              None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5C__SET_PB_READ_HINTS(cache_ptr, type, may_be_speculative)          \
+{                                                                            \
+    HDassert(cache_ptr);                                                     \
+    HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC);                        \
+    HDassert((cache_ptr)->curr_io_type == NULL);                             \
+    HDassert(type);                                                          \
+    (cache_ptr)->curr_io_type = (type);                                      \
+    (cache_ptr)->curr_read_speculative = (may_be_speculative) &&             \
+        ((cache_ptr)->curr_io_type->flags & H5AC__CLASS_SPECULATIVE_LOAD_FLAG); \
+                                                                             \
+} /* H5C__SET_PB_READ_HINTS() */
+
+#define H5C__RESET_PB_READ_HINTS(cache_ptr)                                  \
+{                                                                            \
+    HDassert(cache_ptr);                                                     \
+    HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC);                        \
+    HDassert((cache_ptr)->curr_io_type);                                     \
+    (cache_ptr)->curr_io_type = NULL;                                        \
+    (cache_ptr)->curr_read_speculative = FALSE;                              \
+                                                                             \
+} /* H5C__RESET_PB_READ_HINTS() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5C__SET/RESET_PB_WRITE_HINTS
+ *
+ * Purpose:     Set or reset the fields needed to provide hints to the
+ *              page buffer so that it can detect unexpected writes of
+ *              metadata entries that cross page boundaries and do not
+ *              start on page boundaries.
+ *
+ *              See the discussion of the PB hint fields in the header
+ *              comment for H5C_t for further details.
+ *
+ * Return:      N/A
+ *
+ * Programmer:  John Mainzer, 3/30/20
+ *
+ * Modifications:
+ *
+ *              None.
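+ *
+ *              Usage note: callers are expected to bracket each
+ *              metadata write with these macros, resetting the hints
+ *              on both the success and failure paths.  A sketch,
+ *              following the pattern used in H5C__flush_single_entry():
+ *
+ *                  H5C__SET_PB_WRITE_HINTS(cache_ptr, entry_ptr->type)
+ *                  if ( H5F_block_write(f, mem_type, addr, len, image) < 0 ) {
+ *                      H5C__RESET_PB_WRITE_HINTS(cache_ptr)
+ *                      HGOTO_ERROR(...)
+ *                  }
+ *                  H5C__RESET_PB_WRITE_HINTS(cache_ptr)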
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5C__SET_PB_WRITE_HINTS(cache_ptr, type)                             \
+{                                                                            \
+    HDassert(cache_ptr);                                                     \
+    HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC);                        \
+    HDassert((cache_ptr)->curr_io_type == NULL);                             \
+    HDassert(type);                                                          \
+    (cache_ptr)->curr_io_type = (type);                                      \
+                                                                             \
+} /* H5C__SET_PB_WRITE_HINTS() */
+
+#define H5C__RESET_PB_WRITE_HINTS(cache_ptr)                                 \
+{                                                                            \
+    HDassert(cache_ptr);                                                     \
+    HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC);                        \
+    HDassert((cache_ptr)->curr_io_type);                                     \
+    (cache_ptr)->curr_io_type = NULL;                                        \
+                                                                             \
+} /* H5C__RESET_PB_WRITE_HINTS() */
+

 /****************************/
 /* Package Private Typedefs */
@@ -4413,6 +4509,47 @@ typedef struct H5C_tag_info_t {
 *              managers that are involved in allocating space for free
 *              space managers.
 *
+ * Page Buffer Related Fields:
+ *
+ * Due to the irregular behavior of some of the cache clients, the
+ * page buffer occasionally needs hints to manage metadata I/O requests
+ * from the metadata cache -- particularly in the context of VFD SWMR.
+ * The following fields exist to support this.
+ *
+ *
+ * curr_io_type: Pointer to the instance of H5C_class_t associated with
+ *              the current I/O operation.  This pointer should be set
+ *              just before any I/O operation by the metadata cache, and
+ *              reset to NULL immediately thereafter.
+ *
+ *              This field exists because the fixed and variable length
+ *              array cache clients allocate numerous entries in a single
+ *              block, and sub-allocate metadata cache entries out of this
+ *              block.  The effect of this is to break the invariant,
+ *              normally maintained by the free space managers in paged
+ *              allocation mode, that no entry of less than a page in
+ *              size crosses page boundaries, and that entries of page
+ *              size or greater are page aligned.  This in turn causes
+ *              problems for the page buffer -- particularly in VFD SWMR
+ *              mode.
+ *
+ *              The correct solution is to modify the fixed and variable
+ *              length array cache clients to repair this.  However, in
+ *              the interim, this field exists to detect similar
+ *              behavior elsewhere.
+ *
+ *              To complicate matters, speculative reads for metadata
+ *              cache entries, which must determine their lengths via
+ *              inspection of the on disk image of the entry, may mimic
+ *              the behavior of the fixed and extensible arrays.  Thus
+ *              curr_io_type is also needed to disambiguate reads.
+ *
+ * curr_read_speculative: Boolean flag indicating whether the current
+ *              read request is speculative -- i.e. whether its length
+ *              is not yet known to be correct.  The field is used to
+ *              distinguish between the initial (speculative) and final
+ *              read attempts.
+ *
+ *
 *
 * Statistics collection fields:
 *
@@ -4744,6 +4881,28 @@
 *              called successfully.  This field is only defined when
 *              NDEBUG is not #defined.
 *
+ * curr_io_type: Pointer to the instance of H5C_class_t associated with
+ *              the current I/O operation.  This pointer should be set
+ *              just before any I/O operation by the metadata cache, and
+ *              reset to NULL immediately thereafter.  This field is
+ *              only defined when NDEBUG is not #defined.
+ *
+ *              This field exists because the fixed and variable length
+ *              array cache clients allocate numerous entries in a single
+ *              block, and sub-allocate metadata cache entries out of this
+ *              block.  The effect of this is to break the invariant,
+ *              normally maintained by the free space managers in paged
+ *              allocation mode, that no entry of less than a page in
+ *              size crosses page boundaries, and that entries of page
+ *              size or greater are page aligned.  This in turn causes
+ *              problems for the page buffer -- particularly in VFD SWMR
+ *              mode.
+ *
+ *              The correct solution is to modify the fixed and variable
+ *              length array cache clients to repair this.  However, in
+ *              the interim, this field exists to detect similar
+ *              behavior elsewhere.
+ *
 ****************************************************************************/
 struct H5C_t {
     uint32_t                    magic;
@@ -4892,6 +5051,10 @@ struct H5C_t {
     hbool_t                     rdfsm_settled;
     hbool_t                     mdfsm_settled;

+    /* Fields supporting page buffer hints */
+    const H5C_class_t *         curr_io_type;
+    hbool_t                     curr_read_speculative;
+
 #if H5C_COLLECT_CACHE_STATS
     /* stats fields */
     int64_t                     hits[H5C__MAX_NUM_TYPE_IDS + 1];
@@ -5025,6 +5188,8 @@ H5_DLL herr_t H5C__untag_entry(H5C_t *cache, H5C_cache_entry_t *entry);
 /* Testing functions */
 #ifdef H5C_TESTING
 H5_DLL herr_t H5C__verify_cork_tag_test(hid_t fid, H5O_token_t tag_token, hbool_t status);
+H5_DLL void H5C_set_curr_io_type_splitable(H5C_t * cache_ptr,
+                                           hbool_t set_splitable);
 #endif /* H5C_TESTING */

 #endif /* _H5Cpkg_H */
diff --git a/src/H5Cprivate.h b/src/H5Cprivate.h
index 23091cb..7678911 100644
--- a/src/H5Cprivate.h
+++ b/src/H5Cprivate.h
@@ -2411,6 +2411,8 @@ H5_DLL herr_t H5C_get_cache_size(H5C_t *cache_ptr, size_t *max_size_ptr,
     uint32_t *cur_num_entries_ptr);
 H5_DLL herr_t H5C_get_cache_flush_in_progress(H5C_t *cache_ptr, hbool_t *flush_in_progress_ptr);
 H5_DLL herr_t H5C_get_cache_hit_rate(H5C_t *cache_ptr, double *hit_rate_ptr);
+H5_DLL int H5C_get_curr_io_client_type(H5C_t * cache_ptr);
+H5_DLL hbool_t H5C_get_curr_read_speculative(H5C_t * cache_ptr);
 H5_DLL herr_t H5C_get_entry_status(const H5F_t *f, haddr_t addr, size_t *size_ptr,
     hbool_t *in_cache_ptr, hbool_t *is_dirty_ptr, hbool_t *is_protected_ptr,
     hbool_t *is_pinned_ptr, hbool_t *is_corked_ptr,
diff --git a/src/H5Cquery.c b/src/H5Cquery.c
index 9f1ec31..477a8ba 100644
--- a/src/H5Cquery.c
+++ b/src/H5Cquery.c
@@ -452,3 +452,111 @@ done:
     FUNC_LEAVE_NOAPI(ret_value)
 } /* H5C_get_mdc_image_info() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5C_get_curr_io_client_type
+ *
+ * Purpose:     Return the type id associated with the metadata cache
+ *              client whose data is currently being read or written.
+ *
+ *              This id is obtained via the curr_io_type field in
+ *              H5C_t, which is set just before most I/O calls from the
+ *              metadata cache, and reset to NULL immediately thereafter.
+ *
+ *              If cache_ptr->curr_io_type is NULL, the function
+ *              returns -1.
+ *
+ *              Note: At present, cache_ptr->curr_io_type should always
+ *              be defined in the serial case with the exception
+ *              of cache image I/O.  In general, it is not defined in
+ *              the parallel case.  This is not a problem for now, as
+ *              this function is used in page buffer sanity checking,
+ *              and for now at least, the page buffer is not enabled in
+ *              the parallel case.
+ *
+ * Return:      ID of the cache client whose image is being read or
+ *              written, or -1 if cache_ptr->curr_io_type is undefined.
+ *
+ * Programmer:  John Mainzer
+ *              3/31/20
+ *
+ * Changes:     None.
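+ *
+ *              Usage sketch (this mirrors the split I/O check in
+ *              H5PB_read() / H5PB_write() below):
+ *
+ *                  mdc_client_id =
+ *                      H5C_get_curr_io_client_type(shared->cache);
+ *                  if ( ( mdc_client_id == (int)H5AC_EARRAY_DBLK_PAGE_ID ) ||
+ *                       ( mdc_client_id == (int)H5AC_FARRAY_DBLK_PAGE_ID ) )
+ *                      split_read = TRUE;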
+ *
+ *-------------------------------------------------------------------------
+ */
+
+int
+H5C_get_curr_io_client_type(H5C_t * cache_ptr)
+{
+    int ret_value = -1;      /* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+    HDassert(cache_ptr);
+    HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
+
+    if ( cache_ptr->curr_io_type ) {
+
+        ret_value = cache_ptr->curr_io_type->id;
+    }
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5C_get_curr_io_client_type() */
+
+
+/*-------------------------------------------------------------------------
+ * Function:    H5C_get_curr_read_speculative
+ *
+ * Purpose:     Return a boolean flag indicating whether the current
+ *              read is speculative.
+ *
+ *              Note that this value is only defined during a read
+ *              generated by the metadata cache.  At all other times,
+ *              the return value is undefined (although the current
+ *              implementation returns FALSE in such cases).
+ *
+ *              Note also that this function exists to provide hints to
+ *              the page buffer, which, for now at least, is only
+ *              available in the serial case.  It should not be depended
+ *              upon in the parallel case -- at least until this is
+ *              verified, and potential interactions with collective
+ *              metadata reads are investigated and dismissed.
+ *
+ * Return:      TRUE if the current call to H5F_block_read() by the
+ *              metadata cache is an initial read attempt for a cache
+ *              client whose speculative read flag is set (in
+ *              H5AC_class_t), and FALSE otherwise.
+ *
+ *              The return value is undefined if a call to
+ *              H5F_block_read() by the metadata cache is not in
+ *              progress.
+ *
+ * Programmer:  John Mainzer
+ *              3/31/20
+ *
+ * Changes:     None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+hbool_t
+H5C_get_curr_read_speculative(H5C_t * cache_ptr)
+{
+    hbool_t ret_value = FALSE;      /* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+    HDassert(cache_ptr);
+    HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
+
+    if ( cache_ptr->curr_io_type ) {
+
+        ret_value = cache_ptr->curr_read_speculative;
+    }
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5C_get_curr_read_speculative() */
+
diff --git a/src/H5Ctest.c b/src/H5Ctest.c
index 7f24302..b549da5 100644
--- a/src/H5Ctest.c
+++ b/src/H5Ctest.c
@@ -78,8 +78,6 @@ typedef struct {
 /* Local Variables */
 /*******************/

-
-
 /*-------------------------------------------------------------------------
  * Function:    H5C__verify_cork_tag_test_cb
@@ -167,3 +165,57 @@ done:
     FUNC_LEAVE_NOAPI(ret_value)
 } /* H5C__verify_cork_tag_test() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5C_set_curr_io_type_splitable
+ *
+ * Purpose:     To test the metadata entry splitting capability in the
+ *              page buffer (needed to deal with H5FA and H5EA's
+ *              unfortunate design choice of sub-allocating multiple
+ *              metadata entries out of a single file space allocation),
+ *              we must be able to configure the metadata cache to
+ *              report that the current I/O request is for such an
+ *              entry.
+ *
+ *              To do this, we must set cache_ptr->curr_io_type to
+ *              point to the instance of H5C_class_t for one such
+ *              client.
+ *
+ *              This function does this by setting
+ *              cache_ptr->curr_io_type to H5AC_EARRAY_DBLK_PAGE if
+ *              set_splitable is TRUE, and to NULL otherwise.
+ *
+ *              Needless to say, this is purely a testing function, and
+ *              should not be called otherwise.
+ *
+ * Return:      void
+ *
+ * Programmer:  John Mainzer
+ *              4/10/20
+ *
+ * Changes:     None.
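+ *
+ *              Expected test usage -- a hypothetical sketch, not code
+ *              from the test suite:
+ *
+ *                  H5C_set_curr_io_type_splitable(cache_ptr, TRUE);
+ *                  ... issue metadata I/O that crosses a page
+ *                      boundary, and verify that it is split ...
+ *                  H5C_set_curr_io_type_splitable(cache_ptr, FALSE);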
+ * + *------------------------------------------------------------------------- + */ + +void +H5C_set_curr_io_type_splitable(H5C_t * cache_ptr, hbool_t set_splitable) +{ + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(cache_ptr); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + + if ( set_splitable ) { + + cache_ptr->curr_io_type = H5AC_EARRAY_DBLK_PAGE; + + } else { + + cache_ptr->curr_io_type = NULL; + } + + + FUNC_LEAVE_NOAPI_VOID + +} /* H5C_set_curr_io_type_splitable() */ + diff --git a/src/H5PB.c b/src/H5PB.c index 14ced59..da65788 100644 --- a/src/H5PB.c +++ b/src/H5PB.c @@ -52,9 +52,12 @@ /****************/ /* Round _x down to nearest _size. */ +/* not used at present */ +/* #ifndef rounddown #define rounddown(_x, _size) (((_x) / (_size)) * (_size)) #endif +*/ /* Round _x up to nearest _size. */ #ifndef roundup @@ -113,14 +116,6 @@ static herr_t H5PB__write_meta(H5F_shared_t *, H5FD_mem_t, haddr_t, static herr_t H5PB__write_raw(H5F_shared_t *, H5FD_mem_t, haddr_t, size_t, const void *); -static void metadata_section_split(size_t, haddr_t, size_t, const void *, - metadata_section_t *); - -static herr_t metadata_multipart_read(H5F_shared_t *, H5FD_mem_t, haddr_t, - size_t, void *); - -static herr_t metadata_multipart_write(H5F_shared_t *, H5FD_mem_t, haddr_t, - size_t, const void *); /*********************/ /* Package Variables */ @@ -222,6 +217,8 @@ H5PB_reset_stats(H5PB_t *pb_ptr) pb_ptr->max_dwl_len = 0; pb_ptr->max_dwl_size = 0; pb_ptr->total_dwl_ins_depth = 0; + pb_ptr->md_read_splits = 0; + pb_ptr->md_write_splits = 0; FUNC_LEAVE_NOAPI(SUCCEED) @@ -252,7 +249,13 @@ H5PB_reset_stats(H5PB_t *pb_ptr) * --bypasses: the number of metadata and raw data accesses * that bypass the page buffer layer * - * Return: Non-negative on success/Negative on failure + * TODO: The available stats have changed considerably + * since Mohamad wrote this routine. Update + * the function once things settle down. + * + * JRM -- 4/13/20 + * + * Return: Non-negative on success/Negative on failure * * Programmer: Mohamad Chaarawi * @@ -297,7 +300,9 @@ H5PB_get_stats(const H5PB_t *pb_ptr, unsigned accesses[2], unsigned hits[2], * * Programmer: John Mainzer -- 10/12/18 * - * Changes: None. + * Changes: Added support for md_read_splits and md_write_splits. + * + * JRM -- 4/11/20 * *------------------------------------------------------------------------- */ @@ -404,10 +409,14 @@ H5PB_print_stats(const H5PB_t *pb_ptr) ave_delayed_write_ins_depth = (double)(pb_ptr->total_dwl_ins_depth) / (double)(pb_ptr->delayed_writes); } + HDfprintf(stdout, "delayed writes / ave delay / ave ins depth = %lld / %llf / %llf\n", pb_ptr->delayed_writes, ave_delayed_write, ave_delayed_write_ins_depth); + HDfprintf(stdout, "metadata read / write splits = %lld / %lld.\n", + pb_ptr->md_read_splits, pb_ptr->md_write_splits); + FUNC_LEAVE_NOAPI(SUCCEED) } /* H5PB_print_stats */ @@ -444,7 +453,10 @@ H5PB_print_stats(const H5PB_t *pb_ptr) * * Programmer: John Mainzer -- 10/12/18 * - * Changes: None. + * Changes: Modified function to function to prevent the insertion + * of raw data pages when operating in VFD SWMR mode. 
+ * + * JRM -- 3/25/20 * *------------------------------------------------------------------------- */ @@ -468,7 +480,8 @@ H5PB_add_new_page(H5F_shared_t *shared, H5FD_mem_t type, haddr_t page_addr) if ( H5FD_MEM_DRAW == type ) { /* raw data page insertion */ - if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) { + if ( ( pb_ptr->min_md_pages == pb_ptr->max_pages ) || + ( pb_ptr->vfd_swmr ) ) { can_insert = FALSE; @@ -514,7 +527,12 @@ done: * * Programmer: John Mainzer -- 10/11/18 * - * Changes: None. + * Changes: Added initialization for the vfd_swmr field. Also + * added code to force min_rd_pages to 0 if vfd_swrm is + * TRUE. Do this since we now exclude raw data from the + * page buffer when operating in VFD SWMR mode. + * + * JRM -- 3/28/20 * *------------------------------------------------------------------------- */ @@ -522,6 +540,7 @@ herr_t H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc, unsigned page_buf_min_raw_perc) { + hbool_t vfd_swmr = FALSE; hbool_t vfd_swmr_writer = FALSE; int i; int32_t min_md_pages; @@ -572,11 +591,21 @@ H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc, (int32_t)(size / shared->fs_page_size)); - /* compute vfd_swmr_writer */ - if ( ( H5F_SHARED_VFD_SWMR_CONFIG(shared) ) && ( H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR ) ) { + /* compute vfd_swrm and vfd_swmr_writer */ + if ( H5F_SHARED_VFD_SWMR_CONFIG(shared) ) { + + vfd_swmr = TRUE; + + /* force min_rd_pages to zero since raw data is exclued from + * the page buffer in VFD SWMR mode. + */ + min_rd_pages = 0; + + if ( H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR ) { - HDassert(shared->vfd_swmr_config.writer); - vfd_swmr_writer = TRUE; + HDassert(shared->vfd_swmr_config.writer); + vfd_swmr_writer = TRUE; + } } @@ -626,6 +655,7 @@ H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc, /* VFD SWMR specific fields. * The following fields are defined iff vfd_swmr_writer is TRUE. */ + pb_ptr->vfd_swmr = vfd_swmr; pb_ptr->vfd_swmr_writer = vfd_swmr_writer; pb_ptr->mpmde_count = 0; pb_ptr->cur_tick = 0; @@ -925,9 +955,11 @@ done: * * 2) If the read is for raw data, and the page buffer is * configured for metadata only (i.e. min_md_pages == - * max_pages), simply read from the HDF5 file and return. + * max_pages), or if we are operating in VFD SWMR mode + * (i.e. vfd_swmr == TRUE), simply read from the HDF5 + * file and return. * - * 3) If the read is for raw data, and it of page size or + * 3) If the read is for raw data, and is of page size or * larger, read it directly from the HDF5 file. * * It is possible that the page buffer contains dirty pages @@ -957,17 +989,41 @@ done: * between small and multi-page metadata entries so that * pages containing the former will be buffered and the * latter be read directly from file. - * - * Unfortunately, the metadata cache does not always know the + * + * Unfortunately, there are several flies in the ointment. + * + * First, the fixed and extensible array on disk data + * structures allocate multiple metadata cache entries in + * a single block, and use this fact to make the addresses + * of all but the first entry in the block computable. While + * this simplifies the fixed and extensible array on disk data + * structures, if complicates the metadata cache and the page + * buffer. Needless to say, the correct solution to this + * problem is to remove the complexity at its source. However, + * for now, we must code around the problem. 
+ * + * Thus, this function must examine each read request + * to determine if it crosses page boundaries and is not for + * two or more complete pages. If it does, and it is one of + * the fixed or extensible array entries that is sub-allocated + * from a larger space allocation, the read request must be + * split into the minimal set of read requests that either + * don't cross page boundaries, or are page aligned and + * consist of an integral number of pages. + * + * + * Second, the metadata cache does not always know the * size of metadata entries when it tries to read them. In * such cases, it issues speculative reads that may be either * smaller or larger than the actual size of the piece of * metadata that is finally read. * * Since we are guaranteed that all metadata allocations larger - * that one page are page aligned, we can safely clip at the - * page boundary any non page aligned metadata read that crosses - * page boundaries. + * that one page are page aligned (with the exception of those + * sub-allocated from larger allocations -- which we deal with + * by splitting I/O requests as discussed above), we can safely + * clip at the page boundary any non page aligned metadata + * read that crosses page boundaries. * * However, page aligned reads could wind up being either * small or multi-page. This results in two scenarios that @@ -1008,15 +1064,13 @@ done: * * 8) If the read is for metadata, is page aligned, is larger * than one page, and there is a regular entry at the target - * page address, test to see if the last read was for the - * same address. + * page address, test to see if the read is speculative. * - * If was, evict the page, and satisfy the read from file. - * Flag an error if the page was dirty. + * If it is not, evict the page, and satisfy the read from + * file. Flag an error if the page was dirty. * - * If the last read was for a different page, clip the read - * to one page, and satisfy the read from the existing - * regular entry. + * If it is, clip the read to one page, and satisfy the + * read from the existing regular entry. * * 9) If the read is for metadata, is page aligned, is larger * than one page, and there is a multi-page metadata entry @@ -1051,60 +1105,334 @@ done: * * Programmer: John Mainzer -- 10/11/18 * - * Changes: None. + * Changes: Updated for discovery of the fact that the fixed and + * extensible array data structures allocate multiple + * metadata cache entries in a single block, and thus + * violate that invarient that metadata entries either + * do not cross page boundaries, or are page aligned. + * + * JRM -- 3/28/20 * *------------------------------------------------------------------------- */ -/* TBD Add optional raw-data bypass here and at H5PB_write when we - * are operating in parallel mode. 
- */ + herr_t H5PB_read(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*out*/) { - H5PB_t *pb_ptr; /* Page buffer for this file */ + H5PB_t *pb_ptr; /* Page buffer for this file */ + hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */ + hbool_t split_read = FALSE; /* whether the read must be split */ herr_t ret_value = SUCCEED; /* Return value */ + /* the following six fields are defined iff split_read is TRUE */ + haddr_t prefix_addr = HADDR_UNDEF; /* addr of prefix -- if defined */ + haddr_t body_addr = HADDR_UNDEF; /* addr of body -- if defined */ + haddr_t suffix_addr = HADDR_UNDEF; /* addr of suffix -- if defined */ + size_t prefix_size = 0; /* size of prefix */ + size_t body_size = 0; /* size of body */ + size_t suffix_size = 0; /* size of suffix */ + + FUNC_ENTER_NOAPI(FAIL) + /* Sanity checks */ + HDassert(shared); + hlog_fast(pbrd, "%s %p type %d %" PRIuHADDR " size %zu", __func__, (void *)shared, type, addr, size); + pb_ptr = shared->pb_ptr; - HDassert(pb_ptr == NULL || pb_ptr->magic == H5PB__H5PB_T_MAGIC); + if ( pb_ptr == NULL ) { - /* Bypass the page buffer in case - * 1) page buffer is disabled - * _) MPI I/O is enabled - * 2) page buffer configured for metadata only, and it's a raw-data access - * 5) page buffer configured for raw data only, and it's a metadata access - */ - if (pb_ptr == NULL || H5F_SHARED_HAS_FEATURE(shared, H5FD_FEAT_HAS_MPI) || - (H5FD_MEM_DRAW == type && pb_ptr->min_md_pages == pb_ptr->max_pages) || - (H5FD_MEM_DRAW != type && pb_ptr->min_rd_pages == pb_ptr->max_pages)) { + bypass_pb = TRUE; /* case 1) -- page buffer is disabled */ + + } else { + + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + + if ( H5FD_MEM_DRAW == type ) { /* raw data read */ + + if ( ( pb_ptr->min_md_pages == pb_ptr->max_pages ) || + ( pb_ptr->vfd_swmr ) ) { + + /* case 2) -- page buffer configured for metadata only + * or vfd swmr. + */ + bypass_pb = TRUE; + + } + } else { /* metadata read */ + + if ( pb_ptr->min_rd_pages == pb_ptr->max_pages ) { + + /* case 5) -- page buffer configured for raw data only */ + bypass_pb = TRUE; + + } else { + /* determine whether the read request must be split, + * and if so, compute the start points and sizes of + * of the sections. + * + * Note: The following code is almost identical to the + * similar code in H5PB_write(). Thus, on the surface, + * it is an obvious candidate for refactoring into a + * function 0r macro. + * + * However, there are subtle differences between + * the two pieces of code which are driven by the + * possibility of speculative reads. + * + * More to the point, further changes may be necessary. + * Thus we should wait on refactoring until this code has + * been in daily use for some time, and it is clear + * that further changes are unlikely. 
+ */ + int mdc_client_id = -1; /* id of mdc client, or -1 if undef */ + uint64_t start_page; /* page index of first page in read */ + uint64_t second_page; /* page index of second page in read */ + uint64_t end_page; /* page index of last page in read */ + uint64_t body_page; /* page index of start of body */ + haddr_t start_page_addr; /* addr of first page in read */ + haddr_t second_page_addr;/* addr of second page in read */ + haddr_t end_page_addr; /* addr of last page in read */ + haddr_t end_addr; /* addr of last byte in read */ + + /* Calculate the aligned address of the first page */ + start_page = (addr / pb_ptr->page_size); + start_page_addr = start_page * pb_ptr->page_size; + + /* Calculate the aligned address of the last page */ + end_addr = addr + (haddr_t)(size - 1); + end_page = end_addr / (haddr_t)(pb_ptr->page_size); + end_page_addr = end_page * pb_ptr->page_size; + + HDassert(start_page_addr <= addr); + HDassert(addr < start_page_addr + (haddr_t)(pb_ptr->page_size)); + + HDassert(start_page <= end_page); + HDassert(end_page_addr <= ((addr + (haddr_t)size - 1))); + HDassert((addr + (haddr_t)size - 1) < + (end_page_addr + pb_ptr->page_size)); + + /* test to see if the read crosses a page boundary, and + * does not start on a page boundary, and is not of an + * integral number of pages. + */ + if ( ( start_page < end_page ) && + ( ! ( ( addr == start_page_addr ) && + ( end_page_addr + (haddr_t)(pb_ptr->page_size) == + end_addr + 1 ) ) ) ) { + + /* the read crosses a page boundary and is not + * page aligned and of length some multiple of page size. + * + * Test to see if the read is for a metadata entry that + * is sub-allocated from a larger space allocation. + * + * Note that the following test may have to be + * adjusted. + */ + mdc_client_id = H5C_get_curr_io_client_type(shared->cache); + + if ( ( mdc_client_id == (int)H5AC_EARRAY_DBLK_PAGE_ID ) || \ + ( mdc_client_id == (int)H5AC_FARRAY_DBLK_PAGE_ID ) ) { + + split_read = TRUE; + } + } + + if ( split_read ) { + + /* compute the base addresses and length of the prefix, + * body, and suffix of the read, where these terms are + * defined as follows: + * + * prefix: All bytes from addr to the first page address + * at or after addr. If addr == start_page_addr, + * the prefix is empty. + * + * body: All bytes from the first page address covered + * by the read up to but not including the last + * page address in the read. Note that the + * length of the body must be a multiple of the + * page size. If only one page address is + * included in the read, the body is empty. + * + * suffix: All bytes from the last page address in the + * read until the end of the read. If the + * read ends on a page boundary, the suffix is + * empty. + * + * Since we know that the read crosses at least one + * page boundary, and we have aleady filtered out the + * body only case, at least two of the above must be + * non-empty. 
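+                 *
+                 * For example (illustrative numbers only): with a
+                 * 4 KiB page size, a read of 10000 bytes at address
+                 * 6000 spans pages 1 - 3, and decomposes as:
+                 *
+                 *   prefix: [ 6000,  8192) -- 2192 bytes
+                 *   body:   [ 8192, 12288) -- 4096 bytes
+                 *   suffix: [12288, 16000) -- 3712 bytes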
+ */ + + second_page = start_page + 1; + second_page_addr = + (haddr_t)(second_page * pb_ptr->page_size); + + if ( addr > start_page_addr ) { /* prefix exists */ + + prefix_addr = addr; + prefix_size = (size_t)(second_page_addr - addr); + + HDassert(prefix_addr > start_page_addr); + HDassert(prefix_size < pb_ptr->page_size); + HDassert(((size_t)(addr - start_page_addr) + \ + prefix_size) == pb_ptr->page_size); + } + + if ( size - prefix_size >= pb_ptr->page_size ) { + + /* body exists */ + + if ( addr == start_page_addr ) { + + body_page = start_page; + body_addr = start_page_addr; + + } else { + + body_page = second_page; + body_addr = second_page_addr; + } + + if ( end_addr < end_page_addr + + (haddr_t)(pb_ptr->page_size - 1) ) { + + /* suffix exists */ + body_size = (size_t)(end_page - body_page) * + pb_ptr->page_size; + + } else { + + /* suffix is empty */ + body_size = (size_t)(end_page - body_page + 1) * + pb_ptr->page_size; + } + + HDassert((body_page == start_page) || \ + (body_page == start_page + 1)); + + HDassert(body_addr == \ + (haddr_t)(body_page * pb_ptr->page_size)); + + HDassert(body_size < size); + HDassert(body_size >= pb_ptr->page_size); + + + HDassert(body_addr == \ + addr + (haddr_t)prefix_size); + HDassert((body_addr + (haddr_t)body_size) \ + <= (end_addr + 1)); + } - if (H5FD_read(shared->lf, type, addr, size, buf) < 0) { - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, - "read through lower VFD failed"); + if ( end_addr < end_page_addr + + (haddr_t)(pb_ptr->page_size - 1) ) { + + suffix_addr = end_page_addr; + suffix_size = (end_addr + 1) - end_page_addr; + + HDassert(suffix_addr == \ + addr + (haddr_t)(prefix_size + body_size)); + } + + HDassert(size == prefix_size + body_size + suffix_size); + } + } } + } + +#ifdef H5_HAVE_PARALLEL + /* at present, the page buffer must be disabled in the parallel case. + * However, just in case ... + */ + if ( H5F_SHARED_HAS_FEATURE(shared, H5FD_FEAT_HAS_MPI) ) { + + bypass_pb = TRUE; + + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + + + if ( bypass_pb ) { /* cases 1, 2. and 5 */ + + if ( H5FD_read(shared->lf, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "read through failed") + + /* Update statistics */ + if ( pb_ptr ) { - if (pb_ptr != NULL) H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size); - HGOTO_DONE(SUCCEED); - } + } + } else { - if (H5FD_MEM_DRAW == type) { /* cases 3 and 4 */ - if (H5PB__read_raw(shared, type, addr, size, buf) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "raw read failed"); - } else if (metadata_multipart_read(shared, type, addr, size, buf) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "meta read failed"); + if ( H5FD_MEM_DRAW == type ) { /* cases 3 and 4 */ - H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); + if ( H5PB__read_raw(shared, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_raw() failed") + + } else if ( split_read ) { + + /* handle the sub-allocated entry case */ + + /* read prefix if it exists */ + if ( prefix_size > 0 ) { + + if ( H5PB__read_meta(shared, type, prefix_addr, + prefix_size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed on prefix") + } + + /* read body -- if it exists. */ + if ( body_size > 0 ) { + + if ( H5PB__read_meta(shared, type, body_addr, body_size, + (void *)((uint8_t *)buf + + prefix_size)) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed on body") + } + + /* read suffix -- if it exists. 
*/ + if ( suffix_size > 0 ) { + + if ( H5PB__read_meta(shared, type, suffix_addr, suffix_size, + (void *)((uint8_t *)buf + prefix_size + + body_size)) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed on suffix") + } + + H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr) + + } else { /* pass to H5PB_read_meta() -- cases 6, 7, 8, 9, & 10 */ + + if ( H5PB__read_meta(shared, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed") + } + } done: + FUNC_LEAVE_NOAPI(ret_value) -} + +} /* H5PB_read() */ /* Remove the entry corresponding to lower-file page number `page`. * Return 0 if there was no such entry or if the entry was removed @@ -1198,12 +1526,16 @@ herr_t H5PB_remove_entry(H5F_shared_t *shared, haddr_t addr) { uint64_t page; - H5PB_t *pb_ptr; + H5PB_t *pb_ptr = NULL; H5PB_entry_t *entry_ptr = NULL; - herr_t ret_value = SUCCEED; + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) + /* Sanity checks */ + HDassert(shared); + HDassert(shared->pb_ptr); + pb_ptr = shared->pb_ptr; /* Calculate the page offset */ @@ -1263,50 +1595,169 @@ done: } /* H5PB_remove_entry */ + +/*------------------------------------------------------------------------- + * + * Function: H5PB_remove_entries + * + * Purpose: Remove entries in the page buffer associated with a + * newly freed multi-page block of file space. + * + * There are several possible situations here. + * + * In the context of metadata, there are two possible cases. + * + * 1) The block of file space is associated with a metadata + * entry. + * + * In regular operating mode, this entry will not be + * cached in the page buffer, so there should be nothing + * to do. + * + * In VFD SWMR mode, the entry may be cached in a single + * multi-page entry. + * + * 2) The block of file space has been sub-allocated + * into multiple metadata entries (i.e. fixed and extensible + * array). In this case, the individual entries may cross + * boundaries without being page aligned -- however, for + * purposes of the page buffer, I/O requests on these + * entries will have been broken up into requests that + * either do not cross page boundaries or are page aligned. + * + * In the context of raw data, the page buffer may or may + * not contain regular entries scattered over the space + * touched by the newly freed file space. + * + * In all contexts, there is no guarantee that the page buffer + * will contain any of the possible entries. + * + * Space allocations larger than one page must be page alligned. + * Further, any space between the end of a multi-page allocation + * and the next page boundary will remain un-allocated until after + * the original allocation is freed. This implies that: + * + * 1) The address passed into this call must be page aligned. + * + * 2) The page buffer may safely discard any page that + * intersects with the newly freed file space allocation. + * + * The bottom line here is that we must scan the page buffer + * index, and discard all entries that intersect the supplied + * address and length. As a sanity check, we must verify that + * any such entries don't overlap. + * + * Also, in the context of the VFD SWMR write, it is possible + * that the discarded pages will reside in the tick list or + * the delayed write list -- if so, they must be removed + * prior to eviction. + * + * Note: + * + * This function scans the page buffer hash table to + * find entries to remove. 
While this is normally
+ *              pretty inexpensive, a very large (i.e. GB) file
+ *              space free may impose significant cost.
+ *
+ *              As best I understand it, such frees are rare, so
+ *              the current solution should be good enough for now.
+ *              However, if we determine that the current solution
+ *              is too expensive, two alternate solutions come to mind.
+ *
+ *              a) Scan the index list instead of the hash table
+ *                 if the free is sufficiently large.  Also, skip
+ *                 the scan entirely if the page buffer doesn't
+ *                 contain any pages of the appropriate type.
+ *
+ *              b) Whenever writing a large metadata entry, scan for
+ *                 intersecting entries and delete them.  (Potential
+ *                 issues with fixed and variable array entries are
+ *                 dealt with via the splitting mechanism.)  In this
+ *                 case we would also have to simply ignore writes
+ *                 beyond EOA on flush or close.
+ *
+ *              Note that we already scan for intersecting entries
+ *              on large raw data writes -- with possible performance
+ *              issues for large writes.
+ *
+ *                                            JRM -- 4/25/20
+ *
+ * Return:      Non-negative on success/Negative on failure
+ *
+ * Programmer:  John Mainzer -- 4/25/20
+ *
+ * Changes:     None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
 herr_t
 H5PB_remove_entries(H5F_shared_t *shared, haddr_t addr, hsize_t size)
 {
-    H5PB_t *pb_ptr;
-    H5PB_entry_t *entry_ptr;
-    herr_t ret_value = SUCCEED;
-    metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}};
-    int i;
+    uint64_t i;
+    uint64_t start_page;
+    uint64_t end_page;
+    int64_t entry_pages = 0;
+    hsize_t entry_size;
+    H5PB_t *pb_ptr = NULL;
+    H5PB_entry_t *entry_ptr = NULL;
+    herr_t ret_value = SUCCEED;    /* Return value */

     FUNC_ENTER_NOAPI(FAIL)

+    /* Sanity checks */
+    HDassert(shared);
+    HDassert(shared->pb_ptr);
+
     pb_ptr = shared->pb_ptr;

-    HDassert(addr % pb_ptr->page_size == 0);
+    /* Calculate the start_page offset */
+    start_page = (addr / pb_ptr->page_size);

-    if (size > pb_ptr->page_size) {
-        hlog_fast(pbrm,
-            "removing multipage region [%" PRIuHADDR ", %" PRIuHADDR ")",
-            addr, addr + size);
-    }
+    HDassert(addr == start_page * pb_ptr->page_size);

-    metadata_section_split(pb_ptr->page_size, addr, size, NULL, section);
+    /* Calculate the end_page offset */
+    end_page = ((addr + (haddr_t)(size - 1)) / pb_ptr->page_size);

-    for (i = 0; i < 3; i++) {
-        metadata_section_t *iter = &section[i];
+    HDassert(start_page <= end_page);
+    HDassert(((end_page - start_page) * pb_ptr->page_size) <= size);
+    HDassert(size <= ((end_page - start_page + 1) * pb_ptr->page_size));
+
+    for ( i = start_page; i <= end_page; i++ )
+    {
+        /* test to see if page i exists */
+        H5PB__SEARCH_INDEX(pb_ptr, i, entry_ptr, FAIL)

-        if (iter->len == 0)
-            continue;
+        if ( entry_ptr ) {

-        if (iter->len < size) {
-            hlog_fast(pbrm, "removing entry [%" PRIuHADDR ", %" PRIuHADDR ") "
-                "for split region [%" PRIuHADDR ", %" PRIuHADDR ")",
-                iter->addr, iter->addr + iter->len, addr, addr + size);
-        }
+            /* verify that this entry doesn't overlap with a previously
+             * visited entry.
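+             *
+             * (entry_pages is set to the number of pages spanned by
+             * the most recently found entry, and is decremented once
+             * per page index examined -- thus it can only be positive
+             * here if page i still lies within the extent of that
+             * entry.)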
+ */ + HDassert(entry_pages <= 0); - assert(iter->addr % pb_ptr->page_size == 0); + entry_size = entry_ptr->size; + entry_pages = (int64_t)(entry_size / pb_ptr->page_size); - if (H5PB_remove_entry(shared, iter->addr) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "forced eviction failed") + if ( (uint64_t)entry_pages * pb_ptr->page_size < entry_size ) { + + entry_pages++; + } + + /* remove the entry */ + if ( H5PB_remove_entry(shared, entry_ptr->addr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "H5PB_remove_entry() failed") + + } + entry_pages--; } done: + FUNC_LEAVE_NOAPI(ret_value) -} + +} /* H5PB_remove_entries() */ /*------------------------------------------------------------------------- @@ -1706,9 +2157,9 @@ done: * *------------------------------------------------------------------------- */ -herr_t -H5PB_vfd_swmr__update_index(H5F_t *f, - uint32_t * idx_ent_added_ptr, +herr_t +H5PB_vfd_swmr__update_index(H5F_t *f, + uint32_t * idx_ent_added_ptr, uint32_t * idx_ent_modified_ptr, uint32_t * idx_ent_not_in_tl_ptr, uint32_t * idx_ent_not_in_tl_flushed_ptr) @@ -1734,7 +2185,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f, idx = shared->mdf_idx; HDassert(idx); - + pb_ptr = shared->pb_ptr; HDassert(pb_ptr); @@ -1763,7 +2214,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f, if ( ie_ptr == NULL ) { /* alloc new entry in the metadata file index*/ uint32_t new_index_entry_index; - new_index_entry_index = shared->mdf_idx_entries_used + + new_index_entry_index = shared->mdf_idx_entries_used + idx_ent_added++; if (new_index_entry_index >= shared->mdf_idx_len && @@ -1816,7 +2267,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f, ie_ptr->tick_of_last_flush = 0; } - /* scan the metadata file index for entries that don't appear in the + /* scan the metadata file index for entries that don't appear in the * tick list. If the index entry is dirty, and either doesn't appear * in the page buffer, or is clean in the page buffer, mark the index * entry clean and as having been flushed in the current tick. @@ -1848,7 +2299,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f, } } - HDassert(idx_ent_modified + idx_ent_not_in_tl == + HDassert(idx_ent_modified + idx_ent_not_in_tl == shared->mdf_idx_entries_used); HDassert(idx_ent_modified + idx_ent_not_in_tl + idx_ent_added <= @@ -1860,8 +2311,10 @@ H5PB_vfd_swmr__update_index(H5F_t *f, *idx_ent_not_in_tl_flushed_ptr = idx_ent_not_in_tl_flushed; done: + FUNC_LEAVE_NOAPI(ret_value) -} + +} /* H5PB_vfd_swmr__update_index() */ /*------------------------------------------------------------------------- @@ -1876,9 +2329,10 @@ done: * * 2) If the write is raw data, and the page buffer is * configured for metadata only (i.e. min_md_pages == - * max_pages), simply write to the HDF5 file and return. + * max_pages), or if the page buffer is operating in + * vfd_swmr mode, simply write to the HDF5 file and return. * - * 3) If the write is raw data, and it of page size or + * 3) If the write is raw data, and is of page size or * larger, write directly from the HDF5 file. * * It is possible that the write intersects one or more @@ -1898,13 +2352,68 @@ done: * configured for raw data only (i.e. min_rd_pages == * max_pages), simply write to the HDF5 file and return. * + * The free space manager guarantees that allocations larger + * than one page will be page alligned, and that allocations + * of size less than or equal to page size will not cross page + * boundaries. Further, unlike raw data, metadata is always + * written and read atomically. 
+ * + * In principle, this should make it easy to discriminate + * between small and multi-page metadata entries so that + * pages containing the former will be buffered and the + * latter be written directly to file. + * + * Unfortunately, there is a fly in the ointment. + * + * The fixed and extensible array on disk data + * structures allocate multiple metadata cache entries in + * a single block, and use this fact to make the addresses + * of all but the first entry in the block computable. While + * this simplifies the fixed and extensible array on disk data + * structures, it complicates the metadata cache and the page + * buffer. + * + * From the page buffer perspective, it breaks the invarient + * that metadata entries of less than page size don't cross + * page boundaries, and those of size greater than or equal + * to page size start on page boundaries -- which is important + * for VFD SWMR as it allows efficient management of multi-page + * metadata entries. + * + * While it is tempting to repair the fixed and extensible + * array data structures so as to remove this irregularity, + * and remove the resulting complexity from both the metadata + * cache and the page buffer, this is a ticklish task, as there + * are already files in the wild that use the existing versions + * of these data structures. Thus, due to resource constraints, + * we have to program around the issue for now. + * + * Fortunately, for purposes of the page buffer, this is + * relatively easy -- when we encounter a metadata write + * that crosses one or more page boundaries, and is not + * both page aligned and an integral number of pages, we + * query the metadata cache to determine the type of the + * client whose data is being writtne. If it is one of the + * mis-behaving types, we split it into two or three writes + * such that each write either doesn't cross page boundaries, + * or is page aligned and an integral number of pages. + * + * This is done in this function, and is not reflected in + * the case analysis in the rest of this comment. + * * 6) If the write is of metadata, the write is larger than - * one page, and vfd_swmr_writer is FALSE, simply read - * from the HDF5 file. There is no need to check the + * one page, and vfd_swmr_writer is FALSE, simply write + * to the HDF5 file. There is no need to check the * page buffer, as metadata is always read atomically, * and entries of this size are not buffered in the page * buffer. * + * Observe that this write must be page aligned. This + * should be enforced by the free space manager, but + * for now it is enforced by the above mentioned practice + * of splitting writes from cache client that don't + * allocate each entry separately. + * * 7) If the write is of metadata, the write is larger than * one page, and vfd_swmr_writer is TRUE, the write must * buffered in the page buffer until the end of the tick. @@ -1937,7 +2446,17 @@ done: * * Programmer: John Mainzer -- 10/11/18 * - * Changes: None. + * Changes: Updated to support splitting of metadata writes that + * are not page aligned and cross page boundaries into + * 2 or 3 writes that are either page aligned or do not + * cross page boundaries. Full details in the header + * comment above, that has been updated to document + * this change. + * + * Also updated case 2 to bypass the page buffer for raw + * data writes in vfd swmr mode. 
+ * + * JRM -- 4/5/20 * *------------------------------------------------------------------------- */ @@ -1945,10 +2464,19 @@ herr_t H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, const void *buf) { - H5PB_t *pb_ptr; /* Page buffer for this file */ + H5PB_t *pb_ptr; /* Page buffer for this file */ hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */ + hbool_t split_write = FALSE; /* whether md write must be split */ herr_t ret_value = SUCCEED; /* Return value */ + /* the following six fields are defined iff split_write is TRUE */ + haddr_t prefix_addr = HADDR_UNDEF; /* addr of prefix -- if defined */ + haddr_t body_addr = HADDR_UNDEF; /* addr of body -- if defined */ + haddr_t suffix_addr = HADDR_UNDEF; /* addr of suffix -- if defined */ + size_t prefix_size = 0; /* size of prefix */ + size_t body_size = 0; /* size of body */ + size_t suffix_size = 0; /* size of suffix */ + FUNC_ENTER_NOAPI(FAIL) hlog_fast(pbwr, "%s %p type %d %" PRIuHADDR " size %zu", @@ -1966,7 +2494,8 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, if ( H5FD_MEM_DRAW == type ) { /* raw data write */ - if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) { + if ( ( pb_ptr->min_md_pages == pb_ptr->max_pages ) || + ( pb_ptr->vfd_swmr ) ) { /* case 2) -- page buffer configured for metadata only */ bypass_pb = TRUE; @@ -1979,13 +2508,207 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, /* case 5) -- page buffer configured for raw data only */ bypass_pb = TRUE; - } else if ( ( size >= pb_ptr->page_size ) && - ( ! ( pb_ptr->vfd_swmr_writer ) ) ) { + } else { - /* case 6) -- md read larger than one page and - * pb_ptr->vfd_swmr_writer is FALSE. + /* determine whether the write request must be split, + * and if so, compute the start points and sizes of + * of the sections. + * + * Note: The following code is almost identical to the + * similar code in H5PB_read(). Thus, on the surface, + * it is an obvious candidate for refactoring into a + * function or macro. + * + * However, there are subtle differences between + * the two pieces of code which are driven by the + * possibility of speculative reads. + * + * More to the point, further changes may be necessary. + * Thus we should wait on refactoring until this code has + * been in daily use for some time, and it is clear + * that further changes are unlikely. 
 */
-            bypass_pb = TRUE;
+            int mdc_client_id = -1; /* id of mdc client, or -1 if undef */
+            uint64_t start_page;    /* page index of first page in write */
+            uint64_t second_page;   /* page index of second page in write */
+            uint64_t end_page;      /* page index of last page in write */
+            uint64_t body_page;     /* page index of start of body */
+            haddr_t start_page_addr; /* addr of first page in write */
+            haddr_t second_page_addr;/* addr of second page in write */
+            haddr_t end_page_addr;  /* addr of last page in write */
+            haddr_t end_addr;       /* addr of last byte in write */
+
+            /* Calculate the aligned address of the first page */
+            start_page = (addr / pb_ptr->page_size);
+            start_page_addr = start_page * pb_ptr->page_size;
+
+            /* Calculate the aligned address of the last page */
+            end_addr = addr + (haddr_t)(size - 1);
+            end_page = end_addr / (haddr_t)(pb_ptr->page_size);
+            end_page_addr = end_page * pb_ptr->page_size;
+
+            HDassert(start_page_addr <= addr);
+            HDassert(addr < start_page_addr + (haddr_t)(pb_ptr->page_size));
+
+            HDassert(start_page <= end_page);
+            HDassert(end_page_addr <= ((addr + (haddr_t)size - 1)));
+            HDassert((addr + (haddr_t)size - 1) <
+                     (end_page_addr + pb_ptr->page_size));
+
+            /* test to see if the write crosses a page boundary, and
+             * does not start on a page boundary, and is not of an
+             * integral number of pages.
+             */
+            if ( ( start_page < end_page ) &&
+                 ( ! ( ( addr == start_page_addr ) &&
+                       ( end_page_addr + (haddr_t)(pb_ptr->page_size) ==
+                         end_addr + 1 ) ) ) ) {
+
+                /* the write crosses a page boundary and is not
+                 * page aligned and of length some multiple of page size.
+                 *
+                 * Test to see if the write is for a metadata entry that
+                 * is sub-allocated from a larger space allocation.
+                 *
+                 * Note that the following test may have to be
+                 * adjusted.
+                 */
+                mdc_client_id = H5C_get_curr_io_client_type(shared->cache);
+
+                if ( ( mdc_client_id == (int)H5AC_EARRAY_DBLK_PAGE_ID ) || \
+                     ( mdc_client_id == (int)H5AC_FARRAY_DBLK_PAGE_ID ) ) {
+
+                    split_write = TRUE;
+
+                } else {
+
+                    HDassert(addr == start_page_addr);
+                    HDassert(size > pb_ptr->page_size);
+
+                    if ( ! pb_ptr->vfd_swmr_writer ) {
+
+                        /* case 6) -- multi-page entry with fixed /
+                         * extensible array filtered out, and no
+                         * VFD SWMR.
+                         */
+                        bypass_pb = TRUE;
+                    }
+                }
+            } else if ( ( size > pb_ptr->page_size ) &&
+                        ( ! pb_ptr->vfd_swmr_writer ) ) {
+
+                /* write is larger than page size and we are not
+                 * in VFD SWMR mode -- bypass the page buffer.
+                 * This is also case 6.  We catch it here as
+                 * the code to determine whether to split only
+                 * looks at I/O requests that cross page boundaries
+                 * and are not both page aligned and an integral
+                 * number of pages in length.
+                 */
+                HDassert(start_page_addr == addr);
+                bypass_pb = TRUE;
+            }
+
+            if ( split_write ) {
+
+                /* compute the base addresses and length of the prefix,
+                 * body, and suffix of the write, where these terms are
+                 * defined as follows:
+                 *
+                 * prefix: All bytes from addr to the first page address
+                 *         at or after addr.  If addr == start_page_addr,
+                 *         the prefix is empty.
+                 *
+                 * body:   All bytes from the first page address covered
+                 *         by the write up to but not including the last
+                 *         page address in the write.  Note that the
+                 *         length of the body must be a multiple of the
+                 *         page size.  If only one page address is
+                 *         included in the write, the body is empty.
+                 *
+                 * suffix: All bytes from the last page address in the
+                 *         write until the end of the write.  If the
+                 *         write ends on a page boundary, the suffix is
+                 *         empty.
+                 *
+                 * Since we know that the write crosses at least one
+                 * page boundary, and we have already filtered out the
+                 * body only case, at least two of the above must be
+                 * non-empty.
+                 */
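The prefix / body / suffix geometry defined above can be stated compactly. The sketch below is ours, not the patch's; it assumes the caller has already verified that the request crosses at least one page boundary, and it returns the three sections in file order:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef struct io_section {
    uint64_t addr;      /* start of the section in the file */
    size_t   len;       /* length of the section; 0 => empty */
} io_section_t;

static void
split_io(uint64_t addr, size_t size, size_t page_size,
         io_section_t *prefix, io_section_t *body, io_section_t *suffix)
{
    uint64_t end = addr + size;                     /* one past last byte */
    uint64_t first_boundary =                       /* first boundary >= addr */
        ((addr + page_size - 1) / page_size) * page_size;
    uint64_t last_boundary =                        /* last boundary <= end */
        (end / page_size) * page_size;

    /* precondition: the request crosses at least one page boundary */
    assert(addr / page_size < (end - 1) / page_size);

    prefix->addr = addr;
    prefix->len  = (size_t)(first_boundary - addr); /* 0 if addr aligned */

    body->addr = first_boundary;                    /* whole pages only */
    body->len  = (size_t)(last_boundary - first_boundary);

    suffix->addr = last_boundary;
    suffix->len  = (size_t)(end - last_boundary);   /* 0 if end aligned */

    assert(prefix->len + body->len + suffix->len == size);
}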
+
+                second_page = start_page + 1;
+                second_page_addr =
+                        (haddr_t)(second_page * pb_ptr->page_size);
+
+                if ( addr > start_page_addr ) { /* prefix exists */
+
+                    prefix_addr = addr;
+                    prefix_size = (size_t)(second_page_addr - addr);
+
+                    HDassert(prefix_addr > start_page_addr);
+                    HDassert(prefix_size < pb_ptr->page_size);
+                    HDassert(((size_t)(addr - start_page_addr) + \
+                              prefix_size) == pb_ptr->page_size);
+                }
+
+                if ( size - prefix_size >= pb_ptr->page_size ) {
+
+                    /* body exists */
+
+                    if ( addr == start_page_addr ) {
+
+                        body_page = start_page;
+                        body_addr = start_page_addr;
+
+                    } else {
+
+                        body_page = second_page;
+                        body_addr = second_page_addr;
+                    }
+
+                    if ( end_addr <
+                         end_page_addr + (haddr_t)(pb_ptr->page_size - 1) ) {
+
+                        /* suffix exists */
+                        body_size = (size_t)(end_page - body_page) *
+                                    pb_ptr->page_size;
+
+                    } else {
+
+                        /* suffix is empty */
+                        body_size = (size_t)(end_page - body_page + 1) *
+                                    pb_ptr->page_size;
+                    }
+
+                    HDassert((body_page == start_page) || \
+                             (body_page == start_page + 1));
+
+                    HDassert(body_addr == \
+                             (haddr_t)(body_page * pb_ptr->page_size));
+
+                    HDassert(body_size < size);
+                    HDassert(body_size >= pb_ptr->page_size);
+
+                    HDassert(body_addr == addr + (haddr_t)prefix_size);
+                    HDassert((body_addr + (haddr_t)body_size) \
+                             <= (end_addr + 1));
+                }
+
+                if ( end_addr <
+                     end_page_addr + (haddr_t)(pb_ptr->page_size - 1) ) {
+
+                    suffix_addr = end_page_addr;
+                    suffix_size = (end_addr + 1) - end_page_addr;
+
+                    HDassert(suffix_addr == \
+                             addr + (haddr_t)(prefix_size + body_size));
+                }
+
+                HDassert(size == prefix_size + body_size + suffix_size);
+            }
+        }
+    }
 
@@ -2001,6 +2724,7 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
     } /* end if */
 #endif /* H5_HAVE_PARALLEL */
 
+
     if ( bypass_pb ) { /* cases 1, 2. 5, and 6 */
 
         if ( H5FD_write(shared->lf, type, addr, size, buf) < 0 )
@@ -2022,15 +2746,84 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
             HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
                         "H5PB_read_raw() failed")
 
+    } else if ( split_write ) {
+
+        /* handle the sub-allocated entry case */
+
+        /* write prefix if it exists */
+        if ( prefix_size > 0 ) {
+
+            if ( H5PB__write_meta(shared, type, addr,
+                                  prefix_size, buf) < 0 )
+
+                HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+                            "H5PB__write_meta() failed on prefix")
+        }
+
+        /* write the body if it exists */
+        if ( body_size > 0 ) {
+
+            /* The "body_size == pb_ptr->page_size" clause in the
+             * following if is required since in normal operating
+             * mode, the page buffer buffers metadata I/O
+             * requests of page size or less.
+             *
+             * Thus this clause ensures that a single page body
+             * does not bypass the page buffer, creating the potential
+             * for an older version to shadow the most recent version.
+             *
+             * Note: The page buffer really shouldn't buffer page
+             *       aligned single page metadata I/O requests, as it
+             *       creates extra overhead to no purpose.  However,
+             *       fixing this is a bit tricky, and the case doesn't
+             *       appear to be common.  Thus, while it should be
+             *       fixed, I don't think it is urgent.
+ * + * JRM 4/19/20 + */ + if ( ( pb_ptr->vfd_swmr ) || + ( body_size == pb_ptr->page_size ) ) { + + if ( H5PB__write_meta(shared, type, body_addr, body_size, + (const void *)((const uint8_t *)buf + + prefix_size)) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "H5PB__write_meta() failed on body") + + } else { + + if ( H5FD_write(shared->lf, type, body_addr, body_size, + (const void *)((const uint8_t *)buf + + prefix_size)) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "write through of body failed") + + H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size); + } + } + + /* write the suffix if it exists */ + if ( suffix_size > 0 ) { + + if ( H5PB__write_meta(shared, type, suffix_addr, suffix_size, + (const void *)((const uint8_t *)buf + + prefix_size + body_size)) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "H5PB_write_meta() failed on suffix") + } + + H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr) + } else { /* cases 7, and 8 */ - if ( metadata_multipart_write(shared, type, addr, size, buf) < 0 ) + if ( H5PB__write_meta(shared, type, addr, size, buf) < 0 ) HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ - "H5PB_read_meta() failed") + "H5PB_write_meta() failed") } - - H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); } done: @@ -3024,118 +3817,6 @@ done: } /* H5PB__mark_entry_dirty() */ -static void -metadata_section_split(size_t pgsz, haddr_t addr, size_t len, const void *_buf, - metadata_section_t *section) -{ - int i; - size_t totlen = 0; - haddr_t whole_pgaddr, tail_pgaddr; - const char *buf = _buf; - metadata_section_t *head = §ion[0], *middle = §ion[1], - *tail = §ion[2]; - - /* Try to find the address of the first whole page, and the address of - * the page after the last whole page. - */ - whole_pgaddr = roundup(addr, pgsz); - tail_pgaddr = rounddown(addr + len, pgsz); - - /* In the degenerate case where the first whole page is "after" the last, - * actually the entire access lands between page boundaries. - */ - if (whole_pgaddr > tail_pgaddr) { - assert(len < pgsz); - head->addr = addr; - head->len = len; - head->buf = buf; - return; - } - - /* `head` spans any range beginning before the first page boundary. */ - if (addr < whole_pgaddr) { - head->buf = buf; - head->len = pgsz - addr % pgsz; - head->addr = addr; - } - - /* `middle` spans one or more whole pages in between the end of - * `head` and before the beginning of `tail`. - */ - if (whole_pgaddr < tail_pgaddr) { - middle->buf = (buf == NULL) ? NULL : &buf[whole_pgaddr - addr]; - middle->len = tail_pgaddr - whole_pgaddr; - middle->addr = whole_pgaddr; - } - - /* `tail` spans residual bytes that follow the last page boundary. */ - if (tail_pgaddr < addr + len) { - tail->len = (addr + len) - tail_pgaddr; - tail->buf = (buf == NULL) ? NULL : &buf[tail_pgaddr - addr]; - tail->addr = tail_pgaddr; - } - - for (i = 0; i < 3; i++) { - metadata_section_t *iter = §ion[i]; - if (iter->len == 0) - continue; - assert(iter->addr == addr + totlen); - assert(iter->buf == ((buf == NULL) ? 
NULL : &buf[totlen])); -// assert(i == 0 || iter[-1].buf + iter[-1].len == iter->buf); - totlen += iter->len; - } - - assert(totlen == len); -} - -static herr_t -metadata_multipart_read(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, - size_t len, void *_buf/*out*/) -{ - herr_t rc; - int i; - const size_t pgsz = shared->pb_ptr->page_size; - metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}}; - - metadata_section_split(pgsz, addr, len, _buf, section); - - for (i = 0; i < 3; i++) { - metadata_section_t *iter = §ion[i]; - if (iter->buf == NULL) - continue; - rc = H5PB__read_meta(shared, type, iter->addr, iter->len, - (void *)(uintptr_t)iter->buf); - if (rc < 0) - return rc; - } - - return SUCCEED; -} - -static herr_t -metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type, - haddr_t addr, size_t len, const void *_buf/*out*/) -{ - herr_t rc; - int i; - const size_t pgsz = shared->pb_ptr->page_size; - metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}}; - - metadata_section_split(pgsz, addr, len, _buf, section); - - for (i = 0; i < 3; i++) { - metadata_section_t *iter = §ion[i]; - - if (iter->buf == NULL) - continue; - rc = H5PB__write_meta(shared, type, iter->addr, iter->len, iter->buf); - if (rc < 0) - return rc; - } - - return SUCCEED; -} - /*------------------------------------------------------------------------- * @@ -3151,21 +3832,25 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type, * existing page, it must not be a multi-page metadata * entry. It it is, flag an error. * + * Recall that by the time we get to this function, + * un-aligned page reads from the fixed and variable + * length array structures that cross page boundaries + * have already been split into two or three reads + * that conform to the usual pattern of metadata reads. + * * 7) If the read is for metadata, is page aligned, is larger * than one page, and there is no entry in the page buffer, * satisfy the read from the file * * 8) If the read is for metadata, is page aligned, is larger * than one page, and there is a regular entry at the target - * page address, test to see if the last read was for the - * same address. + * page address, test to see if the read is speculative. * - * If was, evict the page, and satisfy the read from file. - * Flag an error if the page was dirty. + * If it is not, evict the page, and satisfy the read from + * file. Flag an error if the page was dirty. * - * If the last read was for a different page, clip the read - * to one page, and satisfy the read from the existing - * regular entry. + * If it is, clip the read to one page, and satisfy the + * read from the existing regular entry. 
* * 9) If the read is for metadata, is page aligned, is larger * than one page, and there is a multi-page metadata entry @@ -3197,7 +3882,7 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type, * * P/A == page aligned * size > PL == size > page length - * PA == previous address + * Spec == speculative read * A == current address * * In the entry exists column: @@ -3207,7 +3892,7 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type, * MPMDE == multi-page metadata entry * * | size | entry | VFD | | - * P/A: | > PL | exists | SWMR | PA == A | Comments: + * P/A: | > PL | exists | SWMR | Spec | Comments: * ------+------+--------+------+---------+------------------------------------- * N | X | N || R | X | X | Clip read to page boundary if * | | | | | necessary @@ -3220,10 +3905,10 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type, * ------+------+--------+------+---------+------------------------------------- * Y | Y | N | X | X | Satisfy read from file (case 7) * ------+------+--------+------+---------+------------------------------------- - * Y | Y | R | X | N | Clip read to page boundary + * Y | Y | R | X | Y | Clip read to page boundary * | | | | | Satisfy read from entry (case 8) * ------+------+--------+------+---------+------------------------------------- - * Y | Y | R | X | Y | Evict entry + * Y | Y | R | X | N | Evict entry * | | | | | (must be clean -- flag error if not) * | | | | | Satisfy read from file (case 8) * ------+------+--------+------+---------+------------------------------------- @@ -3261,20 +3946,25 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type, * * Programmer: John Mainzer -- 10/11/18 * - * Changes: None. + * Changes: Updated to use the speculative read hint from the + * metadata cache, and remove the static variable + * containing the base address of the last read. 
+ * + * JRM -- 4/5/20 * *------------------------------------------------------------------------- */ static herr_t -H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, - void *buf/*out*/) +H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, + size_t size, void *buf/*out*/) { + hbool_t bypass = FALSE; /* flag indicating PB bypassed */ + hbool_t speculative = FALSE; /* speculative read hint from mdc */ H5PB_t *pb_ptr; /* Page buffer for this file */ H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */ H5FD_t *file; /* File driver pointer */ uint64_t page; /* page offset of addr */ haddr_t page_addr; /* page containing addr */ - static haddr_t prev_addr = HADDR_UNDEF; /* addr of last call */ size_t offset; /* offset of read in page */ size_t clipped_size; /* possibley clipped size */ herr_t ret_value = SUCCEED; /* Return value */ @@ -3333,7 +4023,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size TRUE, FALSE) if ( ( NULL == entry_ptr ) && - ( H5PB__load_page(shared, pb_ptr, page_addr, type, &entry_ptr) < 0 ) ) + ( H5PB__load_page(shared, pb_ptr, page_addr, + type, &entry_ptr) < 0 ) ) HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ "page buffer page load request failed (1)") @@ -3358,7 +4049,7 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size HDassert( page_addr == addr ); - if ( size >= pb_ptr->page_size ) { + if ( size > pb_ptr->page_size ) { /* search the page buffer for an entry at page */ H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL) @@ -3367,10 +4058,11 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size if ( entry_ptr == NULL ) { /* case 7 */ /* update hit rate stats */ - H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, FALSE, TRUE, size > pb_ptr->page_size) + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, FALSE, \ + TRUE, size > pb_ptr->page_size) - /* If the read is for metadata, is page aligned, is larger - * than one page, and there is no entry in the page buffer, + /* If the read is for metadata, is page aligned, is larger + * than page size, and there is no entry in the page buffer, * satisfy the read from the file */ if ( H5FD_read(file, type, addr, size, buf) < 0) @@ -3378,7 +4070,10 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ "driver read request failed (1)") + bypass = TRUE; + H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size); + } else { HDassert( entry_ptr ); @@ -3389,28 +4084,29 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size /* If the read is for metadata, is page aligned, is larger * than one page, and there is a regular entry at the target - * page address, test to see if the last read was for the - * same address. + * page address, test to see if the read is speculative. * - * If it was, evict the page, and satisfy the read from + * If it is not, evict the page, and satisfy the read from * file. Flag an error if the page was dirty. * - * If the last read was for a different page, clip the read - * to one page, and satisfy the read from the existing - * regular entry. + * If it is, clip the read to one page, and satisfy + * the read from the existing regular entry. */ HDassert( entry_ptr->size == pb_ptr->page_size ); - if ( addr == prev_addr ) { + speculative = H5C_get_curr_read_speculative(shared->cache); + + if ( ! 
speculative ) { - /* since this is a second try, don't update + /* since this is likely a second try, don't update * hit rate stats. */ HDassert( ! ( entry_ptr->is_dirty ) ); - if (H5PB__evict_entry(shared, entry_ptr, TRUE, false) < 0) + if ( H5PB__evict_entry(shared, entry_ptr, + TRUE, false) < 0 ) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ "forced eviction failed (1)") @@ -3419,7 +4115,9 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ "driver read request failed (2)") + bypass = TRUE; H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size); + } else { HDassert( entry_ptr->image_ptr ); @@ -3439,7 +4137,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size } /* update hit rate stats */ - H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, TRUE, TRUE, FALSE) + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, TRUE, \ + TRUE, FALSE) } } else { /* case 9 */ @@ -3509,7 +4208,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size TRUE, FALSE) if ( ( NULL == entry_ptr ) && - ( H5PB__load_page(shared, pb_ptr, page_addr, type, &entry_ptr) < 0)) + ( H5PB__load_page(shared, pb_ptr, page_addr, + type, &entry_ptr) < 0)) HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ "page buffer page load request failed (2)") @@ -3532,7 +4232,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size } } - prev_addr = addr; + if ( ! bypass ) + H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); done: @@ -3830,6 +4531,8 @@ H5PB__read_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, } } /* end else */ + H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); + done: FUNC_LEAVE_NOAPI(ret_value) @@ -4073,6 +4776,8 @@ H5PB__write_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, H5PB__INSERT_IN_TL(pb_ptr, entry_ptr, FAIL) } + H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); + done: FUNC_LEAVE_NOAPI(ret_value) @@ -4121,8 +4826,8 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, - const void *buf/*out*/) +H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, + size_t size, const void *buf/*out*/) { H5PB_t *pb_ptr; /* Page buffer for this file */ H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */ @@ -4372,6 +5077,8 @@ H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size } } + H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); + done: FUNC_LEAVE_NOAPI(ret_value) diff --git a/src/H5PBpkg.h b/src/H5PBpkg.h index 49911d6..1cfeb59 100644 --- a/src/H5PBpkg.h +++ b/src/H5PBpkg.h @@ -670,19 +670,19 @@ if ( ( (entry_ptr) == NULL ) || \ #define H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size) \ { \ - int i; \ + int ii; \ \ HDassert(pb_ptr); \ HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \ \ if ( H5FD_MEM_DRAW == (type) ) { \ - i = H5PB__STATS_RD; \ + ii = H5PB__STATS_RD; \ } else if ( (size) > (pb_ptr)->page_size ) { \ - i = H5PB__STATS_MPMDE; \ + ii = H5PB__STATS_MPMDE; \ } else { \ - i = H5PB__STATS_MD; \ + ii = H5PB__STATS_MD; \ } \ - ((pb_ptr)->accesses[i])++; \ + ((pb_ptr)->accesses[ii])++; \ } /* H5PB__UPDATE_STATS_FOR_ACCESS */ @@ -812,6 +812,20 @@ if ( ( (entry_ptr) == NULL ) || \ ((pb_ptr)->loads[i])++; \ } /* H5PB__UPDATE_STATS_FOR_LOAD */ +#define H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr) \ +{ \ + HDassert(pb_ptr); \ + HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \ 
+    ((pb_ptr)->md_read_splits)++;                                      \
+} /* H5PB__UPDATE_STATS_FOR_READ_SPLIT */
+
+#define H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr)                     \
+{                                                                      \
+    HDassert(pb_ptr);                                                  \
+    HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC);                   \
+    ((pb_ptr)->md_write_splits)++;                                     \
+} /* H5PB__UPDATE_STATS_FOR_WRITE_SPLIT */
+
 #else /* H5PB__COLLECT_PAGE_BUFFER_STATS */
 
 #define H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, hit, is_metadata, is_mpmde)
@@ -834,6 +848,8 @@ if ( ( (entry_ptr) == NULL ) || \
 #define H5PB__UPDATE_STATS_FOR_CLEAR(pb_ptr, entry_ptr)
 #define H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr)
 #define H5PB__UPDATE_STATS_FOR_LOAD(pb_ptr, entry_ptr)
+#define H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr)
+#define H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr)
 
 #endif /* H5PB__COLLECT_PAGE_BUFFER_STATS */
 
diff --git a/src/H5PBprivate.h b/src/H5PBprivate.h
index 6b879c7..32e681e 100644
--- a/src/H5PBprivate.h
+++ b/src/H5PBprivate.h
@@ -249,6 +249,9 @@ typedef struct H5PB_entry_t H5PB_entry_t;
 *
 * FIELDS SUPPORTING VFD SWMR:
 *
+ * If the file is opened in VFD SWMR mode (i.e. vfd_swmr == TRUE), all
+ * raw data I/O must be passed through to the HDF5 file.
+ *
 * If the file is opened as a VFD SWMR writer (i.e. vfd_swmr_writer == TRUE),
 * the page buffer must retain the data necessary to update the metadata
 * file at the end of each tick, and also delay writes as necessary so as
@@ -285,8 +288,12 @@ typedef struct H5PB_entry_t H5PB_entry_t;
 * The remainder of this sections contains discussions of the fields and
 * data structures used to support the above operations.
 *
+ * vfd_swmr:  Boolean flag that is set to TRUE iff the file is opened
+ *             in VFD SWMR mode -- either reader or writer.  This field
+ *             is used to exclude raw data from the page buffer.
+ *
 * vfd_swmr_writer:  Boolean flag that is set to TRUE iff the file is
- *             the file is opened in VFD SWMR mode.  The remaining
+ *             opened in VFD SWMR writer mode.  The remaining
 *             VFD SWMR fields are defined iff vfd_swmr_writer is TRUE.
 *
 * mpmde_count: int64_t containing the number of multi-page metadata
@@ -528,6 +535,16 @@ typedef struct H5PB_entry_t H5PB_entry_t;
 * total_dwl_ins_depth: int64_t containing the total insertion depth
 *             required to maintain the odering invarient on the
 *             delayed write list.
+ *
+ * md_read_splits: int64_t containing the number of metadata reads that
+ *             are split into two or three sub-reads to manage the
+ *             case in which a group of metadata cache clients
+ *             sub-allocate entries from a single file space allocation.
+ *
+ * md_write_splits: int64_t containing the number of metadata writes that
+ *             are split into two or three sub-writes to manage the
+ *             case in which a group of metadata cache clients
+ *             sub-allocate entries from a single file space allocation.
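The two new counters follow the existing page buffer statistics idiom: increment under H5PB__COLLECT_PAGE_BUFFER_STATS, compile to nothing otherwise. A standalone illustration of that compile-out pattern (our names, not HDF5's):

#include <assert.h>
#include <stdio.h>

#define COLLECT_STATS 1   /* analogous to H5PB__COLLECT_PAGE_BUFFER_STATS */

typedef struct stats {
    long md_read_splits;
    long md_write_splits;
} stats_t;

#if COLLECT_STATS
#define UPDATE_STATS_FOR_WRITE_SPLIT(s) do { \
    assert(s);                               \
    ((s)->md_write_splits)++;                \
} while (0)
#else
#define UPDATE_STATS_FOR_WRITE_SPLIT(s) /* compiled out */
#endif

int main(void) {
    stats_t s = {0, 0};
    UPDATE_STATS_FOR_WRITE_SPLIT(&s);
    printf("write splits: %ld\n", s.md_write_splits);
    return 0;
}

When the stats macro is disabled, the call site vanishes entirely, so the counters cost nothing in production builds.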
* ******************************************************************************/ @@ -578,6 +595,7 @@ typedef struct H5PB_t { /* Fields for VFD SWMR operations: */ + hbool_t vfd_swmr; hbool_t vfd_swmr_writer; int64_t mpmde_count; uint64_t cur_tick; @@ -645,6 +663,8 @@ typedef struct H5PB_t { int64_t max_dwl_len; int64_t max_dwl_size; int64_t total_dwl_ins_depth; + int64_t md_read_splits; + int64_t md_write_splits; } H5PB_t; @@ -670,6 +690,7 @@ H5_DLL herr_t H5PB_add_new_page(H5F_shared_t *, H5FD_mem_t, haddr_t); H5_DLL herr_t H5PB_update_entry(H5PB_t *, haddr_t, size_t, const void *); H5_DLL herr_t H5PB_remove_entry(H5F_shared_t *, haddr_t); + H5_DLL herr_t H5PB_remove_entries(H5F_shared_t *, haddr_t, hsize_t); H5_DLL herr_t H5PB_read(H5F_shared_t *, H5FD_mem_t, haddr_t, diff --git a/test/page_buffer.c b/test/page_buffer.c index 6b6de02..5da326e 100644 --- a/test/page_buffer.c +++ b/test/page_buffer.c @@ -24,6 +24,15 @@ #include "h5test.h" +/* + * This file needs to access private information from the H5C package. + * This file also needs to access the metadata cache testing code. + */ +#define H5C_FRIEND /*suppress error about including H5Cpkg */ +#define H5C_TESTING /*suppress warning about H5C testing funcs*/ +#include "H5Cpkg.h" /* Cache */ + + #include "H5CXprivate.h" /* API Contexts */ #include "H5Iprivate.h" #include "H5PBprivate.h" @@ -65,6 +74,12 @@ static unsigned test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, static unsigned test_lru_processing(hid_t orig_fapl, const char *env_h5_drvr); static unsigned test_min_threshold(hid_t orig_fapl, const char *env_h5_drvr); static unsigned test_stats_collection(hid_t orig_fapl, const char *env_h5_drvr); +static unsigned md_entry_splitting_smoke_check(hid_t orig_fapl, + const char *env_h5_drvr, bool); +static unsigned md_entry_splitting_boundary_test(hid_t orig_fapl, + const char *env_h5_drvr, bool); +static unsigned verify_page_buffering_disabled(hid_t orig_fapl, + const char *env_h5_drvr); #endif /* H5_HAVE_PARALLEL */ #define FILENAME "filepaged" @@ -333,7 +348,8 @@ error: HDfree(data); } H5E_END_TRY; return(1); -} + +} /* create_file() */ /*------------------------------------------------------------------------- @@ -488,7 +504,7 @@ set_multi_split(const char *env_h5_drvr, hid_t fapl, hsize_t pagesize) error: return 1; -} +} /* set_multi_split() */ #ifndef H5_HAVE_PARALLEL @@ -807,7 +823,8 @@ error: HDfree(odata); } H5E_END_TRY; return 1; -} + +} /* test_mpmde_delay_basic() */ /* @@ -1009,7 +1026,8 @@ error: HDfree(odata); } H5E_END_TRY; return 1; -} + +} /* test_spmde_lru_evict_basic() */ /* @@ -1146,7 +1164,8 @@ error: HDfree(odata); } H5E_END_TRY; return 1; -} + +} /* test_spmde_delay_basic() */ /* @@ -1179,6 +1198,19 @@ error: * page buffer. * * JRM -- 10/26/18 + * + * We have decided not to buffer raw data in the page buffer + * when operating in VFD SWMR mode. This is necessary as + * otherwise raw data can get stuck in the page buffer, thus + * delaying it's visibility to the reader. + * + * Obviously, there is a potential performance trade off + * here, but it shouldn't be significant in the expected + * VFD SWMR use cases. Needless to say, we will revisit this + * if necessary. 
+ * + * JRM -- 4/8/20 + * */ /* Changes due to file space page size has a minimum size of 512 */ @@ -1235,7 +1267,8 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, TEST_ERROR; /* allocate space for 2000 elements */ - if (HADDR_UNDEF == (addr = H5MF_alloc(f, H5FD_MEM_DRAW, sizeof(int) * (size_t)num_elements))) + if (HADDR_UNDEF == (addr = H5MF_alloc(f, H5FD_MEM_DRAW, + sizeof(int) * (size_t)num_elements))) FAIL_STACK_ERROR; if ((data = (int *)HDcalloc((size_t)num_elements, sizeof(int))) == NULL) @@ -1244,7 +1277,8 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, /* initialize all the elements to have a value of -1 */ for(i=0 ; ishared->pb_ptr->curr_pages != page_count + base_page_cnt) + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( vfd_swmr_mode ) && + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) FAIL_STACK_ERROR; /* update elements 300 - 450, with values 300 - - this will bring two more pages into the page buffer. */ for(i=0 ; i<150 ; i++) data[i] = i+300; - if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 300), sizeof(int) * 150, data) < 0) + + if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 300), + sizeof(int) * 150, data) < 0) FAIL_STACK_ERROR; + page_count += 2; - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( vfd_swmr_mode ) && + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) FAIL_STACK_ERROR; /* update elements 100 - 300, this will go to disk but also update existing pages in the page buffer. */ for(i=0 ; i<200 ; i++) data[i] = i+100; - if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 100), sizeof(int) * 200, data) < 0) + + if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 100), + sizeof(int) * 200, data) < 0) FAIL_STACK_ERROR; - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( vfd_swmr_mode ) && + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) FAIL_STACK_ERROR; /* Update elements 225-300 - this will update an existing page in the PB */ /* Changes: 450 - 600; 150 */ for(i=0 ; i<150 ; i++) data[i] = i+450; - if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 450), sizeof(int) * 150, data) < 0) + + if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 450), + sizeof(int) * 150, data) < 0) FAIL_STACK_ERROR; - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( vfd_swmr_mode ) && + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) FAIL_STACK_ERROR; /* Do a full page write to block 600-800 - should bypass the PB */ for(i=0 ; i<200 ; i++) data[i] = i+600; - if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 600), sizeof(int) * 200, data) < 0) + + if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 600), + sizeof(int) * 200, data) < 0) FAIL_STACK_ERROR; - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( vfd_swmr_mode ) && + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) FAIL_STACK_ERROR; - /* read elements 800 - 1200, this should not affect the PB, and should read -1s */ - if (H5F_block_read(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 800), sizeof(int) * 400, data) < 0) + /* read elements 800 - 1200, this should not affect the 
PB, and should + * read -1s + */ + if (H5F_block_read(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 800), + sizeof(int) * 400, data) < 0) FAIL_STACK_ERROR; + for (i=0; i < 400; i++) { if (data[i] != -1) { HDfprintf(stderr, "Read different values than written\n"); @@ -1307,14 +1368,19 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, FAIL_STACK_ERROR; } } - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( vfd_swmr_mode ) && + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) FAIL_STACK_ERROR; /* read elements 1200 - 1201, this should read -1 and bring in an * entire page of addr 1200 */ - if (H5F_block_read(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 1200), sizeof(int) * 1, data) < 0) + if (H5F_block_read(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 1200), + sizeof(int) * 1, data) < 0) FAIL_STACK_ERROR; + for (i=0; i < 1; i++) { if (data[i] != -1) { HDfprintf(stderr, "Read different values than written\n"); @@ -1323,14 +1389,19 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, } } page_count ++; - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( vfd_swmr_mode ) && + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) TEST_ERROR; /* read elements 175 - 225, this should use the PB existing pages */ /* Changes: 350 - 450 */ /* read elements 175 - 225, this should use the PB existing pages */ - if (H5F_block_read(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 350), sizeof(int) * 100, data) < 0) + if (H5F_block_read(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 350), + sizeof(int) * 100, data) < 0) FAIL_STACK_ERROR; + for (i=0; i < 100; i++) { if (data[i] != i + 350) { HDfprintf(stderr, "Read different values than written\n"); @@ -1339,16 +1410,27 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, TEST_ERROR; } } - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( vfd_swmr_mode ) && + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) TEST_ERROR; - /* read elements 0 - 800 using the VFD.. this should result in -1s - except for the writes that went through the PB (100-300 & 600-800) */ - if (H5FD_read(f->shared->lf, H5FD_MEM_DRAW, addr, sizeof(int) * 800, data) < 0) + /* read elements 0 - 800 using the VFD. + * + * In the non-VFD SWMR case, this should result in -1s + * except for the writes that went through the PB (100-300 & 600-800) + * + * In the VFD SWMR case, the page buffer is bypassed for raw data, + * thus all writes should be visible. 
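The page-count checks repeated through this test all encode the same two-mode expectation: with VFD SWMR enabled, raw data bypasses the page buffer, so the count should stay at base_page_cnt; otherwise it should track page_count. A compact statement of the intended predicate (a sketch with our names; note that it joins the two arms disjunctively, which appears to be the intent of these checks):

#include <stdbool.h>
#include <stdint.h>

static bool
page_count_ok(bool vfd_swmr_mode, int64_t curr_pages,
              int64_t page_count, int64_t base_page_cnt)
{
    if (vfd_swmr_mode)
        return curr_pages == base_page_cnt;          /* raw data bypassed */

    return curr_pages == page_count + base_page_cnt; /* raw data buffered */
}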
+ */ + if (H5FD_read(f->shared->lf, H5FD_MEM_DRAW, addr, + sizeof(int) * 800, data) < 0) FAIL_STACK_ERROR; + i = 0; while (i < 800) { - if((i>=100 && i<300) || i >= 600) { + if((vfd_swmr_mode) || (i>=100 && i<300) || i >= 600) { if (data[i] != i) { HDfprintf(stderr, "Read different values than written\n"); HDfprintf(stderr, "data[%d] = %d, %d expected.\n", @@ -1372,8 +1454,12 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, */ if (H5F_block_read(f, H5FD_MEM_DRAW, addr, sizeof(int) * 800, data) < 0) FAIL_STACK_ERROR; - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( vfd_swmr_mode ) && + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) TEST_ERROR; + for (i=0; i < 800; i++) { if (data[i] != i) { HDfprintf(stderr, "Read different values than written\n"); @@ -1389,10 +1475,16 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, */ for(i=0 ; i<1000 ; i++) data[i] = 0; - if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 400), sizeof(int) * 1000, data) < 0) + + if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 400), + sizeof(int) * 1000, data) < 0) FAIL_STACK_ERROR; + page_count -= 2; - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( vfd_swmr_mode ) && + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) TEST_ERROR; /* read elements 0 - 1000.. this should go to disk then update the @@ -1400,6 +1492,7 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, */ if (H5F_block_read(f, H5FD_MEM_DRAW, addr, sizeof(int) * 1000, data) < 0) FAIL_STACK_ERROR; + i=0; while (i < 1000) { if(i<400) { @@ -1420,7 +1513,10 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, } i++; } - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( vfd_swmr_mode ) && + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) TEST_ERROR; if (H5Fclose(file_id) < 0) @@ -2670,22 +2766,24 @@ test_stats_collection(hid_t orig_fapl, const char *env_h5_drvr) sizeof(int)*100, data) < 0) FAIL_STACK_ERROR; - if ( ( f->shared->pb_ptr->accesses[0] != 9 ) || + /* was 9, 16, 0 -- review this */ + if ( ( f->shared->pb_ptr->accesses[0] != 10 ) || ( f->shared->pb_ptr->accesses[1] != 16 ) || ( f->shared->pb_ptr->accesses[2] != 0 ) ) { - HDfprintf(stderr, "accesses[] = {%d, %d, %d}. {9, 16, 0} expected\n", + HDfprintf(stderr, "accesses[] = {%d, %d, %d}. {10, 16, 0} expected\n", f->shared->pb_ptr->accesses[0], f->shared->pb_ptr->accesses[1], f->shared->pb_ptr->accesses[2]); TEST_ERROR; } - if ( ( f->shared->pb_ptr->bypasses[0] != 2 ) || + /* was 2, 1, 1 -- review this */ + if ( ( f->shared->pb_ptr->bypasses[0] != 0 ) || ( f->shared->pb_ptr->bypasses[1] != 1 ) || ( f->shared->pb_ptr->bypasses[2] != 1 ) ) { - HDfprintf(stderr, "bypasses[] = {%d, %d, %d}. {2, 1, 1} expected\n", + HDfprintf(stderr, "bypasses[] = {%d, %d, %d}. {0, 1, 1} expected\n", f->shared->pb_ptr->bypasses[0], f->shared->pb_ptr->bypasses[1], f->shared->pb_ptr->bypasses[2]); @@ -2703,18 +2801,20 @@ test_stats_collection(hid_t orig_fapl, const char *env_h5_drvr) TEST_ERROR; } - if ( ( f->shared->pb_ptr->misses[0] != 9 ) || + /* was 9, 16. 
0 -- review this */
+    if ( ( f->shared->pb_ptr->misses[0] != 10 ) ||
          ( f->shared->pb_ptr->misses[1] != 16 ) ||
          ( f->shared->pb_ptr->misses[2] != 0 ) ) {
 
-        HDfprintf(stderr, "misses[] = {%d, %d, %d}. {9, 16, 0} expected\n",
+        HDfprintf(stderr, "misses[] = {%d, %d, %d}. {10, 16, 0} expected\n",
                   f->shared->pb_ptr->misses[0],
                   f->shared->pb_ptr->misses[1],
                   f->shared->pb_ptr->misses[2]);
         TEST_ERROR;
     }
 
-    if ( ( f->shared->pb_ptr->evictions[0] != 7) ||
+    /* was 7, 9, 0 -- review this */
+    if ( ( f->shared->pb_ptr->evictions[0] != 9) ||
          ( f->shared->pb_ptr->evictions[1] != 9) ||
          ( f->shared->pb_ptr->evictions[2] != 0 ) ) {
 
@@ -2736,17 +2836,19 @@ test_stats_collection(hid_t orig_fapl, const char *env_h5_drvr)
                            evictions, bypasses) < 0)
         FAIL_STACK_ERROR;
 
-    if ( ( accesses[0] != 9 ) ||
+    /* was 9, 16, 0 -- review this */
+    if ( ( accesses[0] != 10 ) ||
          ( accesses[1] != 16 ) ||
          ( accesses[2] != 0 ) ) {
 
         HDfprintf(stderr,
-                  "accesses[] = {%d, %d, %d}. {9, 16, 0} expected\n",
+                  "accesses[] = {%d, %d, %d}. {10, 16, 0} expected\n",
                   accesses[0], accesses[1], accesses[2]);
         TEST_ERROR;
     }
 
-    if ( ( bypasses[0] != 2 ) ||
+    /* was 2, 1, 1 -- review this */
+    if ( ( bypasses[0] != 0 ) ||
          ( bypasses[1] != 1 ) ||
          ( bypasses[2] != 1 ) ) {
 
@@ -2764,22 +2866,24 @@ test_stats_collection(hid_t orig_fapl, const char *env_h5_drvr)
         TEST_ERROR;
     }
 
-    if ( ( misses[0] != 9 ) ||
+    /* was 9, 16, 0 -- review this */
+    if ( ( misses[0] != 10 ) ||
          ( misses[1] != 16 ) ||
          ( misses[2] != 0 ) ) {
 
-        HDfprintf(stderr, "misses[] = {%d, %d, %d}. {9, 16, 0} expected\n",
+        HDfprintf(stderr, "misses[] = {%d, %d, %d}. {10, 16, 0} expected\n",
                   misses[0], misses[1], misses[2]);
         TEST_ERROR;
     }
 
-    if ( ( evictions[0] != 7 ) ||
+    /* was 9, 9, 0 -- review this */
+    if ( ( evictions[0] != 9 ) ||
          ( evictions[1] != 9 ) ||
          ( evictions[2] != 0 ) ) {
 
         HDfprintf(stderr,
-                  "evictions[] = {%d, %d, %d}. {%d, %d, 0} expected\n",
-                  evictions[0], evictions[1], evictions[2], 7, 9);
+                  "evictions[] = {%d, %d, %d}. {9, 9, 0} expected\n",
+                  evictions[0], evictions[1], evictions[2]);
         TEST_ERROR;
     }
 
@@ -2955,101 +3059,1403 @@ error:
 
     return 1;
 
-}
+} /* verify_page_buffering_disabled() */
+
 #endif /* H5_HAVE_PARALLEL */
 
-/*-------------------------------------------------------------------------
- * Function:    main()
+/*************************************************************************
  *
- * Purpose:     Main function for the page buffer tests.
+ * Function:    md_entry_splitting_smoke_check()
+ *
+ * Purpose:     Normally, file space for metadata entries is allocated
+ *              individually.  In the context of paged allocation, this
+ *              ensures that all entries that cross page boundaries start
+ *              on a page boundary, and that any space between the end of
+ *              a multi-page metadata entry and the next page boundary
+ *              is un-used.
+ *
+ *              In the context of VFD SWMR, this fact along with atomic
+ *              metadata entry I/O is used to minimize the size of the
+ *              index in the metadata file, and to optimize metadata
+ *              reads on the VFD SWMR reader side.  It is also
+ *              used as a simplifying assumption in normal page buffer
+ *              operation.
+ *
+ *              Unfortunately, it turns out that some metadata cache
+ *              clients (H5FA & H5EA) allocate the needed file space in
+ *              a single block, and sub-allocate space for individual
+ *              entries out of this block.
+ *
+ *              While this is a design flaw from the perspective of
+ *              VFD SWMR, repairing the issue is not feasible at this
+ *              time, and in any case, there will always be the issue of
+ *              existing files.
+ *
+ *              Thus, for now at least, the page buffer has to code around
+ *              the issue when operating in VFD SWMR mode.
+ *
+ *              It does this by examining metadata I/O requests that
+ *              cross page boundaries, and querying the metadata cache
+ *              for the ID of the associated cache client.
+ *
+ *              If the request is associated with a cache client that
+ *              uses sub-allocation, the I/O request must be broken
+ *              into the minimal number of sub-requests such that each
+ *              request either doesn't cross page boundaries, or is
+ *              page aligned, and of length equal to some multiple of
+ *              the page size.
+ *
+ *              This test exists to verify that such entries are read
+ *              and written correctly.
+ *
+ *              Note that it does not concern itself with verifying
+ *              the correct handling of the split I/O requests, as
+ *              the split is done immediately upon receipt, and each
+ *              of the sub-requests is treated as a normal metadata
+ *              I/O request.
+ *
+ *              Note that this test requires us to modify the page buffer
+ *              hint fields in the metadata cache to trick it into
+ *              relaying the desired hints to the page buffer, even
+ *              though it is not generating the I/O requests in this
+ *              test.
 *
 * Return:      0 if test is sucessful
 *              1 if test fails
 *
- * Programmer:  unknown
- *              ?? / ?? / ??
+ * Programmer:  John Mainzer
+ *              4/9/20
 *
- *-------------------------------------------------------------------------
- */
-int
-main(void)
+ * Changes:     None.
+ *
+ *************************************************************************/
+
+#define HDR_SIZE                40
+#define MD_PAGE_SIZE            250
+#define TOT_SYNTH_ENTRY_SIZES   (HDR_SIZE + (3 * MD_PAGE_SIZE))
+
+static unsigned
+md_entry_splitting_smoke_check(hid_t orig_fapl, const char *env_h5_drvr,
+    bool vfd_swmr_mode)
 {
-    hid_t fapl = -1;                   /* File access property list for data files */
-    unsigned nerrors = 0;              /* Cumulative error count */
-    const char *env_h5_drvr = NULL;    /* File Driver value from environment */
-    hbool_t api_ctx_pushed = FALSE;    /* Whether API context pushed */
+    char filename[FILENAME_LEN];        /* Filename to use */
+    hid_t file_id = -1;                 /* File ID */
+    hid_t fcpl = -1;
+    hid_t fapl = -1;
+    int i;
+    int * synth_md_vals = NULL;
+    int * synth_md_test_buf = NULL;
+    haddr_t base_addr;
+    haddr_t p0_addr;
+    haddr_t p1_addr;
+    haddr_t p2_addr;
+    H5F_t *f = NULL;
+    const uint32_t max_lag = 5;
 
-    h5_reset();
+    TESTING("%sMetadata Entry Splitting Smoke Check", \
+            vfd_swmr_mode ? "VFD SWMR " : "");
 
-    /* Get the VFD to use */
-    env_h5_drvr = HDgetenv("HDF5_DRIVER");
-    if(env_h5_drvr == NULL)
-        env_h5_drvr = "nomatch";
+    h5_fixname(namebase, orig_fapl, filename, sizeof(filename));
 
-    /* Temporary skip testing with multi/split drivers:
-     * Page buffering depends on paged aggregation which is
-     * currently disabled for multi/split drivers.
+    if ((fapl = H5Pcopy(orig_fapl)) < 0)
+        TEST_ERROR;
+
+    if (set_multi_split(env_h5_drvr, fapl, sizeof(int) * 200) != 0)
+        TEST_ERROR;
+
+    if ((fcpl = H5Pcreate(H5P_FILE_CREATE)) < 0)
+        TEST_ERROR;
+
+    if (H5Pset_file_space_strategy(fcpl, H5F_FSPACE_STRATEGY_PAGE, 0, 1) < 0)
+        TEST_ERROR;
+
+    if (H5Pset_file_space_page_size(fcpl, (size_t)1000) < 0)
+        TEST_ERROR;
+
+    if (H5Pset_page_buffer_size(fapl, sizeof(int) * 2000, 0, 0) < 0)
+        TEST_ERROR;
+
+    if (vfd_swmr_mode && swmr_fapl_augment(fapl, filename, max_lag) < 0)
+        TEST_ERROR;
+
+    if ((file_id = H5Fcreate(filename, H5F_ACC_TRUNC, fcpl, fapl)) < 0)
+        FAIL_STACK_ERROR;
+
+    /* Get a pointer to the internal file object */
+    if(NULL == (f = (H5F_t *)H5VL_object(file_id)))
+        FAIL_STACK_ERROR;
+
+    /* The objective is to perform a quick smoke check on I/O of metadata
+     * entries that have been sub-allocated out of a larger space allocation.
+     * We do this by simulating a structure similar to elements of the
+     * fixed array on disk structure.  Specifically, we create a synthetic
+     * set of metadata entries that are allocated out of a single allocation
+     * from the free space manager, and perform several reads and writes to
+     * verify expected behaviour.
+     *
+     * The synthetic set of metadata entries is constructed of integers
+     * so as to allow easy assignment of unique values.  It is laid out
+     * as follows:
+     *
+     *              size    values:                 addr:
+     *             (ints)
+     *
+     * header:       40     0, 1, ... 39            base_addr
+     * page 0:      250     1040, 1041, ... 1289    base_addr + 40 * sizeof(int)
+     * page 1:      250     2290, 2291, ... 2539    base_addr + 290 * sizeof(int)
+     * page 2:      250     3540, 3541, ... 3789    base_addr + 540 * sizeof(int)
+     *
+     * The overall size of the compound metadata entry is 790 * sizeof(int).
+     * Since we use a page size of 250 * sizeof(int) (i.e. 1000 bytes), this
+     * system of synthetic metadata entries spans four pages.
     */
-    if((0 == HDstrcmp(env_h5_drvr, "multi")) ||
-       (0 == HDstrcmp(env_h5_drvr, "split"))) {
-        SKIPPED()
-        HDputs("Skip page buffering test because paged aggregation is disabled for multi/split drivers");
-        HDputs("Furthermore, VFD SWMR is not (yet) expected to work with multi/split drivers");
-        HDexit(EXIT_SUCCESS);
+    /* allocate the buffers needed for the synthetic md entry test */
+    if ( (synth_md_vals = (int *)HDcalloc((size_t)TOT_SYNTH_ENTRY_SIZES,
+                                          sizeof(int))) == NULL )
+        TEST_ERROR
+
+    if ( (synth_md_test_buf = (int *)HDcalloc((size_t)TOT_SYNTH_ENTRY_SIZES,
+                                              sizeof(int))) == NULL )
+        TEST_ERROR
+
+    /* allocate file space for the synthetic metadata entries and
+     * compute their addresses.
+     */
+    if (HADDR_UNDEF ==
+        (base_addr = H5MF_alloc(f, H5FD_MEM_BTREE,
+                          sizeof(int) * (size_t)(TOT_SYNTH_ENTRY_SIZES))))
+        FAIL_STACK_ERROR;
+
+    p0_addr = base_addr + (haddr_t)(sizeof(int) * HDR_SIZE);
+    p1_addr = p0_addr + (haddr_t)(sizeof(int) * MD_PAGE_SIZE);
+    p2_addr = p1_addr + (haddr_t)(sizeof(int) * MD_PAGE_SIZE);
+
+
+    /* Set all cells in synth_md_vals[] to -1 and write directly to
+     * the underlying file via an H5FD call.  This gives us a known
+     * set of values in the underlying file.
+     */
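The layout arithmetic above is easy to check in isolation. The following throwaway program (ours, not part of the test) fills a buffer with the "+1000 per region" rule and confirms the first value of each region against the table in the comment:

#include <assert.h>

#define HDR_SIZE              40
#define MD_PAGE_SIZE          250
#define TOT_SYNTH_ENTRY_SIZES (HDR_SIZE + (3 * MD_PAGE_SIZE))

int main(void)
{
    int vals[TOT_SYNTH_ENTRY_SIZES];

    for (int i = 0; i < TOT_SYNTH_ENTRY_SIZES; i++) {
        vals[i] = i;
        if (i >= HDR_SIZE)                    vals[i] += 1000;
        if (i >= HDR_SIZE + MD_PAGE_SIZE)     vals[i] += 1000;
        if (i >= HDR_SIZE + 2 * MD_PAGE_SIZE) vals[i] += 1000;
    }

    /* first element of each region, per the table above */
    assert(vals[0] == 0);                              /* header */
    assert(vals[HDR_SIZE] == 1040);                    /* page 0 */
    assert(vals[HDR_SIZE + MD_PAGE_SIZE] == 2290);     /* page 1 */
    assert(vals[HDR_SIZE + 2 * MD_PAGE_SIZE] == 3540); /* page 2 */
    return 0;
}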
+    for ( i = 0; i < TOT_SYNTH_ENTRY_SIZES; i++) {
+
+        synth_md_vals[i] = -1;
+    }
 
-    if((fapl = h5_fileaccess()) < 0) {
-        nerrors++;
-        PUTS_ERROR("Can't get VFD-dependent fapl")
+    if ( H5FD_write(f->shared->lf, H5FD_MEM_BTREE, base_addr,
+                    sizeof(int) * TOT_SYNTH_ENTRY_SIZES, synth_md_vals) < 0)
+        FAIL_STACK_ERROR;
+
+    /* touch up the metadata cache so that it will report that a metadata
+     * entry that was sub-allocated out of a larger file space allocation
+     * is the source of the current metadata I/O operation.
+     */
+    H5C_set_curr_io_type_splitable(f->shared->cache, TRUE);
+
+    /* initialize the buffer with the values of the synthetic metadata
+     * entries.
+     */
+    for ( i = 0; i < TOT_SYNTH_ENTRY_SIZES; i++ ) {
+
+        synth_md_vals[i] = i;
+
+        if ( i >= HDR_SIZE ) {
+            synth_md_vals[i] += 1000;
+        }
+
+        if ( i >= HDR_SIZE + MD_PAGE_SIZE ) {
+            synth_md_vals[i] += 1000;
+        }
+
+        if ( i >= HDR_SIZE + MD_PAGE_SIZE + MD_PAGE_SIZE ) {
+            synth_md_vals[i] += 1000;
+        }
     }
 
-    /* Push API context */
-    if(H5CX_push() < 0) FAIL_STACK_ERROR
-    api_ctx_pushed = TRUE;
+    /* write the header */
+    if (H5F_block_write(f, H5FD_MEM_BTREE, base_addr,
+                        sizeof(int) * (size_t)HDR_SIZE, synth_md_vals) < 0)
+        FAIL_STACK_ERROR;
 
-#ifdef H5_HAVE_PARALLEL
+    /* read the header */
+    if (H5F_block_read(f, H5FD_MEM_BTREE, base_addr,
+                       sizeof(int) * (size_t)HDR_SIZE, synth_md_test_buf) < 0)
+        FAIL_STACK_ERROR;
 
-    HDputs("Page Buffering is disabled for parallel.");
-    nerrors += verify_page_buffering_disabled(fapl, env_h5_drvr);
+    /* write page 0 */
+    if (H5F_block_write(f, H5FD_MEM_BTREE, p0_addr,
+                        sizeof(int) * (size_t)MD_PAGE_SIZE,
+                        &(synth_md_vals[HDR_SIZE])) < 0)
+        FAIL_STACK_ERROR;
 
-#else /* H5_HAVE_PARALLEL */
+    /* read page 0 */
+    if (H5F_block_read(f, H5FD_MEM_BTREE, p0_addr,
+                       sizeof(int) * (size_t)MD_PAGE_SIZE,
+                       &(synth_md_test_buf[HDR_SIZE])) < 0)
+        FAIL_STACK_ERROR;
 
-    nerrors += test_args(fapl, env_h5_drvr);
-    nerrors += test_raw_data_handling(fapl, env_h5_drvr, false);
-    nerrors += test_raw_data_handling(fapl, env_h5_drvr, true);
-    nerrors += test_spmde_delay_basic(fapl, env_h5_drvr);
-    nerrors += test_mpmde_delay_basic(fapl, env_h5_drvr);
-    nerrors += test_spmde_lru_evict_basic(fapl, env_h5_drvr);
-    nerrors += test_lru_processing(fapl, env_h5_drvr);
-    nerrors += test_min_threshold(fapl, env_h5_drvr);
-    nerrors += test_stats_collection(fapl, env_h5_drvr);
+    /* write page 1 */
+    if (H5F_block_write(f, H5FD_MEM_BTREE, p1_addr,
+                        sizeof(int) * (size_t)MD_PAGE_SIZE,
+                        &(synth_md_vals[HDR_SIZE + MD_PAGE_SIZE])) < 0)
+        FAIL_STACK_ERROR;
 
-#endif /* H5_HAVE_PARALLEL */
+    /* read page 1 */
+    if (H5F_block_read(f, H5FD_MEM_BTREE, p1_addr,
+                       sizeof(int) * (size_t)MD_PAGE_SIZE,
+                       &(synth_md_test_buf[HDR_SIZE + MD_PAGE_SIZE])) < 0)
+        FAIL_STACK_ERROR;
 
-    h5_clean_files(namebases, fapl);
+    /* write page 2 */
+    if (H5F_block_write(f, H5FD_MEM_BTREE, p2_addr,
+                        sizeof(int) * (size_t)MD_PAGE_SIZE,
+                        &(synth_md_vals[HDR_SIZE + 2 * MD_PAGE_SIZE])) < 0)
+        FAIL_STACK_ERROR;
 
-    if(nerrors)
-        goto error;
+    /* read page 2 */
+    if (H5F_block_read(f, H5FD_MEM_BTREE, p2_addr,
+                       sizeof(int) * (size_t)MD_PAGE_SIZE,
+                       &(synth_md_test_buf[HDR_SIZE + 2 * MD_PAGE_SIZE])) < 0)
+        FAIL_STACK_ERROR;
 
-    /* Pop API context */
-    if(api_ctx_pushed && H5CX_pop() < 0) FAIL_STACK_ERROR
-    api_ctx_pushed = FALSE;
+    /* verify reads */
+    for ( i = 0; i < TOT_SYNTH_ENTRY_SIZES; i++ ) {
 
-    HDputs("All Page Buffering tests passed.");
+        if ( synth_md_vals[i] != synth_md_test_buf[i] ) {
 
-    HDexit(EXIT_SUCCESS);
+            HDfprintf(stderr,
+                      "(1) unexpected read %d: val %d -- %d expected\n",
+                      i, synth_md_test_buf[i], synth_md_vals[i]);
+            TEST_ERROR;
+        }
+    }
 
+    /* zero the test buffer, do the reads again in reverse order, and verify */
+
+    for ( i = 0; i < TOT_SYNTH_ENTRY_SIZES; i++) {
+
+        synth_md_test_buf[i] = 0;
+    }
+
+    /* read page 2 */
+    if (H5F_block_read(f, H5FD_MEM_BTREE, p2_addr,
+                       sizeof(int) * (size_t)MD_PAGE_SIZE,
+                       &(synth_md_test_buf[HDR_SIZE + 2 * MD_PAGE_SIZE])) < 0)
+        FAIL_STACK_ERROR;
+
+    /* read page 1 */
+    if (H5F_block_read(f, H5FD_MEM_BTREE, p1_addr,
+                       sizeof(int) * (size_t)MD_PAGE_SIZE,
+                       &(synth_md_test_buf[HDR_SIZE + MD_PAGE_SIZE])) < 0)
+        FAIL_STACK_ERROR;
+
+    /* read page 0 */
+    if (H5F_block_read(f, H5FD_MEM_BTREE, p0_addr,
+                       sizeof(int) * (size_t)MD_PAGE_SIZE,
+                       &(synth_md_test_buf[HDR_SIZE])) < 0)
+        FAIL_STACK_ERROR;
+
+    /* read the header */
+    if (H5F_block_read(f, H5FD_MEM_BTREE, base_addr,
+                       sizeof(int) * (size_t)HDR_SIZE, synth_md_test_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    /* verify reads again */
+    for ( i = 0; i < TOT_SYNTH_ENTRY_SIZES; i++ ) {
+
+        if ( synth_md_vals[i] != synth_md_test_buf[i] ) {
+
+            HDfprintf(stderr,
+                      "(2) unexpected read %d: val %d -- %d expected\n",
+                      i, synth_md_test_buf[i], synth_md_vals[i]);
+            TEST_ERROR;
+        }
+    }
+
+    /* Undo the touchup of the metadata cache */
+    H5C_set_curr_io_type_splitable(f->shared->cache, FALSE);
+
+    /* free the test buffers */
+    HDfree(synth_md_vals);
+    HDfree(synth_md_test_buf);
+
+    if (H5Fclose(file_id) < 0)
+        FAIL_STACK_ERROR;
+    if (H5Pclose(fcpl) < 0)
+        FAIL_STACK_ERROR;
+    if (H5Pclose(fapl) < 0)
+        FAIL_STACK_ERROR;
+
+    PASSED();
+    return 0;
 
 error:
 
+    /* Undo the touchup of the metadata cache */
+    if ( ( f ) && ( f->shared ) && ( f->shared->cache) )
+        H5C_set_curr_io_type_splitable(f->shared->cache, FALSE);
+
+    if ( synth_md_vals )
+        HDfree(synth_md_vals);
+
+    if ( synth_md_test_buf )
+        HDfree(synth_md_test_buf);
 
     H5E_BEGIN_TRY {
-        H5Pclose(fapl);
+        if (fapl != H5I_INVALID_HID)
+            H5Pclose(fapl);
+        if (fcpl != H5I_INVALID_HID)
+            H5Pclose(fcpl);
+        if (file_id != H5I_INVALID_HID)
+            H5Fclose(file_id);
     } H5E_END_TRY;
 
-    if(api_ctx_pushed) H5CX_pop();
+    return 1;
 
-    HDexit(EXIT_FAILURE);
-}
+} /* md_entry_splitting_smoke_check() */
+
+#undef HDR_SIZE
+#undef MD_PAGE_SIZE
+#undef TOT_SYNTH_ENTRY_SIZES
+
+
+/*************************************************************************
+ *
+ * Function:    md_entry_splitting_boundary_test()
+ *
+ * Purpose:     Test to verify that I/O request splitting performs as
+ *              expected in various boundary conditions.
+ *
+ *              The above md_entry_splitting_smoke_check() was directed
+ *              at verifying that the page buffer behaved as expected
+ *              in something approaching a typical use case.
+ *
+ *              This test is directed at verifying that entries are
+ *              split correctly under a variety of conditions that
+ *              are unlikely unless the user chooses an odd page size.
+ *
+ * Return:      0 if test is successful
+ *              1 if test fails
+ *
+ * Programmer:  John Mainzer
+ *              4/12/20
+ *
+ * Changes:     None.
+ *
+ *************************************************************************/
+
+static unsigned
+md_entry_splitting_boundary_test(hid_t orig_fapl, const char *env_h5_drvr,
+    bool vfd_swmr_mode)
+{
+    char filename[FILENAME_LEN];        /* Filename to use */
+    hid_t file_id = -1;                 /* File ID */
+    hid_t fcpl = -1;
+    hid_t fapl = -1;
+    int64_t base_page_cnt;
+    int i;
+    H5F_t *f = NULL;
+    const uint32_t max_lag = 5;
+    size_t page_size = (size_t)512;
+    int pages_allocated = 32;
+    size_t alloc_size;
+    uint8_t * write_buf = NULL;
+    uint8_t * read_buf = NULL;
+    haddr_t base_addr = HADDR_UNDEF;
+    haddr_t first_page_addr = HADDR_UNDEF;
+    haddr_t start_addr = HADDR_UNDEF;
+    size_t test_len;
+
+    TESTING("%sMetadata Entry Splitting Boundary Test", \
+            vfd_swmr_mode ? "VFD SWMR " : "");
+
+    h5_fixname(namebase, orig_fapl, filename, sizeof(filename));
+
+    if ((fapl = H5Pcopy(orig_fapl)) < 0)
+        TEST_ERROR
+
+    if (set_multi_split(env_h5_drvr, fapl, sizeof(int) * 200) != 0)
+        TEST_ERROR;
+
+    if ((fcpl = H5Pcreate(H5P_FILE_CREATE)) < 0)
+        TEST_ERROR;
+
+    if (H5Pset_file_space_strategy(fcpl, H5F_FSPACE_STRATEGY_PAGE, 0, 1) < 0)
+        TEST_ERROR;
+
+    if (H5Pset_file_space_page_size(fcpl, page_size) < 0)
+        TEST_ERROR;
+
+    if (H5Pset_page_buffer_size(fapl, 32 * page_size, 0, 0) < 0)
+        TEST_ERROR;
+
+    if (vfd_swmr_mode && swmr_fapl_augment(fapl, filename, max_lag) < 0)
+        TEST_ERROR;
+
+    if ((file_id = H5Fcreate(filename, H5F_ACC_TRUNC, fcpl, fapl)) < 0)
+        FAIL_STACK_ERROR;
+
+    /* Get a pointer to the internal file object */
+    if(NULL == (f = (H5F_t *)H5VL_object(file_id)))
+        FAIL_STACK_ERROR;
+
+    /* opening the file inserts one or more pages into the page buffer.
+     * Get the number of pages inserted, and verify that it is the
+     * expected value.
+     */
+    base_page_cnt = f->shared->pb_ptr->curr_pages;
+    if (base_page_cnt != 1)
+        TEST_ERROR;
+
+    /* Test the following cases:
+     *
+     * 1) splittable md entry that is page aligned and exactly one
+     *    page long.
+     *
+     * 2) splittable md entry that is page aligned and exactly two
+     *    pages long.
+     *
+     * 3) splittable md entry that is page aligned and is exactly one
+     *    page and one byte long.
+     *
+     * 4) splittable md entry that is exactly one page and one byte
+     *    long, and starts one byte before a page boundary.
+     *
+     * 5) splittable md entry that is exactly one page and two bytes
+     *    long, and starts one byte before a page boundary.
+     *
+     * 6) splittable md entry that is two bytes long, and starts one
+     *    byte before a page boundary.
+     *
+     * 7) splittable md entry that is page aligned and is exactly two
+     *    pages and one byte long.
+     *
+     * 8) splittable md entry that is exactly two pages and one byte
+     *    long, and starts one byte before a page boundary.
+     *
+     * 9) splittable md entry that is exactly two pages and two bytes
+     *    long, and starts one byte before a page boundary.
+     *
+     */
+    alloc_size = page_size * (size_t)pages_allocated;
+
+    /* allocate the buffers needed for the synthetic md entry test */
+    if ((write_buf = (uint8_t *)HDcalloc(alloc_size, sizeof(uint8_t))) == NULL)
+        TEST_ERROR
+
+    if ((read_buf = (uint8_t *)HDcalloc(alloc_size, sizeof(uint8_t))) == NULL)
+        TEST_ERROR
+
+    /* allocate file space for the tests */
+    if (HADDR_UNDEF == (base_addr = H5MF_alloc(f, H5FD_MEM_SUPER, alloc_size)))
+        FAIL_STACK_ERROR;
+
+    /* Set all cells in write_buf[] to 0 and write directly to
+     * the underlying file via an H5FD call.  This gives us a known
+     * set of values in the underlying file.
+     */
+    for ( i = 0; i < (int)alloc_size; i++) {
+
+        write_buf[i] = 0;
+    }
+
+    if ( H5FD_write(f->shared->lf, H5FD_MEM_SUPER, base_addr,
+                    alloc_size, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    /* touch up the metadata cache so that it will report that a metadata
+     * entry that was sub-allocated out of a larger file space allocation
+     * is the source of the current metadata I/O operation.
+     */
+    H5C_set_curr_io_type_splitable(f->shared->cache, TRUE);
+
+
+    /* 1) splittable md entry that is page aligned and exactly one
+     *    page long.
+     *
+     *    Should not register as a split I/O.
+     *
+     *    Should log 4 metadata accesses.
+     *    should log 3 metadata hits
+     *    should log 1 metadata misses
+     *    should log 1 metadata loads
+     *    should log 1 metadata insertions
+     *
+     *    Note that this exposes an inefficiency in the page buffer, as
+     *    page aligned I/O requests of exactly one page in length really
+     *    should bypass the page buffer.
+     *
+     *    This should be fixed, but I am bypassing it for now.
+     *
+     *                                          JRM -- 4/18/20
+     */
+    first_page_addr = base_addr;
+    start_addr = base_addr;
+    test_len = page_size;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 1;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "1.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 2;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "1.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 0 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 0 ) )
+        TEST_ERROR;
+
+    if ( ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 4 ) ||
+         ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 3 ) ||
+         ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 1 ) ||
+         ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 1 ) ||
+         ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 1 ) )
+        TEST_ERROR;
+
+
+    /* 2) splittable md entry that is page aligned and exactly two
+     *    pages long.
+     *
+     *    Should not register as a split I/O.
+     *
+     *    if vfd_swmr_mode
+     *
+     *       Should log 0 multi-page metadata bypasses.
+     *       Should log 4 multi-page metadata accesses.
+     *       should log 3 multi-page metadata hits
+     *       should log 1 multi-page metadata misses
+     *       should log 0 multi-page metadata loads
+     *       should log 1 multi-page metadata insertions
+     *
+     *    else
+     *
+     *       Should log 4 multi-page metadata bypasses.
+     *       Should log 0 multi-page metadata accesses.
+     *       should log 0 multi-page metadata hits
+     *       should log 2 multi-page metadata misses
+     *       should log 0 multi-page metadata loads
+     *       should log 0 multi-page metadata insertions
+     *
+     *    The misses in the normal operating mode could be avoided.
+    first_page_addr = base_addr + (haddr_t)(page_size);
+    start_addr = first_page_addr;
+    test_len = 2 * page_size;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 3;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "2.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 4;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "2.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 0 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 0 ) )
+        TEST_ERROR;
+
+    if ( vfd_swmr_mode ) {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MPMDE] != 3 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MPMDE] != 1 ) )
+            TEST_ERROR;
+
+    } else {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 2 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MPMDE] != 0 ) )
+            TEST_ERROR;
+    }
+
+
+    /* 3) splittable md entry that is page aligned and is exactly one
+     *    page and one byte long.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    Should log 0 metadata bypasses.
+     *    Should log 8 metadata accesses.
+     *    should log 6 metadata hits
+     *    should log 2 metadata misses
+     *    should log 2 metadata loads
+     *    should log 2 metadata insertions
+     */
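+    /* Editorial note on the accounting: each of the two writes and two
+     * reads below is split into two single-page fragments, so the four
+     * operations produce eight metadata accesses.  Only the first write
+     * touches pages that are not yet in the page buffer, hence two
+     * misses, two loads, and two insertions; the remaining six accesses
+     * are hits.
+     */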
+    first_page_addr = base_addr + (haddr_t)(3 * page_size);
+    start_addr = first_page_addr;
+    test_len = page_size + 1;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 5;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "3.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 6;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "3.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MD] != 0 ) ||
+         ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 8 ) ||
+         ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 6 ) ||
+         ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 2 ) ||
+         ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 2 ) ||
+         ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 2 ) )
+        TEST_ERROR;
+
+
+    /* 4) splittable md entry that is exactly one page and one byte
+     *    long, and starts one byte before a page boundary.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    Should log 0 metadata bypasses.
+     *    Should log 8 metadata accesses.
+     *    should log 6 metadata hits
+     *    should log 2 metadata misses
+     *    should log 2 metadata loads
+     *    should log 2 metadata insertions
+     */
+    first_page_addr = base_addr + (haddr_t)(5 * page_size);
+    start_addr = first_page_addr + (haddr_t)(page_size - 1);
+    test_len = page_size + 1;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 7;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    /* the first write alone should have been split exactly once */
+    if ( f->shared->pb_ptr->md_write_splits != 1 )
+        TEST_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    /* similarly, the first read alone should have been split exactly once */
+    if ( f->shared->pb_ptr->md_read_splits != 1 )
+        TEST_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "4.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 8;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "4.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MD] != 0 ) ||
+         ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 8 ) ||
+         ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 6 ) ||
+         ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 2 ) ||
+         ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 2 ) ||
+         ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 2 ) )
+        TEST_ERROR;
+
+
+    /* 5) splittable md entry that is exactly one page and two bytes
+     *    long, and starts one byte before a page boundary.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    Should log 0 metadata bypasses.
+     *    Should log 12 metadata accesses.
+     *    should log 9 metadata hits
+     *    should log 3 metadata misses
+     *    should log 3 metadata loads
+     *    should log 3 metadata insertions
+     */
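+    /* Editorial note: here the request starts one byte before a page
+     * boundary and ends one byte past the next one, so each operation
+     * splits into three fragments (a one byte prefix, a whole page, and
+     * a one byte suffix).  Four operations therefore yield twelve
+     * metadata accesses, with the three pages first touched by the
+     * initial write accounting for the three misses, loads, and
+     * insertions.
+     */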
+    first_page_addr = base_addr + (haddr_t)(8 * page_size);
+    start_addr = first_page_addr + (haddr_t)(page_size - 1);
+    test_len = page_size + 2;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 9;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "5.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 10;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "5.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MD] != 0 ) ||
+         ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 12 ) ||
+         ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 9 ) ||
+         ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 3 ) ||
+         ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 3 ) ||
+         ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 3 ) )
+        TEST_ERROR;
+
+
+    /* 6) splittable md entry that is two bytes long, and starts one
+     *    byte before a page boundary.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    Should log 0 metadata bypasses.
+     *    Should log 8 metadata accesses.
+     *    should log 6 metadata hits
+     *    should log 2 metadata misses
+     *    should log 2 metadata loads
+     *    should log 2 metadata insertions
+     */
+    first_page_addr = base_addr + (haddr_t)(11 * page_size);
+    start_addr = first_page_addr + (haddr_t)(page_size - 1);
+    test_len = 2;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 11;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "6.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 12;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "6.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MD] != 0 ) ||
+         ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 8 ) ||
+         ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 6 ) ||
+         ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 2 ) ||
+         ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 2 ) ||
+         ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 2 ) )
+        TEST_ERROR;
+
+
+    /* 7) splittable md entry that is page aligned and is exactly two
+     *    pages and one byte long.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    if vfd_swmr_mode
+     *
+     *        Should log 0 multi-page metadata bypasses.
+     *        Should log 4 multi-page metadata accesses.
+     *        Should log 4 metadata accesses.
+     *        should log 3 multi-page metadata hits
+     *        should log 3 metadata hits
+     *        should log 1 multi-page metadata miss
+     *        should log 1 metadata miss
+     *        should log 0 multi-page metadata loads
+     *        should log 1 metadata load
+     *        should log 1 multi-page metadata insertion
+     *        should log 1 metadata insertion
+     *
+     *    else
+     *
+     *        Should log 4 multi-page metadata bypasses.
+     *        Should log 4 metadata accesses.
+     *        should log 3 metadata hits
+     *        should log 2 multi-page metadata misses
+     *        should log 1 metadata miss
+     *        should log 1 metadata load
+     *        should log 1 metadata insertion
+     *
+     *    The misses in the normal operating mode could be avoided.
+     */
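+    /* Editorial note: in this case the split yields a two page, page
+     * aligned piece and a single byte fragment.  The two page piece is
+     * handled under the multi-page metadata entry rules (cached in VFD
+     * SWMR mode, bypassed otherwise), while the one byte fragment is
+     * handled as regular metadata -- hence the mixed MD / MPMDE
+     * expectations below.
+     */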
+    first_page_addr = base_addr + (haddr_t)(13 * page_size);
+    start_addr = first_page_addr;
+    test_len = 2 * page_size + 1;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 13;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "7.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 14;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "7.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( vfd_swmr_mode ) {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 4 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MPMDE] != 3 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 3 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 1 ) )
+            TEST_ERROR;
+
+    } else {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 4 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 3 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 2 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 1 ) )
+            TEST_ERROR;
+    }
+
+
+    /* 8) splittable md entry that is exactly two pages and one byte
+     *    long, and starts one byte before a page boundary.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    if vfd_swmr_mode
+     *
+     *        Should log 0 multi-page metadata bypasses.
+     *        Should log 4 multi-page metadata accesses.
+     *        Should log 4 metadata accesses.
+     *        should log 3 multi-page metadata hits
+     *        should log 3 metadata hits
+     *        should log 1 multi-page metadata miss
+     *        should log 1 metadata miss
+     *        should log 0 multi-page metadata loads
+     *        should log 1 metadata load
+     *        should log 1 multi-page metadata insertion
+     *        should log 1 metadata insertion
+     *
+     *    else
+     *
+     *        Should log 4 multi-page metadata bypasses.
+     *        Should log 4 metadata accesses.
+     *        should log 3 metadata hits
+     *        should log 2 multi-page metadata misses
+     *        should log 1 metadata miss
+     *        should log 1 metadata load
+     *        should log 1 metadata insertion
+     *
+     *    The misses in the normal operating mode could be avoided.
+     */
+    first_page_addr = base_addr + (haddr_t)(16 * page_size);
+    start_addr = first_page_addr + (haddr_t)(page_size - 1);
+    test_len = 2 * page_size + 1;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 15;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    /* the first write alone should have been split exactly once */
+    if ( f->shared->pb_ptr->md_write_splits != 1 )
+        TEST_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    /* similarly, the first read alone should have been split exactly once */
+    if ( f->shared->pb_ptr->md_read_splits != 1 )
+        TEST_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "8.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 16;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "8.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( vfd_swmr_mode ) {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 4 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MPMDE] != 3 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 3 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 1 ) )
+            TEST_ERROR;
+
+    } else {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 4 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 3 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 2 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 1 ) )
+            TEST_ERROR;
+    }
+
+
+    /* 9) splittable md entry that is exactly two pages and two bytes
+     *    long, and starts one byte before a page boundary.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    if vfd_swmr_mode
+     *
+     *        Should log 0 multi-page metadata bypasses.
+     *        Should log 4 multi-page metadata accesses.
+     *        Should log 8 metadata accesses.
+     *        should log 3 multi-page metadata hits
+     *        should log 6 metadata hits
+     *        should log 1 multi-page metadata miss
+     *        should log 2 metadata misses
+     *        should log 0 multi-page metadata loads
+     *        should log 2 metadata loads
+     *        should log 1 multi-page metadata insertion
+     *        should log 2 metadata insertions
+     *
+     *    else
+     *
+     *        Should log 4 multi-page metadata bypasses.
+     *        Should log 8 metadata accesses.
+     *        should log 6 metadata hits
+     *        should log 2 multi-page metadata misses
+     *        should log 2 metadata misses
+     *        should log 2 metadata loads
+     *        should log 2 metadata insertions
+     *
+     *    The misses in the normal operating mode could be avoided.
+     */
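+    /* Editorial note: this is the fully general case -- the request
+     * starts and ends one byte to either side of page boundaries, so it
+     * splits into two single byte metadata fragments plus a two page
+     * multi-page piece in the middle, doubling the MD side of the
+     * accounting relative to case 7.
+     */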
+    first_page_addr = base_addr + (haddr_t)(19 * page_size);
+    start_addr = first_page_addr + (haddr_t)(page_size - 1);
+    test_len = 2 * page_size + 2;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 17;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "9.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 18;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "9.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( vfd_swmr_mode ) {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 8 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MPMDE] != 3 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 6 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 2 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 2 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 2 ) )
+            TEST_ERROR;
+
+    } else {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 8 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 6 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 2 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 2 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 2 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 2 ) )
+            TEST_ERROR;
+    }
+
+
+    /* Undo the touchup of the metadata cache */
+    H5C_set_curr_io_type_splitable(f->shared->cache, FALSE);
+
+    /* free the test buffers */
+    HDfree(write_buf);
+    HDfree(read_buf);
+
+    if (H5Fclose(file_id) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5Pclose(fcpl) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5Pclose(fapl) < 0)
+        FAIL_STACK_ERROR;
+
+    PASSED();
+
+    return 0;
+
+error:
+
+    /* Undo the touchup of the metadata cache */
+    if ( ( f ) && ( f->shared ) && ( f->shared->cache ) )
+        H5C_set_curr_io_type_splitable(f->shared->cache, FALSE);
+
+    if ( write_buf )
+        HDfree(write_buf);
+
+    if ( read_buf )
+        HDfree(read_buf);
+
+    H5E_BEGIN_TRY {
+        if (fapl != H5I_INVALID_HID)
+            H5Pclose(fapl);
+        if (fcpl != H5I_INVALID_HID)
+            H5Pclose(fcpl);
+        if (file_id != H5I_INVALID_HID)
+            H5Fclose(file_id);
+    } H5E_END_TRY;
+
+    return 1;
+
+} /* md_entry_splitting_boundary_test() */
+
+
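+/* Editorial sketch (hypothetical, not part of the original patch): the
+ * expected-statistics checks above repeat the same five field
+ * comparisons many times.  If desired, they could be factored into a
+ * small helper along the following lines.  The field and index names
+ * match those used in the checks above; the parameter types are
+ * assumptions.
+ */
+#if 0 /* illustrative only */
+static hbool_t
+pb_md_stats_match(H5PB_t *pb_ptr, int idx, int accesses, int hits,
+                  int misses, int loads, int insertions)
+{
+    /* Return TRUE iff the collected stats for the given index (e.g.
+     * H5PB__STATS_MD or H5PB__STATS_MPMDE) match the expected values.
+     */
+    return ( ( pb_ptr->accesses[idx]   == accesses ) &&
+             ( pb_ptr->hits[idx]       == hits ) &&
+             ( pb_ptr->misses[idx]     == misses ) &&
+             ( pb_ptr->loads[idx]      == loads ) &&
+             ( pb_ptr->insertions[idx] == insertions ) );
+}
+#endif
+
+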
+/*-------------------------------------------------------------------------
+ * Function:    main()
+ *
+ * Purpose:     Main function for the page buffer tests.
+ *
+ * Return:      0 if test is successful
+ *              1 if test fails
+ *
+ * Programmer:  unknown
+ *              ?? / ?? / ??
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+main(void)
+{
+    hid_t fapl = -1;                /* File access property list for data files */
+    unsigned nerrors = 0;           /* Cumulative error count */
+    const char *env_h5_drvr = NULL; /* File Driver value from environment */
+    hbool_t api_ctx_pushed = FALSE; /* Whether API context pushed */
+
+    h5_reset();
+
+    /* Get the VFD to use */
+    env_h5_drvr = HDgetenv("HDF5_DRIVER");
+    if (env_h5_drvr == NULL)
+        env_h5_drvr = "nomatch";
+
+    /* Temporarily skip testing with multi/split drivers:
+     * Page buffering depends on paged aggregation, which is
+     * currently disabled for multi/split drivers.
+     */
+    if ((0 == HDstrcmp(env_h5_drvr, "multi")) ||
+        (0 == HDstrcmp(env_h5_drvr, "split"))) {
+
+        SKIPPED()
+        HDputs("Skip page buffering test because paged aggregation is disabled for multi/split drivers");
+        HDputs("Furthermore, VFD SWMR is not (yet) expected to work with multi/split drivers");
+        HDexit(EXIT_SUCCESS);
+    }
+
+    if ((fapl = h5_fileaccess()) < 0) {
+        nerrors++;
+        PUTS_ERROR("Can't get VFD-dependent fapl")
+    }
+
+    /* Push API context */
+    if (H5CX_push() < 0)
+        FAIL_STACK_ERROR
+    api_ctx_pushed = TRUE;
+
+#ifdef H5_HAVE_PARALLEL
+
+    HDputs("Page Buffering is disabled for parallel.");
+    nerrors += verify_page_buffering_disabled(fapl, env_h5_drvr);
+
+#else /* H5_HAVE_PARALLEL */
+
+    nerrors += test_args(fapl, env_h5_drvr);
+    nerrors += test_raw_data_handling(fapl, env_h5_drvr, false);
+    nerrors += test_raw_data_handling(fapl, env_h5_drvr, true);
+    nerrors += test_spmde_delay_basic(fapl, env_h5_drvr);
+    nerrors += test_mpmde_delay_basic(fapl, env_h5_drvr);
+    nerrors += test_spmde_lru_evict_basic(fapl, env_h5_drvr);
+    nerrors += test_lru_processing(fapl, env_h5_drvr);
+    nerrors += test_min_threshold(fapl, env_h5_drvr);
+    nerrors += test_stats_collection(fapl, env_h5_drvr);
+    nerrors += md_entry_splitting_smoke_check(fapl, env_h5_drvr, false);
+    nerrors += md_entry_splitting_smoke_check(fapl, env_h5_drvr, true);
+    nerrors += md_entry_splitting_boundary_test(fapl, env_h5_drvr, false);
+    nerrors += md_entry_splitting_boundary_test(fapl, env_h5_drvr, true);
+
+#endif /* H5_HAVE_PARALLEL */
+
+    h5_clean_files(namebases, fapl);
+
+    if (nerrors)
+        goto error;
+
+    /* Pop API context */
+    if (api_ctx_pushed && H5CX_pop() < 0)
+        FAIL_STACK_ERROR
+    api_ctx_pushed = FALSE;
+
+    HDputs("All Page Buffering tests passed.");
+
+    HDexit(EXIT_SUCCESS);
+
+error:
+    HDprintf("***** %d Page Buffering TEST%s FAILED! *****\n",
+             nerrors, nerrors > 1 ? "S" : "");
+
+    H5E_BEGIN_TRY {
+        H5Pclose(fapl);
+    } H5E_END_TRY;
+
+    if (api_ctx_pushed)
+        H5CX_pop();
+
+    HDexit(EXIT_FAILURE);
+
+} /* main() */
--
cgit v0.12