-rw-r--r--  src/H5C.c          |   75
-rw-r--r--  src/H5Cimage.c     |   29
-rw-r--r--  src/H5Cmpio.c      |   13
-rw-r--r--  src/H5Cpkg.h       |  165
-rw-r--r--  src/H5Cprivate.h   |    2
-rw-r--r--  src/H5Cquery.c     |  108
-rw-r--r--  src/H5Ctest.c      |   56
-rw-r--r--  src/H5PB.c         | 1218
-rw-r--r--  src/H5PBpkg.h      |   16
-rw-r--r--  src/H5PBprivate.h  |   23
-rw-r--r--  test/page_buffer.c | 1505
11 files changed, 2892 insertions(+), 318 deletions(-)
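Reader's note on the recurring pattern: nearly every H5C.c hunk in this patch brackets a low-level I/O call with a set/reset pair of page buffer hint macros. A minimal sketch of that discipline, condensed from the hunks below (error handling abbreviated):

    /* Bracket each metadata cache I/O call with the hint macros.
     * Both the success path and the error path must reset the hints,
     * since H5C__SET_PB_READ_HINTS() asserts that curr_io_type is
     * NULL on entry.
     */
    H5C__SET_PB_READ_HINTS(cache_ptr, entry_ptr->type, TRUE)

    if ( H5F_block_read(f, entry_ptr->type->mem_type, entry_ptr->addr,
                        image_len, image_ptr) < 0 ) {

        H5C__RESET_PB_READ_HINTS(cache_ptr)
        HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, "Can't read image")
    }
    H5C__RESET_PB_READ_HINTS(cache_ptr)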
@@ -477,6 +477,10 @@ H5C_create(size_t max_cache_size, cache_ptr->rdfsm_settled = FALSE; cache_ptr->mdfsm_settled = FALSE; + /* fields supporting page buffer hints */ + cache_ptr->curr_io_type = NULL; + cache_ptr->curr_read_speculative = FALSE; + if(H5C_reset_cache_hit_rate_stats(cache_ptr) < 0) /* this should be impossible... */ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, NULL, "H5C_reset_cache_hit_rate_stats failed") @@ -487,6 +491,7 @@ H5C_create(size_t max_cache_size, #ifndef NDEBUG cache_ptr->get_entry_ptr_from_addr_counter = 0; + cache_ptr->curr_io_type = NULL; #endif /* NDEBUG */ /* Set return value */ @@ -974,10 +979,13 @@ done: * * Programmer: John Mainzer -- 12/16/18 * - * Changes: None. + * Changes: Added macro calls to maintain the page buffer hints. + * + * JRM -- 3/20/20 * *------------------------------------------------------------------------- */ + herr_t H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, uint32_t length, uint64_t tick) @@ -994,7 +1002,7 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, H5C_cache_entry_t * entry_ptr; H5C_cache_entry_t * follow_ptr = NULL; herr_t ret_value = SUCCEED; /* Return value */ - bool found = false; + hbool_t found = FALSE; FUNC_ENTER_NOAPI(FAIL) @@ -1036,7 +1044,7 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, page * cache_ptr->page_size + length <= entry_ptr->addr + entry_ptr->size); - found = true; + found = TRUE; /* since end of tick occurs only on API call entry in * the VFD SWMR reader case, the entry must not be protected. @@ -1135,12 +1143,17 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, H5C_IMAGE_EXTRA_SPACE); #endif /* H5C_DO_MEMORY_SANITY_CHECKS */ + H5C__SET_PB_READ_HINTS(cache_ptr, entry_ptr->type, TRUE) + if ( H5F_block_read(f, entry_ptr->type->mem_type, entry_ptr->addr, - image_len, image_ptr) < 0 ) + image_len, image_ptr) < 0 ) { + H5C__RESET_PB_READ_HINTS(cache_ptr) HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, \ "Can't read image (1)") + } + H5C__RESET_PB_READ_HINTS(cache_ptr) /* 3) Call the refresh callback. If it doesn't * request a different image size, goto 6) @@ -1172,12 +1185,18 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, H5C_IMAGE_EXTRA_SPACE); #endif /* H5C_DO_MEMORY_SANITY_CHECKS */ + H5C__SET_PB_READ_HINTS(cache_ptr, entry_ptr->type, TRUE) + if ( H5F_block_read(f, entry_ptr->type->mem_type, entry_ptr->addr, - image_len, image_ptr) < 0 ) + image_len, image_ptr) < 0 ) { + + H5C__RESET_PB_READ_HINTS(cache_ptr) HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, \ "Can't read image (2)") + } + H5C__RESET_PB_READ_HINTS(cache_ptr) /* 5) Call the refresh callback again. Requesting * a different buffer size again is an error. @@ -6495,6 +6514,14 @@ done: * * Programmer: John Mainzer, 5/5/04 * + * Changes: Please maintain the changes list, and do not delete it + * unless you have merged it into the header comment + * proper. + * + * Added macro calls to maintain page buffer hints. 
+ * + * JRM -- 3/20/20 + * *------------------------------------------------------------------------- */ herr_t @@ -6680,8 +6707,18 @@ H5C__flush_single_entry(H5F_t *f, H5C_cache_entry_t *entry_ptr, unsigned flags) else mem_type = entry_ptr->type->mem_type; - if(H5F_block_write(f, mem_type, entry_ptr->addr, entry_ptr->size, entry_ptr->image_ptr) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't write image to file") + H5C__SET_PB_WRITE_HINTS(cache_ptr, entry_ptr->type) + + if ( H5F_block_write(f, mem_type, entry_ptr->addr, + entry_ptr->size, + entry_ptr->image_ptr) < 0 ) { + + H5C__RESET_PB_WRITE_HINTS(cache_ptr) + + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \ + "Can't write image to file") + } + H5C__RESET_PB_WRITE_HINTS(cache_ptr) #ifdef H5_HAVE_PARALLEL } #endif /* H5_HAVE_PARALLEL */ @@ -7083,6 +7120,10 @@ done: * small. * JRM -- 3/25/20 * + * Added macro calls to maintain the page buffer read hints. + * + * JRM -- 3/20/20 + * *------------------------------------------------------------------------- */ static void * @@ -7234,10 +7275,18 @@ H5C_load_entry(H5F_t * f, if ( !coll_access || 0 == mpi_rank ) { #endif /* H5_HAVE_PARALLEL */ - if ( H5F_block_read(f, type->mem_type, addr, len, image) < 0 ) + H5C__SET_PB_READ_HINTS(f->shared->cache, type, TRUE) + + if ( H5F_block_read(f, type->mem_type, addr, len, image) < 0 ) { + + H5C__RESET_PB_READ_HINTS(f->shared->cache) HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, \ "Can't read image*") + } + + H5C__RESET_PB_READ_HINTS(f->shared->cache) + #ifdef H5_HAVE_PARALLEL } /* end if */ /* if the collective metadata read optimization is turned on, @@ -7346,11 +7395,19 @@ H5C_load_entry(H5F_t * f, * * JRM -- 3/24/20 */ + + H5C__SET_PB_READ_HINTS(f->shared->cache, type, \ + FALSE); + if ( H5F_block_read(f, type->mem_type, addr, - actual_len, image) < 0) + actual_len, image) < 0 ) { + + H5C__RESET_PB_READ_HINTS(f->shared->cache) HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, \ "can't read image") + } + H5C__RESET_PB_READ_HINTS(f->shared->cache) #endif /* JRM */ #ifdef H5_HAVE_PARALLEL } diff --git a/src/H5Cimage.c b/src/H5Cimage.c index ee286d9..9a6d667 100644 --- a/src/H5Cimage.c +++ b/src/H5Cimage.c @@ -1058,6 +1058,22 @@ H5C__read_cache_image(H5F_t *f, H5C_t *cache_ptr) #endif /* H5_HAVE_PARALLEL */ /* Read the buffer (if serial access, or rank 0 of parallel access) */ + + /* No need to set the page buffer hints here, as if paged + * allocation is in use, we know that the cache image was allocated + * directly from the free space manager, and thus either doesn't + * cross page boundaries, or is page aligned. Between this, + * and the fact that the cache image is never read speculatively, + * the page buffer should never request hints in this context. + * + * If for some reason it does, the NULL curr_io_type will trigger + * an assertion failure. + * + * Note that we will have to revisit this if we ever use + * cache_ptr->curr_io_type for something other than sanity + * checking + * JRM -- 3/30/20 + */ if(H5F_block_read(f, H5FD_MEM_SUPER, cache_ptr->image_addr, cache_ptr->image_len, cache_ptr->image_buffer) < 0) HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, "Can't read metadata cache image block") @@ -3554,6 +3570,19 @@ H5C__write_cache_image(H5F_t *f, const H5C_t *cache_ptr) #endif /* H5_HAVE_PARALLEL */ /* Write the buffer (if serial access, or rank 0 for parallel access) */ + + /* No need to set the page buffer hints here. 
+     *
+     * If paged allocation is in use, we know that the cache image
+     * was allocated directly from the free space manager, and thus
+     * either doesn't cross page boundaries, or is page aligned.
+     * Thus it should never trigger the sanity checks in the page buffer.
+     *
+     * If for some reason it does, the NULL curr_io_type will trigger
+     * an assertion failure.
+     *
+     *                                           JRM -- 3/30/20
+     */
    if(H5F_block_write(f, H5FD_MEM_SUPER, cache_ptr->image_addr, cache_ptr->image_len, cache_ptr->image_buffer) < 0)
        HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't write metadata cache image block to file")
#ifdef H5_HAVE_PARALLEL
diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c
index 199c494..16db2ad 100644
--- a/src/H5Cmpio.c
+++ b/src/H5Cmpio.c
@@ -1018,6 +1018,19 @@ H5C__collective_write(H5F_t *f)
        HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O properties")

    /* Write data */
+   /*
+    * At present the page buffer is disabled in the parallel case, and
+    * thus VFD SWMR can't be used either.  Thus, for now, there is
+    * no point in setting the page buffer hints.
+    *
+    * More to the point, since we are actually writing a derived type
+    * containing multiple metadata cache entries, we couldn't set the
+    * hints to a meaningful value anyway.
+    *
+    * When we enable the page buffer in parallel, we will have to
+    * revisit this.
+    *                                           JRM -- 3/30/20
+    */
    if(H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0, (size_t)1, base_buf) < 0)
        HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to write entries collectively")
diff --git a/src/H5Cpkg.h b/src/H5Cpkg.h
index d9a1641..a5eafd6 100644
--- a/src/H5Cpkg.h
+++ b/src/H5Cpkg.h
@@ -3480,6 +3480,102 @@ if ( ( (entry_ptr) == NULL ) ||                                            \
} /* H5C__MOVE_TO_TOP_IN_COLL_LIST */
#endif /* H5_HAVE_PARALLEL */

+
+/***************************************/
+/* page buffer hint maintenance macros */
+/***************************************/
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5C__SET/RESET_PB_READ_HINTS
+ *
+ * Purpose:     Set or reset the fields needed to provide hints to the
+ *              page buffer so that it can disambiguate between speculative
+ *              reads that cross page boundaries and reads of metadata
+ *              entries that cross page boundaries without starting on
+ *              a page boundary.  The latter shouldn't happen, and the
+ *              hints allow the page buffer to detect such behaviour by
+ *              an errant cache client.
+ *
+ *              See the discussion of the PB hint fields in the header
+ *              comment for H5C_t for further details.
+ *
+ * Return:      N/A
+ *
+ * Programmer:  John Mainzer, 3/30/20
+ *
+ * Modifications:
+ *
+ *              None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5C__SET_PB_READ_HINTS(cache_ptr, type, may_be_speculative)          \
+{                                                                            \
+    HDassert(cache_ptr);                                                     \
+    HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC);                        \
+    HDassert((cache_ptr)->curr_io_type == NULL);                             \
+    HDassert(type);                                                          \
+    (cache_ptr)->curr_io_type = (type);                                      \
+    (cache_ptr)->curr_read_speculative = (may_be_speculative) &&             \
+        ((cache_ptr)->curr_io_type->flags & H5AC__CLASS_SPECULATIVE_LOAD_FLAG); \
+                                                                             \
+} /* H5C__SET_PB_READ_HINTS() */
+
+#define H5C__RESET_PB_READ_HINTS(cache_ptr)                                  \
+{                                                                            \
+    HDassert(cache_ptr);                                                     \
+    HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC);                        \
+    HDassert((cache_ptr)->curr_io_type);                                     \
+    (cache_ptr)->curr_io_type = NULL;                                        \
+    (cache_ptr)->curr_read_speculative = FALSE;                              \
+                                                                             \
+} /* H5C__RESET_PB_READ_HINTS() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5C__SET/RESET_PB_WRITE_HINTS
+ *
+ * Purpose:     Set or reset the fields needed to provide hints to the
+ *              page buffer so that it can detect unexpected writes of
+ *              metadata entries that cross page boundaries and do not
+ *              start on page boundaries.
+ *
+ *              See the discussion of the PB hint fields in the header
+ *              comment for H5C_t for further details.
+ *
+ * Return:      N/A
+ *
+ * Programmer:  John Mainzer, 3/30/20
+ *
+ * Modifications:
+ *
+ *              None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5C__SET_PB_WRITE_HINTS(cache_ptr, type)                             \
+{                                                                            \
+    HDassert(cache_ptr);                                                     \
+    HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC);                        \
+    HDassert((cache_ptr)->curr_io_type == NULL);                             \
+    HDassert(type);                                                          \
+    (cache_ptr)->curr_io_type = (type);                                      \
+                                                                             \
+} /* H5C__SET_PB_WRITE_HINTS() */
+
+#define H5C__RESET_PB_WRITE_HINTS(cache_ptr)                                 \
+{                                                                            \
+    HDassert(cache_ptr);                                                     \
+    HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC);                        \
+    HDassert((cache_ptr)->curr_io_type);                                     \
+    (cache_ptr)->curr_io_type = NULL;                                        \
+                                                                             \
+} /* H5C__RESET_PB_WRITE_HINTS() */
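To make the semantics concrete, a small usage sketch follows. It assumes, as in the current HDF5 sources, that the object header client (H5AC_OHDR) sets H5AC__CLASS_SPECULATIVE_LOAD_FLAG; the surrounding code is illustrative only, not part of this patch:

    /* curr_read_speculative is TRUE only when the caller passes
     * may_be_speculative == TRUE *and* the client's class flags
     * include H5AC__CLASS_SPECULATIVE_LOAD_FLAG.
     */
    H5C__SET_PB_READ_HINTS(cache_ptr, H5AC_OHDR, TRUE)
    /* here: curr_io_type == H5AC_OHDR, curr_read_speculative == TRUE */
    H5C__RESET_PB_READ_HINTS(cache_ptr)

    H5C__SET_PB_READ_HINTS(cache_ptr, H5AC_OHDR, FALSE)
    /* a final read of known length: curr_read_speculative == FALSE */
    H5C__RESET_PB_READ_HINTS(cache_ptr)

Note that the set macros assert curr_io_type == NULL and the reset macros assert it is non-NULL, so an unbalanced or nested set/reset pair fails an assertion in debug builds. This is why every error path between a set and its reset must reset the hints before invoking HGOTO_ERROR().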

/****************************/
/* Package Private Typedefs */
@@ -4413,6 +4509,47 @@ typedef struct H5C_tag_info_t {
 *              managers that are involved in allocating space for free
 *              space managers.
 *
+ * Page Buffer Related Fields:
+ *
+ * Due to the irregular behavior of some of the cache clients, the
+ * page buffer occasionally needs hints to manage metadata I/O requests
+ * from the metadata cache -- particularly in the context of VFD SWMR.
+ * The following fields exist to support this.
+ *
+ * curr_io_type: Pointer to the instance of H5C_class_t associated with
+ *              the current I/O operation.  This pointer should be set
+ *              just before any I/O operation by the metadata cache, and
+ *              reset to NULL immediately thereafter.
+ *
+ *              This field exists because the fixed and variable length
+ *              array cache clients allocate numerous entries in a single
+ *              block, and sub-allocate metadata cache entries out of this
+ *              block.  The effect of this is to break the invariant,
+ *              normally maintained by the free space managers in paged
+ *              allocation mode, that no entry of less than a page in
+ *              size crosses page boundaries, and that entries of page
+ *              size or greater are page aligned.  This in turn causes
+ *              problems for the page buffer -- particularly in VFD SWMR
+ *              mode.
+ *
+ *              The correct solution is to modify the fixed and variable
+ *              length array cache clients to repair this.  However, in
+ *              the interim, this field exists to detect similar
+ *              behaviour elsewhere.
+ *
+ *              To complicate matters, speculative reads for metadata
+ *              cache entries which must determine their lengths via
+ *              inspection of the on disk image of the entry may mimic
+ *              the behaviour of the fixed and extensible arrays.  Thus
+ *              curr_io_type is also needed to disambiguate reads.
+ *
+ * curr_read_speculative: Boolean flag indicating whether the current
+ *              read request is speculative -- that is, whether it is an
+ *              initial read whose length is not guaranteed to be correct.
+ *              The field is used to distinguish between the initial and
+ *              final read attempts of a speculative load.
+ *
 *
 * Statistics collection fields:
 *
@@ -4744,6 +4881,28 @@ typedef struct H5C_tag_info_t {
 *              called successfully.  This field is only defined when
 *              NDEBUG is not #defined.
 *
+ * curr_io_type: Pointer to the instance of H5C_class_t associated with
+ *              the current I/O operation.  This pointer should be set
+ *              just before any I/O operation by the metadata cache, and
+ *              reset to NULL immediately thereafter.  This field is
+ *              only defined when NDEBUG is not #defined.
+ *
+ *              This field exists because the fixed and variable length
+ *              array cache clients allocate numerous entries in a single
+ *              block, and sub-allocate metadata cache entries out of this
+ *              block.  The effect of this is to break the invariant,
+ *              normally maintained by the free space managers in paged
+ *              allocation mode, that no entry of less than a page in
+ *              size crosses page boundaries, and that entries of page
+ *              size or greater are page aligned.  This in turn causes
+ *              problems for the page buffer -- particularly in VFD SWMR
+ *              mode.
+ *
+ *              The correct solution is to modify the fixed and variable
+ *              length array cache clients to repair this.  However, in
+ *              the interim, this field exists to detect similar
+ *              behaviour elsewhere.
+ *
 ****************************************************************************/
struct H5C_t {
    uint32_t                    magic;
@@ -4892,6 +5051,10 @@ struct H5C_t {
    hbool_t                     rdfsm_settled;
    hbool_t                     mdfsm_settled;

+   /* Fields supporting page buffer hints */
+   const H5C_class_t *         curr_io_type;
+   hbool_t                     curr_read_speculative;
+
#if H5C_COLLECT_CACHE_STATS
    /* stats fields */
    int64_t                     hits[H5C__MAX_NUM_TYPE_IDS + 1];
@@ -5025,6 +5188,8 @@ H5_DLL herr_t H5C__untag_entry(H5C_t *cache, H5C_cache_entry_t *entry);
/* Testing functions */
#ifdef H5C_TESTING
H5_DLL herr_t H5C__verify_cork_tag_test(hid_t fid, H5O_token_t tag_token, hbool_t status);
+H5_DLL void H5C_set_curr_io_type_splitable(H5C_t * cache_ptr,
+    hbool_t set_splitable);
#endif /* H5C_TESTING */

#endif /* _H5Cpkg_H */
diff --git a/src/H5Cprivate.h b/src/H5Cprivate.h
index 23091cb..7678911 100644
--- a/src/H5Cprivate.h
+++ b/src/H5Cprivate.h
@@ -2411,6 +2411,8 @@ H5_DLL herr_t H5C_get_cache_size(H5C_t *cache_ptr, size_t *max_size_ptr,
    uint32_t *cur_num_entries_ptr);
H5_DLL herr_t H5C_get_cache_flush_in_progress(H5C_t *cache_ptr, hbool_t *flush_in_progress_ptr);
H5_DLL herr_t H5C_get_cache_hit_rate(H5C_t *cache_ptr, double *hit_rate_ptr);
+H5_DLL int H5C_get_curr_io_client_type(H5C_t * cache_ptr);
+H5_DLL hbool_t H5C_get_curr_read_speculative(H5C_t * cache_ptr);
H5_DLL herr_t H5C_get_entry_status(const H5F_t *f, haddr_t addr, size_t *size_ptr,
    hbool_t *in_cache_ptr, hbool_t *is_dirty_ptr, hbool_t *is_protected_ptr,
    hbool_t *is_pinned_ptr, hbool_t *is_corked_ptr,
diff --git a/src/H5Cquery.c b/src/H5Cquery.c
index 9f1ec31..477a8ba 100644
--- a/src/H5Cquery.c
+++ b/src/H5Cquery.c
@@ -452,3 +452,111 @@ done:
    FUNC_LEAVE_NOAPI(ret_value)
} /* H5C_get_mdc_image_info() */
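The two query functions added below are the page buffer's side of the hint interface. Condensed from the H5PB_read() and H5PB_write() hunks later in this patch, the consuming pattern reduces to the following (the split_read flag and surrounding logic appear in full in the H5PB.c hunks):

    /* Ask the metadata cache which client is performing the current
     * I/O; split the request only for the two known mis-behaving
     * clients (fixed and extensible array data block pages).
     */
    int mdc_client_id = H5C_get_curr_io_client_type(shared->cache);

    if ( ( mdc_client_id == (int)H5AC_EARRAY_DBLK_PAGE_ID ) ||
         ( mdc_client_id == (int)H5AC_FARRAY_DBLK_PAGE_ID ) ) {

        split_read = TRUE;
    }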

+/*-------------------------------------------------------------------------
+ * Function:    H5C_get_curr_io_client_type
+ *
+ * Purpose:     Return the type id associated with the metadata cache
+ *              client whose data is currently being read or written.
+ *
+ *              This id is obtained via the curr_io_type field in
+ *              H5C_t, which is set just before most I/O calls from the
+ *              metadata cache, and reset to NULL immediately thereafter.
+ *
+ *              If cache_ptr->curr_io_type is NULL, the function
+ *              returns -1.
+ *
+ *              Note: At present, cache_ptr->curr_io_type should always
+ *              be defined in the serial case with the exception
+ *              of cache image I/O.  In general, it is not defined in
+ *              the parallel case.  This is not a problem for now, as
+ *              this function is used in page buffer sanity checking,
+ *              and for now at least, the page buffer is not enabled in
+ *              the parallel case.
+ *
+ * Return:      ID of the cache client whose image is being read or
+ *              written, or -1 if cache_ptr->curr_io_type is undefined.
+ *
+ * Programmer:  John Mainzer
+ *              3/31/20
+ *
+ * Changes:     None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+int
+H5C_get_curr_io_client_type(H5C_t * cache_ptr)
+{
+    int ret_value = -1;      /* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+    HDassert(cache_ptr);
+    HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
+
+    if ( cache_ptr->curr_io_type ) {
+
+        ret_value = cache_ptr->curr_io_type->id;
+    }
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5C_get_curr_io_client_type() */
+
+
+/*-------------------------------------------------------------------------
+ * Function:    H5C_get_curr_read_speculative
+ *
+ * Purpose:     Return a boolean flag indicating whether the current
+ *              read is speculative.
+ *
+ *              Note that this value is only defined during a read
+ *              generated by the metadata cache.  At all other times,
+ *              the return value is undefined (although the current
+ *              implementation returns FALSE in such cases).
+ *
+ *              Note also that this function exists to provide hints to
+ *              the page buffer, which for now at least, is only
+ *              available in the serial case.  It should not be depended
+ *              upon in the parallel case -- at least until verified, and
+ *              potential interactions with collective metadata reads
+ *              are investigated and dismissed.
+ *
+ * Return:      TRUE if the current call to H5F_block_read() by the
+ *              metadata cache is an initial read attempt for a cache
+ *              client whose speculative read flag is set (in
+ *              H5AC_class_t), and FALSE otherwise.
+ *
+ *              The return value is undefined if a call to
+ *              H5F_block_read() by the metadata cache is not in
+ *              progress.
+ *
+ * Programmer:  John Mainzer
+ *              3/31/20
+ *
+ * Changes:     None.
+ * + *------------------------------------------------------------------------- + */ + +hbool_t +H5C_get_curr_read_speculative(H5C_t * cache_ptr) +{ + hbool_t ret_value = FALSE; /* Return value */ + + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(cache_ptr); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + + if ( cache_ptr->curr_io_type ) { + + ret_value = cache_ptr->curr_read_speculative; + } + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5C_get_curr_read_speculative() */ + + diff --git a/src/H5Ctest.c b/src/H5Ctest.c index 7f24302..b549da5 100644 --- a/src/H5Ctest.c +++ b/src/H5Ctest.c @@ -78,8 +78,6 @@ typedef struct { /* Local Variables */ /*******************/ - - /*------------------------------------------------------------------------- * Function: H5C__verify_cork_tag_test_cb @@ -167,3 +165,57 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* H5C__verify_cork_tag_test() */ + +/*------------------------------------------------------------------------- + * Function: H5C_set_curr_io_type_splitable() + * + * Purpose: To test the meta data entry splitting capability in the page + * buffer (needed to deal with H5FA and H5EA's unfortunate + * design choice of sub-allocating multiple metadata entries + * out of a single file space allocation), we must be able + * to configure the metadata cache to report that the + * current I/O request is for such an entry. + * + * To do this, we must set cache_ptr->curr_io_type to + * point to the instance of H5C_class_t with one such + * client. + * + * This function does this by setting cache_ptr->curr_io_type + * to H5AC_EARRAY_DBLK_PAGE if set_splitable is TRUE, and to + * NULL otherwise. + * + * Needless to say, this is purely a testing function, and + * should not be called otherwise. + * + * Return: void + * + * Programmer: John Mainzer + * 4/10/20 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ + +void +H5C_set_curr_io_type_splitable(H5C_t * cache_ptr, hbool_t set_splitable) +{ + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(cache_ptr); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + + if ( set_splitable ) { + + cache_ptr->curr_io_type = H5AC_EARRAY_DBLK_PAGE; + + } else { + + cache_ptr->curr_io_type = NULL; + } + + + FUNC_LEAVE_NOAPI_VOID + +} /* H5C_set_curr_io_type_splitable() */ + @@ -52,9 +52,12 @@ /****************/ /* Round _x down to nearest _size. */ +/* not used at present */ +/* #ifndef rounddown #define rounddown(_x, _size) (((_x) / (_size)) * (_size)) #endif +*/ /* Round _x up to nearest _size. 
 */
#ifndef roundup
@@ -113,14 +116,6 @@ static herr_t H5PB__write_meta(H5F_shared_t *, H5FD_mem_t, haddr_t,
static herr_t H5PB__write_raw(H5F_shared_t *, H5FD_mem_t, haddr_t, size_t,
    const void *);

-static void metadata_section_split(size_t, haddr_t, size_t, const void *,
-    metadata_section_t *);
-
-static herr_t metadata_multipart_read(H5F_shared_t *, H5FD_mem_t, haddr_t,
-    size_t, void *);
-
-static herr_t metadata_multipart_write(H5F_shared_t *, H5FD_mem_t, haddr_t,
-    size_t, const void *);

static void H5PB_log_access_by_size_counts(const H5PB_t *);

@@ -225,6 +220,8 @@ H5PB_reset_stats(H5PB_t *pb_ptr)
    pb_ptr->max_dwl_len = 0;
    pb_ptr->max_dwl_size = 0;
    pb_ptr->total_dwl_ins_depth = 0;
+   pb_ptr->md_read_splits = 0;
+   pb_ptr->md_write_splits = 0;

    FUNC_LEAVE_NOAPI(SUCCEED)

@@ -255,7 +252,13 @@ H5PB_reset_stats(H5PB_t *pb_ptr)
 *              --bypasses: the number of metadata and raw data accesses
 *                that bypass the page buffer layer
 *
- * Return:      Non-negative on success/Negative on failure
+ *              TODO: The available stats have changed considerably
+ *                    since Mohamad wrote this routine.  Update
+ *                    the function once things settle down.
+ *
+ *                                             JRM -- 4/13/20
+ *
+ * Return:      Non-negative on success/Negative on failure
 *
 * Programmer:  Mohamad Chaarawi
 *
@@ -300,7 +303,9 @@ H5PB_get_stats(const H5PB_t *pb_ptr, unsigned accesses[2], unsigned hits[2],
 *
 * Programmer:  John Mainzer -- 10/12/18
 *
- * Changes:     None.
+ * Changes:     Added support for md_read_splits and md_write_splits.
+ *
+ *                                             JRM -- 4/11/20
 *
 *-------------------------------------------------------------------------
 */
@@ -407,10 +412,14 @@ H5PB_print_stats(const H5PB_t *pb_ptr)
        ave_delayed_write_ins_depth = (double)(pb_ptr->total_dwl_ins_depth) /
            (double)(pb_ptr->delayed_writes);
    }
+
    HDfprintf(stdout,
        "delayed writes / ave delay / ave ins depth = %lld / %llf / %llf\n",
        pb_ptr->delayed_writes, ave_delayed_write,
        ave_delayed_write_ins_depth);

+   HDfprintf(stdout, "metadata read / write splits = %lld / %lld.\n",
+       pb_ptr->md_read_splits, pb_ptr->md_write_splits);
+
    FUNC_LEAVE_NOAPI(SUCCEED)

} /* H5PB_print_stats */

@@ -447,7 +456,10 @@ H5PB_print_stats(const H5PB_t *pb_ptr)
 *
 * Programmer:  John Mainzer -- 10/12/18
 *
- * Changes:     None.
+ * Changes:     Modified function to prevent the insertion of raw
+ *              data pages when operating in VFD SWMR mode.
+ *
+ *                                             JRM -- 3/25/20
 *
 *-------------------------------------------------------------------------
 */
@@ -471,7 +483,8 @@ H5PB_add_new_page(H5F_shared_t *shared, H5FD_mem_t type, haddr_t page_addr)

    if ( H5FD_MEM_DRAW == type ) { /* raw data page insertion */

-       if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) {
+       if ( ( pb_ptr->min_md_pages == pb_ptr->max_pages ) ||
+            ( pb_ptr->vfd_swmr ) ) {

            can_insert = FALSE;
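For reference, the lifecycle of the two new split counters spans three of the routines touched above: they are zeroed in H5PB_reset_stats(), incremented once per split I/O via the update macros used later in H5PB_read() and H5PB_write(), and reported by H5PB_print_stats(). A condensed view (macro names as they appear in this patch; their definitions in H5PBpkg.h are assumed to follow the pattern of the other stats macros):

    pb_ptr->md_read_splits = 0;                 /* H5PB_reset_stats()   */
    pb_ptr->md_write_splits = 0;

    H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr)   /* end of a split read  */
    H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr)  /* end of a split write */

    HDfprintf(stdout, "metadata read / write splits = %lld / %lld.\n",
              pb_ptr->md_read_splits, pb_ptr->md_write_splits);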
@@ -517,7 +530,12 @@ done:
 *
 * Programmer:  John Mainzer -- 10/11/18
 *
- * Changes:     None.
+ * Changes:     Added initialization for the vfd_swmr field.  Also
+ *              added code to force min_rd_pages to 0 if vfd_swmr is
+ *              TRUE.  Do this since we now exclude raw data from the
+ *              page buffer when operating in VFD SWMR mode.
+ *
+ *                                             JRM -- 3/28/20
 *
 *-------------------------------------------------------------------------
 */
herr_t
H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc,
    unsigned page_buf_min_raw_perc)
{
+   hbool_t vfd_swmr = FALSE;
    hbool_t vfd_swmr_writer = FALSE;
    int i;
    int32_t min_md_pages;
@@ -575,11 +594,21 @@ H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc,
        (int32_t)(size / shared->fs_page_size));

-   /* compute vfd_swmr_writer */
-   if ( ( H5F_SHARED_VFD_SWMR_CONFIG(shared) ) && ( H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR ) ) {
+   /* compute vfd_swmr and vfd_swmr_writer */
+   if ( H5F_SHARED_VFD_SWMR_CONFIG(shared) ) {
+
+       vfd_swmr = TRUE;
+
+       /* force min_rd_pages to zero since raw data is excluded from
+        * the page buffer in VFD SWMR mode.
+        */
+       min_rd_pages = 0;
+
+       if ( H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR ) {

-       HDassert(shared->vfd_swmr_config.writer);
-       vfd_swmr_writer = TRUE;
+           HDassert(shared->vfd_swmr_config.writer);
+           vfd_swmr_writer = TRUE;
+       }
    }

@@ -629,6 +658,7 @@ H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc,
    /* VFD SWMR specific fields.
     * The following fields are defined iff vfd_swmr_writer is TRUE.
     */
+   pb_ptr->vfd_swmr = vfd_swmr;
    pb_ptr->vfd_swmr_writer = vfd_swmr_writer;
    pb_ptr->mpmde_count = 0;
    pb_ptr->cur_tick = 0;
@@ -965,9 +995,11 @@ H5PB_log_access_by_size_counts(const H5PB_t *pb)
 *
 *              2) If the read is for raw data, and the page buffer is
 *                 configured for metadata only (i.e. min_md_pages ==
- *                max_pages), simply read from the HDF5 file and return.
+ *                max_pages), or if we are operating in VFD SWMR mode
+ *                (i.e. vfd_swmr == TRUE), simply read from the HDF5
+ *                file and return.
 *
- *             3) If the read is for raw data, and it of page size or
+ *             3) If the read is for raw data, and is of page size or
 *                 larger, read it directly from the HDF5 file.
 *
 *                 It is possible that the page buffer contains dirty pages
@@ -997,17 +1029,41 @@ H5PB_log_access_by_size_counts(const H5PB_t *pb)
 *                 between small and multi-page metadata entries so that
 *                 pages containing the former will be buffered and the
 *                 latter be read directly from file.
- *
- *                Unfortunately, the metadata cache does not always know the
+ *
+ *                Unfortunately, there are several flies in the ointment.
+ *
+ *                First, the fixed and extensible array on disk data
+ *                structures allocate multiple metadata cache entries in
+ *                a single block, and use this fact to make the addresses
+ *                of all but the first entry in the block computable.  While
+ *                this simplifies the fixed and extensible array on disk data
+ *                structures, it complicates the metadata cache and the page
+ *                buffer.  Needless to say, the correct solution to this
+ *                problem is to remove the complexity at its source.  However,
+ *                for now, we must code around the problem.
+ *
+ *                Thus, this function must examine each read request
+ *                to determine if it crosses page boundaries and is not for
+ *                two or more complete pages.  If it does, and it is one of
+ *                the fixed or extensible array entries that is sub-allocated
+ *                from a larger space allocation, the read request must be
+ *                split into the minimal set of read requests that either
+ *                don't cross page boundaries, or are page aligned and
+ *                consist of an integral number of pages.
+ *
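To make the splitting rule concrete, a worked example (an editor's illustration with hypothetical numbers, not part of the patch):

    /* Assume 4 KiB pages (page_size = 4096).  A 9000 byte read of an
     * extensible array data block page at addr = 6144 crosses page
     * boundaries without being page aligned, so it is split:
     *
     *   prefix: [ 6144,  8192) -- 2048 bytes, up to the next page edge
     *   body:   [ 8192, 12288) -- 4096 bytes, one whole page
     *   suffix: [12288, 15144) -- 2856 bytes, tail within the last page
     *
     * 2048 + 4096 + 2856 == 9000, and each piece either stays within a
     * single page or is page aligned and an integral number of pages,
     * so each can be serviced by H5PB__read_meta() directly.
     */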
+ *                Second, the metadata cache does not always know the
 *                 size of metadata entries when it tries to read them.  In
 *                 such cases, it issues speculative reads that may be either
 *                 smaller or larger than the actual size of the piece of
 *                 metadata that is finally read.
 *
 *                 Since we are guaranteed that all metadata allocations larger
- *                that one page are page aligned, we can safely clip at the
- *                page boundary any non page aligned metadata read that crosses
- *                page boundaries.
+ *                than one page are page aligned (with the exception of those
+ *                sub-allocated from larger allocations -- which we deal with
+ *                by splitting I/O requests as discussed above), we can safely
+ *                clip at the page boundary any non page aligned metadata
+ *                read that crosses page boundaries.
 *
 *                 However, page aligned reads could wind up being either
 *                 small or multi-page.  This results in two scenarios that
@@ -1048,15 +1104,13 @@ H5PB_log_access_by_size_counts(const H5PB_t *pb)
 *
 *              8) If the read is for metadata, is page aligned, is larger
 *                 than one page, and there is a regular entry at the target
- *                page address, test to see if the last read was for the
- *                same address.
+ *                page address, test to see if the read is speculative.
 *
- *                If it was, evict the page, and satisfy the read from file.
- *                Flag an error if the page was dirty.
+ *                If it is not, evict the page, and satisfy the read from
+ *                file.  Flag an error if the page was dirty.
 *
- *                If the last read was for a different page, clip the read
- *                to one page, and satisfy the read from the existing
- *                regular entry.
+ *                If it is, clip the read to one page, and satisfy the
+ *                read from the existing regular entry.
 *
 *              9) If the read is for metadata, is page aligned, is larger
 *                 than one page, and there is a multi-page metadata entry
@@ -1091,63 +1145,337 @@ H5PB_log_access_by_size_counts(const H5PB_t *pb)
 *
 * Programmer:  John Mainzer -- 10/11/18
 *
- * Changes:     None.
+ * Changes:     Updated for discovery of the fact that the fixed and
+ *              extensible array data structures allocate multiple
+ *              metadata cache entries in a single block, and thus
+ *              violate the invariant that metadata entries either
+ *              do not cross page boundaries, or are page aligned.
+ *
+ *                                             JRM -- 3/28/20
 *
 *-------------------------------------------------------------------------
 */
-/* TBD Add optional raw-data bypass here and at H5PB_write when we
- * are operating in parallel mode.
- */ + herr_t H5PB_read(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*out*/) { - H5PB_t *pb_ptr; /* Page buffer for this file */ + H5PB_t *pb_ptr; /* Page buffer for this file */ + hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */ + hbool_t split_read = FALSE; /* whether the read must be split */ herr_t ret_value = SUCCEED; /* Return value */ + /* the following six fields are defined iff split_read is TRUE */ + haddr_t prefix_addr = HADDR_UNDEF; /* addr of prefix -- if defined */ + haddr_t body_addr = HADDR_UNDEF; /* addr of body -- if defined */ + haddr_t suffix_addr = HADDR_UNDEF; /* addr of suffix -- if defined */ + size_t prefix_size = 0; /* size of prefix */ + size_t body_size = 0; /* size of body */ + size_t suffix_size = 0; /* size of suffix */ + + FUNC_ENTER_NOAPI(FAIL) + /* Sanity checks */ + HDassert(shared); + hlog_fast(pbrd, "%s %p type %d %" PRIuHADDR " size %zu", __func__, (void *)shared, type, addr, size); + pb_ptr = shared->pb_ptr; if (pb_ptr != NULL && type != H5FD_MEM_DRAW) H5PB_count_meta_access_by_size(pb_ptr, size); - HDassert(pb_ptr == NULL || pb_ptr->magic == H5PB__H5PB_T_MAGIC); + if ( pb_ptr == NULL ) { - /* Bypass the page buffer in case - * 1) page buffer is disabled - * _) MPI I/O is enabled - * 2) page buffer configured for metadata only, and it's a raw-data access - * 5) page buffer configured for raw data only, and it's a metadata access - */ - if (pb_ptr == NULL || H5F_SHARED_HAS_FEATURE(shared, H5FD_FEAT_HAS_MPI) || - (H5FD_MEM_DRAW == type && pb_ptr->min_md_pages == pb_ptr->max_pages) || - (H5FD_MEM_DRAW != type && pb_ptr->min_rd_pages == pb_ptr->max_pages)) { + bypass_pb = TRUE; /* case 1) -- page buffer is disabled */ + + } else { + + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + + if ( H5FD_MEM_DRAW == type ) { /* raw data read */ + + if ( ( pb_ptr->min_md_pages == pb_ptr->max_pages ) || + ( pb_ptr->vfd_swmr ) ) { + + /* case 2) -- page buffer configured for metadata only + * or vfd swmr. + */ + bypass_pb = TRUE; + + } + } else { /* metadata read */ + + if ( pb_ptr->min_rd_pages == pb_ptr->max_pages ) { + + /* case 5) -- page buffer configured for raw data only */ + bypass_pb = TRUE; + + } else { + /* determine whether the read request must be split, + * and if so, compute the start points and sizes of + * of the sections. + * + * Note: The following code is almost identical to the + * similar code in H5PB_write(). Thus, on the surface, + * it is an obvious candidate for refactoring into a + * function 0r macro. + * + * However, there are subtle differences between + * the two pieces of code which are driven by the + * possibility of speculative reads. + * + * More to the point, further changes may be necessary. + * Thus we should wait on refactoring until this code has + * been in daily use for some time, and it is clear + * that further changes are unlikely. 
+ */ + int mdc_client_id = -1; /* id of mdc client, or -1 if undef */ + uint64_t start_page; /* page index of first page in read */ + uint64_t second_page; /* page index of second page in read */ + uint64_t end_page; /* page index of last page in read */ + uint64_t body_page; /* page index of start of body */ + haddr_t start_page_addr; /* addr of first page in read */ + haddr_t second_page_addr;/* addr of second page in read */ + haddr_t end_page_addr; /* addr of last page in read */ + haddr_t end_addr; /* addr of last byte in read */ + + /* Calculate the aligned address of the first page */ + start_page = (addr / pb_ptr->page_size); + start_page_addr = start_page * pb_ptr->page_size; + + /* Calculate the aligned address of the last page */ + end_addr = addr + (haddr_t)(size - 1); + end_page = end_addr / (haddr_t)(pb_ptr->page_size); + end_page_addr = end_page * pb_ptr->page_size; + + HDassert(start_page_addr <= addr); + HDassert(addr < start_page_addr + (haddr_t)(pb_ptr->page_size)); + + HDassert(start_page <= end_page); + HDassert(end_page_addr <= ((addr + (haddr_t)size - 1))); + HDassert((addr + (haddr_t)size - 1) < + (end_page_addr + pb_ptr->page_size)); + + /* test to see if the read crosses a page boundary, and + * does not start on a page boundary, and is not of an + * integral number of pages. + */ + if ( ( start_page < end_page ) && + ( ! ( ( addr == start_page_addr ) && + ( end_page_addr + (haddr_t)(pb_ptr->page_size) == + end_addr + 1 ) ) ) ) { + + /* the read crosses a page boundary and is not + * page aligned and of length some multiple of page size. + * + * Test to see if the read is for a metadata entry that + * is sub-allocated from a larger space allocation. + * + * Note that the following test may have to be + * adjusted. + */ + mdc_client_id = H5C_get_curr_io_client_type(shared->cache); + + if ( ( mdc_client_id == (int)H5AC_EARRAY_DBLK_PAGE_ID ) || \ + ( mdc_client_id == (int)H5AC_FARRAY_DBLK_PAGE_ID ) ) { + + split_read = TRUE; + } + } + + if ( split_read ) { + + /* compute the base addresses and length of the prefix, + * body, and suffix of the read, where these terms are + * defined as follows: + * + * prefix: All bytes from addr to the first page address + * at or after addr. If addr == start_page_addr, + * the prefix is empty. + * + * body: All bytes from the first page address covered + * by the read up to but not including the last + * page address in the read. Note that the + * length of the body must be a multiple of the + * page size. If only one page address is + * included in the read, the body is empty. + * + * suffix: All bytes from the last page address in the + * read until the end of the read. If the + * read ends on a page boundary, the suffix is + * empty. + * + * Since we know that the read crosses at least one + * page boundary, and we have aleady filtered out the + * body only case, at least two of the above must be + * non-empty. 
+ */ + + second_page = start_page + 1; + second_page_addr = + (haddr_t)(second_page * pb_ptr->page_size); + + if ( addr > start_page_addr ) { /* prefix exists */ + + prefix_addr = addr; + prefix_size = (size_t)(second_page_addr - addr); + + HDassert(prefix_addr > start_page_addr); + HDassert(prefix_size < pb_ptr->page_size); + HDassert(((size_t)(addr - start_page_addr) + \ + prefix_size) == pb_ptr->page_size); + } + + if ( size - prefix_size >= pb_ptr->page_size ) { + + /* body exists */ + + if ( addr == start_page_addr ) { + + body_page = start_page; + body_addr = start_page_addr; + + } else { + + body_page = second_page; + body_addr = second_page_addr; + } + + if ( end_addr < end_page_addr + + (haddr_t)(pb_ptr->page_size - 1) ) { + + /* suffix exists */ + body_size = (size_t)(end_page - body_page) * + pb_ptr->page_size; + + } else { + + /* suffix is empty */ + body_size = (size_t)(end_page - body_page + 1) * + pb_ptr->page_size; + } + + HDassert((body_page == start_page) || \ + (body_page == start_page + 1)); + + HDassert(body_addr == \ + (haddr_t)(body_page * pb_ptr->page_size)); + + HDassert(body_size < size); + HDassert(body_size >= pb_ptr->page_size); + + + HDassert(body_addr == \ + addr + (haddr_t)prefix_size); + HDassert((body_addr + (haddr_t)body_size) \ + <= (end_addr + 1)); + } - if (H5FD_read(shared->lf, type, addr, size, buf) < 0) { - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, - "read through lower VFD failed"); + if ( end_addr < end_page_addr + + (haddr_t)(pb_ptr->page_size - 1) ) { + + suffix_addr = end_page_addr; + suffix_size = (end_addr + 1) - end_page_addr; + + HDassert(suffix_addr == \ + addr + (haddr_t)(prefix_size + body_size)); + } + + HDassert(size == prefix_size + body_size + suffix_size); + } + } } + } + +#ifdef H5_HAVE_PARALLEL + /* at present, the page buffer must be disabled in the parallel case. + * However, just in case ... + */ + if ( H5F_SHARED_HAS_FEATURE(shared, H5FD_FEAT_HAS_MPI) ) { + + bypass_pb = TRUE; + + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + + + if ( bypass_pb ) { /* cases 1, 2. and 5 */ + + if ( H5FD_read(shared->lf, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "read through failed") + + /* Update statistics */ + if ( pb_ptr ) { - if (pb_ptr != NULL) H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size); - HGOTO_DONE(SUCCEED); - } + } + } else { - if (H5FD_MEM_DRAW == type) { /* cases 3 and 4 */ - if (H5PB__read_raw(shared, type, addr, size, buf) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "raw read failed"); - } else if (metadata_multipart_read(shared, type, addr, size, buf) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "meta read failed"); + if ( H5FD_MEM_DRAW == type ) { /* cases 3 and 4 */ - H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); + if ( H5PB__read_raw(shared, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_raw() failed") + + } else if ( split_read ) { + + /* handle the sub-allocated entry case */ + + /* read prefix if it exists */ + if ( prefix_size > 0 ) { + + if ( H5PB__read_meta(shared, type, prefix_addr, + prefix_size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed on prefix") + } + + /* read body -- if it exists. */ + if ( body_size > 0 ) { + + if ( H5PB__read_meta(shared, type, body_addr, body_size, + (void *)((uint8_t *)buf + + prefix_size)) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed on body") + } + + /* read suffix -- if it exists. 
*/ + if ( suffix_size > 0 ) { + + if ( H5PB__read_meta(shared, type, suffix_addr, suffix_size, + (void *)((uint8_t *)buf + prefix_size + + body_size)) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed on suffix") + } + + H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr) + + } else { /* pass to H5PB_read_meta() -- cases 6, 7, 8, 9, & 10 */ + + if ( H5PB__read_meta(shared, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed") + } + } done: + FUNC_LEAVE_NOAPI(ret_value) -} + +} /* H5PB_read() */ /* Remove the entry corresponding to lower-file page number `page`. * Return 0 if there was no such entry or if the entry was removed @@ -1241,12 +1569,16 @@ herr_t H5PB_remove_entry(H5F_shared_t *shared, haddr_t addr) { uint64_t page; - H5PB_t *pb_ptr; + H5PB_t *pb_ptr = NULL; H5PB_entry_t *entry_ptr = NULL; - herr_t ret_value = SUCCEED; + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) + /* Sanity checks */ + HDassert(shared); + HDassert(shared->pb_ptr); + pb_ptr = shared->pb_ptr; /* Calculate the page offset */ @@ -1306,49 +1638,169 @@ done: } /* H5PB_remove_entry */ + +/*------------------------------------------------------------------------- + * + * Function: H5PB_remove_entries + * + * Purpose: Remove entries in the page buffer associated with a + * newly freed multi-page block of file space. + * + * There are several possible situations here. + * + * In the context of metadata, there are two possible cases. + * + * 1) The block of file space is associated with a metadata + * entry. + * + * In regular operating mode, this entry will not be + * cached in the page buffer, so there should be nothing + * to do. + * + * In VFD SWMR mode, the entry may be cached in a single + * multi-page entry. + * + * 2) The block of file space has been sub-allocated + * into multiple metadata entries (i.e. fixed and extensible + * array). In this case, the individual entries may cross + * boundaries without being page aligned -- however, for + * purposes of the page buffer, I/O requests on these + * entries will have been broken up into requests that + * either do not cross page boundaries or are page aligned. + * + * In the context of raw data, the page buffer may or may + * not contain regular entries scattered over the space + * touched by the newly freed file space. + * + * In all contexts, there is no guarantee that the page buffer + * will contain any of the possible entries. + * + * Space allocations larger than one page must be page alligned. + * Further, any space between the end of a multi-page allocation + * and the next page boundary will remain un-allocated until after + * the original allocation is freed. This implies that: + * + * 1) The address passed into this call must be page aligned. + * + * 2) The page buffer may safely discard any page that + * intersects with the newly freed file space allocation. + * + * The bottom line here is that we must scan the page buffer + * index, and discard all entries that intersect the supplied + * address and length. As a sanity check, we must verify that + * any such entries don't overlap. + * + * Also, in the context of the VFD SWMR write, it is possible + * that the discarded pages will reside in the tick list or + * the delayed write list -- if so, they must be removed + * prior to eviction. + * + * Note: + * + * This function scans the page buffer hash table to + * find entries to remove. 
While this is normally
 *              pretty inexpensive, a very large (i.e. GB) file
 *              space free may impose significant cost.
 *
 *              As best I understand it, such frees are rare, so
 *              the current solution should be good enough for now.
 *              However, if we determine that the current solution
 *              is too expensive, two alternate solutions come to mind.
 *
 *              a) Scan the index list instead of the hash table
 *                 if the free is sufficiently large.  Also, skip
 *                 entirely if the page buffer doesn't contain any
 *                 pages of the appropriate type.
 *
 *              b) Whenever writing a large metadata entry, scan for
 *                 intersecting entries and delete them.  (Potential
 *                 issues with fixed and variable array entries are
 *                 dealt with via the splitting mechanism.)  In this
 *                 case we would also have to simply ignore writes
 *                 beyond EOA on flush or close.
 *
 *              Note that we already scan for intersecting entries
 *              on large raw data writes -- with possible performance
 *              issues for large writes.
 *
 *                                             JRM -- 4/25/20
 *
 * Return:      Non-negative on success/Negative on failure
 *
 * Programmer:  John Mainzer 4/25/20
 *
 * Changes:     None.
 *
 *-------------------------------------------------------------------------
 */

herr_t
H5PB_remove_entries(H5F_shared_t *shared, haddr_t addr, hsize_t size)
{
-    H5PB_t *pb_ptr;
-    herr_t ret_value = SUCCEED;
-    metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}};
-    int i;
+    uint64_t i;
+    uint64_t start_page;
+    uint64_t end_page;
+    int64_t entry_pages = 0;
+    hsize_t entry_size;
+    H5PB_t *pb_ptr = NULL;
+    H5PB_entry_t *entry_ptr = NULL;
+    herr_t ret_value = SUCCEED;          /* Return value */

    FUNC_ENTER_NOAPI(FAIL)

+    /* Sanity checks */
+    HDassert(shared);
+    HDassert(shared->pb_ptr);
+
    pb_ptr = shared->pb_ptr;

-    HDassert(addr % pb_ptr->page_size == 0);
+    /* Calculate the start_page offset */
+    start_page = (addr / pb_ptr->page_size);

-    if (size > pb_ptr->page_size) {
-        hlog_fast(pbrm,
-            "removing multipage region [%" PRIuHADDR ", %" PRIuHADDR ")",
-            addr, addr + size);
-    }
+    HDassert(addr == start_page * pb_ptr->page_size);

-    metadata_section_split(pb_ptr->page_size, addr, size, NULL, section);
+    /* Calculate the end_page offset */
+    end_page = ((addr + (haddr_t)(size - 1)) / pb_ptr->page_size);

-    for (i = 0; i < 3; i++) {
-        metadata_section_t *iter = &section[i];
+    HDassert(start_page <= end_page);
+    HDassert(((end_page - start_page) * pb_ptr->page_size) <= size);
+    HDassert(size <= ((end_page - start_page + 1) * pb_ptr->page_size));
+
+    for ( i = start_page; i <= end_page; i++ )
+    {
+        /* test to see if page i exists */
+        H5PB__SEARCH_INDEX(pb_ptr, i, entry_ptr, FAIL)

-        if (iter->len == 0)
-            continue;
+        if ( entry_ptr ) {

-        if (iter->len < size) {
-            hlog_fast(pbrm, "removing entry [%" PRIuHADDR ", %" PRIuHADDR ") "
-                "for split region [%" PRIuHADDR ", %" PRIuHADDR ")",
-                iter->addr, iter->addr + iter->len, addr, addr + size);
-        }
+            /* verify that this entry doesn't overlap with a previously
+             * visited entry.
+ */ + HDassert(entry_pages <= 0); - assert(iter->addr % pb_ptr->page_size == 0); + entry_size = entry_ptr->size; + entry_pages = (int64_t)(entry_size / pb_ptr->page_size); - if (H5PB_remove_entry(shared, iter->addr) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "forced eviction failed") + if ( (uint64_t)entry_pages * pb_ptr->page_size < entry_size ) { + + entry_pages++; + } + + /* remove the entry */ + if ( H5PB_remove_entry(shared, entry_ptr->addr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "H5PB_remove_entry() failed") + + } + entry_pages--; } done: + FUNC_LEAVE_NOAPI(ret_value) -} + +} /* H5PB_remove_entries() */ /*------------------------------------------------------------------------- @@ -1748,9 +2200,9 @@ done: * *------------------------------------------------------------------------- */ -herr_t -H5PB_vfd_swmr__update_index(H5F_t *f, - uint32_t * idx_ent_added_ptr, +herr_t +H5PB_vfd_swmr__update_index(H5F_t *f, + uint32_t * idx_ent_added_ptr, uint32_t * idx_ent_modified_ptr, uint32_t * idx_ent_not_in_tl_ptr, uint32_t * idx_ent_not_in_tl_flushed_ptr) @@ -1776,7 +2228,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f, idx = shared->mdf_idx; HDassert(idx); - + pb_ptr = shared->pb_ptr; HDassert(pb_ptr); @@ -1805,7 +2257,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f, if ( ie_ptr == NULL ) { /* alloc new entry in the metadata file index*/ uint32_t new_index_entry_index; - new_index_entry_index = shared->mdf_idx_entries_used + + new_index_entry_index = shared->mdf_idx_entries_used + idx_ent_added++; if (new_index_entry_index >= shared->mdf_idx_len && @@ -1858,7 +2310,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f, ie_ptr->tick_of_last_flush = 0; } - /* scan the metadata file index for entries that don't appear in the + /* scan the metadata file index for entries that don't appear in the * tick list. If the index entry is dirty, and either doesn't appear * in the page buffer, or is clean in the page buffer, mark the index * entry clean and as having been flushed in the current tick. @@ -1890,7 +2342,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f, } } - HDassert(idx_ent_modified + idx_ent_not_in_tl == + HDassert(idx_ent_modified + idx_ent_not_in_tl == shared->mdf_idx_entries_used); HDassert(idx_ent_modified + idx_ent_not_in_tl + idx_ent_added <= @@ -1902,8 +2354,10 @@ H5PB_vfd_swmr__update_index(H5F_t *f, *idx_ent_not_in_tl_flushed_ptr = idx_ent_not_in_tl_flushed; done: + FUNC_LEAVE_NOAPI(ret_value) -} + +} /* H5PB_vfd_swmr__update_index() */ /*------------------------------------------------------------------------- @@ -1918,9 +2372,10 @@ done: * * 2) If the write is raw data, and the page buffer is * configured for metadata only (i.e. min_md_pages == - * max_pages), simply write to the HDF5 file and return. + * max_pages), or if the page buffer is operating in + * vfd_swmr mode, simply write to the HDF5 file and return. * - * 3) If the write is raw data, and it of page size or + * 3) If the write is raw data, and is of page size or * larger, write directly from the HDF5 file. * * It is possible that the write intersects one or more @@ -1940,13 +2395,68 @@ done: * configured for raw data only (i.e. min_rd_pages == * max_pages), simply write to the HDF5 file and return. * + * The free space manager guarantees that allocations larger + * than one page will be page alligned, and that allocations + * of size less than or equal to page size will not cross page + * boundaries. Further, unlike raw data, metadata is always + * written and read atomically. 
+ *
+ *              In principle, this should make it easy to discriminate
+ *              between small and multi-page metadata entries so that
+ *              pages containing the former will be buffered and the
+ *              latter be written directly to file.
+ *
+ *              Unfortunately, there is a fly in the ointment.
+ *
+ *              The fixed and extensible array on disk data
+ *              structures allocate multiple metadata cache entries in
+ *              a single block, and use this fact to make the addresses
+ *              of all but the first entry in the block computable.  While
+ *              this simplifies the fixed and extensible array on disk data
+ *              structures, it complicates the metadata cache and the page
+ *              buffer.
+ *
+ *              From the page buffer perspective, it breaks the invariant
+ *              that metadata entries of less than page size don't cross
+ *              page boundaries, and those of size greater than or equal
+ *              to page size start on page boundaries -- which is important
+ *              for VFD SWMR as it allows efficient management of multi-page
+ *              metadata entries.
+ *
+ *              While it is tempting to repair the fixed and extensible
+ *              array data structures so as to remove this irregularity,
+ *              and remove the resulting complexity from both the metadata
+ *              cache and the page buffer, this is a ticklish task, as there
+ *              are already files in the wild that use the existing versions
+ *              of these data structures.  Thus, due to resource constraints,
+ *              we have to program around the issue for now.
+ *
+ *              Fortunately, for purposes of the page buffer, this is
+ *              relatively easy -- when we encounter a metadata write
+ *              that crosses one or more page boundaries, and is not
+ *              both page aligned and an integral number of pages, we
+ *              query the metadata cache to determine the type of the
+ *              client whose data is being written.  If it is one of the
+ *              mis-behaving types, we split it into two or three writes
+ *              such that each write either doesn't cross page boundaries,
+ *              or is page aligned and an integral number of pages.
+ *
+ *              This is done in this function, and is not reflected in
+ *              the case analysis in the rest of this comment.
+ *
 *              6) If the write is of metadata, the write is larger than
- *                 one page, and vfd_swmr_writer is FALSE, simply read
- *                 from the HDF5 file. There is no need to check the
+ *                 one page, and vfd_swmr_writer is FALSE, simply write
+ *                 to the HDF5 file.  There is no need to check the
 *                 page buffer, as metadata is always read atomically,
 *                 and entries of this size are not buffered in the page
 *                 buffer.
 *
+ *                 Observe that this write must be page aligned.  This
+ *                 should be enforced by the free space manager, but
+ *                 for now it is enforced by the above mentioned practice
+ *                 of splitting writes from cache clients that don't
+ *                 allocate each entry separately.
+ *
 *              7) If the write is of metadata, the write is larger than
 *                 one page, and vfd_swmr_writer is TRUE, the write must
 *                 be buffered in the page buffer until the end of the tick.
@@ -1979,7 +2489,17 @@ done:
 *
 * Programmer:  John Mainzer -- 10/11/18
 *
- * Changes:     None.
+ * Changes:     Updated to support splitting of metadata writes that
+ *              are not page aligned and cross page boundaries into
+ *              2 or 3 writes that are either page aligned or do not
+ *              cross page boundaries.  Full details are in the header
+ *              comment above, which has been updated to document
+ *              this change.
+ *
+ *              Also updated case 2 to bypass the page buffer for raw
+ *              data writes in vfd swmr mode.
+ * + * JRM -- 4/5/20 * *------------------------------------------------------------------------- */ @@ -1987,10 +2507,19 @@ herr_t H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, const void *buf) { - H5PB_t *pb_ptr; /* Page buffer for this file */ + H5PB_t *pb_ptr; /* Page buffer for this file */ hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */ + hbool_t split_write = FALSE; /* whether md write must be split */ herr_t ret_value = SUCCEED; /* Return value */ + /* the following six fields are defined iff split_write is TRUE */ + haddr_t prefix_addr = HADDR_UNDEF; /* addr of prefix -- if defined */ + haddr_t body_addr = HADDR_UNDEF; /* addr of body -- if defined */ + haddr_t suffix_addr = HADDR_UNDEF; /* addr of suffix -- if defined */ + size_t prefix_size = 0; /* size of prefix */ + size_t body_size = 0; /* size of body */ + size_t suffix_size = 0; /* size of suffix */ + FUNC_ENTER_NOAPI(FAIL) hlog_fast(pbwr, "%s %p type %d addr %" PRIuHADDR " size %zu", @@ -2011,7 +2540,8 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, if ( H5FD_MEM_DRAW == type ) { /* raw data write */ - if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) { + if ( ( pb_ptr->min_md_pages == pb_ptr->max_pages ) || + ( pb_ptr->vfd_swmr ) ) { /* case 2) -- page buffer configured for metadata only */ bypass_pb = TRUE; @@ -2024,13 +2554,207 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, /* case 5) -- page buffer configured for raw data only */ bypass_pb = TRUE; - } else if ( ( size >= pb_ptr->page_size ) && - ( ! ( pb_ptr->vfd_swmr_writer ) ) ) { + } else { - /* case 6) -- md read larger than one page and - * pb_ptr->vfd_swmr_writer is FALSE. + /* determine whether the write request must be split, + * and if so, compute the start points and sizes of + * of the sections. + * + * Note: The following code is almost identical to the + * similar code in H5PB_read(). Thus, on the surface, + * it is an obvious candidate for refactoring into a + * function or macro. + * + * However, there are subtle differences between + * the two pieces of code which are driven by the + * possibility of speculative reads. + * + * More to the point, further changes may be necessary. + * Thus we should wait on refactoring until this code has + * been in daily use for some time, and it is clear + * that further changes are unlikely. 
*/ - bypass_pb = TRUE; + int mdc_client_id = -1; /* id of mdc client, or -1 if undef */ + uint64_t start_page; /* page index of first page in read */ + uint64_t second_page; /* page index of second page in read */ + uint64_t end_page; /* page index of last page in read */ + uint64_t body_page; /* page index of start of body */ + haddr_t start_page_addr; /* addr of first page in read */ + haddr_t second_page_addr;/* addr of second page in read */ + haddr_t end_page_addr; /* addr of last page in read */ + haddr_t end_addr; /* addr of last byte in read */ + + /* Calculate the aligned address of the first page */ + start_page = (addr / pb_ptr->page_size); + start_page_addr = start_page * pb_ptr->page_size; + + /* Calculate the aligned address of the last page */ + end_addr = addr + (haddr_t)(size - 1); + end_page = end_addr / (haddr_t)(pb_ptr->page_size); + end_page_addr = end_page * pb_ptr->page_size; + + HDassert(start_page_addr <= addr); + HDassert(addr < start_page_addr + (haddr_t)(pb_ptr->page_size)); + + HDassert(start_page <= end_page); + HDassert(end_page_addr <= ((addr + (haddr_t)size - 1))); + HDassert((addr + (haddr_t)size - 1) < + (end_page_addr + pb_ptr->page_size)); + + /* test to see if the write crosses a page boundary, and + * does not start on a page boundary, and is not of an + * integral number of pages. + */ + if ( ( start_page < end_page ) && + ( ! ( ( addr == start_page_addr ) && + ( end_page_addr + (haddr_t)(pb_ptr->page_size) == + end_addr + 1 ) ) ) ) { + + /* the read crosses a page boundary and is not + * page aligned and of length some multiple of page size. + * + * Test to see if the read is for a metadata entry that + * is sub-allocated from a larger space allocation. + * + * Note that the following test may have to be + * adjusted. + */ + mdc_client_id = H5C_get_curr_io_client_type(shared->cache); + + if ( ( mdc_client_id == (int)H5AC_EARRAY_DBLK_PAGE_ID ) || \ + ( mdc_client_id == (int)H5AC_FARRAY_DBLK_PAGE_ID ) ) { + + split_write = TRUE; + + } else { + + HDassert(addr == start_page_addr); + HDassert(size > pb_ptr->page_size); + + if ( ! pb_ptr->vfd_swmr_writer ) { + + /* case 6) -- multi-page entry with fixed / + * extensible array filtered out, and no + * no VFD swmr. + */ + bypass_pb = TRUE; + } + } + } else if ( ( size > pb_ptr->page_size ) && + ( ! pb_ptr->vfd_swmr_writer ) ) { + + /* write is larger than page size and we are not + * in VFD SWMR mode -- bypass the page buffer. + * This is also case 6. We catch it here as + * the code to determine whether to split only + * looks at I/O requests that cross page bundaries + * and are not both page aligned and an integral + * number of pages in length. + */ + HDassert(start_page_addr == addr); + bypass_pb = TRUE; + } + + if ( split_write ) { + + /* compute the base addresses and length of the prefix, + * body, and suffix of the write, where these terms are + * defined as follows: + * + * prefix: All bytes from addr to the first page address + * at or after addr. If addr == start_page_addr, + * the prefix is empty. + * + * body: All bytes from the first page address covered + * by the write up to but not including the last + * page address in the write. Note that the + * length of the body must be a multiple of the + * page size. If only one page address is + * included in the write, the body is empty. + * + * suffix: All bytes from the last page address in the + * write until the end of the write. If the + * write ends on a page boundary, the suffix is + * empty. 
+ * + * Since we know that the write crosses at least one + * page boundary, and we have already filtered out the + * body-only case, at least two of the above must be + * non-empty. + */ + + second_page = start_page + 1; + second_page_addr = + (haddr_t)(second_page * pb_ptr->page_size); + + if ( addr > start_page_addr ) { /* prefix exists */ + + prefix_addr = addr; + prefix_size = (size_t)(second_page_addr - addr); + + HDassert(prefix_addr > start_page_addr); + HDassert(prefix_size < pb_ptr->page_size); + HDassert(((size_t)(addr - start_page_addr) + \ + prefix_size) == pb_ptr->page_size); + } + + if ( size - prefix_size >= pb_ptr->page_size ) { + + /* body exists */ + + if ( addr == start_page_addr ) { + + body_page = start_page; + body_addr = start_page_addr; + + } else { + + body_page = second_page; + body_addr = second_page_addr; + } + + if ( end_addr < end_page_addr + + (haddr_t)(pb_ptr->page_size - 1) ) { + + /* suffix exists */ + body_size = (size_t)(end_page - body_page) * + pb_ptr->page_size; + + } else { + + /* suffix is empty */ + body_size = (size_t)(end_page - body_page + 1) * + pb_ptr->page_size; + } + + HDassert((body_page == start_page) || \ + (body_page == start_page + 1)); + + HDassert(body_addr == \ + (haddr_t)(body_page * pb_ptr->page_size)); + + HDassert(body_size < size); + HDassert(body_size >= pb_ptr->page_size); + + + HDassert(body_addr == \ + addr + (haddr_t)prefix_size); + HDassert((body_addr + (haddr_t)body_size) \ + <= (end_addr + 1)); + } + + if ( end_addr < end_page_addr + + (haddr_t)(pb_ptr->page_size - 1) ) { + + suffix_addr = end_page_addr; + suffix_size = (end_addr + 1) - end_page_addr; + + HDassert(suffix_addr == \ + addr + (haddr_t)(prefix_size + body_size)); + } + + HDassert(size == prefix_size + body_size + suffix_size); + } } } } @@ -2046,6 +2770,7 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, } /* end if */ #endif /* H5_HAVE_PARALLEL */ + if ( bypass_pb ) { /* cases 1, 2. 5, and 6 */ if ( H5FD_write(shared->lf, type, addr, size, buf) < 0 ) @@ -2067,15 +2792,84 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ "H5PB_read_raw() failed") + } else if ( split_write ) { + + /* handle the sub-allocated entry case */ + + /* write prefix if it exists */ + if ( prefix_size > 0 ) { + + if ( H5PB__write_meta(shared, type, addr, + prefix_size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "H5PB__write_meta() failed on prefix") + } + + /* write the body if it exists */ + if ( body_size > 0 ) { + + /* The "body_size == pb_ptr->page_size" clause in the + * following if is required since in normal operating + * mode, the page buffer buffers metadata I/O + * requests of page size or less. + * + * Thus this clause ensures that a single page body + * does not bypass the page buffer, setting the potential + * for an older version to shadow the most recent version. + * + * Note: The page buffer really shouldn't buffer page + * aligned single page metadata I/O requests, as it + * creates extra overhead to no purpose. However, + * fixing this is a bit tricky, and the case doesn't + * appear to be common. Thus, while it should be + * fixed, I don't think it is urgent.
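+ * + * (The shadowing hazard is concrete: if a single page + * body were written directly to the file while an older + * copy of that page was resident in the page buffer, a + * subsequent read of that page would be satisfied from + * the stale buffered copy.)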
+ * + * JRM 4/19/20 + */ + if ( ( pb_ptr->vfd_swmr ) || + ( body_size == pb_ptr->page_size ) ) { + + if ( H5PB__write_meta(shared, type, body_addr, body_size, + (const void *)((const uint8_t *)buf + + prefix_size)) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "H5PB__write_meta() failed on body") + + } else { + + if ( H5FD_write(shared->lf, type, body_addr, body_size, + (const void *)((const uint8_t *)buf + + prefix_size)) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "write through of body failed") + + H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size); + } + } + + /* write the suffix if it exists */ + if ( suffix_size > 0 ) { + + if ( H5PB__write_meta(shared, type, suffix_addr, suffix_size, + (const void *)((const uint8_t *)buf + + prefix_size + body_size)) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "H5PB_write_meta() failed on suffix") + } + + H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr) + } else { /* cases 7, and 8 */ - if ( metadata_multipart_write(shared, type, addr, size, buf) < 0 ) + if ( H5PB__write_meta(shared, type, addr, size, buf) < 0 ) HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ - "H5PB_read_meta() failed") + "H5PB_write_meta() failed") } - - H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); } done: @@ -3070,118 +3864,6 @@ done: } /* H5PB__mark_entry_dirty() */ -static void -metadata_section_split(size_t pgsz, haddr_t addr, size_t len, const void *_buf, - metadata_section_t *section) -{ - int i; - size_t totlen = 0; - haddr_t whole_pgaddr, tail_pgaddr; - const char *buf = _buf; - metadata_section_t *head = §ion[0], *middle = §ion[1], - *tail = §ion[2]; - - /* Try to find the address of the first whole page, and the address of - * the page after the last whole page. - */ - whole_pgaddr = roundup(addr, pgsz); - tail_pgaddr = rounddown(addr + len, pgsz); - - /* In the degenerate case where the first whole page is "after" the last, - * actually the entire access lands between page boundaries. - */ - if (whole_pgaddr > tail_pgaddr) { - assert(len < pgsz); - head->addr = addr; - head->len = len; - head->buf = buf; - return; - } - - /* `head` spans any range beginning before the first page boundary. */ - if (addr < whole_pgaddr) { - head->buf = buf; - head->len = pgsz - addr % pgsz; - head->addr = addr; - } - - /* `middle` spans one or more whole pages in between the end of - * `head` and before the beginning of `tail`. - */ - if (whole_pgaddr < tail_pgaddr) { - middle->buf = (buf == NULL) ? NULL : &buf[whole_pgaddr - addr]; - middle->len = tail_pgaddr - whole_pgaddr; - middle->addr = whole_pgaddr; - } - - /* `tail` spans residual bytes that follow the last page boundary. */ - if (tail_pgaddr < addr + len) { - tail->len = (addr + len) - tail_pgaddr; - tail->buf = (buf == NULL) ? NULL : &buf[tail_pgaddr - addr]; - tail->addr = tail_pgaddr; - } - - for (i = 0; i < 3; i++) { - metadata_section_t *iter = §ion[i]; - if (iter->len == 0) - continue; - assert(iter->addr == addr + totlen); - assert(iter->buf == ((buf == NULL) ? 
NULL : &buf[totlen])); -// assert(i == 0 || iter[-1].buf + iter[-1].len == iter->buf); - totlen += iter->len; - } - - assert(totlen == len); -} - -static herr_t -metadata_multipart_read(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, - size_t len, void *_buf/*out*/) -{ - herr_t rc; - int i; - const size_t pgsz = shared->pb_ptr->page_size; - metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}}; - - metadata_section_split(pgsz, addr, len, _buf, section); - - for (i = 0; i < 3; i++) { - metadata_section_t *iter = §ion[i]; - if (iter->buf == NULL) - continue; - rc = H5PB__read_meta(shared, type, iter->addr, iter->len, - (void *)(uintptr_t)iter->buf); - if (rc < 0) - return rc; - } - - return SUCCEED; -} - -static herr_t -metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type, - haddr_t addr, size_t len, const void *_buf/*out*/) -{ - herr_t rc; - int i; - const size_t pgsz = shared->pb_ptr->page_size; - metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}}; - - metadata_section_split(pgsz, addr, len, _buf, section); - - for (i = 0; i < 3; i++) { - metadata_section_t *iter = §ion[i]; - - if (iter->buf == NULL) - continue; - rc = H5PB__write_meta(shared, type, iter->addr, iter->len, iter->buf); - if (rc < 0) - return rc; - } - - return SUCCEED; -} - /*------------------------------------------------------------------------- * @@ -3197,21 +3879,25 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type, * existing page, it must not be a multi-page metadata * entry. It it is, flag an error. * + * Recall that by the time we get to this function, + * un-aligned page reads from the fixed and variable + * length array structures that cross page boundaries + * have already been split into two or three reads + * that conform to the usual pattern of metadata reads. + * * 7) If the read is for metadata, is page aligned, is larger * than one page, and there is no entry in the page buffer, * satisfy the read from the file * * 8) If the read is for metadata, is page aligned, is larger * than one page, and there is a regular entry at the target - * page address, test to see if the last read was for the - * same address. + * page address, test to see if the read is speculative. * - * If was, evict the page, and satisfy the read from file. - * Flag an error if the page was dirty. + * If it is not, evict the page, and satisfy the read from + * file. Flag an error if the page was dirty. * - * If the last read was for a different page, clip the read - * to one page, and satisfy the read from the existing - * regular entry. + * If it is, clip the read to one page, and satisfy the + * read from the existing regular entry. 
* * 9) If the read is for metadata, is page aligned, is larger * than one page, and there is a multi-page metadata entry @@ -3243,7 +3929,7 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type, * * P/A == page aligned * size > PL == size > page length - * PA == previous address + * Spec == speculative read * A == current address * * In the entry exists column: @@ -3253,7 +3939,7 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type, * MPMDE == multi-page metadata entry * * | size | entry | VFD | | - * P/A: | > PL | exists | SWMR | PA == A | Comments: + * P/A: | > PL | exists | SWMR | Spec | Comments: * ------+------+--------+------+---------+------------------------------------- * N | X | N || R | X | X | Clip read to page boundary if * | | | | | necessary @@ -3266,10 +3952,10 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type, * ------+------+--------+------+---------+------------------------------------- * Y | Y | N | X | X | Satisfy read from file (case 7) * ------+------+--------+------+---------+------------------------------------- - * Y | Y | R | X | N | Clip read to page boundary + * Y | Y | R | X | Y | Clip read to page boundary * | | | | | Satisfy read from entry (case 8) * ------+------+--------+------+---------+------------------------------------- - * Y | Y | R | X | Y | Evict entry + * Y | Y | R | X | N | Evict entry * | | | | | (must be clean -- flag error if not) * | | | | | Satisfy read from file (case 8) * ------+------+--------+------+---------+------------------------------------- @@ -3307,20 +3993,25 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type, * * Programmer: John Mainzer -- 10/11/18 * - * Changes: None. + * Changes: Updated to use the speculative read hint from the + * metadata cache, and remove the static variable + * containing the base address of the last read. 
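+ * + * (The old heuristic re-read from the file only when two + * consecutive calls targeted the same address; the + * explicit speculative read hint does not depend on the + * ordering of intervening reads.)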
+ * + * JRM -- 4/5/20 * *------------------------------------------------------------------------- */ static herr_t -H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, - void *buf/*out*/) +H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, + size_t size, void *buf/*out*/) { + hbool_t bypass = FALSE; /* flag indicating PB bypassed */ + hbool_t speculative = FALSE; /* speculative read hint from mdc */ H5PB_t *pb_ptr; /* Page buffer for this file */ H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */ H5FD_t *file; /* File driver pointer */ uint64_t page; /* page offset of addr */ haddr_t page_addr; /* page containing addr */ - static haddr_t prev_addr = HADDR_UNDEF; /* addr of last call */ size_t offset; /* offset of read in page */ size_t clipped_size; /* possibly clipped size */ herr_t ret_value = SUCCEED; /* Return value */ @@ -3379,7 +4070,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size TRUE, FALSE) if ( ( NULL == entry_ptr ) && - ( H5PB__load_page(shared, pb_ptr, page_addr, type, &entry_ptr) < 0 ) ) + ( H5PB__load_page(shared, pb_ptr, page_addr, + type, &entry_ptr) < 0 ) ) HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ "page buffer page load request failed (1)") @@ -3404,7 +4096,7 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size HDassert( page_addr == addr ); - if ( size >= pb_ptr->page_size ) { + if ( size > pb_ptr->page_size ) { /* search the page buffer for an entry at page */ H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL) @@ -3413,10 +4105,11 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size if ( entry_ptr == NULL ) { /* case 7 */ /* update hit rate stats */ - H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, FALSE, TRUE, size > pb_ptr->page_size) + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, FALSE, \ + TRUE, size > pb_ptr->page_size) - /* If the read is for metadata, is page aligned, is larger - * than one page, and there is no entry in the page buffer, + /* If the read is for metadata, is page aligned, is larger + * than page size, and there is no entry in the page buffer, * satisfy the read from the file */ if ( H5FD_read(file, type, addr, size, buf) < 0) @@ -3424,7 +4117,10 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ "driver read request failed (1)") + bypass = TRUE; + H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size); + } else { HDassert( entry_ptr ); @@ -3435,28 +4131,29 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size /* If the read is for metadata, is page aligned, is larger * than one page, and there is a regular entry at the target - * page address, test to see if the last read was for the - * same address. + * page address, test to see if the read is speculative. * - * If it was, evict the page, and satisfy the read from + * If it is not, evict the page, and satisfy the read from * file. Flag an error if the page was dirty. * - * If the last read was for a different page, clip the read - * to one page, and satisfy the read from the existing - * regular entry. + * If it is, clip the read to one page, and satisfy + * the read from the existing regular entry. */ HDassert( entry_ptr->size == pb_ptr->page_size ); - if ( addr == prev_addr ) { + speculative = H5C_get_curr_read_speculative(shared->cache); + + if ( ! speculative ) { - /* since this is a second try, don't update + /* since this is likely a second try, don't update * hit rate stats. */ HDassert( ! ( entry_ptr->is_dirty ) ); - if (H5PB__evict_entry(shared, entry_ptr, TRUE, false) < 0) + if ( H5PB__evict_entry(shared, entry_ptr, + TRUE, false) < 0 ) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ "forced eviction failed (1)") @@ -3465,7 +4162,9 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ "driver read request failed (2)") + bypass = TRUE; H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size); + } else { HDassert( entry_ptr->image_ptr ); @@ -3485,7 +4184,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size } /* update hit rate stats */ - H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, TRUE, TRUE, FALSE) + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, TRUE, \ + TRUE, FALSE) } } else { /* case 9 */ @@ -3555,7 +4255,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size TRUE, FALSE) if ( ( NULL == entry_ptr ) && - ( H5PB__load_page(shared, pb_ptr, page_addr, type, &entry_ptr) < 0)) + ( H5PB__load_page(shared, pb_ptr, page_addr, + type, &entry_ptr) < 0)) HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ "page buffer page load request failed (2)") @@ -3578,7 +4279,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size } } - prev_addr = addr; + if ( ! bypass ) + H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); done: @@ -3876,6 +4578,8 @@ H5PB__read_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, } } /* end else */ + H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); + done: FUNC_LEAVE_NOAPI(ret_value) @@ -4119,6 +4823,8 @@ H5PB__write_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, H5PB__INSERT_IN_TL(pb_ptr, entry_ptr, FAIL) } + H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); + done: FUNC_LEAVE_NOAPI(ret_value) @@ -4167,8 +4873,8 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, - const void *buf/*out*/) +H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, + size_t size, const void *buf/*out*/) { H5PB_t *pb_ptr; /* Page buffer for this file */ H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */ @@ -4418,6 +5124,8 @@ H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size } } + H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); + done: FUNC_LEAVE_NOAPI(ret_value) diff --git a/src/H5PBpkg.h b/src/H5PBpkg.h index fb9f29f..14804ac 100644 --- a/src/H5PBpkg.h +++ b/src/H5PBpkg.h @@ -812,6 +812,20 @@ if ( ( (entry_ptr) == NULL ) || \ ((pb_ptr)->loads[i])++; \ } /* H5PB__UPDATE_STATS_FOR_LOAD */ +#define H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr) \ +{ \ + HDassert(pb_ptr); \ + HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \ + ((pb_ptr)->md_read_splits)++; \ +} /* H5PB__UPDATE_STATS_FOR_READ_SPLIT */ + +#define H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr) \ +{ \ + HDassert(pb_ptr); \ + HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \ + ((pb_ptr)->md_write_splits)++; \ +} /* H5PB__UPDATE_STATS_FOR_WRITE_SPLIT */ + #else /* H5PB__COLLECT_PAGE_BUFFER_STATS */ #define H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, hit, is_metadata, is_mpmde) @@ -834,6 +848,8 @@ if ( ( (entry_ptr) == NULL ) || \ #define H5PB__UPDATE_STATS_FOR_CLEAR(pb_ptr, entry_ptr) #define H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr,
entry_ptr) #define H5PB__UPDATE_STATS_FOR_LOAD(pb_ptr, entry_ptr) +#define H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr) +#define H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr) #endif /* H5PB__COLLECT_PAGE_BUFFER_STATS */ diff --git a/src/H5PBprivate.h b/src/H5PBprivate.h index 983d183..97de7ae 100644 --- a/src/H5PBprivate.h +++ b/src/H5PBprivate.h @@ -249,6 +249,9 @@ typedef struct H5PB_entry_t H5PB_entry_t; * * FIELDS SUPPORTING VFD SWMR: * + * If the file is opened in VFD SWMR mode (i.e. vfd_swmr == TRUE), all + * raw data I/O must be passed through to the HDF5 file. + * * If the file is opened as a VFD SWMR writer (i.e. vfd_swmr_writer == TRUE), * the page buffer must retain the data necessary to update the metadata * file at the end of each tick, and also delay writes as necessary so as @@ -285,8 +288,12 @@ typedef struct H5PB_entry_t H5PB_entry_t; * The remainder of this sections contains discussions of the fields and * data structures used to support the above operations. * + * vfd_swmr: Boolean flag that is set to TRUE iff the file is opened + * in VFD SWMR mode -- either reader or writer. This field + * is used to exclude raw data from the page buffer. + * * vfd_swmr_writer: Boolean flag that is set to TRUE iff the file is - * the file is opened in VFD SWMR mode. The remaining + * opened in VFD SWMR writer mode. The remaining + * VFD SWMR fields are defined iff vfd_swmr_writer is TRUE. * * mpmde_count: int64_t containing the number of multi-page metadata @@ -528,6 +535,16 @@ typedef struct H5PB_entry_t H5PB_entry_t; * total_dwl_ins_depth: int64_t containing the total insertion depth * required to maintain the odering invarient on the * delayed write list. + * + * md_read_splits: int64_t containing the number of metadata reads that + * are split into two or three sub-reads to manage the + * case in which a group of metadata cache clients + * sub-allocate entries from a single file space allocation. + * + * md_write_splits: int64_t containing the number of metadata writes that + * are split into two or three sub-writes to manage the + * case in which a group of metadata cache clients + * sub-allocate entries from a single file space allocation. * ******************************************************************************/ @@ -578,6 +595,7 @@ typedef struct H5PB_t { /* Fields for VFD SWMR operations: */ + hbool_t vfd_swmr; hbool_t vfd_swmr_writer; int64_t mpmde_count; uint64_t cur_tick; @@ -646,6 +664,8 @@ typedef struct H5PB_t { int64_t max_dwl_len; int64_t max_dwl_size; int64_t total_dwl_ins_depth; + int64_t md_read_splits; + int64_t md_write_splits; } H5PB_t; @@ -671,6 +691,7 @@ H5_DLL herr_t H5PB_add_new_page(H5F_shared_t *, H5FD_mem_t, haddr_t); H5_DLL herr_t H5PB_update_entry(H5PB_t *, haddr_t, size_t, const void *); H5_DLL herr_t H5PB_remove_entry(H5F_shared_t *, haddr_t); + H5_DLL herr_t H5PB_remove_entries(H5F_shared_t *, haddr_t, hsize_t); H5_DLL herr_t H5PB_read(H5F_shared_t *, H5FD_mem_t, haddr_t, diff --git a/test/page_buffer.c b/test/page_buffer.c index 10db2e9..88b3317 100644 --- a/test/page_buffer.c +++ b/test/page_buffer.c @@ -24,6 +24,15 @@ #include "h5test.h" +/* + * This file needs to access private information from the H5C package. + * This file also needs to access the metadata cache testing code.
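+ * + * (In particular, the tests below use the H5C test function + * H5C_set_curr_io_type_splitable() to make the cache report a + * splittable client as the source of the current I/O.)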
+ */ +#define H5C_FRIEND /*suppress error about including H5Cpkg */ +#define H5C_TESTING /*suppress warning about H5C testing funcs*/ +#include "H5Cpkg.h" /* Cache */ + + #include "H5CXprivate.h" /* API Contexts */ #include "H5Iprivate.h" #include "H5PBprivate.h" @@ -65,6 +74,10 @@ static unsigned test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, static unsigned test_lru_processing(hid_t orig_fapl, const char *env_h5_drvr); static unsigned test_min_threshold(hid_t orig_fapl, const char *env_h5_drvr); static unsigned test_stats_collection(hid_t orig_fapl, const char *env_h5_drvr); +static unsigned md_entry_splitting_smoke_check(hid_t orig_fapl, + const char *env_h5_drvr, bool); +static unsigned md_entry_splitting_boundary_test(hid_t orig_fapl, + const char *env_h5_drvr, bool); #endif /* H5_HAVE_PARALLEL */ #define FILENAME "filepaged" @@ -339,7 +352,8 @@ error: HDfree(data); } H5E_END_TRY; return(1); -} + +} /* create_file() */ /*------------------------------------------------------------------------- @@ -494,7 +508,7 @@ set_multi_split(const char *env_h5_drvr, hid_t fapl, hsize_t pagesize) error: return 1; -} +} /* set_multi_split() */ #ifndef H5_HAVE_PARALLEL @@ -813,7 +827,8 @@ error: HDfree(odata); } H5E_END_TRY; return 1; -} + +} /* test_mpmde_delay_basic() */ /* @@ -1015,7 +1030,8 @@ error: HDfree(odata); } H5E_END_TRY; return 1; -} + +} /* test_spmde_lru_evict_basic() */ /* @@ -1152,7 +1168,8 @@ error: HDfree(odata); } H5E_END_TRY; return 1; -} + +} /* test_spmde_delay_basic() */ /* @@ -1185,6 +1202,19 @@ error: * page buffer. * * JRM -- 10/26/18 + * + * We have decided not to buffer raw data in the page buffer + * when operating in VFD SWMR mode. This is necessary as + * otherwise raw data can get stuck in the page buffer, thus + * delaying its visibility to the reader. + * + * Obviously, there is a potential performance trade off + * here, but it shouldn't be significant in the expected + * VFD SWMR use cases. Needless to say, we will revisit this + * if necessary. + * + * JRM -- 4/8/20 + * */ /* Changes due to file space page size has a minimum size of 512 */ @@ -1241,7 +1271,8 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, TEST_ERROR; /* allocate space for 2000 elements */ - if (HADDR_UNDEF == (addr = H5MF_alloc(f, H5FD_MEM_DRAW, sizeof(int) * (size_t)num_elements))) + if (HADDR_UNDEF == (addr = H5MF_alloc(f, H5FD_MEM_DRAW, + sizeof(int) * (size_t)num_elements))) FAIL_STACK_ERROR; if ((data = (int *)HDcalloc((size_t)num_elements, sizeof(int))) == NULL) @@ -1250,7 +1281,8 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, /* initialize all the elements to have a value of -1 */ for(i=0 ; i<num_elements ; i++) data[i] = -1; - if (H5F_block_write(f, H5FD_MEM_DRAW, addr, sizeof(int) * (size_t)num_elements, data) < 0) + if (H5F_block_write(f, H5FD_MEM_DRAW, addr, + sizeof(int) * (size_t)num_elements, data) < 0) FAIL_STACK_ERROR; /* update the first 100 elements to have values 0-99 - this will be @@ -1264,48 +1296,75 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, page_count ++; - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( ! vfd_swmr_mode ) || + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) FAIL_STACK_ERROR; /* update elements 300 - 450, with values 300 - - this will bring two more pages into the page buffer.
 */ for(i=0 ; i<150 ; i++) data[i] = i+300; - if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 300), sizeof(int) * 150, data) < 0) + + if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 300), + sizeof(int) * 150, data) < 0) FAIL_STACK_ERROR; + page_count += 2; - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( ! vfd_swmr_mode ) || + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) FAIL_STACK_ERROR; /* update elements 100 - 300, this will go to disk but also update existing pages in the page buffer. */ for(i=0 ; i<200 ; i++) data[i] = i+100; - if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 100), sizeof(int) * 200, data) < 0) + + if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 100), + sizeof(int) * 200, data) < 0) FAIL_STACK_ERROR; - if (f->shared->pb_ptr->curr_pages != page_count + base_page_cnt) + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( ! vfd_swmr_mode ) || + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) FAIL_STACK_ERROR; /* Update elements 225-300 - this will update an existing page in the PB */ /* Changes: 450 - 600; 150 */ for(i=0 ; i<150 ; i++) data[i] = i+450; - if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 450), sizeof(int) * 150, data) < 0) + + if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 450), + sizeof(int) * 150, data) < 0) FAIL_STACK_ERROR; + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( ! vfd_swmr_mode ) || + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) FAIL_STACK_ERROR; /* Do a full page write to block 600-800 - should bypass the PB */ for(i=0 ; i<200 ; i++) data[i] = i+600; + + if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 600), + sizeof(int) * 200, data) < 0) FAIL_STACK_ERROR; + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( ! vfd_swmr_mode ) || + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) FAIL_STACK_ERROR; - /* read elements 800 - 1200, this should not affect the PB, and should read -1s */ - if (H5F_block_read(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 800), sizeof(int) * 400, data) < 0) + /* read elements 800 - 1200, this should not affect the PB, and should + * read -1s + */ + if (H5F_block_read(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 800), + sizeof(int) * 400, data) < 0) FAIL_STACK_ERROR; + for (i=0; i < 400; i++) { if (data[i] != -1) { HDfprintf(stderr, "Read different values than written\n"); @@ -1313,14 +1372,19 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, FAIL_STACK_ERROR; } } + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( ! vfd_swmr_mode ) || + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) FAIL_STACK_ERROR; /* read elements 1200 - 1201, this should read -1 and bring in an * entire page of addr 1200 */ - if (H5F_block_read(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 1200), sizeof(int) * 1, data) < 0) + if (H5F_block_read(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 1200), + sizeof(int) * 1, data) < 0) FAIL_STACK_ERROR; + for (i=0; i < 1; i++) { if (data[i] != -1) { HDfprintf(stderr, "Read different values than written\n"); @@ -1329,14 +1393,19 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, } } page_count ++; + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( ! vfd_swmr_mode ) || + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) TEST_ERROR; /* read elements 175 - 225, this should use the PB existing pages */ /* Changes: 350 - 450 */ /* read elements 175 - 225, this should use the PB existing pages */ - if (H5F_block_read(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 350), sizeof(int) * 100, data) < 0) + if (H5F_block_read(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 350), + sizeof(int) * 100, data) < 0) FAIL_STACK_ERROR; + for (i=0; i < 100; i++) { if (data[i] != i + 350) { HDfprintf(stderr, "Read different values than written\n"); @@ -1345,16 +1414,27 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, TEST_ERROR; } } + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( ! vfd_swmr_mode ) || + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) TEST_ERROR; - /* read elements 0 - 800 using the VFD.. this should result in -1s - except for the writes that went through the PB (100-300 & 600-800) */ - if (H5FD_read(f->shared->lf, H5FD_MEM_DRAW, addr, sizeof(int) * 800, data) < 0) + /* read elements 0 - 800 using the VFD. + * + * In the non-VFD SWMR case, this should result in -1s + * except for the writes that went through the PB (100-300 & 600-800) + * + * In the VFD SWMR case, the page buffer is bypassed for raw data, + * thus all writes should be visible. + */ + if (H5FD_read(f->shared->lf, H5FD_MEM_DRAW, addr, + sizeof(int) * 800, data) < 0) FAIL_STACK_ERROR; + i = 0; while (i < 800) { - if((i>=100 && i<300) || i >= 600) { + if((vfd_swmr_mode) || (i>=100 && i<300) || i >= 600) { if (data[i] != i) { HDfprintf(stderr, "Read different values than written\n"); HDfprintf(stderr, "data[%d] = %d, %d expected.\n", @@ -1378,8 +1458,12 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, */ if (H5F_block_read(f, H5FD_MEM_DRAW, addr, sizeof(int) * 800, data) < 0) FAIL_STACK_ERROR; + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( ! vfd_swmr_mode ) || + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) TEST_ERROR; + for (i=0; i < 800; i++) { if (data[i] != i) { HDfprintf(stderr, "Read different values than written\n"); @@ -1395,10 +1479,16 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, */ for(i=0 ; i<1000 ; i++) data[i] = 0; + + if (H5F_block_write(f, H5FD_MEM_DRAW, addr + (sizeof(int) * 400), + sizeof(int) * 1000, data) < 0) FAIL_STACK_ERROR; + page_count -= 2; + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( ! vfd_swmr_mode ) || + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) TEST_ERROR; /* read elements 0 - 1000.. this should go to disk then update the @@ -1406,6 +1496,7 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, */ if (H5F_block_read(f, H5FD_MEM_DRAW, addr, sizeof(int) * 1000, data) < 0) FAIL_STACK_ERROR; + i=0; while (i < 1000) { if(i<400) { @@ -1426,7 +1517,10 @@ test_raw_data_handling(hid_t orig_fapl, const char *env_h5_drvr, } i++; } + + if ( ( f->shared->pb_ptr->curr_pages != page_count + base_page_cnt ) && + ( ( ! vfd_swmr_mode ) || + ( f->shared->pb_ptr->curr_pages != base_page_cnt ) ) ) TEST_ERROR; if (H5Fclose(file_id) < 0) @@ -2448,7 +2542,6 @@ error: HDfree(data); } H5E_END_TRY; return 1; - } /* test_min_threshold */ @@ -2676,22 +2769,24 @@ test_stats_collection(hid_t orig_fapl, const char *env_h5_drvr) sizeof(int)*100, data) < 0) FAIL_STACK_ERROR; - if ( ( f->shared->pb_ptr->accesses[0] != 9 ) || + /* was 9, 16, 0 -- review this */ + if ( ( f->shared->pb_ptr->accesses[0] != 10 ) || ( f->shared->pb_ptr->accesses[1] != 16 ) || ( f->shared->pb_ptr->accesses[2] != 0 ) ) { - HDfprintf(stderr, "accesses[] = {%d, %d, %d}. {9, 16, 0} expected\n", + HDfprintf(stderr, "accesses[] = {%d, %d, %d}. {10, 16, 0} expected\n", f->shared->pb_ptr->accesses[0], f->shared->pb_ptr->accesses[1], f->shared->pb_ptr->accesses[2]); TEST_ERROR; } - if ( ( f->shared->pb_ptr->bypasses[0] != 2 ) || + /* was 2, 1, 1 -- review this */ + if ( ( f->shared->pb_ptr->bypasses[0] != 0 ) || ( f->shared->pb_ptr->bypasses[1] != 1 ) || ( f->shared->pb_ptr->bypasses[2] != 1 ) ) { - HDfprintf(stderr, "bypasses[] = {%d, %d, %d}. {2, 1, 1} expected\n", + HDfprintf(stderr, "bypasses[] = {%d, %d, %d}. {0, 1, 1} expected\n", f->shared->pb_ptr->bypasses[0], f->shared->pb_ptr->bypasses[1], f->shared->pb_ptr->bypasses[2]); @@ -2709,18 +2804,20 @@ test_stats_collection(hid_t orig_fapl, const char *env_h5_drvr) TEST_ERROR; } - if ( ( f->shared->pb_ptr->misses[0] != 9 ) || + /* was 9, 16, 0 -- review this */ + if ( ( f->shared->pb_ptr->misses[0] != 10 ) || ( f->shared->pb_ptr->misses[1] != 16 ) || ( f->shared->pb_ptr->misses[2] != 0 ) ) { - HDfprintf(stderr, "misses[] = {%d, %d, %d}. {9, 16, 0} expected\n", + HDfprintf(stderr, "misses[] = {%d, %d, %d}. {10, 16, 0} expected\n", f->shared->pb_ptr->misses[0], f->shared->pb_ptr->misses[1], f->shared->pb_ptr->misses[2]); TEST_ERROR; } - if ( ( f->shared->pb_ptr->evictions[0] != 7) || + /* was 7, 9, 0 -- review this */ + if ( ( f->shared->pb_ptr->evictions[0] != 9) || ( f->shared->pb_ptr->evictions[1] != 9) || ( f->shared->pb_ptr->evictions[2] != 0 ) ) { @@ -2742,17 +2839,19 @@ test_stats_collection(hid_t orig_fapl, const char *env_h5_drvr) evictions, bypasses) < 0) FAIL_STACK_ERROR; - if ( ( accesses[0] != 9 ) || + /* was 9, 16, 0 -- review this */ + if ( ( accesses[0] != 10 ) || ( accesses[1] != 16 ) || ( accesses[2] != 0 ) ) { HDfprintf(stderr, - "accesses[] = {%d, %d, %d}. {9, 16, 0} expected\n", + "accesses[] = {%d, %d, %d}. {10, 16, 0} expected\n", accesses[0], accesses[1], accesses[2]); TEST_ERROR; } - if ( ( bypasses[0] != 2 ) || + /* was 2, 1, 1 -- review this */ + if ( ( bypasses[0] != 0 ) || ( bypasses[1] != 1 ) || ( bypasses[2] != 1 ) ) { @@ -2770,22 +2869,24 @@ test_stats_collection(hid_t orig_fapl, const char *env_h5_drvr) TEST_ERROR; } - if ( ( misses[0] != 9 ) || + /* was 9, 16, 0 -- review this */ + if ( ( misses[0] != 10 ) || ( misses[1] != 16 ) || ( misses[2] != 0 ) ) { - HDfprintf(stderr, "misses[] = {%d, %d, %d}. {9, 16, 0} expected\n", + HDfprintf(stderr, "misses[] = {%d, %d, %d}.
{10, 16, 0} expected\n", misses[0], misses[1], misses[2]); TEST_ERROR; } - if ( ( evictions[0] != 7 ) || + /* was 7, 9, 0 -- review this */ + if ( ( evictions[0] != 9 ) || ( evictions[1] != 9 ) || ( evictions[2] != 0 ) ) { HDfprintf(stderr, - "evictions[] = {%d, %d, %d}. {%d, %d, 0} expected\n", - evictions[0], evictions[1], evictions[2], 7, 9); + "evictions[] = {%d, %d, %d}. {9, 9, 0} expected\n", + evictions[0], evictions[1], evictions[2]); TEST_ERROR; } @@ -2961,10 +3062,1307 @@ error: return 1; -} +} /* verify_page_buffering_disabled() */ + #endif /* H5_HAVE_PARALLEL */ +/************************************************************************* + * + * Function: md_entry_splitting_smoke_check() + * + * Purpose: Normally, file space for metadata entries is allocated + * individually. In the context of paged allocation, this + * ensures that all entries that cross page boundaries start + * on a page boundary, and that any space between the end of + * a multi-page metadata entry and the next page boundary + * is un-used. + * + * In the context of VFD SWMR, this fact along with atomic + * metadata entry I/O is used to minimize the size of the + * index in the metadata file, and to optimize metadata + * reads on the VFD SWMR reader side. It is also + * used as a simplifying assumption in normal page buffer + * operation. + * + * Unfortunately, it turns out that some metadata cache + * clients (H5FA & H5EA) allocate the needed file space in + * a single block, and sub-allocate space for individual + * entries out of this block. + * + * While this is a design flaw from the perspective of + * VFD SWMR, repairing the issue is not feasible at this + * time, and in any case, there will always be the issue of + * existing files. + * + * Thus, for now at least, the page buffer has to code around + * the issue when operating in VFD SWMR mode. + * + * It does this by examining metadata I/O requests that + * cross page boundaries, and querying the metadata cache + * for the ID of the associated cache client. + * + * If the request is associated with a cache client + * that uses sub-allocation, the I/O request must be broken + * into the minimal number of sub-requests such that each + * request either doesn't cross page boundaries, or is + * page aligned, and of length equal to some multiple of + * the page size. + * + * This test exists to verify that such entries are read + * and written correctly. + * + * Note that it does not concern itself with verifying + * the correct handling of the split I/O requests, as + * the split is done immediately upon receipt, and each + * of the sub-requests is treated as a normal metadata + * I/O request. + * + * Note that this test requires us to modify the page buffer + * hint fields in the metadata cache to trick it into + * relaying the desired hints to the page buffer, even + * though it is not generating the I/O requests in this + * test. + * + * Return: 0 if test is successful + * 1 if test fails + * + * Programmer: John Mainzer + * 4/9/20 + * + * Changes: None.
+ * + *************************************************************************/ + +#define HDR_SIZE 40 +#define MD_PAGE_SIZE 250 +#define TOT_SYNTH_ENTRY_SIZES (HDR_SIZE + (3 * MD_PAGE_SIZE)) + +static unsigned +md_entry_splitting_smoke_check(hid_t orig_fapl, const char *env_h5_drvr, + bool vfd_swmr_mode) +{ + char filename[FILENAME_LEN]; /* Filename to use */ + hid_t file_id = -1; /* File ID */ + hid_t fcpl = -1; + hid_t fapl = -1; + int i; + int * synth_md_vals = NULL; + int * synth_md_test_buf = NULL; + haddr_t base_addr; + haddr_t p0_addr; + haddr_t p1_addr; + haddr_t p2_addr; + H5F_t *f = NULL; + const uint32_t max_lag = 5; + + TESTING("%sMetadata Entry Splitting Smoke Check", \ + vfd_swmr_mode ? "VFD SWMR " : ""); + + h5_fixname(namebase, orig_fapl, filename, sizeof(filename)); + + if ((fapl = H5Pcopy(orig_fapl)) < 0) + TEST_ERROR; + + if (set_multi_split(env_h5_drvr, fapl, sizeof(int) * 200) != 0) + TEST_ERROR; + + if ((fcpl = H5Pcreate(H5P_FILE_CREATE)) < 0) + TEST_ERROR; + + if (H5Pset_file_space_strategy(fcpl, H5F_FSPACE_STRATEGY_PAGE, 0, 1) < 0) + TEST_ERROR; + + if (H5Pset_file_space_page_size(fcpl, (size_t)1000) < 0) + TEST_ERROR; + + if (H5Pset_page_buffer_size(fapl, sizeof(int) * 2000, 0, 0) < 0) + TEST_ERROR; + + if (vfd_swmr_mode && swmr_fapl_augment(fapl, filename, max_lag) < 0) + TEST_ERROR; + + if ((file_id = H5Fcreate(filename, H5F_ACC_TRUNC, fcpl, fapl)) < 0) + FAIL_STACK_ERROR; + + /* Get a pointer to the internal file object */ + if(NULL == (f = (H5F_t *)H5VL_object(file_id))) + FAIL_STACK_ERROR; + + /* The objective is to perform a quick smoke check on I/O of metadata + * entries that have been sub-allocated out of a larger space allocation. + * We do this by simulating a structure similar to the on disk structure + * of the fixed array. Specifically, we create a synthetic + * set of metadata entries that are allocated out of a single allocation + * from the free space manager, and perform several reads and writes to + * verify expected behaviour. + * + * The synthetic set of metadata entries is constructed of integers + * so as to allow easy assignment of unique values. It is laid out + * as follows: + * + * size values: addr: + * (ints) + * + * header: 40 0, 1, ... 39 base_addr + * page 0: 250 1040, 1041, ... 1289 base_addr + 40 * sizeof(int) + * page 1: 250 2290, 2291, ... 2539 base_addr + 290 * sizeof(int) + * page 2: 250 3540, 3541, ... 3789 base_addr + 540 * sizeof(int) + * + * The overall size of the compound metadata entry is 790 * sizeof(int). + * Since we use a page size of 250 * sizeof(int) (1000 bytes), this + * system of synthetic metadata entries spans four pages. + */ + + /* allocate the buffers needed for the synthetic md entry test */ + if ( (synth_md_vals = (int *)HDcalloc((size_t)TOT_SYNTH_ENTRY_SIZES, + sizeof(int))) == NULL ) + TEST_ERROR + + if ( (synth_md_test_buf = (int *)HDcalloc((size_t)TOT_SYNTH_ENTRY_SIZES, + sizeof(int))) == NULL ) + TEST_ERROR + + /* allocate file space for the synthetic metadata entries and + * compute their addresses. + */ + if (HADDR_UNDEF == + (base_addr = H5MF_alloc(f, H5FD_MEM_BTREE, + sizeof(int) * (size_t)(TOT_SYNTH_ENTRY_SIZES)))) + FAIL_STACK_ERROR; + + p0_addr = base_addr + (haddr_t)(sizeof(int) * HDR_SIZE); + p1_addr = p0_addr + (haddr_t)(sizeof(int) * MD_PAGE_SIZE); + p2_addr = p1_addr + (haddr_t)(sizeof(int) * MD_PAGE_SIZE); + + + /* Set all cells in synth_md_vals[] to -1 and write directly to + * the underlying file via an H5FD call. This gives us a known + * set of values in the underlying file.
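+ * + * (Writing at the H5FD level, rather than through + * H5F_block_write(), bypasses the page buffer, so these + * values reach the file regardless of buffering.)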
+ */ + for ( i = 0; i < TOT_SYNTH_ENTRY_SIZES; i++) { + + synth_md_vals[i] = -1; + } + + if ( H5FD_write(f->shared->lf, H5FD_MEM_BTREE, base_addr, + sizeof(int) * TOT_SYNTH_ENTRY_SIZES, synth_md_vals) < 0) + FAIL_STACK_ERROR; + + /* touch up the metadata cache so that it will report that a metadata + * entry that was sub-allocated out of a larger file space allocation + * is the source of the current metadata I/O operation. + */ + H5C_set_curr_io_type_splitable(f->shared->cache, TRUE); + + /* initialize the buffer with the values of the synthetic metadata + * entries. + */ + for ( i = 0; i < TOT_SYNTH_ENTRY_SIZES; i++ ) { + + synth_md_vals[i] = i; + + if ( i >= HDR_SIZE ) { + synth_md_vals[i] += 1000; + } + + if ( i >= HDR_SIZE + MD_PAGE_SIZE ) { + synth_md_vals[i] += 1000; + } + + if ( i >= HDR_SIZE + MD_PAGE_SIZE + MD_PAGE_SIZE ) { + synth_md_vals[i] += 1000; + } + + } + + /* write the header */ + if (H5F_block_write(f, H5FD_MEM_BTREE, base_addr, + sizeof(int) * (size_t)HDR_SIZE, synth_md_vals) < 0) + FAIL_STACK_ERROR; + + /* read the header */ + if (H5F_block_read(f, H5FD_MEM_BTREE, base_addr, + sizeof(int) * (size_t)HDR_SIZE, synth_md_test_buf) < 0) + FAIL_STACK_ERROR; + + /* write page 0 */ + if (H5F_block_write(f, H5FD_MEM_BTREE, p0_addr, + sizeof(int) * (size_t)MD_PAGE_SIZE, + &(synth_md_vals[HDR_SIZE])) < 0) + FAIL_STACK_ERROR; + + /* read page 0 */ + if (H5F_block_read(f, H5FD_MEM_BTREE, p0_addr, + sizeof(int) * (size_t)MD_PAGE_SIZE, + &(synth_md_test_buf[HDR_SIZE])) < 0) + FAIL_STACK_ERROR; + + /* write page 1 */ + if (H5F_block_write(f, H5FD_MEM_BTREE, p1_addr, + sizeof(int) * (size_t)MD_PAGE_SIZE, + &(synth_md_vals[HDR_SIZE + MD_PAGE_SIZE])) < 0) + FAIL_STACK_ERROR; + + /* read page 1 */ + if (H5F_block_read(f, H5FD_MEM_BTREE, p1_addr, + sizeof(int) * (size_t)MD_PAGE_SIZE, + &(synth_md_test_buf[HDR_SIZE + MD_PAGE_SIZE])) < 0) + FAIL_STACK_ERROR; + + /* write page 2 */ + if (H5F_block_write(f, H5FD_MEM_BTREE, p2_addr, + sizeof(int) * (size_t)MD_PAGE_SIZE, + &(synth_md_vals[HDR_SIZE + 2 * MD_PAGE_SIZE])) < 0) + FAIL_STACK_ERROR; + + /* read page 2 */ + if (H5F_block_read(f, H5FD_MEM_BTREE, p2_addr, + sizeof(int) * (size_t)MD_PAGE_SIZE, + &(synth_md_test_buf[HDR_SIZE + 2 * MD_PAGE_SIZE])) < 0) + FAIL_STACK_ERROR; + + /* verify reads */ + for ( i = 0; i < TOT_SYNTH_ENTRY_SIZES; i++ ) { + + if ( synth_md_vals[i] != synth_md_test_buf[i] ) { + + HDfprintf(stderr, "(1) unexpected read %d: val %d -- %d expected\n", + i, synth_md_test_buf[i], synth_md_vals[i]); + TEST_ERROR; + } + } + + /* zero the test buffer, do the reads again in reverse order, and verify */ + + for ( i = 0; i < TOT_SYNTH_ENTRY_SIZES; i++) { + + synth_md_test_buf[i] = 0; + } + + /* read page 2 */ + if (H5F_block_read(f, H5FD_MEM_BTREE, p2_addr, + sizeof(int) * (size_t)MD_PAGE_SIZE, + &(synth_md_test_buf[HDR_SIZE + 2 * MD_PAGE_SIZE])) < 0) + FAIL_STACK_ERROR; + + /* read page 1 */ + if (H5F_block_read(f, H5FD_MEM_BTREE, p1_addr, + sizeof(int) * (size_t)MD_PAGE_SIZE, + &(synth_md_test_buf[HDR_SIZE + MD_PAGE_SIZE])) < 0) + FAIL_STACK_ERROR; + + /* read page 0 */ + if (H5F_block_read(f, H5FD_MEM_BTREE, p0_addr, + sizeof(int) * (size_t)MD_PAGE_SIZE, + &(synth_md_test_buf[HDR_SIZE])) < 0) + FAIL_STACK_ERROR; + + /* read the header */ + if (H5F_block_read(f, H5FD_MEM_BTREE, base_addr, + sizeof(int) * (size_t)HDR_SIZE, synth_md_test_buf) < 0) + FAIL_STACK_ERROR; + + /* verify reads again */ + for ( i = 0; i < TOT_SYNTH_ENTRY_SIZES; i++ ) { + + if ( synth_md_vals[i] != synth_md_test_buf[i] ) { + + HDfprintf(stderr, "(2)
unexpected read %d: val %d -- %d expected\n", + i, synth_md_test_buf[i], synth_md_vals[i]); + TEST_ERROR; + } + } + + /* Undo the touchup of the metadata cache */ + H5C_set_curr_io_type_splitable(f->shared->cache, FALSE); + + /* free the test buffers */ + HDfree(synth_md_vals); + HDfree(synth_md_test_buf); + + if (H5Fclose(file_id) < 0) + FAIL_STACK_ERROR; + if (H5Pclose(fcpl) < 0) + FAIL_STACK_ERROR; + if (H5Pclose(fapl) < 0) + FAIL_STACK_ERROR; + + PASSED(); + return 0; + +error: + + /* Undo the touchup of the metadata cache */ + if ( ( f ) && ( f->shared ) && ( f->shared->cache) ) + H5C_set_curr_io_type_splitable(f->shared->cache, FALSE); + + if ( synth_md_vals ) + HDfree(synth_md_vals); + + if ( synth_md_test_buf ) + HDfree(synth_md_test_buf); + + H5E_BEGIN_TRY { + if (fapl != H5I_INVALID_HID) + H5Pclose(fapl); + if (fcpl != H5I_INVALID_HID) + H5Pclose(fcpl); + if (file_id != H5I_INVALID_HID) + H5Fclose(file_id); + } H5E_END_TRY; + return 1; + +} /* md_entry_splitting_smoke_check() */ + +#undef HDR_SIZE +#undef MD_PAGE_SIZE +#undef TOT_SYNTH_ENTRY_SIZES + + +/************************************************************************* + * + * Function: md_entry_splitting_boundary_test() + * + * Purpose: Test to verify that I/O request splitting performs + * as expected in various boundary conditions. + * + * The above md_entry_splitting_smoke_check() was directed + * at verifying that the page buffer behaved as expected + * in something approaching a typical use case. + * + * This test is directed at verifying that entries are + * split correctly under a variety of conditions that + * are unlikely unless the user chooses an odd page size. + * + * Return: 0 if test is successful + * 1 if test fails + * + * Programmer: John Mainzer + * 4/12/20 + * + * Changes: None. + * + *************************************************************************/ + + +static unsigned +md_entry_splitting_boundary_test(hid_t orig_fapl, const char *env_h5_drvr, + bool vfd_swmr_mode) +{ + char filename[FILENAME_LEN]; /* Filename to use */ + hid_t file_id = -1; /* File ID */ + hid_t fcpl = -1; + hid_t fapl = -1; + int64_t base_page_cnt; + int i; + H5F_t *f = NULL; + const uint32_t max_lag = 5; + size_t page_size = (size_t)512; + int pages_allocated = 32; + size_t alloc_size; + uint8_t * write_buf = NULL; + uint8_t * read_buf = NULL; + haddr_t base_addr = HADDR_UNDEF; + haddr_t first_page_addr = HADDR_UNDEF; + haddr_t start_addr = HADDR_UNDEF; + size_t test_len; + + TESTING("%sMetadata Entry Splitting Boundary Test", \ + vfd_swmr_mode ? "VFD SWMR " : ""); + + h5_fixname(namebase, orig_fapl, filename, sizeof(filename)); + + if ((fapl = H5Pcopy(orig_fapl)) < 0) + TEST_ERROR + + if (set_multi_split(env_h5_drvr, fapl, sizeof(int) * 200) != 0) + TEST_ERROR; + + if ((fcpl = H5Pcreate(H5P_FILE_CREATE)) < 0) + TEST_ERROR; + + if (H5Pset_file_space_strategy(fcpl, H5F_FSPACE_STRATEGY_PAGE, 0, 1) < 0) + TEST_ERROR; + + if (H5Pset_file_space_page_size(fcpl, page_size) < 0) + TEST_ERROR; + + if (H5Pset_page_buffer_size(fapl, 32 * page_size, 0, 0) < 0) + TEST_ERROR; + + if (vfd_swmr_mode && swmr_fapl_augment(fapl, filename, max_lag) < 0) + TEST_ERROR; + + if ((file_id = H5Fcreate(filename, H5F_ACC_TRUNC, fcpl, fapl)) < 0) + FAIL_STACK_ERROR; + + /* Get a pointer to the internal file object */ + if(NULL == (f = (H5F_t *)H5VL_object(file_id))) + FAIL_STACK_ERROR; + + /* opening the file inserts one or more pages into the page buffer. + * Get the number of pages inserted, and verify that it is the + * expected value.
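+ * + * (For this file layout a single page -- presumably the one + * holding the superblock -- should be resident at this point, + * hence the check against a base count of one below.)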
+ */ + base_page_cnt = f->shared->pb_ptr->curr_pages; + if (base_page_cnt != 1) + TEST_ERROR; + + /* Test the following cases: + * + * 1) splittable md entry that is page aligned and exactly one + * page long. + * + * 2) splittable md entry that is page aligned and exactly two + * pages long + * + * 3) splittable md entry that is page aligned and is exactly one + * page and one byte long. + * + * 4) splittable md entry that is exactly one page and one byte + * long, and starts one byte before a page boundary. + * + * 5) splittable md entry that is exactly one page and two bytes + * long, and starts one byte before a page boundary. + * + * 6) splittable md entry that is two bytes long, and starts one + * byte before a page boundary. + * + * 7) splittable md entry that is page aligned and is exactly two + * pages and one byte long. + * + * 8) splittable md entry that is exactly two pages and one byte + * long, and starts one byte before a page boundary. + * + * 9) splittable md entry that is exactly two pages and two bytes + * long, and starts one byte before a page boundary. + * + */ + alloc_size = page_size * (size_t)pages_allocated; + + /* allocate the buffers needed for the synthetic md entry test */ + if ((write_buf = (uint8_t *)HDcalloc(alloc_size, sizeof(uint8_t))) == NULL) + TEST_ERROR + + if ((read_buf = (uint8_t *)HDcalloc(alloc_size, sizeof(uint8_t))) == NULL) + TEST_ERROR + + /* allocate file space for the tests */ + if (HADDR_UNDEF == (base_addr = H5MF_alloc(f, H5FD_MEM_SUPER, alloc_size))) + FAIL_STACK_ERROR; + + /* Set all cells in write_buf[] to 0 and write directly to + * the underlying file via an H5FD call. This gives us a known + * set of values in the underlying file. + */ + for ( i = 0; i < (int)alloc_size; i++) { + + write_buf[i] = 0; + } + + if ( H5FD_write(f->shared->lf, H5FD_MEM_SUPER, base_addr, + alloc_size, write_buf) < 0) + FAIL_STACK_ERROR; + + /* touch up the metadata cache so that it will report that a metadata + * entry that was sub-allocated out of a larger file space allocation + * is the source of the current metadata I/O operation. + */ + H5C_set_curr_io_type_splitable(f->shared->cache, TRUE); + + + /* 1) splittable md entry that is page aligned and exactly one + * page long. + * + * Should not register as a split I/O. + * + * Should log 4 metadata accesses. + * should log 3 metadata hits + * should log 1 metadata misses + * should log 1 metadata loads + * should log 1 metadata insertions + * + * Note that this exposes an inefficiency in the page buffer, as page + * aligned I/O requests of exactly one page in length really should + * bypass the page buffer. + * + * This should be fixed, but I am bypassing it for now.
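+ * + * (The expected counts below follow from the two writes and + * two reads issued for this case: each is a single metadata + * access, only the first can miss, and that first access + * accounts for the single load and insertion.)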
+ * + * JRM -- 4/18/20 + */ + first_page_addr = base_addr; + start_addr = base_addr; + test_len = page_size; + + for ( i = 0; i < (int)test_len; i++ ) + write_buf[i] = 1; + + if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 ) + FAIL_STACK_ERROR; + + if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0) + FAIL_STACK_ERROR; + + if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0) + FAIL_STACK_ERROR; + + for ( i = 0; i < (int)test_len; i++ ) { + if ( write_buf[i] != read_buf[i] ) { + HDfprintf(stdout, "1.1) write_buf[%d] = %d != %d = read_buf[%d]\n", + i, (int)(write_buf[i]), (int)(read_buf[i]), i); + TEST_ERROR; + } + } + + for ( i = 0; i < (int)test_len; i++ ) + write_buf[i] = 2; + + if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0) + FAIL_STACK_ERROR; + + if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0) + FAIL_STACK_ERROR; + + for ( i = 0; i < (int)test_len; i++ ) { + if ( write_buf[i] != read_buf[i] ) { + HDfprintf(stdout, "1.2) write_buf[%d] = %d != %d = read_buf[%d]\n", + i, (int)(write_buf[i]), (int)(read_buf[i]), i); + TEST_ERROR; + } + } + + if ( ( f->shared->pb_ptr->md_read_splits != 0 ) || + ( f->shared->pb_ptr->md_write_splits != 0 ) ) + TEST_ERROR; + + if ( ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 4 ) || + ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 3 ) || + ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 1 ) || + ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 1 ) || + ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 1 ) ) + TEST_ERROR; + + + /* 2) splittable md entry that is page aligned and exactly two + * pages long + * + * Should not register as a split I/O. + * + * if vfd_swmr_mode + * + * Should log 0 multi-page metadata bypasses. + * Should log 4 multi-page metadata accesses. + * should log 3 multi-page metadata hits + * should log 1 multi-page metadata misses + * should log 0 multi-page metadata loads + * should log 1 multi-page metadata insertions + * + * else + * + * Should log 4 multi-page metadata bypasses. + * Should log 0 multi-page metadata accesses. + * should log 0 multi-page metadata hits + * should log 2 multi-page metadata misses + * should log 0 multi-page metadata loads + * should log 0 multi-page metadata insertions + * + * The misses in the normal operating mode could be avoided. 
+ */ + first_page_addr = base_addr + (haddr_t)(page_size); + start_addr = first_page_addr; + test_len = 2 * page_size; + + for ( i = 0; i < (int)test_len; i++ ) + write_buf[i] = 3; + + if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 ) + FAIL_STACK_ERROR; + + if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0) + FAIL_STACK_ERROR; + + if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0) + FAIL_STACK_ERROR; + + for ( i = 0; i < (int)test_len; i++ ) { + if ( write_buf[i] != read_buf[i] ) { + HDfprintf(stdout, "2.1) write_buf[%d] = %d != %d = read_buf[%d]\n", + i, (int)(write_buf[i]), (int)(read_buf[i]), i); + TEST_ERROR; + } + } + + for ( i = 0; i < (int)test_len; i++ ) + write_buf[i] = 4; + + if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0) + FAIL_STACK_ERROR; + + if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0) + FAIL_STACK_ERROR; + + for ( i = 0; i < (int)test_len; i++ ) { + if ( write_buf[i] != read_buf[i] ) { + HDfprintf(stdout, "2.2) write_buf[%d] = %d != %d = read_buf[%d]\n", + i, (int)(write_buf[i]), (int)(read_buf[i]), i); + TEST_ERROR; + } + } + + if ( ( f->shared->pb_ptr->md_read_splits != 0 ) || + ( f->shared->pb_ptr->md_write_splits != 0 ) ) + TEST_ERROR; + + if ( vfd_swmr_mode ) { + if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 0 ) || + ( f->shared->pb_ptr->accesses[H5PB__STATS_MPMDE] != 4 ) || + ( f->shared->pb_ptr->hits[H5PB__STATS_MPMDE] != 3 ) || + ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 1 ) || + ( f->shared->pb_ptr->loads[H5PB__STATS_MPMDE] != 0 ) || + ( f->shared->pb_ptr->insertions[H5PB__STATS_MPMDE] != 1 ) ) + TEST_ERROR; + + } else { + if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 4 ) || + ( f->shared->pb_ptr->accesses[H5PB__STATS_MPMDE] != 0 ) || + ( f->shared->pb_ptr->hits[H5PB__STATS_MPMDE] != 0 ) || + ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 2 ) || + ( f->shared->pb_ptr->loads[H5PB__STATS_MPMDE] != 0 ) || + ( f->shared->pb_ptr->insertions[H5PB__STATS_MPMDE] != 0 ) ) + TEST_ERROR; + } + + + /* 3) splittable md entry that is page aligned and is exactly one + * page and one byte long. + * + * Should register 2 metadata read splits + * Should register 2 metadata write splits + * + * Should log 0 metadata bypasses. + * Should log 8 metadata accesses.
+ * should log 6 metadata hits + * should log 2 metadata misses + * should log 2 metadata loads + * should log 2 metadata insertions + */ + first_page_addr = base_addr + (haddr_t)(3 * page_size); + start_addr = first_page_addr; + test_len = page_size + 1; + + for ( i = 0; i < (int)test_len; i++ ) + write_buf[i] = 5; + + if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 ) + FAIL_STACK_ERROR; + + if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0) + FAIL_STACK_ERROR; + + if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0) + FAIL_STACK_ERROR; + + for ( i = 0; i < (int)test_len; i++ ) { + if ( write_buf[i] != read_buf[i] ) { + HDfprintf(stdout, "3.1) write_buf[%d] = %d != %d = read_buf[%d]\n", + i, (int)(write_buf[i]), (int)(read_buf[i]), i); + TEST_ERROR; + } + } + + for ( i = 0; i < (int)test_len; i++ ) + write_buf[i] = 6; + + if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0) + FAIL_STACK_ERROR; + + if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0) + FAIL_STACK_ERROR; + + for ( i = 0; i < (int)test_len; i++ ) { + if ( write_buf[i] != read_buf[i] ) { + HDfprintf(stdout, "3.2) write_buf[%d] = %d != %d = read_buf[%d]\n", + i, (int)(write_buf[i]), (int)(read_buf[i]), i); + TEST_ERROR; + } + } + + if ( ( f->shared->pb_ptr->md_read_splits != 2 ) || + ( f->shared->pb_ptr->md_write_splits != 2 ) ) + TEST_ERROR; + + if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MD] != 0 ) || + ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 8 ) || + ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 6 ) || + ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 2 ) || + ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 2 ) || + ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 2 ) ) + TEST_ERROR; + + + /* 4) splittable md entry that is exactly one page and one byte + * long, and starts one byte before a page bundary. + * + * Should register 2 metadata read splits + * Should register 2 metadata write splits + * + * Should log 0 metadata bypasses. + * Should log 8 metadata accesses. 
+
+    /* 4) splittable md entry that is exactly one page and one byte
+     *    long, and starts one byte before a page boundary.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    Should log 0 metadata bypasses.
+     *    Should log 8 metadata accesses.
+     *    Should log 6 metadata hits.
+     *    Should log 2 metadata misses.
+     *    Should log 2 metadata loads.
+     *    Should log 2 metadata insertions.
+     */
+    first_page_addr = base_addr + (haddr_t)(5 * page_size);
+    start_addr = first_page_addr + (haddr_t)(page_size - 1);
+    test_len = page_size + 1;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 7;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if ( f->shared->pb_ptr->md_write_splits != 1 )
+        TEST_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if ( f->shared->pb_ptr->md_read_splits != 1 )
+        TEST_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "4.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 8;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "4.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MD] != 0 ) ||
+         ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 8 ) ||
+         ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 6 ) ||
+         ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 2 ) ||
+         ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 2 ) ||
+         ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 2 ) )
+        TEST_ERROR;
+
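+    /* The next case splits into three pieces (leading byte, one full
+     * page, trailing byte), so, per the sketch above, we expect
+     * 4 ops * 3 pieces = 12 accesses.  The expected stats amount to:
+     * only the first write misses (3 misses -> 3 loads -> 3
+     * insertions), and the remaining three operations hit all three
+     * cached pages (9 hits).
+     */
+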
+
+    /* 5) splittable md entry that is exactly one page and two bytes
+     *    long, and starts one byte before a page boundary.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    Should log 0 metadata bypasses.
+     *    Should log 12 metadata accesses.
+     *    Should log 9 metadata hits.
+     *    Should log 3 metadata misses.
+     *    Should log 3 metadata loads.
+     *    Should log 3 metadata insertions.
+     */
+    first_page_addr = base_addr + (haddr_t)(8 * page_size);
+    start_addr = first_page_addr + (haddr_t)(page_size - 1);
+    test_len = page_size + 2;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 9;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "5.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 10;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "5.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MD] != 0 ) ||
+         ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 12 ) ||
+         ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 9 ) ||
+         ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 3 ) ||
+         ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 3 ) ||
+         ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 3 ) )
+        TEST_ERROR;
+
+
+    /* 6) splittable md entry that is two bytes long, and starts one
+     *    byte before a page boundary.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    Should log 0 metadata bypasses.
+     *    Should log 8 metadata accesses.
+     *    Should log 6 metadata hits.
+     *    Should log 2 metadata misses.
+     *    Should log 2 metadata loads.
+     *    Should log 2 metadata insertions.
+     */
+    first_page_addr = base_addr + (haddr_t)(11 * page_size);
+    start_addr = first_page_addr + (haddr_t)(page_size - 1);
+    test_len = 2;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 11;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "6.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 12;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "6.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MD] != 0 ) ||
+         ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 8 ) ||
+         ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 6 ) ||
+         ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 2 ) ||
+         ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 2 ) ||
+         ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 2 ) )
+        TEST_ERROR;
+
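+    /* The remaining cases add a multi-page body to the split.  Multi-
+     * page metadata entries (MPMDE) are accounted separately from
+     * single-page metadata: in VFD SWMR mode they are cached by the
+     * page buffer, so the body shows up in the MPMDE access / hit /
+     * miss / insertion stats, while in normal operating mode the body
+     * bypasses the page buffer and is counted in
+     * bypasses[H5PB__STATS_MPMDE].  The leading and trailing fragments
+     * are ordinary single-page entries in either mode.
+     */
+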
+
+    /* 7) splittable md entry that is page aligned and is exactly two
+     *    pages and one byte long.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    If vfd_swmr_mode:
+     *
+     *        Should log 0 multi-page metadata bypasses.
+     *        Should log 4 multi-page metadata accesses.
+     *        Should log 4 metadata accesses.
+     *        Should log 3 multi-page metadata hits.
+     *        Should log 3 metadata hits.
+     *        Should log 1 multi-page metadata miss.
+     *        Should log 1 metadata miss.
+     *        Should log 0 multi-page metadata loads.
+     *        Should log 1 metadata load.
+     *        Should log 1 multi-page metadata insertion.
+     *        Should log 1 metadata insertion.
+     *
+     *    else:
+     *
+     *        Should log 4 multi-page metadata bypasses.
+     *        Should log 4 metadata accesses.
+     *        Should log 3 metadata hits.
+     *        Should log 2 multi-page metadata misses.
+     *        Should log 1 metadata miss.
+     *        Should log 1 metadata load.
+     *        Should log 1 metadata insertion.
+     *
+     *    The misses in the normal operating mode could be avoided.
+     */
+    first_page_addr = base_addr + (haddr_t)(13 * page_size);
+    start_addr = first_page_addr;
+    test_len = 2 * page_size + 1;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 13;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "7.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 14;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "7.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( vfd_swmr_mode ) {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 4 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MPMDE] != 3 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 3 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 1 ) )
+            TEST_ERROR;
+
+    } else {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 4 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 3 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 2 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 1 ) )
+            TEST_ERROR;
+    }
+
+
+    /* 8) splittable md entry that is exactly two pages and one byte
+     *    long, and starts one byte before a page boundary.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    If vfd_swmr_mode:
+     *
+     *        Should log 0 multi-page metadata bypasses.
+     *        Should log 4 multi-page metadata accesses.
+     *        Should log 4 metadata accesses.
+     *        Should log 3 multi-page metadata hits.
+     *        Should log 3 metadata hits.
+     *        Should log 1 multi-page metadata miss.
+     *        Should log 1 metadata miss.
+     *        Should log 0 multi-page metadata loads.
+     *        Should log 1 metadata load.
+     *        Should log 1 multi-page metadata insertion.
+     *        Should log 1 metadata insertion.
+     *
+     *    else:
+     *
+     *        Should log 4 multi-page metadata bypasses.
+     *        Should log 4 metadata accesses.
+     *        Should log 3 metadata hits.
+     *        Should log 2 multi-page metadata misses.
+     *        Should log 1 metadata miss.
+     *        Should log 1 metadata load.
+     *        Should log 1 metadata insertion.
+     *
+     *    The misses in the normal operating mode could be avoided.
+     */
+    first_page_addr = base_addr + (haddr_t)(16 * page_size);
+    start_addr = first_page_addr + (haddr_t)(page_size - 1);
+    test_len = 2 * page_size + 1;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 15;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if ( f->shared->pb_ptr->md_write_splits != 1 )
+        TEST_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if ( f->shared->pb_ptr->md_read_splits != 1 )
+        TEST_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "8.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 16;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "8.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( vfd_swmr_mode ) {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 4 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MPMDE] != 3 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 3 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 1 ) )
+            TEST_ERROR;
+
+    } else {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 4 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 3 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 2 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 1 ) )
+            TEST_ERROR;
+    }
+
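+    /* The final case has all three pieces in play: a one byte leading
+     * fragment, a two page body, and a one byte trailing fragment.
+     * Hence 4 ops * 2 single-page pieces = 8 metadata accesses, plus,
+     * in VFD SWMR mode, 4 multi-page metadata accesses for the body.
+     * In VFD SWMR mode only the first write misses and every later
+     * operation hits; in normal operating mode the body bypasses the
+     * page buffer instead.
+     */
+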
+
+    /* 9) splittable md entry that is exactly two pages and two bytes
+     *    long, and starts one byte before a page boundary.
+     *
+     *    Should register 2 metadata read splits.
+     *    Should register 2 metadata write splits.
+     *
+     *    If vfd_swmr_mode:
+     *
+     *        Should log 0 multi-page metadata bypasses.
+     *        Should log 4 multi-page metadata accesses.
+     *        Should log 8 metadata accesses.
+     *        Should log 3 multi-page metadata hits.
+     *        Should log 6 metadata hits.
+     *        Should log 1 multi-page metadata miss.
+     *        Should log 2 metadata misses.
+     *        Should log 0 multi-page metadata loads.
+     *        Should log 2 metadata loads.
+     *        Should log 1 multi-page metadata insertion.
+     *        Should log 2 metadata insertions.
+     *
+     *    else:
+     *
+     *        Should log 4 multi-page metadata bypasses.
+     *        Should log 8 metadata accesses.
+     *        Should log 6 metadata hits.
+     *        Should log 2 multi-page metadata misses.
+     *        Should log 2 metadata misses.
+     *        Should log 2 metadata loads.
+     *        Should log 2 metadata insertions.
+     *
+     *    The misses in the normal operating mode could be avoided.
+     */
+    first_page_addr = base_addr + (haddr_t)(19 * page_size);
+    start_addr = first_page_addr + (haddr_t)(page_size - 1);
+    test_len = 2 * page_size + 2;
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 17;
+
+    if ( H5PB_reset_stats(f->shared->pb_ptr) < 0 )
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "9.1) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    for ( i = 0; i < (int)test_len; i++ )
+        write_buf[i] = 18;
+
+    if (H5F_block_write(f, H5FD_MEM_SUPER, start_addr, test_len, write_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    if (H5F_block_read(f, H5FD_MEM_SUPER, start_addr, test_len, read_buf) < 0)
+        FAIL_STACK_ERROR;
+
+    for ( i = 0; i < (int)test_len; i++ ) {
+        if ( write_buf[i] != read_buf[i] ) {
+            HDfprintf(stdout, "9.2) write_buf[%d] = %d != %d = read_buf[%d]\n",
+                      i, (int)(write_buf[i]), (int)(read_buf[i]), i);
+            TEST_ERROR;
+        }
+    }
+
+    if ( ( f->shared->pb_ptr->md_read_splits != 2 ) ||
+         ( f->shared->pb_ptr->md_write_splits != 2 ) )
+        TEST_ERROR;
+
+    if ( vfd_swmr_mode ) {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 8 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MPMDE] != 3 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 6 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 2 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MPMDE] != 0 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 2 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MPMDE] != 1 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 2 ) )
+            TEST_ERROR;
+
+    } else {
+        if ( ( f->shared->pb_ptr->bypasses[H5PB__STATS_MPMDE] != 4 ) ||
+             ( f->shared->pb_ptr->accesses[H5PB__STATS_MD] != 8 ) ||
+             ( f->shared->pb_ptr->hits[H5PB__STATS_MD] != 6 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MPMDE] != 2 ) ||
+             ( f->shared->pb_ptr->misses[H5PB__STATS_MD] != 2 ) ||
+             ( f->shared->pb_ptr->loads[H5PB__STATS_MD] != 2 ) ||
+             ( f->shared->pb_ptr->insertions[H5PB__STATS_MD] != 2 ) )
+            TEST_ERROR;
+    }
+
+
+    /* Undo the touchup of the metadata cache */
+    H5C_set_curr_io_type_splitable(f->shared->cache, FALSE);
+
+    /* free the test buffers */
+    HDfree(write_buf);
+    HDfree(read_buf);
+
+    if (H5Fclose(file_id) < 0)
+        FAIL_STACK_ERROR;
+    if (H5Pclose(fcpl) < 0)
+        FAIL_STACK_ERROR;
+    if (H5Pclose(fapl) < 0)
+        FAIL_STACK_ERROR;
+
+    PASSED();
+    return 0;
+
+error:
+
+    /* Undo the touchup of the metadata cache */
+    if ( ( f ) && ( f->shared ) && ( f->shared->cache ) )
+        H5C_set_curr_io_type_splitable(f->shared->cache, FALSE);
+
+    if ( write_buf )
+        HDfree(write_buf);
+
+    if ( read_buf )
+        HDfree(read_buf);
+
+    H5E_BEGIN_TRY {
+        if (fapl != H5I_INVALID_HID)
+            H5Pclose(fapl);
+        if (fcpl != H5I_INVALID_HID)
+            H5Pclose(fcpl);
+        if (file_id != H5I_INVALID_HID)
+            H5Fclose(file_id);
+    } H5E_END_TRY;
+
+    return 1;
+
+} /* md_entry_splitting_boundary_test() */
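+
+/* Note: md_entry_splitting_boundary_test() is invoked twice from
+ * main() -- once with vfd_swmr_mode false and once with it true --
+ * so both the bypass path and the MPMDE caching path checked above
+ * are exercised.
+ */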
+
+
+
 /*-------------------------------------------------------------------------
  * Function:    main()
  *
@@ -2997,7 +4395,7 @@ main(void)
      * Page buffering depends on paged aggregation which is
      * currently disabled for multi/split drivers.
      */
-    if((0 == HDstrcmp(env_h5_drvr, "multi")) || 
+    if((0 == HDstrcmp(env_h5_drvr, "multi")) ||
        (0 == HDstrcmp(env_h5_drvr, "split"))) {
 
         SKIPPED()
@@ -3015,7 +4413,7 @@ main(void)
     if(H5CX_push() < 0)
         FAIL_STACK_ERROR
     api_ctx_pushed = TRUE;
 
-#ifdef H5_HAVE_PARALLEL 
+#ifdef H5_HAVE_PARALLEL
 
     HDputs("Page Buffering is disabled for parallel.");
     nerrors += verify_page_buffering_disabled(fapl, env_h5_drvr);
@@ -3031,6 +4429,10 @@ main(void)
     nerrors += test_lru_processing(fapl, env_h5_drvr);
     nerrors += test_min_threshold(fapl, env_h5_drvr);
     nerrors += test_stats_collection(fapl, env_h5_drvr);
+    nerrors += md_entry_splitting_smoke_check(fapl, env_h5_drvr, false);
+    nerrors += md_entry_splitting_smoke_check(fapl, env_h5_drvr, true);
+    nerrors += md_entry_splitting_boundary_test(fapl, env_h5_drvr, false);
+    nerrors += md_entry_splitting_boundary_test(fapl, env_h5_drvr, true);
 
 #endif /* H5_HAVE_PARALLEL */
 
@@ -3058,4 +4460,5 @@ main(void)
     if(api_ctx_pushed)
         H5CX_pop();
 
     HDexit(EXIT_FAILURE);
-}
+
+} /* main() */