diff options
author | mainzer <mainzer#hdfgroup.org> | 2018-11-04 23:54:01 (GMT) |
---|---|---|
committer | mainzer <mainzer#hdfgroup.org> | 2018-11-04 23:54:01 (GMT) |
commit | e62f4bd4fab00b0cd03e269a858c21558a9321fa (patch) | |
tree | 49506dd4873e1413b6c960ebcaf22c269933411c /src | |
parent | d4d7687ad1df35101ed72567c99f1c57536b5ccd (diff) | |
download | hdf5-e62f4bd4fab00b0cd03e269a858c21558a9321fa.zip hdf5-e62f4bd4fab00b0cd03e269a858c21558a9321fa.tar.gz hdf5-e62f4bd4fab00b0cd03e269a858c21558a9321fa.tar.bz2 |
Initial checkin of page buffer re-implementation to support VFD SWMR.
Tested serial / debug on Charis and Jelly.
Two known issues:
1) New page buffer seems to expose issues in the accumulator code.
For whatever reason, fheap with the new page buffer exposes corruption
issues if the page buffer uses H5F__accum_read/write(), but the problems
go away if the page buffer uses H5FD_read/write() instead.
Need to either chase this or disable the page buffer in combination with
the accumulator.
2) Encountered a number of assertion failures that are explainable by the
free space manager code not telling the page buffer to discard pages
when they are freed.
Wrote code to handle this -- once the free space manager is modified,
this code should be removed and the original assertions restored.
Diffstat (limited to 'src')
-rw-r--r-- | src/H5C.c | 4 | ||||
-rw-r--r-- | src/H5F.c | 8 | ||||
-rw-r--r-- | src/H5Fint.c | 8 | ||||
-rw-r--r-- | src/H5Fpkg.h | 4 | ||||
-rw-r--r-- | src/H5MF.c | 2 | ||||
-rw-r--r-- | src/H5MFsection.c | 2 | ||||
-rw-r--r-- | src/H5PB.c | 4147 | ||||
-rw-r--r-- | src/H5PBpkg.h | 1498 | ||||
-rw-r--r-- | src/H5PBprivate.h | 566 |
9 files changed, 5147 insertions, 1092 deletions
@@ -6412,8 +6412,8 @@ H5C__flush_single_entry(H5F_t *f, H5C_cache_entry_t *entry_ptr, unsigned flags) HDassert(!destroy); HDassert(entry_ptr->image_ptr); - if(f->shared->page_buf && f->shared->page_buf->page_size >= entry_ptr->size) - if(H5PB_update_entry(f->shared->page_buf, entry_ptr->addr, entry_ptr->size, entry_ptr->image_ptr) > 0) + if(f->shared->pb_ptr && f->shared->pb_ptr->page_size >= entry_ptr->size) + if(H5PB_update_entry(f->shared->pb_ptr, entry_ptr->addr, entry_ptr->size, entry_ptr->image_ptr) > 0) HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Failed to update PB with metadata cache") } /* end if */ @@ -1822,11 +1822,11 @@ H5Freset_page_buffering_stats(hid_t file_id) /* Check args */ if(NULL == (file = (H5F_t *)H5I_object(file_id))) HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "invalid file identifier") - if(NULL == file->shared->page_buf) + if(NULL == file->shared->pb_ptr) HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "page buffering not enabled on file") /* Reset the statistics */ - if(H5PB_reset_stats(file->shared->page_buf) < 0) + if(H5PB_reset_stats(file->shared->pb_ptr) < 0) HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "can't reset stats for page buffering") done: @@ -1857,13 +1857,13 @@ H5Fget_page_buffering_stats(hid_t file_id, unsigned accesses[2], unsigned hits[2 /* Check args */ if(NULL == (file = (H5F_t *)H5I_object_verify(file_id, H5I_FILE))) HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "not a file ID") - if(NULL == file->shared->page_buf) + if(NULL == file->shared->pb_ptr) HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "page buffering not enabled on file") if(NULL == accesses || NULL == hits || NULL == misses || NULL == evictions || NULL == bypasses) HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL input parameters for stats") /* Get the statistics */ - if(H5PB_get_stats(file->shared->page_buf, accesses, hits, misses, evictions, bypasses) < 0) + if(H5PB_get_stats(file->shared->pb_ptr, accesses, hits, misses, evictions, bypasses) < 0) HGOTO_ERROR(H5E_FILE, 
H5E_CANTGET, FAIL, "can't retrieve stats for page buffering") done: diff --git a/src/H5Fint.c b/src/H5Fint.c index f815a4b..bca09b2 100644 --- a/src/H5Fint.c +++ b/src/H5Fint.c @@ -233,12 +233,12 @@ H5F_get_access_plist(H5F_t *f, hbool_t app_ref) efc_size = H5F__efc_max_nfiles(f->shared->efc); if(H5P_set(new_plist, H5F_ACS_EFC_SIZE_NAME, &efc_size) < 0) HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, H5I_INVALID_HID, "can't set elink file cache size") - if(f->shared->page_buf != NULL) { - if(H5P_set(new_plist, H5F_ACS_PAGE_BUFFER_SIZE_NAME, &(f->shared->page_buf->max_size)) < 0) + if(f->shared->pb_ptr != NULL) { + if(H5P_set(new_plist, H5F_ACS_PAGE_BUFFER_SIZE_NAME, &(f->shared->pb_ptr->max_size)) < 0) HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, H5I_INVALID_HID, "can't set page buffer size") - if(H5P_set(new_plist, H5F_ACS_PAGE_BUFFER_MIN_META_PERC_NAME, &(f->shared->page_buf->min_meta_perc)) < 0) + if(H5P_set(new_plist, H5F_ACS_PAGE_BUFFER_MIN_META_PERC_NAME, &(f->shared->pb_ptr->min_meta_perc)) < 0) HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, H5I_INVALID_HID, "can't set minimum metadata fraction of page buffer") - if(H5P_set(new_plist, H5F_ACS_PAGE_BUFFER_MIN_RAW_PERC_NAME, &(f->shared->page_buf->min_raw_perc)) < 0) + if(H5P_set(new_plist, H5F_ACS_PAGE_BUFFER_MIN_RAW_PERC_NAME, &(f->shared->pb_ptr->min_raw_perc)) < 0) HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, H5I_INVALID_HID, "can't set minimum raw data fraction of page buffer") } /* end if */ diff --git a/src/H5Fpkg.h b/src/H5Fpkg.h index 3760c41..9e523de 100644 --- a/src/H5Fpkg.h +++ b/src/H5Fpkg.h @@ -295,7 +295,9 @@ struct H5F_file_t { unsigned long feature_flags; /* VFL Driver feature Flags */ haddr_t maxaddr; /* Maximum address for file */ - H5PB_t *page_buf; /* The page buffer cache */ + H5PB_t *pb_ptr; /* pointer to the page buffer, or NULL */ + /* if the page buffer is disabled. 
*/ + H5AC_t *cache; /* The object cache */ H5AC_cache_config_t mdc_initCacheCfg; /* initial configuration for the */ @@ -981,7 +981,7 @@ HDfprintf(stderr, "%s: alloc_type = %u, size = %Hu\n", FUNC, (unsigned)alloc_typ /* Insert the new page into the Page Buffer list of new pages so we don't read an empty page from disk */ - if(f->shared->page_buf != NULL && H5PB_add_new_page(f, alloc_type, new_page) < 0) + if(f->shared->pb_ptr != NULL && H5PB_add_new_page(f, alloc_type, new_page) < 0) HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINSERT, HADDR_UNDEF, "can't add new page to Page Buffer new page list") ret_value = new_page; diff --git a/src/H5MFsection.c b/src/H5MFsection.c index 3ebc5c8..1d217a1 100644 --- a/src/H5MFsection.c +++ b/src/H5MFsection.c @@ -775,7 +775,7 @@ H5MF__sect_small_merge(H5FS_section_info_t **_sect1, H5FS_section_info_t *_sect2 /* This is in response to the data corruption bug from fheap.c with page buffering + page strategy */ /* Note: Large metadata page bypasses the PB cache */ /* Note: Update of raw data page (large or small sized) is handled by the PB cache */ - if(udata->f->shared->page_buf != NULL && udata->alloc_type != H5FD_MEM_DRAW) + if(udata->f->shared->pb_ptr != NULL && udata->alloc_type != H5FD_MEM_DRAW) if(H5PB_remove_entry(udata->f, (*sect1)->sect_info.addr) < 0) HGOTO_ERROR(H5E_RESOURCE, H5E_CANTFREE, FAIL, "can't free merged section") @@ -13,9 +13,11 @@ /*------------------------------------------------------------------------- * - * Created: H5PB.c - * - * Purpose: Page Buffer routines. + * Created: H5PB2.c + * + * Purpose: Re-implementation of the page buffer with added features to + * support VFD SWMR. 
+ * JRM -- 10/11/18 * *------------------------------------------------------------------------- */ @@ -24,8 +26,10 @@ /* Module Setup */ /****************/ -#define H5F_FRIEND /*suppress error about including H5Fpkg */ -#include "H5PBmodule.h" /* This source code file is part of the H5PB module */ +#define H5F_FRIEND /* suppress error about including H5Fpkg */ +#include "H5PBmodule.h" /* This source code file is part of the + * H5PB module + */ /***********/ @@ -36,83 +40,30 @@ #include "H5Fpkg.h" /* Files */ #include "H5FDprivate.h" /* File drivers */ #include "H5Iprivate.h" /* IDs */ +#include "H5FLprivate.h" /* Free lists */ +#include "H5MMprivate.h" /* Memory management */ #include "H5PBpkg.h" /* File access */ -#include "H5SLprivate.h" /* Skip List */ /****************/ /* Local Macros */ /****************/ -#define H5PB__PREPEND(page_ptr, head_ptr, tail_ptr, len) { \ - if((head_ptr) == NULL) { \ - (head_ptr) = (page_ptr); \ - (tail_ptr) = (page_ptr); \ - } /* end if */ \ - else { \ - (head_ptr)->prev = (page_ptr); \ - (page_ptr)->next = (head_ptr); \ - (head_ptr) = (page_ptr); \ - } /* end else */ \ - (len)++; \ -} /* H5PB__PREPEND() */ - -#define H5PB__REMOVE(page_ptr, head_ptr, tail_ptr, len) { \ - if((head_ptr) == (page_ptr)) { \ - (head_ptr) = (page_ptr)->next; \ - if((head_ptr) != NULL) \ - (head_ptr)->prev = NULL; \ - } /* end if */ \ - else \ - (page_ptr)->prev->next = (page_ptr)->next; \ - if((tail_ptr) == (page_ptr)) { \ - (tail_ptr) = (page_ptr)->prev; \ - if((tail_ptr) != NULL) \ - (tail_ptr)->next = NULL; \ - } /* end if */ \ - else \ - (page_ptr)->next->prev = (page_ptr)->prev; \ - page_ptr->next = NULL; \ - page_ptr->prev = NULL; \ - (len)--; \ -} - -#define H5PB__INSERT_LRU(page_buf, page_ptr) { \ - HDassert(page_buf); \ - HDassert(page_ptr); \ - /* insert the entry at the head of the list. 
*/ \ - H5PB__PREPEND((page_ptr), (page_buf)->LRU_head_ptr, \ - (page_buf)->LRU_tail_ptr, (page_buf)->LRU_list_len) \ -} - -#define H5PB__REMOVE_LRU(page_buf, page_ptr) { \ - HDassert(page_buf); \ - HDassert(page_ptr); \ - /* remove the entry from the list. */ \ - H5PB__REMOVE((page_ptr), (page_buf)->LRU_head_ptr, \ - (page_buf)->LRU_tail_ptr, (page_buf)->LRU_list_len) \ -} - -#define H5PB__MOVE_TO_TOP_LRU(page_buf, page_ptr) { \ - HDassert(page_buf); \ - HDassert(page_ptr); \ - /* Remove entry and insert at the head of the list. */ \ - H5PB__REMOVE((page_ptr), (page_buf)->LRU_head_ptr, \ - (page_buf)->LRU_tail_ptr, (page_buf)->LRU_list_len) \ - H5PB__PREPEND((page_ptr), (page_buf)->LRU_head_ptr, \ - (page_buf)->LRU_tail_ptr, (page_buf)->LRU_list_len) \ -} + +/* In principle, we should be able to run the page buffer with the + * accumulator. However, for whatever reason, the fheap test encounteres + * metadata corruption if the page buffer uses H5F__accum_read/write() + * for I/O. + * + * The following #define controls this. Set VFD_IO to FALSE to reproduce + * the bug. 
+ */ +#define VFD_IO TRUE /******************/ /* Local Typedefs */ /******************/ -/* Iteration context for destroying page buffer */ -typedef struct { - H5PB_t *page_buf; - hbool_t actual_slist; -} H5PB_ud1_t; - /********************/ /* Package Typedefs */ @@ -122,9 +73,44 @@ typedef struct { /********************/ /* Local Prototypes */ /********************/ -static herr_t H5PB__insert_entry(H5PB_t *page_buf, H5PB_entry_t *page_entry); -static htri_t H5PB__make_space(H5F_t *f, H5PB_t *page_buf, H5FD_mem_t inserted_type); -static herr_t H5PB__write_entry(H5F_t *f, H5PB_entry_t *page_entry); + +static H5PB_entry_t * H5PB__allocate_page(H5PB_t *pb_ptr, size_t buf_size, + hbool_t clean_image); + +static herr_t H5PB__create_new_page(H5PB_t *pb_ptr, haddr_t addr, size_t size, + H5FD_mem_t type, hbool_t clean_image, H5PB_entry_t **entry_ptr_ptr); + +static void H5PB__deallocate_page(H5PB_entry_t *entry_ptr); + +static herr_t H5PB__evict_entry(H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr, + hbool_t force); + +static herr_t H5PB__flush_entry(H5F_t *f, H5PB_t *pb_ptr, + H5PB_entry_t *entry_ptr); + +static herr_t H5PB__load_page(H5F_t *f, H5PB_t *pb_ptr, haddr_t addr, + H5FD_mem_t type, H5PB_entry_t **entry_ptr_ptr); + +static herr_t H5PB__make_space(H5F_t *f, H5PB_t *pb_ptr, + H5FD_mem_t inserted_type); + +static herr_t H5PB__mark_entry_clean(H5PB_t *pb_ptr, + H5PB_entry_t *entry_ptr); + +static herr_t H5PB__mark_entry_dirty(H5PB_t *pb_ptr, + H5PB_entry_t *entry_ptr); + +static herr_t H5PB__read_meta(H5F_t *f, H5FD_mem_t type, haddr_t addr, + size_t size, void *buf/*out*/); + +static herr_t H5PB__read_raw(H5F_t *f, H5FD_mem_t type, haddr_t addr, + size_t size, void *buf/*out*/); + +static herr_t H5PB__write_meta(H5F_t *f, H5FD_mem_t type, haddr_t addr, + size_t size, const void *buf/*out*/); + +static herr_t H5PB__write_raw(H5F_t *f, H5FD_mem_t type, haddr_t addr, + size_t size, const void *buf/*out*/); /*********************/ @@ -143,6 +129,8 @@ hbool_t 
H5_PKG_INIT_VAR = FALSE; /*******************/ /* Local Variables */ /*******************/ + + /* Declare a free list to manage the H5PB_t struct */ H5FL_DEFINE_STATIC(H5PB_t); @@ -152,39 +140,70 @@ H5FL_DEFINE_STATIC(H5PB_entry_t); /*------------------------------------------------------------------------- - * Function: H5PB_reset_stats * - * Purpose: This function was created without documentation. - * What follows is my best understanding of Mohamad's intent. + * Function: H5PB_reset_stats * - * Reset statistics collected for the page buffer layer. + * Purpose: Reset statistics collected for the page buffer layer. * * Return: Non-negative on success/Negative on failure * - * Programmer: Mohamad Chaarawi + * Programmer: John Mainzer -- 10/12/18 + * + * Changes: None. * *------------------------------------------------------------------------- */ herr_t -H5PB_reset_stats(H5PB_t *page_buf) +H5PB_reset_stats(H5PB_t *pb_ptr) { + int i; + FUNC_ENTER_NOAPI_NOERR /* Sanity checks */ - HDassert(page_buf); - - page_buf->accesses[0] = 0; - page_buf->accesses[1] = 0; - page_buf->hits[0] = 0; - page_buf->hits[1] = 0; - page_buf->misses[0] = 0; - page_buf->misses[1] = 0; - page_buf->evictions[0] = 0; - page_buf->evictions[1] = 0; - page_buf->bypasses[0] = 0; - page_buf->bypasses[1] = 0; + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + + for ( i = 0; i < H5PB__NUM_STAT_TYPES; i++ ) { + + pb_ptr->bypasses[i] = 0; + pb_ptr->accesses[i] = 0; + pb_ptr->hits[i] = 0; + pb_ptr->misses[i] = 0; + pb_ptr->loads[i] = 0; + pb_ptr->insertions[i] = 0; + pb_ptr->flushes[i] = 0; + pb_ptr->evictions[i] = 0; + pb_ptr->clears[i] = 0; + } + + pb_ptr->max_lru_len = 0; + pb_ptr->max_lru_size = 0; + pb_ptr->lru_md_skips = 0; + pb_ptr->lru_rd_skips = 0; + pb_ptr->total_ht_insertions = 0; + pb_ptr->total_ht_deletions = 0; + pb_ptr->successful_ht_searches = 0; + pb_ptr->total_successful_ht_search_depth = 0; + pb_ptr->failed_ht_searches = 0; + pb_ptr->total_failed_ht_search_depth = 
0; + pb_ptr->max_index_len = 0; + pb_ptr->max_index_size = 0; + pb_ptr->max_rd_pages = 0; + pb_ptr->max_md_pages = 0; + pb_ptr->max_mpmde_count = 0; + pb_ptr->lru_tl_skips = 0; + pb_ptr->lru_dwl_skips = 0; + pb_ptr->max_tl_len = 0; + pb_ptr->max_tl_size = 0; + pb_ptr->delayed_writes = 0; + pb_ptr->total_delay = 0; + pb_ptr->max_dwl_len = 0; + pb_ptr->max_dwl_size = 0; + pb_ptr->total_dwl_ins_depth = 0; FUNC_LEAVE_NOAPI(SUCCEED) + } /* H5PB_reset_stats() */ @@ -208,89 +227,264 @@ H5PB_reset_stats(H5PB_t *page_buf) *------------------------------------------------------------------------- */ herr_t -H5PB_get_stats(const H5PB_t *page_buf, unsigned accesses[2], unsigned hits[2], +H5PB_get_stats(const H5PB_t *pb_ptr, unsigned accesses[2], unsigned hits[2], unsigned misses[2], unsigned evictions[2], unsigned bypasses[2]) { FUNC_ENTER_NOAPI_NOERR /* Sanity checks */ - HDassert(page_buf); - - accesses[0] = page_buf->accesses[0]; - accesses[1] = page_buf->accesses[1]; - hits[0] = page_buf->hits[0]; - hits[1] = page_buf->hits[1]; - misses[0] = page_buf->misses[0]; - misses[1] = page_buf->misses[1]; - evictions[0] = page_buf->evictions[0]; - evictions[1] = page_buf->evictions[1]; - bypasses[0] = page_buf->bypasses[0]; - bypasses[1] = page_buf->bypasses[1]; + HDassert(pb_ptr); + + accesses[0] = (unsigned)pb_ptr->accesses[0]; + accesses[1] = (unsigned)pb_ptr->accesses[1]; + accesses[2] = (unsigned)pb_ptr->accesses[2]; + hits[0] = (unsigned)pb_ptr->hits[0]; + hits[1] = (unsigned)pb_ptr->hits[1]; + hits[2] = (unsigned)pb_ptr->hits[2]; + misses[0] = (unsigned)pb_ptr->misses[0]; + misses[1] = (unsigned)pb_ptr->misses[1]; + misses[2] = (unsigned)pb_ptr->misses[2]; + evictions[0] = (unsigned)pb_ptr->evictions[0]; + evictions[1] = (unsigned)pb_ptr->evictions[1]; + evictions[2] = (unsigned)pb_ptr->evictions[2]; + bypasses[0] = (unsigned)pb_ptr->bypasses[0]; + bypasses[1] = (unsigned)pb_ptr->bypasses[1]; + bypasses[2] = (unsigned)pb_ptr->bypasses[2]; FUNC_LEAVE_NOAPI(SUCCEED) } /* 
H5PB_get_stats */ /*------------------------------------------------------------------------- + * * Function: H5PB_print_stats() * - * Purpose: This function was created without documentation. - * What follows is my best understanding of Mohamad's intent. + * Purpose: Print out statistics collected for the page buffer layer. * - * Print out statistics collected for the page buffer layer. + * Return: Non-negative on success/Negative on failure * - * Return: Non-negative on success/Negative on failure + * Programmer: John Mainzer -- 10/12/18 * - * Programmer: Mohamad Chaarawi + * Changes: None. * *------------------------------------------------------------------------- */ herr_t -H5PB_print_stats(const H5PB_t *page_buf) +H5PB_print_stats(const H5PB_t *pb_ptr) { + double ave_succ_search_depth = 0.0L; + double ave_failed_search_depth = 0.0L; + double ave_delayed_write = 0.0L; + double ave_delayed_write_ins_depth = 0.0L; + FUNC_ENTER_NOAPI_NOINIT_NOERR - HDassert(page_buf); + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + + HDfprintf(stdout, "\n\nPage Buffer Statistics (raw/meta/mpmde): \n\n"); + + HDfprintf(stdout, "bypasses = %lld (%lld/%lld/%lld)\n", + (pb_ptr->bypasses[0] + pb_ptr->bypasses[1] + pb_ptr->bypasses[2]), + pb_ptr->bypasses[0], pb_ptr->bypasses[1], pb_ptr->bypasses[2]); + + HDfprintf(stdout, "acesses = %lld (%lld/%lld/%lld)\n", + (pb_ptr->accesses[0] + pb_ptr->accesses[1] + pb_ptr->accesses[2]), + pb_ptr->accesses[0], pb_ptr->accesses[1], pb_ptr->accesses[2]); + + HDfprintf(stdout, "hits = %lld (%lld/%lld/%lld)\n", + (pb_ptr->hits[0] + pb_ptr->hits[1] + pb_ptr->hits[2]), + pb_ptr->hits[0], pb_ptr->hits[1], pb_ptr->hits[2]); + + HDfprintf(stdout, "misses = %lld (%lld/%lld/%lld)\n", + (pb_ptr->misses[0] + pb_ptr->misses[1] + pb_ptr->misses[2]), + pb_ptr->misses[0], pb_ptr->misses[1], pb_ptr->misses[2]); + + HDfprintf(stdout, "loads = %lld (%lld/%lld/%lld)\n", + (pb_ptr->loads[0] + pb_ptr->loads[1] + pb_ptr->loads[2]), + 
pb_ptr->loads[0], pb_ptr->loads[1], pb_ptr->loads[2]); + + HDfprintf(stdout, "insertions = %lld (%lld/%lld/%lld)\n", + (pb_ptr->insertions[0] + pb_ptr->insertions[1] + + pb_ptr->insertions[2]), + pb_ptr->insertions[0], pb_ptr->insertions[1], + pb_ptr->insertions[2]); + + HDfprintf(stdout, "flushes = %lld (%lld/%lld/%lld)\n", + (pb_ptr->flushes[0] + pb_ptr->flushes[1] + pb_ptr->flushes[2]), + pb_ptr->flushes[0], pb_ptr->flushes[1], pb_ptr->flushes[2]); + + HDfprintf(stdout, "evictions = %lld (%lld/%lld/%lld)\n", + (pb_ptr->evictions[0] + pb_ptr->evictions[1] + + pb_ptr->evictions[2]), + pb_ptr->evictions[0], pb_ptr->evictions[1], pb_ptr->evictions[2]); + + HDfprintf(stdout, "clears = %lld (%lld/%lld/%lld)\n", + (pb_ptr->clears[0] + pb_ptr->clears[1] + pb_ptr->clears[2]), + pb_ptr->clears[0], pb_ptr->clears[1], pb_ptr->clears[2]); + + HDfprintf(stdout, "max LRU len / size = %lld / %lld\n", + pb_ptr->max_lru_len, pb_ptr->max_lru_size); + + HDfprintf(stdout, + "LRU make space md/rd/tl/dwl skips = %lld/%lld/%lld/%lld\n", + pb_ptr->lru_md_skips, pb_ptr->lru_rd_skips, + pb_ptr->lru_tl_skips, pb_ptr->lru_dwl_skips); + + HDfprintf(stdout, "hash table insertions / deletions = %lld / %lld\n", + pb_ptr->total_ht_insertions, pb_ptr->total_ht_deletions); + + if ( pb_ptr->successful_ht_searches > 0 ) { + + ave_succ_search_depth = + (double)(pb_ptr->total_successful_ht_search_depth) / + (double)(pb_ptr->successful_ht_searches); + } + HDfprintf(stdout, "successful ht searches / ave depth = %lld / %llf\n", + pb_ptr->successful_ht_searches, ave_succ_search_depth); + + if ( pb_ptr->failed_ht_searches > 0 ) { - printf("PAGE BUFFER STATISTICS:\n"); + ave_failed_search_depth = + (double)(pb_ptr->total_failed_ht_search_depth) / + (double)(pb_ptr->failed_ht_searches); + } + HDfprintf(stdout, "failed ht searches / ave depth = %lld / %llf\n", + pb_ptr->failed_ht_searches, ave_failed_search_depth); - HDprintf("******* METADATA\n"); - HDprintf("\t Total Accesses: %u\n", page_buf->accesses[0]); 
- HDprintf("\t Hits: %u\n", page_buf->hits[0]); - HDprintf("\t Misses: %u\n", page_buf->misses[0]); - HDprintf("\t Evictions: %u\n", page_buf->evictions[0]); - HDprintf("\t Bypasses: %u\n", page_buf->bypasses[0]); - HDprintf("\t Hit Rate = %f%%\n", ((double)page_buf->hits[0]/(page_buf->accesses[0] - page_buf->bypasses[0]))*100); - HDprintf("*****************\n\n"); + HDfprintf(stdout, "max index length / size = %lld / %lld\n", + pb_ptr->max_index_len, pb_ptr->max_index_size); - HDprintf("******* RAWDATA\n"); - HDprintf("\t Total Accesses: %u\n", page_buf->accesses[1]); - HDprintf("\t Hits: %u\n", page_buf->hits[1]); - HDprintf("\t Misses: %u\n", page_buf->misses[1]); - HDprintf("\t Evictions: %u\n", page_buf->evictions[1]); - HDprintf("\t Bypasses: %u\n", page_buf->bypasses[1]); - HDprintf("\t Hit Rate = %f%%\n", ((double)page_buf->hits[1]/(page_buf->accesses[1]-page_buf->bypasses[0]))*100); - HDprintf("*****************\n\n"); + HDfprintf(stdout, "max rd / md / mpmde entries = %lld / %lld / %lld\n", + pb_ptr->max_rd_pages, pb_ptr->max_md_pages, + pb_ptr->max_mpmde_count); + + HDfprintf(stdout, "tick list max len / size = %lld / %lld\n", + pb_ptr->max_tl_len, pb_ptr->max_tl_size); + + HDfprintf(stdout, "delayed write list max len / size = %lld / %lld\n", + pb_ptr->max_dwl_len, pb_ptr->max_dwl_size); + + if ( pb_ptr->delayed_writes > 0 ) { + + ave_delayed_write = (double)(pb_ptr->total_delay) / + (double)(pb_ptr->delayed_writes); + ave_delayed_write_ins_depth = (double)(pb_ptr->total_dwl_ins_depth) / + (double)(pb_ptr->delayed_writes); + } + HDfprintf(stdout, + "delayed writes / ave delay / ave ins depth = %lld / %llf / %llf\n", + pb_ptr->delayed_writes, ave_delayed_write, ave_delayed_write_ins_depth); FUNC_LEAVE_NOAPI(SUCCEED) + } /* H5PB_print_stats */ /*------------------------------------------------------------------------- + * + * Function: H5PB_add_new_page + * + * Purpose: Insert a new blank page to the page buffer if the page + * buffer is configured to 
allow pages of the specified + * type. + * + * This function is called by the + * from the MF layer when a new page is allocated to + * indicate to the page buffer layer that a read of the page + * from the file is not necessary since it's an empty page. + * + * Note that this function inserts the new page without + * attempting to make space. This can result in the page + * buffer exceeding its maximum size. + * + * Note also that it is possible that the page (marked clean) + * will be evicted before its first use. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer -- 10/12/18 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +herr_t +H5PB_add_new_page(H5F_t *f, H5FD_mem_t type, haddr_t page_addr) +{ + hbool_t can_insert = TRUE; + H5PB_t *pb_ptr = NULL; + H5PB_entry_t *entry_ptr = NULL; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->pb_ptr); + + pb_ptr = f->shared->pb_ptr; + + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + + if ( H5FD_MEM_DRAW == type ) { /* raw data page insertion */ + + if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) { + + can_insert = FALSE; + + } + } else { /* metadata page insertion */ + + if ( pb_ptr->min_rd_pages == pb_ptr->max_pages ) { + + can_insert = FALSE; + } + } + + if ( can_insert ) { + + if ( H5PB__create_new_page(pb_ptr, page_addr, + (size_t)(pb_ptr->page_size), + type, TRUE, &entry_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "new page buffer page creation failed.") + + /* updates stats */ + H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr); + } + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB_add_new_page */ + + +/*------------------------------------------------------------------------- + * * Function: H5PB_create * - * Purpose: Create and setup the PB on the file. 
+ * Purpose: Setup a page buffer for the supplied file. * * Return: Non-negative on success/Negative on failure * - * Programmer: Mohamad Chaarawi + * Programmer: John Mainzer -- 10/11/18 + * + * Changes: None. * *------------------------------------------------------------------------- */ herr_t -H5PB_create(H5F_t *f, size_t size, unsigned page_buf_min_meta_perc, unsigned page_buf_min_raw_perc) +H5PB_create(H5F_t *f, size_t size, unsigned page_buf_min_meta_perc, + unsigned page_buf_min_raw_perc) { - H5PB_t *page_buf = NULL; + int i; + int32_t min_md_pages; + int32_t min_rd_pages; + H5PB_t *pb_ptr = NULL; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) @@ -298,1221 +492,3134 @@ H5PB_create(H5F_t *f, size_t size, unsigned page_buf_min_meta_perc, unsigned pag /* Sanity checks */ HDassert(f); HDassert(f->shared); + HDassert(page_buf_min_meta_perc <= 100); + HDassert(page_buf_min_raw_perc <= 100); + HDassert((page_buf_min_meta_perc + page_buf_min_raw_perc) <= 100); /* Check args */ - if(f->shared->fs_strategy != H5F_FSPACE_STRATEGY_PAGE) - HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, FAIL, "Enabling Page Buffering requires PAGE file space strategy") - /* round down the size if it is larger than the page size */ - else if(size > f->shared->fs_page_size) { + if ( f->shared->fs_strategy != H5F_FSPACE_STRATEGY_PAGE ) + + HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, FAIL, \ + "Enabling Page Buffering requires PAGE file space strategy") + + else if ( size > f->shared->fs_page_size ) { + + /* round size down to the next multiple of fs_page_size */ + hsize_t temp_size; temp_size = (size / f->shared->fs_page_size) * f->shared->fs_page_size; + H5_CHECKED_ASSIGN(size, size_t, temp_size, hsize_t); - } /* end if */ - else if(0 != size % f->shared->fs_page_size) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTINIT, FAIL, "Page Buffer size must be >= to the page size") - /* Allocate the new page buffering structure */ - if(NULL == (page_buf = H5FL_CALLOC(H5PB_t))) - HGOTO_ERROR(H5E_PAGEBUF, 
H5E_NOSPACE, FAIL, "memory allocation failed") + } /* end if */ + else if ( 0 != size % f->shared->fs_page_size ) - page_buf->max_size = size; - H5_CHECKED_ASSIGN(page_buf->page_size, size_t, f->shared->fs_page_size, hsize_t); - page_buf->min_meta_perc = page_buf_min_meta_perc; - page_buf->min_raw_perc = page_buf_min_raw_perc; + HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTINIT, FAIL, \ + "Page Buffer size must be >= to the page size") /* Calculate the minimum page count for metadata and raw data * based on the fractions provided */ - page_buf->min_meta_count = (unsigned)((size * page_buf_min_meta_perc) / (f->shared->fs_page_size * 100)); - page_buf->min_raw_count = (unsigned)((size * page_buf_min_raw_perc) / (f->shared->fs_page_size * 100)); + min_md_pages = (int32_t)((size * page_buf_min_meta_perc) / + (f->shared->fs_page_size * 100)); + min_rd_pages = (int32_t)((size * page_buf_min_raw_perc) / + (f->shared->fs_page_size * 100)); + HDassert(min_md_pages >= 0); + HDassert(min_rd_pages >= 0); + HDassert((min_md_pages + min_rd_pages) <= + (int32_t)(size / f->shared->fs_page_size)); + + + /* Allocate the new page buffering structure */ + if(NULL == (pb_ptr = H5FL_MALLOC(H5PB_t))) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, "memory allocation failed") + + /* initialize the new instance of H5PB_t */ + + pb_ptr->magic = H5PB__H5PB_T_MAGIC; + pb_ptr->page_size = f->shared->fs_page_size; + H5_CHECKED_ASSIGN(pb_ptr->page_size, size_t, \ + f->shared->fs_page_size, hsize_t); + pb_ptr->max_pages = (int32_t)(size / f->shared->fs_page_size); + pb_ptr->curr_pages = 0; + pb_ptr->curr_md_pages = 0; + pb_ptr->curr_rd_pages = 0; + pb_ptr->min_md_pages = min_md_pages; + pb_ptr->min_rd_pages = min_rd_pages; + + pb_ptr->max_size = size; + pb_ptr->min_meta_perc = page_buf_min_meta_perc; + pb_ptr->min_raw_perc = page_buf_min_raw_perc; + + /* index */ + for ( i = 0; i < H5PB__HASH_TABLE_LEN; i++ ) + pb_ptr->ht[i] = NULL; + pb_ptr->index_len = 0; + pb_ptr->index_size = 0; + + /* LRU */ + 
pb_ptr->LRU_len = 0; + pb_ptr->LRU_size = 0; + pb_ptr->LRU_head_ptr = NULL; + pb_ptr->LRU_tail_ptr = NULL; + + + /* VFD SWMR specific fields. + * The following fields are defined iff vfd_swmr_writer is TRUE. + */ + pb_ptr->vfd_swmr_writer = FALSE; + pb_ptr->mpmde_count = 0; + pb_ptr->cur_tick = 0; - if(NULL == (page_buf->slist_ptr = H5SL_create(H5SL_TYPE_HADDR, NULL))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTCREATE, FAIL, "can't create skip list") - if(NULL == (page_buf->mf_slist_ptr = H5SL_create(H5SL_TYPE_HADDR, NULL))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTCREATE, FAIL, "can't create skip list") + /* delayed write list */ + pb_ptr->max_delay = 0; + pb_ptr->dwl_len = 0; + pb_ptr->dwl_size = 0; + pb_ptr->dwl_head_ptr = NULL; + pb_ptr->dwl_tail_ptr = NULL; - if(NULL == (page_buf->page_fac = H5FL_fac_init(page_buf->page_size))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTINIT, FAIL, "can't create page factory") + /* tick list */ + pb_ptr->tl_len = 0; + pb_ptr->tl_size = 0; + pb_ptr->tl_head_ptr = NULL; + pb_ptr->tl_tail_ptr = NULL; - f->shared->page_buf = page_buf; + H5PB_reset_stats(pb_ptr); + + f->shared->pb_ptr = pb_ptr; done: - if(ret_value < 0) { - if(page_buf != NULL) { - if(page_buf->slist_ptr != NULL) - H5SL_close(page_buf->slist_ptr); - if(page_buf->mf_slist_ptr != NULL) - H5SL_close(page_buf->mf_slist_ptr); - if(page_buf->page_fac != NULL) - H5FL_fac_term(page_buf->page_fac); - page_buf = H5FL_FREE(H5PB_t, page_buf); - } /* end if */ - } /* end if */ + + if ( ret_value < 0 ) { + + if ( pb_ptr != NULL ) { + + pb_ptr = H5FL_FREE(H5PB_t, pb_ptr); + + } + } FUNC_LEAVE_NOAPI(ret_value) + } /* H5PB_create */ /*------------------------------------------------------------------------- - * Function: H5PB__flush_cb * - * Purpose: Callback to flush PB skiplist entries. + * Function: H5PB_dest + * + * Purpose: Flush (if necessary) and evict all entries in the page + * buffer, and then discard the page buffer. 
* * Return: Non-negative on success/Negative on failure * - * Programmer: Mohamad Chaarawi + * Programmer: John Mainzer -- 10/22/18 + * + * Changes: None. * *------------------------------------------------------------------------- */ -static herr_t -H5PB__flush_cb(void *item, void H5_ATTR_UNUSED *key, void *_op_data) +herr_t +H5PB_dest(H5F_t *f) { - H5PB_entry_t *page_entry = (H5PB_entry_t *)item; /* Pointer to page entry node */ - H5F_t *f = (H5F_t *)_op_data; + int i; + H5PB_t *pb_ptr = NULL; + H5PB_entry_t *entry_ptr = NULL; + H5PB_entry_t *evict_ptr = NULL; herr_t ret_value = SUCCEED; /* Return value */ - FUNC_ENTER_STATIC + FUNC_ENTER_NOAPI(FAIL) - /* Sanity checks */ - HDassert(page_entry); + /* Sanity check */ HDassert(f); + HDassert(f->shared); - /* Flush the page if it's dirty */ - if(page_entry->is_dirty) - if(H5PB__write_entry(f, page_entry) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, "file write failed") + /* flush and destroy the page buffer, if it exists */ + if ( f->shared->pb_ptr ) { + + pb_ptr = f->shared->pb_ptr; + + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + + /* the current implementation if very inefficient, and will + * fail if there are any outstanding delayed writes -- must fix this + */ + for ( i = 0; i < H5PB__HASH_TABLE_LEN; i++ ) { + + entry_ptr = pb_ptr->ht[i]; + + while ( entry_ptr ) { + + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + + evict_ptr = entry_ptr; + entry_ptr = entry_ptr->ht_next; + + if ( evict_ptr->is_dirty ) { + + if ( H5PB__flush_entry(f, pb_ptr, evict_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "Can't flush entry") + } + + if ( H5PB__evict_entry(pb_ptr, evict_ptr, TRUE) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "forced eviction failed") + + entry_ptr = pb_ptr->ht[i]; + } + } + + /* regular operations fields */ + HDassert(pb_ptr->curr_pages == 0); + HDassert(pb_ptr->curr_md_pages == 0); + HDassert(pb_ptr->curr_rd_pages == 0); + HDassert(pb_ptr->index_len 
== 0); + HDassert(pb_ptr->index_size == 0); + HDassert(pb_ptr->LRU_len == 0); + HDassert(pb_ptr->LRU_size == 0); + HDassert(pb_ptr->LRU_head_ptr == NULL); + HDassert(pb_ptr->LRU_tail_ptr == NULL); + + /* VFD SWMR fields */ + HDassert(pb_ptr->dwl_len == 0); + HDassert(pb_ptr->dwl_size == 0); + HDassert(pb_ptr->dwl_head_ptr == NULL); + HDassert(pb_ptr->dwl_tail_ptr == NULL); + + HDassert(pb_ptr->tl_len == 0); + HDassert(pb_ptr->tl_size == 0); + HDassert(pb_ptr->tl_head_ptr == NULL); + HDassert(pb_ptr->tl_tail_ptr == NULL); + + pb_ptr->magic = 0; + f->shared->pb_ptr = H5FL_FREE(H5PB_t, pb_ptr); + } done: + FUNC_LEAVE_NOAPI(ret_value) -} /* H5PB__flush_cb() */ + +} /* H5PB_dest */ /*------------------------------------------------------------------------- + * * Function: H5PB_flush * - * Purpose: Flush/Free all the PB entries to the file. + * Purpose: If the page buffer is defined, flush all entries. * * Return: Non-negative on success/Negative on failure * - * Programmer: Mohamad Chaarawi + * Programmer: John Mainzer -- 10/22/18 + * + * Changes: None. 
* *------------------------------------------------------------------------- */ herr_t H5PB_flush(H5F_t *f) { + int i; + H5PB_t *pb_ptr = NULL; + H5PB_entry_t *entry_ptr = NULL; + H5PB_entry_t *flush_ptr = NULL; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) /* Sanity check */ HDassert(f); + HDassert(f->shared); - /* Flush all the entries in the PB skiplist, if we have write access on the file */ - if(f->shared->page_buf && (H5F_ACC_RDWR & H5F_INTENT(f))) { - H5PB_t *page_buf = f->shared->page_buf; + pb_ptr = f->shared->pb_ptr; - /* Iterate over all entries in page buffer skip list */ - if(H5SL_iterate(page_buf->slist_ptr, H5PB__flush_cb, (void *)f)) - HGOTO_ERROR(H5E_PAGEBUF, H5E_BADITER, FAIL, "can't flush page buffer skip list") - } /* end if */ + if ( pb_ptr ) { + + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + + /* the current implementation if very inefficient, and will + * fail if there are any delayed writes -- must fix this + */ + for ( i = 0; i < H5PB__HASH_TABLE_LEN; i++ ) { + + entry_ptr = pb_ptr->ht[i]; + + while ( entry_ptr ) { + + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + + flush_ptr = entry_ptr; + entry_ptr = entry_ptr->ht_next; + + if ( flush_ptr->is_dirty ) { + + if ( H5PB__flush_entry(f, pb_ptr, flush_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "Can't flush entry") + } + } + } + } done: + FUNC_LEAVE_NOAPI(ret_value) + } /* H5PB_flush */ /*------------------------------------------------------------------------- - * Function: H5PB__dest_cb * - * Purpose: Callback to free PB skiplist entries. + * Function: H5PB_page_exists + * + * Purpose: Test to see if a page buffer page exists at the specified + * address. Set *page_exists_ptr to TRUE or FALSE accordingly. + * + * This function exists for the convenience of the test + * code * * Return: Non-negative on success/Negative on failure * - * Programmer: Mohamad Chaarawi + * Programmer: John Mainzer -- 10/22/18 + * + * Changes: None. 
* *------------------------------------------------------------------------- */ -static herr_t -H5PB__dest_cb(void *item, void H5_ATTR_UNUSED *key, void *_op_data) +herr_t +H5PB_page_exists(H5F_t *f, haddr_t addr, hbool_t *page_exists_ptr) { - H5PB_entry_t *page_entry = (H5PB_entry_t *)item; /* Pointer to page entry node */ - H5PB_ud1_t *op_data = (H5PB_ud1_t *)_op_data; + uint64_t page; + H5PB_t *pb_ptr = NULL; + H5PB_entry_t *entry_ptr = NULL; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) - FUNC_ENTER_STATIC_NOERR + /* Sanity check */ + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->pb_ptr); - /* Sanity checking */ - HDassert(page_entry); - HDassert(op_data); - HDassert(op_data->page_buf); + pb_ptr = f->shared->pb_ptr; - /* Remove entry from LRU list */ - if(op_data->actual_slist) { - H5PB__REMOVE_LRU(op_data->page_buf, page_entry) - page_entry->page_buf_ptr = H5FL_FAC_FREE(op_data->page_buf->page_fac, page_entry->page_buf_ptr); - } /* end if */ + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(page_exists_ptr); - /* Free page entry */ - page_entry = H5FL_FREE(H5PB_entry_t, page_entry); + /* Calculate the page offset */ + page = (addr / pb_ptr->page_size); - FUNC_LEAVE_NOAPI(SUCCEED) -} /* H5PB__dest_cb() */ + /* the supplied address should be page aligned */ + HDassert(addr == page * pb_ptr->page_size); + + /* Search for page in the hash table */ + H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL) + + HDassert((NULL == entry_ptr) || (entry_ptr->addr == addr)); + + *page_exists_ptr = ( entry_ptr != NULL ); + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB_page_exists */ /*------------------------------------------------------------------------- - * Function: H5PB_dest * - * Purpose: Flush and destroy the PB on the file if it exists. + * Function: H5PB_read + * + * Purpose: Satisfy the read from the page buffer if possible. 
+ * + * 1) If the page buffer is disabled, simply read from the + * HDF5 file and return. + * + * 2) If the read is for raw data, and the page buffer is + * configured for metadata only (i.e. min_md_pages == + * max_pages), simply read from the HDF5 file and return. + * + * 3) If the read is for raw data, and it is of page size or + * larger, read it directly from the HDF5 file. + * + * It is possible that the page buffer contains dirty pages + * that intersect with the read -- test for this and update + * the read buffer from the page buffer if any such pages + * exist. + * + * Note that no pages are inserted into the page buffer in + * this case. + * + * 4) If the read is for raw data, and it is of size less + * than the page size, satisfy the read from the page + * buffer, loading and inserting pages into the + * page buffer as necessary + * + * 5) If the read is for metadata, and the page buffer is + * configured for raw data only (i.e. min_rd_pages == + * max_pages), simply read from the HDF5 file and return. + * + * The free space manager guarantees that allocations larger + * than one page will be page aligned, and that allocations + * of size less than or equal to page size will not cross page + * boundaries. Further, unlike raw data, metadata is always + * written and read atomically. + * + * In principle, this should make it easy to discriminate + * between small and multi-page metadata entries so that + * pages containing the former will be buffered and the + * latter be read directly from file. + * + * Unfortunately, the metadata cache does not always know the + * size of metadata entries when it tries to read them. In + * such cases, it issues speculative reads that may be either + * smaller or larger than the actual size of the piece of + * metadata that is finally read. 
+ * + * Since we are guaranteed that all metadata allocations larger + * than one page are page aligned, we can safely clip at the + * page boundary any non page aligned metadata read that crosses + * page boundaries. + * + * However, page aligned reads could wind up being either + * small or multi-page. This results in two scenarios that + * we must handle: + * + * a) A page aligned read of size less than one page + * turns out to be multi-page. + * + * In this case, the initial speculative read will + * result in a page load and insertion into the page + * buffer. This page must be evicted on the subsequent + * read of size greater than page size. + * + * In the context of VFD SWMR, it is also possible that + * the multi-page metadata entry is already in the + * page buffer -- in which case the initial read should + * be satisfied from the multi-page page buffer entry. + * + * b) A page aligned, larger than one page read turns out + * to be small (less than one page). + * + * If there is already a page in the page buffer with + * the same address, we can safely clip the original + * read to page size + * + * The above considerations resolve into the following cases: + * + * 6) If the read is for metadata and not page aligned, clip + * the read to the end of the current page if necessary. + * Load the relevant page if necessary and satisfy the + * read from the page buffer. Note that if there is an + * existing page, it must not be a multi-page metadata + * entry. If it is, flag an error. + * + * 7) If the read is for metadata, is page aligned, is larger + * than one page, and there is no entry in the page buffer, + * satisfy the read from the file + * + * 8) If the read is for metadata, is page aligned, is larger + * than one page, and there is a regular entry at the target + * page address, test to see if the last read was for the + * same address. + * + * If it was, evict the page, and satisfy the read from file. + * Flag an error if the page was dirty. 
+ * + * If the last read was for a different page, clip the read + * to one page, and satisfy the read from the existing + * regular entry. + * + * 9) If the read is for metadata, is page aligned, is larger + * than one page, and there is a multi-page metadata entry + * at the target page address, test to see if + * pb_ptr->vfd_swmr_writer is TRUE. + * + * If it is, satisfy the read from the multi-page metadata + * entry, clipping the read if necessary. + * + * If pb_ptr->vfd_swmr_writer is FALSE, flag an error. + * + * 10) If the read is for metadata, is page aligned, is no + * larger than a page, test to see if the page buffer + * contains a page at the target address. + * + * If it doesn't, load the page and satisfy the read + * from it. + * + * If it contains a regular page entry, satisfy the read + * from it. + * + * If it contains a multi-page metadata entry at the target + * address, satisfy the read from the multi-page metadata + * entry if pb_ptr->vfd_swmr_writer is TRUE, and flag an + * error otherwise. + * + * Observe that this function handles cases 1, 2, and 5 + * directly, calls H5PB__read_raw() for cases 3 & 4, and + * calls H5PB__read_meta() for cases 6, 7, 8, 9, and 10. * * Return: Non-negative on success/Negative on failure * - * Programmer: Mohamad Chaarawi + * Programmer: John Mainzer -- 10/11/18 + * + * Changes: None. 
* *------------------------------------------------------------------------- */ herr_t -H5PB_dest(H5F_t *f) +H5PB_read(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, + void *buf/*out*/) { - herr_t ret_value = SUCCEED; /* Return value */ + H5PB_t *pb_ptr; /* Page buffer for this file */ + hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */ + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) /* Sanity checks */ HDassert(f); + HDassert(f->shared); + HDassert(type != H5FD_MEM_GHEAP); - /* flush and destroy the page buffer, if it exists */ - if(f->shared->page_buf) { - H5PB_t *page_buf = f->shared->page_buf; - H5PB_ud1_t op_data; /* Iteration context */ + pb_ptr = f->shared->pb_ptr; + + if ( pb_ptr == NULL ) { + + bypass_pb = TRUE; /* case 1) -- page buffer is disabled */ + + } else { + + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + + if ( H5FD_MEM_DRAW == type ) { /* raw data read */ + + if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) { - if(H5PB_flush(f) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTFLUSH, FAIL, "can't flush page buffer") + /* case 2) -- page buffer configured for metadata only */ + bypass_pb = TRUE; - /* Set up context info */ - op_data.page_buf = page_buf; + } + } else { /* metadata read */ - /* Destroy the skip list containing all the entries in the PB */ - op_data.actual_slist = TRUE; - if(H5SL_destroy(page_buf->slist_ptr, H5PB__dest_cb, &op_data)) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTCLOSEOBJ, FAIL, "can't destroy page buffer skip list") + if ( pb_ptr->min_rd_pages == pb_ptr->max_pages ) { - /* Destroy the skip list containing the new entries */ - op_data.actual_slist = FALSE; - if(H5SL_destroy(page_buf->mf_slist_ptr, H5PB__dest_cb, &op_data)) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTCLOSEOBJ, FAIL, "can't destroy page buffer skip list") + /* case 5) -- page buffer configured for raw data only */ + bypass_pb = TRUE; + } + } + } - /* Destroy the page factory */ - if(H5FL_fac_term(page_buf->page_fac) < 0) - 
HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTRELEASE, FAIL, "can't destroy page buffer page factory") +#ifdef H5_HAVE_PARALLEL + /* at present, the page buffer must be disabled in the parallel case. + * However, just in case ... + */ + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { + + bypass_pb = TRUE; - f->shared->page_buf = H5FL_FREE(H5PB_t, page_buf); } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + + if ( bypass_pb ) { /* cases 1, 2. and 5 */ + +#if VFD_IO + if ( H5FD_read(f->shared->lf, type, addr, size, buf) < 0 ) +#else /* VFD_IO */ + if ( H5F__accum_read(f, type, addr, size, buf) < 0 ) +#endif /* VFD_IO */ + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "read through metadata accumulator failed") + + /* Update statistics */ + if ( pb_ptr ) { + + H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size); + } + } else { + + if ( H5FD_MEM_DRAW == type ) { /* cases 3 and 4 */ + + if ( H5PB__read_raw(f, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_raw() failed") + + } else { /* cases 6, 7, 8, 9, and 10 */ + + if ( H5PB__read_meta(f, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed") + } + + H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); + } done: + FUNC_LEAVE_NOAPI(ret_value) -} /* H5PB_dest */ + +} /* end H5PB_read() */ /*------------------------------------------------------------------------- - * Function: H5PB_add_new_page * - * Purpose: Add a new page to the new page skip list. This is called - * from the MF layer when a new page is allocated to - * indicate to the page buffer layer that a read of the page - * from the file is not necessary since it's an empty page. + * Function: H5PB_remove_entry * - * Return: Non-negative on success/Negative on failure + * Purpose: Remove possible metadata entry with ADDR from the PB cache. + * This is in response to the data corruption bug from fheap.c + * with page buffering + page strategy. 
+ * Note: Large metadata page bypasses the PB cache. + * Note: Update of raw data page (large or small sized) is + * handled by the PB cache. * - * Programmer: Mohamad Chaarawi + * Return: Non-negative on success/Negative on failure + * + * Programmer: Vailin Choi; Feb 2017 + * + * Changes: Reworked function for re-implementation of the page buffer. + * + * Vailin: I think we need to do this for raw data as well. + * + * JRM -- 10/23/18 * *------------------------------------------------------------------------- */ -herr_t -H5PB_add_new_page(H5F_t *f, H5FD_mem_t type, haddr_t page_addr) +herr_t +H5PB_remove_entry(const H5F_t *f, haddr_t addr) { - H5PB_t *page_buf = f->shared->page_buf; - H5PB_entry_t *page_entry = NULL; /* pointer to the corresponding page entry */ - herr_t ret_value = SUCCEED; /* Return value */ + uint64_t page; + H5PB_t *pb_ptr = NULL; + H5PB_entry_t *entry_ptr = NULL; + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) /* Sanity checks */ - HDassert(page_buf); + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->pb_ptr); - /* If there is an existing page, this means that at some point the - * file free space manager freed and re-allocated a page at the same - * address. No need to do anything here then... 
- */ - /* MSC - to be safe, might want to dig in the MF layer and remove - * the page when it is freed from this list if it still exists and - * remove this check - */ - if(NULL == H5SL_search(page_buf->mf_slist_ptr, &(page_addr))) { - /* Create the new PB entry */ - if(NULL == (page_entry = H5FL_CALLOC(H5PB_entry_t))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, "memory allocation failed") - - /* Initialize page fields */ - page_entry->addr = page_addr; - page_entry->type = (H5F_mem_page_t)type; - page_entry->is_dirty = FALSE; - - /* Insert entry in skip list */ - if(H5SL_insert(page_buf->mf_slist_ptr, page_entry, &(page_entry->addr)) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_BADVALUE, FAIL, "Can't insert entry in skip list") - } /* end if */ + pb_ptr = f->shared->pb_ptr; + + /* Calculate the page offset */ + page = (addr / pb_ptr->page_size); + + HDassert(addr == page * pb_ptr->page_size); + + /* Search for page in the hash table */ + H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL) + + if ( entry_ptr ) { + + HDassert(entry_ptr->addr == addr); + HDassert(entry_ptr->size == pb_ptr->page_size); + + /* if the entry is dirty, mark it clean before we evict */ + if ( ( entry_ptr->is_dirty ) && + ( H5PB__mark_entry_clean(pb_ptr, entry_ptr) < 0 ) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "mark entry clean failed") + + if ( H5PB__evict_entry(pb_ptr, entry_ptr, TRUE) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "forced eviction failed") + + } done: - if(ret_value < 0) - if(page_entry) - page_entry = H5FL_FREE(H5PB_entry_t, page_entry); FUNC_LEAVE_NOAPI(ret_value) -} /* H5PB_add_new_page */ + +} /* H5PB_remove_entry */ /*------------------------------------------------------------------------- + * * Function: H5PB_update_entry * - * Purpose: In PHDF5, entries that are written by other processes and just - * marked clean by this process have to have their corresponding - * pages updated if they exist in the page buffer. 
- * This routine checks and update the pages. + * Purpose: In PHDF5, metadata cache entries that are written by other + * processes are simply marked clean in the current process. + * However, if the page buffer is enabled, entries marked + * clean must still be written to the page buffer so as to + * keep the contents of metadata pages consistent on all + * processes. + * + * Do this as follows: + * + * 1) Test to see if the page buffer is configured to accept + * metadata pages. If it isn't, return. + * + * 2) Test to see if the page buffer contains the page that + * contains the supplied metadata cache entry. If it + * doesn't, return. + * + * 3) Write the supplied buffer to page at the appropriate + * offset. + * + * Note that at present, page buffering is disabled in the + * parallel case. Thus this function has not been tested. * * Return: Non-negative on success/Negative on failure * - * Programmer: Mohamad Chaarawi + * Programmer: John Mainzer -- 10/23/18 + * + * Changes: None. * *------------------------------------------------------------------------- */ herr_t -H5PB_update_entry(H5PB_t *page_buf, haddr_t addr, size_t size, const void *buf) +H5PB_update_entry(H5PB_t *pb_ptr, haddr_t addr, size_t size, const void *buf) { - H5PB_entry_t *page_entry; /* Pointer to the corresponding page entry */ + uint64_t page; + size_t offset; + H5PB_entry_t *entry_ptr = NULL; haddr_t page_addr; + herr_t ret_value = SUCCEED; /* Return value */ - FUNC_ENTER_NOAPI_NOERR + FUNC_ENTER_NOAPI(FAIL) /* Sanity checks */ - HDassert(page_buf); - HDassert(size <= page_buf->page_size); + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(size > 0); + HDassert(size <= pb_ptr->page_size); HDassert(buf); - /* calculate the aligned address of the first page */ - page_addr = (addr / page_buf->page_size) * page_buf->page_size; + if ( pb_ptr->min_rd_pages < pb_ptr->max_pages ) { - /* search for the page and update if found */ - page_entry = (H5PB_entry_t 
*)H5SL_search(page_buf->slist_ptr, (void *)(&page_addr)); - if(page_entry) { - haddr_t offset; + /* page buffer is configured to accept metadata pages */ - HDassert(addr + size <= page_addr + page_buf->page_size); - offset = addr - page_addr; - HDmemcpy((uint8_t *)page_entry->page_buf_ptr + offset, buf, size); + /* Calculate the aligned address of the containing page */ + page = (addr / pb_ptr->page_size); + page_addr = page * pb_ptr->page_size; - /* move to top of LRU list */ - H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry) - } /* end if */ + H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL) + + if ( entry_ptr ) { + + HDassert( entry_ptr->is_metadata ); + HDassert( ! (entry_ptr->is_mpmde) ); + HDassert(addr + size <= page_addr + pb_ptr->page_size); + + offset = addr - page_addr; + + HDmemcpy(((uint8_t *)(entry_ptr->image_ptr) + offset), + buf, size); + + /* should we mark the page dirty? If so, replace the following + * with a call to H5PB__mark_entry_dirty() + */ + H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) + } + } + +done: + + FUNC_LEAVE_NOAPI(ret_value) - FUNC_LEAVE_NOAPI(SUCCEED) } /* H5PB_update_entry */ /*------------------------------------------------------------------------- - * Function: H5PB_remove_entry * - * Purpose: Remove possible metadata entry with ADDR from the PB cache. - * This is in response to the data corruption bug from fheap.c - * with page buffering + page strategy. - * Note: Large metadata page bypasses the PB cache. - * Note: Update of raw data page (large or small sized) is handled by the PB cache. + * Function: H5PB_write * - * Return: Non-negative on success/Negative on failure + * Purpose: Write data into the Page Buffer if practical, and to file + * otherwise. Specifically: * - * Programmer: Vailin Choi; Feb 2017 + * 1) If the page buffer is disabled, simply write to the + * HDF5 file and return. + * + * 2) If the write is raw data, and the page buffer is + * configured for metadata only (i.e. 
min_md_pages == + * max_pages), simply write to the HDF5 file and return. + * + * 3) If the write is raw data, and it is of page size or + * larger, write directly to the HDF5 file. + * + * It is possible that the write intersects one or more + * pages in the page buffer -- test for this and update + * any partially written pages, and evict any pages + * that are completely overwritten. + * + * Note that no pages are inserted into the page buffer in + * this case. + * + * 4) If the write is of raw data, and it is of size less + * than the page size, write the page into the page + * buffer, loading and inserting pages into the + * page buffer as necessary + * + * 5) If the write is of metadata, and the page buffer is + * configured for raw data only (i.e. min_rd_pages == + * max_pages), simply write to the HDF5 file and return. + * + * 6) If the write is of metadata, the write is larger than + * one page, and vfd_swmr_writer is FALSE, simply write + * to the HDF5 file. There is no need to check the + * page buffer, as metadata is always written atomically, + * and entries of this size are not buffered in the page + * buffer. + * + * 7) If the write is of metadata, the write is larger than + * one page, and vfd_swmr_writer is TRUE, the write must be + * buffered in the page buffer until the end of the tick. + * + * Create a multi-page metadata entry in the page buffer + * and copy the write into it. Insert the new entry in + * the tick list. + * + * Test to see if the write of the multi-page metadata + * entry must be delayed. If so, place the entry in + * the delayed write list. Otherwise, write the multi-page + * metadata entry to the HDF5 file. + * + * 8) If the write is of metadata, and the write is of size + * less than or equal to the page size, write the data + * into the page buffer, loading and inserting a page + * if necessary. + * + * If, in addition, vfd_swmr_writer is TRUE, add the page + * touched by the write to the tick list. 
+ * + * Observe that this function handles casses 1, 2, 5, and 6 + * directly, calls H5PB_write_raw() for cases 3 & 4, and + * calls H5PB_read_meta() for cases 7, and 8. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer -- 10/11/18 + * + * Changes: None. * *------------------------------------------------------------------------- */ herr_t -H5PB_remove_entry(const H5F_t *f, haddr_t addr) +H5PB_write(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, + const void *buf) { - H5PB_t *page_buf = f->shared->page_buf; - H5PB_entry_t *page_entry = NULL; /* pointer to the page entry being searched */ - herr_t ret_value = SUCCEED; /* Return value */ + H5PB_t *pb_ptr; /* Page buffer for this file */ + hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */ + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) /* Sanity checks */ - HDassert(page_buf); + HDassert(f); + HDassert(f->shared); + HDassert(type != H5FD_MEM_GHEAP); + + pb_ptr = f->shared->pb_ptr; + + if ( pb_ptr == NULL ) { + + bypass_pb = TRUE; /* case 1) -- page buffer is disabled */ + + } else { + + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + + if ( H5FD_MEM_DRAW == type ) { /* raw data read */ - /* Search for address in the skip list */ - page_entry = (H5PB_entry_t *)H5SL_search(page_buf->slist_ptr, (void *)(&addr)); + if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) { - /* If found, remove the entry from the PB cache */ - if(page_entry) { - HDassert(page_entry->type != H5F_MEM_PAGE_DRAW); - if(NULL == H5SL_remove(page_buf->slist_ptr, &(page_entry->addr))) - HGOTO_ERROR(H5E_CACHE, H5E_BADVALUE, FAIL, "Page Entry is not in skip list") + /* case 2) -- page buffer configured for metadata only */ + bypass_pb = TRUE; - /* Remove from LRU list */ - H5PB__REMOVE_LRU(page_buf, page_entry) - HDassert(H5SL_count(page_buf->slist_ptr) == page_buf->LRU_list_len); + } + } else { /* metadata read */ - page_buf->meta_count--; + if ( 
pb_ptr->min_rd_pages == pb_ptr->max_pages ) { + + /* case 5) -- page buffer configured for raw data only */ + bypass_pb = TRUE; + + } else if ( ( size > pb_ptr->page_size ) && + ( ! ( pb_ptr->vfd_swmr_writer ) ) ) { + + /* case 6) -- md read larger than one page and + * pb_ptr->vfd_swmr_writer is FALSE. + */ + bypass_pb = TRUE; + } + } + } + +#ifdef H5_HAVE_PARALLEL + /* at present, the page buffer must be disabled in the parallel case. + * However, just in case ... + */ + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { + + bypass_pb = TRUE; - page_entry->page_buf_ptr = H5FL_FAC_FREE(page_buf->page_fac, page_entry->page_buf_ptr); - page_entry = H5FL_FREE(H5PB_entry_t, page_entry); } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + + if ( bypass_pb ) { /* cases 1, 2. 5, and 6 */ + +#if VFD_IO + if ( H5FD_write(f->shared->lf, type, addr, size, buf) < 0 ) +#else /* VFD_IO */ + if ( H5F__accum_write(f, type, addr, size, buf) < 0 ) +#endif /* VFD_IO */ + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "write through metadata accumulator failed") + + /* Update statistics */ + if ( pb_ptr ) { + + H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size); + } + } else { + + if ( H5FD_MEM_DRAW == type ) { /* cases 3 and 4 */ + + if ( H5PB__write_raw(f, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "H5PB_read_raw() failed") + + } else { /* cases 7, and 8 */ + + if ( H5PB__write_meta(f, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "H5PB_read_meta() failed") + } + + H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); + } done: + FUNC_LEAVE_NOAPI(ret_value) -} /* H5PB_remove_entry */ + +} /* end H5PB_write() */ +/**************************************************************************/ +/***************************** STATIC FUNCTIONS ***************************/ +/**************************************************************************/ + 
/*------------------------------------------------------------------------- - * Function: H5PB_read * - * Purpose: Reads in the data from the page containing it if it exists - * in the PB cache; otherwise reads in the page through the VFD. + * Function: H5PB__allocate_page + * + * Purpose: Allocate an instance of H5PB_entry_t and its associated + * buffer. The supplied size must be greater than or + * equal to pb_ptr->page_size, and equal to that value if + * pb_ptr->vfd_swmr_writer is FALSE. + * + * The associated buffer is zeroed if clean_image is TRUE. + * + * Return: Pointer to the newly allocated instance of H5PB_entry_t + * on success, and NULL on failure. + * + * Programmer: John Mainzer -- 10/12/18 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +static H5PB_entry_t * +H5PB__allocate_page(H5PB_t *pb_ptr, size_t size, hbool_t clean_image) +{ + H5PB_entry_t *entry_ptr = NULL; + void * image_ptr = NULL; + H5PB_entry_t *ret_value = NULL; /* Return value */ + + FUNC_ENTER_NOAPI(NULL) + + /* sanity checks */ + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(size >= pb_ptr->page_size); + HDassert((size == pb_ptr->page_size) || (pb_ptr->vfd_swmr_writer)); + + /* allocate the entry and its associated image buffer */ + if ( NULL == (entry_ptr = H5FL_MALLOC(H5PB_entry_t))) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, NULL, \ + "memory allocation for H5PB_entry_t failed") + + if ( clean_image ) { + + image_ptr = H5MM_calloc(size); + + } else { + + image_ptr = H5MM_malloc(size); + } + + if ( NULL == image_ptr ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, NULL, \ + "memory allocation for page image failed") + + /* initialize the new page buffer entry */ + entry_ptr->magic = H5PB__H5PB_ENTRY_T_MAGIC; + entry_ptr->pb_ptr = pb_ptr; + entry_ptr->addr = HADDR_UNDEF; + entry_ptr->page = 0; + entry_ptr->size = size; + entry_ptr->image_ptr = image_ptr; + entry_ptr->mem_type = H5FD_MEM_DEFAULT; 
+ entry_ptr->is_metadata = FALSE; + entry_ptr->is_mpmde = FALSE; + entry_ptr->is_dirty = FALSE; + + /* fields supporting the hash table */ + entry_ptr->ht_prev = NULL; + entry_ptr->ht_next = NULL; + + /* fields supporting replacement policise */ + entry_ptr->next = NULL; + entry_ptr->prev = NULL; + + /* fields supporting VFD SWMR */ + entry_ptr->is_mpmde = FALSE; + entry_ptr->loaded = FALSE; + entry_ptr->modified_this_tick = FALSE; + entry_ptr->delay_write_until = 0; + entry_ptr->tl_next = NULL; + entry_ptr->tl_prev = NULL; + + ret_value = entry_ptr; + +done: + + if ( NULL == ret_value ) { + + if ( entry_ptr ) { + + entry_ptr->magic = 0; + entry_ptr = H5FL_FREE(H5PB_entry_t, entry_ptr); + } + + if ( image_ptr ) { + + image_ptr = H5MM_xfree(image_ptr); + } + } /* end if */ + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB__allocate_page() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5PB__create_new_page + * + * Purpose: Create a new page and insert it in the page buffer with + * the specified address and type. If entry_ptr_ptr is not + * NULL, return a pointer to the new entry in *entry_ptr_ptr. + * + * Throw an error if a page already exists at the specified + * address. * * Return: Non-negative on success/Negative on failure * - * Programmer: Mohamad Chaarawi + * Programmer: John Mainzer -- 10/12/18 + * + * Changes: None. 
* *------------------------------------------------------------------------- */ -herr_t -H5PB_read(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*out*/) +herr_t +H5PB__create_new_page(H5PB_t *pb_ptr, haddr_t addr, size_t size, + H5FD_mem_t type, hbool_t clean_image, H5PB_entry_t **entry_ptr_ptr) { - H5PB_t *page_buf; /* Page buffering info for this file */ - H5PB_entry_t *page_entry; /* Pointer to the corresponding page entry */ - H5FD_t *file; /* File driver pointer */ - haddr_t first_page_addr, last_page_addr; /* Addresses of the first and last pages covered by I/O */ - haddr_t offset; - haddr_t search_addr; /* Address of current page */ - hsize_t num_touched_pages; /* Number of pages accessed */ - size_t access_size; - hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */ - hsize_t i; /* Local index variable */ - herr_t ret_value = SUCCEED; /* Return value */ + hbool_t inserted_in_index = FALSE; + hbool_t inserted_in_lru = FALSE; + uint64_t page; + H5PB_entry_t *entry_ptr = NULL; + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) /* Sanity checks */ - HDassert(f); - HDassert(type != H5FD_MEM_GHEAP); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); - /* Get pointer to page buffer info for this file */ - page_buf = f->shared->page_buf; + page = (uint64_t)addr / (uint64_t)(pb_ptr->page_size); -#ifdef H5_HAVE_PARALLEL - if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { -#if 1 - bypass_pb = TRUE; -#else - /* MSC - why this stopped working ? */ - int mpi_size; - - if((mpi_size = H5F_mpi_get_size(f)) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "can't retrieve MPI communicator size") - if(1 != mpi_size) - bypass_pb = TRUE; -#endif - } /* end if */ -#endif + HDassert((uint64_t)(addr) == (page * (uint64_t)(pb_ptr->page_size))); - /* If page buffering is disabled, or the I/O size is larger than that of a - * single page, or if this is a parallel raw data access, bypass page - * buffering. 
- */ - if(NULL == page_buf || size >= page_buf->page_size || - (bypass_pb && H5FD_MEM_DRAW == type)) { - if(H5F__accum_read(f, type, addr, size, buf) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "read through metadata accumulator failed") + HDassert(size >= pb_ptr->page_size); + HDassert((size == pb_ptr->page_size) || + ((pb_ptr->vfd_swmr_writer) && (type != H5FD_MEM_DRAW))); + HDassert((NULL == entry_ptr_ptr) || (NULL == *entry_ptr_ptr)); - /* Update statistics */ - if(page_buf) { - if(type == H5FD_MEM_DRAW) - page_buf->bypasses[1] ++; - else - page_buf->bypasses[0] ++; - } /* end if */ - - /* If page buffering is disabled, or if this is a large metadata access, - * or if this is parallel raw data access, we are done here + H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL); + + if ( entry_ptr != NULL ) { + +#if 0 /* JRM */ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "page buffer already contains a page at the specified address") +#else /* JRM */ + /* this should be an error, but until we update the page allocation + * code to tell the page buffer to discard the associated entry + * whenever a page is freed, this situation can occur. + * + * For now, just force the eviction of the existing page. 
+ * Delete this code as soon as the paged allocation code is + * updated accordingly */ - if(NULL == page_buf || (size >= page_buf->page_size && H5FD_MEM_DRAW != type) || - (bypass_pb && H5FD_MEM_DRAW == type)) - HGOTO_DONE(SUCCEED) - } /* end if */ + if ( H5PB__evict_entry(pb_ptr, entry_ptr, TRUE) < 0 ) - /* Update statistics */ - if(page_buf) { - if(type == H5FD_MEM_DRAW) - page_buf->accesses[1]++; - else - page_buf->accesses[0]++; - } /* end if */ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "forced eviction failed") - /* Calculate the aligned address of the first page */ - first_page_addr = (addr / page_buf->page_size) * page_buf->page_size; +#endif /* JRM */ + } + + entry_ptr = H5PB__allocate_page(pb_ptr, size, clean_image); + + if ( NULL == entry_ptr ) + HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, \ + "Can't allocate new page buffer entry") + + /* perform additional initialization */ + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + HDassert(entry_ptr->pb_ptr == pb_ptr); + entry_ptr->addr = addr; + entry_ptr->page = page; + HDassert(entry_ptr->size == size); + HDassert(entry_ptr->image_ptr); + entry_ptr->mem_type = type; + entry_ptr->is_metadata = (type != H5FD_MEM_DRAW); + entry_ptr->is_mpmde = ((entry_ptr->is_metadata) && + (size > pb_ptr->page_size)); + entry_ptr->is_dirty = FALSE; + + /* insert in the hash table */ + H5PB__INSERT_IN_INDEX(pb_ptr, entry_ptr, FAIL) + inserted_in_index = TRUE; - /* For Raw data calculate the aligned address of the last page and - * the number of pages accessed if more than 1 page is accessed + /* insert at the head of the LRU */ + H5PB__UPDATE_RP_FOR_INSERTION(pb_ptr, entry_ptr, FAIL) + inserted_in_lru = TRUE; + + /* updates stats */ + H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr); + + if ( entry_ptr_ptr ) { + + *entry_ptr_ptr = entry_ptr; + } + +done: + + if ( ret_value < 0 ) { + + if ( entry_ptr ) { + + if ( inserted_in_lru ) { + + H5PB__UPDATE_RP_FOR_EVICTION(pb_ptr, entry_ptr, FAIL); + } + + if ( 
inserted_in_index ) { + + H5PB__DELETE_FROM_INDEX(pb_ptr, entry_ptr, FAIL) + } + + H5PB__deallocate_page(entry_ptr); + entry_ptr = NULL; + } + } + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB_add_new_page */ + + +/*------------------------------------------------------------------------- + * + * Function: H5PB__deallocate_page + * + * Purpose: Free the supplied instance of H5PB_entry_t and its + * associated buffer. The entry must be clean and removed + * from the page buffer before this function is called. + * + * Return: void + * + * Programmer: John Mainzer -- 10/12/18 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +static void +H5PB__deallocate_page(H5PB_entry_t *entry_ptr) +{ + FUNC_ENTER_NOAPI_NOINIT_NOERR + + /* sanity checks */ + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + HDassert(entry_ptr->size > 0); + HDassert(entry_ptr->image_ptr); + HDassert(!(entry_ptr->is_dirty)); + HDassert(entry_ptr->ht_next == NULL); + HDassert(entry_ptr->ht_prev == NULL); + HDassert(entry_ptr->next == NULL); + HDassert(entry_ptr->prev == NULL); + HDassert(entry_ptr->tl_next == NULL); + HDassert(entry_ptr->tl_prev == NULL); + + entry_ptr->magic = 0; + entry_ptr->image_ptr = H5MM_xfree(entry_ptr->image_ptr); + entry_ptr = H5FL_FREE(H5PB_entry_t, entry_ptr); + + FUNC_LEAVE_NOAPI_VOID + +} /* H5PB__deallocate_page() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5PB__evict_entry + * + * Purpose: Evict the target entry from the from the page buffer, and + * de-allocate its associated image and instance of + * H5PB_entry_t.. + * + * In general, entries must be clean before they can be + * evicted, and the minimum metadata and raw data limits + * must be respected. 
Attempts to evict an entry that + * that do not respect these constraints will generate + * and error unless the force parameter is TRUE, in which + * case, these constraints are igmored. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer -- 10/14/18 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +static herr_t +H5PB__evict_entry(H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr, hbool_t force) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* sanity checks */ + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + HDassert(entry_ptr->size > 0); + HDassert(entry_ptr->image_ptr); + /* entries on either the tick list or the delayed write + * list may not be evicted -- verify this. */ - if(H5FD_MEM_DRAW == type) { - last_page_addr = ((addr + size - 1) / page_buf->page_size) * page_buf->page_size; - - /* How many pages does this write span */ - num_touched_pages = (last_page_addr / page_buf->page_size + 1) - - (first_page_addr / page_buf->page_size); - if(first_page_addr == last_page_addr) { - HDassert(1 == num_touched_pages); - last_page_addr = HADDR_UNDEF; - } /* end if */ - } /* end if */ - /* Otherwise set last page addr to HADDR_UNDEF */ - else { - num_touched_pages = 1; - last_page_addr = HADDR_UNDEF; - } /* end else */ + HDassert(!(entry_ptr->modified_this_tick)); + HDassert(entry_ptr->delay_write_until == 0); + + if ( ( ! force ) && ( entry_ptr->is_dirty ) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "Attempt to evict a dirty entry"); + + if ( ! force ) { + + /* it is OK to evict an metadata page if pb_ptr->curr_md_pages == + * pb_ptr->min_md_pages - 1 if we are about to replace it with another + * metadata page. 
+ * + * Similarly, it is OK to evict an raw data page if + * pb_ptr->curr_rd_pages == pb_ptr->min_rd_pages - 1 if we are + * about to replace it with another raw data page. + * + * Assume sanity checks have been made before this call, and + * allow the above without testing the intended replacement. + */ + if ( ( entry_ptr->is_metadata ) && + ( pb_ptr->curr_md_pages < pb_ptr->min_md_pages ) ) { - /* Translate to file driver I/O info object */ - file = f->shared->lf; + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "Attempt to violate min_md_pages"); + + } else if ( ( ! entry_ptr->is_metadata ) && + ( pb_ptr->curr_rd_pages < pb_ptr->min_rd_pages ) ) { + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "Attempt to violate min_rd_pages"); + } + } else if ( ( entry_ptr->is_dirty ) && + ( H5PB__mark_entry_clean(pb_ptr, entry_ptr) < 0 ) ) { + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "mark entry clean failed") + } + + /* remove the entry from the LRU */ + H5PB__UPDATE_RP_FOR_EVICTION(pb_ptr, entry_ptr, FAIL) + + /* remove the entry from the hash table */ + H5PB__DELETE_FROM_INDEX(pb_ptr, entry_ptr, FAIL) + + /* update stats for eviction */ + H5PB__UPDATE_STATS_FOR_EVICTION(pb_ptr, entry_ptr) + + /* deallocate the page */ + H5PB__deallocate_page(entry_ptr); + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB__evict_entry() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5PB__flush_entry + * + * Purpose: Flush the target entry to file. + * + * Under normal circumstances, the entry will be in the + * replacement policy. In this, also update the replacement + * policy for flush. + * + * If pb_ptr->vfd_swmr_writer, it is possible that the target + * is a multi-page metadata entry. In this case, the entry + * is not in the replacement policy, and thus the policy + * should not be updated. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer -- 10/14/18 + * + * Changes: None. 
+ * + *------------------------------------------------------------------------- + */ +static herr_t +H5PB__flush_entry(H5F_t *f, H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr) +{ + hbool_t skip_write = FALSE; + size_t write_size; + haddr_t eoa; /* Current EOA for the file */ + H5FD_t *file; /* file driver */ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* sanity checks */ + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->lf); + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + HDassert(entry_ptr->size > 0); + HDassert(entry_ptr->size >= pb_ptr->page_size); + HDassert((entry_ptr->size == pb_ptr->page_size) || (entry_ptr->is_mpmde)); + HDassert(entry_ptr->image_ptr); + HDassert(entry_ptr->is_dirty); + HDassert((pb_ptr->vfd_swmr_writer) || (!(entry_ptr->is_mpmde))); + HDassert( ( ! (pb_ptr->vfd_swmr_writer) ) || + ( (pb_ptr->cur_tick) >= (entry_ptr->delay_write_until) ) ); + + /* Retrieve the 'eoa' for the file */ + if ( HADDR_UNDEF == (eoa = H5F_get_eoa(f, entry_ptr->mem_type)) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, \ + "driver get_eoa request failed") + +#if 0 /* JRM */ + /* TODO: update the free space manager to inform the page buffer when + * space is de-allocated so that the following assertions will be + * true in all cases. + */ + + /* Verify that the base addresss of the page is within the EOA. If it + * isn't, the associated page has been discarded and should have been + * removed from the page buffer. This is a bug in the HDF5 library, so + * an assertion is adequate here. + */ + HDassert( eoa > entry_ptr->addr ); + + /* Space at the end of the file should be allocate in increments of + * pages. Thus the entire page should be within the EOA. Again, + * an assertion is adequate here. 
+ */ + HDassert( eoa >= entry_ptr->addr + entry_ptr->size ); +#else /* JRM */ + if ( eoa < entry_ptr->addr ) { + + skip_write = TRUE; - /* Copy raw data from dirty pages into the read buffer if the read - request spans pages in the page buffer*/ - if(H5FD_MEM_DRAW == type && size >= page_buf->page_size) { - H5SL_node_t *node; + } else if ( eoa < entry_ptr->addr + entry_ptr->size ) { - /* For each touched page in the page buffer, check if it - * exists in the page Buffer and is dirty. If it does, we - * update the buffer with what's in the page so we get the up - * to date data into the buffer after the big read from the file. + /* adjust the size of the write so that the write + * will not extend beyond EOA. */ - node = H5SL_find(page_buf->slist_ptr, (void *)(&first_page_addr)); - for(i = 0; i < num_touched_pages; i++) { - search_addr = i*page_buf->page_size + first_page_addr; + write_size = (size_t)(eoa - entry_ptr->addr); - /* if we still haven't located a starting page, search again */ - if(!node && i!=0) - node = H5SL_find(page_buf->slist_ptr, (void *)(&search_addr)); + } else { - /* if the current page is in the Page Buffer, do the updates */ - if(node) { - page_entry = (H5PB_entry_t *)H5SL_item(node); + write_size = entry_ptr->size; + } + +#endif /* JRM */ - HDassert(page_entry); - /* If the current page address falls out of the access - block, then there are no more pages to go over */ - if(page_entry->addr >= addr + size) - break; + /* flush the entry */ + if ( ! 
skip_write ) { +#if VFD_IO /* JRM */ + file = f->shared->lf; - HDassert(page_entry->addr == search_addr); + if ( H5FD_write(file, entry_ptr->mem_type, entry_ptr->addr, + write_size, entry_ptr->image_ptr) < 0 ) +#else /* VFD_IO */ /* JRM */ + if ( H5F__accum_write(f, entry_ptr->mem_type, entry_ptr->addr, + write_size, entry_ptr->image_ptr) < 0 ) +#endif /* VFD_IO */ /* JRM */ - if(page_entry->is_dirty) { - /* special handling for the first page if it is not a full page access */ - if(i == 0 && first_page_addr != addr) { - offset = addr - first_page_addr; - HDassert(page_buf->page_size > offset); + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, "file write failed") + } - HDmemcpy(buf, (uint8_t *)page_entry->page_buf_ptr + offset, - page_buf->page_size - (size_t)offset); + /* mark the entry clean */ + entry_ptr->is_dirty = FALSE; - /* move to top of LRU list */ - H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry) - } /* end if */ - /* special handling for the last page if it is not a full page access */ - else if(num_touched_pages > 1 && i == num_touched_pages-1 && search_addr < addr+size) { - offset = (num_touched_pages-2)*page_buf->page_size + - (page_buf->page_size - (addr - first_page_addr)); - HDmemcpy((uint8_t *)buf + offset, page_entry->page_buf_ptr, - (size_t)((addr + size) - last_page_addr)); + /* if the entry is on the LRU, update the replacement policy */ + if ( ! 
(entry_ptr->is_mpmde) ) { - /* move to top of LRU list */ - H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry) - } /* end else-if */ - /* copy the entire fully accessed pages */ - else { - offset = i*page_buf->page_size; - - HDmemcpy((uint8_t *)buf+(i*page_buf->page_size) , page_entry->page_buf_ptr, - page_buf->page_size); - } /* end else */ - } /* end if */ - node = H5SL_next(node); - } /* end if */ - } /* end for */ - } /* end if */ - else { - /* A raw data access could span 1 or 2 PB entries at this point so - we need to handle that */ - HDassert(1 == num_touched_pages || 2 == num_touched_pages); - for(i = 0 ; i < num_touched_pages; i++) { - haddr_t buf_offset; - - /* Calculate the aligned address of the page to search for it in the skip list */ - search_addr = (0==i ? first_page_addr : last_page_addr); - - /* Calculate the access size if the access spans more than 1 page */ - if(1 == num_touched_pages) - access_size = size; - else - access_size = (0 == i ? (size_t)((first_page_addr + page_buf->page_size) - addr) : (size - access_size)); - - /* Lookup the page in the skip list */ - page_entry = (H5PB_entry_t *)H5SL_search(page_buf->slist_ptr, (void *)(&search_addr)); - - /* if found */ - if(page_entry) { - offset = (0 == i ? addr - page_entry->addr : 0); - buf_offset = (0 == i ? 
0 : size - access_size); - - /* copy the requested data from the page into the input buffer */ - HDmemcpy((uint8_t *)buf + buf_offset, (uint8_t *)page_entry->page_buf_ptr + offset, access_size); - - /* Update LRU */ - H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry) - - /* Update statistics */ - if(type == H5FD_MEM_DRAW) - page_buf->hits[1]++; - else - page_buf->hits[0]++; - } /* end if */ - /* if not found */ - else { - void *new_page_buf = NULL; - size_t page_size = page_buf->page_size; - haddr_t eoa; - - /* make space for new entry */ - if((H5SL_count(page_buf->slist_ptr) * page_buf->page_size) >= page_buf->max_size) { - htri_t can_make_space; - - /* check if we can make space in page buffer */ - if((can_make_space = H5PB__make_space(f, page_buf, type)) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, "make space in Page buffer Failed") - - /* if make_space returns 0, then we can't use the page - buffer for this I/O and we need to bypass */ - if(0 == can_make_space) { - /* make space can't return FALSE on second touched page since the first is of the same type */ - HDassert(0 == i); - - /* read entire block from VFD and return */ - if(H5FD_read(file, type, addr, size, buf) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "driver read request failed") - - /* Break out of loop */ - break; - } /* end if */ - } /* end if */ - - /* Read page from VFD */ - if(NULL == (new_page_buf = H5FL_FAC_MALLOC(page_buf->page_fac))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTALLOC, FAIL, "memory allocation failed for page buffer entry") - - /* Read page through the VFD layer, but make sure we don't read past the EOA. 
*/ - - /* Retrieve the 'eoa' for the file */ - if(HADDR_UNDEF == (eoa = H5F_get_eoa(f, type))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "driver get_eoa request failed") - - /* If the entire page falls outside the EOA, then fail */ - if(search_addr > eoa) - HGOTO_ERROR(H5E_PAGEBUF, H5E_BADVALUE, FAIL, "reading an entire page that is outside the file EOA") - - /* Adjust the read size to not go beyond the EOA */ - if(search_addr + page_size > eoa) - page_size = (size_t)(eoa - search_addr); - - /* Read page from VFD */ - if(H5FD_read(file, type, search_addr, page_size, new_page_buf) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "driver read request failed") - - /* Copy the requested data from the page into the input buffer */ - offset = (0 == i ? addr - search_addr : 0); - buf_offset = (0 == i ? 0 : size - access_size); - HDmemcpy((uint8_t *)buf + buf_offset, (uint8_t *)new_page_buf + offset, access_size); - - /* Create the new PB entry */ - if(NULL == (page_entry = H5FL_CALLOC(H5PB_entry_t))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, "memory allocation failed") - - page_entry->page_buf_ptr = new_page_buf; - page_entry->addr = search_addr; - page_entry->type = (H5F_mem_page_t)type; - page_entry->is_dirty = FALSE; - - /* Insert page into PB */ - if(H5PB__insert_entry(page_buf, page_entry) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTSET, FAIL, "error inserting new page in page buffer") - - /* Update statistics */ - if(type == H5FD_MEM_DRAW) - page_buf->misses[1]++; - else - page_buf->misses[0]++; - } /* end else */ - } /* end for */ - } /* end else */ + H5PB__UPDATE_RP_FOR_FLUSH(pb_ptr, entry_ptr, FAIL) + } + + /* update stats for flush */ + H5PB__UPDATE_STATS_FOR_FLUSH(pb_ptr, entry_ptr) done: + FUNC_LEAVE_NOAPI(ret_value) -} /* end H5PB_read() */ + +} /* H5PB__flush_entry() */ /*------------------------------------------------------------------------- - * Function: H5PB_write * - * Purpose: Write data into the Page Buffer. 
If the page exists in the - * cache, update it; otherwise read it from disk, update it, and - * insert into cache. + * Function: H5PB__load_page * - * Return: Non-negative on success/Negative on failure + * Purpose: Load the page with the specified base address and insert + * it into the page buffer. If necessary and possible, make + * space for the new page first. * - * Programmer: Mohamad Chaarawi + * Note that the size of the page is always pb_ptr->page_size, + * even in the VFD SWMR case, as in this context, multi-page + * metadata entries are always written in full, and they + * may only enter the page buffer as the result of a write. + * + * Return: SUCCEED if no errors are encountered, and + * FAIL otherwise. + * + * Programmer: John Mainzer -- 10/18/18 + * + * Changes: None. * *------------------------------------------------------------------------- */ -herr_t -H5PB_write(H5F_t *f, H5FD_mem_t type, haddr_t addr, - size_t size, const void *buf) +static herr_t +H5PB__load_page(H5F_t *f, H5PB_t *pb_ptr, haddr_t addr, H5FD_mem_t type, + H5PB_entry_t **entry_ptr_ptr) { - H5PB_t *page_buf; /* Page buffering info for this file */ - H5PB_entry_t *page_entry; /* Pointer to the corresponding page entry */ + hbool_t skip_read = FALSE; + haddr_t eoa; + haddr_t eof = HADDR_UNDEF; + H5PB_entry_t *entry_ptr = NULL; + void *image_ptr = NULL; H5FD_t *file; /* File driver pointer */ - haddr_t first_page_addr, last_page_addr; /* Addresses of the first and last pages covered by I/O */ - haddr_t offset; - haddr_t search_addr; /* Address of current page */ - hsize_t num_touched_pages; /* Number of pages accessed */ - size_t access_size; - hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */ - hsize_t i; /* Local index variable */ - herr_t ret_value = SUCCEED; /* Return value */ + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) - /* Sanity checks */ + /* sanity checks */ HDassert(f); + HDassert(f->shared); + HDassert(f->shared->lf); - /* Get 
pointer to page buffer info for this file */ - page_buf = f->shared->page_buf; + file = f->shared->lf; -#ifdef H5_HAVE_PARALLEL - if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { -#if 1 - bypass_pb = TRUE; -#else - /* MSC - why this stopped working ? */ - int mpi_size; - - if((mpi_size = H5F_mpi_get_size(f)) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "can't retrieve MPI communicator size") - if(1 != mpi_size) - bypass_pb = TRUE; -#endif - } /* end if */ -#endif + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert((entry_ptr_ptr == NULL) || (*entry_ptr_ptr == NULL)); + + /* Retrieve the 'eoa' for the file */ + if ( HADDR_UNDEF == (eoa = H5F_get_eoa(f, type))) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, \ + "driver get_eoa request failed") + + if ( addr + ((haddr_t)(pb_ptr->page_size)) > eoa ) - /* If page buffering is disabled, or the I/O size is larger than that of a - * single page, or if this is a parallel raw data access, bypass page - * buffering. + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "Attempt to load page that extends past EOA") + + if ( HADDR_UNDEF == (eof = H5FD_get_eof(f->shared->lf, H5FD_MEM_DEFAULT)) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, \ + "driver get_eof request failed") + + /* It is possible that this page been allocated but not + * written. Skip the read if addr > EOF. In this case, tell + * H5PB__create_new_page() to zero the page image. 
*/ - if(NULL == page_buf || size >= page_buf->page_size || bypass_pb) { - if(H5F__accum_write(f, type, addr, size, buf) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, "write through metadata accumulator failed") + skip_read = (addr >= eof); - /* Update statistics */ - if(page_buf) { - if(type == H5FD_MEM_DRAW || type == H5FD_MEM_GHEAP) - page_buf->bypasses[1]++; - else - page_buf->bypasses[0]++; - } /* end if */ - - /* If page buffering is disabled, or if this is a large metadata access, - * or if this is a parallel raw data access, we are done here - */ - if(NULL == page_buf || (size >= page_buf->page_size && H5FD_MEM_DRAW != type) || - (bypass_pb && H5FD_MEM_DRAW == type)) - HGOTO_DONE(SUCCEED) -#ifdef H5_HAVE_PARALLEL - if(bypass_pb) { - if(H5PB_update_entry(page_buf, addr, size, buf) > 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTUPDATE, FAIL, "failed to update PB with metadata cache") - HGOTO_DONE(SUCCEED) - } /* end if */ -#endif - } /* end if */ + /* make space in the page buffer if necessary */ + if ( ( pb_ptr->curr_pages >= pb_ptr->max_pages ) && + ( H5PB__make_space(f, pb_ptr, type) < 0 ) ) - /* Update statistics */ - if(page_buf) { - if(type == H5FD_MEM_DRAW || type == H5FD_MEM_GHEAP) - page_buf->accesses[1]++; - else - page_buf->accesses[0]++; - } /* end if */ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "H5PB__make_space() reports an error") - /* Calculate the aligned address of the first page */ - first_page_addr = (addr / page_buf->page_size) * page_buf->page_size; - /* For raw data calculate the aligned address of the last page and - * the number of pages accessed if more than 1 page is accessed + /* Create a new page buffer page and insert it into the page buffer */ + if ( H5PB__create_new_page(pb_ptr, addr, (size_t)(pb_ptr->page_size), + type, skip_read, &entry_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "can't create new page buffer page") + + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + 
HDassert(entry_ptr->addr == addr); + + image_ptr = entry_ptr->image_ptr; + + HDassert(image_ptr); + + /* Read the contents of the page from file, and store it in the + * image buffer associated with the new entry. */ - if(H5FD_MEM_DRAW == type) { - last_page_addr = (addr + size - 1) / page_buf->page_size * page_buf->page_size; - - /* how many pages does this write span */ - num_touched_pages = (last_page_addr/page_buf->page_size + 1) - - (first_page_addr / page_buf->page_size); - if(first_page_addr == last_page_addr) { - HDassert(1 == num_touched_pages); - last_page_addr = HADDR_UNDEF; - } /* end if */ - } /* end if */ - /* Otherwise set last page addr to HADDR_UNDEF */ - else { - num_touched_pages = 1; - last_page_addr = HADDR_UNDEF; - } /* end else */ +#if VFD_IO /* JRM */ + if ( ( ! skip_read ) && + ( H5FD_read(file, type, addr, entry_ptr->size, image_ptr) < 0 ) ) +#else /* VFD_IO */ /* JRM */ + if ( ( ! skip_read ) && + ( H5F__accum_read(f, type, addr, entry_ptr->size, image_ptr) < 0 ) ) +#endif /* VFD_IO */ /* JRM */ + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "driver read request failed") + + H5PB__UPDATE_STATS_FOR_LOAD(pb_ptr, entry_ptr) + + if ( entry_ptr_ptr ) { + + *entry_ptr_ptr = entry_ptr; + } + +done: + + /* add cleanup in case of failure */ + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB__load_page() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5PB__make_space + * + * Purpose: Evict one or more pages from the page buffer so as to + * reduce the size of the page buffer to pb_ptr->max_pages - 1. + * if possible. + * + * Note that the function must not be called under + * non-sencicle conditions -- thus if either + * + * 1) the inserted type is metadata and min_rd_pages == + * max_pages, or + * + * 2) the inserted type is raw data and min_md_pages == + * max_pages + * + * holds, the function has been called in error, and an + * assertion failure is appropriate. 
+ * + * If the page buffer is below its maximum size, we are + * done, and the function simply returns. + * + * Otherwise, scan upwards from the bottom of the LRU list, + * examining each entry in turn. + * + * If the entry is dirty, flush it, move it to the top of the + * LRU, and continue with the scan. Note in the VFD SWMR case, + * we do not have to concern ourselves with delayed writes in + * this context, as all entries which are subject to delayed + * writes must reside on the delayed write list, not the LRU list. + * + * If the entry is: + * + * 1) clean + * + * 2) either: + * + * a) the target entry is metadata and + * curr_md_pages > min_md_pages. + * + * b) the target entry is raw data and + * curr_rd_pages > min_rd_pages. + * + * c) the target entry is metadata, the inserted_type + * is metadata, and curr_md_pages == min_md_pages. + * + * d) the target entry is raw data, the inserted_type + * is raw data, and curr_rd_pages == min_rd_pages. + * + * 3) The entry is not on the tick list (which can only + * happen if pb_ptr->vfd_swmr_writer is TRUE). + * + * evict the entry and test to see if pb_ptr->curr_pages < + * pb_ptr->max_pages. If it is, return. Otherwise, continue + * the scan until either the above condidtion is fulfilled, + * or the head of the LRU is reach. + * + * Under normal circumstances, it should always be possible + * to reduce the size of the page buffer below pb_ptr->max_pages. + * However, due to prohibition on evicting entries on the + * tick list, and either flushing or evicting entries on the + * delayed write list, this will not in general be the case + * if pb_ptr->vfd_swmr_writer is TRUE. In this case, the + * page buffer may exceed its maximum size by an arbitrary + * amount. + * + * If this situation occurs with any regularity, we will + * need a mechanism to avoid attempts to make space when + * it is not possible to do so. + * + * Return: SUCCEED if no errors are encountered, and + * FAIL otherwise. 
+ * + * Programmer: John Mainzer -- 10/14/18 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +static herr_t +H5PB__make_space(H5F_t *f, H5PB_t *pb_ptr, H5FD_mem_t inserted_type) +{ + hbool_t inserting_md; + H5PB_entry_t *search_ptr; + H5PB_entry_t *flush_ptr; + H5PB_entry_t *evict_ptr; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* sanity checks */ + HDassert(f); + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(pb_ptr->min_md_pages + pb_ptr->min_rd_pages <= pb_ptr->max_pages); + + inserting_md = ( H5FD_MEM_DRAW != inserted_type ); + + if ( ( inserting_md ) && ( pb_ptr->min_rd_pages == pb_ptr->max_pages ) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, + "can't make space for metadata -- pb config for raw data only") + + if ( ( ! inserting_md ) && ( pb_ptr->min_md_pages == pb_ptr->max_pages ) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, + "can't make space for raw data -- pb config for metadata only") + + search_ptr = pb_ptr->LRU_tail_ptr; + + while ( ( search_ptr ) && ( pb_ptr->curr_pages >= pb_ptr->max_pages ) ) { + + HDassert(search_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + + if ( search_ptr->modified_this_tick ) { /* entry is on tick list */ + + search_ptr = search_ptr->prev; + H5PB__UPDATE_STATS_FOR_LRU_TL_SKIP(pb_ptr); + + } else if ( ( inserting_md ) && + ( ! (search_ptr->is_metadata) ) && + ( pb_ptr->curr_rd_pages <= pb_ptr->min_rd_pages ) ) { + + search_ptr = search_ptr->prev; + H5PB__UPDATE_STATS_FOR_LRU_RD_SKIP(pb_ptr); + + } else if ( ( ! inserting_md ) && + ( search_ptr->is_metadata ) && + ( pb_ptr->curr_md_pages <= pb_ptr->min_md_pages ) ) { + + search_ptr = search_ptr->prev; + H5PB__UPDATE_STATS_FOR_LRU_MD_SKIP(pb_ptr); + + } else if ( search_ptr->is_dirty ) { + + /* One can make the argument that we should test for dirty + * entries first, instead of skipping potentially dirty + * entries in the above clauses. 
However, I suspect that + * this would result in excessive flushes. Lets try it + * this way for now. + */ + + flush_ptr = search_ptr; + + /* if the *search_ptr has a predecessor in the LRU, + * set set search_ptr equal to search_ptr->prev. Otherwise, + * leave search_ptr unchanged, so that it can be examined + * on the next pass through the while loop after it has been + * flushed. + */ + if ( search_ptr->prev ) { + + search_ptr = search_ptr->prev; + } + + if ( H5PB__flush_entry(f, pb_ptr, flush_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "Can't flush entry") + + } else { /* evict the entry */ + + evict_ptr = search_ptr; + search_ptr = search_ptr->prev; + if ( H5PB__evict_entry(pb_ptr, evict_ptr, FALSE) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "Can't evict entry") + } + } + + HDassert( ( search_ptr == NULL ) || + ( pb_ptr->curr_pages < pb_ptr->max_pages ) ); + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB__make_space() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5PB__mark_entry_clean + * + * Purpose: Mark the target entry clean + * + * This function is typically used when an entry has been + * completely overwritten and is about to be evicted. In + * this case, the entry must be marked clean to avoid + * sanity check failures on evictions. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer -- 10/14/18 + * + * Changes: None. 
+ * + *------------------------------------------------------------------------- + */ +static herr_t +H5PB__mark_entry_clean(H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* sanity checks */ + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + HDassert(entry_ptr->size > 0); + HDassert(entry_ptr->size >= pb_ptr->page_size); + HDassert((entry_ptr->size == pb_ptr->page_size) || (entry_ptr->is_mpmde)); + HDassert(entry_ptr->image_ptr); + HDassert((pb_ptr->vfd_swmr_writer) || (!(entry_ptr->is_mpmde))); + + /* mark the entry clean */ + entry_ptr->is_dirty = FALSE; + + /* delete this once we start tracking clean and dirty entry is the hash + * table. + */ + if ( ! (entry_ptr->is_mpmde) ) { + + H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) + } + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB__mark_entry_clean() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5PB__mark_entry_dirty + * + * Purpose: Mark the target entry as dirty. + * + * Under normal circumstances, the entry will be in the + * replacement policy. In this, also update the replacement + * policy for and access. + * + * If pb_ptr->vfd_swmr_writer, it is possible that the target + * is a multi-page metadata entry. In this case, the entry + * is not in the replacement policy, and thus the policy + * should not be updated. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer -- 10/14/18 + * + * Changes: None. 
+ * + *------------------------------------------------------------------------- + */ +static herr_t +H5PB__mark_entry_dirty(H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* sanity checks */ + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + HDassert(entry_ptr->size > 0); + HDassert(entry_ptr->size >= pb_ptr->page_size); + HDassert((entry_ptr->size == pb_ptr->page_size) || (entry_ptr->is_mpmde)); + HDassert(entry_ptr->image_ptr); + HDassert((pb_ptr->vfd_swmr_writer) || (!(entry_ptr->is_mpmde))); + + /* mark the entry dirty */ + entry_ptr->is_dirty = TRUE; + + /* if the entry is on the LRU, update the replacement policy */ + if ( ( ! (entry_ptr->is_mpmde) ) && + ( entry_ptr->delay_write_until == 0 ) ) { + + H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) + } + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB__mark_entry_dirty() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5PB__read_meta + * + * Purpose: Satisfy a metadata read in cases 7, 8, 9, and 10) + * H5PB_read(). Specifically: + * + * 6) If the read is for metadata and not page aligned, clip + * the read to the end of the current page if necessary. + * Load the relevant page if necessary and satisfy the + * read from the page buffer. Note that it there is an + * existing page, it must not be a multi-page metadata + * entry. It it is, flag an error. + * + * 7) If the read is for metadata, is page aligned, is larger + * than one page, and there is no entry in the page buffer, + * satisfy the read from the file + * + * 8) If the read is for metadata, is page aligned, is larger + * than one page, and there is a regular entry at the target + * page address, test to see if the last read was for the + * same address. 
+ * + * If it was, evict the page, and satisfy the read from file. + * Flag an error if the page was dirty. + * + * If the last read was for a different page, clip the read + * to one page, and satisfy the read from the existing + * regular entry. + * + * 9) If the read is for metadata, is page aligned, is larger + * than one page, and there is a multi-page metadata entry + * at the target page address, test to see if + * pb_ptr->vfd_swmr_writer is TRUE. + * + * If it is, satisfy the read from the multi-page metadata + * entry, clipping the read if necessary. + * + * If pb_ptr->vfd_swmr_writer is FALSE, flag an error. + * + * 10) If the read is for metadata, is page aligned, is no + * larger than a page, test to see if the page buffer + * contains a page at the target address. + * + * If it doesn't, load the page and satisfy the read + * from it. + * + * If it contains a regular page entry, satisfy the read + * from it. + * + * If it contains a multi-page metadata entry at the target + * address, satisfy the read from the multi-page metadata + * entry if pb_ptr->vfd_swmr_writer is TRUE, and flag an + * error otherwise. + * + * The above case analysis may be a bit hard to read. If so, + * the table shown below may help to clarify. 
Here: + * + * P/A == page aligned + * size > PL == size > page length + * PA == previous address + * A == current address + * + * In the entry exists column: + * + * N == no entry + * R == regular (1 page) entry + * MPMDE == multi-page metadata entry + * + * | size | entry | VFD | | + * P/A: | > PL | exists | SWMR | PA == A | Comments: + * ------+------+--------+------+---------+------------------------------------- + * N | X | N || R | X | X | Clip read to page boundary if + * | | | | | necessary + * | | | | | Load entry if necessary + * | | | | | Satisfy read from entry (case 6) + * ------+------+--------+------+---------+------------------------------------- + * N | X | MPMDE | X | X | Error (case 6) + * ------+------+--------+------+---------+------------------------------------- + * | | | | | + * ------+------+--------+------+---------+------------------------------------- + * Y | Y | N | X | X | Satisfy read from file (case 7) + * ------+------+--------+------+---------+------------------------------------- + * Y | Y | R | X | N | Clip read to page boundary + * | | | | | Satisfy read from entry (case 8) + * ------+------+--------+------+---------+------------------------------------- + * Y | Y | R | X | Y | Evict entry + * | | | | | (must be clean -- flag error if not) + * | | | | | Satisfy read from file (case 8) + * ------+------+--------+------+---------+------------------------------------- + * Y | Y | MPMDE | N | X | Error (case 9) + * ------+------+--------+------+---------+------------------------------------- + * Y | Y | MPMDE | Y | X | Clip read to MPE size if required. 
+ * | | | | | Satisfy read from MPE (case 9) + * ------+------+--------+------+---------+------------------------------------- + * | | | | | + * ------+------+--------+------+---------+------------------------------------- + * Y | N | N | X | X | Load entry + * | | | | | Satisfy read from entry (case 10) + * ------+------+--------+------+---------+------------------------------------- + * Y | N | R | X | X | Satisfy read from entry (case 10) + * ------+------+--------+------+---------+------------------------------------- + * Y | N | MPMDE | Y | X | Satisfy read from entry (case 10) + * ------+------+--------+------+---------+------------------------------------- + * Y | N | MPMDE | N | X | Error (case 10) + * ------+------+--------+------+---------+------------------------------------- + * + * Observe that the above cases imply that: + * + * 1) The page buffer is defined. + * + * 2) The page buffer has been configured to accept at least + * one page of metadata. + * + * 3) This is a metadata read. + * + * Note also that if the metadata read is of size + * no larger than page size, it may not cross page + * boundaries. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer -- 10/11/18 + * + * Changes: None. 
+ * + *------------------------------------------------------------------------- + */ +static herr_t +H5PB__read_meta(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, + void *buf/*out*/) +{ + H5PB_t *pb_ptr; /* Page buffer for this file */ + H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */ + H5FD_t *file; /* File driver pointer */ + uint64_t page; /* page offset of addr */ + haddr_t page_addr; /* page containing addr */ + static haddr_t prev_addr = HADDR_UNDEF; /* addr of previous call (function-static, so not tracked per file) */ + size_t offset; /* offset of read in page */ + size_t clipped_size; /* possibly clipped size */ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->pb_ptr); + + pb_ptr = f->shared->pb_ptr; + + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(pb_ptr->min_rd_pages < pb_ptr->max_pages); + HDassert(f->shared->lf); - /* Translate to file driver I/O info object */ file = f->shared->lf; - /* Check if existing pages for raw data need to be updated since raw data access is not atomic */ - if(H5FD_MEM_DRAW == type && size >= page_buf->page_size) { - /* For each touched page, check if it exists in the page buffer, and - * update it with the data in the buffer to keep it up to date + HDassert(H5FD_MEM_DRAW != type); + HDassert(buf); + + /* Calculate the aligned address of the first page */ + page = (addr / pb_ptr->page_size); + page_addr = page * pb_ptr->page_size; + + if ( page_addr != addr ) { /* case 6 */ + + /* If the read is for metadata and not page aligned, clip + * the read to the end of the current page if necessary. + * Load the relevant page if necessary and satisfy the + * read from the page buffer. Note that if there is an + * existing page, it must not be a multi-page metadata + * entry. If it is, flag an error. 
*/ - for(i = 0; i < num_touched_pages; i++) { - search_addr = i * page_buf->page_size + first_page_addr; - /* Special handling for the first page if it is not a full page update */ - if(i == 0 && first_page_addr != addr) { - /* Lookup the page in the skip list */ - page_entry = (H5PB_entry_t *)H5SL_search(page_buf->slist_ptr, (void *)(&search_addr)); - if(page_entry) { - offset = addr - first_page_addr; - HDassert(page_buf->page_size > offset); - - /* Update page's data */ - HDmemcpy((uint8_t *)page_entry->page_buf_ptr + offset, buf, page_buf->page_size - (size_t)offset); - - /* Mark page dirty and push to top of LRU */ - page_entry->is_dirty = TRUE; - H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry) - } /* end if */ - } /* end if */ - /* Special handling for the last page if it is not a full page update */ - else if(num_touched_pages > 1 && i == (num_touched_pages - 1) && - (search_addr + page_buf->page_size) != (addr + size)) { - HDassert(search_addr+page_buf->page_size > addr+size); - - /* Lookup the page in the skip list */ - page_entry = (H5PB_entry_t *)H5SL_search(page_buf->slist_ptr, (void *)(&search_addr)); - if(page_entry) { - offset = (num_touched_pages - 2) * page_buf->page_size + - (page_buf->page_size - (addr - first_page_addr)); - - /* Update page's data */ - HDmemcpy(page_entry->page_buf_ptr, (const uint8_t *)buf + offset, - (size_t)((addr + size) - last_page_addr)); + offset = addr - page_addr; - /* Mark page dirty and push to top of LRU */ - page_entry->is_dirty = TRUE; - H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry) - } /* end if */ - } /* end else-if */ - /* Discard all fully written pages from the page buffer */ - else { - page_entry = (H5PB_entry_t *)H5SL_remove(page_buf->slist_ptr, (void *)(&search_addr)); - if(page_entry) { - /* Remove from LRU list */ - H5PB__REMOVE_LRU(page_buf, page_entry) - - /* Decrement page count of appropriate type */ - if(H5F_MEM_PAGE_DRAW == page_entry->type || H5F_MEM_PAGE_GHEAP == page_entry->type) - 
page_buf->raw_count--; - else - page_buf->meta_count--; - - /* Free page info */ - page_entry->page_buf_ptr = H5FL_FAC_FREE(page_buf->page_fac, page_entry->page_buf_ptr); - page_entry = H5FL_FREE(H5PB_entry_t, page_entry); - } /* end if */ - } /* end else */ - } /* end for */ - } /* end if */ - else { - /* An access could span 1 or 2 PBs at this point so we need to handle that */ - HDassert(1 == num_touched_pages || 2 == num_touched_pages); - for(i = 0; i < num_touched_pages; i++) { - haddr_t buf_offset; - - /* Calculate the aligned address of the page to search for it in the skip list */ - search_addr = (0 == i ? first_page_addr : last_page_addr); - - /* Calculate the access size if the access spans more than 1 page */ - if(1 == num_touched_pages) - access_size = size; - else - access_size = (0 == i ? (size_t)(first_page_addr + page_buf->page_size - addr) : (size - access_size)); - - /* Lookup the page in the skip list */ - page_entry = (H5PB_entry_t *)H5SL_search(page_buf->slist_ptr, (void *)(&search_addr)); - - /* If found */ - if(page_entry) { - offset = (0 == i ? addr - page_entry->addr : 0); - buf_offset = (0 == i ? 
0 : size - access_size); - - /* Copy the requested data from the input buffer into the page */ - HDmemcpy((uint8_t *)page_entry->page_buf_ptr + offset, (const uint8_t *)buf + buf_offset, access_size); - - /* Mark page dirty and push to top of LRU */ - page_entry->is_dirty = TRUE; - H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry) - - /* Update statistics */ - if(type == H5FD_MEM_DRAW || type == H5FD_MEM_GHEAP) - page_buf->hits[1]++; - else - page_buf->hits[0]++; - } /* end if */ - /* If not found */ - else { - void *new_page_buf; - size_t page_size = page_buf->page_size; - - /* Make space for new entry */ - if((H5SL_count(page_buf->slist_ptr) * page_buf->page_size) >= page_buf->max_size) { - htri_t can_make_space; - - /* Check if we can make space in page buffer */ - if((can_make_space = H5PB__make_space(f, page_buf, type)) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, "make space in Page buffer Failed") - - /* If make_space returns 0, then we can't use the page - * buffer for this I/O and we need to bypass + if ( (offset + size) <= pb_ptr->page_size ) { + + clipped_size = size; + + } else { + + clipped_size = size - ( (offset + size) - pb_ptr->page_size); + } + + HDassert( clipped_size > 0 ); + HDassert( clipped_size <= size ); + HDassert( (offset + clipped_size) <= pb_ptr->page_size ); + + /* get the containing page */ + H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL) + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, ((entry_ptr) != NULL), \ + TRUE, FALSE) + + if ( ( NULL == entry_ptr ) && + ( H5PB__load_page(f, pb_ptr, page_addr, type, &entry_ptr) < 0 ) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "page buffer page load request failed (1)") + + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + HDassert(entry_ptr->addr == page_addr); + HDassert(entry_ptr->is_metadata); + HDassert(!(entry_ptr->is_mpmde)); + + /* copy data from the page into read buffer */ + HDmemcpy((uint8_t *)buf, (uint8_t 
*)(entry_ptr->image_ptr) + offset, + clipped_size); + + /* if the entry is on the LRU, update the replacement policy */ + if ( ( ! (entry_ptr->is_mpmde) ) && + ( entry_ptr->delay_write_until == 0 ) ) { + + H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) + } + } else { + + HDassert( page_addr == addr ); + + if ( size > pb_ptr->page_size ) { + + /* search the page buffer for an entry at page */ + H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL) + + + if ( entry_ptr == NULL ) { /* case 7 */ + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, FALSE, TRUE, TRUE) + + /* If the read is for metadata, is page aligned, is larger + * than one page, and there is no entry in the page buffer, + * satisfy the read from the file + */ +#if VFD_IO /* JRM */ + if ( H5FD_read(file, type, addr, size, buf) < 0) +#else /* VFD_IO */ /* JRM */ + if ( H5F__accum_read(f, type, addr, size, buf) < 0 ) +#endif /* VFD_IO */ /* JRM */ + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "driver read request failed (1)") + } else { + + HDassert( entry_ptr ); + HDassert( entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC ); + HDassert( entry_ptr->is_metadata ); + + if ( ! ( entry_ptr->is_mpmde ) ) { /* case 8 */ + + /* If the read is for metadata, is page aligned, is larger + * than one page, and there is a regular entry at the target + * page address, test to see if the last read was for the + * same address. + * + * If it was, evict the page, and satisfy the read from file. + * Flag an error if the page was dirty. + * + * If the last read was for a different page, clip the read + * to one page, and satisfy the read from the existing + * regular entry. 
*/ - if(0 == can_make_space) { - HDassert(0 == i); - - /* Write to VFD and return */ - if(H5FD_write(file, type, addr, size, buf) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, "driver write request failed") - - /* Break out of loop */ - break; - } /* end if */ - } /* end if */ - - /* Don't bother searching if there is no write access */ - if(H5F_ACC_RDWR & H5F_INTENT(f)) - /* Lookup & remove the page from the new skip list page if - * it exists to see if this is a new page from the MF layer + + HDassert( entry_ptr->size == pb_ptr->page_size ); + + if ( addr == prev_addr ) { + + /* since this is a second try, don't update + * hit rate stats. + */ + + HDassert( ! ( entry_ptr->is_dirty ) ); + + if ( H5PB__evict_entry(pb_ptr, entry_ptr, TRUE) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "forced eviction failed (1)") +#if VFD_IO /* JRM */ + if ( H5FD_read(file, type, addr, size, buf) < 0) +#else /* VFD_IO */ /* JRM */ + if ( H5F__accum_read(f, type, addr, size, buf) < 0 ) +#endif /* VFD_IO */ /* JRM */ + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "driver read request failed (2)") + } else { + + HDassert( entry_ptr->image_ptr ); + + /* copy data from the page into read buffer */ + HDmemcpy((uint8_t *)buf, + (uint8_t *)(entry_ptr->image_ptr), + entry_ptr->size); + + /* if the entry is on the LRU, update the replacement + * policy + */ + if ( ( ! (entry_ptr->is_mpmde) ) && + ( entry_ptr->delay_write_until == 0 ) ) { + + H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) + } + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, TRUE, TRUE, TRUE) + } + } else { /* case 9 */ + + /* If the read is for metadata, is page aligned, is larger + * than one page, and there is a multi-page metadata entry + * at the target page address, test to see if + * pb_ptr->vfd_swmr_writer is TRUE. + * + * If it is, satisfy the read from the multi-page metadata + * entry, clipping the read if necessary. 
+ * + * If pb_ptr->vfd_swmr_writer is FALSE, flag an error. */ - page_entry = (H5PB_entry_t *)H5SL_remove(page_buf->mf_slist_ptr, (void *)(&search_addr)); - - /* Calculate offset into the buffer of the page and the user buffer */ - offset = (0 == i ? addr - search_addr : 0); - buf_offset = (0 == i ? 0 : size - access_size); - - /* If found, then just update the buffer pointer to the newly allocate buffer */ - if(page_entry) { - /* Allocate space for the page buffer */ - if(NULL == (new_page_buf = H5FL_FAC_MALLOC(page_buf->page_fac))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTALLOC, FAIL, "memory allocation failed for page buffer entry") - HDmemset(new_page_buf, 0, (size_t)offset); - HDmemset((uint8_t *)new_page_buf + offset + access_size, 0, page_size - ((size_t)offset + access_size)); - - page_entry->page_buf_ptr = new_page_buf; - - /* Update statistics */ - if(type == H5FD_MEM_DRAW || type == H5FD_MEM_GHEAP) - page_buf->hits[1]++; - else - page_buf->hits[0]++; - } /* end if */ - /* Otherwise read page through the VFD layer, but make sure we don't read past the EOA. 
*/ - else { - haddr_t eoa, eof = HADDR_UNDEF; - - /* Allocate space for the page buffer */ - if(NULL == (new_page_buf = H5FL_FAC_CALLOC(page_buf->page_fac))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTALLOC, FAIL, "memory allocation failed for page buffer entry") - - /* Create the new loaded PB entry */ - if(NULL == (page_entry = H5FL_CALLOC(H5PB_entry_t))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTALLOC, FAIL, "memory allocation failed") - - page_entry->page_buf_ptr = new_page_buf; - page_entry->addr = search_addr; - page_entry->type = (H5F_mem_page_t)type; - - /* Retrieve the 'eoa' for the file */ - if(HADDR_UNDEF == (eoa = H5F_get_eoa(f, type))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "driver get_eoa request failed") - - /* If the entire page falls outside the EOA, then fail */ - if(search_addr > eoa) - HGOTO_ERROR(H5E_PAGEBUF, H5E_BADVALUE, FAIL, "writing to a page that is outside the file EOA") - - /* Retrieve the 'eof' for the file - The MPI-VFD EOF - * returned will most likely be HADDR_UNDEF, so skip - * that check. 
+ HDassert( entry_ptr->is_mpmde ); + HDassert( pb_ptr->vfd_swmr_writer ); + + if ( size > entry_ptr->size ) { + + clipped_size = entry_ptr->size; + + } else { + + clipped_size = size; + } + + /* copy data from the page into read buffer */ + HDmemcpy((uint8_t *)buf, (uint8_t *)(entry_ptr->image_ptr), + clipped_size); + + /* if the entry is on the LRU, update the replacement + * policy */ - if(!H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) - if(HADDR_UNDEF == (eof = H5FD_get_eof(f->shared->lf, H5FD_MEM_DEFAULT))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "driver get_eof request failed") - - /* Adjust the read size to not go beyond the EOA */ - if(search_addr + page_size > eoa) - page_size = (size_t)(eoa - search_addr); - - if(search_addr < eof) { - if(H5FD_read(file, type, search_addr, page_size, new_page_buf) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "driver read request failed") - - /* Update statistics */ - if(type == H5FD_MEM_DRAW || type == H5FD_MEM_GHEAP) - page_buf->misses[1]++; - else - page_buf->misses[0]++; - } /* end if */ - } /* end else */ - - /* Copy the requested data from the page into the input buffer */ - HDmemcpy((uint8_t *)new_page_buf + offset, (const uint8_t *)buf+buf_offset, access_size); - - /* Page is dirty now */ - page_entry->is_dirty = TRUE; - - /* Insert page into PB, evicting other pages as necessary */ - if(H5PB__insert_entry(page_buf, page_entry) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTSET, FAIL, "error inserting new page in page buffer") - } /* end else */ - } /* end for */ - } /* end else */ + if ( ( ! (entry_ptr->is_mpmde) ) && + ( entry_ptr->delay_write_until == 0 ) ) { + + H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) + } + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, TRUE, TRUE, TRUE) + } + } + } else { /* case 10 */ + + /* If the read is for metadata, is page aligned, is no + * larger than a page, test to see if the page buffer + * contains a page at the target address. 
+ * + * If it doesn't, load the page and satisfy the read + * from it. + * + * If it contains a regular page entry, satisfy the read + * from it. + * + * If it contains a multi-page metadata entry at the target + * address, satisfy the read from the multi-page metadata + * entry if pb_ptr->vfd_swmr_writer is TRUE, and flag an + * error otherwise. + */ + HDassert( size <= pb_ptr->page_size ); + + /* get the containing page */ + H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL) + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \ + TRUE, FALSE) + + if ( ( NULL == entry_ptr ) && + ( H5PB__load_page(f, pb_ptr, page_addr, type, &entry_ptr) < 0)) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "page buffer page load request failed (2)") + + HDassert( entry_ptr ); + HDassert( entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC ); + HDassert( entry_ptr->is_metadata ); + HDassert( ( ! ( entry_ptr->is_mpmde ) ) || + ( pb_ptr->vfd_swmr_writer) ); + + /* copy data from the page into read buffer */ + HDmemcpy((uint8_t *)buf, (uint8_t *)(entry_ptr->image_ptr), size); + + /* if the entry is on the LRU, update the replacement policy */ + if ( ( ! (entry_ptr->is_mpmde) ) && + ( entry_ptr->delay_write_until == 0 ) ) { + + H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) + } + } + } + + /* remember this address so that case 8 can detect a repeated read of + * the same address on the next call. + */ + prev_addr = addr; done: + FUNC_LEAVE_NOAPI(ret_value) -} /* end H5PB_write() */ + +} /* end H5PB__read_meta() */ /*------------------------------------------------------------------------- - * Function: H5PB__insert_entry() * - * Purpose: ??? + * Function: H5PB__read_raw * - * This function was created without documentation. - * What follows is my best understanding of Mohamad's intent. + * Purpose: Satisfy a raw data read in cases 3 and 4 from H5PB_read(). + * Specifically: * - * Insert the supplied page into the page buffer, both the - * skip list and the LRU. 
+ * 3) If the read is for raw data, and it is of page size or + * larger, read it directly from the HDF5 file. * - * As best I can tell, this function imposes no limit on the - * number of entries in the page buffer beyond an assertion - * failure it the page count exceeds the limit. + * It is possible that the page buffer contains dirty pages + * that intersect with the read -- test for this and update + * the read buffer from the page buffer if any such pages + * exist. * - * JRM -- 12/22/16 + * Note that no pages are inserted into the page buffer in + * this case. * + * 4) If the read is for raw data, and it is of size less + * than the page size, satisfy the read from + * the page buffer, loading and inserting pages into the + * page buffer as necessary + * + * Observe that this implies that: + * + * 1) The page buffer is defined. + * + * 2) The page buffer has been configured to accept at least + * one page of raw data. + * + * 3) This is a raw data read. * * Return: Non-negative on success/Negative on failure * - * Programmer: Mohamad Chaarawi + * Programmer: John Mainzer -- 10/11/18 + * + * Changes: None. 
* *------------------------------------------------------------------------- */ -static herr_t -H5PB__insert_entry(H5PB_t *page_buf, H5PB_entry_t *page_entry) +static herr_t +H5PB__read_raw(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, + void *buf/*out*/) { - herr_t ret_value = SUCCEED; /* Return value */ + H5PB_t *pb_ptr; /* Page buffer for this file */ + H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */ + uint64_t first_page; /* page offset of first I/O */ + uint64_t last_page; /* page offset of last I/O */ + uint64_t search_page; /* page offset of current page */ + haddr_t first_page_addr; /* address of first page of I/O */ + haddr_t last_page_addr; /* address of last page of I/O */ + haddr_t search_addr; /* Address of current page */ + hsize_t num_touched_pages; /* Number of pages accessed */ + size_t offset; /* offset of read in page */ + size_t length; /* length of read in page */ + hsize_t i; /* Local index variable */ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) - FUNC_ENTER_STATIC + /* Sanity checks */ + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->pb_ptr); - /* Insert entry in skip list */ - if(H5SL_insert(page_buf->slist_ptr, page_entry, &(page_entry->addr)) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTINSERT, FAIL, "can't insert entry in skip list") - HDassert(H5SL_count(page_buf->slist_ptr) * page_buf->page_size <= page_buf->max_size); + pb_ptr = f->shared->pb_ptr; - /* Increment appropriate page count */ - if(H5F_MEM_PAGE_DRAW == page_entry->type || H5F_MEM_PAGE_GHEAP == page_entry->type) - page_buf->raw_count++; - else - page_buf->meta_count++; + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(pb_ptr->min_md_pages < pb_ptr->max_pages); + HDassert(H5FD_MEM_DRAW == type); - /* Insert entry in LRU */ - H5PB__INSERT_LRU(page_buf, page_entry) + + /* Calculate the aligned address of the first page */ + first_page = (addr / pb_ptr->page_size); + first_page_addr = first_page * 
pb_ptr->page_size; + + /* Calculate the aligned address of the last page */ + last_page = ((addr + size - 1) / pb_ptr->page_size); + last_page_addr = last_page * pb_ptr->page_size; + + /* Calculate number of pages that this read spans. */ + num_touched_pages = last_page - first_page + 1; + + if ( first_page_addr == last_page_addr ) { + + HDassert(1 == num_touched_pages); + last_page_addr = HADDR_UNDEF; + + } + + /* case 3) raw data read of page size or greater. */ + if ( size >= pb_ptr->page_size ) { + +#if VFD_IO + if ( H5FD_read(f->shared->lf, type, addr, size, buf) < 0) +#else /* VFD_IO */ + if ( H5F__accum_read(f, type, addr, size, buf) < 0 ) +#endif /* VFD_IO */ + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "read through metadata accumulator failed") + + + /* For each page that intersects with the above read, check to see + * if it exists in the page buffer, and if so, if it is dirty. + * + * If it does and is, update the read buffer with the contents + * of the page so we get the up to date data into the buffer + * after the big read from the file. + */ + search_page = first_page; + search_addr = first_page_addr; + + for(i = 0; i < num_touched_pages; i++) { + + H5PB__SEARCH_INDEX(pb_ptr, search_page, entry_ptr, FAIL) + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \ + FALSE, FALSE) + + if ( entry_ptr ) { + + HDassert( entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC ); + HDassert( ! ( entry_ptr->is_metadata ) ); + HDassert( entry_ptr->page == search_page ); + HDassert( entry_ptr->addr == search_addr ); + HDassert( entry_ptr->size == pb_ptr->page_size ); + HDassert( entry_ptr->delay_write_until == 0 ); + + /* the page must intersect [addr, addr + size). Note that + * the first and last pages may extend beyond the read, so + * the page need not be contained in it. + */ + HDassert( entry_ptr->addr < addr + size ); + HDassert( entry_ptr->addr + entry_ptr->size > addr ); + + if ( entry_ptr->is_dirty ) { + + if ( i == 0 ) { + + /* handle the possible partial access of the + * first page. 
+ */ + + HDassert( search_addr == first_page_addr ); + HDassert( search_page == first_page ); + + offset = addr - first_page_addr; + + HDassert((( offset == 0 ) && (search_addr == addr )) || + (( offset > 0 ) && ( search_addr < addr ))); + + HDassert(pb_ptr->page_size >= offset); + + HDassert( size >= pb_ptr->page_size - (size_t)offset ); + + HDmemcpy(buf, (uint8_t *)entry_ptr->image_ptr + offset, + pb_ptr->page_size - (size_t)offset); + + } else if ( i == num_touched_pages - 1 ) { + + /* handle the possible partial access of the + * last page. + */ + HDassert( i > 0 ); + HDassert( search_addr == last_page_addr ); + HDassert( search_page == last_page ); + HDassert( addr < last_page_addr ); + HDassert( last_page_addr < addr + size ); + + offset = (num_touched_pages - 2) * pb_ptr->page_size + + (pb_ptr->page_size - (addr - first_page_addr)); + + HDmemcpy((uint8_t *)buf + offset, entry_ptr->image_ptr, + (size_t)((addr + size) - last_page_addr)); + + } else { + + /* this is an internal page -- copy it in its + * entirety. + */ + + offset = (i - 1) * pb_ptr->page_size + + (pb_ptr->page_size - (addr - first_page_addr)); + + HDassert ( addr + offset == search_addr ); + HDassert ( offset + pb_ptr->page_size <= size ); + + HDmemcpy(((uint8_t *)(buf) + offset), + entry_ptr->image_ptr, + pb_ptr->page_size); + } + + /* we have touched the entry -- move it to the top + * of the LRU if it resides there. + * + * The entry will be on the LRU if both it is not + * a multi-page metadata entry and it is not + * subject to a delayed write. + * + * As this is a raw data page buffer entry, both of + * these must be true, and are asserted above. + * + * Thus, just update the LRU. + */ + H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) + + } /* if ( entry_ptr->is_dirty ) */ + } /* if ( entry_ptr ) */ + + search_page++; + search_addr += pb_ptr->page_size; + + } /* end for */ + } else { + /* case 4: Raw data read of size less than page size. 
+ * + * In this case, read the desired data from the page buffer, loading + * pages if necessary. + */ + HDassert(size < pb_ptr->page_size); + + /* first page */ + offset = addr - first_page_addr; + + /* length is the number of bytes to be copied from the first page. + * + * A read that ends exactly on the page boundary touches only one + * page, hence the "<=". + */ + if ( (offset + size) <= pb_ptr->page_size ) { + + HDassert(num_touched_pages == 1); + length = size; + + } else { + + /* the read spills into the second page -- take everything from + * offset to the end of the first page here; the remainder is + * copied from the second page below. + */ + HDassert(num_touched_pages == 2); + length = pb_ptr->page_size - offset; + } + + /* get the first page */ + H5PB__SEARCH_INDEX(pb_ptr, first_page, entry_ptr, FAIL) + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \ + FALSE, FALSE) + + if ( ( NULL == entry_ptr ) && + ( H5PB__load_page(f, pb_ptr, first_page_addr, + type, &entry_ptr) < 0 ) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "page buffer page load request failed (1)") + + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + HDassert(entry_ptr->addr == first_page_addr); + + + /* copy data from first page into read buffer */ + HDmemcpy((uint8_t *)buf, ((uint8_t *)(entry_ptr->image_ptr) + offset), + length); + + H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) + + /* second page, if it exists */ + if ( num_touched_pages == 2 ) { + + /* offset is now the position in the read buffer at which the + * data from the second page begins, and length the number of + * bytes still to be copied. 
+ */ + offset = length; + length = size - offset; + + HDassert(offset + length == size); + + /* get the second page */ + H5PB__SEARCH_INDEX(pb_ptr, last_page, entry_ptr, FAIL) + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \ + FALSE, FALSE) + + if ( ( NULL == entry_ptr ) && + ( H5PB__load_page(f, pb_ptr, last_page_addr, + type, &entry_ptr) < 0 ) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "page buffer page load request failed (2)") + + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + HDassert(entry_ptr->addr == last_page_addr); + HDassert(entry_ptr->page == last_page); + + /* copy data from second page into read buffer */ + HDmemcpy(((uint8_t *)(buf) + offset), + (uint8_t 
*)(entry_ptr->image_ptr), length); + + H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) + } + } /* end else */ done: + FUNC_LEAVE_NOAPI(ret_value) -} /* end H5PB__insert_entry() */ + +} /* end H5PB__read_raw() */ /*------------------------------------------------------------------------- - * Function: H5PB__make_space() * - * Purpose: ??? + * Function: H5PB__write_meta + * + * Purpose: Satisfy a metadata write in cases 7 and 8 from H5PB_write(). + * Specifically: + * + * 7) If the write is of metadata, the write is larger than + * one page, and vfd_swmr_writer is TRUE, the write must be + * buffered in the page buffer until the end of the tick. + * + * Create a multi-page metadata entry in the page buffer + * and copy the write into it. Insert the new entry in + * the tick list. + * + * Test to see if the write of the multi-page metadata + * entry must be delayed. If so, place the entry in + * the delayed write list. Otherwise, write the multi-page + * metadata entry to the HDF5 file. + * + * 8) If the write is of metadata, and the write is of size + * less than or equal to the page size, write the data + * into the page buffer, loading and inserting a page + * if necessary. + * + * If, in addition, vfd_swmr_writer is TRUE, we must: * - * This function was created without documentation. - * What follows is my best understanding of Mohamad's intent. + * * add the page touched by the write to the tick list + * so that it will be buffered until the end of the + * tick. * - * If necessary and if possible, evict a page from the page - * buffer to make space for the supplied page. Depending on - * the page buffer configuration and contents, and the page - * supplied this may or may not be possible. + * * test to see if the write must be delayed, and + * add the page to the delayed write list if so. * - * JRM -- 12/22/16 + * Observe that this implies that: + * + * 1) The page buffer is defined. 
+ * + * 2) The page buffer has been configured to accept at least + * one page of metadata. + * + * 3) This is a metadata read. + * + * Note also that if the metadata read is of size + * no larger than page size, it may not cross page + * boundaries. + * + * Further, for reads larger than page size (case 7 only), + * the base address must be page aligned. * * Return: Non-negative on success/Negative on failure * - * Programmer: Mohamad Chaarawi + * Programmer: John Mainzer -- 10/11/18 + * + * Changes: None. * *------------------------------------------------------------------------- */ -static htri_t -H5PB__make_space(H5F_t *f, H5PB_t *page_buf, H5FD_mem_t inserted_type) +static herr_t +H5PB__write_meta(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, + const void *buf/*out*/) { - H5PB_entry_t *page_entry; /* Pointer to page eviction candidate */ - htri_t ret_value = TRUE; /* Return value */ + H5PB_t *pb_ptr; /* Page buffer for this file */ + H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */ + uint64_t page; /* page offset of addr */ + haddr_t page_addr; /* page containg addr */ + size_t offset; /* offset of write in page */ + herr_t ret_value = SUCCEED; /* Return value */ - FUNC_ENTER_STATIC + FUNC_ENTER_NOAPI(FAIL) - /* Sanity check */ + /* Sanity checks */ HDassert(f); - HDassert(page_buf); - - /* Get oldest entry */ - page_entry = page_buf->LRU_tail_ptr; - - if(H5FD_MEM_DRAW == inserted_type) { - /* If threshould is 100% metadata and page buffer is full of - metadata, then we can't make space for raw data */ - if(0 == page_buf->raw_count && page_buf->min_meta_count == page_buf->meta_count) { - HDassert(page_buf->meta_count * page_buf->page_size == page_buf->max_size); - HGOTO_DONE(FALSE) - } /* end if */ - - /* check the metadata threshold before evicting metadata items */ - while(1) { - if(page_entry->prev && H5F_MEM_PAGE_META == page_entry->type && - page_buf->min_meta_count >= page_buf->meta_count) - page_entry = page_entry->prev; - else - 
break; - } /* end while */ - } /* end if */ - else { - /* If threshould is 100% raw data and page buffer is full of - raw data, then we can't make space for meta data */ - if(0 == page_buf->meta_count && page_buf->min_raw_count == page_buf->raw_count) { - HDassert(page_buf->raw_count * page_buf->page_size == page_buf->max_size); - HGOTO_DONE(FALSE) - } /* end if */ - - /* check the raw data threshold before evicting raw data items */ - while(1) { - if(page_entry->prev && (H5F_MEM_PAGE_DRAW == page_entry->type || H5F_MEM_PAGE_GHEAP == page_entry->type) && - page_buf->min_raw_count >= page_buf->raw_count) - page_entry = page_entry->prev; - else - break; - } /* end while */ - } /* end else */ + HDassert(f->shared); + HDassert(f->shared->pb_ptr); - /* Remove from page index */ - if(NULL == H5SL_remove(page_buf->slist_ptr, &(page_entry->addr))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_BADVALUE, FAIL, "Tail Page Entry is not in skip list") + pb_ptr = f->shared->pb_ptr; - /* Remove entry from LRU list */ - H5PB__REMOVE_LRU(page_buf, page_entry) - HDassert(H5SL_count(page_buf->slist_ptr) == page_buf->LRU_list_len); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(pb_ptr->min_rd_pages < pb_ptr->max_pages); + HDassert(H5FD_MEM_DRAW != type); + HDassert(buf); - /* Decrement appropriate page type counter */ - if(H5F_MEM_PAGE_DRAW == page_entry->type || H5F_MEM_PAGE_GHEAP == page_entry->type) - page_buf->raw_count--; - else - page_buf->meta_count--; + /* Calculate the aligned address of the first page */ + page = (addr / pb_ptr->page_size); + page_addr = page * pb_ptr->page_size; + + /* if size > pb_ptr->page_size, addr must be page aligned */ + HDassert((size <= pb_ptr->page_size) || (addr == page_addr)); + + + /* case 7) metadata write of size greater than page size. */ + if ( size > pb_ptr->page_size ) { + + /* The write must be for a multi-page metadata entry, and + * we must be running as a VFD SWMR writer.
+ * + * This requires the following actions: + * + * 1) If the multi-page metadata entry is not already in the + * page buffer, create an entry for it. + * + * 2) Overwrite the image of the entry with the write buffer. + * + * 3) If the entry is not already on the tick list, add it to + * the tick list. + * + * 4) If the entry is not already on the delayed write list, + * test to see if it should be, and move it from the + * LRU to the delayed write list and set the delay_write_until + * field appropriately. + */ + HDassert(pb_ptr->vfd_swmr_writer); + HDassert(addr == page_addr); - /* Flush page if dirty */ - if(page_entry->is_dirty) - if(H5PB__write_entry(f, page_entry) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, "file write failed") + H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL) + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \ + TRUE, TRUE) + + if ( NULL == entry_ptr ) { + + /* the multi-page metadata entry is not currently in the page + * buffer. Create an entry for it, and insert it into the LRU. + * + * Don't bother to try to make space for it, as VFD SWMR + * ignores the limits on page buffer size. + */ + if ( H5PB__create_new_page(pb_ptr, addr, size, type, + FALSE, &entry_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "can't create new page buffer page") + } + + /* at this point, one way or the other, the multi-page metadata + * entry must be in the page buffer. + */ + HDassert(entry_ptr->is_metadata); + HDassert(entry_ptr->is_mpmde); + HDassert(size == entry_ptr->size); + HDassert(type == entry_ptr->mem_type); + + /* overwrite the entry image with the write buffer */ + HDmemcpy((uint8_t *)(entry_ptr->image_ptr), buf, size); + + /* mark the entry dirty */ + if ( H5PB__mark_entry_dirty(pb_ptr, entry_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "mark entry dirty failed (1)") + + + /* insert in tick list if not there already */ + if ( !
( entry_ptr->modified_this_tick ) ) { + + H5PB__INSERT_IN_TL(pb_ptr, entry_ptr, FAIL) + } + + /* Test to see if we must delay the write of the multi-page + * metadata entry, and move it from the LRU to the delayed write + * list if so. + */ + + /* Write function for this -- assert false for now */ + HDassert(FALSE); + + } else { + /* case 8) metadata write of size no larger than page size */ + + offset = addr - page_addr; + + /* write cannot cross page boundaries. */ + HDassert((offset + size) <= pb_ptr->page_size); + + /* get the containing page */ + H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL) + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \ + TRUE, FALSE) + +#if 1 /* JRM */ + /* Since the space allocation code doesn't always tell the page + * buffer when a page is freed, it is possible that the page + * found by the index search is an ophaned raw data page. + * + * Until this is fixed, test to see entry_ptr points to + * a raw data page, and force its eviction if it does. + * + * Remove this code as soon as the space allocation code is + * updated to tell the page buffer to discard pages when + * they are freed. + */ + if ( ( entry_ptr ) && ( ! 
( entry_ptr->is_metadata ) ) ) { + + if ( H5PB__evict_entry(pb_ptr, entry_ptr, TRUE) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "forced eviction failed") - /* Update statistics */ - if(page_entry->type == H5F_MEM_PAGE_DRAW || H5F_MEM_PAGE_GHEAP == page_entry->type) - page_buf->evictions[1]++; - else - page_buf->evictions[0]++; + entry_ptr = NULL; + } +#endif /* JRM */ - /* Release page */ - page_entry->page_buf_ptr = H5FL_FAC_FREE(page_buf->page_fac, page_entry->page_buf_ptr); - page_entry = H5FL_FREE(H5PB_entry_t, page_entry); + if ( ( NULL == entry_ptr ) && + ( H5PB__load_page(f, pb_ptr, page_addr, type, &entry_ptr) < 0 ) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "page buffer page load request failed (1)") + + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + HDassert(entry_ptr->addr == page_addr); + HDassert(entry_ptr->is_metadata); + HDassert(!(entry_ptr->is_mpmde)); + HDassert(entry_ptr->size == pb_ptr->page_size); + HDassert(size <= entry_ptr->size); + + /* copy data from the write buffer into the page image */ + HDmemcpy(((uint8_t *)(entry_ptr->image_ptr) + offset), + (const uint8_t *)buf, size); + + if ( H5PB__mark_entry_dirty(pb_ptr, entry_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "mark entry dirty failed (2)") + + if ( pb_ptr->vfd_swmr_writer ) { + + /* test to see if the entry is on the tick list, and insert + * it if it is not. This will force the page buffer to retain + * the page until the end of the tick. + */ + if ( ! ( entry_ptr->modified_this_tick ) ) { + + H5PB__INSERT_IN_TL(pb_ptr, entry_ptr, FAIL) + } + + /* Test to see if we must delay the write of the multi-page + * metadata entry, and move it from the LRU to the delayed write + * list if so. 
+ */ + + /* Write function for this -- assert false for now */ + HDassert(FALSE); + } + } done: + FUNC_LEAVE_NOAPI(ret_value) -} /* end H5PB__make_space() */ + +} /* end H5PB__write_meta() */ /*------------------------------------------------------------------------- - * Function: H5PB__write_entry() * - * Purpose: ??? + * Function: H5PB__write_raw + * + * Purpose: Satisfy a raw data write in cases 3 and 4 from H5PB_write(). + * Specifically: + * + * 3) If the write is raw data, and it is of page size or + * larger, write directly to the HDF5 file. + * + * It is possible that the write intersects one or more + * pages in the page buffer -- test for this and update + * any partially written pages, and evict any pages + * that are completely overwritten. + * + * Note that no pages are inserted into the page buffer in + * this case. * - * This function was created without documentation. - * What follows is my best understanding of Mohamad's intent. + * 4) If the write is of raw data, and it is of size less + * than the page size, write the data into the page + * buffer, loading and inserting pages into the + * page buffer as necessary * + * Observe that this implies that: + * + * 1) The page buffer is defined. + * + * 2) The page buffer has been configured to accept at least + * one page of raw data. + * + * 3) This is a raw data write. * * Return: Non-negative on success/Negative on failure * - * Programmer: Mohamad Chaarawi + * Programmer: John Mainzer -- 10/11/18 + * + * Changes: None.
* *------------------------------------------------------------------------- */ static herr_t -H5PB__write_entry(H5F_t *f, H5PB_entry_t *page_entry) +H5PB__write_raw(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, + const void *buf/*in*/) { - haddr_t eoa; /* Current EOA for the file */ - herr_t ret_value = SUCCEED; /* Return value */ + H5PB_t *pb_ptr; /* Page buffer for this file */ + H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */ + uint64_t first_page; /* page offset of first I/O */ + uint64_t last_page; /* page offset of last I/O */ + uint64_t search_page; /* page offset of current page */ + haddr_t first_page_addr; /* address of first page of I/O */ + haddr_t last_page_addr; /* address of last page of I/O */ + haddr_t search_addr; /* Address of current page */ + hsize_t num_touched_pages; /* Number of pages accessed */ + hsize_t i; /* Local index variable */ + size_t length; /* length of write in a page */ + size_t offset; /* offset of write in a page */ + herr_t ret_value = SUCCEED; /* Return value */ - FUNC_ENTER_STATIC + FUNC_ENTER_NOAPI(FAIL) - /* Sanity check */ + /* Sanity checks */ HDassert(f); - HDassert(page_entry); + HDassert(f->shared); + HDassert(f->shared->pb_ptr); - /* Retrieve the 'eoa' for the file */ - if(HADDR_UNDEF == (eoa = H5F_get_eoa(f, (H5FD_mem_t)page_entry->type))) - HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "driver get_eoa request failed") + pb_ptr = f->shared->pb_ptr; - /* If the starting address of the page is larger than - * the EOA, then the entire page is discarded without writing.
- */ - if(page_entry->addr <= eoa) { - H5FD_t *file; /* File driver I/O info */ - size_t page_size = f->shared->page_buf->page_size; + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(pb_ptr->min_md_pages < pb_ptr->max_pages); + HDassert(f->shared->lf); - /* Adjust the page length if it exceeds the EOA */ - if((page_entry->addr + page_size) > eoa) - page_size = (size_t)(eoa - page_entry->addr); + HDassert(H5FD_MEM_DRAW == type); - /* Translate to file driver I/O info object */ - file = f->shared->lf; + /* Calculate the aligned address of the first page */ + first_page = (addr / pb_ptr->page_size); + first_page_addr = first_page * pb_ptr->page_size; - if(H5FD_write(file, (H5FD_mem_t)page_entry->type, page_entry->addr, page_size, page_entry->page_buf_ptr) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, "file write failed") - } /* end if */ + /* Calculate the aligned address of the last page */ + last_page = ((addr + size - 1) / pb_ptr->page_size); + last_page_addr = last_page * pb_ptr->page_size; + + /* Calculate number of pages that this write spans. */ + num_touched_pages = last_page - first_page + 1; + + if ( first_page_addr == last_page_addr ) { + + HDassert(1 == num_touched_pages); + last_page_addr = HADDR_UNDEF; + + } + + /* case 3) raw data write of page size or greater. */ + if ( size >= pb_ptr->page_size ) { +#if VFD_IO + if ( H5FD_write(f->shared->lf, type, addr, size, buf) < 0 ) +#else /* VFD_IO */ + if ( H5F__accum_write(f, type, addr, size, buf) < 0 ) +#endif /* VFD_IO */ + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "write through metadata accumulator failed") + + /* For each page that intersects with the above write, check to see + * if it exists in the page buffer. + * + * If it does, and if the write overwrites the page fully, + * mark the page clean and evict it. + * + * If the write only partially intersects a page, update the + * page and mark it dirty.
+ */ + search_page = first_page; + search_addr = first_page_addr; + + for(i = 0; i < num_touched_pages; i++) { + + H5PB__SEARCH_INDEX(pb_ptr, search_page, entry_ptr, FAIL) + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \ + FALSE, FALSE) + +#if 1 /* JRM */ + /* Since the space allocation code doesn't always tell the page + * buffer when a page is freed, it is possible that the page + * found by the index search is an ophaned metadata page. + * + * Until this is fixed, test to see entry_ptr points to + * a metadata page, and force its eviction if it does. + * + * Remove this code as soon as the space allocation code is + * updated to tell the page buffer to discard pages when + * they are freed. + */ + if ( ( entry_ptr ) && ( entry_ptr->is_metadata ) ) { + + if ( H5PB__evict_entry(pb_ptr, entry_ptr, TRUE) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "forced eviction failed") + + entry_ptr = NULL; + } +#endif /* JRM */ + + if ( entry_ptr ) { + + HDassert( entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC ); + HDassert( ! ( entry_ptr->is_metadata ) ); + HDassert( entry_ptr->page == search_page ); + HDassert( entry_ptr->addr == search_addr ); + HDassert( entry_ptr->size == pb_ptr->page_size ); + HDassert( entry_ptr->delay_write_until == 0 ); + HDassert( entry_ptr->addr <= addr + size ); + + if ( ( addr <= entry_ptr->addr ) && + ( entry_ptr->addr + entry_ptr->size <= addr + size ) ) { + + /* the page is completely overwritten -- mark it clean + * and evict it. + */ + if ( ( entry_ptr->is_dirty ) && + ( H5PB__mark_entry_clean(pb_ptr, entry_ptr) < 0 ) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "mark entry clean failed") + + if ( H5PB__evict_entry(pb_ptr, entry_ptr, TRUE) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "forced eviction failed (1)") + + } else if ( i == 0 ) { + + /* handle partial overwrite of the first page. 
*/ + + HDassert( search_addr == first_page_addr ); + HDassert( search_page == first_page ); + HDassert( search_addr < addr ); + HDassert( entry_ptr->addr + entry_ptr->size <= + addr + size ); + + offset = addr - first_page_addr; + + HDassert( offset > 0 ); + HDassert( pb_ptr->page_size >= offset ); + HDassert( size >= pb_ptr->page_size - (size_t)offset ); + + HDmemcpy((uint8_t *)entry_ptr->image_ptr + offset, buf, + pb_ptr->page_size - (size_t)offset); + + if ( H5PB__mark_entry_dirty(pb_ptr, entry_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "mark entry dirty failed (1)") + + } else if ( i == num_touched_pages - 1 ) { + + /* handle partial overwrite of the last page. */ + HDassert( i > 0 ); + HDassert( search_addr == last_page_addr ); + HDassert( search_page == last_page ); + HDassert( addr < last_page_addr ); + HDassert( last_page_addr < addr + size ); + + offset = (num_touched_pages - 2) * pb_ptr->page_size + + (pb_ptr->page_size - (addr - first_page_addr)); + + HDmemcpy(entry_ptr->image_ptr, + (const uint8_t *)buf + offset, + (size_t)((addr + size) - last_page_addr)); + + if ( H5PB__mark_entry_dirty(pb_ptr, entry_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "mark entry dirty failed (2)") + } else { - page_entry->is_dirty = FALSE; + /* this should be un-reachable */ + HDassert(FALSE); + + } + } /* if ( entry_ptr ) */ + + search_page++; + search_addr += pb_ptr->page_size; + + } /* end for */ + } else { + /* case 4: Raw data write of size less than page size. + * + * In this case, write the data to the page buffer, loading + * pages if necessary. 
+ */ + HDassert(size < pb_ptr->page_size); + + /* first page */ + offset = addr - first_page_addr; + + if ( (offset + size) <= pb_ptr->page_size ) { + + HDassert(num_touched_pages == 1); + length = size; + + } else { + + HDassert(num_touched_pages == 2); + length = pb_ptr->page_size - offset; + HDassert( offset + length == pb_ptr->page_size ); + } + + /* get the first page */ + H5PB__SEARCH_INDEX(pb_ptr, first_page, entry_ptr, FAIL) + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \ + FALSE, FALSE) + + if ( ( NULL == entry_ptr ) && + ( H5PB__load_page(f, pb_ptr, first_page_addr, + type, &entry_ptr) < 0 ) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "page buffer page load request failed (1)") + + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + HDassert(entry_ptr->addr == first_page_addr); + + + /* copy data from the write buffer into the first page */ + HDmemcpy(((uint8_t *)(entry_ptr->image_ptr)) + offset, + (const uint8_t *)buf, length); + + if ( H5PB__mark_entry_dirty(pb_ptr, entry_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "mark entry dirty failed (3)") + + /* second page, if it exists */ + if ( num_touched_pages == 2 ) { + + offset = length; + length = size - offset; + + HDassert(offset + length == size); + + /* get the first page */ + H5PB__SEARCH_INDEX(pb_ptr, last_page, entry_ptr, FAIL) + + /* update hit rate stats */ + H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \ + FALSE, FALSE) + + if ( ( NULL == entry_ptr ) && + ( H5PB__load_page(f, pb_ptr, last_page_addr, + type, &entry_ptr) < 0 ) ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "page buffer page load request failed (2)") + + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + HDassert(entry_ptr->addr == last_page_addr); + HDassert(entry_ptr->page == last_page); + + /* copy data from the write buffer into the first page */ + HDmemcpy((uint8_t 
*)(entry_ptr->image_ptr), + ((const uint8_t *)(buf) + offset), length); + + if ( H5PB__mark_entry_dirty(pb_ptr, entry_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "mark entry dirty failed (3)") + } + } done: + FUNC_LEAVE_NOAPI(ret_value) -} /* end H5PB__write_entry() */ + +} /* end H5PB__write_raw() */ diff --git a/src/H5PBpkg.h b/src/H5PBpkg.h index 6b9168b..e71396a 100644 --- a/src/H5PBpkg.h +++ b/src/H5PBpkg.h @@ -21,38 +21,1504 @@ /* Get package's private header */ #include "H5PBprivate.h" -/* Other private headers needed by this file */ +/* + * File: H5PBpkg.h + * + * Purpose: This file contains declarations which are normally visible + * only within the H5PB package. + * + * Source files outside the H5PB package should include + * H5PBprivate.h instead. + * + * Programmer: John Mainzer -- 10/07/18 + */ /**************************/ /* Package Private Macros */ /**************************/ +/* page buffer configuration settings */ +#define H5PB__H5PB_ENTRY_T_MAGIC 0x02030405 +#define H5PB__DO_SANITY_CHECKS TRUE +#define H5PB__COLLECT_PAGE_BUFFER_STATS TRUE + + +/**************************************************************************** + * + * We maintain doubly linked lists of instances of H5PB_entry_t for a + * variety of reasons -- LRU list, tick list, and the delayed write list + * at present. The following macros support linking and unlinking + * of instances of H5PB_entry_t by both their regular and tick list next + * and previous pointers. Note that the tick list is only used in the + * context of VFD SWMR + * + * The size and length fields are also maintained. + * + * Note that the relevant pair of prev and next pointers are presumed to be + * NULL on entry in the insertion macros. + * + * Finally, observe that the sanity checking macros evaluate to the empty + * string when H5PB__DO_SANITY_CHECKS is FALSE. They also contain calls + * to the HGOTO_ERROR macro, which may not be appropriate in all cases. 
+ * If so, we will need versions of the insertion and deletion macros which + * do not reference the sanity checking macros. + * JRM - 10/07/18 + * + ****************************************************************************/ + +#if H5PB__DO_SANITY_CHECKS + +#define H5PB__DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ +if ( ( (head_ptr) == NULL ) || \ + ( (tail_ptr) == NULL ) || \ + ( (entry_ptr) == NULL ) || \ + ( (len) <= 0 ) || \ + ( (size_t)(Size) < (entry_ptr)->size ) || \ + ( ( (entry_ptr)->prev == NULL ) && ( (head_ptr) != (entry_ptr) ) ) || \ + ( ( (entry_ptr)->next == NULL ) && ( (tail_ptr) != (entry_ptr) ) ) || \ + ( ( (len) == 1 ) && \ + ( ! ( ( (head_ptr) == (entry_ptr) ) && \ + ( (tail_ptr) == (entry_ptr) ) && \ + ( (entry_ptr)->next == NULL ) && \ + ( (entry_ptr)->prev == NULL ) && \ + ( (Size) == (int64_t)((entry_ptr)->size) ) \ + ) \ + ) \ + ) \ + ) { \ + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "DLL pre remove SC failed") \ +} + +#define H5PB__DLL_SC(head_ptr, tail_ptr, len, Size, fv) \ +if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ + ( (head_ptr) != (tail_ptr) ) \ + ) || \ + ( (len) < 0 ) || \ + ( (Size) < 0 ) || \ + ( ( (len) == 1 ) && \ + ( ( (head_ptr) != (tail_ptr) ) || \ + ( (head_ptr) == NULL ) || ( (head_ptr)->size != (size_t)(Size) ) \ + ) \ + ) || \ + ( ( (len) >= 1 ) && \ + ( ( (head_ptr) == NULL ) || ( (head_ptr)->prev != NULL ) || \ + ( (tail_ptr) == NULL ) || ( (tail_ptr)->next != NULL ) \ + ) \ + ) \ + ) { \ + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "DLL sanity check failed") \ +} + +#define H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ +if ( ( (entry_ptr) == NULL ) || \ + ( (entry_ptr)->next != NULL ) || \ + ( (entry_ptr)->prev != NULL ) || \ + ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ + ( (head_ptr) != (tail_ptr) ) \ + ) || \ + ( ( (len) == 1 ) && \ + ( ( (head_ptr) != (tail_ptr) ) || \ + ( (head_ptr) == NULL ) || ( (head_ptr)->size != 
(size_t)(Size) ) \ + ) \ + ) || \ + ( ( (len) >= 1 ) && \ + ( ( (head_ptr) == NULL ) || ( (head_ptr)->prev != NULL ) || \ + ( (tail_ptr) == NULL ) || ( (tail_ptr)->next != NULL ) \ + ) \ + ) \ + ) { \ + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "DLL pre insert SC failed") \ +} + +#else /* H5PB__DO_SANITY_CHECKS */ + +#define H5PB__DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) +#define H5PB__DLL_SC(head_ptr, tail_ptr, len, Size, fv) +#define H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) + +#endif /* H5PB__DO_SANITY_CHECKS */ + + +#define H5PB__DLL_APPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val) \ +{ \ + H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \ + fail_val) \ + if ( (head_ptr) == NULL ) \ + { \ + (head_ptr) = (entry_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + else \ + { \ + (tail_ptr)->next = (entry_ptr); \ + (entry_ptr)->prev = (tail_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + (len)++; \ + (Size) += (int64_t)((entry_ptr)->size); \ +} /* H5PB__DLL_APPEND() */ + +#define H5PB__DLL_PREPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val) \ +{ \ + H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \ + fail_val) \ + if ( (head_ptr) == NULL ) \ + { \ + (head_ptr) = (entry_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + else \ + { \ + (head_ptr)->prev = (entry_ptr); \ + (entry_ptr)->next = (head_ptr); \ + (head_ptr) = (entry_ptr); \ + } \ + (len)++; \ + (Size) += (int64_t)((entry_ptr)->size); \ +} /* H5PB__DLL_PREPEND() */ + +#define H5PB__DLL_INSERT_BEFORE(entry_ptr, suc_ptr, head_ptr, tail_ptr, len, \ + Size, fail_val) \ +{ \ + HDassert( ((suc_ptr) == NULL) || \ + ((suc_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC) ); \ + \ + if ( suc_ptr == NULL ) \ + /* list empty or no successor -- append */ \ + H5PB__DLL_APPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val) \ + \ + else if ( suc_ptr->prev == NULL ) \ + /* successor at head of list -- prepend */ \ + 
H5PB__DLL_PREPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val) \ + \ + else /* sucessor in body of list -- insert before it */ \ + { \ + H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \ + fail_val) \ + HDassert(suc_ptr->prev->magic == H5PB__H5PB_ENTRY_T_MAGIC); \ + HDassert(suc_ptr->prev->next == suc_ptr); \ + entry_ptr->prev = suc_ptr->prev; \ + entry_ptr->prev->next = entry_ptr; \ + entry_ptr->next = suc_ptr; \ + suc_ptr->prev = entry_ptr; \ + (len)++; \ + (Size) += (int64_t)((entry_ptr)->size); \ + } \ +} /* H5PB__DLL_INSERT_BEFORE() */ + +#define H5PB__DLL_REMOVE(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val) \ +{ \ + H5PB__DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \ + fail_val) \ + { \ + if ( (head_ptr) == (entry_ptr) ) \ + { \ + (head_ptr) = (entry_ptr)->next; \ + if ( (head_ptr) != NULL ) \ + (head_ptr)->prev = NULL; \ + } \ + else \ + (entry_ptr)->prev->next = (entry_ptr)->next; \ + if ( (tail_ptr) == (entry_ptr) ) \ + { \ + (tail_ptr) = (entry_ptr)->prev; \ + if ( (tail_ptr) != NULL ) \ + (tail_ptr)->next = NULL; \ + } \ + else \ + (entry_ptr)->next->prev = (entry_ptr)->prev; \ + entry_ptr->next = NULL; \ + entry_ptr->prev = NULL; \ + (len)--; \ + (Size) -= (int64_t)((entry_ptr)->size); \ + } \ +} /* H5PB__DLL_REMOVE() */ + + +#if H5PB__DO_SANITY_CHECKS + +#define H5PB__TL_DLL_PRE_REMOVE_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \ +if ( ( (hd_ptr) == NULL ) || \ + ( (tail_ptr) == NULL ) || \ + ( (entry_ptr) == NULL ) || \ + ( (len) <= 0 ) || \ + ( (Size) < (entry_ptr)->size ) || \ + ( ( (Size) == (entry_ptr)->size ) && ( ! ( (len) == 1 ) ) ) || \ + ( ( (entry_ptr)->tl_prev == NULL ) && ( (hd_ptr) != (entry_ptr) ) ) || \ + ( ( (entry_ptr)->tl_next == NULL ) && ( (tail_ptr) != (entry_ptr) ) ) || \ + ( ( (len) == 1 ) && \ + ( ! 
( ( (hd_ptr) == (entry_ptr) ) && ( (tail_ptr) == (entry_ptr) ) && \ + ( (entry_ptr)->tl_next == NULL ) && /* a sole entry has no */ \ + ( (entry_ptr)->tl_prev == NULL ) && /* tick list neighbours */ \ + ( (Size) == (entry_ptr)->size ) \ + ) \ + ) \ + ) \ + ) { \ + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "TL DLL pre remove SC failed") \ +} + +/* Whole-list sanity check for a list linked through tl_next / tl_prev: fails via HGOTO_ERROR if exactly one of head / tail is NULL, if len or Size is negative, if a one-entry list has head != tail or a Size that is non-positive or differs from the head's size, or if a non-empty list has a head with a tl_prev or a tail with a tl_next. */ +#define H5PB__TL_DLL_SC(head_ptr, tail_ptr, len, Size, fv) \ +if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ + ( (head_ptr) != (tail_ptr) ) \ + ) || \ + ( (len) < 0 ) || \ + ( (Size) < 0 ) || \ + ( ( (len) == 1 ) && \ + ( ( (head_ptr) != (tail_ptr) ) || ( (Size) <= 0 ) || \ + ( (head_ptr) == NULL ) || ( (head_ptr)->size != (Size) ) \ + ) \ + ) || \ + ( ( (len) >= 1 ) && \ + ( ( (head_ptr) == NULL ) || ( (head_ptr)->tl_prev != NULL ) || \ + ( (tail_ptr) == NULL ) || ( (tail_ptr)->tl_next != NULL ) \ + ) \ + ) \ + ) { \ + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "TL DLL sanity check failed") \ +} + +/* Sanity check run before linking entry_ptr through tl_next / tl_prev: the entry must be non-NULL with both tick list links NULL, and the target list must pass the same head / tail / length consistency tests as above. */ +#define H5PB__TL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \ +if ( ( (entry_ptr) == NULL ) || \ + ( (entry_ptr)->tl_next != NULL ) || \ + ( (entry_ptr)->tl_prev != NULL ) || \ + ( ( ( (hd_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ + ( (hd_ptr) != (tail_ptr) ) \ + ) || \ + ( ( (len) == 1 ) && \ + ( ( (hd_ptr) != (tail_ptr) ) || ( (Size) <= 0 ) || \ + ( (hd_ptr) == NULL ) || ( (int64_t)((hd_ptr)->size) != (Size) ) \ + ) \ + ) || \ + ( ( (len) >= 1 ) && \ + ( ( (hd_ptr) == NULL ) || ( (hd_ptr)->tl_prev != NULL ) || \ + ( (tail_ptr) == NULL ) || ( (tail_ptr)->tl_next != NULL ) \ + ) \ + ) \ + ) { \ + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "TL DLL pre insert SC failed") \ +} + +#else /* H5PB__DO_SANITY_CHECKS */ + +#define H5PB__TL_DLL_PRE_REMOVE_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) +#define H5PB__TL_DLL_SC(head_ptr, tail_ptr, len, Size, fv) +#define H5PB__TL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) + +#endif /* H5PB__DO_SANITY_CHECKS */ + + +#define H5PB__TL_DLL_APPEND(entry_ptr, head_ptr, tail_ptr, len, Size,
fail_val)\ +{ \ + H5PB__TL_DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \ + fail_val) \ + if ( (head_ptr) == NULL ) \ + { \ + (head_ptr) = (entry_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + else \ + { \ + (tail_ptr)->tl_next = (entry_ptr); \ + (entry_ptr)->tl_prev = (tail_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + (len)++; \ + (Size) += entry_ptr->size; \ +} /* H5PB__AUX_DLL_APPEND() */ + +#define H5PB__TL_DLL_PREPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ +{ \ + H5PB__TL_DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ + if ( (head_ptr) == NULL ) \ + { \ + (head_ptr) = (entry_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + else \ + { \ + (head_ptr)->tl_prev = (entry_ptr); \ + (entry_ptr)->tl_next = (head_ptr); \ + (head_ptr) = (entry_ptr); \ + } \ + (len)++; \ + (Size) += (int64_t)(entry_ptr->size); \ +} /* H5PB__TL_DLL_PREPEND() */ + +#define H5PB__TL_DLL_REMOVE(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ +{ \ + H5PB__TL_DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ + { \ + if ( (head_ptr) == (entry_ptr) ) \ + { \ + (head_ptr) = (entry_ptr)->tl_next; \ + if ( (head_ptr) != NULL ) \ + (head_ptr)->tl_prev = NULL; \ + } \ + else \ + (entry_ptr)->tl_prev->tl_next = (entry_ptr)->tl_next; \ + if ( (tail_ptr) == (entry_ptr) ) \ + { \ + (tail_ptr) = (entry_ptr)->tl_prev; \ + if ( (tail_ptr) != NULL ) \ + (tail_ptr)->tl_next = NULL; \ + } \ + else \ + (entry_ptr)->tl_next->tl_prev = (entry_ptr)->tl_prev; \ + entry_ptr->tl_next = NULL; \ + entry_ptr->tl_prev = NULL; \ + (len)--; \ + (Size) -= entry_ptr->size; \ + } \ +} /* H5PB__TL_DLL_REMOVE() */ + + +/*********************************************************************** + * + * Stats collection macros + * + * The following macros must handle stats collection when this collection + * is enabled, and evaluate to the empty string when it is not. 
+ * + * The sole exception to this rule is + * H5PB__UPDATE_PB_HIT_RATE_STATS(), which is always active as + * the page buffer hit rate stats are always collected and available. + * + ***********************************************************************/ + +#if H5PB__COLLECT_PAGE_BUFFER_STATS + +#define H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, hit, is_metadata, is_mpmde) \ +{ \ + int ii; \ + \ + HDassert(pb_ptr); \ + HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \ + \ + if ( is_metadata ) { \ + if ( is_mpmde ) { \ + ii = H5PB__STATS_MPMDE; \ + } else { \ + ii = H5PB__STATS_MD; \ + } \ + } else { \ + ii = H5PB__STATS_RD; \ + } \ + if ( hit ) \ + ((pb_ptr)->hits[ii])++; \ + else \ + ((pb_ptr)->misses[ii])++; \ +} /* H5PB__UPDATE_PB_HIT_RATE_STATS */ + +/* Raise each high-water mark (index length, index size, metadata pages, raw data pages, multi-page metadata entry count) whenever the matching current value exceeds it. */ +#define H5PB__UPDATE_HT_SIZE_STATS(pb_ptr) \ + if ( (pb_ptr)->index_len > (pb_ptr)->max_index_len ) \ + (pb_ptr)->max_index_len = (pb_ptr)->index_len; \ + if ( (pb_ptr)->index_size > (pb_ptr)->max_index_size ) \ + (pb_ptr)->max_index_size = (pb_ptr)->index_size; \ + if ( (pb_ptr)->curr_md_pages > (pb_ptr)->max_md_pages ) \ + (pb_ptr)->max_md_pages = (pb_ptr)->curr_md_pages; \ + if ( (pb_ptr)->curr_rd_pages > (pb_ptr)->max_rd_pages ) \ + (pb_ptr)->max_rd_pages = (pb_ptr)->curr_rd_pages; \ + if ( (pb_ptr)->mpmde_count > (pb_ptr)->max_mpmde_count ) \ + (pb_ptr)->max_mpmde_count = (pb_ptr)->mpmde_count; + +#define H5PB__UPDATE_STATS_FOR_HT_INSERTION(pb_ptr) \ + ((pb_ptr)->total_ht_insertions)++; + + +#define H5PB__UPDATE_STATS_FOR_HT_DELETION(pb_ptr) \ + (pb_ptr)->total_ht_deletions++; + +#define H5PB__UPDATE_STATS_FOR_HT_SEARCH(pb_ptr, success, depth) \ + HDassert((depth) >= 0); \ + if ( success ) { \ + (pb_ptr)->successful_ht_searches++; \ + (pb_ptr)->total_successful_ht_search_depth += (int64_t)(depth); \ + } else { \ + (pb_ptr)->failed_ht_searches++; \ + (pb_ptr)->total_failed_ht_search_depth += (int64_t)(depth); \ + } + +#define H5PB__UPDATE_LRU_SIZE_STATS(pb_ptr) \ + if ( (pb_ptr)->LRU_len > (pb_ptr)->max_lru_len ) \ +
(pb_ptr)->max_lru_len = (pb_ptr)->LRU_len; \ + if ( (pb_ptr)->LRU_size > (pb_ptr)->max_lru_size ) \ + (pb_ptr)->max_lru_size = (pb_ptr)->LRU_size; + +#define H5PB__UPDATE_STATS_FOR_LRU_MD_SKIP(pb_ptr) \ + ((pb_ptr)->lru_md_skips)++; + +#define H5PB__UPDATE_STATS_FOR_LRU_RD_SKIP(pb_ptr) \ + ((pb_ptr)->lru_rd_skips)++; + +#define H5PB__UPDATE_STATS_FOR_LRU_TL_SKIP(pb_ptr) \ +{ \ + HDassert((pb_ptr)->vfd_swmr_writer); \ + ((pb_ptr)->lru_tl_skips)++; \ +} + +#define H5PB__UPDATE_STATS_FOR_LRU_DWL_SKIP(pb_ptr) \ +{ \ + HDassert((pb_ptr)->vfd_swmr_writer); \ + ((pb_ptr)->lru_dwl_skips)++; \ +} + +#define H5PB__UPDATE_TL_SIZE_STATS(pb_ptr) \ +{ \ + HDassert((pb_ptr)->vfd_swmr_writer); \ + if ( (pb_ptr)->tl_len > (pb_ptr)->max_tl_len ) \ + (pb_ptr)->max_tl_len = (pb_ptr)->tl_len; \ + if ( (pb_ptr)->tl_size > (pb_ptr)->max_tl_size ) \ + (pb_ptr)->max_tl_size = (pb_ptr)->tl_size; \ +} + +#define H5PB__UPDATE_DWL_SIZE_STATS(pb_ptr) \ +{ \ + HDassert((pb_ptr)->vfd_swmr_writer); \ + if ( (pb_ptr)->dwl_len > (pb_ptr)->max_dwl_len ) \ + (pb_ptr)->max_dwl_len = (pb_ptr)->dwl_len; \ + if ( (pb_ptr)->dwl_size > (pb_ptr)->max_dwl_size ) \ + (pb_ptr)->max_dwl_size = (pb_ptr)->dwl_size; \ +} + +/* Record one delayed write: bump the delayed write count and add the supplied delay and insertion depth to their running totals. Asserts that the page buffer is flagged as a VFD SWMR writer. */ +#define H5PB__UPDATE_DWL_DELAYED_WRITES(pb_ptr, insertion_depth, delay) \ +{ \ + HDassert((pb_ptr)->vfd_swmr_writer); \ + (pb_ptr)->delayed_writes++; \ + (pb_ptr)->total_delay += (delay); \ + (pb_ptr)->total_dwl_ins_depth += (insertion_depth); \ +} + + +#define H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size) \ +{ \ + int i; \ + \ + HDassert(pb_ptr); \ + HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \ + \ + if ( H5FD_MEM_DRAW == (type) ) { \ + i = H5PB__STATS_RD; \ + } else if ( (size) > (pb_ptr)->page_size ) { \ + i = H5PB__STATS_MPMDE; \ + } else { \ + i = H5PB__STATS_MD; \ + } \ + ((pb_ptr)->accesses[i])++; \ +} /* H5PB__UPDATE_STATS_FOR_ACCESS */ + + +#define H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size) \ +{ \ + int i; \ + \ + HDassert(pb_ptr); \ + HDassert((pb_ptr)->magic ==
H5PB__H5PB_T_MAGIC); \ + \ + if ( H5FD_MEM_DRAW == (type) ) { \ + i = H5PB__STATS_RD; \ + } else if ( (size) > (pb_ptr)->page_size ) { \ + i = H5PB__STATS_MPMDE; \ + } else { \ + i = H5PB__STATS_MD; \ + } \ + ((pb_ptr)->bypasses[i])++; \ +} /* H5PB__UPDATE_STATS_FOR_BYPASS */ + + +#define H5PB__UPDATE_STATS_FOR_FLUSH(pb_ptr, entry_ptr) \ +{ \ + int i; \ + \ + HDassert(pb_ptr); \ + HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \ + HDassert(entry_ptr); \ + HDassert((entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC); \ + \ + if ( (entry_ptr)->is_metadata ) { \ + if ( (entry_ptr)->is_mpmde ) { \ + i = H5PB__STATS_MPMDE; \ + } else { \ + i = H5PB__STATS_MD; \ + } \ + } else { \ + i = H5PB__STATS_RD; \ + } \ + ((pb_ptr)->flushes[i])++; \ +} /* H5PB__UPDATE_STATS_FOR_FLUSH */ + + +#define H5PB__UPDATE_STATS_FOR_EVICTION(pb_ptr, entry_ptr) \ +{ \ + int i; \ + \ + HDassert(pb_ptr); \ + HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \ + HDassert(entry_ptr); \ + HDassert((entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC); \ + \ + if ( (entry_ptr)->is_metadata ) { \ + if ( (entry_ptr)->is_mpmde ) { \ + i = H5PB__STATS_MPMDE; \ + } else { \ + i = H5PB__STATS_MD; \ + } \ + } else { \ + i = H5PB__STATS_RD; \ + } \ + ((pb_ptr)->evictions[i])++; \ +} /* H5PB__UPDATE_STATS_FOR_EVICTION */ + + +#define H5PB__UPDATE_STATS_FOR_CLEAR(pb_ptr, entry_ptr) \ +{ \ + int i; \ + \ + HDassert(pb_ptr); \ + HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \ + HDassert(entry_ptr); \ + HDassert((entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC); \ + \ + if ( (entry_ptr)->is_metadata ) { \ + if ( (entry_ptr)->is_mpmde ) { \ + i = H5PB__STATS_MPMDE; \ + } else { \ + i = H5PB__STATS_MD; \ + } \ + } else { \ + i = H5PB__STATS_RD; \ + } \ + ((pb_ptr)->clears[i])++; \ +} /* H5PB__UPDATE_STATS_FOR_CLEAR */ + + +#define H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr) \ +{ \ + int i; \ + \ + HDassert(pb_ptr); \ + HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \ + HDassert(entry_ptr); \ + 
HDassert((entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC); \ + \ + if ( (entry_ptr)->is_metadata ) { \ + if ( (entry_ptr)->is_mpmde ) { \ + i = H5PB__STATS_MPMDE; \ + } else { \ + i = H5PB__STATS_MD; \ + } \ + } else { \ + i = H5PB__STATS_RD; \ + } \ + ((pb_ptr)->insertions[i])++; \ +} /* H5PB__UPDATE_STATS_FOR_INSERTION */ + +#define H5PB__UPDATE_STATS_FOR_LOAD(pb_ptr, entry_ptr) \ +{ \ + int i; \ + \ + HDassert(pb_ptr); \ + HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \ + HDassert(entry_ptr); \ + HDassert((entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC); \ + \ + if ( (entry_ptr)->is_metadata ) { \ + if ( (entry_ptr)->is_mpmde ) { \ + i = H5PB__STATS_MPMDE; \ + } else { \ + i = H5PB__STATS_MD; \ + } \ + } else { \ + i = H5PB__STATS_RD; \ + } \ + ((pb_ptr)->loads[i])++; \ +} /* H5PB__UPDATE_STATS_FOR_LOAD */ + +#else /* H5PB__COLLECT_PAGE_BUFFER_STATS */ + +#define H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, hit, is_metadata, is_mpmde) +#define H5PB__UPDATE_HT_SIZE_STATS(pb_ptr) +#define H5PB__UPDATE_STATS_FOR_HT_INSERTION(pb_ptr) +#define H5PB__UPDATE_STATS_FOR_HT_DELETION(pb_ptr) +#define H5PB__UPDATE_HT_SEARCH_STATS(pb_ptr, success, depth) +#define H5PB__UPDATE_LRU_SIZE_STATS(pb_ptr) +#define H5PB__UPDATE_STATS_FOR_LRU_MD_SKIP(pb_ptr) +#define H5PB__UPDATE_STATS_FOR_LRU_RD_SKIP(pb_ptr) +#define H5PB__UPDATE_STATS_FOR_LRU_TL_SKIP(pb_ptr) +#define H5PB__UPDATE_STATS_FOR_LRU_DWL_SKIP(pb_ptr) +#define H5PB__UPDATE_TL_SIZE_STATS(pb_ptr) +#define H5PB__UPDATE_DWL_SIZE_STATS(pb_ptr) +#define H5PB__UPDATE_DWL_DELAYED_WRITES(pb_ptr, insertion_depth, delay) +#define H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size) +#define H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size) +#define H5PB__UPDATE_STATS_FOR_FLUSH(pb_ptr, entry_ptr) +#define H5PB__UPDATE_STATS_FOR_EVICTION(pb_ptr, entry_ptr) +#define H5PB__UPDATE_STATS_FOR_CLEAR(pb_ptr, entry_ptr) +#define H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr) +#define H5PB__UPDATE_STATS_FOR_LOAD(pb_ptr, entry_ptr) + +#endif /* 
H5PB__COLLECT_PAGE_BUFFER_STATS */
+
+
+/***********************************************************************
+ *
+ * Hash table access and manipulation macros:
+ *
+ * The following macros handle searches, insertions, and deletion in
+ * the hash table.
+ *
+ * Note that the input to the hash function is the page of the page
+ * buffer entry, not its address (recall that (page * page_size) == addr).
+ *
+ * JRM -- 10/09/18
+ *
+ * Changes:
+ *
+ *   - None
+ *
+ ***********************************************************************/
+
+#define H5PB__HASH_MASK ((uint64_t)(H5PB__HASH_TABLE_LEN - 1))
+
+/* Map a page number to a hash bin index.  The whole expansion is
+ * parenthesized so that it can be embedded in any expression.
+ */
+#define H5PB__HASH_FCN(x) ((int)(((uint64_t)(x)) & H5PB__HASH_MASK))
+
+#if H5PB__DO_SANITY_CHECKS
+
+/* Sanity checks run before an entry is linked into the hash table: the
+ * entry must be valid and unlinked, and the page counts must be
+ * non-negative and mutually consistent.
+ */
+#define H5PB__PRE_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \
+if ( ( (pb_ptr) == NULL ) || \
+     ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+     ( (entry_ptr) == NULL ) || \
+     ( (entry_ptr)->magic != H5PB__H5PB_ENTRY_T_MAGIC ) || \
+     ( (entry_ptr)->ht_next != NULL ) || \
+     ( (entry_ptr)->ht_prev != NULL ) || \
+     ( (entry_ptr)->size < (pb_ptr)->page_size ) || \
+     ( H5PB__HASH_FCN((entry_ptr)->page) < 0 ) || \
+     ( H5PB__HASH_FCN((entry_ptr)->page) >= H5PB__HASH_TABLE_LEN ) || \
+     ( (pb_ptr)->index_len < 0 ) || \
+     ( (pb_ptr)->index_size < 0 ) || \
+     ( (pb_ptr)->curr_pages < 0 ) || \
+     ( (pb_ptr)->curr_rd_pages < 0 ) || \
+     ( (pb_ptr)->curr_md_pages < 0 ) || \
+     ( ((pb_ptr)->curr_pages != \
+        ((pb_ptr)->curr_md_pages + (pb_ptr)->curr_rd_pages)) ) || \
+     ( (pb_ptr)->mpmde_count < 0 ) || \
+     ( (pb_ptr)->index_len != \
+       ((pb_ptr)->curr_pages + (pb_ptr)->mpmde_count) ) ) { \
+    HDassert(FALSE); \
+    HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, "pre HT insert SC failed") \
+}
+
+/* Sanity checks run after an entry has been linked into the hash table. */
+#define H5PB__POST_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \
+if ( ( (pb_ptr) == NULL ) || \
+     ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+     ( (entry_ptr)->magic != H5PB__H5PB_ENTRY_T_MAGIC ) || \
+     ( (pb_ptr)->index_len < 1 ) || \
+     ( (pb_ptr)->index_len != \
+       ((pb_ptr)->curr_pages + (pb_ptr)->mpmde_count) ) || \
+     ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size) ) ) { \
+    HDassert(FALSE); \
+    HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, "post HT insert SC failed") \
+}
+
+/* Sanity checks run before an entry is unlinked from the hash table.  The
+ * entry must be the head of its bin exactly when it has no ht_prev.
+ *
+ * NOTE(review): this macro and H5PB__POST_HT_REMOVE_SC take no fail_val
+ * and always fail with FAIL, unlike the other checks -- confirm intent.
+ */
+#define H5PB__PRE_HT_REMOVE_SC(pb_ptr, entry_ptr) \
+if ( ( (pb_ptr) == NULL ) || \
+     ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+     ( (pb_ptr)->index_len < 1 ) || \
+     ( (entry_ptr) == NULL ) || \
+     ( (entry_ptr)->magic != H5PB__H5PB_ENTRY_T_MAGIC ) || \
+     ( (entry_ptr)->size < (pb_ptr)->page_size ) || \
+     ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size) ) || \
+     ( ((pb_ptr)->ht)[(H5PB__HASH_FCN((entry_ptr)->page))] \
+       == NULL ) || \
+     ( ( ((pb_ptr)->ht)[(H5PB__HASH_FCN((entry_ptr)->page))] \
+         != (entry_ptr) ) && \
+       ( (entry_ptr)->ht_prev == NULL ) ) || \
+     ( ( ((pb_ptr)->ht)[(H5PB__HASH_FCN((entry_ptr)->page))] == \
+         (entry_ptr) ) && \
+       ( (entry_ptr)->ht_prev != NULL ) ) ) { \
+    HDassert(FALSE); \
+    HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "pre HT remove SC failed") \
+}
+
+/* Sanity checks run after an entry has been unlinked: both of its hash
+ * chain pointers must be NULL and the page counts must be consistent.
+ */
+#define H5PB__POST_HT_REMOVE_SC(pb_ptr, entry_ptr) \
+if ( ( (pb_ptr) == NULL ) || \
+     ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+     ( (entry_ptr) == NULL ) || \
+     ( (entry_ptr)->magic != H5PB__H5PB_ENTRY_T_MAGIC ) || \
+     ( (entry_ptr)->size < (pb_ptr)->page_size ) || \
+     ( (entry_ptr)->ht_next != NULL ) || \
+     ( (entry_ptr)->ht_prev != NULL ) || \
+     ( (pb_ptr)->index_len < 0 ) || \
+     ( (pb_ptr)->index_size < 0 ) || \
+     ( (pb_ptr)->curr_pages < 0 ) || \
+     ( (pb_ptr)->curr_rd_pages < 0 ) || \
+     ( (pb_ptr)->curr_md_pages < 0 ) || \
+     ( ((pb_ptr)->curr_pages != \
+        ((pb_ptr)->curr_md_pages + (pb_ptr)->curr_rd_pages)) ) || \
+     ( (pb_ptr)->mpmde_count < 0 ) || \
+     ( (pb_ptr)->index_len != \
+       ((pb_ptr)->curr_pages + (pb_ptr)->mpmde_count) ) ) { \
+    HDassert(FALSE); \
+    HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "post HT remove SC failed") \
+}
+
+/* Sanity checks run before a hash table search. */
+#define H5PB__PRE_HT_SEARCH_SC(pb_ptr, page, fail_val) \
+if ( ( (pb_ptr) == NULL ) || \
+     ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+     ( H5PB__HASH_FCN(page) < 0 ) || \
+     ( H5PB__HASH_FCN(page) >= H5PB__HASH_TABLE_LEN ) ) { \
+    HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, "pre HT search SC failed") \
+}
+
+/* Sanity checks run after a successful search, before the entry found in
+ * bin k is moved to the front of its chain.
+ */
+#define H5PB__POST_SUC_HT_SEARCH_SC(pb_ptr, entry_ptr, k, fail_val) \
+if ( ( (pb_ptr) == NULL ) || \
+     ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+     ( (pb_ptr)->index_len < 1 ) || \
+     ( (entry_ptr) == NULL ) || \
+     ( (entry_ptr)->magic != H5PB__H5PB_ENTRY_T_MAGIC ) || \
+     ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size) ) || \
+     ( (entry_ptr)->size < (pb_ptr)->page_size ) || \
+     ( ( (k) < 0 ) || ( (k) >= H5PB__HASH_TABLE_LEN ) ) || \
+     ( ((pb_ptr)->ht)[(k)] == NULL ) || \
+     ( ( ((pb_ptr)->ht)[(k)] != (entry_ptr) ) && \
+       ( (entry_ptr)->ht_prev == NULL ) ) || \
+     ( ( ((pb_ptr)->ht)[(k)] == (entry_ptr) ) && \
+       ( (entry_ptr)->ht_prev != NULL ) ) || \
+     ( ( (entry_ptr)->ht_prev != NULL ) && \
+       ( (entry_ptr)->ht_prev->ht_next != (entry_ptr) ) ) || \
+     ( ( (entry_ptr)->ht_next != NULL ) && \
+       ( (entry_ptr)->ht_next->ht_prev != (entry_ptr) ) ) ) { \
+    HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, \
+                "post successful HT search SC failed") \
+}
+
+/* Sanity checks run after the entry has been moved to the front of bin k. */
+#define H5PB__POST_HT_SHIFT_TO_FRONT_SC(pb_ptr, entry_ptr, k, fail_val) \
+if ( ( (pb_ptr) == NULL ) || \
+     ( ((pb_ptr)->ht)[(k)] != (entry_ptr) ) || \
+     ( (entry_ptr)->ht_prev != NULL ) ) { \
+    HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, \
+                "post HT shift to front SC failed") \
+}
+
+#else /* H5PB__DO_SANITY_CHECKS */
+
+#define H5PB__PRE_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val)
+#define H5PB__POST_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val)
+#define H5PB__PRE_HT_REMOVE_SC(pb_ptr, entry_ptr)
+#define H5PB__POST_HT_REMOVE_SC(pb_ptr, entry_ptr)
+#define H5PB__PRE_HT_SEARCH_SC(pb_ptr, page, fail_val)
+#define H5PB__POST_SUC_HT_SEARCH_SC(pb_ptr, entry_ptr, k, fail_val)
+#define H5PB__POST_HT_SHIFT_TO_FRONT_SC(pb_ptr, entry_ptr, k, fail_val)
+
+#endif /* H5PB__DO_SANITY_CHECKS */
+
+/* Link entry_ptr at the head of its hash bin, then update the index
+ * length and size and the per-category page counts.
+ */
+#define H5PB__INSERT_IN_INDEX(pb_ptr, entry_ptr, fail_val) \
+{ \
+    int k; \
+    H5PB__PRE_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \
+    k = H5PB__HASH_FCN((entry_ptr)->page); \
+    if(((pb_ptr)->ht)[k] != NULL) { \
+        (entry_ptr)->ht_next = ((pb_ptr)->ht)[k]; \
+        (entry_ptr)->ht_next->ht_prev = (entry_ptr); \
+    } \
+    ((pb_ptr)->ht)[k] = (entry_ptr); \
+    (pb_ptr)->index_len++; \
+    (pb_ptr)->index_size += (int64_t)((entry_ptr)->size); \
+    if ( (entry_ptr)->is_metadata ) { \
+        if ( (entry_ptr)->is_mpmde ) { \
+            ((pb_ptr)->mpmde_count)++; \
+        } else { \
+            ((pb_ptr)->curr_md_pages)++; \
+            (pb_ptr)->curr_pages++; \
+        } \
+    } else { \
+        ((pb_ptr)->curr_rd_pages)++; \
+        (pb_ptr)->curr_pages++; \
+    } \
+    H5PB__UPDATE_STATS_FOR_HT_INSERTION(pb_ptr) \
+    H5PB__UPDATE_HT_SIZE_STATS(pb_ptr) \
+    H5PB__POST_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \
+}
+
+/* Unlink entry_ptr from its hash bin, clear its chain pointers, then
+ * update the index length and size and the per-category page counts.
+ * fail_val is accepted but not used by the current expansion.
+ */
+#define H5PB__DELETE_FROM_INDEX(pb_ptr, entry_ptr, fail_val) \
+{ \
+    int k; \
+    H5PB__PRE_HT_REMOVE_SC(pb_ptr, entry_ptr) \
+    k = H5PB__HASH_FCN((entry_ptr)->page); \
+    if((entry_ptr)->ht_next) \
+        (entry_ptr)->ht_next->ht_prev = (entry_ptr)->ht_prev; \
+    if((entry_ptr)->ht_prev) \
+        (entry_ptr)->ht_prev->ht_next = (entry_ptr)->ht_next; \
+    if(((pb_ptr)->ht)[k] == (entry_ptr)) \
+        ((pb_ptr)->ht)[k] = (entry_ptr)->ht_next; \
+    (entry_ptr)->ht_next = NULL; \
+    (entry_ptr)->ht_prev = NULL; \
+    (pb_ptr)->index_len--; \
+    (pb_ptr)->index_size -= (int64_t)((entry_ptr)->size); \
+    if ( (entry_ptr)->is_metadata ) { \
+        if ( (entry_ptr)->is_mpmde ) { \
+            ((pb_ptr)->mpmde_count)--; \
+        } else { \
+            ((pb_ptr)->curr_md_pages)--; \
+            (pb_ptr)->curr_pages--; \
+        } \
+    } else { \
+        ((pb_ptr)->curr_rd_pages)--; \
+        (pb_ptr)->curr_pages--; \
+    } \
+    H5PB__UPDATE_STATS_FOR_HT_DELETION(pb_ptr) \
+    H5PB__POST_HT_REMOVE_SC(pb_ptr, entry_ptr) \
+}
+
+/* Look up page pg.  On return entry_ptr (which must be an lvalue) points
+ * to the matching entry, or is NULL if there is none.  A match that is
+ * not already first in its bin is moved to the front of the chain.
+ */
+#define H5PB__SEARCH_INDEX(pb_ptr, pg, entry_ptr, f_val) \
+{ \
+    int k; \
+    int depth = 0; \
+    H5PB__PRE_HT_SEARCH_SC((pb_ptr), (pg), (f_val)) \
+    k = H5PB__HASH_FCN((pg)); \
+    entry_ptr = ((pb_ptr)->ht)[k]; \
+    while ( entry_ptr ) { \
+        if ( (pg) == (entry_ptr)->page ) { \
+            H5PB__POST_SUC_HT_SEARCH_SC(pb_ptr, entry_ptr, k, f_val) \
+            if ( entry_ptr != ((pb_ptr)->ht)[k] ) { \
+                if ( (entry_ptr)->ht_next ) \
+                    (entry_ptr)->ht_next->ht_prev = (entry_ptr)->ht_prev; \
+                HDassert((entry_ptr)->ht_prev != NULL); \
+                (entry_ptr)->ht_prev->ht_next = (entry_ptr)->ht_next; \
+                ((pb_ptr)->ht)[k]->ht_prev = (entry_ptr); \
+                (entry_ptr)->ht_next = ((pb_ptr)->ht)[k]; \
+                (entry_ptr)->ht_prev = NULL; \
+                ((pb_ptr)->ht)[k] = (entry_ptr); \
+                H5PB__POST_HT_SHIFT_TO_FRONT_SC(pb_ptr, entry_ptr, k, f_val) \
+            } \
+            break; \
+        } \
+        (entry_ptr) = (entry_ptr)->ht_next; \
+        (depth)++; \
+    } \
+    H5PB__UPDATE_STATS_FOR_HT_SEARCH(pb_ptr, (entry_ptr != NULL), depth) \
+}
+
+
+/***********************************************************************
+ *
+ * Replacement policy update macros
+ *
+ * The following macros handle updates to the replacement policy for
+ * insertions, flushes, and evictions.
+ *
+ * At present, the only replacement policy is a modified LRU policy.
+ *
+ * JRM -- 10/09/18
+ *
+ ***********************************************************************/
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5PB__UPDATE_RP_FOR_EVICTION
+ *
+ * Purpose:     Update the replacement policy data structures for an
+ *              eviction of the specified page buffer entry.
+ *
+ *              At present, we only support the modified LRU policy, so
+ *              this function deals with that case unconditionally.  If
+ *              we ever support other replacement policies, the function
+ *              should switch on the current policy and act accordingly.
+ *
+ * Return:      Non-negative on success/Negative on failure.
+ *
+ * Programmer:  John Mainzer, 10/09/18
+ *
+ * Modifications:
+ *
+ *              None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* Expands to a braced statement block, not an expression, so it yields no
+ * value; fail_val is simply forwarded to H5PB__DLL_REMOVE.
+ */
+#define H5PB__UPDATE_RP_FOR_EVICTION(pb_ptr, entry_ptr, fail_val) \
+{ \
+    HDassert( (pb_ptr) ); \
+    HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+    HDassert( (entry_ptr) ); \
+    HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+    HDassert( !((entry_ptr)->is_dirty) ); \
+    HDassert( (entry_ptr)->size >= (pb_ptr)->page_size ); \
+ \
+    /* modified LRU specific code */ \
+ \
+    /* remove the entry from the LRU list. */ \
+ \
+    H5PB__DLL_REMOVE((entry_ptr), (pb_ptr)->LRU_head_ptr, \
+                     (pb_ptr)->LRU_tail_ptr, (pb_ptr)->LRU_len, \
+                     (pb_ptr)->LRU_size, (fail_val)) \
+ \
+    /* End modified LRU specific code. */ \
+ \
+} /* H5PB__UPDATE_RP_FOR_EVICTION */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5PB__UPDATE_RP_FOR_ACCESS
+ *
+ * Purpose:     Update the replacement policy data structures for an
+ *              access of the specified page buffer entry.
+ *
+ *              At present, we only support the modified LRU policy, so
+ *              this function deals with that case unconditionally.  If
+ *              we ever support other replacement policies, the function
+ *              should switch on the current policy and act accordingly.
+ *
+ * Return:      N/A
+ *
+ * Programmer:  John Mainzer, 10/09/18
+ *
+ * Modifications:
+ *
+ *              None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, fail_val) \
+{ \
+    HDassert( (pb_ptr) ); \
+    HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+    HDassert( (entry_ptr) ); \
+    HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+    HDassert( (entry_ptr)->size >= (pb_ptr)->page_size ); \
+ \
+    /* modified LRU specific code */ \
+ \
+    /* Move entry to the head of the LRU: unlink it, then prepend it */ \
+ \
+    H5PB__DLL_REMOVE((entry_ptr), (pb_ptr)->LRU_head_ptr, \
+                     (pb_ptr)->LRU_tail_ptr, (pb_ptr)->LRU_len, \
+                     (pb_ptr)->LRU_size, (fail_val)) \
+ \
+    H5PB__DLL_PREPEND((entry_ptr), (pb_ptr)->LRU_head_ptr, \
+                      (pb_ptr)->LRU_tail_ptr, (pb_ptr)->LRU_len, \
+                      (pb_ptr)->LRU_size, (fail_val)) \
+ \
+    /* End modified LRU specific code. */ \
+ \
+} /* H5PB__UPDATE_RP_FOR_ACCESS */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5PB__UPDATE_RP_FOR_FLUSH
+ *
+ * Purpose:     Update the replacement policy data structures for a flush
+ *              of the specified page buffer entry.
+ *
+ *              At present, we only support the modified LRU policy, so
+ *              this function deals with that case unconditionally.  If
+ *              we ever support other replacement policies, the function
+ *              should switch on the current policy and act accordingly.
+ *
+ * Return:      N/A
+ *
+ * Programmer:  John Mainzer, 10/09/18
+ *
+ * Modifications:
+ *
+ *              None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* A flush is handled exactly like an access. */
+#define H5PB__UPDATE_RP_FOR_FLUSH(pb_ptr, entry_ptr, fail_val) \
+{ \
+    H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, fail_val) \
+ \
+} /* H5PB__UPDATE_RP_FOR_FLUSH */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5PB__UPDATE_RP_FOR_INSERT_APPEND
+ *
+ * Purpose:     Update the replacement policy data structures for an
+ *              insertion of the specified cache entry.
+ *
+ *              Unlike H5PB__UPDATE_RP_FOR_INSERTION below, mark the
+ *              new entry as the LEAST recently used entry, not the
+ *              most recently used.
+ *
+ *              At present, we only support the modified LRU policy, so
+ *              this function deals with that case unconditionally.  If
+ *              we ever support other replacement policies, the function
+ *              should switch on the current policy and act accordingly.
+ *
+ * Return:      N/A
+ *
+ * Programmer:  John Mainzer, 10/10/18
+ *
+ * Modifications:
+ *
+ *              None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__UPDATE_RP_FOR_INSERT_APPEND(pb_ptr, entry_ptr, fail_val) \
+{ \
+    HDassert( (pb_ptr) ); \
+    HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+    HDassert( (entry_ptr) ); \
+    HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+    HDassert( (entry_ptr)->size >= (pb_ptr)->page_size ); \
+ \
+    /* modified LRU specific code */ \
+ \
+    /* insert the entry at the tail of the LRU list. */ \
+ \
+    H5PB__DLL_APPEND((entry_ptr), (pb_ptr)->LRU_head_ptr, \
+                     (pb_ptr)->LRU_tail_ptr, (pb_ptr)->LRU_len, \
+                     (pb_ptr)->LRU_size, (fail_val)) \
+ \
+    H5PB__UPDATE_LRU_SIZE_STATS(pb_ptr) \
+ \
+    /* End modified LRU specific code. */ \
+}
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5PB__UPDATE_RP_FOR_INSERTION
+ *
+ * Purpose:     Update the replacement policy data structures for an
+ *              insertion of the specified cache entry.
+ *
+ *              At present, we only support the modified LRU policy, so
+ *              this function deals with that case unconditionally.  If
+ *              we ever support other replacement policies, the function
+ *              should switch on the current policy and act accordingly.
+ *
+ * Return:      N/A
+ *
+ * Programmer:  John Mainzer, 10/10/18
+ *
+ * Modifications:
+ *
+ *              None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__UPDATE_RP_FOR_INSERTION(pb_ptr, entry_ptr, fail_val) \
+{ \
+    HDassert( (pb_ptr) ); \
+    HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+    HDassert( (entry_ptr) ); \
+    HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+    HDassert( (entry_ptr)->size >= (pb_ptr)->page_size ); \
+ \
+    /* modified LRU specific code */ \
+ \
+    /* insert the entry at the head of the LRU list. */ \
+ \
+    H5PB__DLL_PREPEND((entry_ptr), (pb_ptr)->LRU_head_ptr, \
+                      (pb_ptr)->LRU_tail_ptr, (pb_ptr)->LRU_len, \
+                      (pb_ptr)->LRU_size, (fail_val)) \
+ \
+    H5PB__UPDATE_LRU_SIZE_STATS(pb_ptr) \
+ \
+    /* End modified LRU specific code. */ \
+}
+
+
+/***********************************************************************
+ *
+ * Tick list management macros
+ *
+ * When the target file is opened in VFD SWMR writer mode, the page
+ * buffer must retain copies of all metadata writes during each tick so
+ * that the metadata file can be updated correctly in end of tick
+ * processing.
+ *
+ * Once tick processing is complete, all entries are removed from the
+ * tick list, to leave it empty for the next tick.  Metadata pages from
+ * the tick list are already in the replacement policy, and thus require
+ * no further action.
+ *
+ * Multi-page metadata entries are evicted from the page buffer if they
+ * are not subject to delayed write, or left in the delayed write list
+ * for later flush and eviction if they are.
+ *
+ * The macros required to support this are defined below.
+ *
+ * JRM -- 10/09/18
+ *
+ ***********************************************************************/
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5PB__INSERT_IN_TL
+ *
+ * Purpose:     Insert the specified page buffer entry at the head of the
+ *              tick list.
+ *
+ * Return:      N/A
+ *
+ * Programmer:  John Mainzer, 10/10/18
+ *
+ * Modifications:
+ *
+ *              None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__INSERT_IN_TL(pb_ptr, entry_ptr, fail_val) \
+{ \
+    HDassert( (pb_ptr) ); \
+    HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+    HDassert( (pb_ptr)->vfd_swmr_writer ); \
+    HDassert( (entry_ptr) ); \
+    HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+    HDassert( (entry_ptr)->modified_this_tick ); \
+    HDassert( (entry_ptr)->size >= (pb_ptr)->page_size ); \
+ \
+    /* insert the entry at the head of the tick list. */ \
+ \
+    H5PB__TL_DLL_PREPEND((entry_ptr), (pb_ptr)->tl_head_ptr, \
+                         (pb_ptr)->tl_tail_ptr, (pb_ptr)->tl_len, \
+                         (pb_ptr)->tl_size, (fail_val)) \
+ \
+    H5PB__UPDATE_TL_SIZE_STATS(pb_ptr) \
+ \
+} /* H5PB__INSERT_IN_TL */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5PB__REMOVE_FROM_TL
+ *
+ * Purpose:     Remove the specified page buffer entry from the tick list.
+ *
+ * Return:      N/A
+ *
+ * Programmer:  John Mainzer, 10/10/18
+ *
+ * Modifications:
+ *
+ *              None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__REMOVE_FROM_TL(pb_ptr, entry_ptr, fail_val) \
+{ \
+    HDassert( (pb_ptr) ); \
+    HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+    HDassert( (pb_ptr)->vfd_swmr_writer ); \
+    HDassert( (entry_ptr) ); \
+    HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+    HDassert( (entry_ptr)->modified_this_tick ); \
+    HDassert( (entry_ptr)->size >= (pb_ptr)->page_size ); \
+ \
+    /* remove the entry from the tick list. */ \
+ \
+    H5PB__TL_DLL_REMOVE((entry_ptr), (pb_ptr)->tl_head_ptr, \
+                        (pb_ptr)->tl_tail_ptr, (pb_ptr)->tl_len, \
+                        (pb_ptr)->tl_size, (fail_val)) \
+ \
+ \
+} /* H5PB__REMOVE_FROM_TL */
+
+
+/***********************************************************************
+ *
+ * Delayed write list management macros
+ *
+ * When the target file is opened in VFD SWMR writer mode, the page
+ * buffer must delay flush of all metadata pages and multi-page metadata
+ * entries that:
+ *
+ * 1) have not appeared in the metadata file index for at least max_lag
+ *    ticks, and
+ *
+ * 2) a previous version of the metadata page or multi-page metadata
+ *    cache entry exists in the file.
+ *
+ * Failure to do so can cause VFD SWMR readers to receive messages
+ * from the future.
+ *
+ * To minimize overhead, the delayed write list is sorted in decreasing
+ * values of the constituent delay_write_until fields.
+ *
+ * Entries are removed from the delayed write list when their
+ * delay_write_until fields are satisfied.  Metadata pages are inserted
+ * at the bottom of the replacement policy, and multi-page metadata
+ * entries are immediately flushed and evicted.
+ *
+ * The macros required to support this are defined below.
+ *
+ * JRM -- 10/09/18
+ *
+ ***********************************************************************/
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5PB__INSERT_IN_DWL
+ *
+ *              Insert the supplied page buffer entry in the delayed write
+ *              list maintaining the invariant:
+ *
+ *                entry_ptr->next == NULL ||
+ *                entry_ptr->delay_write_until >=
+ *                    entry_ptr->next->delay_write_until
+ *
+ *              In passing update pb_ptr->max_delay if appropriate.
+ *
+ * Return:      N/A
+ *
+ * Programmer:  John Mainzer, 10/10/18
+ *
+ * Modifications:
+ *
+ *              None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__INSERT_IN_DWL(pb_ptr, entry_ptr, fail_val) \
+{ \
+    int insertion_depth = 0; \
+    uint64_t delay; \
+    H5PB_entry_t * suc_ptr; \
+ \
+    HDassert( (pb_ptr) ); \
+    HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+    HDassert( (pb_ptr)->vfd_swmr_writer ); \
+    HDassert( (entry_ptr) ); \
+    HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+    HDassert( (entry_ptr)->size >= (pb_ptr)->page_size ); \
+    HDassert( (entry_ptr)->delay_write_until > (pb_ptr)->cur_tick ); \
+ \
+    delay = (entry_ptr)->delay_write_until - (pb_ptr)->cur_tick; \
+    suc_ptr = (pb_ptr)->dwl_head_ptr; \
+ \
+    /* walk past every entry that must be written later than this one, */ \
+    /* counting how deep in the list the insertion point lies.         */ \
+    while ( (suc_ptr) && \
+            ((suc_ptr)->delay_write_until > (entry_ptr)->delay_write_until) ) \
+    { \
+        insertion_depth++; \
+        suc_ptr = suc_ptr->next; \
+    } \
+ \
+    H5PB__DLL_INSERT_BEFORE((entry_ptr), (suc_ptr), (pb_ptr)->dwl_head_ptr, \
+                            (pb_ptr)->dwl_tail_ptr, (pb_ptr)->dwl_len, \
+                            (pb_ptr)->dwl_size, (fail_val)) \
+ \
+    /* NOTE(review): max_delay is compared with the absolute tick in */ \
+    /* delay_write_until, not with the computed delay -- confirm.    */ \
+    if ( (entry_ptr)->delay_write_until > (pb_ptr)->max_delay ) \
+        (pb_ptr)->max_delay = (entry_ptr)->delay_write_until; \
+ \
+    H5PB__UPDATE_DWL_SIZE_STATS(pb_ptr) \
+    H5PB__UPDATE_DWL_DELAYED_WRITES(pb_ptr, insertion_depth, delay) \
+ \
+} /* H5PB__INSERT_IN_DWL */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro:       H5PB__REMOVE_FROM_DWL
+ *
+ * Purpose:     Remove the specified page buffer entry from the delayed
+ *              write list.
+ *
+ * Return:      N/A
+ *
+ * Programmer:  John Mainzer, 10/10/18
+ *
+ * Modifications:
+ *
+ *              None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__REMOVE_FROM_DWL(pb_ptr, entry_ptr, fail_val) \
+{ \
+    HDassert( (pb_ptr) ); \
+    HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+    HDassert( (pb_ptr)->vfd_swmr_writer ); \
+    HDassert( (entry_ptr) ); \
+    HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+    HDassert( (entry_ptr)->size >= (pb_ptr)->page_size ); \
+    /* NOTE(review): delay_write_until is documented as the first tick */ \
+    /* in which the write is allowed, so "<=" may be intended here.    */ \
+    HDassert( (entry_ptr)->delay_write_until < (pb_ptr)->cur_tick ); \
+ \
+    /* remove the entry from the delayed write list.  This list is   */ \
+    /* linked through next / prev (see H5PB__INSERT_IN_DWL), so the  */ \
+    /* plain DLL macro is used rather than the tick list one.        */ \
+ \
+    H5PB__DLL_REMOVE((entry_ptr), (pb_ptr)->dwl_head_ptr, \
+                     (pb_ptr)->dwl_tail_ptr, (pb_ptr)->dwl_len, \
+                     (pb_ptr)->dwl_size, (fail_val)) \
+ \
+ \
+} /* H5PB__REMOVE_FROM_DWL */
+
+
 /****************************/
 /* Package Private Typedefs */
 /****************************/
-typedef struct H5PB_entry_t {
-    void *page_buf_ptr;      /* Pointer to the buffer containing the data */
-    haddr_t addr;            /* Address of the page in the file */
-    H5F_mem_page_t type;     /* Type of the page entry (H5F_MEM_PAGE_RAW/META) */
-    hbool_t is_dirty;        /* Flag indicating whether the page has dirty data or not */
+/****************************************************************************
+ *
+ * structure H5PB_entry_t
+ *
+ * Individual instances of the H5PB_entry_t structure are used to manage
+ * individual pages in the page buffer.  In the case of a VFD SWMR writer,
+ * they are also used to manage multi-page metadata entries.
+ *
+ * The fields of this structure are discussed below:
+ *
+ *                                              JRM - 9/27/18
+ *
+ * magic:       Unsigned 32 bit integer that must always be set to
+ *              H5PB__H5PB_ENTRY_T_MAGIC when the entry is valid.
+ *
+ * pb_ptr:      Pointer to the page buffer that contains this entry.
+ *
+ * addr:        Base address of the page in the file.
+ *
+ * page:        Page offset of the page -- i.e. addr / pb_ptr->page_size.
+ *              Note that addr must always equal page * pb_ptr->page_size.
+ *
+ * size:        Size of the page buffer entry in bytes.
Under normal
+ *              circumstances, this will always be equal to
+ *              pb_ptr->page_size.
+ *              However, in the context of a VFD SWMR writer, the page
+ *              buffer may be used to store multi-page metadata entries
+ *              until the end of tick, or to delay writes of such entries
+ *              for up to max_lag ticks.
+ *
+ *              In such cases, size must be greater than pb_ptr->page_size.
+ *
+ * image_ptr:   Pointer to void.  When not NULL, this field points to a
+ *              dynamically allocated block of size bytes in which the
+ *              on disk image of the page is stored.  In the context of
+ *              VFD SWMR, it points to the image of the multi-page
+ *              metadata entry.
+ *
+ * mem_type:    Type (H5FD_mem_t) of the page buffer entry.  This value
+ *              is needed when reading or writing the entry from/to file.
+ *
+ * is_metadata: Boolean flag that is set to TRUE iff the associated
+ *              entry is a page of metadata (or, in the context of VFD
+ *              SWMR, a multi-page metadata entry).
+ *
+ * is_dirty:    Boolean flag indicating whether the contents of the page
+ *              buffer entry have been modified since the last time it
+ *              was written to disk.
+ *
+ *
+ * Fields supporting the hash table:
+ *
+ * Entries in the page buffer are indexed by a more or less conventional
+ * hash table with chaining (see header comment on H5PB_t for further
+ * details).  If there are multiple entries in any hash bin, they are
+ * stored in a doubly linked list.
+ *
+ * ht_next:     Next pointer used by the hash table to store multiple
+ *              entries in a single hash bin.  This field points to the
+ *              next entry in the doubly linked list of entries in the
+ *              hash bin, or NULL if there is no next entry.
+ *
+ * ht_prev:     Prev pointer used by the hash table to store multiple
+ *              entries in a single hash bin.  This field points to the
+ *              previous entry in the doubly linked list of entries in
+ *              the hash bin, or NULL if there is no previous entry.
+ *
+ *
+ * Fields supporting replacement policies:
+ *
+ * The page buffer must have a replacement policy, and it will usually be
+ * necessary for this structure to contain fields supporting that policy.
+ *
+ * At present, only a modified LRU replacement policy is contemplated,
+ * (see header comment for H5PB_t for details), for which the following
+ * fields are adequate.
+ *
+ * next:        Next pointer in either the LRU, or (in the context of
+ *              VFD SWMR) the delayed write list.  If there is no next entry
+ *              on the list, this field should be set to NULL.
+ *
+ * prev:        Prev pointer in either the LRU, or (in the context of
+ *              VFD SWMR) the delayed write list.  If there is no previous
+ *              entry on the list, this field should be set to NULL.
+ *
+ * Fields supporting VFD SWMR:
+ *
+ * is_mpmde:    Boolean flag that is set to TRUE iff the entry
+ *              is a multi-page metadata entry.  In the absence of VFD
+ *              SWMR, the field should always be set to FALSE.
+ *
+ *              Observe that:
+ *
+ *                 is_mpmde <==> is_metadata && size > pb_ptr->page_size
+ *
+ * loaded:      Boolean flag that is set to TRUE iff the entry was loaded
+ *              from file.  This is a necessary input in determining
+ *              whether the write of the entry must be delayed.
+ *
+ *              This field is only maintained in the VFD SWMR case
+ *              and should be false otherwise.
+ *
+ * modified_this_tick:  This field is set to TRUE iff
+ *              pb_ptr->vfd_swmr_writer is TRUE and the entry has been
+ *              modified in the current tick.  If modified_this_tick is
+ *              TRUE, the entry must also be in the tick list.
+ *
+ * delay_write_until: Unsigned 64 bit integer containing the first tick
+ *              in which the entry may be written to file, or 0 if there
+ *              is no such constraint.  It should be set to 0 when VFD
+ *              SWMR is not enabled.
+ *
+ * tl_next:     Next pointer on the list of entries modified in the current
+ *              tick.  If the entry is not on the tick list, or if there is
+ *              no next entry on the list, this field should be set to NULL.
+ *
+ * tl_prev:     Prev pointer on the list of entries modified in the current
+ *              tick.  If the entry is not on the tick list, or if there is
+ *              no previous entry on the list, this field should be set to
+ *              NULL.
+ *
+ ****************************************************************************/
+
-    /* Fields supporting replacement policies */
-    struct H5PB_entry_t *next; /* next pointer in the LRU list */
-    struct H5PB_entry_t *prev; /* previous pointer in the LRU list */
-} H5PB_entry_t;
+/* Value the magic field holds while an H5PB_entry_t instance is valid. */
+#define H5PB__H5PB_ENTRY_T_MAGIC               0x02030405
+struct H5PB_entry_t {
-/*****************************/
-/* Package Private Variables */
-/*****************************/
+    uint32_t                    magic;
+    H5PB_t                     *pb_ptr;
+    haddr_t                     addr;
+    uint64_t                    page;
+    size_t                      size;
+    void                       *image_ptr;
+    H5FD_mem_t                  mem_type;
+    hbool_t                     is_metadata;
+    hbool_t                     is_dirty;
+    /* fields supporting the hash table: */
+    struct H5PB_entry_t        *ht_next;
+    struct H5PB_entry_t        *ht_prev;
-/******************************/
-/* Package Private Prototypes */
-/******************************/
+    /* fields supporting replacement policies: */
+    struct H5PB_entry_t        *next;
+    struct H5PB_entry_t        *prev;
+    /* fields supporting VFD SWMR */
+    hbool_t                     is_mpmde;
+    hbool_t                     loaded;
+    hbool_t                     modified_this_tick;
+    uint64_t                    delay_write_until;
+    struct H5PB_entry_t        *tl_next;
+    struct H5PB_entry_t        *tl_prev;
+
+}; /* H5PB_entry_t */
 #endif /* _H5PBpkg_H */
diff --git a/src/H5PBprivate.h b/src/H5PBprivate.h
index b94b845..2c1f3cb 100644
--- a/src/H5PBprivate.h
+++ b/src/H5PBprivate.h
@@ -11,68 +11,530 @@
  * help@hdfgroup.org.
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -/*------------------------------------------------------------------------- +/* + * File: H5PBprivate.h * - * Created: H5PBprivate.h - * June 2014 - * Mohamad Chaarawi + * Purpose: This file contains declarations which are normally visible + * within the HDF5 library, but are not visible at the user + * level * - *------------------------------------------------------------------------- + * Programmer: John Mainzer -- 10/07/18 */ #ifndef _H5PBprivate_H #define _H5PBprivate_H /* Include package's public header */ -#ifdef NOT_YET -#include "H5PBpublic.h" -#endif /* NOT_YET */ + +/* no H5PBpublic.h at present */ + /* Private headers needed by this header */ #include "H5private.h" /* Generic Functions */ -#include "H5Fprivate.h" /* File access */ -#include "H5FLprivate.h" /* Free Lists */ -#include "H5SLprivate.h" /* Skip List */ /**************************/ /* Library Private Macros */ /**************************/ +#define H5PB__HASH_TABLE_LEN 4096 /* must be a power of 2 */ + /****************************/ /* Library Private Typedefs */ /****************************/ -/* Forward declaration for a page buffer entry */ -struct H5PB_entry_t; +/* Typedef for the page buffer entry structure (defined in H5PBpkg.h) */ +typedef struct H5PB_entry_t H5PB_entry_t; + + + +/****************************************************************************** + * + * structure H5PB_t + * + * Catchall structure for all variables specific to an instance of the page + * buffer. + * + * At present, the page buffer serves two purposes in the HDF5 library. + * + * Under normal operating conditions, it serves as a normal page buffer whose + * purpose is to minimize and optimize file I/O by aggregating small metadata + * and raw data writes into pages, and by caching frequently used pages. 
+ * + * In addition, when a file is opened for VFD SWMR writing, the page buffer is + * used to retain copies of all metadata pages and multi-page metadata entries + * that are written in a given tick, and under certain cases, to delay metadata + * page and/or multi-page metadata entry writes for some number of ticks. + * If the entry has not appeared in the VFD SWMR index for at least max_lag + * ticks, this is necessary to avoid message from the future bugs. See the + * VFD SWMR RFC for further details. + * + * To reflect this, the fields of this structure are divided into three + * sections. Specifically fields needed for general operations, fields needed + * for VFD SWMR, and statistics. + * + * FIELDS FOR GENERAL OPERATIONS: + * + * magic: Unsigned 32 bit integer that must always be set to + * H5PB__H5PB_T_MAGIC. This field is used to validate pointers to + * instances of H5PB_t. + * + * page_size: size_t containing the page buffer page size in bytes. + * + * max_pages: 64 bit integer containing the nominal maximum number + * of pages in the page buffer. Note that on creation, the page + * buffer is empty, and that under certain circumstances (mostly + * related to VFD SWMR) this limit can be exceeded by large + * amounts. + * + * curr_pages: 64 bit integer containing the current number of pages + * in the page buffer. curr_pages must always equal the sum of + * curr_md_pages + curr_rd_pages. + * + * Note that in the context of VFD SWMR, this count does NOT + * include multi-page metadata entries. + * + * curr_md_pages: 64 bit integer containing the current number of + * metadata pages in the page buffer. + * + * Note that in the context of VFD SWMR, this count does NOT + * include multi-page metadata entries. + * + * curr_rd_pages: 64 bit integer containing the current number of + * raw data pages in the page buffer. + * + * min_md_pages: 64 bit integer containing the number of pages in the + * page buffer reserved for metadata. 
No metadata page may be + * evicted from the page buffer if curr_md_pages is less than or + * equal to this value. + * + * min_rd_pages: 64 bit integer containing the number of pages in the + * page buffer reserved for raw data. No page of raw data may be + * evicted from the page buffer if curr_rd_pages is less than or + * equal to this value. + * + * The FAPL fields are used to store the page buffer configuration data + * provided to the page buffer in the H5PB_create() call. + * + * max_size: Maximum page buffer size supplied by the FAPL. + * + * min_meta_perc: Percent of the page buffer reserved for metadata as + * supplied in the FAPL. + * + * min_raw_perc: Percent of the page buffer reserved for raw data as + * supplied in the FAPL. + * + * The purpose of the index is to allow us to efficiently look up all pages + * (and multi-page metadata entries in the context of VFD SWMR) in the + * page buffer. + * + * This function is provided by a hash table with chaining, albeit with one + * unusual feature. + * + * Specifically hash table size must be a power of two, and the hash function + * simply clips the high order bits off the page offset of the entry. + * + * This should work, as space is typically allocated sequentially, and thus + * via a reverse principle of locality argument, hot pages are unlikely to + * hash to the same bucket. That said, we must collect statistics to alert + * us should this not be the case. + * + * ht: Array of pointers to H5PB_entry_t of size + * H5PB__HASH_TABLE_LEN. This size must be a power of 2, + * not the usual prime number. + * + * index_len: Number of entries currently in the hash table used to index + * the page buffer. + * + * index_size: Number of bytes currently stored in the hash table used to + * index the page buffer. Under normal circumstances, this + * value will be index_len * page size. However, if + * vfd_swmr_writer is TRUE, it may be larger.
+ * + * Fields supporting the modified LRU policy: + * + * See almost any OS text for a discussion of the LRU replacement policy. + * + * Discussions of the individual fields used by the modified LRU replacement + * policy follow: + * + * LRU_len: Number of page buffer entries currently on the LRU. + * + * Observe that LRU_len + dwl_len must always equal + * index_len. + * + * LRU_size: Number of bytes of page buffer entries currently residing + * on the LRU list. + * + * Observe that LRU_size + dwl_size must always equal + * index_size. + * + * LRU_head_ptr: Pointer to the head of the doubly linked LRU list. Page + * buffer entries on this list are linked by their next and + * prev fields. + * + * This field is NULL if the list is empty. + * + * LRU_tail_ptr: Pointer to the tail of the doubly linked LRU list. Page + * buffer entries on this list are linked by their next and + * prev fields. + * + * This field is NULL if the list is empty. + * + * + * FIELDS FOR VFD SWMR: + * + * vfd_swmr_writer: Boolean flag that is set to TRUE iff + * the file is opened in VFD SWMR mode. The remaining + * VFD SWMR flags are defined iff vfd_swmr_writer is TRUE. + * + * mpmde_count: int64_t containing the number of multi-page metadata + * entries currently resident in the page buffer. Observe + * that index_len should always equal curr_pages + mpmde_count. + * + * cur_tick: uint64_t containing the current tick. This is a copy of + * the same field in the associated instance of H5F_file_t, + * and is maintained as a convenience. + * + * In the context of VFD SWMR the delayed write list allows us to delay + * metadata writes to the HDF5 file until it appears in all indexes in the + * last max_lag ticks. This is essential if a version of the page or + * multi-page metadata entry already exists in the HDF5 file -- failure to + * delay the write can result in a message from the future which will + * likely be perceived as file corruption by the reader.
+ * + * To facilitate identification of entries that must be removed from the + * DWL, the list always observes the following invariant for any entry + * on the list: + * + * entry_ptr->next == NULL || + * entry_ptr->delay_write_until >= entry_ptr->next->delay_write_until + * + * Discussion of the fields used to implement the delayed write list follows: + * + * max_delay: Maximum of the delay_write_until fields of the entries on + * the delayed write list. This must never be more than max_lag + * ticks in advance of the current tick, and should be set to + * zero if the delayed write list is empty. + * + * dwl_len: Number of page buffer entries currently on the delayed + * write list. + * + * Observe that LRU_len + dwl_len must always equal + * index_len. + * + * dwl_size: Number of bytes of page buffer entries currently residing + * on the delayed write list. + * + * Observe that LRU_size + dwl_size must always equal + * index_size. + * + * dwl_head_ptr: Pointer to the head of the doubly linked delayed write list. + * Page buffer entries on this list are linked by their next and + * prev fields. + * + * This field is NULL if the list is empty. + * + * dwl_tail_ptr: Pointer to the tail of the doubly linked delayed write list. + * Page buffer entries on this list are linked by their next and + * prev fields. + * + * This field is NULL if the list is empty. + * + * For VFD SWMR to function, copies of all pages modified during a tick must + * be retained in the page buffer to allow correct updates to the index and + * metadata file at the end of tick. + * + * To implement this, all entries modified during the current tick are placed + * on the tick list. Entries are removed from the tick list during end of + * tick processing, so each tick starts with an empty tick list. + * + * Unless the entry also resides on the delayed write list, entries on the + * tick list may be flushed, but they may not be evicted.
+ * + * Discussion of the fields used to implement the tick list follows: + * + * tl_len: Number of page buffer entries currently on the tick list. + * + * tl_size: Number of bytes of page buffer entries currently residing + * on the tick list. + * + * tl_head_ptr: Pointer to the head of the doubly linked tick list. + * Page buffer entries on this list are linked by their tl_next + * and tl_prev fields. + * + * This field is NULL if the list is empty. + * + * tl_tail_ptr: Pointer to the tail of the doubly linked tick list. + * Page buffer entries on this list are linked by their tl_next + * and tl_prev fields. + * + * This field is NULL if the list is empty. + * + * + * STATISTICS: + * + * Multi-page metadata entries (which may only appear in VFD + * SWMR mode) are NOT counted in the following statistics. + * + * Note that all statistics fields contain only data since the last time + * that statistics were reset. + * + * bypasses: Array of int64_t of length H5PB__NUM_STAT_TYPES containing + * the number of times that the page buffer has been + * bypassed for metadata, raw data, and for multi-page + * metadata entries (VFD SWMR only) as indexed by H5PB__STATS_MD, + * H5PB__STATS_RD, and H5PB__STATS_MPMDE respectively. + * + * accesses: Array of int64_t of length H5PB__NUM_STAT_TYPES containing + * the number of page buffer accesses for metadata, raw data, + * and for multi-page metadata entries (VFD SWMR only) as + * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and + * H5PB__STATS_MPMDE respectively. + * + * hits: Array of int64_t of length H5PB__NUM_STAT_TYPES containing + * the number of page buffer hits for metadata, raw data, + * and for multi-page metadata entries (VFD SWMR only) as + * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and + * H5PB__STATS_MPMDE respectively.
+ * + * misses: Array of int64_t of length H5PB__NUM_STAT_TYPES containing + * the number of page buffer misses for metadata, raw data, + * and for multi-page metadata entries (VFD SWMR only) as + * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and + * H5PB__STATS_MPMDE respectively. + * + * loads: Array of int64_t of length H5PB__NUM_STAT_TYPES containing + * the number of page buffer loads for metadata, raw data, + * and for multi-page metadata entries (VFD SWMR only) as + * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and + * H5PB__STATS_MPMDE respectively. + * + * insertions: Array of int64_t of length H5PB__NUM_STAT_TYPES containing + * the number of page buffer insertions of metadata, raw data, + * and for multi-page metadata entries (VFD SWMR only) as + * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and + * H5PB__STATS_MPMDE respectively. + * + * flushes: Array of int64_t of length H5PB__NUM_STAT_TYPES containing + * the number of page buffer flushes of metadata, raw data, + * and for multi-page metadata entries (VFD SWMR only) as + * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and + * H5PB__STATS_MPMDE respectively. + * + * evictions: Array of int64_t of length H5PB__NUM_STAT_TYPES containing + * the number of page buffer evictions of metadata, raw data, + * and for multi-page metadata entries (VFD SWMR only) as + * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and + * H5PB__STATS_MPMDE respectively. + * + * clears: Array of int64_t of length H5PB__NUM_STAT_TYPES containing + * the number of page buffer entry clears of metadata, raw data, + * and for multi-page metadata entries (VFD SWMR only) as + * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and + * H5PB__STATS_MPMDE respectively. + * + * max_lru_len: int64_t containing the maximum number of entries that + * have appeared in the LRU. + * + * max_lru_size: int64_t containing the maximum size of the LRU.
+ * + * lru_md_skips: When searching for an entry to evict, metadata entries on + * the LRU must be skipped if the number of metadata pages + * in the page buffer fails to exceed min_md_pages. + * + * This int64_t is used to keep a count of these skips. + * + * If this number becomes excessive, it will be necessary to + * add a holding tank for such entries. + * + * lru_rd_skips: When searching for an entry to evict, raw data entries on + * the LRU must be skipped if the number of raw data pages + * in the page buffer fails to exceed min_rd_pages. + * + * This int64_t is used to keep a count of these skips. + * + * If this number becomes excessive, it will be necessary to + * add a holding tank for such entries. + * + * Multi-page metadata entries (which appear only in VFD SWMR mode) are + * listed in the hash table, and thus they are counted in the following + * statistics. + * + * total_ht_insertions: Number of times entries have been inserted into the + * hash table. + * + * total_ht_deletions: Number of times entries have been deleted from the + * hash table. + * + * successful_ht_searches: int64 containing the total number of successful + * searches of the hash table. + * + * total_successful_ht_search_depth: int64 containing the total number of + * entries other than the targets examined in successful + * searches of the hash table. + * + * failed_ht_searches: int64 containing the total number of unsuccessful + * searches of the hash table. + * + * total_failed_ht_search_depth: int64 containing the total number of + * entries examined in unsuccessful searches of the hash + * table. + * + * max_index_len: Largest value attained by the index_len field. + * + * max_index_size: Largest value attained by the index_size field. + * + * max_rd_pages: Maximum number of raw data pages in the page buffer. + * + * max_md_pages: Maximum number of metadata pages in the page buffer. + * + * + * Statistics pertaining to VFD SWMR.
+ * + * max_mpmde_count: Maximum number of multi-page metadata entries in the + * page buffer. + * + * lru_tl_skips: When searching for an entry to evict, metadata entries on + * the LRU must be skipped if they also reside on the tick list. + * + * This int64_t is used to keep a count of these skips. + * + * If this number becomes excessive, it will be necessary to + * add a holding tank for such entries. + * + * lru_dwl_skips: When searching for an entry to evict, metadata entries on + * the LRU must be skipped if they also reside on the tick list. + * + * This int64_t is used to keep a count of these skips. + * + * If this number becomes excessive, it will be necessary to + * add a holding tank for such entries. + * + * max_tl_len: int64_t containing the maximum value of tl_len. + * + * max_tl_size: int64_t containing the maximum value of tl_size. + * + * delayed_writes: int64_t containing the total number of delayed writes. + * + * total_delay: int64_t containing the total number of ticks by which + * entry writes have been delayed. + * + * max_dwl_len: int64_t containing the maximum value of dwl_len. + * + * max_dwl_size: int64_t containing the maximum value of dwl_size. + * + * total_dwl_ins_depth: int64_t containing the total insertion depth + * required to maintain the ordering invariant on the + * delayed write list.
+ * + ******************************************************************************/ + +#define H5PB__H5PB_T_MAGIC 0x01020304 + +#define H5PB__STATS_MD 0 +#define H5PB__STATS_RD 1 +#define H5PB__STATS_MPMDE 2 +#define H5PB__NUM_STAT_TYPES 3 -/* Typedef for the main structure for the page buffer */ typedef struct H5PB_t { - size_t max_size; /* The total page buffer size */ - size_t page_size; /* Size of a single page */ - unsigned min_meta_perc; /* Minimum ratio of metadata entries required before evicting meta entries */ - unsigned min_raw_perc; /* Minimum ratio of raw data entries required before evicting raw entries */ - unsigned meta_count; /* Number of entries for metadata */ - unsigned raw_count; /* Number of entries for raw data */ - unsigned min_meta_count; /* Minimum # of entries for metadata */ - unsigned min_raw_count; /* Minimum # of entries for raw data */ - - H5SL_t *slist_ptr; /* Skip list with all the active page entries */ - H5SL_t *mf_slist_ptr; /* Skip list containing newly allocated page entries inserted from the MF layer */ - - size_t LRU_list_len; /* Number of entries in the LRU (identical to slist_ptr count) */ - struct H5PB_entry_t *LRU_head_ptr; /* Head pointer of the LRU */ - struct H5PB_entry_t *LRU_tail_ptr; /* Tail pointer of the LRU */ - - H5FL_fac_head_t *page_fac; /* Factory for allocating pages */ - - /* Statistics */ - unsigned accesses[2]; - unsigned hits[2]; - unsigned misses[2]; - unsigned evictions[2]; - unsigned bypasses[2]; + + /* Fields for general operations: */ + + uint32_t magic; + size_t page_size; + int64_t max_pages; + int64_t curr_pages; + int64_t curr_md_pages; + int64_t curr_rd_pages; + int64_t min_md_pages; + int64_t min_rd_pages; + + /* FAPL fields */ + size_t max_size; + unsigned min_meta_perc; + unsigned min_raw_perc; + + /* index */ + H5PB_entry_t *(ht[H5PB__HASH_TABLE_LEN]); + int64_t index_len; + int64_t index_size; + + /* LRU */ + int64_t LRU_len; + int64_t LRU_size; + H5PB_entry_t * LRU_head_ptr; + 
H5PB_entry_t * LRU_tail_ptr; + + + /* Fields for VFD SWMR operations: */ + + hbool_t vfd_swmr_writer; + int64_t mpmde_count; + uint64_t cur_tick; + + /* delayed write list */ + uint64_t max_delay; + int64_t dwl_len; + int64_t dwl_size; + H5PB_entry_t * dwl_head_ptr; + H5PB_entry_t * dwl_tail_ptr; + + /* tick list */ + int64_t tl_len; + int64_t tl_size; + H5PB_entry_t * tl_head_ptr; + H5PB_entry_t * tl_tail_ptr; + + /* Statistics: */ + + /* general operations statistics: */ + /* these statistics count pages only, not multi-page metadata entries + * (that occur only in the VFD SWMR writer case). + */ + int64_t bypasses[H5PB__NUM_STAT_TYPES]; + int64_t accesses[H5PB__NUM_STAT_TYPES]; + int64_t hits[H5PB__NUM_STAT_TYPES]; + int64_t misses[H5PB__NUM_STAT_TYPES]; + int64_t loads[H5PB__NUM_STAT_TYPES]; + int64_t insertions[H5PB__NUM_STAT_TYPES]; + int64_t flushes[H5PB__NUM_STAT_TYPES]; + int64_t evictions[H5PB__NUM_STAT_TYPES]; + int64_t clears[H5PB__NUM_STAT_TYPES]; + int64_t max_lru_len; + int64_t max_lru_size; + int64_t lru_md_skips; + int64_t lru_rd_skips; + + /* In the VFD SWMR case, both pages and multi-page metadata entries + * are stored in the index. Thus mult-page metadata entries are + * included in the index related statistics. 
+ */ + int64_t total_ht_insertions; + int64_t total_ht_deletions; + int64_t successful_ht_searches; + int64_t total_successful_ht_search_depth; + int64_t failed_ht_searches; + int64_t total_failed_ht_search_depth; + int64_t max_index_len; + int64_t max_index_size; + int64_t max_rd_pages; + int64_t max_md_pages; + + + /* vfd swmr statistics */ + int64_t max_mpmde_count; + int64_t lru_tl_skips; + int64_t lru_dwl_skips; + int64_t max_tl_len; + int64_t max_tl_size; + int64_t delayed_writes; + int64_t total_delay; + int64_t max_dwl_len; + int64_t max_dwl_size; + int64_t total_dwl_ins_depth; + } H5PB_t; /*****************************/ @@ -85,20 +547,38 @@ typedef struct H5PB_t { /***************************************/ /* General routines */ -H5_DLL herr_t H5PB_create(H5F_t *file, size_t page_buffer_size, unsigned page_buf_min_meta_perc, unsigned page_buf_min_raw_perc); +H5_DLL herr_t H5PB_create(H5F_t *file, size_t page_buffer_size, + unsigned page_buf_min_meta_perc, unsigned page_buf_min_raw_perc); + H5_DLL herr_t H5PB_flush(H5F_t *f); + H5_DLL herr_t H5PB_dest(H5F_t *f); + H5_DLL herr_t H5PB_add_new_page(H5F_t *f, H5FD_mem_t type, haddr_t page_addr); -H5_DLL herr_t H5PB_update_entry(H5PB_t *page_buf, haddr_t addr, size_t size, const void *buf); + +H5_DLL herr_t H5PB_update_entry(H5PB_t *page_buf, haddr_t addr, size_t size, + const void *buf); + H5_DLL herr_t H5PB_remove_entry(const H5F_t *f, haddr_t addr); -H5_DLL herr_t H5PB_read(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*out*/); -H5_DLL herr_t H5PB_write(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, const void *buf); + +H5_DLL herr_t H5PB_read(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, + void *buf/*out*/); + +H5_DLL herr_t H5PB_write(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, + const void *buf); /* Statistics routines */ H5_DLL herr_t H5PB_reset_stats(H5PB_t *page_buf); + H5_DLL herr_t H5PB_get_stats(const H5PB_t *page_buf, unsigned accesses[2], - unsigned 
hits[2], unsigned misses[2], unsigned evictions[2], unsigned bypasses[2]); + unsigned hits[2], unsigned misses[2], unsigned evictions[2], + unsigned bypasses[2]); + H5_DLL herr_t H5PB_print_stats(const H5PB_t *page_buf); +/* test & debug functions */ +H5_DLL herr_t H5PB_page_exists(H5F_t *f, haddr_t addr, + hbool_t *page_exists_ptr); + #endif /* !_H5PBprivate_H */ |