summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/H5.c1
-rw-r--r--src/H5AC.c70
-rw-r--r--src/H5ACmpio.c2
-rw-r--r--src/H5ACprivate.h2
-rw-r--r--src/H5ACproxy_entry.c1
-rw-r--r--src/H5B2cache.c3
-rw-r--r--src/H5Bcache.c1
-rw-r--r--src/H5C.c1184
-rw-r--r--src/H5Cdbg.c16
-rw-r--r--src/H5Cepoch.c27
-rw-r--r--src/H5Cpkg.h219
-rw-r--r--src/H5Cprefetched.c1
-rw-r--r--src/H5Cprivate.h226
-rw-r--r--src/H5Ctag.c6
-rw-r--r--src/H5Dbtree.c3
-rw-r--r--src/H5Dbtree2.c3
-rw-r--r--src/H5Dchunk.c68
-rw-r--r--src/H5Dearray.c3
-rw-r--r--src/H5Dfarray.c3
-rw-r--r--src/H5Dint.c2
-rw-r--r--src/H5Dnone.c3
-rw-r--r--src/H5Dpkg.h5
-rw-r--r--src/H5Dsingle.c3
-rw-r--r--src/H5Dvirtual.c213
-rw-r--r--src/H5EAcache.c5
-rw-r--r--src/H5F.c105
-rw-r--r--src/H5FAcache.c3
-rw-r--r--src/H5FD.c104
-rw-r--r--src/H5FDcore.c1
-rw-r--r--src/H5FDfamily.c1
-rw-r--r--src/H5FDhdfs.c1
-rw-r--r--src/H5FDint.c3
-rw-r--r--src/H5FDlog.c1
-rw-r--r--src/H5FDmulti.c1
-rw-r--r--src/H5FDpkg.h1
-rw-r--r--src/H5FDprivate.h214
-rw-r--r--src/H5FDpublic.h7
-rw-r--r--src/H5FDsec2.c1
-rw-r--r--src/H5FDsplitter.c1
-rw-r--r--src/H5FDstdio.c1
-rw-r--r--src/H5FDtest.c72
-rw-r--r--src/H5FDvfd_swmr.c1607
-rw-r--r--src/H5FDvfd_swmr.h38
-rw-r--r--src/H5FDvfd_swmr_instr.c28
-rw-r--r--src/H5FDvfd_swmr_private.h93
-rw-r--r--src/H5FScache.c2
-rw-r--r--src/H5FSprivate.h1
-rw-r--r--src/H5FSsection.c2
-rw-r--r--src/H5Fint.c306
-rw-r--r--src/H5Fio.c83
-rw-r--r--src/H5Fpkg.h126
-rw-r--r--src/H5Fprivate.h42
-rw-r--r--src/H5Fpublic.h97
-rw-r--r--src/H5Fquery.c23
-rw-r--r--src/H5Fsfile.c30
-rw-r--r--src/H5Fspace.c1
-rw-r--r--src/H5Fsuper_cache.c269
-rw-r--r--src/H5Ftest.c376
-rw-r--r--src/H5Fvfd_swmr.c2111
-rw-r--r--src/H5Gcache.c1
-rw-r--r--src/H5HFcache.c3
-rw-r--r--src/H5HG.c3
-rw-r--r--src/H5HGcache.c1
-rw-r--r--src/H5HGprivate.h2
-rw-r--r--src/H5HGtrap.c30
-rw-r--r--src/H5HLcache.c2
-rw-r--r--src/H5MF.c349
-rw-r--r--src/H5MFaggr.c3
-rw-r--r--src/H5MFprivate.h2
-rw-r--r--src/H5MFsection.c27
-rw-r--r--src/H5MV.c721
-rw-r--r--src/H5MVmodule.h33
-rw-r--r--src/H5MVpkg.h85
-rw-r--r--src/H5MVprivate.h58
-rw-r--r--src/H5MVsection.c395
-rw-r--r--src/H5Ocache.c2
-rw-r--r--src/H5Oflush.c59
-rw-r--r--src/H5Oprivate.h11
-rw-r--r--src/H5PB.c5028
-rw-r--r--src/H5PBpkg.h1917
-rw-r--r--src/H5PBprivate.h701
-rw-r--r--src/H5Pfapl.c233
-rw-r--r--src/H5Pint.c1
-rw-r--r--src/H5Ppublic.h6
-rw-r--r--src/H5SMcache.c2
-rw-r--r--src/H5VLnative.h3
-rw-r--r--src/H5VLnative_file.c35
-rw-r--r--src/H5private.h59
-rw-r--r--src/H5public.h137
-rw-r--r--src/H5queue.h847
-rw-r--r--src/H5retry_private.h114
-rw-r--r--src/H5system.c25
-rw-r--r--src/H5time_private.h109
-rw-r--r--src/Makefile.am19
-rw-r--r--src/hdf5.h1
-rw-r--r--src/hlog.c366
-rw-r--r--src/hlog.h138
97 files changed, 17459 insertions, 1891 deletions
diff --git a/src/H5.c b/src/H5.c
index 31b8546..4b9b36c 100644
--- a/src/H5.c
+++ b/src/H5.c
@@ -85,7 +85,6 @@ char H5_lib_vers_info_g[] = H5_VERS_INFO;
static hbool_t H5_dont_atexit_g = FALSE;
H5_debug_t H5_debug_g; /* debugging info */
-
/*******************/
/* Local Variables */
/*******************/
diff --git a/src/H5AC.c b/src/H5AC.c
index 6972a31..9402634 100644
--- a/src/H5AC.c
+++ b/src/H5AC.c
@@ -101,27 +101,29 @@ hbool_t H5_coll_api_sanity_check_g = false;
*/
static const H5AC_class_t *const H5AC_class_s[] = {
- H5AC_BT, /* ( 0) B-tree nodes */
- H5AC_SNODE, /* ( 1) symbol table nodes */
- H5AC_LHEAP_PRFX, /* ( 2) local heap prefix */
- H5AC_LHEAP_DBLK, /* ( 3) local heap data block */
- H5AC_GHEAP, /* ( 4) global heap */
- H5AC_OHDR, /* ( 5) object header */
- H5AC_OHDR_CHK, /* ( 6) object header chunk */
- H5AC_BT2_HDR, /* ( 7) v2 B-tree header */
- H5AC_BT2_INT, /* ( 8) v2 B-tree internal node */
- H5AC_BT2_LEAF, /* ( 9) v2 B-tree leaf node */
- H5AC_FHEAP_HDR, /* (10) fractal heap header */
- H5AC_FHEAP_DBLOCK, /* (11) fractal heap direct block */
- H5AC_FHEAP_IBLOCK, /* (12) fractal heap indirect block */
- H5AC_FSPACE_HDR, /* (13) free space header */
- H5AC_FSPACE_SINFO, /* (14) free space sections */
- H5AC_SOHM_TABLE, /* (15) shared object header message master table */
- H5AC_SOHM_LIST, /* (16) shared message index stored as a list */
- H5AC_EARRAY_HDR, /* (17) extensible array header */
- H5AC_EARRAY_IBLOCK, /* (18) extensible array index block */
- H5AC_EARRAY_SBLOCK, /* (19) extensible array super block */
- H5AC_EARRAY_DBLOCK, /* (20) extensible array data block */
+ H5AC_BT, /* ( 0) B-tree nodes */
+ H5AC_SNODE, /* ( 1) symbol table nodes */
+ H5AC_LHEAP_PRFX, /* ( 2) local heap prefix */
+ H5AC_LHEAP_DBLK, /* ( 3) local heap data block */
+ H5AC_GHEAP, /* ( 4) global heap */
+ H5AC_OHDR, /* ( 5) object header */
+ H5AC_OHDR_CHK, /* ( 6) object header chunk */
+ H5AC_BT2_HDR, /* ( 7) v2 B-tree header */
+ H5AC_BT2_INT, /* ( 8) v2 B-tree internal node */
+ H5AC_BT2_LEAF, /* ( 9) v2 B-tree leaf node */
+ H5AC_FHEAP_HDR, /* (10) fractal heap header */
+ H5AC_FHEAP_DBLOCK, /* (11) fractal heap direct block */
+ H5AC_FHEAP_IBLOCK, /* (12) fractal heap indirect block */
+ H5AC_FSPACE_HDR, /* (13) free space header */
+ H5AC_FSPACE_SINFO, /* (14) free space sections */
+ H5AC_SOHM_TABLE, /* (15) shared object header message */
+ /* master table */
+ H5AC_SOHM_LIST, /* (16) shared message index stored as */
+ /* a list */
+ H5AC_EARRAY_HDR, /* (17) extensible array header */
+ H5AC_EARRAY_IBLOCK, /* (18) extensible array index block */
+ H5AC_EARRAY_SBLOCK, /* (19) extensible array super block */
+ H5AC_EARRAY_DBLOCK, /* (20) extensible array data block */
H5AC_EARRAY_DBLK_PAGE, /* (21) extensible array data block page */
H5AC_FARRAY_HDR, /* (22) fixed array header */
H5AC_FARRAY_DBLOCK, /* (23) fixed array data block */
@@ -278,6 +280,11 @@ H5AC_cache_image_pending(const H5F_t *f)
* matzke@llnl.gov
* Jul 9 1997
*
+ * Changes:     Added code to configure the metadata cache for VFD SWMR
+ * reader operations when indicated.
+ *
+ * JRM -- 1/15/19
+ *
*-------------------------------------------------------------------------
*/
herr_t
@@ -416,6 +423,23 @@ H5AC_create(const H5F_t *f, H5AC_cache_config_t *config_ptr, H5AC_cache_image_co
if(H5C_log_set_up(f->shared->cache, H5F_MDC_LOG_LOCATION(f), H5C_LOG_STYLE_JSON, H5F_START_MDC_LOG_ON_ACCESS(f)) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_LOGGING, FAIL, "mdc logging setup failed")
+ /* Configure the metadata cache for VFD SWMR reader operation if
+ * specified.
+ */
+ if ( ( H5F_VFD_SWMR_CONFIG(f) ) &&
+ ( !f->shared->vfd_swmr_config.writer ) ) {
+
+ HDassert(!(H5F_INTENT(f) & H5F_ACC_RDWR));
+ HDassert(f->shared->fs_page_size > 0);
+
+ if ( H5C_set_vfd_swmr_reader(f->shared->cache, TRUE,
+ f->shared->fs_page_size) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, \
+ "can't configure MDC for VFD SWMR reader operations");
+
+ }
+
/* Set the cache parameters */
if(H5AC_set_cache_auto_resize_config(f->shared->cache, config_ptr) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "auto resize configuration failed")
@@ -2310,7 +2334,7 @@ done:
*------------------------------------------------------------------------------
*/
herr_t
-H5AC_expunge_tag_type_metadata(H5F_t *f, haddr_t tag, int type_id, unsigned flags)
+H5AC_expunge_tag_type_metadata(H5F_t *f, haddr_t tag, int type_id, unsigned flags, hbool_t type_match)
{
/* Variable Declarations */
herr_t ret_value = SUCCEED;
@@ -2323,7 +2347,7 @@ H5AC_expunge_tag_type_metadata(H5F_t *f, haddr_t tag, int type_id, unsigned flag
HDassert(f->shared);
/* Call cache level function to expunge entries with specified tag and type id */
- if(H5C_expunge_tag_type_metadata(f, tag, type_id, flags) < 0)
+ if(H5C_expunge_tag_type_metadata(f, tag, type_id, flags, type_match)<0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Cannot expunge tagged type entries")
done:
diff --git a/src/H5ACmpio.c b/src/H5ACmpio.c
index f097e83..824475d 100644
--- a/src/H5ACmpio.c
+++ b/src/H5ACmpio.c
@@ -498,7 +498,7 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5AC__construct_candidate_list(H5AC_t *cache_ptr, H5AC_aux_t H5_ATTR_NDEBUG_UNUSED *aux_ptr,
+H5AC__construct_candidate_list(H5AC_t *cache_ptr, H5AC_aux_t H5_ATTR_SANITY_CHECK *aux_ptr,
int sync_point_op)
{
herr_t ret_value = SUCCEED; /* Return value */
diff --git a/src/H5ACprivate.h b/src/H5ACprivate.h
index b932e16..7010a60 100644
--- a/src/H5ACprivate.h
+++ b/src/H5ACprivate.h
@@ -439,7 +439,7 @@ H5_DLL void H5AC_set_ring(H5AC_ring_t ring, H5AC_ring_t *orig_ring);
H5_DLL herr_t H5AC_unsettle_entry_ring(void *entry);
H5_DLL herr_t H5AC_unsettle_ring(H5F_t * f, H5AC_ring_t ring);
H5_DLL herr_t H5AC_expunge_tag_type_metadata(H5F_t *f, haddr_t tag, int type_id,
- unsigned flags);
+ unsigned flags, hbool_t type_match);
H5_DLL herr_t H5AC_get_tag(const void *thing, /*OUT*/ haddr_t *tag);
/* Virtual entry routines */
diff --git a/src/H5ACproxy_entry.c b/src/H5ACproxy_entry.c
index 498d023..1302b83 100644
--- a/src/H5ACproxy_entry.c
+++ b/src/H5ACproxy_entry.c
@@ -82,6 +82,7 @@ const H5AC_class_t H5AC_PROXY_ENTRY[1] = {{
H5AC__proxy_entry_notify, /* 'notify' callback */
H5AC__proxy_entry_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
diff --git a/src/H5B2cache.c b/src/H5B2cache.c
index 80cb6c5..5b0b8b2 100644
--- a/src/H5B2cache.c
+++ b/src/H5B2cache.c
@@ -113,6 +113,7 @@ const H5AC_class_t H5AC_BT2_HDR[1] = {{
H5B2__cache_hdr_notify, /* 'notify' callback */
H5B2__cache_hdr_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* H5B2 inherits cache-like properties from H5AC */
@@ -131,6 +132,7 @@ const H5AC_class_t H5AC_BT2_INT[1] = {{
H5B2__cache_int_notify, /* 'notify' callback */
H5B2__cache_int_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* H5B2 inherits cache-like properties from H5AC */
@@ -149,6 +151,7 @@ const H5AC_class_t H5AC_BT2_LEAF[1] = {{
H5B2__cache_leaf_notify, /* 'notify' callback */
H5B2__cache_leaf_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
diff --git a/src/H5Bcache.c b/src/H5Bcache.c
index c2c7a80..24a6716 100644
--- a/src/H5Bcache.c
+++ b/src/H5Bcache.c
@@ -82,6 +82,7 @@ const H5AC_class_t H5AC_BT[1] = {{
NULL, /* 'notify' callback */
H5B__cache_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/*******************/
diff --git a/src/H5C.c b/src/H5C.c
index 91e4158..948e781 100644
--- a/src/H5C.c
+++ b/src/H5C.c
@@ -80,6 +80,7 @@
/* Headers */
/***********/
#include "H5private.h" /* Generic Functions */
+#include "H5retry_private.h" /* Retry loops. */
#include "H5Cpkg.h" /* Cache */
#include "H5CXprivate.h" /* API Contexts */
#include "H5Eprivate.h" /* Error handling */
@@ -320,14 +321,22 @@ H5C_create(size_t max_cache_size,
cache_ptr->slist_ring_size[i] = (size_t)0;
} /* end for */
- for(i = 0; i < H5C__HASH_TABLE_LEN; i++)
+ for(i = 0; i < H5C__HASH_TABLE_LEN; i++) {
(cache_ptr->index)[i] = NULL;
+ }
cache_ptr->il_len = 0;
cache_ptr->il_size = (size_t)0;
cache_ptr->il_head = NULL;
cache_ptr->il_tail = NULL;
+ /* Fields supporting VFD SWMR */
+ cache_ptr->vfd_swmr_reader = FALSE;
+ for(i = 0; i < H5C__PAGE_HASH_TABLE_LEN; i++) {
+ (cache_ptr->page_index)[i] = NULL;
+ }
+ cache_ptr->page_size = 0;
+
/* Tagging Field Initializations */
cache_ptr->ignore_tags = FALSE;
cache_ptr->num_objs_corked = 0;
@@ -737,8 +746,9 @@ herr_t
H5C_prep_for_file_close(H5F_t *f)
{
H5C_t * cache_ptr;
- hbool_t image_generated = FALSE; /* Whether a cache image was generated */
- herr_t ret_value = SUCCEED; /* Return value */
+ hbool_t image_generated = FALSE; /* Whether a cache image was */
+ /* generated */
+ herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(FAIL)
@@ -919,6 +929,396 @@ done:
/*-------------------------------------------------------------------------
+ * Function: H5C_evict_or_refresh_all_entries_in_page
+ *
+ * Purpose: When a file is opened in VFD SWMR reader mode, we must be
+ * able to ensure that the metadata cache contains no stale
+ * entries at the end of each tick.
+ *
+ * To do this, we must identify pages that have changed in
+ * the last tick, and either evict, or refresh all modified
+ * entries in the modified pages. If an evicted entry is
+ * needed subsequently, it must be reloaded, almost always
+ * from the metadata file.
+ *
+ *		This function performs this operation on a given page buffer
+ * page.
+ *
+ * This is done by mapping the supplied page to associated
+ * hash bucket in the page_index, and then scanning the
+ * contents of the bucket for entries residing in the
+ * target page.
+ *
+ * For each such entry, we test to see if it is pinned.
+ * If it is not, we simply evict it.
+ *
+ * Pinned entries may in turn be divided into tagged and
+ * un-tagged entries.
+ *
+ * For pinned tagged entries, it would be best if we could
+ * simply tell the associated cache client to refresh it.
+ * However, until we have that facility, we look up its tag,
+ * and evict all entries associated with that on disk object.
+ *
+ * For pinned, un-tagged entries (i.e. super block, global
+ * heaps, etc. we must instruct the client to refresh the
+ * entry. Fortunately, this is only necessary for the
+ * super block in the initial VFD SWMR implementation.
+ *
+ * Note that there is also the possibility that while the
+ * page was modified, one or more metadata entries in
+ * that page were not. Eventually we should write code
+ * to detect this -- but not for the prototype.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 12/16/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page,
+ uint32_t length, uint64_t tick)
+{
+ int i;
+ size_t image_len;
+ size_t original_image_len;
+ void * image_ptr = NULL;
+ void * new_image_ptr = NULL;
+ unsigned flush_flags = (H5C__FLUSH_INVALIDATE_FLAG |
+ H5C__FLUSH_CLEAR_ONLY_FLAG);
+ haddr_t tag;
+ H5C_t * cache_ptr = NULL;
+ H5C_cache_entry_t * entry_ptr;
+ H5C_cache_entry_t * follow_ptr = NULL;
+ herr_t ret_value = SUCCEED; /* Return value */
+ bool found = false;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Sanity check */
+ HDassert(f);
+ HDassert(f->shared);
+
+ cache_ptr = f->shared->cache;
+
+ HDassert(cache_ptr);
+ HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
+ HDassert(cache_ptr->vfd_swmr_reader);
+
+#if 0 /* JRM */
+ HDfprintf(stderr,
+ "H5C_evict_or_refresh_all_entries_in_page() entering. page = %lld\n",
+ page);
+#endif /* JRM */
+
+ /* since file must be opened R/O for a VFD SWMR reader, the skip
+ * list must be empty. Verify this.
+ */
+ HDassert(cache_ptr->slist_len == 0);
+
+ i = H5C__PI_HASH_FCN(page);
+
+ entry_ptr = (cache_ptr->page_index)[i];
+
+ while (entry_ptr) {
+
+ HDassert(entry_ptr->magic == H5C__H5C_CACHE_ENTRY_T_MAGIC);
+
+ if ( entry_ptr->page == page ) {
+
+ HDassert(entry_ptr->addr >= (haddr_t)(page * cache_ptr->page_size));
+ HDassert(entry_ptr->addr <
+ (haddr_t)((page+1) * cache_ptr->page_size));
+ HDassert(length == cache_ptr->page_size ||
+ page * cache_ptr->page_size + length <=
+ entry_ptr->addr + entry_ptr->size);
+
+ found = true;
+
+ /* since end of tick occurs only on API call entry in
+ * the VFD SWMR reader case, the entry must not be protected.
+ *
+ * since the VFD SWMR reader must have opened the file R/O,
+ * the entry must be clean.
+ */
+ HDassert(!(entry_ptr->is_protected));
+ HDassert(!(entry_ptr->is_dirty));
+
+ /* we must evict the entry, as page has been modified, and
+ * thus the entry may be out of date.
+ *
+ * Note that we should eventually modify this code to be more
+ * intelligent, and only evict entries if they have in fact changed.
+ * However, no time for that in the first cut.
+ */
+ if ( entry_ptr->is_pinned ) {
+
+ /* if the entry has tag_info and there is no refresh
+ * callback, a call to H5C_evict_tagged_entries() is the
+ * only option available.
+ */
+ if ( ( entry_ptr->tag_info ) &&
+ ( entry_ptr->type->refresh == NULL ) ) {
+
+ tag = entry_ptr->tag_info->tag;
+
+ HDassert(!(entry_ptr->tag_info->corked));
+#if 0 /* JRM */
+ HDfprintf(stderr,
+ "evicting tagged entries addr/page/tag == %lld/%lld/%lld\n",
+ entry_ptr->addr, entry_ptr->page, tag);
+#endif /* JRM */
+
+ /* passing TRUE for the match_global parameter. Look
+ * into this and verify that it is the right thing to
+ * do.
+ */
+ if ( H5C_evict_tagged_entries(f, tag, TRUE) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTEXPUNGE, FAIL, \
+ "can't evict pinned and tagged entries")
+
+ /* Both follow_ptr and entry_ptr may have been removed.
+ * Set both to NULL to force the scan to restart.
+ */
+ follow_ptr = entry_ptr = NULL;
+ } else if ( entry_ptr->type->refresh ) {
+#if 0 /* JRM */
+ HDfprintf(stderr, "refreshing addr/page/tag == %lld/%lld\n",
+ entry_ptr->addr, entry_ptr->page);
+#endif /* JRM */
+ /* If there is a refresh callback, use it to minimize
+ * overhead.
+ *
+ * At present, the only refresh call is for the
+ * superblock. This is essential, as the superblock
+ * is manually pinned for as long as the file is open,
+ * and thus cannot be evicted.
+ *
+ * there may be other examples of this, but for the
+ * prototype, we seem to be able to avoid them.
+ */
+
+ /* 1) Get the on disk size of the entry. Since the
+ * the entry is already loaded, we can use the
+ * size listed in the entry.
+ *
+ * This will almost always be correct, but we
+ * allow a second try as it is possible that the
+ * version of the entry may change on the writer.
+ */
+ image_len = entry_ptr->size;
+ original_image_len = image_len;
+
+ /* 2) Allocate and read the buffer.
+ *
+ * Note that this will be satisfied from the metadata
+                     * file via the VFD SWMR reader VFD.
+ *
+                     * For this reason, we don't need to check for reads
+ * past the EOA. Torn reads and checksums are also
+ * not an issue, since pages in the metadata file
+                     * are checksummed and re-tried if necessary in the
+ * VFD SWMR reader VFD.
+ */
+ if ( NULL == (image_ptr = (uint8_t *)
+ H5MM_malloc(image_len + H5C_IMAGE_EXTRA_SPACE)) )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, \
+ "memory allocation failed for image buffer")
+
+#if H5C_DO_MEMORY_SANITY_CHECKS
+ HDmemcpy(image_ptr + image_len, H5C_IMAGE_SANITY_VALUE,
+ H5C_IMAGE_EXTRA_SPACE);
+#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
+
+ if ( H5F_block_read(f, entry_ptr->type->mem_type,
+ entry_ptr->addr,
+ image_len, image_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, \
+ "Can't read image (1)")
+
+ /* 3) Call the refresh callback. If it doesn't
+ * request a different image size, goto 6)
+ */
+ if ( entry_ptr->type->refresh(f, (void *)entry_ptr,
+ image_ptr, &image_len) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, FAIL, \
+ "Can't refresh entry (1)")
+
+ if ( image_len != original_image_len ) {
+
+ /* 4) If image_len has changed, re-allocate and re-read
+ * the image.
+ *
+ * Note: Generate a log entry in this case
+ */
+
+ if ( NULL == (new_image_ptr = H5MM_realloc(image_ptr,
+ image_len + H5C_IMAGE_EXTRA_SPACE)) )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, \
+ "re-alloc of image buffer failed.")
+
+ image_ptr = new_image_ptr;
+
+#if H5C_DO_MEMORY_SANITY_CHECKS
+ HDmemcpy(image_ptr + image_len, H5C_IMAGE_SANITY_VALUE,
+ H5C_IMAGE_EXTRA_SPACE);
+#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
+
+ if ( H5F_block_read(f, entry_ptr->type->mem_type,
+ entry_ptr->addr,
+ image_len, image_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, \
+ "Can't read image (2)")
+
+ /* 5) Call the refresh callback again. Requesting
+ * a different buffer size again is an error.
+ */
+ original_image_len = image_len;
+ if ( entry_ptr->type->refresh(f, (void *)entry_ptr,
+ image_ptr,
+ &image_len) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, FAIL, \
+ "Can't refresh entry (2)")
+
+ if ( image_len != original_image_len )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "2nd refresh call changed image_len.")
+ }
+
+ /* 6) Mark the entry as having been looked at this
+                     * tick to accommodate later sanity checks.
+ */
+ entry_ptr->refreshed_in_tick = tick;
+
+ /* 7) Free the old image if it exists, and replace
+ * it with the new image.
+ */
+ if ( entry_ptr->image_ptr ) {
+
+ entry_ptr->image_ptr = H5MM_xfree(entry_ptr->image_ptr);
+ }
+ entry_ptr->image_ptr = image_ptr;
+
+ /* 8) Since *entry_ptr has been refreshed and not
+ * evicted, we can leave entry_ptr defined, and
+                     * continue the scan of the bucket from
+ * that point.
+ */
+
+ } else {
+
+ /* The entry is pinned, is not tagged, and has no
+ * refresh callback.
+ *
+ * This should be un-reachable. If it is reached, we
+ * probably have another refresh callback to write.
+ */
+ HDassert(FALSE);
+ }
+ } else { /* simply evict the entry */
+
+ /* since the entry is clean, it must not be on the
+ * skip list -- thus no need for the
+ * H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG.
+ */
+#if 0 /* JRM */
+ if ( entry_ptr->tag_info ) {
+
+ HDfprintf(stderr,
+ "evicting entry addr/page/tag == %lld/%lld/%lld\n",
+ entry_ptr->addr, entry_ptr->page,
+ entry_ptr->tag_info->tag);
+ } else {
+ HDfprintf(stderr,
+ "evicting entry addr/page == %lld/%lld no tag\n",
+ entry_ptr->addr, entry_ptr->page);
+ }
+#endif /* JRM */
+ if ( H5C__flush_single_entry(f, entry_ptr, flush_flags) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTEXPUNGE, FAIL, \
+ "can't evict unpinned entry")
+
+ /* *entry_ptr should be evicted -- set entry_ptr to NULL */
+ entry_ptr = NULL;
+ }
+
+ /* If entry_ptr is NULL, it was evicted, and we must continue
+ * the scan from follow_ptr, or start at the head of the
+             * bucket list if follow_ptr is NULL as well.
+ *
+ * If follow_ptr isn't NULL, set entry_ptr to follow_ptr->pi_next.
+ * Otherwise, set entry_ptr to point to the first item in the hash
+ * bucket.
+ */
+ if ( entry_ptr ) {
+
+ /* *entry_ptr was refreshed, not evicted. Continue the
+ * the scan from that point, and update follow_ptr.
+ */
+ follow_ptr = entry_ptr;
+ entry_ptr = entry_ptr->pi_next;
+
+ } else if ( follow_ptr ) {
+
+ /* *entry_ptr was evicted. Since follow_ptr is not NULL,
+ * we can continue the scan from that point.
+ */
+ entry_ptr = follow_ptr->pi_next;
+
+ } else {
+
+ /* follow_ptr is null as well, so we have to re-start
+ * the scan from the head of the page index bucket list.
+ */
+
+ entry_ptr = (cache_ptr->page_index)[i];
+ }
+ } else {
+
+ /* entry belongs to another page -- skip it and go on. */
+ follow_ptr = entry_ptr;
+ entry_ptr = entry_ptr->pi_next;
+ }
+ } /* end while */
+
+ /* at this point, all entries residing in the target page should have
+ * been either evicted or refreshed -- verify this.
+ */
+ entry_ptr = (cache_ptr->page_index)[i];
+
+ while (entry_ptr) {
+
+ HDassert((entry_ptr->page != page) ||
+ (entry_ptr->refreshed_in_tick == tick));;
+
+ entry_ptr = entry_ptr->pi_next;
+ }
+
+ if (!found) {
+ hlog_fast(mdc_invalidation, "no MDC match for page %" PRIu64
+ " length %" PRIu32 " tick %" PRIu64, page, length, tick);
+ }
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5C_evict_or_refresh_all_entries_in_page() */
+
+
+/*-------------------------------------------------------------------------
* Function: H5C_expunge_entry
*
* Purpose: Use this function to tell the cache to expunge an entry
@@ -937,7 +1337,8 @@ H5C_expunge_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr, unsigned flag
{
H5C_t * cache_ptr;
H5C_cache_entry_t * entry_ptr = NULL;
- unsigned flush_flags = (H5C__FLUSH_INVALIDATE_FLAG | H5C__FLUSH_CLEAR_ONLY_FLAG);
+ unsigned flush_flags = (H5C__FLUSH_INVALIDATE_FLAG |
+ H5C__FLUSH_CLEAR_ONLY_FLAG);
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(FAIL)
@@ -1249,7 +1650,8 @@ H5C_insert_entry(H5F_t * f,
hbool_t insert_pinned;
hbool_t flush_last;
#ifdef H5_HAVE_PARALLEL
- hbool_t coll_access = FALSE; /* whether access to the cache entry is done collectively */
+ hbool_t coll_access = FALSE; /* whether access to the cache */
+ /* entry is done collectively */
#endif /* H5_HAVE_PARALLEL */
hbool_t set_flush_marker;
hbool_t write_permitted = TRUE;
@@ -1268,6 +1670,11 @@ H5C_insert_entry(H5F_t * f,
HDassert( cache_ptr );
HDassert( cache_ptr->magic == H5C__H5C_T_MAGIC );
+
+ /* if this is a VFD SWMR reader, verify that the page size is defined */
+ HDassert( ( ! cache_ptr->vfd_swmr_reader ) ||
+ ( cache_ptr->page_size > 0 ) );
+
HDassert( type );
HDassert( type->mem_type == cache_ptr->class_table_ptr[type->id]->mem_type );
HDassert( type->image_len );
@@ -1372,25 +1779,39 @@ H5C_insert_entry(H5F_t * f,
#endif /* H5_HAVE_PARALLEL */
/* initialize cache image related fields */
- entry_ptr->include_in_image = FALSE;
- entry_ptr->lru_rank = 0;
- entry_ptr->image_dirty = FALSE;
- entry_ptr->fd_parent_count = 0;
- entry_ptr->fd_parent_addrs = NULL;
- entry_ptr->fd_child_count = 0;
- entry_ptr->fd_dirty_child_count = 0;
- entry_ptr->image_fd_height = 0;
- entry_ptr->prefetched = FALSE;
- entry_ptr->prefetch_type_id = 0;
- entry_ptr->age = 0;
- entry_ptr->prefetched_dirty = FALSE;
+ entry_ptr->include_in_image = FALSE;
+ entry_ptr->lru_rank = 0;
+ entry_ptr->image_dirty = FALSE;
+ entry_ptr->fd_parent_count = 0;
+ entry_ptr->fd_parent_addrs = NULL;
+ entry_ptr->fd_child_count = 0;
+ entry_ptr->fd_dirty_child_count = 0;
+ entry_ptr->image_fd_height = 0;
+ entry_ptr->prefetched = FALSE;
+ entry_ptr->prefetch_type_id = 0;
+ entry_ptr->age = 0;
+ entry_ptr->prefetched_dirty = FALSE;
#ifndef NDEBUG /* debugging field */
- entry_ptr->serialization_count = 0;
+ entry_ptr->serialization_count = 0;
#endif /* NDEBUG */
- entry_ptr->tl_next = NULL;
- entry_ptr->tl_prev = NULL;
- entry_ptr->tag_info = NULL;
+ /* initialize tag list fields */
+ entry_ptr->tl_next = NULL;
+ entry_ptr->tl_prev = NULL;
+ entry_ptr->tag_info = NULL;
+
+ /* initialize fields supporting VFD SWMR */
+ if ( cache_ptr->vfd_swmr_reader ) {
+
+ entry_ptr->page = (addr / cache_ptr->page_size);
+
+ } else {
+
+ entry_ptr->page = 0;
+ }
+ entry_ptr->refreshed_in_tick = 0;
+ entry_ptr->pi_next = NULL;
+ entry_ptr->pi_prev = NULL;
/* Apply tag to newly inserted entry */
if(H5C__tag_entry(cache_ptr, entry_ptr) < 0)
@@ -1399,36 +1820,60 @@ H5C_insert_entry(H5F_t * f,
H5C__RESET_CACHE_ENTRY_STATS(entry_ptr)
- if(cache_ptr->flash_size_increase_possible &&
- (entry_ptr->size > cache_ptr->flash_size_increase_threshold))
- if(H5C__flash_increase_cache_size(cache_ptr, 0, entry_ptr->size) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTINS, FAIL, "H5C__flash_increase_cache_size failed")
+ if ( cache_ptr->flash_size_increase_possible &&
+ ( entry_ptr->size > cache_ptr->flash_size_increase_threshold ) ) {
+
+ if ( H5C__flash_increase_cache_size(cache_ptr, 0, entry_ptr->size) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTINS, FAIL, \
+ "H5C__flash_increase_cache_size failed")
+ }
+
+ if(cache_ptr->index_size >= cache_ptr->max_cache_size) {
- if(cache_ptr->index_size >= cache_ptr->max_cache_size)
empty_space = 0;
- else
+
+ } else {
+
empty_space = cache_ptr->max_cache_size - cache_ptr->index_size;
+ }
- if(cache_ptr->evictions_enabled &&
- (((cache_ptr->index_size + entry_ptr->size) > cache_ptr->max_cache_size)
+ if ( ( cache_ptr->evictions_enabled ) &&
+ ( ( (cache_ptr->index_size + entry_ptr->size) >
+ cache_ptr->max_cache_size
+ )
||
- (((empty_space + cache_ptr->clean_index_size) < cache_ptr->min_clean_size)))) {
+ ( (empty_space + cache_ptr->clean_index_size) <
+ cache_ptr->min_clean_size
+ )
+ )
+ ) {
size_t space_needed;
- if(empty_space <= entry_ptr->size)
+ if ( empty_space <= entry_ptr->size ) {
+
cache_ptr->cache_full = TRUE;
+ }
+
+ if ( cache_ptr->check_write_permitted != NULL ) {
+
+ if ( ( cache_ptr->check_write_permitted)(f, &write_permitted) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTINS, FAIL, \
+ "Can't get write_permitted")
+
+ } else {
- if(cache_ptr->check_write_permitted != NULL) {
- if((cache_ptr->check_write_permitted)(f, &write_permitted) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTINS, FAIL, "Can't get write_permitted")
- } /* end if */
- else
write_permitted = cache_ptr->write_permitted;
+ }
HDassert(entry_ptr->size <= H5C_MAX_ENTRY_SIZE);
space_needed = entry_ptr->size;
- if(space_needed > cache_ptr->max_cache_size)
+
+ if ( space_needed > cache_ptr->max_cache_size ) {
+
space_needed = cache_ptr->max_cache_size;
+ }
/* Note that space_needed is just the amount of space that
* needed to insert the new entry without exceeding the cache
@@ -1455,8 +1900,10 @@ H5C_insert_entry(H5F_t * f,
* no point in worrying about the third.
*/
- if(H5C__make_space_in_cache(f, space_needed, write_permitted) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTINS, FAIL, "H5C__make_space_in_cache failed")
+ if ( H5C__make_space_in_cache(f, space_needed, write_permitted) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTINS, FAIL, \
+ "H5C__make_space_in_cache failed")
} /* end if */
H5C__INSERT_IN_INDEX(cache_ptr, entry_ptr, FAIL)
@@ -1471,30 +1918,65 @@ H5C_insert_entry(H5F_t * f,
if((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
(H5C_validate_pinned_entry_list(cache_ptr) < 0) ||
(H5C_validate_lru_list(cache_ptr) < 0))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed just before done")
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "an extreme sanity check failed just before done")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
/* If the entry's type has a 'notify' callback send a 'after insertion'
* notice now that the entry is fully integrated into the cache.
*/
- if(entry_ptr->type->notify &&
- (entry_ptr->type->notify)(H5C_NOTIFY_ACTION_AFTER_INSERT, entry_ptr) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTNOTIFY, FAIL, "can't notify client about entry inserted into cache")
+ if ( ( entry_ptr->type->notify ) &&
+ ( (entry_ptr->type->notify)(H5C_NOTIFY_ACTION_AFTER_INSERT,
+ entry_ptr) < 0 ) )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTNOTIFY, FAIL, \
+ "can't notify client about entry inserted into cache")
H5C__UPDATE_STATS_FOR_INSERTION(cache_ptr, entry_ptr)
#ifdef H5_HAVE_PARALLEL
- if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
- coll_access = H5CX_get_coll_metadata_read();
+ if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
+
+ coll_access = (H5P_USER_TRUE == f->coll_md_read ? TRUE : FALSE);
+
+ /* If not explicitly disabled, get the cmdr setting from the
+ * API context
+ */
+ if(!coll_access && H5P_FORCE_FALSE != f->coll_md_read) {
+
+ coll_access = H5CX_get_coll_metadata_read();
+ }
+ } /* end if */
entry_ptr->coll_access = coll_access;
+
if(coll_access) {
H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, FAIL)
- /* Make sure the size of the collective entries in the cache remain in check */
- if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
- if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
+ /* Make sure the size of the collective entries in the cache
+ * remain in check
+ */
+ if(H5P_USER_TRUE == f->coll_md_read) {
+
+ if ( cache_ptr->max_cache_size * 80 <
+ cache_ptr->coll_list_size * 100) {
+
+ if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \
+ "can't clear collective metadata entries")
+ } /* end if */
+ } /* end if */
+ else {
+ if ( cache_ptr->max_cache_size * 40 <
+ cache_ptr->coll_list_size * 100) {
+
+ if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \
+ "can't clear collective metadata entries")
+ } /* end if */
+ } /* end else */
} /* end if */
#endif
@@ -1503,14 +1985,17 @@ done:
if((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
(H5C_validate_pinned_entry_list(cache_ptr) < 0) ||
(H5C_validate_lru_list(cache_ptr) < 0))
- HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on exit")
+ HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "an extreme sanity check failed on exit")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
if(ret_value < 0 && entry_tagged)
if(H5C__untag_entry(cache_ptr, entry_ptr) < 0)
- HDONE_ERROR(H5E_CACHE, H5E_CANTREMOVE, FAIL, "can't remove entry from tag list")
+ HDONE_ERROR(H5E_CACHE, H5E_CANTREMOVE, FAIL, \
+ "can't remove entry from tag list")
FUNC_LEAVE_NOAPI(ret_value)
+
} /* H5C_insert_entry() */
@@ -1806,6 +2291,10 @@ done:
* Programmer: John Mainzer
* 6/2/04
*
+ * Changes: Added code to update cache entry page field required
+ * by VFD SWMR.
+ * JRM -- 12/13/18
+ *
*-------------------------------------------------------------------------
*/
herr_t
@@ -1816,29 +2305,38 @@ H5C_move_entry(H5C_t * cache_ptr,
{
H5C_cache_entry_t * entry_ptr = NULL;
H5C_cache_entry_t * test_entry_ptr = NULL;
- herr_t ret_value = SUCCEED; /* Return value */
+ herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(FAIL)
HDassert(cache_ptr);
HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
+
+ /* if this is a VFD SWMR reader, verify that the page size is defined */
+ HDassert(( ! cache_ptr->vfd_swmr_reader ) ||
+ ( cache_ptr->page_size > 0 ) );
+
HDassert(type);
HDassert(H5F_addr_defined(old_addr));
HDassert(H5F_addr_defined(new_addr));
HDassert(H5F_addr_ne(old_addr, new_addr));
#if H5C_DO_EXTREME_SANITY_CHECKS
- if((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C_validate_pinned_entry_list(cache_ptr) < 0) ||
- (H5C_validate_lru_list(cache_ptr) < 0))
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on entry")
+ if ( ( H5C_validate_protected_entry_list(cache_ptr) < 0 ) ||
+ ( H5C_validate_pinned_entry_list(cache_ptr) < 0 ) ||
+ ( H5C_validate_lru_list(cache_ptr) < 0 ) )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "an extreme sanity check failed on entry")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
H5C__SEARCH_INDEX(cache_ptr, old_addr, entry_ptr, FAIL)
- if(entry_ptr == NULL || entry_ptr->type != type)
+ if ( ( entry_ptr == NULL ) || ( entry_ptr->type != type ) ) {
+
/* the old item doesn't exist in the cache, so we are done. */
HGOTO_DONE(SUCCEED)
+ }
HDassert(entry_ptr->addr == old_addr);
HDassert(entry_ptr->type == type);
@@ -1847,16 +2345,21 @@ H5C_move_entry(H5C_t * cache_ptr,
/* (Moving a R/O entry would mark it dirty, which shouldn't
* happen. QAK - 2016/12/02)
*/
- if(entry_ptr->is_read_only)
+ if ( entry_ptr->is_read_only )
HGOTO_ERROR(H5E_CACHE, H5E_CANTMOVE, FAIL, "can't move R/O entry")
H5C__SEARCH_INDEX(cache_ptr, new_addr, test_entry_ptr, FAIL)
- if(test_entry_ptr != NULL) { /* we are hosed */
- if(test_entry_ptr->type == type)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTMOVE, FAIL, "target already moved & reinserted???")
+ if ( test_entry_ptr != NULL ) { /* we are hosed */
+
+ if ( test_entry_ptr->type == type )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTMOVE, FAIL, \
+ "target already moved & reinserted???")
else
- HGOTO_ERROR(H5E_CACHE, H5E_CANTMOVE, FAIL, "new address already in use?")
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTMOVE, FAIL, \
+ "new address already in use?")
} /* end if */
/* If we get this far we have work to do. Remove *entry_ptr from
@@ -1874,10 +2377,12 @@ H5C_move_entry(H5C_t * cache_ptr,
* change the addr. If the entry is only in the process of being flushed,
* don't mark it as dirty either, lest we confuse the flush call back.
*/
- if(!entry_ptr->destroy_in_progress) {
+ if ( ! entry_ptr->destroy_in_progress ) {
+
H5C__DELETE_FROM_INDEX(cache_ptr, entry_ptr, FAIL)
- if(entry_ptr->in_slist) {
+ if ( entry_ptr->in_slist ) {
+
HDassert(cache_ptr->slist_ptr);
H5C__REMOVE_ENTRY_FROM_SLIST(cache_ptr, entry_ptr, FALSE)
} /* end if */
@@ -1885,8 +2390,18 @@ H5C_move_entry(H5C_t * cache_ptr,
entry_ptr->addr = new_addr;
- if(!entry_ptr->destroy_in_progress) {
- hbool_t was_dirty; /* Whether the entry was previously dirty */
+ /* update the page in which the entry resides if the file is opened
+ * as a VFD SWMR reader.
+ */
+ if ( cache_ptr->vfd_swmr_reader ) {
+
+ entry_ptr->page = (new_addr / cache_ptr->page_size);
+
+ }
+
+ if ( ! entry_ptr->destroy_in_progress ) {
+
+ hbool_t was_dirty; /* Whether the entry was previously dirty */
/* Remember previous dirty status */
was_dirty = entry_ptr->is_dirty;
@@ -1895,11 +2410,17 @@ H5C_move_entry(H5C_t * cache_ptr,
entry_ptr->is_dirty = TRUE;
/* This shouldn't be needed, but it keeps the test code happy */
- if(entry_ptr->image_up_to_date) {
+ if ( entry_ptr->image_up_to_date ) {
+
entry_ptr->image_up_to_date = FALSE;
- if(entry_ptr->flush_dep_nparents > 0)
- if(H5C__mark_flush_dep_unserialized(entry_ptr) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTNOTIFY, FAIL, "Can't propagate serialization status to fd parents")
+
+ if ( entry_ptr->flush_dep_nparents > 0 ) {
+
+ if ( H5C__mark_flush_dep_unserialized(entry_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTNOTIFY, FAIL, \
+ "Can't propagate serialization status to fd parents")
+ }
} /* end if */
/* Modify cache data structures */
@@ -1907,23 +2428,35 @@ H5C_move_entry(H5C_t * cache_ptr,
H5C__INSERT_ENTRY_IN_SLIST(cache_ptr, entry_ptr, FAIL)
/* Skip some actions if we're in the middle of flushing the entry */
- if(!entry_ptr->flush_in_progress) {
+ if ( !entry_ptr->flush_in_progress ) {
+
/* Update the replacement policy for the entry */
H5C__UPDATE_RP_FOR_MOVE(cache_ptr, entry_ptr, was_dirty, FAIL)
/* Check for entry changing status and do notifications, etc. */
if(!was_dirty) {
- /* If the entry's type has a 'notify' callback send a 'entry dirtied'
- * notice now that the entry is fully integrated into the cache.
+
+ /* If the entry's type has a 'notify' callback send a 'entry
+ * dirtied' notice now that the entry is fully integrated
+ * into the cache.
*/
- if(entry_ptr->type->notify &&
- (entry_ptr->type->notify)(H5C_NOTIFY_ACTION_ENTRY_DIRTIED, entry_ptr) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTNOTIFY, FAIL, "can't notify client about entry dirty flag set")
+ if ( ( entry_ptr->type->notify ) &&
+ ( (entry_ptr->type->notify)
+ (H5C_NOTIFY_ACTION_ENTRY_DIRTIED, entry_ptr) < 0 ) )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTNOTIFY, FAIL, \
+ "can't notify client about entry dirty flag set")
- /* Propagate the dirty flag up the flush dependency chain if appropriate */
- if(entry_ptr->flush_dep_nparents > 0)
- if(H5C__mark_flush_dep_dirty(entry_ptr) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTMARKDIRTY, FAIL, "Can't propagate flush dep dirty flag")
+ /* Propagate the dirty flag up the flush dependency chain
+ * if appropriate
+ */
+ if ( entry_ptr->flush_dep_nparents > 0 ) {
+
+ if ( H5C__mark_flush_dep_dirty(entry_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTMARKDIRTY, FAIL, \
+ "Can't propagate flush dep dirty flag")
+ }
} /* end if */
} /* end if */
} /* end if */
@@ -1931,14 +2464,18 @@ H5C_move_entry(H5C_t * cache_ptr,
H5C__UPDATE_STATS_FOR_MOVE(cache_ptr, entry_ptr)
done:
+
#if H5C_DO_EXTREME_SANITY_CHECKS
- if((H5C_validate_protected_entry_list(cache_ptr) < 0) ||
- (H5C_validate_pinned_entry_list(cache_ptr) < 0) ||
- (H5C_validate_lru_list(cache_ptr) < 0))
- HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "an extreme sanity check failed on exit")
+ if ( ( H5C_validate_protected_entry_list(cache_ptr) < 0 ) ||
+ ( H5C_validate_pinned_entry_list(cache_ptr) < 0 ) ||
+ ( H5C_validate_lru_list(cache_ptr) < 0 ) )
+
+ HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "an extreme sanity check failed on exit")
#endif /* H5C_DO_EXTREME_SANITY_CHECKS */
FUNC_LEAVE_NOAPI(ret_value)
+
} /* H5C_move_entry() */
@@ -2879,6 +3416,43 @@ done:
/*-------------------------------------------------------------------------
+ * Function: H5C_set_vfd_swmr_reader()
+ *
+ * Purpose: Set cache_ptr->vfd_swmr_reader and cache_ptr->page_size to
+ * the values specified in the parameter list.
+ *
+ * Return: SUCCEED on success, and FAIL on failure.
+ *
+ * Programmer: John Mainzer
+ * 1/15/19
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5C_set_vfd_swmr_reader(H5C_t *cache_ptr, hbool_t vfd_swmr_reader,
+ hsize_t page_size)
+{
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ if((cache_ptr == NULL) || (cache_ptr->magic != H5C__H5C_T_MAGIC))
+
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Bad cache_ptr on entry")
+
+ cache_ptr->vfd_swmr_reader = vfd_swmr_reader;
+ cache_ptr->page_size = page_size;
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5C_set_vfd_swmr_reader() */
+
+
+/*-------------------------------------------------------------------------
* Function: H5C_unpin_entry()
*
* Purpose: Unpin a cache entry. The entry can be either protected or
@@ -6387,14 +6961,24 @@ H5C__flush_single_entry(H5F_t *f, H5C_cache_entry_t *entry_ptr, unsigned flags)
/* Check if we have to update the page buffer with cleared entries
* so it doesn't go out of date
*/
+
+ /* VFD SWMR TODO: Think on this, and decide if we need to extend
+ * this for multi page metadata entries.
+ */
if(update_page_buffer) {
/* Sanity check */
HDassert(!destroy);
HDassert(entry_ptr->image_ptr);
- if(f->shared->page_buf && f->shared->page_buf->page_size >= entry_ptr->size)
- if(H5PB_update_entry(f->shared->page_buf, entry_ptr->addr, entry_ptr->size, entry_ptr->image_ptr) > 0)
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Failed to update PB with metadata cache")
+ if ( ( f->shared->pb_ptr ) &&
+ ( f->shared->pb_ptr->page_size >= entry_ptr->size ) ) {
+
+ if ( H5PB_update_entry(f->shared->pb_ptr, entry_ptr->addr,
+ entry_ptr->size, entry_ptr->image_ptr) > 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "Failed to update PB with metadata cache")
+ }
} /* end if */
if(cache_ptr->log_flush)
@@ -6491,6 +7075,14 @@ done:
*
* Programmer: John Mainzer, 5/18/04
*
+ * Changes: Please maintain the change list and do not delete entries
+ * unless they have been folded into the header comment.
+ *
+ * Reverted optimization that avoided re-reading the prefix
+ * of a metadata entry when a speculative read proved too
+ * small.
+ * JRM -- 3/25/20
+ *
*-------------------------------------------------------------------------
*/
static void *
@@ -6502,17 +7094,22 @@ H5C_load_entry(H5F_t * f,
haddr_t addr,
void * udata)
{
- hbool_t dirty = FALSE; /* Flag indicating whether thing was dirtied during deserialize */
- uint8_t * image = NULL; /* Buffer for disk image */
- void * thing = NULL; /* Pointer to thing loaded */
- H5C_cache_entry_t *entry = NULL; /* Alias for thing loaded, as cache entry */
- size_t len; /* Size of image in file */
+ hbool_t dirty = FALSE; /* Flag indicating whether thing */
+ /* was dirtied during deserialize */
+ uint8_t * image = NULL; /* Buffer for disk image */
+ void * thing = NULL; /* Pointer to thing loaded */
+ H5C_cache_entry_t *entry = NULL; /* Alias for thing loaded, as */
+ /* cache entry */
+#if 0
+ size_t init_len;
+#endif
+ size_t len; /* Size of image in file */
#ifdef H5_HAVE_PARALLEL
- int mpi_rank = 0; /* MPI process rank */
- MPI_Comm comm = MPI_COMM_NULL; /* File MPI Communicator */
- int mpi_code; /* MPI error code */
+ int mpi_rank = 0; /* MPI process rank */
+ MPI_Comm comm = MPI_COMM_NULL; /* File MPI Communicator */
+ int mpi_code; /* MPI error code */
#endif /* H5_HAVE_PARALLEL */
- void * ret_value = NULL; /* Return value */
+ void * ret_value = NULL; /* Return value */
FUNC_ENTER_NOAPI_NOINIT
@@ -6520,13 +7117,25 @@ H5C_load_entry(H5F_t * f,
HDassert(f);
HDassert(f->shared);
HDassert(f->shared->cache);
+ HDassert(f->shared->cache->magic == H5C__H5C_T_MAGIC );
+
+ /* if this is a VFD SWMR reader, verify that the page size is defined */
+ HDassert( ( ! f->shared->cache->vfd_swmr_reader ) ||
+ ( f->shared->cache->page_size > 0 ) );
+
HDassert(type);
HDassert(H5F_addr_defined(addr));
HDassert(type->get_initial_load_size);
- if(type->flags & H5C__CLASS_SPECULATIVE_LOAD_FLAG)
+
+ if ( type->flags & H5C__CLASS_SPECULATIVE_LOAD_FLAG ) {
+
HDassert(type->get_final_load_size);
- else
+
+ } else {
+
HDassert(NULL == type->get_final_load_size);
+ }
+
HDassert(type->deserialize);
/* Can't see how skip reads could be usefully combined with
@@ -6535,44 +7144,64 @@ H5C_load_entry(H5F_t * f,
HDassert(!((type->flags & H5C__CLASS_SKIP_READS) &&
(type->flags & H5C__CLASS_SPECULATIVE_LOAD_FLAG)));
- /* Call the get_initial_load_size callback, to retrieve the initial size of image */
- if(type->get_initial_load_size(udata, &len) < 0)
+ /* Call the get_initial_load_size callback, to retrieve the initial
+ * size of image
+ */
+ if ( type->get_initial_load_size(udata, &len) < 0 )
+
HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't retrieve image size")
+
HDassert(len > 0);
+#if 0
+ init_len = len;
+#endif
+
/* Check for possible speculative read off the end of the file */
- if(type->flags & H5C__CLASS_SPECULATIVE_LOAD_FLAG)
- if(H5C__verify_len_eoa(f, type, addr, &len, FALSE) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_BADVALUE, NULL, "invalid len with respect to EOA")
+ if ( type->flags & H5C__CLASS_SPECULATIVE_LOAD_FLAG ) {
+
+ if ( H5C__verify_len_eoa(f, type, addr, &len, FALSE) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_BADVALUE, NULL, \
+ "invalid len with respect to EOA")
+ }
/* Allocate the buffer for reading the on-disk entry image */
- if(NULL == (image = (uint8_t *)H5MM_malloc(len + H5C_IMAGE_EXTRA_SPACE)))
- HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, NULL, "memory allocation failed for on disk image buffer")
+ if ( NULL == (image = (uint8_t *)H5MM_malloc(len + H5C_IMAGE_EXTRA_SPACE)) )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, NULL, \
+ "memory allocation failed for on disk image buffer")
+
#if H5C_DO_MEMORY_SANITY_CHECKS
H5MM_memcpy(image + len, H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE);
#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
#ifdef H5_HAVE_PARALLEL
- if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
- if((mpi_rank = H5F_mpi_get_rank(f)) < 0)
+ if ( H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI) ) {
+
+ if ( (mpi_rank = H5F_mpi_get_rank(f)) < 0 )
+
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "Can't get MPI rank")
- if((comm = H5F_mpi_get_comm(f)) == MPI_COMM_NULL)
+
+ if ( (comm = H5F_mpi_get_comm(f)) == MPI_COMM_NULL )
+
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "get_comm request failed")
+
} /* end if */
#endif /* H5_HAVE_PARALLEL */
/* Get the on-disk entry image */
- if(0 == (type->flags & H5C__CLASS_SKIP_READS)) {
- unsigned tries, max_tries; /* The # of read attempts */
- unsigned retries; /* The # of retries */
- htri_t chk_ret; /* return from verify_chksum callback */
- size_t actual_len = len; /* The actual length, after speculative reads have been resolved */
- uint64_t nanosec = 1; /* # of nanoseconds to sleep between retries */
- void *new_image; /* Pointer to image */
- hbool_t len_changed = TRUE; /* Whether to re-check speculative entries */
-
- /* Get the # of read attempts */
- max_tries = tries = H5F_GET_READ_ATTEMPTS(f);
+ if ( 0 == (type->flags & H5C__CLASS_SKIP_READS) ) {
+
+ unsigned tries; /* The # of retries */
+ htri_t chk_ret; /* return from verify_chksum callback */
+ size_t actual_len = len; /* The actual length, after speculative */
+ /* reads have been resolved */
+ void *new_image; /* Pointer to image */
+ hbool_t len_changed = TRUE; /* Whether to re-check speculative */
+ /* entries */
+ bool do_try;
+ h5_retry_t retry;
/*
* This do/while loop performs the following till the metadata checksum
@@ -6581,32 +7210,48 @@ H5C_load_entry(H5F_t * f,
* --determine the actual size of the metadata
* --perform checksum verification
*/
- do {
- if(actual_len != len) {
- if(NULL == (new_image = H5MM_realloc(image, len + H5C_IMAGE_EXTRA_SPACE)))
- HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, NULL, "image null after H5MM_realloc()")
+ for (do_try = h5_retry_init(&retry, H5F_GET_READ_ATTEMPTS(f),
+ 1, H5_RETRY_ONE_HOUR / 3600 / 100);
+ do_try;
+ do_try = h5_retry_next(&retry)) {
+ if ( actual_len != len ) {
+
+ if ( NULL == (new_image = H5MM_realloc(image,
+ len + H5C_IMAGE_EXTRA_SPACE)) )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, NULL, \
+ "image null after H5MM_realloc()")
+
image = (uint8_t *)new_image;
+
#if H5C_DO_MEMORY_SANITY_CHECKS
- H5MM_memcpy(image + len, H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE);
+ H5MM_memcpy(image + len, H5C_IMAGE_SANITY_VALUE,
+ H5C_IMAGE_EXTRA_SPACE);
#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
} /* end if */
#ifdef H5_HAVE_PARALLEL
- if(!coll_access || 0 == mpi_rank) {
+ if ( !coll_access || 0 == mpi_rank ) {
#endif /* H5_HAVE_PARALLEL */
- if(H5F_block_read(f, type->mem_type, addr, len, image) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
+
+ if ( H5F_block_read(f, type->mem_type, addr, len, image) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, \
+ "Can't read image*")
#ifdef H5_HAVE_PARALLEL
} /* end if */
/* if the collective metadata read optimization is turned on,
* bcast the metadata read from process 0 to all ranks in the file
* communicator
*/
- if(coll_access) {
+ if ( coll_access ) {
+
int buf_size;
H5_CHECKED_ASSIGN(buf_size, int, len, size_t);
- if(MPI_SUCCESS != (mpi_code = MPI_Bcast(image, buf_size, MPI_BYTE, 0, comm)))
+ if ( MPI_SUCCESS !=
+ (mpi_code = MPI_Bcast(image, buf_size, MPI_BYTE, 0, comm)))
+
HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code)
} /* end if */
#endif /* H5_HAVE_PARALLEL */
@@ -6614,46 +7259,118 @@ H5C_load_entry(H5F_t * f,
/* If the entry could be read speculatively and the length is still
* changing, check for updating the actual size
*/
- if((type->flags & H5C__CLASS_SPECULATIVE_LOAD_FLAG) && len_changed) {
+ if( ( type->flags & H5C__CLASS_SPECULATIVE_LOAD_FLAG ) &&
+ ( len_changed ) ) {
+
/* Retrieve the actual length */
actual_len = len;
- if(type->get_final_load_size(image, len, udata, &actual_len) < 0)
- continue; /* Transfer control to while() and count towards retries */
+ if ( type->get_final_load_size(image, len, udata,
+ &actual_len) < 0 ) {
+
+ /* Transfer control to while() and count towards retries */
+ continue;
+ }
/* Check for the length changing */
- if(actual_len != len) {
- /* Verify that the length isn't past the EOA for the file */
- if(H5C__verify_len_eoa(f, type, addr, &actual_len, TRUE) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_BADVALUE, NULL, "actual_len exceeds EOA")
+ if ( actual_len != len ) {
+
+ /* Verify that the length isn't past the EOA for
+ * the file
+ */
+ if ( H5C__verify_len_eoa(f, type, addr,
+ &actual_len, TRUE) < 0)
+
+ HGOTO_ERROR(H5E_CACHE, H5E_BADVALUE, NULL, \
+ "actual_len exceeds EOA")
/* Expand buffer to new size */
- if(NULL == (new_image = H5MM_realloc(image, actual_len + H5C_IMAGE_EXTRA_SPACE)))
- HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, NULL, "image null after H5MM_realloc()")
+ if ( NULL ==
+ (new_image = H5MM_realloc(image,
+ actual_len + H5C_IMAGE_EXTRA_SPACE)))
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, NULL, \
+ "image null after H5MM_realloc()")
+
image = (uint8_t *)new_image;
+
#if H5C_DO_MEMORY_SANITY_CHECKS
- H5MM_memcpy(image + actual_len, H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE);
+ H5MM_memcpy(image + actual_len, H5C_IMAGE_SANITY_VALUE,
+ H5C_IMAGE_EXTRA_SPACE);
#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
- if(actual_len > len) {
+ if ( actual_len > len ) {
#ifdef H5_HAVE_PARALLEL
- if(!coll_access || 0 == mpi_rank) {
+ if ( !coll_access || 0 == mpi_rank ) {
#endif /* H5_HAVE_PARALLEL */
- /* If the thing's image needs to be bigger for a speculatively
- * loaded thing, go get the on-disk image again (the extra portion).
+#if 0 /* JRM */
+ /* If the thing's image needs to be bigger for
+ * a speculatively loaded thing, go get the
+ * on-disk image again (the extra portion).
*/
- if(H5F_block_read(f, type->mem_type, addr + len, actual_len - len, image + len) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+ if ( H5F_block_read(f, type->mem_type, addr + len,
+ actual_len - len, image + len) < 0)
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, \
+ "can't read image")
+#else /* JRM */
+
+ /* the original version of this code re-read
+ * the entire buffer. At some point, someone
+ * reworked this code to avoid re-reading the
+ * initial portion of the buffer.
+ *
+ * In addition to being of questionable utility,
+ * this optimization changed the invariant that
+ * metadata is read and written atomically.
+ * While this didn't cause immediate problems,
+ * the page buffer in VFD SWMR depends on this
+ * invariant in its management of multi-page
+ * metadata entries.
+ *
+ * To repair this issue, I have reverted to
+ * the original algorithm for managing the
+ * speculative load case. Note that I have
+ * done so crudely -- before merge, we should
+ * remove the infrastructure that supports the
+ * optimization.
+ *
+ * We should also verify my impression that the
+ * that the optimization is of no measurable
+ * value. If it is, we will put it back, but
+ * disable it in the VFD SWMR case.
+ *
+ * While this issue was detected in the global
+ * heap case, note that the super block, the
+ * local heap, and the fractal heap also use
+ * speculative loads.
+ *
+ * JRM -- 3/24/20
+ */
+ if ( H5F_block_read(f, type->mem_type, addr,
+ actual_len, image) < 0)
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, \
+ "can't read image")
+#endif /* JRM */
#ifdef H5_HAVE_PARALLEL
}
- /* If the collective metadata read optimization is turned on,
- * Bcast the metadata read from process 0 to all ranks in the file
- * communicator */
- if(coll_access) {
+ /* If the collective metadata read optimization is
+ * turned on, Bcast the metadata read from process
+ * 0 to all ranks in the file communicator
+ */
+ if ( coll_access ) {
+
int buf_size;
- H5_CHECKED_ASSIGN(buf_size, int, actual_len - len, size_t);
- if(MPI_SUCCESS != (mpi_code = MPI_Bcast(image + len, buf_size, MPI_BYTE, 0, comm)))
- HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code)
+ H5_CHECKED_ASSIGN(buf_size, int, actual_len - len, \
+ size_t);
+
+ if ( MPI_SUCCESS !=
+ (mpi_code = MPI_Bcast(image + len, buf_size,
+ MPI_BYTE, 0, comm)) )
+
+ HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", \
+ mpi_code)
} /* end if */
#endif /* H5_HAVE_PARALLEL */
} /* end if */
@@ -6674,28 +7391,48 @@ H5C_load_entry(H5F_t * f,
break;
/* Verify the checksum for the metadata image */
- if((chk_ret = type->verify_chksum(image, actual_len, udata)) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "failure from verify_chksum callback")
+ if ( (chk_ret = type->verify_chksum(image, actual_len, udata)) < 0)
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, NULL, \
+ "failure from verify_chksum callback")
+
if(chk_ret == TRUE)
break;
-
- /* Sleep for some time */
- H5_nanosleep(nanosec);
- nanosec *= 2; /* Double the sleep time next time */
- } while(--tries);
+ }
/* Check for too many tries */
- if(tries == 0)
- HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "incorrect metadatda checksum after all read attempts")
+ if (!do_try) {
+#if 0 /* JRM */
+ haddr_t eoa;
+ int64_t page = (int64_t)(addr / f->shared->cache->page_size);
+
+ eoa = H5F_get_eoa(f, type->mem_type);
+
+ HDfprintf(stderr, "addr = 0x%llx, init_len = %lld, len = %lld\n",
+ (int64_t)addr, (int64_t)init_len, (int64_t)len);
+ HDfprintf(stderr, "type = %s, eoa = 0x%llx, tick = %lld\n",
+ type->name, (int64_t)eoa, f->shared->tick_num);
+ HDfprintf(stderr, "page = %lld, index_len = %d\n",
+ page, f->shared->mdf_idx_entries_used);
+ H5FD_vfd_swmr_dump_status(f->shared->lf, page);
+#endif /* JRM */
+ HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, \
+ "incorrect metadata checksum after all read attempts addr %" PRIuHADDR " size %zu", addr, len);
+ }
/* Calculate and track the # of retries */
- retries = max_tries - tries;
- if(retries) /* Does not track 0 retry */
- if(H5F_track_metadata_read_retries(f, (unsigned)type->mem_type, retries) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_BADVALUE, NULL, "cannot track read tries = %u ", retries)
+ if ((tries = h5_retry_tries(&retry)) > 1) { /* Does not track 0 retry */
+
+ if ( H5F_track_metadata_read_retries(f, (unsigned)type->mem_type,
+ tries - 1) < 0)
+
+ HGOTO_ERROR(H5E_CACHE, H5E_BADVALUE, NULL, \
+ "cannot track read tries = %u ", tries)
+ }
/* Set the final length (in case it wasn't set earlier) */
len = actual_len;
+
} /* end if !H5C__CLASS_SKIP_READS */
/* Deserialize the on-disk image into the native memory form */
@@ -6733,7 +7470,7 @@ H5C_load_entry(H5F_t * f,
entry->image_ptr = image;
entry->image_up_to_date = !dirty;
entry->type = type;
- entry->is_dirty = dirty;
+ entry->is_dirty = dirty;
entry->dirtied = FALSE;
entry->is_protected = FALSE;
entry->is_read_only = FALSE;
@@ -6793,9 +7530,23 @@ H5C_load_entry(H5F_t * f,
entry->serialization_count = 0;
#endif /* NDEBUG */
- entry->tl_next = NULL;
- entry->tl_prev = NULL;
- entry->tag_info = NULL;
+ /* initialize tag list fields */
+ entry->tl_next = NULL;
+ entry->tl_prev = NULL;
+ entry->tag_info = NULL;
+
+ /* initialize fields supporting VFD SWMR */
+ if ( f->shared->cache->vfd_swmr_reader ) {
+
+ entry->page = (addr / f->shared->cache->page_size);
+
+ } else {
+
+ entry->page = 0;
+ }
+ entry->refreshed_in_tick = 0;
+ entry->pi_next = NULL;
+ entry->pi_prev = NULL;
H5C__RESET_CACHE_ENTRY_STATS(entry);
@@ -8500,6 +9251,11 @@ done:
* Programmer: Mohamad Chaarawi
* 2/10/16
*
+ * Changes: Added code to update the page field in the VFD SWMR reader
+ * case.
+ *
+ * JRM -- 12/14/18
+ *
*-------------------------------------------------------------------------
*/
herr_t
@@ -8517,6 +9273,11 @@ H5C__generate_image(H5F_t *f, H5C_t *cache_ptr, H5C_cache_entry_t *entry_ptr)
HDassert(f);
HDassert(cache_ptr);
HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
+
+ /* if this is a VFD SWMR reader, verify that the page size is defined */
+ HDassert( ( ! cache_ptr->vfd_swmr_reader ) ||
+ ( cache_ptr->page_size > 0 ) );
+
HDassert(entry_ptr);
HDassert(entry_ptr->magic == H5C__H5C_CACHE_ENTRY_T_MAGIC);
HDassert(!entry_ptr->image_up_to_date);
@@ -8534,10 +9295,14 @@ H5C__generate_image(H5F_t *f, H5C_t *cache_ptr, H5C_cache_entry_t *entry_ptr)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to pre-serialize entry")
/* Check for any flags set in the pre-serialize callback */
- if(serialize_flags != H5C__SERIALIZE_NO_FLAGS_SET) {
+ if ( serialize_flags != H5C__SERIALIZE_NO_FLAGS_SET ) {
+
/* Check for unexpected flags from serialize callback */
- if(serialize_flags & ~(H5C__SERIALIZE_RESIZED_FLAG | H5C__SERIALIZE_MOVED_FLAG))
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unknown serialize flag(s)")
+ if ( serialize_flags & ~(H5C__SERIALIZE_RESIZED_FLAG |
+ H5C__SERIALIZE_MOVED_FLAG) )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \
+ "unknown serialize flag(s)")
#ifdef H5_HAVE_PARALLEL
/* In the parallel case, resizes and moves in
@@ -8566,28 +9331,40 @@ H5C__generate_image(H5F_t *f, H5C_t *cache_ptr, H5C_cache_entry_t *entry_ptr)
* If that ceases to be the case, further
* tests will be necessary.
*/
- if(cache_ptr->aux_ptr != NULL)
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "resize/move in serialize occurred in parallel case")
+ if ( cache_ptr->aux_ptr != NULL )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "resize/move in serialize occurred in parallel case")
#endif
/* If required, resize the buffer and update the entry and the cache
- * data structures */
- if(serialize_flags & H5C__SERIALIZE_RESIZED_FLAG) {
+ * data structures
+ */
+ if ( serialize_flags & H5C__SERIALIZE_RESIZED_FLAG ) {
+
/* Sanity check */
HDassert(new_len > 0);
/* Allocate a new image buffer */
- if(NULL == (entry_ptr->image_ptr = H5MM_realloc(entry_ptr->image_ptr, new_len + H5C_IMAGE_EXTRA_SPACE)))
- HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for on disk image buffer")
+ if ( NULL == (entry_ptr->image_ptr =
+ H5MM_realloc(entry_ptr->image_ptr,
+ new_len + H5C_IMAGE_EXTRA_SPACE)) )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, \
+ "memory allocation failed for on disk image buffer")
+
#if H5C_DO_MEMORY_SANITY_CHECKS
- H5MM_memcpy(((uint8_t *)entry_ptr->image_ptr) + new_len, H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE);
+ H5MM_memcpy(((uint8_t *)entry_ptr->image_ptr) + new_len,
+ H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE);
#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
/* Update statistics for resizing the entry */
- H5C__UPDATE_STATS_FOR_ENTRY_SIZE_CHANGE(cache_ptr, entry_ptr, new_len);
+ H5C__UPDATE_STATS_FOR_ENTRY_SIZE_CHANGE(cache_ptr, entry_ptr, \
+ new_len);
/* Update the hash table for the size change */
- H5C__UPDATE_INDEX_FOR_SIZE_CHANGE(cache_ptr, entry_ptr->size, new_len, entry_ptr, !(entry_ptr->is_dirty));
+ H5C__UPDATE_INDEX_FOR_SIZE_CHANGE(cache_ptr, entry_ptr->size, \
+ new_len, entry_ptr, !(entry_ptr->is_dirty));
/* The entry can't be protected since we are in the process of
* flushing it. Thus we must update the replacement policy data
@@ -8602,21 +9379,25 @@ H5C__generate_image(H5F_t *f, H5C_t *cache_ptr, H5C_cache_entry_t *entry_ptr)
*/
HDassert(entry_ptr->is_dirty);
HDassert(entry_ptr->in_slist);
- H5C__UPDATE_SLIST_FOR_SIZE_CHANGE(cache_ptr, entry_ptr->size, new_len);
+ H5C__UPDATE_SLIST_FOR_SIZE_CHANGE(cache_ptr, entry_ptr->size, \
+ new_len);
/* Finally, update the entry for its new size */
entry_ptr->size = new_len;
+
} /* end if */
/* If required, udate the entry and the cache data structures
* for a move
*/
- if(serialize_flags & H5C__SERIALIZE_MOVED_FLAG) {
+ if ( serialize_flags & H5C__SERIALIZE_MOVED_FLAG ) {
+
/* Update stats and entries relocated counter */
H5C__UPDATE_STATS_FOR_MOVE(cache_ptr, entry_ptr)
/* We must update cache data structures for the change in address */
if(entry_ptr->addr == old_addr) {
+
/* Delete the entry from the hash table and the slist */
H5C__DELETE_FROM_INDEX(cache_ptr, entry_ptr, FAIL);
H5C__REMOVE_ENTRY_FROM_SLIST(cache_ptr, entry_ptr, FALSE);
@@ -8624,21 +9405,37 @@ H5C__generate_image(H5F_t *f, H5C_t *cache_ptr, H5C_cache_entry_t *entry_ptr)
/* Update the entry for its new address */
entry_ptr->addr = new_addr;
+ /* In the VFD SWMR reader case, update the entry page field */
+ if ( cache_ptr->vfd_swmr_reader ) {
+
+ entry_ptr->page = (new_addr / cache_ptr->page_size);
+ }
+
/* And then reinsert in the index and slist */
H5C__INSERT_IN_INDEX(cache_ptr, entry_ptr, FAIL);
H5C__INSERT_ENTRY_IN_SLIST(cache_ptr, entry_ptr, FAIL);
- } /* end if */
- else /* move is already done for us -- just do sanity checks */
+
+ } else { /* move is already done for us -- just do sanity checks */
+
HDassert(entry_ptr->addr == new_addr);
+ HDassert(( ! cache_ptr->vfd_swmr_reader ) ||
+ ( entry_ptr->page ==
+ (entry_ptr->addr / cache_ptr->page_size) ));
+ }
} /* end if */
} /* end if(serialize_flags != H5C__SERIALIZE_NO_FLAGS_SET) */
/* Serialize object into buffer */
- if(entry_ptr->type->serialize(f, entry_ptr->image_ptr, entry_ptr->size, (void *)entry_ptr) < 0)
+ if ( entry_ptr->type->serialize(f, entry_ptr->image_ptr, entry_ptr->size,
+ (void *)entry_ptr) < 0)
+
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to serialize entry")
+
#if H5C_DO_MEMORY_SANITY_CHECKS
- HDassert(0 == HDmemcmp(((uint8_t *)entry_ptr->image_ptr) + entry_ptr->size, H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE));
+ HDassert(0 == HDmemcmp(((uint8_t *)entry_ptr->image_ptr) + entry_ptr->size,
+ H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE));
#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
+
entry_ptr->image_up_to_date = TRUE;
/* Propagate the fact that the entry is serialized up the
@@ -8648,12 +9445,19 @@ H5C__generate_image(H5F_t *f, H5C_t *cache_ptr, H5C_cache_entry_t *entry_ptr)
* for flush dependency parents.
*/
HDassert(entry_ptr->flush_dep_nunser_children == 0);
- if(entry_ptr->flush_dep_nparents > 0)
- if(H5C__mark_flush_dep_serialized(entry_ptr) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTNOTIFY, FAIL, "Can't propagate serialization status to fd parents")
+
+ if ( entry_ptr->flush_dep_nparents > 0 ) {
+
+ if ( H5C__mark_flush_dep_serialized(entry_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTNOTIFY, FAIL, \
+ "Can't propagate serialization status to fd parents")
+ }
done:
+
FUNC_LEAVE_NOAPI(ret_value)
+
} /* H5C__generate_image */
diff --git a/src/H5Cdbg.c b/src/H5Cdbg.c
index d5599f2..749d49e 100644
--- a/src/H5Cdbg.c
+++ b/src/H5Cdbg.c
@@ -289,8 +289,8 @@ H5C_dump_cache_skip_list(H5C_t * cache_ptr, char * calling_fcn)
HDassert(calling_fcn != NULL);
HDfprintf(stdout, "\n\nDumping metadata cache skip list from %s.\n", calling_fcn);
- HDfprintf(stdout, " slist len = %u.\n", cache_ptr->slist_len);
- HDfprintf(stdout, " slist size = %lld.\n", (long long)(cache_ptr->slist_size));
+ HDfprintf(stdout, " slist len = %" PRIu32 ".\n", cache_ptr->slist_len);
+ HDfprintf(stdout, " slist size = %zu.\n", cache_ptr->slist_size);
if(cache_ptr->slist_len > 0) {
/* If we get this far, all entries in the cache are listed in the
@@ -310,10 +310,10 @@ H5C_dump_cache_skip_list(H5C_t * cache_ptr, char * calling_fcn)
HDassert( entry_ptr->magic == H5C__H5C_CACHE_ENTRY_T_MAGIC );
HDfprintf(stdout,
- "%s%d 0x%016llx %4lld %d/%d %d %s\n",
+ "%s%d 0x%016" PRIxHADDR " %4zu %d/%d %d %s\n",
cache_ptr->prefix, i,
- (long long)(entry_ptr->addr),
- (long long)(entry_ptr->size),
+ entry_ptr->addr,
+ entry_ptr->size,
(int)(entry_ptr->is_protected),
(int)(entry_ptr->is_pinned),
(int)(entry_ptr->is_dirty),
@@ -408,10 +408,10 @@ H5C_dump_coll_write_list(H5C_t * cache_ptr, char * calling_fcn)
HDassert(entry_ptr->magic == H5C__H5C_CACHE_ENTRY_T_MAGIC);
HDfprintf(stdout,
- "%s%d 0x%016llx %4lld %d/%d %d %s\n",
+              "%s%d 0x%016" PRIxHADDR "  %4zu %d/%d %d %s\n",
cache_ptr->prefix, i,
- (long long)(entry_ptr->addr),
- (long long)(entry_ptr->size),
+ entry_ptr->addr,
+ entry_ptr->size,
(int)(entry_ptr->is_protected),
(int)(entry_ptr->is_pinned),
(int)(entry_ptr->is_dirty),
diff --git a/src/H5Cepoch.c b/src/H5Cepoch.c
index 6451019..e6a395f 100644
--- a/src/H5Cepoch.c
+++ b/src/H5Cepoch.c
@@ -91,20 +91,21 @@ static herr_t H5C__epoch_marker_fsf_size(const void H5_ATTR_UNUSED * thing,
const H5AC_class_t H5AC_EPOCH_MARKER[1] = {{
- /* id = */ H5AC_EPOCH_MARKER_ID,
- /* name = */ "epoch marker",
- /* mem_type = */ H5FD_MEM_DEFAULT, /* value doesn't matter */
- /* flags = */ H5AC__CLASS_NO_FLAGS_SET,
+ /* id = */ H5AC_EPOCH_MARKER_ID,
+ /* name = */ "epoch marker",
+ /* mem_type = */ H5FD_MEM_DEFAULT, /* value doesn't matter */
+ /* flags = */ H5AC__CLASS_NO_FLAGS_SET,
/* get_initial_load_size = */ H5C__epoch_marker_get_initial_load_size,
- /* get_final_load_size = */ H5C__epoch_marker_get_final_load_size,
- /* verify_chksum = */ H5C__epoch_marker_verify_chksum,
- /* deserialize = */ H5C__epoch_marker_deserialize,
- /* image_len = */ H5C__epoch_marker_image_len,
- /* pre_serialize = */ H5C__epoch_marker_pre_serialize,
- /* serialize = */ H5C__epoch_marker_serialize,
- /* notify = */ H5C__epoch_marker_notify,
- /* free_icr = */ H5C__epoch_marker_free_icr,
- /* fsf_size = */ H5C__epoch_marker_fsf_size,
+ /* get_final_load_size = */ H5C__epoch_marker_get_final_load_size,
+ /* verify_chksum = */ H5C__epoch_marker_verify_chksum,
+ /* deserialize = */ H5C__epoch_marker_deserialize,
+ /* image_len = */ H5C__epoch_marker_image_len,
+ /* pre_serialize = */ H5C__epoch_marker_pre_serialize,
+ /* serialize = */ H5C__epoch_marker_serialize,
+ /* notify = */ H5C__epoch_marker_notify,
+ /* free_icr = */ H5C__epoch_marker_free_icr,
+ /* fsf_size = */ H5C__epoch_marker_fsf_size,
+ /* refresh = */ NULL,
}};
diff --git a/src/H5Cpkg.h b/src/H5Cpkg.h
index 8712af5..d9a1641 100644
--- a/src/H5Cpkg.h
+++ b/src/H5Cpkg.h
@@ -48,8 +48,9 @@
#define H5C__MAX_EPOCH_MARKERS 10
/* Cache configuration settings */
-#define H5C__HASH_TABLE_LEN (64 * 1024) /* must be a power of 2 */
-#define H5C__H5C_T_MAGIC 0x005CAC0E
+#define H5C__HASH_TABLE_LEN (64 * 1024) /* must be a power of 2 */
+#define H5C__PAGE_HASH_TABLE_LEN     ( 4 * 1024) /* must be a power of 2 */
+#define H5C__H5C_T_MAGIC 0x005CAC0E
/* Initial allocated size of the "flush_dep_parent" array */
#define H5C_FLUSH_DEP_PARENT_INIT 8
@@ -977,14 +978,31 @@ if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \
*
* JRM -- 10/15/15
*
+ * - Updated the existing index macros to maintain a second
+ * hash table when cache_ptr->vfd_swmr_reader is true. This
+ * hash table bins entries by the page buffer page they reside
+ * in, thus facilitating the eviction of entries on a given page
+ * when that page is modified.
+ *
+ * JRM -- 12/14/18
+ *
***********************************************************************/
-/* H5C__HASH_TABLE_LEN is defined in H5Cpkg.h. It mut be a power of two. */
+/* H5C__HASH_TABLE_LEN is defined in H5Cpkg.h. It must be a power of two. */
#define H5C__HASH_MASK ((size_t)(H5C__HASH_TABLE_LEN - 1) << 3)
#define H5C__HASH_FCN(x) (int)((unsigned)((x) & H5C__HASH_MASK) >> 3)
+
+/* H5C__PAGE_HASH_TABLE_LEN is defined in H5Cpkg.h.
+ * It must be a power of two.
+ */
+#define H5C__PI_HASH_MASK ((uint64_t)(H5C__PAGE_HASH_TABLE_LEN - 1))
+
+#define H5C__PI_HASH_FCN(x) (int)(((uint64_t)(x)) & H5C__PI_HASH_MASK)
+
+
#if H5C_DO_SANITY_CHECKS
#define H5C__PRE_HT_INSERT_SC(cache_ptr, entry_ptr, fail_val) \
@@ -994,6 +1012,8 @@ if ( ( (cache_ptr) == NULL ) || \
( ! H5F_addr_defined((entry_ptr)->addr) ) || \
( (entry_ptr)->ht_next != NULL ) || \
( (entry_ptr)->ht_prev != NULL ) || \
+ ( (entry_ptr)->pi_next != NULL ) || \
+ ( (entry_ptr)->pi_prev != NULL ) || \
( (entry_ptr)->size <= 0 ) || \
( H5C__HASH_FCN((entry_ptr)->addr) < 0 ) || \
( H5C__HASH_FCN((entry_ptr)->addr) >= H5C__HASH_TABLE_LEN ) || \
@@ -1039,45 +1059,52 @@ if ( ( (cache_ptr) == NULL ) || \
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, "post HT insert SC failed") \
}
-#define H5C__PRE_HT_REMOVE_SC(cache_ptr, entry_ptr) \
-if ( ( (cache_ptr) == NULL ) || \
- ( (cache_ptr)->magic != H5C__H5C_T_MAGIC ) || \
- ( (cache_ptr)->index_len < 1 ) || \
- ( (entry_ptr) == NULL ) || \
- ( (cache_ptr)->index_size < (entry_ptr)->size ) || \
- ( ! H5F_addr_defined((entry_ptr)->addr) ) || \
- ( (entry_ptr)->size <= 0 ) || \
- ( H5C__HASH_FCN((entry_ptr)->addr) < 0 ) || \
- ( H5C__HASH_FCN((entry_ptr)->addr) >= H5C__HASH_TABLE_LEN ) || \
- ( ((cache_ptr)->index)[(H5C__HASH_FCN((entry_ptr)->addr))] \
- == NULL ) || \
- ( ( ((cache_ptr)->index)[(H5C__HASH_FCN((entry_ptr)->addr))] \
- != (entry_ptr) ) && \
- ( (entry_ptr)->ht_prev == NULL ) ) || \
- ( ( ((cache_ptr)->index)[(H5C__HASH_FCN((entry_ptr)->addr))] == \
- (entry_ptr) ) && \
- ( (entry_ptr)->ht_prev != NULL ) ) || \
- ( (cache_ptr)->index_size != \
- ((cache_ptr)->clean_index_size + \
- (cache_ptr)->dirty_index_size) ) || \
- ( (cache_ptr)->index_size < ((cache_ptr)->clean_index_size) ) || \
- ( (cache_ptr)->index_size < ((cache_ptr)->dirty_index_size) ) || \
- ( (entry_ptr)->ring <= H5C_RING_UNDEFINED ) || \
- ( (entry_ptr)->ring >= H5C_RING_NTYPES ) || \
- ( (cache_ptr)->index_ring_len[(entry_ptr)->ring] <= 0 ) || \
- ( (cache_ptr)->index_ring_len[(entry_ptr)->ring] > \
- (cache_ptr)->index_len ) || \
- ( (cache_ptr)->index_ring_size[(entry_ptr)->ring] < \
- (entry_ptr)->size ) || \
- ( (cache_ptr)->index_ring_size[(entry_ptr)->ring] > \
- (cache_ptr)->index_size ) || \
- ( (cache_ptr)->index_ring_size[(entry_ptr)->ring] != \
- ((cache_ptr)->clean_index_ring_size[(entry_ptr)->ring] + \
- (cache_ptr)->dirty_index_ring_size[(entry_ptr)->ring]) ) || \
- ( (cache_ptr)->index_len != (cache_ptr)->il_len ) || \
- ( (cache_ptr)->index_size != (cache_ptr)->il_size ) ) { \
- HDassert(FALSE); \
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "pre HT remove SC failed") \
+#define H5C__PRE_HT_REMOVE_SC(cache_ptr, entry_ptr) \
+if ( ( (cache_ptr) == NULL ) || \
+ ( (cache_ptr)->magic != H5C__H5C_T_MAGIC ) || \
+ ( (cache_ptr)->index_len < 1 ) || \
+ ( (entry_ptr) == NULL ) || \
+ ( (cache_ptr)->index_size < (entry_ptr)->size ) || \
+ ( ! H5F_addr_defined((entry_ptr)->addr) ) || \
+ ( (entry_ptr)->size <= 0 ) || \
+ ( H5C__HASH_FCN((entry_ptr)->addr) < 0 ) || \
+ ( H5C__HASH_FCN((entry_ptr)->addr) >= H5C__HASH_TABLE_LEN ) || \
+ ( ((cache_ptr)->index)[(H5C__HASH_FCN((entry_ptr)->addr))] \
+ == NULL ) || \
+ ( ( ((cache_ptr)->index)[(H5C__HASH_FCN((entry_ptr)->addr))] \
+ != (entry_ptr) ) && \
+ ( (entry_ptr)->ht_prev == NULL ) ) || \
+ ( ( ((cache_ptr)->index)[(H5C__HASH_FCN((entry_ptr)->addr))] == \
+ (entry_ptr) ) && \
+ ( (entry_ptr)->ht_prev != NULL ) ) || \
+ ( (cache_ptr)->index_size != \
+ ((cache_ptr)->clean_index_size + \
+ (cache_ptr)->dirty_index_size) ) || \
+ ( ( (cache_ptr)->vfd_swmr_reader ) && \
+ ( ( ( (cache_ptr)->page_index[(H5C__PI_HASH_FCN((entry_ptr)->page))] \
+ != (entry_ptr) ) && \
+ ( (entry_ptr)->pi_prev == NULL ) ) || \
+ ( ( (cache_ptr)->page_index[(H5C__PI_HASH_FCN((entry_ptr)->page))] \
+ == (entry_ptr) ) && \
+ ( (entry_ptr)->pi_prev != NULL ) ) ) ) || \
+ ( (cache_ptr)->index_size < ((cache_ptr)->clean_index_size) ) || \
+ ( (cache_ptr)->index_size < ((cache_ptr)->dirty_index_size) ) || \
+ ( (entry_ptr)->ring <= H5C_RING_UNDEFINED ) || \
+ ( (entry_ptr)->ring >= H5C_RING_NTYPES ) || \
+ ( (cache_ptr)->index_ring_len[(entry_ptr)->ring] <= 0 ) || \
+ ( (cache_ptr)->index_ring_len[(entry_ptr)->ring] > \
+ (cache_ptr)->index_len ) || \
+ ( (cache_ptr)->index_ring_size[(entry_ptr)->ring] < \
+ (entry_ptr)->size ) || \
+ ( (cache_ptr)->index_ring_size[(entry_ptr)->ring] > \
+ (cache_ptr)->index_size ) || \
+ ( (cache_ptr)->index_ring_size[(entry_ptr)->ring] != \
+ ((cache_ptr)->clean_index_ring_size[(entry_ptr)->ring] + \
+ (cache_ptr)->dirty_index_ring_size[(entry_ptr)->ring]) ) || \
+ ( (cache_ptr)->index_len != (cache_ptr)->il_len ) || \
+ ( (cache_ptr)->index_size != (cache_ptr)->il_size ) ) { \
+ HDassert(FALSE && "pre HT remove SC failed"); \
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "pre HT remove SC failed") \
}
#define H5C__POST_HT_REMOVE_SC(cache_ptr, entry_ptr) \
@@ -1087,7 +1114,9 @@ if ( ( (cache_ptr) == NULL ) || \
( ! H5F_addr_defined((entry_ptr)->addr) ) || \
( (entry_ptr)->size <= 0 ) || \
( (entry_ptr)->ht_prev != NULL ) || \
- ( (entry_ptr)->ht_prev != NULL ) || \
+ ( (entry_ptr)->ht_next != NULL ) || \
+ ( (entry_ptr)->pi_prev != NULL ) || \
+ ( (entry_ptr)->pi_next != NULL ) || \
( (cache_ptr)->index_size != \
((cache_ptr)->clean_index_size + \
(cache_ptr)->dirty_index_size) ) || \
@@ -1102,7 +1131,7 @@ if ( ( (cache_ptr) == NULL ) || \
(cache_ptr)->dirty_index_ring_size[(entry_ptr)->ring]) ) || \
( (cache_ptr)->index_len != (cache_ptr)->il_len ) || \
( (cache_ptr)->index_size != (cache_ptr)->il_size ) ) { \
- HDassert(FALSE); \
+ HDassert(FALSE && "post HT remove SC failed"); \
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "post HT remove SC failed") \
}
@@ -1118,7 +1147,9 @@ if ( ( (cache_ptr) == NULL ) || \
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, "pre HT search SC failed") \
}
-/* (Keep in sync w/H5C_TEST__POST_SUC_HT_SEARCH_SC macro in test/cache_common.h -QAK) */
+/* (Keep in sync w/H5C_TEST__POST_SUC_HT_SEARCH_SC macro in
+ * test/cache_common.h -QAK)
+ */
#define H5C__POST_SUC_HT_SEARCH_SC(cache_ptr, entry_ptr, k, fail_val) \
if ( ( (cache_ptr) == NULL ) || \
( (cache_ptr)->magic != H5C__H5C_T_MAGIC ) || \
@@ -1137,15 +1168,19 @@ if ( ( (cache_ptr) == NULL ) || \
( (entry_ptr)->ht_prev->ht_next != (entry_ptr) ) ) || \
( ( (entry_ptr)->ht_next != NULL ) && \
( (entry_ptr)->ht_next->ht_prev != (entry_ptr) ) ) ) { \
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, "post successful HT search SC failed") \
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, \
+ "post successful HT search SC failed") \
}
-/* (Keep in sync w/H5C_TEST__POST_HT_SHIFT_TO_FRONT macro in test/cache_common.h -QAK) */
+/* (Keep in sync w/H5C_TEST__POST_HT_SHIFT_TO_FRONT macro in
+ * test/cache_common.h -QAK)
+ */
#define H5C__POST_HT_SHIFT_TO_FRONT(cache_ptr, entry_ptr, k, fail_val) \
if ( ( (cache_ptr) == NULL ) || \
( ((cache_ptr)->index)[k] != (entry_ptr) ) || \
( (entry_ptr)->ht_prev != NULL ) ) { \
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, "post HT shift to front SC failed") \
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, \
+ "post HT shift to front SC failed") \
}
#define H5C__PRE_HT_ENTRY_SIZE_CHANGE_SC(cache_ptr, old_size, new_size, \
@@ -1180,7 +1215,8 @@ if ( ( (cache_ptr) == NULL ) || \
( (cache_ptr)->index_len != (cache_ptr)->il_len ) || \
( (cache_ptr)->index_size != (cache_ptr)->il_size ) ) { \
HDassert(FALSE); \
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "pre HT entry size change SC failed") \
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "pre HT entry size change SC failed") \
}
#define H5C__POST_HT_ENTRY_SIZE_CHANGE_SC(cache_ptr, old_size, new_size, \
@@ -1210,7 +1246,8 @@ if ( ( (cache_ptr) == NULL ) || \
( (cache_ptr)->index_len != (cache_ptr)->il_len ) || \
( (cache_ptr)->index_size != (cache_ptr)->il_size ) ) { \
HDassert(FALSE); \
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "post HT entry size change SC failed") \
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "post HT entry size change SC failed") \
}
#define H5C__PRE_HT_UPDATE_FOR_ENTRY_CLEAN_SC(cache_ptr, entry_ptr) \
@@ -1237,7 +1274,8 @@ if ( \
((cache_ptr)->clean_index_ring_size[(entry_ptr)->ring] + \
(cache_ptr)->dirty_index_ring_size[(entry_ptr)->ring]) ) ) { \
HDassert(FALSE); \
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "pre HT update for entry clean SC failed") \
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "pre HT update for entry clean SC failed") \
}
#define H5C__PRE_HT_UPDATE_FOR_ENTRY_DIRTY_SC(cache_ptr, entry_ptr) \
@@ -1264,7 +1302,8 @@ if ( \
((cache_ptr)->clean_index_ring_size[(entry_ptr)->ring] + \
(cache_ptr)->dirty_index_ring_size[(entry_ptr)->ring]) ) ) { \
HDassert(FALSE); \
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "pre HT update for entry dirty SC failed") \
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "pre HT update for entry dirty SC failed") \
}
#define H5C__POST_HT_UPDATE_FOR_ENTRY_CLEAN_SC(cache_ptr, entry_ptr) \
@@ -1280,7 +1319,8 @@ if ( ( (cache_ptr)->index_size != \
((cache_ptr)->clean_index_ring_size[(entry_ptr)->ring] + \
(cache_ptr)->dirty_index_ring_size[(entry_ptr)->ring]) ) ) { \
HDassert(FALSE); \
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "post HT update for entry clean SC failed") \
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "post HT update for entry clean SC failed") \
}
#define H5C__POST_HT_UPDATE_FOR_ENTRY_DIRTY_SC(cache_ptr, entry_ptr) \
@@ -1296,7 +1336,8 @@ if ( ( (cache_ptr)->index_size != \
((cache_ptr)->clean_index_ring_size[(entry_ptr)->ring] + \
(cache_ptr)->dirty_index_ring_size[(entry_ptr)->ring]) ) ) { \
HDassert(FALSE); \
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "post HT update for entry dirty SC failed") \
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \
+ "post HT update for entry dirty SC failed") \
}
#else /* H5C_DO_SANITY_CHECKS */
@@ -1324,6 +1365,14 @@ if ( ( (cache_ptr)->index_size != \
{ \
int k; \
H5C__PRE_HT_INSERT_SC(cache_ptr, entry_ptr, fail_val) \
+ if ( cache_ptr->vfd_swmr_reader ) { \
+ k = H5C__PI_HASH_FCN((entry_ptr)->page); \
+ if ( ( (cache_ptr)->page_index)[k] != NULL ) { \
+ (entry_ptr)->pi_next = ((cache_ptr)->page_index)[k]; \
+ (entry_ptr)->pi_next->pi_prev = (entry_ptr); \
+ } \
+ ((cache_ptr)->page_index)[k] = (entry_ptr); \
+ } \
k = H5C__HASH_FCN((entry_ptr)->addr); \
if(((cache_ptr)->index)[k] != NULL) { \
(entry_ptr)->ht_next = ((cache_ptr)->index)[k]; \
@@ -1359,13 +1408,30 @@ if ( ( (cache_ptr)->index_size != \
{ \
int k; \
H5C__PRE_HT_REMOVE_SC(cache_ptr, entry_ptr) \
+ if ( cache_ptr->vfd_swmr_reader ) { \
+ k = H5C__PI_HASH_FCN((entry_ptr)->page); \
+ if ( (entry_ptr)->pi_next ) { \
+ (entry_ptr)->pi_next->pi_prev = (entry_ptr)->pi_prev; \
+ } \
+ if ( (entry_ptr)->pi_prev ) { \
+ (entry_ptr)->pi_prev->pi_next = (entry_ptr)->pi_next; \
+ } \
+ if ( ( (cache_ptr)->page_index)[k] == (entry_ptr) ) { \
+ ((cache_ptr)->page_index)[k] = (entry_ptr)->pi_next; \
+ } \
+ (entry_ptr)->pi_next = NULL; \
+ (entry_ptr)->pi_prev = NULL; \
+ } \
k = H5C__HASH_FCN((entry_ptr)->addr); \
- if((entry_ptr)->ht_next) \
+ if ( (entry_ptr)->ht_next ) { \
(entry_ptr)->ht_next->ht_prev = (entry_ptr)->ht_prev; \
- if((entry_ptr)->ht_prev) \
+ } \
+ if ( (entry_ptr)->ht_prev ) { \
(entry_ptr)->ht_prev->ht_next = (entry_ptr)->ht_next; \
- if(((cache_ptr)->index)[k] == (entry_ptr)) \
+ } \
+ if ( ( (cache_ptr)->index)[k] == (entry_ptr) ) { \
((cache_ptr)->index)[k] = (entry_ptr)->ht_next; \
+ } \
(entry_ptr)->ht_next = NULL; \
(entry_ptr)->ht_prev = NULL; \
(cache_ptr)->index_len--; \
@@ -1373,7 +1439,7 @@ if ( ( (cache_ptr)->index_size != \
((cache_ptr)->index_ring_len[entry_ptr->ring])--; \
((cache_ptr)->index_ring_size[entry_ptr->ring]) \
-= (entry_ptr)->size; \
- if((entry_ptr)->is_dirty) { \
+ if ( (entry_ptr)->is_dirty ) { \
(cache_ptr)->dirty_index_size -= (entry_ptr)->size; \
((cache_ptr)->dirty_index_ring_size[entry_ptr->ring]) \
-= (entry_ptr)->size; \
@@ -1382,7 +1448,7 @@ if ( ( (cache_ptr)->index_size != \
((cache_ptr)->clean_index_ring_size[entry_ptr->ring]) \
-= (entry_ptr)->size; \
} \
- if((entry_ptr)->flush_me_last) { \
+ if ( (entry_ptr)->flush_me_last ) { \
(cache_ptr)->num_last_entries--; \
HDassert((cache_ptr)->num_last_entries <= 1); \
} \
@@ -3690,6 +3756,36 @@ typedef struct H5C_tag_info_t {
*
* This field is NULL if the index is empty.
*
+ * Page Index:
+ *
+ * For the VFD SWMR reader, it is necessary to map modified pages to
+ * entries contained in that page so that they can be invalidated. The
+ * page index is a hash table that provides this service. Note that it
+ * is only maintained for files that are opened in VFD SWMR reader mode.
+ *
+ * Structurally, the page index is identical to the index in the page
+ * buffer. Specifically, it is a hash table with chaining. The hash
+ * table size must be a power of two, not the usual prime number. The
+ * hash function simply clips the high order bits off the page offset
+ * of the entry's base address.
+ *
+ * The page index is maintained by the same macros that maintain the
+ * regular index. As such, it does not require separate length and
+ * size fields, as it shares them with the regular index. Instead,
+ * the only ancillary field needed is the vfd_swmr_reader boolean, which
+ * indicates whether the page index must be maintained.
+ *
+ * vfd_swmr_reader: Boolean flag that is TRUE iff the file has been
+ * opened as a VFD SWMR reader. The remaining fields in
+ * the page index section are valid iff this field is TRUE.
+ *
+ * page_index Array of pointers to H5C_cache_entry_t of size
+ * H5C__PAGE_HASH_TABLE_LEN. This size must be a power of
+ * two, not the usual prime number.
+ *
+ * page_size: Convenience copy of the page size used by the page
+ * buffer.
+ *
*
* With the addition of the take ownership flag, it is possible that
* an entry may be removed from the cache as the result of the flush of
@@ -4679,6 +4775,11 @@ struct H5C_t {
H5C_cache_entry_t * il_head;
H5C_cache_entry_t * il_tail;
+ /* Fields supporting VFD SWMR */
+ hbool_t vfd_swmr_reader;
+ H5C_cache_entry_t * page_index[H5C__PAGE_HASH_TABLE_LEN];
+ hsize_t page_size;
+
/* Fields to detect entries removed during scans */
int64_t entries_removed_counter;
H5C_cache_entry_t * last_entry_removed_ptr;
diff --git a/src/H5Cprefetched.c b/src/H5Cprefetched.c
index 954dd60..0c32fd5 100644
--- a/src/H5Cprefetched.c
+++ b/src/H5Cprefetched.c
@@ -106,6 +106,7 @@ const H5AC_class_t H5AC_PREFETCHED_ENTRY[1] = {{
/* notify = */ H5C__prefetched_entry_notify,
/* free_icr = */ H5C__prefetched_entry_free_icr,
/* fsf_size = */ H5C__prefetched_entry_fsf_size,
+ /* refresh = */ NULL,
}};
diff --git a/src/H5Cprivate.h b/src/H5Cprivate.h
index 0ba0234..23091cb 100644
--- a/src/H5Cprivate.h
+++ b/src/H5Cprivate.h
@@ -384,10 +384,11 @@ typedef struct H5C_t H5C_t;
*
* The typedef for the get_load_size callback is as follows:
*
- * typedef herr_t (*H5C_get_final_load_size_func_t)(const void *image_ptr,
- * size_t image_len,
- * void *udata_ptr,
- * size_t *actual_len_ptr);
+ * typedef
+ * herr_t (*H5C_get_final_load_size_func_t)(const void *image_ptr,
+ * size_t image_len,
+ * void *udata_ptr,
+ * size_t *actual_len_ptr);
*
* The parameters of the get_load_size callback are as follows:
*
@@ -404,7 +405,8 @@ typedef struct H5C_t H5C_t;
* actual_len_ptr: Pointer to the location containing the actual length
* of the metadata entry on disk.
*
- * Processing in the get_final_load_size function should proceed as follows:
+ * Processing in the get_final_load_size function should proceed as
+ * follows:
*
* If successful, the function will place the length in the *actual_len_ptr
* associated with supplied image and/or user data and then return SUCCEED.
@@ -843,6 +845,103 @@ typedef struct H5C_t H5C_t;
* push error information on the error stack with the error API
* routines.
*
+ * REFRESH_ENTRY: Pointer to the refresh entry callback.
+ *
+ * This callback exists to support VFD SWMR readers, and should not
+ * be used outside this context.
+ *
+ * At the end of each tick, the VFD SWMR reader is informed of pages
+ * in the page buffer that have been modified since the last tick.
+ *
+ * To avoid "message from the past" bugs, it is necessary to either
+ * evict or refresh entries that have been modified in the past tick,
+ * and thus reside in such modified pages.
+ *
+ * To this end, the metadata cache is informed of all such pages,
+ * and must either evict, or update all entries contained in these
+ * pages, or determine that the entry in question has not been modified,
+ * and thus that no action is required.
+ *
+ * If the entry is unpinned, it is possible to simply evict it, and
+ * this is probably the most efficient way to address the issue.
+ *
+ * If the entry is pinned and tagged, it is possible to evict the
+ * entire on disk data structure of which it is part via the evict
+ * tagged entry facility. This is inefficient, but it is simple and
+ * uses existing code -- hence this is plan A for the initial
+ * implementation of VFD SWMR.
+ *
+ * However, there remains the case of the pinned entry that is not
+ * tagged, and thus not subject to eviction via the evict tagged
+ * entries call -- the most important example of this is the super
+ * block which is pinned and may not be evicted until file close.
+ *
+ * Another example is free space manager headers -- however, these
+ * are a non-issue in the context of VFD SWMR readers as such files
+ * must only be opened R/O and thus will not have active free space
+ * managers.
+ *
+ * The refresh entry callback exists to address this issue. As
+ * indicated above, it is essential for the superblock, and desirable
+ * whenever it is not possible to simply evict an entry that resides
+ * in a modified page cache page.
+ *
+ * Functionally, the call is similar to the deserialize call, the
+ * primary difference being that the client receives both a pointer
+ * to the existing entry, and a buffer containing its image. The
+ * client must deserialize this image and update itself as appropriate.
+ *
+ * The typedef for the VFD SWMR refresh callback is as follows:
+ *
+ * typedef void *(*H5C_vfd_swmr_refresh_func_t)(H5F_t * f,
+ * void * entry_ptr,
+ * const void * image_ptr,
+ * size_t * len_ptr);
+ *
+ * The parameters of the deserialize callback are as follows:
+ *
+ * f: Pointer to the containing instance of H5F_t.
+ *
+ * entry_ptr: Pointer to the metadata cache entry that is being
+ * refreshed. This entry is place on the protected list
+ * for the duration of the refresh callback as the client
+ * will typically modify it during the refresh operation.
+ *
+ * image_ptr: Pointer to a buffer of length *len_ptr containing the
+ * most recent version of the entry's on disk image from
+ * the VFD SWMR metadata file. The length of the buffer
+ * is specified in the len parameter below.
+ *
+ * len_ptr: Pointer to size_t containing the length in
+ * bytes of the buffer pointed to by *image_ptr.
+ *
+ * If the supplied buffer is too small, the callback must
+ * place the correct value in *len_ptr and return success.
+ * The metadata cache will read the larger image, and call
+ * the refresh function again.
+ *
+ * Processing in the refresh function should proceed as follows:
+ *
+ * The target entry will be protected for the duration of the
+ * refresh call. This allows entry resizes if necessary, and
+ * prevents re-entrant refresh calls.
+ *
+ * If the supplied image contains valid data, and is of the correct
+ * length, the refresh function must parse it, and apply updates to
+ * the in core representation of the metadata cache entry as required.
+ * Note that since the file is opened R/O, any updates must not
+ * cause the entry to be marked dirty.
+ *
+ * If the image contains valid data, but is too small, the refresh
+ * callback must copy the correct image length to *len_ptr, and
+ * return success. The metadata cache will make a second call with
+ * the correct image length. If the entry must change size, the
+ * refresh callback must call H5C_resize_entry().
+ *
+ * If the image contains invalid data, or if, for whatever reason,
+ * the refresh function cannot apply its contents, the refresh
+ * function must return failure.
+ *
***************************************************************************/
/* Actions that can be reported to 'notify' client callback */
@@ -861,44 +960,59 @@ typedef enum H5C_notify_action_t {
*/
H5C_NOTIFY_ACTION_ENTRY_DIRTIED, /* Entry has been marked dirty. */
H5C_NOTIFY_ACTION_ENTRY_CLEANED, /* Entry has been marked clean. */
- H5C_NOTIFY_ACTION_CHILD_DIRTIED, /* Dependent child has been marked dirty. */
- H5C_NOTIFY_ACTION_CHILD_CLEANED, /* Dependent child has been marked clean. */
- H5C_NOTIFY_ACTION_CHILD_UNSERIALIZED, /* Dependent child has been marked unserialized. */
- H5C_NOTIFY_ACTION_CHILD_SERIALIZED /* Dependent child has been marked serialized. */
+ H5C_NOTIFY_ACTION_CHILD_DIRTIED, /* Dependent child has been marked
+ * dirty.
+ */
+ H5C_NOTIFY_ACTION_CHILD_CLEANED, /* Dependent child has been marked
+ * clean.
+ */
+ H5C_NOTIFY_ACTION_CHILD_UNSERIALIZED, /* Dependent child has been marked
+ * unserialized.
+ */
+ H5C_NOTIFY_ACTION_CHILD_SERIALIZED /* Dependent child has been marked
+ * serialized.
+ */
} H5C_notify_action_t;
/* Cache client callback function pointers */
-typedef herr_t (*H5C_get_initial_load_size_func_t)(void *udata_ptr, size_t *image_len_ptr);
+typedef herr_t (*H5C_get_initial_load_size_func_t)(void *udata_ptr,
+ size_t *image_len_ptr);
typedef herr_t (*H5C_get_final_load_size_func_t)(const void *image_ptr,
size_t image_len, void *udata_ptr, size_t *actual_len_ptr);
-typedef htri_t (*H5C_verify_chksum_func_t)(const void *image_ptr, size_t len, void *udata_ptr);
+typedef htri_t (*H5C_verify_chksum_func_t)(const void *image_ptr, size_t len,
+ void *udata_ptr);
typedef void *(*H5C_deserialize_func_t)(const void *image_ptr,
size_t len, void *udata_ptr, hbool_t *dirty_ptr);
-typedef herr_t (*H5C_image_len_func_t)(const void *thing, size_t *image_len_ptr);
+typedef herr_t (*H5C_image_len_func_t)(const void *thing,
+ size_t *image_len_ptr);
typedef herr_t (*H5C_pre_serialize_func_t)(H5F_t *f, void *thing, haddr_t addr,
- size_t len, haddr_t *new_addr_ptr, size_t *new_len_ptr, unsigned *flags_ptr);
+ size_t len, haddr_t *new_addr_ptr, size_t *new_len_ptr,
+ unsigned *flags_ptr);
typedef herr_t (*H5C_serialize_func_t)(const H5F_t *f, void *image_ptr,
size_t len, void *thing);
typedef herr_t (*H5C_notify_func_t)(H5C_notify_action_t action, void *thing);
typedef herr_t (*H5C_free_icr_func_t)(void *thing);
typedef herr_t (*H5C_get_fsf_size_t)(const void * thing, hsize_t *fsf_size_ptr);
+typedef herr_t (*H5C_vfd_swmr_refresh_func_t)(H5F_t * f, void * entry_ptr,
+ const void * image_ptr, size_t *len_ptr);
/* Metadata cache client class definition */
typedef struct H5C_class_t {
- int id;
- const char * name;
- H5FD_mem_t mem_type;
- unsigned flags;
+ int id;
+ const char * name;
+ H5FD_mem_t mem_type;
+ unsigned flags;
H5C_get_initial_load_size_func_t get_initial_load_size;
H5C_get_final_load_size_func_t get_final_load_size;
- H5C_verify_chksum_func_t verify_chksum;
- H5C_deserialize_func_t deserialize;
- H5C_image_len_func_t image_len;
- H5C_pre_serialize_func_t pre_serialize;
- H5C_serialize_func_t serialize;
- H5C_notify_func_t notify;
- H5C_free_icr_func_t free_icr;
- H5C_get_fsf_size_t fsf_size;
+ H5C_verify_chksum_func_t verify_chksum;
+ H5C_deserialize_func_t deserialize;
+ H5C_image_len_func_t image_len;
+ H5C_pre_serialize_func_t pre_serialize;
+ H5C_serialize_func_t serialize;
+ H5C_notify_func_t notify;
+ H5C_free_icr_func_t free_icr;
+ H5C_get_fsf_size_t fsf_size;
+ H5C_vfd_swmr_refresh_func_t refresh;
} H5C_class_t;
/* Type definitions of callback functions used by the cache as a whole */
@@ -1574,6 +1688,35 @@ typedef int H5C_ring_t;
* tag_info: Pointer to the common tag state for all entries belonging to
* an object. NULL for untagged entries.
*
+ * Fields supporting VFD SWMR
+ *
+ * The following fields exist to support the page index. These fields are
+ * only defined when the vfd_swmr_reader field in the associated instance of
+ * H5C_t is set to TRUE.
+ *
+ * page: Page offset of the page containing the base address of the
+ * metadata cache entry.
+ *
+ * refreshed_in_tick: When an entry is refreshed as part of the VFD SWMR
+ * reader end of tick processing, this field is used to
+ * record the tick in which this occured. The field is
+ * used primarily for sanity checking.
+ *
+ * pi_next: Next pointer used by the page index hash table that maps
+ * page buffer pages to any metadata cache entries that
+ * reside in the target page.
+ *
+ * This field points to the next entry in the doubly linked
+ * list of entries in the hash bin, or NULL if there is no
+ * next entry.
+ *
+ * pi_prev: Prev pointer used by the page index hash table that maps
+ * page buffer pages to any metadata cache entries that
+ * reside in the target page.
+ *
+ * This field points to the previous entry in the doubly linked
+ * list of entries in the hash bin, or NULL if there is no
+ * previous entry.
*
* Cache entry stats collection fields:
*
@@ -1673,6 +1816,12 @@ typedef struct H5C_cache_entry_t {
struct H5C_cache_entry_t *tl_prev;
struct H5C_tag_info_t *tag_info;
+ /* fields supporting VFD SWMR */
+ uint64_t page;
+ uint64_t refreshed_in_tick;
+ struct H5C_cache_entry_t *pi_next;
+ struct H5C_cache_entry_t *pi_prev;
+
#if H5C_COLLECT_CACHE_ENTRY_STATS
/* cache entry stats fields */
int32_t accesses;
@@ -2239,13 +2388,15 @@ H5_DLL void H5C_def_auto_resize_rpt_fcn(H5C_t *cache_ptr, int32_t version,
size_t old_min_clean_size, size_t new_min_clean_size);
H5_DLL herr_t H5C_dest(H5F_t *f);
H5_DLL herr_t H5C_evict(H5F_t *f);
+H5_DLL herr_t H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page,
+ uint32_t length, uint64_t tick);
H5_DLL herr_t H5C_expunge_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr,
unsigned flags);
H5_DLL herr_t H5C_flush_cache(H5F_t *f, unsigned flags);
H5_DLL herr_t H5C_flush_tagged_entries(H5F_t *f, haddr_t tag);
H5_DLL herr_t H5C_force_cache_image_load(H5F_t * f);
H5_DLL herr_t H5C_evict_tagged_entries(H5F_t *f, haddr_t tag, hbool_t match_global);
-H5_DLL herr_t H5C_expunge_tag_type_metadata(H5F_t *f, haddr_t tag, int type_id, unsigned flags);
+H5_DLL herr_t H5C_expunge_tag_type_metadata(H5F_t *f, haddr_t tag, int type_id, unsigned flags, hbool_t type_match);
H5_DLL herr_t H5C_get_tag(const void *thing, /*OUT*/ haddr_t *tag);
#if H5C_DO_TAGGING_SANITY_CHECKS
herr_t H5C_verify_tag(int id, haddr_t tag);
@@ -2265,7 +2416,8 @@ H5_DLL herr_t H5C_get_entry_status(const H5F_t *f, haddr_t addr,
hbool_t *is_protected_ptr, hbool_t *is_pinned_ptr, hbool_t *is_corked_ptr,
hbool_t *is_flush_dep_parent_ptr, hbool_t *is_flush_dep_child_ptr,
hbool_t *image_up_to_date_ptr);
-H5_DLL herr_t H5C_get_evictions_enabled(const H5C_t *cache_ptr, hbool_t *evictions_enabled_ptr);
+H5_DLL herr_t H5C_get_evictions_enabled(const H5C_t *cache_ptr,
+ hbool_t *evictions_enabled_ptr);
H5_DLL void * H5C_get_aux_ptr(const H5C_t *cache_ptr);
H5_DLL herr_t H5C_image_stats(H5C_t * cache_ptr, hbool_t print_header);
H5_DLL herr_t H5C_insert_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr,
@@ -2280,7 +2432,8 @@ H5_DLL herr_t H5C_move_entry(H5C_t *cache_ptr, const H5C_class_t *type,
haddr_t old_addr, haddr_t new_addr);
H5_DLL herr_t H5C_pin_protected_entry(void *thing);
H5_DLL herr_t H5C_prep_for_file_close(H5F_t *f);
-H5_DLL herr_t H5C_create_flush_dependency(void *parent_thing, void *child_thing);
+H5_DLL herr_t H5C_create_flush_dependency(void *parent_thing,
+ void *child_thing);
H5_DLL void * H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr,
void *udata, unsigned flags);
H5_DLL herr_t H5C_reset_cache_hit_rate_stats(H5C_t *cache_ptr);
@@ -2288,13 +2441,17 @@ H5_DLL herr_t H5C_resize_entry(void *thing, size_t new_size);
H5_DLL herr_t H5C_set_cache_auto_resize_config(H5C_t *cache_ptr, H5C_auto_size_ctl_t *config_ptr);
H5_DLL herr_t H5C_set_cache_image_config(const H5F_t *f, H5C_t *cache_ptr,
H5C_cache_image_ctl_t *config_ptr);
-H5_DLL herr_t H5C_set_evictions_enabled(H5C_t *cache_ptr, hbool_t evictions_enabled);
+H5_DLL herr_t H5C_set_evictions_enabled(H5C_t *cache_ptr,
+ hbool_t evictions_enabled);
+H5_DLL herr_t H5C_set_vfd_swmr_reader(H5C_t *cache_ptr,
+ hbool_t vfd_swmr_reader, hsize_t page_size);
H5_DLL herr_t H5C_set_prefix(H5C_t *cache_ptr, char *prefix);
H5_DLL herr_t H5C_stats(H5C_t *cache_ptr, const char *cache_name,
hbool_t display_detailed_stats);
H5_DLL void H5C_stats__reset(H5C_t *cache_ptr);
H5_DLL herr_t H5C_unpin_entry(void *thing);
-H5_DLL herr_t H5C_destroy_flush_dependency(void *parent_thing, void *child_thing);
+H5_DLL herr_t H5C_destroy_flush_dependency(void *parent_thing,
+ void *child_thing);
H5_DLL herr_t H5C_unprotect(H5F_t *f, haddr_t addr, void *thing,
unsigned int flags);
H5_DLL herr_t H5C_validate_cache_image_config(H5C_cache_image_ctl_t * ctl_ptr);
@@ -2304,15 +2461,18 @@ H5_DLL herr_t H5C_ignore_tags(H5C_t *cache_ptr);
H5_DLL hbool_t H5C_get_ignore_tags(const H5C_t *cache_ptr);
H5_DLL uint32_t H5C_get_num_objs_corked(const H5C_t *cache_ptr);
H5_DLL herr_t H5C_retag_entries(H5C_t * cache_ptr, haddr_t src_tag, haddr_t dest_tag);
-H5_DLL herr_t H5C_cork(H5C_t *cache_ptr, haddr_t obj_addr, unsigned action, hbool_t *corked);
-H5_DLL herr_t H5C_get_entry_ring(const H5F_t *f, haddr_t addr, H5C_ring_t *ring);
+H5_DLL herr_t H5C_cork(H5C_t *cache_ptr, haddr_t obj_addr, unsigned action,
+ hbool_t *corked);
+H5_DLL herr_t H5C_get_entry_ring(const H5F_t *f, haddr_t addr,
+ H5C_ring_t *ring);
H5_DLL herr_t H5C_unsettle_entry_ring(void *thing);
H5_DLL herr_t H5C_unsettle_ring(H5F_t * f, H5C_ring_t ring);
H5_DLL herr_t H5C_remove_entry(void *thing);
H5_DLL herr_t H5C_cache_image_status(H5F_t * f, hbool_t *load_ci_ptr,
hbool_t *write_ci_ptr);
H5_DLL hbool_t H5C_cache_image_pending(const H5C_t *cache_ptr);
-H5_DLL herr_t H5C_get_mdc_image_info(H5C_t *cache_ptr, haddr_t *image_addr, hsize_t *image_len);
+H5_DLL herr_t H5C_get_mdc_image_info(H5C_t *cache_ptr, haddr_t *image_addr,
+ hsize_t *image_len);
/* Logging functions */
H5_DLL herr_t H5C_start_logging(H5C_t *cache);
diff --git a/src/H5Ctag.c b/src/H5Ctag.c
index e92d0e4..2573e93 100644
--- a/src/H5Ctag.c
+++ b/src/H5Ctag.c
@@ -75,6 +75,7 @@ typedef struct {
H5F_t *f; /* File pointer for evicting entry */
int type_id; /* Cache entry type to expunge */
unsigned flags; /* Flags for expunging entry */
+ hbool_t type_match;
} H5C_tag_iter_ettm_ctx_t;
/* Typedef for tagged entry iterator callback context - mark corked */
@@ -837,7 +838,7 @@ H5C__expunge_tag_type_metadata_cb(H5C_cache_entry_t *entry, void *_ctx)
HDassert(ctx);
/* Found one with the same tag and type id */
- if(entry->type->id == ctx->type_id)
+ if(entry->type->id == ctx->type_id || !ctx->type_match)
if(H5C_expunge_entry(ctx->f, entry->type, entry->addr, ctx->flags) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTEXPUNGE, H5_ITER_ERROR, "can't expunge entry")
@@ -861,7 +862,7 @@ done:
*-------------------------------------------------------------------------
*/
herr_t
-H5C_expunge_tag_type_metadata(H5F_t *f, haddr_t tag, int type_id, unsigned flags)
+H5C_expunge_tag_type_metadata(H5F_t *f, haddr_t tag, int type_id, unsigned flags, hbool_t type_match)
{
H5C_t *cache; /* Pointer to cache structure */
H5C_tag_iter_ettm_ctx_t ctx; /* Context for iterator callback */
@@ -881,6 +882,7 @@ H5C_expunge_tag_type_metadata(H5F_t *f, haddr_t tag, int type_id, unsigned flags
ctx.f = f;
ctx.type_id = type_id;
ctx.flags = flags;
+ ctx.type_match = type_match;
/* Iterate through hash table entries, expunge those with specified tag and type id */
if(H5C__iter_tagged_entries(cache, tag, FALSE, H5C__expunge_tag_type_metadata_cb, &ctx) < 0)
diff --git a/src/H5Dbtree.c b/src/H5Dbtree.c
index 098e01b..7741e99 100644
--- a/src/H5Dbtree.c
+++ b/src/H5Dbtree.c
@@ -166,7 +166,8 @@ const H5D_chunk_ops_t H5D_COPS_BTREE[1] = {{
H5D__btree_idx_size, /* size */
H5D__btree_idx_reset, /* reset */
H5D__btree_idx_dump, /* dump */
- H5D__btree_idx_dest /* destroy */
+ H5D__btree_idx_dest, /* destroy */
+ NULL /* close */
}};
diff --git a/src/H5Dbtree2.c b/src/H5Dbtree2.c
index ccb786b..65a020f 100644
--- a/src/H5Dbtree2.c
+++ b/src/H5Dbtree2.c
@@ -161,7 +161,8 @@ const H5D_chunk_ops_t H5D_COPS_BT2[1] = {{
H5D__bt2_idx_size, /* size */
H5D__bt2_idx_reset, /* reset */
H5D__bt2_idx_dump, /* dump */
- H5D__bt2_idx_dest /* destroy */
+ H5D__bt2_idx_dest, /* destroy */
+ H5D__bt2_idx_dest /* close (same as destroy) */
}};
diff --git a/src/H5Dchunk.c b/src/H5Dchunk.c
index ee83564..7e7d6b4 100644
--- a/src/H5Dchunk.c
+++ b/src/H5Dchunk.c
@@ -267,6 +267,8 @@ static herr_t H5D__chunk_flush(H5D_t *dset);
static herr_t H5D__chunk_io_term(const H5D_chunk_map_t *fm);
static herr_t H5D__chunk_dest(H5D_t *dset);
+static herr_t H5D__chunk_index_close(const H5D_t *, bool);
+
/* Chunk query operation callbacks */
static int H5D__get_num_chunks_cb(const H5D_chunk_rec_t *chunk_rec, void *_udata);
static int H5D__get_chunk_info_cb(const H5D_chunk_rec_t *chunk_rec, void *_udata);
@@ -2626,6 +2628,19 @@ H5D__chunk_read(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
chunk_node = H5D_CHUNK_GET_NEXT_NODE(fm, chunk_node);
} /* end while */
+ /* Stopgap fix for VFD SWMR: close the chunk index so that
+ * pinned/tagged entries in the metadata cache (MDC) are released.
+ *
+ * Extensible chunked datasets use extensible arrays or btrees as
+ * chunk indices. Open chunk indices leave pinned/tagged entries
+ * in the MDC, and VFD SWMR cannot (yet) evict or refresh those
+ * entries. After we write refresh routines for those entries, this
+ * stopgap fix can go away.
+ */
+ if(H5D__chunk_index_close(io_info->dset, false) < 0)
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL,
+ "unable to close chunk index")
+
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* H5D__chunk_read() */
@@ -2883,6 +2898,41 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5D__chunk_io_term() */
+/* Close the given dataset's chunk index, or destroy it if `destroy`
+ * is true. A closed index merely releases holds on metadata cache
+ * entries; the index can be reopened. Once a dataset's index is
+ * destroyed, however, the dataset must not try to use the index, again.
+ *
+ * A useful side-effect of closing the chunk index is the release
+ * pinned/tagged metadata cache entries connected with the index.
+ */
+static herr_t
+H5D__chunk_index_close(const H5D_t *dset, bool destroy)
+{
+ H5D_chk_idx_info_t idx_info;
+ H5O_storage_chunk_t *sc = &(dset->shared->layout.storage.u.chunk);
+ herr_t ret_value = SUCCEED; /* Return value */
+ H5D_chunk_close_func_t fn;
+
+ FUNC_ENTER_STATIC
+
+ H5D_CHUNK_STORAGE_INDEX_CHK(sc);
+
+ idx_info.f = dset->oloc.file;
+ idx_info.pline = &dset->shared->dcpl_cache.pline;
+ idx_info.layout = &dset->shared->layout.u.chunk;
+ idx_info.storage = sc;
+
+ fn = destroy ? sc->ops->dest : sc->ops->close;
+
+ if (fn != NULL && (*fn)(&idx_info) < 0) {
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL,
+ "unable to release chunk index info")
+ }
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
/*-------------------------------------------------------------------------
* Function: H5D__chunk_dest
@@ -2900,18 +2950,15 @@ done:
static herr_t
H5D__chunk_dest(H5D_t *dset)
{
- H5D_chk_idx_info_t idx_info; /* Chunked index info */
H5D_rdcc_t *rdcc = &(dset->shared->cache.chunk); /* Dataset's chunk cache */
H5D_rdcc_ent_t *ent = NULL, *next = NULL; /* Pointer to current & next cache entries */
int nerrors = 0; /* Accumulated count of errors */
- H5O_storage_chunk_t *sc = &(dset->shared->layout.storage.u.chunk);
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_STATIC_TAG(dset->oloc.addr)
/* Sanity checks */
HDassert(dset);
- H5D_CHUNK_STORAGE_INDEX_CHK(sc);
/* Flush all the cached chunks */
for(ent = rdcc->head; ent; ent = next) {
@@ -2929,15 +2976,10 @@ H5D__chunk_dest(H5D_t *dset)
rdcc->slot = H5FL_SEQ_FREE(H5D_rdcc_ent_ptr_t, rdcc->slot);
HDmemset(rdcc, 0, sizeof(H5D_rdcc_t));
- /* Compose chunked index info struct */
- idx_info.f = dset->oloc.file;
- idx_info.pline = &dset->shared->dcpl_cache.pline;
- idx_info.layout = &dset->shared->layout.u.chunk;
- idx_info.storage = sc;
-
- /* Free any index structures */
- if(sc->ops->dest && (sc->ops->dest)(&idx_info) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "unable to release chunk index info")
+ if (H5D__chunk_index_close(dset, true) < 0) {
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL,
+ "unable to close chunk index")
+ }
done:
FUNC_LEAVE_NOAPI_TAG(ret_value)
@@ -5696,7 +5738,7 @@ H5D__chunk_addrmap_cb(const H5D_chunk_rec_t *chunk_rec, void *_udata)
/* Set it in the userdata to return */
udata->chunk_addr[chunk_index] = chunk_rec->chunk_addr;
- FUNC_LEAVE_NOAPI(H5_ITER_CONT)
+ FUNC_LEAVE_NOAPI(ret_value)
} /* H5D__chunk_addrmap_cb() */
diff --git a/src/H5Dearray.c b/src/H5Dearray.c
index a53489e..8f34a07 100644
--- a/src/H5Dearray.c
+++ b/src/H5Dearray.c
@@ -165,7 +165,8 @@ const H5D_chunk_ops_t H5D_COPS_EARRAY[1] = {{
H5D__earray_idx_size, /* size */
H5D__earray_idx_reset, /* reset */
H5D__earray_idx_dump, /* dump */
- H5D__earray_idx_dest /* destroy */
+ H5D__earray_idx_dest, /* destroy */
+ H5D__earray_idx_dest /* close (same as destroy) */
}};
diff --git a/src/H5Dfarray.c b/src/H5Dfarray.c
index a9202c2..1417bc2 100644
--- a/src/H5Dfarray.c
+++ b/src/H5Dfarray.c
@@ -161,7 +161,8 @@ const H5D_chunk_ops_t H5D_COPS_FARRAY[1] = {{
H5D__farray_idx_size, /* size */
H5D__farray_idx_reset, /* reset */
H5D__farray_idx_dump, /* dump */
- H5D__farray_idx_dest /* destroy */
+ H5D__farray_idx_dest, /* destroy */
+ NULL /* close */
}};
diff --git a/src/H5Dint.c b/src/H5Dint.c
index c063bb9..8ba9b4f 100644
--- a/src/H5Dint.c
+++ b/src/H5Dint.c
@@ -3421,7 +3421,7 @@ done:
HDONE_ERROR(H5E_DATASET, H5E_BADVALUE, FAIL, "address undefined")
/* Expunge from cache all v1 B-tree type entries associated with tag */
- if(H5AC_expunge_tag_type_metadata(dataset->oloc.file, dataset->oloc.addr, H5AC_BT_ID, H5AC__NO_FLAGS_SET))
+ if(H5AC_expunge_tag_type_metadata(dataset->oloc.file, dataset->oloc.addr, H5AC_BT_ID, H5AC__NO_FLAGS_SET, TRUE))
HDONE_ERROR(H5E_DATASET, H5E_CANTEXPUNGE, FAIL, "unable to expunge index metadata")
} /* end if */
diff --git a/src/H5Dnone.c b/src/H5Dnone.c
index 40ddcb8..e054f08 100644
--- a/src/H5Dnone.c
+++ b/src/H5Dnone.c
@@ -95,7 +95,8 @@ const H5D_chunk_ops_t H5D_COPS_NONE[1] = {{
H5D__none_idx_size, /* size */
H5D__none_idx_reset, /* reset */
H5D__none_idx_dump, /* dump */
- NULL /* dest */
+ NULL, /* dest */
+ NULL /* close */
}};
diff --git a/src/H5Dpkg.h b/src/H5Dpkg.h
index 37a27d3..7f2f18a 100644
--- a/src/H5Dpkg.h
+++ b/src/H5Dpkg.h
@@ -308,7 +308,7 @@ typedef herr_t (*H5D_chunk_size_func_t)(const H5D_chk_idx_info_t *idx_info,
typedef herr_t (*H5D_chunk_reset_func_t)(H5O_storage_chunk_t *storage, hbool_t reset_addr);
typedef herr_t (*H5D_chunk_dump_func_t)(const H5O_storage_chunk_t *storage,
FILE *stream);
-typedef herr_t (*H5D_chunk_dest_func_t)(const H5D_chk_idx_info_t *idx_info);
+typedef herr_t (*H5D_chunk_close_func_t)(const H5D_chk_idx_info_t *idx_info);
/* Typedef for grouping chunk I/O routines */
typedef struct H5D_chunk_ops_t {
@@ -327,7 +327,8 @@ typedef struct H5D_chunk_ops_t {
H5D_chunk_size_func_t size; /* Routine to get size of indexing information */
H5D_chunk_reset_func_t reset; /* Routine to reset indexing information */
H5D_chunk_dump_func_t dump; /* Routine to dump indexing information */
- H5D_chunk_dest_func_t dest; /* Routine to destroy indexing information in memory */
+ H5D_chunk_close_func_t dest; /* Routine to destroy indexing information in memory */
+ H5D_chunk_close_func_t close; /* Routine to destroy indexing information in memory */
} H5D_chunk_ops_t;
/* Structure holding information about a chunk's selection for mapping */
diff --git a/src/H5Dsingle.c b/src/H5Dsingle.c
index 33274bb..3fa9bc2 100644
--- a/src/H5Dsingle.c
+++ b/src/H5Dsingle.c
@@ -97,7 +97,8 @@ const H5D_chunk_ops_t H5D_COPS_SINGLE[1] = {{
H5D__single_idx_size, /* size */
H5D__single_idx_reset, /* reset */
H5D__single_idx_dump, /* dump */
- NULL /* destroy */
+ NULL, /* destroy */
+ NULL /* close */
}};
diff --git a/src/H5Dvirtual.c b/src/H5Dvirtual.c
index e07f538..e0bfb1b 100644
--- a/src/H5Dvirtual.c
+++ b/src/H5Dvirtual.c
@@ -313,6 +313,8 @@ done:
herr_t
H5D_virtual_update_min_dims(H5O_layout_t *layout, size_t idx)
{
+ H5O_storage_virtual_t *virt = &layout->storage.u.virt;
+ H5O_storage_virtual_ent_t *ent = &virt->list[idx];
H5S_sel_type sel_type;
int rank;
hsize_t bounds_start[H5S_MAX_RANK];
@@ -324,10 +326,10 @@ H5D_virtual_update_min_dims(H5O_layout_t *layout, size_t idx)
HDassert(layout);
HDassert(layout->type == H5D_VIRTUAL);
- HDassert(idx < layout->storage.u.virt.list_nalloc);
+ HDassert(idx < virt->list_nalloc);
/* Get type of selection */
- if(H5S_SEL_ERROR == (sel_type = H5S_GET_SELECT_TYPE(layout->storage.u.virt.list[idx].source_dset.virtual_select)))
+ if(H5S_SEL_ERROR == (sel_type = H5S_GET_SELECT_TYPE(ent->source_dset.virtual_select)))
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "unable to get selection type")
/* Do not update min_dims for "all" or "none" selections */
@@ -335,19 +337,19 @@ H5D_virtual_update_min_dims(H5O_layout_t *layout, size_t idx)
HGOTO_DONE(SUCCEED)
/* Get rank of vspace */
- if((rank = H5S_GET_EXTENT_NDIMS(layout->storage.u.virt.list[idx].source_dset.virtual_select)) < 0)
+ if((rank = H5S_GET_EXTENT_NDIMS(ent->source_dset.virtual_select)) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "unable to get number of dimensions")
/* Get selection bounds */
- if(H5S_SELECT_BOUNDS(layout->storage.u.virt.list[idx].source_dset.virtual_select, bounds_start, bounds_end) < 0)
+ if(H5S_SELECT_BOUNDS(ent->source_dset.virtual_select, bounds_start, bounds_end) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "unable to get selection bounds")
/* Update min_dims */
for(i = 0; i < rank; i++)
/* Don't check unlimited dimensions in the selection */
- if((i != layout->storage.u.virt.list[idx].unlim_dim_virtual)
- && (bounds_end[i] >= layout->storage.u.virt.min_dims[i]))
- layout->storage.u.virt.min_dims[i] = bounds_end[i] + (hsize_t)1;
+ if((i != ent->unlim_dim_virtual)
+ && (bounds_end[i] >= virt->min_dims[i]))
+ virt->min_dims[i] = bounds_end[i] + (hsize_t)1;
done:
FUNC_LEAVE_NOAPI(ret_value)
@@ -419,6 +421,7 @@ done:
herr_t
H5D__virtual_store_layout(H5F_t *f, H5O_layout_t *layout)
{
+ H5O_storage_virtual_t *virt = &layout->storage.u.virt;
uint8_t *heap_block = NULL; /* Block to add to heap */
size_t *str_size = NULL; /* Array for VDS entry string lengths */
uint8_t *heap_block_p; /* Pointer into the heap block, while encoding */
@@ -433,16 +436,16 @@ H5D__virtual_store_layout(H5F_t *f, H5O_layout_t *layout)
/* Sanity checking */
HDassert(f);
HDassert(layout);
- HDassert(layout->storage.u.virt.serial_list_hobjid.addr == HADDR_UNDEF);
+ HDassert(virt->serial_list_hobjid.addr == HADDR_UNDEF);
/* Create block if # of used entries > 0 */
- if(layout->storage.u.virt.list_nused > 0) {
+ if(virt->list_nused > 0) {
/* Set the low/high bounds according to 'f' for the API context */
H5CX_set_libver_bounds(f);
/* Allocate array for caching results of strlen */
- if(NULL == (str_size = (size_t *)H5MM_malloc(2 * layout->storage.u.virt.list_nused * sizeof(size_t))))
+ if(NULL == (str_size = (size_t *)H5MM_malloc(2 * virt->list_nused * sizeof(size_t))))
HGOTO_ERROR(H5E_OHDR, H5E_RESOURCE, FAIL, "unable to allocate string length array")
/*
@@ -453,29 +456,30 @@ H5D__virtual_store_layout(H5F_t *f, H5O_layout_t *layout)
block_size = (size_t)1 + H5F_SIZEOF_SIZE(f);
/* Calculate size of each entry */
- for(i = 0; i < layout->storage.u.virt.list_nused; i++) {
+ for(i = 0; i < virt->list_nused; i++) {
+ H5O_storage_virtual_ent_t *ent = &virt->list[i];
hssize_t select_serial_size; /* Size of serialized selection */
- HDassert(layout->storage.u.virt.list[i].source_file_name);
- HDassert(layout->storage.u.virt.list[i].source_dset_name);
- HDassert(layout->storage.u.virt.list[i].source_select);
- HDassert(layout->storage.u.virt.list[i].source_dset.virtual_select);
+ HDassert(ent->source_file_name);
+ HDassert(ent->source_dset_name);
+ HDassert(ent->source_select);
+ HDassert(ent->source_dset.virtual_select);
/* Source file name */
- str_size[2 * i] = HDstrlen(layout->storage.u.virt.list[i].source_file_name) + (size_t)1;
+ str_size[2 * i] = HDstrlen(ent->source_file_name) + (size_t)1;
block_size += str_size[2 * i];
/* Source dset name */
- str_size[(2 * i) + 1] = HDstrlen(layout->storage.u.virt.list[i].source_dset_name) + (size_t)1;
+ str_size[(2 * i) + 1] = HDstrlen(ent->source_dset_name) + (size_t)1;
block_size += str_size[(2 * i) + 1];
/* Source selection */
- if((select_serial_size = H5S_SELECT_SERIAL_SIZE(layout->storage.u.virt.list[i].source_select)) < 0)
+ if((select_serial_size = H5S_SELECT_SERIAL_SIZE(ent->source_select)) < 0)
HGOTO_ERROR(H5E_OHDR, H5E_CANTENCODE, FAIL, "unable to check dataspace selection size")
block_size += (size_t)select_serial_size;
/* Virtual dataset selection */
- if((select_serial_size = H5S_SELECT_SERIAL_SIZE(layout->storage.u.virt.list[i].source_dset.virtual_select)) < 0)
+ if((select_serial_size = H5S_SELECT_SERIAL_SIZE(ent->source_dset.virtual_select)) < 0)
HGOTO_ERROR(H5E_OHDR, H5E_CANTENCODE, FAIL, "unable to check dataspace selection size")
block_size += (size_t)select_serial_size;
} /* end for */
@@ -498,25 +502,26 @@ H5D__virtual_store_layout(H5F_t *f, H5O_layout_t *layout)
*heap_block_p++ = (uint8_t)H5O_LAYOUT_VDS_GH_ENC_VERS;
/* Number of entries */
- tmp_nentries = (hsize_t)layout->storage.u.virt.list_nused;
+ tmp_nentries = (hsize_t)virt->list_nused;
H5F_ENCODE_LENGTH(f, heap_block_p, tmp_nentries)
/* Encode each entry */
- for(i = 0; i < layout->storage.u.virt.list_nused; i++) {
+ for(i = 0; i < virt->list_nused; i++) {
+ H5O_storage_virtual_ent_t *ent = &virt->list[i];
/* Source file name */
- H5MM_memcpy((char *)heap_block_p, layout->storage.u.virt.list[i].source_file_name, str_size[2 * i]);
+ H5MM_memcpy((char *)heap_block_p, ent->source_file_name, str_size[2 * i]);
heap_block_p += str_size[2 * i];
/* Source dataset name */
- H5MM_memcpy((char *)heap_block_p, layout->storage.u.virt.list[i].source_dset_name, str_size[(2 * i) + 1]);
+ H5MM_memcpy((char *)heap_block_p, ent->source_dset_name, str_size[(2 * i) + 1]);
heap_block_p += str_size[(2 * i) + 1];
/* Source selection */
- if(H5S_SELECT_SERIALIZE(layout->storage.u.virt.list[i].source_select, &heap_block_p) < 0)
+ if(H5S_SELECT_SERIALIZE(ent->source_select, &heap_block_p) < 0)
HGOTO_ERROR(H5E_OHDR, H5E_CANTCOPY, FAIL, "unable to serialize source selection")
/* Virtual selection */
- if(H5S_SELECT_SERIALIZE(layout->storage.u.virt.list[i].source_dset.virtual_select, &heap_block_p) < 0)
+ if(H5S_SELECT_SERIALIZE(ent->source_dset.virtual_select, &heap_block_p) < 0)
HGOTO_ERROR(H5E_OHDR, H5E_CANTCOPY, FAIL, "unable to serialize virtual selection")
} /* end for */
@@ -525,7 +530,7 @@ H5D__virtual_store_layout(H5F_t *f, H5O_layout_t *layout)
UINT32ENCODE(heap_block_p, chksum)
/* Insert block into global heap */
- if(H5HG_insert(f, block_size, heap_block, &(layout->storage.u.virt.serial_list_hobjid)) < 0) /* Casting away const OK --NAF */
+ if(H5HG_insert(f, block_size, heap_block, &(virt->serial_list_hobjid)) < 0) /* Casting away const OK --NAF */
HGOTO_ERROR(H5E_OHDR, H5E_CANTINSERT, FAIL, "unable to insert virtual dataset heap block")
} /* end if */
@@ -556,6 +561,7 @@ herr_t
H5D__virtual_copy_layout(H5O_layout_t *layout)
{
H5O_storage_virtual_ent_t *orig_list = NULL;
+ H5O_storage_virtual_t *virt = &layout->storage.u.virt;
hid_t orig_source_fapl;
hid_t orig_source_dapl;
H5P_genplist_t *plist;
@@ -569,127 +575,129 @@ H5D__virtual_copy_layout(H5O_layout_t *layout)
/* Save original entry list and top-level property lists and reset in layout
* so the originals aren't closed on error */
- orig_source_fapl = layout->storage.u.virt.source_fapl;
- layout->storage.u.virt.source_fapl = -1;
- orig_source_dapl = layout->storage.u.virt.source_dapl;
- layout->storage.u.virt.source_dapl = -1;
- orig_list = layout->storage.u.virt.list;
- layout->storage.u.virt.list = NULL;
+ orig_source_fapl = virt->source_fapl;
+ virt->source_fapl = -1;
+ orig_source_dapl = virt->source_dapl;
+ virt->source_dapl = -1;
+ orig_list = virt->list;
+ virt->list = NULL;
/* Copy entry list */
- if(layout->storage.u.virt.list_nused > 0) {
+ if(virt->list_nused > 0) {
HDassert(orig_list);
/* Allocate memory for the list */
- if(NULL == (layout->storage.u.virt.list = (H5O_storage_virtual_ent_t *)H5MM_calloc(layout->storage.u.virt.list_nused * sizeof(H5O_storage_virtual_ent_t))))
+ if(NULL == (virt->list = H5MM_calloc(virt->list_nused * sizeof(virt->list[0]))))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "unable to allocate memory for virtual dataset entry list")
- layout->storage.u.virt.list_nalloc = layout->storage.u.virt.list_nused;
+ virt->list_nalloc = virt->list_nused;
/* Copy the list entries, though set source_dset.dset and sub_dset to
* NULL */
- for(i = 0; i < layout->storage.u.virt.list_nused; i++) {
+ for(i = 0; i < virt->list_nused; i++) {
+ H5O_storage_virtual_ent_t *ent = &virt->list[i];
+
/* Copy virtual selection */
- if(NULL == (layout->storage.u.virt.list[i].source_dset.virtual_select
+ if(NULL == (ent->source_dset.virtual_select
= H5S_copy(orig_list[i].source_dset.virtual_select, FALSE, TRUE)))
HGOTO_ERROR(H5E_DATASET, H5E_CANTCOPY, FAIL, "unable to copy virtual selection")
/* Copy original source names */
- if(NULL == (layout->storage.u.virt.list[i].source_file_name
+ if(NULL == (ent->source_file_name
= H5MM_strdup(orig_list[i].source_file_name)))
HGOTO_ERROR(H5E_DATASET, H5E_RESOURCE, FAIL, "unable to duplicate source file name")
- if(NULL == (layout->storage.u.virt.list[i].source_dset_name
+ if(NULL == (ent->source_dset_name
= H5MM_strdup(orig_list[i].source_dset_name)))
HGOTO_ERROR(H5E_DATASET, H5E_RESOURCE, FAIL, "unable to duplicate source dataset name")
/* Copy source selection */
- if(NULL == (layout->storage.u.virt.list[i].source_select
+ if(NULL == (ent->source_select
= H5S_copy(orig_list[i].source_select, FALSE, TRUE)))
HGOTO_ERROR(H5E_DATASET, H5E_CANTCOPY, FAIL, "unable to copy source selection")
/* Initialize clipped selections */
if(orig_list[i].unlim_dim_virtual < 0) {
- layout->storage.u.virt.list[i].source_dset.clipped_source_select = layout->storage.u.virt.list[i].source_select;
- layout->storage.u.virt.list[i].source_dset.clipped_virtual_select = layout->storage.u.virt.list[i].source_dset.virtual_select;
+ ent->source_dset.clipped_source_select = ent->source_select;
+ ent->source_dset.clipped_virtual_select = ent->source_dset.virtual_select;
} /* end if */
/* Copy parsed names */
- if(H5D__virtual_copy_parsed_name(&layout->storage.u.virt.list[i].parsed_source_file_name, orig_list[i].parsed_source_file_name) < 0)
+ if(H5D__virtual_copy_parsed_name(&ent->parsed_source_file_name, orig_list[i].parsed_source_file_name) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTCOPY, FAIL, "unable to copy parsed source file name")
- layout->storage.u.virt.list[i].psfn_static_strlen = orig_list[i].psfn_static_strlen;
- layout->storage.u.virt.list[i].psfn_nsubs = orig_list[i].psfn_nsubs;
- if(H5D__virtual_copy_parsed_name(&layout->storage.u.virt.list[i].parsed_source_dset_name, orig_list[i].parsed_source_dset_name) < 0)
+ ent->psfn_static_strlen = orig_list[i].psfn_static_strlen;
+ ent->psfn_nsubs = orig_list[i].psfn_nsubs;
+ if(H5D__virtual_copy_parsed_name(&ent->parsed_source_dset_name, orig_list[i].parsed_source_dset_name) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTCOPY, FAIL, "unable to copy parsed source dataset name")
- layout->storage.u.virt.list[i].psdn_static_strlen = orig_list[i].psdn_static_strlen;
- layout->storage.u.virt.list[i].psdn_nsubs = orig_list[i].psdn_nsubs;
+ ent->psdn_static_strlen = orig_list[i].psdn_static_strlen;
+ ent->psdn_nsubs = orig_list[i].psdn_nsubs;
/* Copy source names in source dset or add reference as appropriate
*/
if(orig_list[i].source_dset.file_name) {
if(orig_list[i].source_dset.file_name
== orig_list[i].source_file_name)
- layout->storage.u.virt.list[i].source_dset.file_name = layout->storage.u.virt.list[i].source_file_name;
+ ent->source_dset.file_name = ent->source_file_name;
else if(orig_list[i].parsed_source_file_name
&& (orig_list[i].source_dset.file_name
!= orig_list[i].parsed_source_file_name->name_segment)) {
- HDassert(layout->storage.u.virt.list[i].parsed_source_file_name);
- HDassert(layout->storage.u.virt.list[i].parsed_source_file_name->name_segment);
- layout->storage.u.virt.list[i].source_dset.file_name = layout->storage.u.virt.list[i].parsed_source_file_name->name_segment;
+ HDassert(ent->parsed_source_file_name);
+ HDassert(ent->parsed_source_file_name->name_segment);
+ ent->source_dset.file_name = ent->parsed_source_file_name->name_segment;
} /* end if */
else
- if(NULL == (layout->storage.u.virt.list[i].source_dset.file_name
+ if(NULL == (ent->source_dset.file_name
= H5MM_strdup(orig_list[i].source_dset.file_name)))
HGOTO_ERROR(H5E_DATASET, H5E_RESOURCE, FAIL, "unable to duplicate source file name")
} /* end if */
if(orig_list[i].source_dset.dset_name) {
if(orig_list[i].source_dset.dset_name
== orig_list[i].source_dset_name)
- layout->storage.u.virt.list[i].source_dset.dset_name = layout->storage.u.virt.list[i].source_dset_name;
+ ent->source_dset.dset_name = ent->source_dset_name;
else if(orig_list[i].parsed_source_dset_name
&& (orig_list[i].source_dset.dset_name
!= orig_list[i].parsed_source_dset_name->name_segment)) {
- HDassert(layout->storage.u.virt.list[i].parsed_source_dset_name);
- HDassert(layout->storage.u.virt.list[i].parsed_source_dset_name->name_segment);
- layout->storage.u.virt.list[i].source_dset.dset_name = layout->storage.u.virt.list[i].parsed_source_dset_name->name_segment;
+ HDassert(ent->parsed_source_dset_name);
+ HDassert(ent->parsed_source_dset_name->name_segment);
+ ent->source_dset.dset_name = ent->parsed_source_dset_name->name_segment;
} /* end if */
else
- if(NULL == (layout->storage.u.virt.list[i].source_dset.dset_name
+ if(NULL == (ent->source_dset.dset_name
= H5MM_strdup(orig_list[i].source_dset.dset_name)))
HGOTO_ERROR(H5E_DATASET, H5E_RESOURCE, FAIL, "unable to duplicate source dataset name")
} /* end if */
/* Copy other fields in entry */
- layout->storage.u.virt.list[i].unlim_dim_source = orig_list[i].unlim_dim_source;
- layout->storage.u.virt.list[i].unlim_dim_virtual = orig_list[i].unlim_dim_virtual;
- layout->storage.u.virt.list[i].unlim_extent_source = orig_list[i].unlim_extent_source;
- layout->storage.u.virt.list[i].unlim_extent_virtual = orig_list[i].unlim_extent_virtual;
- layout->storage.u.virt.list[i].clip_size_source = orig_list[i].clip_size_source;
- layout->storage.u.virt.list[i].clip_size_virtual = orig_list[i].clip_size_virtual;
- layout->storage.u.virt.list[i].source_space_status = orig_list[i].source_space_status;
- layout->storage.u.virt.list[i].virtual_space_status = orig_list[i].virtual_space_status;
+ ent->unlim_dim_source = orig_list[i].unlim_dim_source;
+ ent->unlim_dim_virtual = orig_list[i].unlim_dim_virtual;
+ ent->unlim_extent_source = orig_list[i].unlim_extent_source;
+ ent->unlim_extent_virtual = orig_list[i].unlim_extent_virtual;
+ ent->clip_size_source = orig_list[i].clip_size_source;
+ ent->clip_size_virtual = orig_list[i].clip_size_virtual;
+ ent->source_space_status = orig_list[i].source_space_status;
+ ent->virtual_space_status = orig_list[i].virtual_space_status;
} /* end for */
} /* end if */
else {
/* Zero out other fields related to list, just to be sure */
- layout->storage.u.virt.list = NULL;
- layout->storage.u.virt.list_nalloc = 0;
+ virt->list = NULL;
+ virt->list_nalloc = 0;
} /* end else */
/* Copy property lists */
if(orig_source_fapl >= 0) {
if(NULL == (plist = (H5P_genplist_t *)H5I_object_verify(orig_source_fapl, H5I_GENPROP_LST)))
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a property list")
- if((layout->storage.u.virt.source_fapl = H5P_copy_plist(plist, FALSE)) < 0)
+ if((virt->source_fapl = H5P_copy_plist(plist, FALSE)) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTCOPY, FAIL, "can't copy fapl")
} /* end if */
if(orig_source_dapl >= 0) {
if(NULL == (plist = (H5P_genplist_t *)H5I_object_verify(orig_source_dapl, H5I_GENPROP_LST)))
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a property list")
- if((layout->storage.u.virt.source_dapl = H5P_copy_plist(plist, FALSE)) < 0)
+ if((virt->source_dapl = H5P_copy_plist(plist, FALSE)) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTCOPY, FAIL, "can't copy dapl")
} /* end if */
/* New layout is not fully initialized */
- layout->storage.u.virt.init = FALSE;
+ virt->init = FALSE;
done:
/* Release allocated resources on failure */
@@ -721,6 +729,7 @@ herr_t
H5D__virtual_reset_layout(H5O_layout_t *layout)
{
size_t i, j;
+ H5O_storage_virtual_t *virt = &layout->storage.u.virt;
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
@@ -731,53 +740,54 @@ H5D__virtual_reset_layout(H5O_layout_t *layout)
/* Free the list entries. Note we always attempt to free everything even in
* the case of a failure. Because of this, and because we free the list
* afterwards, we do not need to zero out the memory in the list. */
- for(i = 0; i < layout->storage.u.virt.list_nused; i++) {
+ for(i = 0; i < virt->list_nused; i++) {
+ H5O_storage_virtual_ent_t *ent = &virt->list[i];
/* Free source_dset */
- if(H5D__virtual_reset_source_dset(&layout->storage.u.virt.list[i], &layout->storage.u.virt.list[i].source_dset) < 0)
+ if(H5D__virtual_reset_source_dset(ent, &ent->source_dset) < 0)
HDONE_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "unable to reset source dataset")
/* Free original source names */
- (void)H5MM_xfree(layout->storage.u.virt.list[i].source_file_name);
- (void)H5MM_xfree(layout->storage.u.virt.list[i].source_dset_name);
+ (void)H5MM_xfree(ent->source_file_name);
+ (void)H5MM_xfree(ent->source_dset_name);
/* Free sub_dset */
- for(j = 0; j < layout->storage.u.virt.list[i].sub_dset_nalloc; j++)
- if(H5D__virtual_reset_source_dset(&layout->storage.u.virt.list[i], &layout->storage.u.virt.list[i].sub_dset[j]) < 0)
+ for(j = 0; j < ent->sub_dset_nalloc; j++)
+ if(H5D__virtual_reset_source_dset(ent, &ent->sub_dset[j]) < 0)
HDONE_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "unable to reset source dataset")
- layout->storage.u.virt.list[i].sub_dset = (H5O_storage_virtual_srcdset_t *)H5MM_xfree(layout->storage.u.virt.list[i].sub_dset);
+ ent->sub_dset = H5MM_xfree(ent->sub_dset);
/* Free source_select */
- if(layout->storage.u.virt.list[i].source_select)
- if(H5S_close(layout->storage.u.virt.list[i].source_select) < 0)
+ if(ent->source_select)
+ if(H5S_close(ent->source_select) < 0)
HDONE_ERROR(H5E_DATASET, H5E_CLOSEERROR, FAIL, "unable to release source selection")
/* Free parsed_source_file_name */
- H5D_virtual_free_parsed_name(layout->storage.u.virt.list[i].parsed_source_file_name);
+ H5D_virtual_free_parsed_name(ent->parsed_source_file_name);
/* Free parsed_source_dset_name */
- H5D_virtual_free_parsed_name(layout->storage.u.virt.list[i].parsed_source_dset_name);
- } /* end for */
+ H5D_virtual_free_parsed_name(ent->parsed_source_dset_name);
+ }
/* Free the list */
- layout->storage.u.virt.list = (H5O_storage_virtual_ent_t *)H5MM_xfree(layout->storage.u.virt.list);
- layout->storage.u.virt.list_nalloc = (size_t)0;
- layout->storage.u.virt.list_nused = (size_t)0;
- (void)HDmemset(layout->storage.u.virt.min_dims, 0, sizeof(layout->storage.u.virt.min_dims));
+ virt->list = H5MM_xfree(virt->list);
+ virt->list_nalloc = (size_t)0;
+ virt->list_nused = (size_t)0;
+ (void)HDmemset(virt->min_dims, 0, sizeof(virt->min_dims));
/* Close access property lists */
- if(layout->storage.u.virt.source_fapl >= 0) {
- if(H5I_dec_ref(layout->storage.u.virt.source_fapl) < 0)
+ if(virt->source_fapl >= 0) {
+ if(H5I_dec_ref(virt->source_fapl) < 0)
HDONE_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "can't close source fapl")
- layout->storage.u.virt.source_fapl = -1;
- } /* end if */
- if(layout->storage.u.virt.source_dapl >= 0) {
- if(H5I_dec_ref(layout->storage.u.virt.source_dapl) < 0)
+ virt->source_fapl = -1;
+ }
+ if(virt->source_dapl >= 0) {
+ if(H5I_dec_ref(virt->source_dapl) < 0)
HDONE_ERROR(H5E_DATASET, H5E_CANTFREE, FAIL, "can't close source dapl")
- layout->storage.u.virt.source_dapl = -1;
- } /* end if */
+ virt->source_dapl = -1;
+ }
/* The list is no longer initialized */
- layout->storage.u.virt.init = FALSE;
+ virt->init = FALSE;
/* Note the lack of a done: label. This is because there are no HGOTO_ERROR
* calls. If one is added, a done: label must also be added */
@@ -915,7 +925,14 @@ H5D__virtual_open_source_dset(const H5D_t *vdset,
intent = H5F_INTENT(vdset->oloc.file);
/* Try opening the file */
- src_file = H5F_prefix_open_file(vdset->oloc.file, H5F_PREFIX_VDS, vdset->shared->vds_prefix, source_dset->file_name, intent, vdset->shared->layout.storage.u.virt.source_fapl);
+ /* XXX Pass the special file-access property list ID,
+ * H5P_FILE_ACCESS_ANY_VFD, so that if the file is already open in
+ * VFD SWMR mode, the library just creates a new H5F_t for the file
+ * instead of returning an error because of the discrepancy between
+ * the default file-access properties and the already-open file's
+ * VFD SWMR properties.
+ */
+ src_file = H5F_prefix_open_file(vdset->oloc.file, H5F_PREFIX_VDS, vdset->shared->vds_prefix, source_dset->file_name, intent, H5P_FILE_ACCESS_ANY_VFD);
/* If we opened the source file here, we should close it when leaving */
if(src_file)
@@ -2781,7 +2798,7 @@ H5D__virtual_write_one(H5D_io_info_t *io_info, const H5D_type_info_t *type_info,
* extent in the unlimited dimension. -NAF */
/* Project intersection of file space and mapping virtual space onto
* mapping source space */
- if(H5S_select_project_intersection(source_dset->virtual_select, source_dset->clipped_source_select, file_space, &projected_src_space, TRUE) < 0)
+ if(H5S_select_project_intersection(source_dset->clipped_virtual_select, source_dset->clipped_source_select, file_space, &projected_src_space, TRUE) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTCLIP, FAIL, "can't project virtual intersection onto source space")
/* Perform write on source dataset */
diff --git a/src/H5EAcache.c b/src/H5EAcache.c
index affa127..2ae4f84 100644
--- a/src/H5EAcache.c
+++ b/src/H5EAcache.c
@@ -145,6 +145,7 @@ const H5AC_class_t H5AC_EARRAY_HDR[1] = {{
H5EA__cache_hdr_notify, /* 'notify' callback */
H5EA__cache_hdr_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* H5EA index block inherits cache-like properties from H5AC */
@@ -163,6 +164,7 @@ const H5AC_class_t H5AC_EARRAY_IBLOCK[1] = {{
H5EA__cache_iblock_notify, /* 'notify' callback */
H5EA__cache_iblock_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* H5EA super block inherits cache-like properties from H5AC */
@@ -181,6 +183,7 @@ const H5AC_class_t H5AC_EARRAY_SBLOCK[1] = {{
H5EA__cache_sblock_notify, /* 'notify' callback */
H5EA__cache_sblock_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* H5EA data block inherits cache-like properties from H5AC */
@@ -199,6 +202,7 @@ const H5AC_class_t H5AC_EARRAY_DBLOCK[1] = {{
H5EA__cache_dblock_notify, /* 'notify' callback */
H5EA__cache_dblock_free_icr, /* 'free_icr' callback */
H5EA__cache_dblock_fsf_size, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* H5EA data block page inherits cache-like properties from H5AC */
@@ -217,6 +221,7 @@ const H5AC_class_t H5AC_EARRAY_DBLK_PAGE[1] = {{
H5EA__cache_dblk_page_notify, /* 'notify' callback */
H5EA__cache_dblk_page_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
diff --git a/src/H5F.c b/src/H5F.c
index 9d426ac..054c547 100644
--- a/src/H5F.c
+++ b/src/H5F.c
@@ -2044,3 +2044,108 @@ done:
FUNC_LEAVE_API(ret_value)
} /* H5Fset_dset_no_attrs_hint */
+
+/*-------------------------------------------------------------------------
+ * Function: H5Fvfd_swmr_end_tick()
+ *
+ * Purpose: To trigger end of tick processing
+ *
+ * Return: Non-negative on success/Negative on errors
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Fvfd_swmr_end_tick(hid_t file_id)
+{
+ H5VL_object_t *vol_obj = NULL; /* File info */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ /* Note: use the version of FUNC_ENTER_API without EOT processing,
+ * since this routine triggers end-of-tick processing itself */
+ FUNC_ENTER_API_NO_EOT(FAIL)
+ H5TRACE1("e", "i", file_id);
+
+ /* Validate that `file_id` refers to an open file */
+ vol_obj = (H5VL_object_t *)H5I_object_verify(file_id, H5I_FILE);
+ if(NULL == vol_obj)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "invalid file identifier")
+
+ /* TODO: check on this when VFD SWMR goes parallel */
+ /* Set up collective metadata if appropriate */
+ if(H5CX_set_loc(file_id) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "can't set collective metadata read info")
+
+ /* Forward the end-of-tick request through the VOL layer */
+ if(H5VL_file_optional(vol_obj, H5VL_NATIVE_FILE_VFD_SWMR_END_TICK, H5P_DATASET_XFER_DEFAULT, H5_REQUEST_NULL) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "unable to trigger end of tick processing for VFD SWMR")
+
+done:
+ /* Note: use the version of FUNC_LEAVE_API without EOT processing */
+ FUNC_LEAVE_API_NO_EOT(ret_value)
+} /* H5Fvfd_swmr_end_tick() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5Fvfd_swmr_disable_end_of_tick()
+ *
+ * Purpose: Disable end of tick processing
+ *
+ * Return: Non-negative on success/Negative on errors
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Fvfd_swmr_disable_end_of_tick(hid_t file_id)
+{
+
+ H5VL_object_t *vol_obj = NULL; /* File info */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_API(FAIL)
+ H5TRACE1("e", "i", file_id);
+
+ /* Validate that `file_id` refers to an open file */
+ vol_obj = (H5VL_object_t *)H5I_object_verify(file_id, H5I_FILE);
+ if(NULL == vol_obj)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "invalid file identifier")
+
+ /* TODO: check on this when VFD SWMR goes parallel */
+ /* Set up collective metadata if appropriate */
+ if(H5CX_set_loc(file_id) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "can't set collective metadata read info")
+
+ /* Forward the disable-EOT request through the VOL layer */
+ if(H5VL_file_optional(vol_obj, H5VL_NATIVE_FILE_VFD_SWMR_DISABLE_EOT, H5P_DATASET_XFER_DEFAULT, H5_REQUEST_NULL) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "unable to disable EOT for VFD SWMR")
+
+done:
+ FUNC_LEAVE_API(ret_value)
+} /* H5Fvfd_swmr_disable_end_of_tick() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5Fvfd_swmr_enable_end_of_tick()
+ *
+ * Purpose: Enable end of tick processing
+ *
+ * Return: Non-negative on success/Negative on errors
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Fvfd_swmr_enable_end_of_tick(hid_t file_id)
+{
+
+ H5VL_object_t *vol_obj = NULL; /* File info */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_API(FAIL)
+ H5TRACE1("e", "i", file_id);
+
+ /* Validate that `file_id` refers to an open file */
+ vol_obj = (H5VL_object_t *)H5I_object_verify(file_id, H5I_FILE);
+ if(NULL == vol_obj)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "invalid file identifier")
+
+ /* TODO: check on this when VFD SWMR goes parallel */
+ /* Set up collective metadata if appropriate */
+ if(H5CX_set_loc(file_id) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "can't set collective metadata read info")
+
+ /* Forward the enable-EOT request through the VOL layer */
+ if(H5VL_file_optional(vol_obj, H5VL_NATIVE_FILE_VFD_SWMR_ENABLE_EOT, H5P_DATASET_XFER_DEFAULT, H5_REQUEST_NULL) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "unable to enable EOT for VFD SWMR")
+
+done:
+ FUNC_LEAVE_API(ret_value)
+} /* H5Fvfd_swmr_enable_end_of_tick() */
diff --git a/src/H5FAcache.c b/src/H5FAcache.c
index 8f5e696..f2f6990 100644
--- a/src/H5FAcache.c
+++ b/src/H5FAcache.c
@@ -122,6 +122,7 @@ const H5AC_class_t H5AC_FARRAY_HDR[1] = {{
H5FA__cache_hdr_notify, /* 'notify' callback */
H5FA__cache_hdr_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* H5FA data block inherits cache-like properties from H5AC */
@@ -140,6 +141,7 @@ const H5AC_class_t H5AC_FARRAY_DBLOCK[1] = {{
H5FA__cache_dblock_notify, /* 'notify' callback */
H5FA__cache_dblock_free_icr, /* 'free_icr' callback */
H5FA__cache_dblock_fsf_size, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* H5FA data block page inherits cache-like properties from H5AC */
@@ -158,6 +160,7 @@ const H5AC_class_t H5AC_FARRAY_DBLK_PAGE[1] = {{
H5FA__cache_dblk_page_notify, /* 'notify' callback */
H5FA__cache_dblk_page_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
diff --git a/src/H5FD.c b/src/H5FD.c
index 2cd69df..600a825 100644
--- a/src/H5FD.c
+++ b/src/H5FD.c
@@ -92,6 +92,8 @@ hbool_t H5_PKG_INIT_VAR = FALSE;
*/
static unsigned long H5FD_file_serial_no_g;
+static TAILQ_HEAD(_all_vfds, H5FD_t) all_vfds = TAILQ_HEAD_INITIALIZER(all_vfds);
+
/* File driver ID class */
static const H5I_class_t H5I_VFL_CLS[1] = {{
H5I_VFL, /* ID class value */
@@ -677,6 +679,91 @@ done:
FUNC_LEAVE_API(ret_value)
}
+/* If `self` has a de-duplication method, return its result. Otherwise
+ * fall back to H5FDcmp(): return `self` if `other` duplicates it, or
+ * `other` if there is no duplication. A NULL return means `other`
+ * conflicts with `self` or there was an error.
+ *
+ * Unlike H5FD_deduplicate(), this routine does not free `self` under any
+ * circumstances.
+ */
+static H5FD_t *
+H5FD_dedup(H5FD_t *self, H5FD_t *other, hid_t fapl)
+{
+ H5FD_t *(*dedup)(H5FD_t *, H5FD_t *, hid_t);
+
+ /* Prefer the driver's own de-duplication method when it provides one. */
+ if ((dedup = self->cls->dedup) != NULL)
+ return (*dedup)(self, other, fapl);
+
+ /* No driver method: a compare-equal VFD is treated as a duplicate, so
+ * keep `self`; otherwise report "no duplication" by returning `other`.
+ */
+ if (H5FDcmp(self, other) == 0)
+ return self;
+
+ return other;
+}
+
+/* If any other open H5FD_t is functionally equivalent to `file` under
+ * the given file-access properties, then return it and close `file`.
+ *
+ * If any other open H5FD_t is not equivalent to `file`, but its
+ * operation would conflict with `file`, then return NULL and close `file`.
+ */
+H5FD_t *
+H5FD_deduplicate(H5FD_t *file, hid_t fapl)
+{
+ H5FD_t *deduped = file, *item;
+
+ /* Scan every open VFD for one that duplicates (or conflicts with)
+ * `file` under the file-access properties `fapl`.
+ */
+ TAILQ_FOREACH(item, &all_vfds, link) {
+ /* skip "self" */
+ if (item == file)
+ continue;
+
+ /* skip files with exclusive owners, for now */
+ if (item->exc_owner != NULL)
+ continue;
+
+ if ((deduped = H5FD_dedup(item, file, fapl)) != file)
+ goto finish;
+ }
+
+ /* If we reach this stage, then we identified neither a conflict nor a
+ * duplicate. If any lower VFD with an exclusive owner matches `file`,
+ * return NULL to indicate the conflict.
+ */
+ TAILQ_FOREACH(item, &all_vfds, link) {
+ if (item == file || item->exc_owner == NULL)
+ continue;
+
+ if (H5FDcmp(file, item) == 0) {
+ deduped = NULL;
+ break;
+ }
+ }
+
+finish:
+ /* Whenever the caller receives anything other than `file` (an existing
+ * duplicate, or NULL on conflict/error), `file` itself is closed here.
+ */
+ if (deduped != file && H5FD_close(file) < 0) {
+ HERROR(H5E_FILE, H5E_CANTOPENFILE, "could not close file");
+ return NULL;
+ }
+ return deduped;
+}
+
+/* Return `true` if a second H5FD_t identical to `file`
+ * has an exclusive owner, `false` otherwise.
+ */
+bool
+H5FD_has_conflict(H5FD_t *file)
+{
+ H5FD_t *item;
+
+ /* Only VFDs with an exclusive owner can conflict; unowned duplicates
+ * are handled by H5FD_deduplicate() instead.
+ */
+ TAILQ_FOREACH(item, &all_vfds, link) {
+ // skip "self", skip unowned
+ if (item == file || item->exc_owner == NULL)
+ continue;
+ if (H5FDcmp(file, item) == 0)
+ return true;
+ }
+ return false;
+}
+
/*-------------------------------------------------------------------------
* Function: H5FD_open
@@ -693,7 +780,7 @@ H5FD_t *
H5FD_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr)
{
H5FD_class_t *driver; /* VFD for file */
- H5FD_t *file = NULL; /* VFD file struct */
+ H5FD_t *file;
H5FD_driver_prop_t driver_prop; /* Property for driver ID & info */
H5P_genplist_t *plist; /* Property list pointer */
unsigned long driver_flags = 0; /* File-inspecific driver feature flags */
@@ -737,9 +824,14 @@ H5FD_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr)
/* Dispatch to file driver */
if(HADDR_UNDEF == maxaddr)
maxaddr = driver->maxaddr;
+#if 0 /* JRM */
+ HDfprintf(stderr, "H5FD_open(): calling %s.open().\n", driver->name);
+#endif /* JRM */
if(NULL == (file = (driver->open)(name, flags, fapl_id, maxaddr)))
HGOTO_ERROR(H5E_VFL, H5E_CANTINIT, NULL, "open failed")
+ file->exc_owner = NULL;
+
/* Set the file access flags */
file->access_flags = flags;
@@ -771,10 +863,13 @@ H5FD_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr)
/* (This will be changed later, when the superblock is located) */
file->base_addr = 0;
+ TAILQ_INSERT_TAIL(&all_vfds, file, link);
+
/* Set return value */
ret_value = file;
done:
+ /* XXX We leak H5FD_t's on many error conditions. */
/* Can't cleanup 'file' information, since we don't know what type it is */
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_open() */
@@ -829,6 +924,7 @@ herr_t
H5FD_close(H5FD_t *file)
{
const H5FD_class_t *driver;
+ H5FD_t *item;
herr_t ret_value = SUCCEED;
FUNC_ENTER_NOAPI(FAIL)
@@ -842,6 +938,12 @@ H5FD_close(H5FD_t *file)
if(H5I_dec_ref(file->driver_id) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTDEC, FAIL, "can't close driver ID")
+ TAILQ_FOREACH(item, &all_vfds, link) {
+ if (item->exc_owner == file)
+ item->exc_owner = NULL;
+ }
+ TAILQ_REMOVE(&all_vfds, file, link);
+
/* Dispatch to the driver for actual close. If the driver fails to
* close the file then the file will be in an unusable state.
*/
diff --git a/src/H5FDcore.c b/src/H5FDcore.c
index 0551dd0..394380b 100644
--- a/src/H5FDcore.c
+++ b/src/H5FDcore.c
@@ -179,6 +179,7 @@ static const H5FD_class_t H5FD_core_g = {
H5FD__core_truncate, /* truncate */
H5FD_core_lock, /* lock */
H5FD_core_unlock, /* unlock */
+ NULL, /* dedup */
H5FD_FLMAP_DICHOTOMY /* fl_map */
};
diff --git a/src/H5FDfamily.c b/src/H5FDfamily.c
index d110ef7..cc97a0f 100644
--- a/src/H5FDfamily.c
+++ b/src/H5FDfamily.c
@@ -139,6 +139,7 @@ static const H5FD_class_t H5FD_family_g = {
H5FD_family_truncate, /*truncate */
H5FD_family_lock, /*lock */
H5FD_family_unlock, /*unlock */
+ NULL, /*dedup */
H5FD_FLMAP_DICHOTOMY /*fl_map */
};
diff --git a/src/H5FDhdfs.c b/src/H5FDhdfs.c
index 3d086ea..2c06420 100644
--- a/src/H5FDhdfs.c
+++ b/src/H5FDhdfs.c
@@ -521,6 +521,7 @@ static const H5FD_class_t H5FD_hdfs_g = {
H5FD_hdfs_truncate, /* truncate */
H5FD_hdfs_lock, /* lock */
H5FD_hdfs_unlock, /* unlock */
+ NULL, /* dedup */
H5FD_FLMAP_DICHOTOMY /* fl_map */
};
diff --git a/src/H5FDint.c b/src/H5FDint.c
index 8a2148a..97c81ab 100644
--- a/src/H5FDint.c
+++ b/src/H5FDint.c
@@ -183,6 +183,7 @@ H5FD_read(H5FD_t *file, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*o
* objects being written within the file by the application performing
* SWMR write operations.
*/
+#if 0 /* JRM */
if(!(file->access_flags & H5F_ACC_SWMR_READ)) {
haddr_t eoa;
@@ -192,6 +193,7 @@ H5FD_read(H5FD_t *file, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*o
if((addr + file->base_addr + size) > eoa)
HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow, addr = %llu, size = %llu, eoa = %llu", (unsigned long long)(addr + file->base_addr), (unsigned long long)size, (unsigned long long)eoa)
}
+#endif /* JRM */
/* Dispatch to driver */
if((file->cls->read)(file, type, dxpl_id, addr + file->base_addr, size, buf) < 0)
@@ -394,4 +396,3 @@ H5FD_driver_query(const H5FD_class_t *driver, unsigned long *flags/*out*/)
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_driver_query() */
-
diff --git a/src/H5FDlog.c b/src/H5FDlog.c
index 78b7742..0d37260 100644
--- a/src/H5FDlog.c
+++ b/src/H5FDlog.c
@@ -215,6 +215,7 @@ static const H5FD_class_t H5FD_log_g = {
H5FD_log_truncate, /*truncate */
H5FD_log_lock, /*lock */
H5FD_log_unlock, /*unlock */
+ NULL, /*dedup */
H5FD_FLMAP_DICHOTOMY /*fl_map */
};
diff --git a/src/H5FDmulti.c b/src/H5FDmulti.c
index 72f4da5..132c2c3 100644
--- a/src/H5FDmulti.c
+++ b/src/H5FDmulti.c
@@ -170,6 +170,7 @@ static const H5FD_class_t H5FD_multi_g = {
H5FD_multi_truncate, /*truncate */
H5FD_multi_lock, /*lock */
H5FD_multi_unlock, /*unlock */
+ NULL, /*dedup */
H5FD_FLMAP_DEFAULT /*fl_map */
};
diff --git a/src/H5FDpkg.h b/src/H5FDpkg.h
index 22b5d17..903ce06 100644
--- a/src/H5FDpkg.h
+++ b/src/H5FDpkg.h
@@ -55,6 +55,7 @@ H5_DLL herr_t H5FD__free_real(H5FD_t *file, H5FD_mem_t type, haddr_t addr, hsize
/* Testing functions */
#ifdef H5FD_TESTING
H5_DLL hbool_t H5FD__supports_swmr_test(const char *vfd_name);
+H5_DLL herr_t H5FD__vfd_swmr_reader_md_test(H5FD_t *file, unsigned num_entries, H5FD_vfd_swmr_idx_entry_t index[]);
#endif /* H5FD_TESTING */
#endif /* _H5FDpkg_H */
diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h
index 2e3d3ce..9f5309d 100644
--- a/src/H5FDprivate.h
+++ b/src/H5FDprivate.h
@@ -36,7 +36,205 @@
/**************************/
/* Length of filename buffer */
-#define H5FD_MAX_FILENAME_LEN 1024
+#define H5FD_MAX_FILENAME_LEN 1024
+
+/*
+ * VFD SWMR
+ */
+/* Metadata file header */
+#define H5FD_MD_HEADER_OFF 0 /* Header offset in the metadata file */
+#define H5FD_MD_HEADER_MAGIC "VHDR" /* Header magic */
+#define H5FD_SIZEOF_CHKSUM 4 /* Size of checksum */
+
+/* Size of the header in the metadata file */
+#define H5FD_MD_HEADER_SIZE \
+ ( \
+ H5_SIZEOF_MAGIC /* Signature */ \
+ + 4 /* Page size */ \
+ + 8 /* Tick number */ \
+ + 8 /* Index offset */ \
+ + 8 /* Index length */ \
+ + H5FD_SIZEOF_CHKSUM /* Metadata header checksum */ \
+ )
+
+/* Size of an index entry in the metadata file */
+#define H5FD_MD_INDEX_ENTRY_SIZE \
+ ( \
+ 4 /* HDF5 file page offset */ \
+ + 4 /* Metadata file page offset */ \
+ + 4 /* Length */ \
+ + H5FD_SIZEOF_CHKSUM /* Index entry checksum */ \
+ )
+
+/* Metadata file index magic */
+#define H5FD_MD_INDEX_MAGIC "VIDX" /* Index magic */
+
+/* Size of the metadata file index */
+#define H5FD_MD_INDEX_SIZE(N) /* N is number of entries in index */ \
+ ( \
+ H5_SIZEOF_MAGIC /* Signature */ \
+ + 8 /* Tick num */ \
+ + 4 /* Number of entries */ \
+ + (N * H5FD_MD_INDEX_ENTRY_SIZE) /* Index entries */ \
+ + H5FD_SIZEOF_CHKSUM /* Metadata index checksum */ \
+ )
+
+/* Retries for metadata file */
+#define H5FD_VFD_SWMR_MD_FILE_RETRY_MAX 50 /* Maximum retries when opening the MD file */
+#define H5FD_VFD_SWMR_MD_LOAD_RETRY_MAX 120 /* Maximum retries when trying to load the MD file header and index */
+#define H5FD_VFD_SWMR_MD_INDEX_RETRY_MAX 5 /* Maximum retries when deserializing the MD file index */
+
+
+
+/* Internal representation of metadata file index entry */
+
+/*----------------------------------------------------------------------------
+ *
+ * struct H5FD_vfd_swmr_idx_entry_t
+ *
+ * Indices into the VFD SWMR metadata file are maintained in arrays of
+ * instances of H5FD_vfd_swmr_idx_entry_t.
+ *
+ * The fields of H5FD_vfd_swmr_idx_entry_t are discussed below.
+ *
+ * hdf5_page_offset: Unsigned 64-bit value containing the base address of the
+ * metadata page, or multi page metadata entry in the HDF5
+ * file IN PAGES.
+ *
+ * To obtain byte offset, multiply this value by the page size.
+ *
+ * md_file_page_offset: Unsigned 64-bit value containing the base address of
+ * the metadata page, or multi page metadata entry in the metadata
+ * file IN PAGES.
+ *
+ * To obtain byte offset, multiply this value by the page size.
+ *
+ * length: The length of the metadata page or multi-page metadata entry
+ * in BYTES.
+ *
+ * chksum: Checksum for the metadata page or multi-page metadata entry.
+ * For the VFD SWMR writer, this value is undefined until the
+ * referenced entry has been written to the metadata file.
+ *
+ * entry_ptr: Used by the VFD SWMR writer only.
+ *
+ * For the VFD SWMR reader, this field should always be NULL.
+ * If the referenced metadata page or multi-page metadata
+ * entry was modified in the current tick, this field points to
+ * a buffer in the page buffer containing its value.
+ * This field is used by the metadata file creation/update code
+ * to access the metadata pages or multi-page metadata entries
+ * so that their current values can be copied into the metadata
+ * file. After this copy, this field should be set to NULL.
+ *
+ * tick_of_last_change: Number of the last tick in which this index entry
+ * was changed.
+ *
+ * Used by the VFD SWMR writer only.
+ *
+ * For the VFD SWMR reader, this field will always be set to 0.
+ *
+ * clean: Used by the VFD SWMR writer only.
+ *
+ * Set to TRUE whenever the referenced metadata page or
+ * multi-page metadata entry is written to the HDF5 file.
+ * Set to FALSE whenever it is marked dirty in the page buffer.
+ *
+ * tick_of_last_flush: Number of the tick in which this entry was last
+ * written to the lower file or zero if it has never been flushed.
+ *
+ * Used by the VFD SWMR writer only.
+ *
+ * For the VFD SWMR reader, this field should always be 0.
+ *
+ * delayed_flush: If the flush of the referenced metadata page or multi-page
+ * metadata entry must be delayed, the earliest tick in which
+ * it may be flushed, or zero if there is no such constraint.
+ *
+ * Used by the VFD SWMR writer only.
+ *
+ * moved_to_lower_file: Set to TRUE iff the entry referenced is in the
+ * lower file and is therefore about to be removed from the
+ * metadata file
+ *
+ *----------------------------------------------------------------------------
+ */
+typedef struct H5FD_vfd_swmr_idx_entry_t {
+ uint64_t hdf5_page_offset; /* Base address in the HDF5 file, IN PAGES */
+ uint64_t md_file_page_offset; /* Base address in the metadata file, IN PAGES */
+ uint32_t length; /* Entry length IN BYTES */
+ uint32_t chksum; /* Checksum of the entry */
+ void *entry_ptr; /* Writer only: page buffer image, or NULL */
+ uint64_t tick_of_last_change; /* Writer only: tick of last modification */
+ hbool_t clean; /* Writer only: TRUE once written to the HDF5 file */
+ uint64_t tick_of_last_flush; /* Writer only: tick of last flush, 0 if never */
+ uint64_t delayed_flush; /* Writer only: earliest tick the flush may occur, 0 if unconstrained */
+ bool moved_to_lower_file; /* TRUE iff entry now lives in the lower file */
+ bool garbage; /* TRUE iff entry is dead; lookups skip it unless reuse
+ * is requested (see vfd_swmr_pageno_to_mdf_idx_entry()) */
+} H5FD_vfd_swmr_idx_entry_t;
+
+/*
+ * tick_num: Sequence number of the current tick.
+ * Initialized to zero on file creation/open, and incremented by the
+ * VFD SWMR writer at the end of each tick.
+ * num_entries: The number of entries in the index.
+ * entries: The array of index entries
+ */
+typedef struct H5FD_vfd_swmr_md_index {
+ uint64_t tick_num; /* Sequence number of the current tick */
+ uint32_t num_entries; /* Number of entries in `entries` */
+ H5FD_vfd_swmr_idx_entry_t *entries; /* Array of index entries */
+} H5FD_vfd_swmr_md_index;
+
+
+/*
+ * fs_page_size: Size of pages in both the HDF5 file and the metadata file IN BYTES
+ * tick_num: Sequence number of the current tick.
+ * Initialized to zero on file creation/open, and incremented by the
+ * VFD SWMR writer at the end of each tick.
+ * index_offset: The offset of the current metadata file index in the metadata file
+ * IN BYTES.
+ * index_length: The length of the current metadata file index IN BYTES.
+ */
+typedef struct H5FD_vfd_swmr_md_header {
+ uint32_t fs_page_size; /* Page size of HDF5 and metadata files, IN BYTES */
+ uint64_t tick_num; /* Sequence number of the current tick */
+ uint64_t index_offset; /* Offset of the current index in the metadata file, IN BYTES */
+ size_t index_length; /* Length of the current index, IN BYTES */
+} H5FD_vfd_swmr_md_header;
+
+/* Binary-search the index `idx` (`nindices` entries, which must be sorted
+ * in ascending order of hdf5_page_offset) for the entry whose
+ * hdf5_page_offset equals `target_page`.  Return a pointer to that entry,
+ * or NULL if it is absent.  A matching entry marked `garbage` is returned
+ * only when `reuse_garbage` is true.
+ */
+static inline H5FD_vfd_swmr_idx_entry_t *
+vfd_swmr_pageno_to_mdf_idx_entry(H5FD_vfd_swmr_idx_entry_t *idx,
+ uint32_t nindices, uint64_t target_page, bool reuse_garbage)
+{
+ uint32_t top;
+ uint32_t bottom;
+ uint32_t probe;
+
+ if (nindices < 1)
+ return NULL;
+
+ /* Search the half-open interval [bottom, top). */
+ bottom = 0;
+ top = nindices;
+
+ do {
+ probe = (top + bottom) / 2;
+
+ if (idx[probe].hdf5_page_offset < target_page)
+ bottom = probe + 1;
+ else if (idx[probe].hdf5_page_offset > target_page)
+ top = probe;
+ else /* found it */
+ return (reuse_garbage || !idx[probe].garbage) ? &idx[probe] : NULL;
+ } while (bottom < top);
+ /* Previous interval was [top - 1, top] or [bottom, bottom + 1].
+ * The new interval is [top, top] or [bottom, bottom], respectively.
+ * We probed idx[bottom] in the last step, and idx[top] (if it is
+ * not out of bounds) in an earlier round. So there is nothing
+ * to be found at (top + bottom) / 2.
+ */
+ return NULL;
+}
#ifdef H5_HAVE_PARALLEL
/* ======== Temporary data transfer properties ======== */
@@ -119,6 +317,8 @@ H5_DLL herr_t H5FD_free_driver_info(hid_t driver_id, const void *driver_info);
H5_DLL hid_t H5FD_register(const void *cls, size_t size, hbool_t app_ref);
H5_DLL H5FD_t *H5FD_open(const char *name, unsigned flags, hid_t fapl_id,
haddr_t maxaddr);
+bool H5FD_has_conflict(H5FD_t *);
+H5FD_t *H5FD_deduplicate(H5FD_t *, hid_t);
H5_DLL herr_t H5FD_close(H5FD_t *file);
H5_DLL int H5FD_cmp(const H5FD_t *f1, const H5FD_t *f2);
H5_DLL herr_t H5FD_driver_query(const H5FD_class_t *driver, unsigned long *flags/*out*/);
@@ -146,6 +346,18 @@ H5_DLL herr_t H5FD_get_vfd_handle(H5FD_t *file, hid_t fapl, void** file_handle);
H5_DLL herr_t H5FD_set_base_addr(H5FD_t *file, haddr_t base_addr);
H5_DLL haddr_t H5FD_get_base_addr(const H5FD_t *file);
H5_DLL herr_t H5FD_set_paged_aggr(H5FD_t *file, hbool_t paged);
+H5_DLL herr_t H5FD_get_driver_name(const H5FD_t *file, char **driver_name);
+
+/* Function prototypes for VFD SWMR */
+H5_DLL int shadow_image_defer_free(struct H5F_shared_t *,
+ const H5FD_vfd_swmr_idx_entry_t *);
+H5_DLL herr_t H5FD_vfd_swmr_get_tick_and_idx(H5FD_t *_file, hbool_t read_index,
+ uint64_t *tick_ptr, uint32_t *num_entries_ptr,
+ H5FD_vfd_swmr_idx_entry_t index[]);
+H5_DLL H5FD_vfd_swmr_idx_entry_t *vfd_swmr_enlarge_shadow_index(struct H5F_t *);
+H5_DLL void H5FD_vfd_swmr_dump_status(H5FD_t *, uint64_t);
+H5_DLL void H5FD_vfd_swmr_set_pb_configured(H5FD_t *_file);
+H5_DLL void H5FD_vfd_swmr_record_elapsed_ticks(H5FD_t *, uint64_t);
/* Function prototypes for MPI based VFDs*/
#ifdef H5_HAVE_PARALLEL
diff --git a/src/H5FDpublic.h b/src/H5FDpublic.h
index 61bf212..a921c29 100644
--- a/src/H5FDpublic.h
+++ b/src/H5FDpublic.h
@@ -18,6 +18,7 @@
#ifndef _H5FDpublic_H
#define _H5FDpublic_H
+#include "H5queue.h"
#include "H5public.h"
#include "H5Fpublic.h" /*for H5F_close_degree_t */
@@ -302,6 +303,7 @@ typedef struct H5FD_class_t {
herr_t (*truncate)(H5FD_t *file, hid_t dxpl_id, hbool_t closing);
herr_t (*lock)(H5FD_t *file, hbool_t rw);
herr_t (*unlock)(H5FD_t *file);
+ H5FD_t *(*dedup)(H5FD_t *, H5FD_t *, hid_t);
H5FD_mem_t fl_map[H5FD_MEM_NTYPES];
} H5FD_class_t;
@@ -319,6 +321,11 @@ typedef struct H5FD_free_t {
struct H5FD_t {
hid_t driver_id; /*driver ID for this file */
const H5FD_class_t *cls; /*constant class info */
+
+ TAILQ_ENTRY(H5FD_t) link; /* Linkage for list of all VFs. */
+ H5FD_t *exc_owner; /* Pointer to an exclusive owner
+ * or NULL if none.
+ */
unsigned long fileno; /* File 'serial' number */
unsigned access_flags; /* File access flags (from create or open) */
unsigned long feature_flags; /* VFL Driver feature Flags */
diff --git a/src/H5FDsec2.c b/src/H5FDsec2.c
index 3551905..37a6ae9 100644
--- a/src/H5FDsec2.c
+++ b/src/H5FDsec2.c
@@ -171,6 +171,7 @@ static const H5FD_class_t H5FD_sec2_g = {
H5FD_sec2_truncate, /* truncate */
H5FD_sec2_lock, /* lock */
H5FD_sec2_unlock, /* unlock */
+ NULL, /* dedup */
H5FD_FLMAP_DICHOTOMY /* fl_map */
};
diff --git a/src/H5FDsplitter.c b/src/H5FDsplitter.c
index 4ed3c4a..fae4bb4 100644
--- a/src/H5FDsplitter.c
+++ b/src/H5FDsplitter.c
@@ -164,6 +164,7 @@ static const H5FD_class_t H5FD_splitter_g = {
H5FD_splitter_truncate, /* truncate */
H5FD_splitter_lock, /* lock */
H5FD_splitter_unlock, /* unlock */
+ NULL, /* dedup */
H5FD_FLMAP_DICHOTOMY /* fl_map */
};
diff --git a/src/H5FDstdio.c b/src/H5FDstdio.c
index d29a1b4..3135709 100644
--- a/src/H5FDstdio.c
+++ b/src/H5FDstdio.c
@@ -209,6 +209,7 @@ static const H5FD_class_t H5FD_stdio_g = {
H5FD_stdio_truncate, /* truncate */
H5FD_stdio_lock, /* lock */
H5FD_stdio_unlock, /* unlock */
+ NULL, /* dedup */
H5FD_FLMAP_DICHOTOMY /* fl_map */
};
diff --git a/src/H5FDtest.c b/src/H5FDtest.c
index 2eb176d..53e31c7 100644
--- a/src/H5FDtest.c
+++ b/src/H5FDtest.c
@@ -25,8 +25,8 @@
/* Module Setup */
/****************/
-#include "H5FDmodule.h" /* This source code file is part of the H5FD module */
-#define H5FD_TESTING /* Suppress warning about H5FD testing funcs */
+#include "H5FDmodule.h" /* This source code file is part of the H5FD module */
+#define H5FD_TESTING /* Suppress warning about H5FD testing funcs */
/***********/
@@ -34,6 +34,8 @@
/***********/
#include "H5private.h" /* Generic Functions */
#include "H5FDpkg.h" /* File Drivers */
+#include "H5FLprivate.h" /* Free Lists */
+#include "H5Eprivate.h" /* Error handling */
/****************/
/* Local Macros */
@@ -63,6 +65,8 @@
/*****************************/
/* Library Private Variables */
/*****************************/
+/* Declare external the free list for H5FD_vfd_swmr_idx_entry_t */
+H5FL_SEQ_EXTERN(H5FD_vfd_swmr_idx_entry_t);
/*******************/
@@ -113,3 +117,67 @@ H5FD__supports_swmr_test(const char *vfd_name)
} /* end H5FD__supports_swmr_test() */
+/*
+ * Tests for VFD SWMR
+ */
+/*-------------------------------------------------------------------------
+ * Function: H5FD__vfd_swmr_md_test
+ *
+ * Purpose: Verify the info obtained from the driver's local copy is as
+ * indicated by the parameter: num_entries and index
+ *
+ * Return: SUCCEED/FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5FD__vfd_swmr_reader_md_test(H5FD_t *file, unsigned num_entries, H5FD_vfd_swmr_idx_entry_t index[])
+{
+ unsigned vfd_num_entries = 0;
+ H5FD_vfd_swmr_idx_entry_t *vfd_index = NULL;
+ unsigned i;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI_NOINIT
+
+ /* Retrieve index from VFD SWMR driver */
+ /* Initial call to get # of entries (vfd_index is still NULL here, so
+ * only the count is requested) */
+ if(H5FD_vfd_swmr_get_tick_and_idx(file, TRUE, NULL, &vfd_num_entries, vfd_index) < 0)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "Error in retrieving index from driver")
+
+ /* Verify number of index entries */
+ if(vfd_num_entries != num_entries)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "Error in retrieving index from driver")
+
+ if(vfd_num_entries) {
+ /* Allocate memory for index entries (from the free-list sequence for
+ * H5FD_vfd_swmr_idx_entry_t; released in `done`) */
+ if(NULL == (vfd_index = H5FL_SEQ_MALLOC(H5FD_vfd_swmr_idx_entry_t, vfd_num_entries)))
+ HGOTO_ERROR(H5E_VFL, H5E_CANTALLOC, FAIL, "memory allocation failed for index entries")
+
+ /* Second call to retrieve the index */
+ if(H5FD_vfd_swmr_get_tick_and_idx(file, FALSE, NULL, &vfd_num_entries, vfd_index) < 0)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "Error in retrieving index from driver")
+
+ /* Verify index entries against the expected values supplied by the
+ * caller */
+ for(i = 0; i < vfd_num_entries; i++) {
+ if(vfd_index[i].length != index[i].length)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect length read from metadata file")
+
+ if(vfd_index[i].hdf5_page_offset != index[i].hdf5_page_offset)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect hdf5_page_offset read from metadata file")
+
+ if(vfd_index[i].md_file_page_offset != index[i].md_file_page_offset)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect md_file_page_offset read from metadata file")
+
+ if(vfd_index[i].chksum != index[i].chksum)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect chksum read from metadata file")
+ }
+ }
+
+done:
+ /* Free local copy of index entries */
+ if(vfd_num_entries && vfd_index)
+ vfd_index = (H5FD_vfd_swmr_idx_entry_t *)H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t, vfd_index);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5FD__vfd_swmr_reader_md_test() */
diff --git a/src/H5FDvfd_swmr.c b/src/H5FDvfd_swmr.c
new file mode 100644
index 0000000..f0e0cfd
--- /dev/null
+++ b/src/H5FDvfd_swmr.c
@@ -0,0 +1,1607 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ *                                                                           *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*
+ * Purpose: VFD SWMR driver for the reader
+ */
+
+#include "H5FDdrvr_module.h" /* This source code file is part of the H5FD driver module */
+
+
+#include "H5Eprivate.h" /* Error handling */
+#include "H5Fprivate.h" /* File access */
+#include "H5FDprivate.h" /* File drivers */
+#include "H5FDvfd_swmr.h" /* VFD SWMR file driver */
+#include "H5FLprivate.h" /* Free Lists */
+#include "H5Iprivate.h" /* IDs */
+#include "H5MMprivate.h" /* Memory management */
+#include "H5Pprivate.h" /* Property lists */
+#include "H5retry_private.h"/* Retry loops. */
+
+/* The driver identification number, initialized at runtime */
+static hid_t H5FD_VFD_SWMR_g = 0;
+
+typedef struct H5FD_vfd_swmr_t {
+    H5FD_t pub;                         /* public stuff, must be */
+                                        /* first */
+
+    /* HDF5 file */
+    char hdf5_filename[H5FD_MAX_FILENAME_LEN]; /* Name of the HDF5 file from */
+                                        /* open */
+    H5FD_t *hdf5_file_lf;               /* Driver info for the HDF5 */
+                                        /* file (the "lower" VFD) */
+
+    /* Metadata file */
+    int md_fd;                          /* File descriptor for the */
+                                        /* metadata file */
+    uint32_t md_pages_reserved;         /* # of pages reserved at the */
+                                        /* head of the metadata file */
+    char md_file_path[H5FD_MAX_FILENAME_LEN]; /* Name of the metadata file */
+    H5FD_vfd_swmr_md_header md_header;  /* Metadata file header */
+    H5FD_vfd_swmr_md_index md_index;    /* Metadata file index */
+
+    uint32_t api_elapsed_nslots;        /* # of histogram slots below; */
+                                        /* set to config.max_lag + 1 */
+    uint64_t *api_elapsed_ticks;        /* Histogram of ticks elapsed
+                                         * inside the API (reader only).
+                                         */
+    hbool_t pb_configured;              /* boolean flag set to TRUE */
+                                        /* when the page buffer is */
+                                        /* configured, FALSE otherwise. */
+                                        /* Used for sanity checking. */
+    H5F_vfd_swmr_config_t config;       /* VFD SWMR config from the fapl */
+    bool writer;                        /* True iff configured to write. */
+} H5FD_vfd_swmr_t;
+
+#define MAXADDR (((haddr_t)1<<(8*sizeof(HDoff_t)-1))-1)
+
+/* Prototypes */
+static herr_t H5FD_vfd_swmr_term(void);
+static H5FD_t *H5FD_vfd_swmr_open(const char *name, unsigned flags,
+ hid_t fapl_id, haddr_t maxaddr);
+static herr_t H5FD_vfd_swmr_close(H5FD_t *_file);
+static int H5FD_vfd_swmr_cmp(const H5FD_t *_f1, const H5FD_t *_f2);
+static H5FD_t *H5FD_vfd_swmr_dedup(H5FD_t *, H5FD_t *, hid_t);
+static herr_t H5FD_vfd_swmr_query(const H5FD_t *_f1, unsigned long *flags);
+static haddr_t H5FD_vfd_swmr_get_eoa(const H5FD_t *_file, H5FD_mem_t type);
+static herr_t H5FD_vfd_swmr_set_eoa(H5FD_t *_file, H5FD_mem_t type,
+ haddr_t addr);
+static haddr_t H5FD_vfd_swmr_get_eof(const H5FD_t *_file, H5FD_mem_t type);
+static herr_t H5FD_vfd_swmr_get_handle(H5FD_t *_file, hid_t fapl,
+ void** file_handle);
+static herr_t H5FD_vfd_swmr_read(H5FD_t *_file, H5FD_mem_t type,
+ hid_t fapl_id, haddr_t addr, size_t size, void *buf);
+static herr_t H5FD_vfd_swmr_write(H5FD_t *_file, H5FD_mem_t type,
+ hid_t fapl_id, haddr_t addr, size_t size, const void *buf);
+static herr_t H5FD_vfd_swmr_truncate(H5FD_t *_file, hid_t dxpl_id,
+ hbool_t closing);
+static herr_t H5FD_vfd_swmr_lock(H5FD_t *_file, hbool_t rw);
+static herr_t H5FD_vfd_swmr_unlock(H5FD_t *_file);
+
+/* VFD SWMR */
+static htri_t H5FD__vfd_swmr_header_deserialize(H5FD_vfd_swmr_t *,
+ H5FD_vfd_swmr_md_header *);
+static htri_t H5FD__vfd_swmr_index_deserialize(const H5FD_vfd_swmr_t *file,
+ H5FD_vfd_swmr_md_index *md_index, const H5FD_vfd_swmr_md_header *md_header);
+static herr_t H5FD__vfd_swmr_load_hdr_and_idx(H5FD_vfd_swmr_t *, hbool_t);
+
+HLOG_OUTLET_SHORT_DEFN(index_motion, swmr);
+HLOG_OUTLET_SHORT_DEFN(swmr_stats, swmr);
+HLOG_OUTLET_SHORT_DEFN(swmr_read, swmr);
+HLOG_OUTLET_SHORT_DEFN(swmr_read_exception, swmr_read);
+HLOG_OUTLET_MEDIUM_DEFN(swmr_read_err, swmr_read_exception, HLOG_OUTLET_S_ON);
+
+/* VFD SWMR driver class table.  Reads are intercepted to consult the
+ * shadow (metadata) file index; most other callbacks pass through to the
+ * lower VFD.  Optional callbacks the driver does not need are NULL.
+ */
+static const H5FD_class_t H5FD_vfd_swmr_g = {
+    "vfd_swmr", /* name */
+    MAXADDR, /* maxaddr */
+    H5F_CLOSE_WEAK, /* fc_degree */
+    H5FD_vfd_swmr_term, /* terminate */
+    NULL, /* sb_size */
+    NULL, /* sb_encode */
+    NULL, /* sb_decode */
+    0, /* fapl_size */
+    NULL, /* fapl_get */
+    NULL, /* fapl_copy */
+    NULL, /* fapl_free */
+    0, /* dxpl_size */
+    NULL, /* dxpl_copy */
+    NULL, /* dxpl_free */
+    H5FD_vfd_swmr_open, /* open */
+    H5FD_vfd_swmr_close, /* close */
+    H5FD_vfd_swmr_cmp, /* cmp */
+    H5FD_vfd_swmr_query, /* query */
+    NULL, /* get_type_map */
+    NULL, /* alloc */
+    NULL, /* free */
+    H5FD_vfd_swmr_get_eoa, /* get_eoa */
+    H5FD_vfd_swmr_set_eoa, /* set_eoa */
+    H5FD_vfd_swmr_get_eof, /* get_eof */
+    H5FD_vfd_swmr_get_handle, /* get_handle */
+    H5FD_vfd_swmr_read, /* read */
+    H5FD_vfd_swmr_write, /* write */
+    NULL, /* flush */
+    H5FD_vfd_swmr_truncate, /* truncate */
+    H5FD_vfd_swmr_lock, /* lock */
+    H5FD_vfd_swmr_unlock, /* unlock */
+    H5FD_vfd_swmr_dedup, /* dedup */
+    H5FD_FLMAP_DICHOTOMY /* fl_map */
+};
+
+/* Declare a free list to manage the H5FD_vfd_swmr_t struct */
+H5FL_DEFINE_STATIC(H5FD_vfd_swmr_t);
+
+/* Declare a free list to manage the H5FD_vfd_swmr_idx_entry_t sequence information */
+H5FL_SEQ_DEFINE(H5FD_vfd_swmr_idx_entry_t);
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD__init_package
+ *
+ * Purpose: Initializes any interface-specific data or routines.
+ *
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__init_package(void)
+{
+    herr_t ret_value = SUCCEED;
+
+    FUNC_ENTER_STATIC
+
+    /* Register the VFD SWMR driver with the library (no-op if already
+     * registered -- H5FD_vfd_swmr_init() checks for that).
+     */
+    if(H5FD_vfd_swmr_init() < 0)
+        HGOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "unable to initialize swmr VFD")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* H5FD__init_package() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_init
+ *
+ * Purpose: Initialize this driver by registering the driver with the
+ * library.
+ *
+ * Return: Success: The driver ID for the VFD SWMR driver.
+ * Failure: Negative
+ *
+ * Programmer: Robb Matzke
+ * Thursday, July 29, 1999
+ *
+ *-------------------------------------------------------------------------
+ */
+hid_t
+H5FD_vfd_swmr_init(void)
+{
+    hid_t ret_value = H5I_INVALID_HID; /* Return value */
+
+    /* Use H5I_INVALID_HID (not FAIL) as the macro's failure value: this
+     * routine returns an hid_t, not an herr_t, and this matches the
+     * initialization of ret_value above.
+     */
+    FUNC_ENTER_NOAPI(H5I_INVALID_HID)
+
+    /* Register the driver the first time through; on subsequent calls the
+     * existing ID is returned unchanged.
+     */
+    if(H5I_VFL != H5I_get_type(H5FD_VFD_SWMR_g))
+        H5FD_VFD_SWMR_g = H5FD_register(&H5FD_vfd_swmr_g, sizeof(H5FD_class_t), FALSE);
+
+    /* Set return value */
+    ret_value = H5FD_VFD_SWMR_g;
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_vfd_swmr_init() */
+
+
+/*---------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_term
+ *
+ * Purpose: Shut down the VFD
+ *
+ * Returns: SUCCEED (Can't fail)
+ *
+ * Programmer: Quincey Koziol
+ * Friday, Jan 30, 2004
+ *
+ *---------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_vfd_swmr_term(void)
+{
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+    /* Reset VFL ID so a later H5FD_vfd_swmr_init() re-registers the driver */
+    H5FD_VFD_SWMR_g = 0;
+
+    FUNC_LEAVE_NOAPI(SUCCEED)
+} /* end H5FD_vfd_swmr_term() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5Pset_fapl_vfd_swmr (Not yet)
+ *
+ * Purpose: Modify the file access property list to use the H5FD_SWMR
+ * driver
+ *
+ * Return: SUCCEED/FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Pset_fapl_vfd_swmr(hid_t fapl_id)
+{
+    H5P_genplist_t *plist; /* Property list pointer */
+    herr_t ret_value;      /* Assigned by H5P_set_driver below; set to FAIL
+                            * by HGOTO_ERROR on the error path */
+
+    FUNC_ENTER_API(FAIL)
+    H5TRACE1("e", "i", fapl_id);
+
+    /* Check arguments: fapl_id must identify a file access property list */
+    if(NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
+
+    /* Install this VFD on the fapl (no driver-specific info) */
+    ret_value = H5P_set_driver(plist, H5FD_VFD_SWMR, NULL);
+
+done:
+    FUNC_LEAVE_API(ret_value)
+} /* end H5Pset_fapl_vfd_swmr() */
+
+/* Reader-specific portion of open: allocate the API-elapsed-tick
+ * histogram, open the metadata (shadow) file with retries, and load and
+ * decode its header and index.  Returns SUCCEED/FAIL.
+ */
+static herr_t
+H5FD__swmr_reader_open(H5FD_vfd_swmr_t *file)
+{
+    h5_retry_t retry;              /* retry state */
+    bool do_try;                   /* more tries remain */
+    herr_t ret_value = SUCCEED;
+    FUNC_ENTER_STATIC
+
+    /* One histogram slot per possible tick of lag, plus one */
+    file->api_elapsed_nslots = file->config.max_lag + 1;
+
+    file->api_elapsed_ticks =
+        calloc(file->api_elapsed_nslots, sizeof(*file->api_elapsed_ticks));
+
+    if (file->api_elapsed_ticks == NULL) {
+        HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL,
+            "could not allocate API elapsed ticks");
+    }
+
+    /* Retry on opening the metadata file: the writer may not have created
+     * it yet when the reader starts up.
+     */
+    for (do_try = h5_retry_init(&retry, H5FD_VFD_SWMR_MD_FILE_RETRY_MAX,
+                                H5_RETRY_DEFAULT_MINIVAL,
+                                H5_RETRY_DEFAULT_MAXIVAL);
+         do_try;
+         do_try = h5_retry_next(&retry)) {
+        if((file->md_fd = HDopen(file->md_file_path, O_RDONLY)) >= 0)
+            break;
+    }
+
+    /* Exhaust all retries for opening the md file */
+    if(!do_try)
+        HGOTO_ERROR(H5E_VFL, H5E_OPENERROR, FAIL,
+            "unable to open the metadata file after all retry attempts");
+
+    /* Retry on loading and decoding the header and index in the
+     * metadata file
+     */
+    if(H5FD__vfd_swmr_load_hdr_and_idx(file, TRUE) < 0)
+        HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+            "unable to load/decode the md file header/index");
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+}
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_open
+ *
+ * Purpose: Open the metadata file and the underlying HDF5 file
+ *
+ * Return: Success: A pointer to a new file data structure. The
+ * public fields will be initialized by the
+ * caller, which is always H5FD_open().
+ * Failure: NULL
+ *
+ *-------------------------------------------------------------------------
+ */
+static H5FD_t *
+H5FD_vfd_swmr_open(const char *name, unsigned flags, hid_t fapl_id,
+    haddr_t maxaddr)
+{
+    H5FD_vfd_swmr_t *file = NULL;
+    size_t page_buf_size;
+    H5P_genplist_t *plist;
+    H5F_vfd_swmr_config_t *vfd_swmr_config;
+    H5FD_t *ret_value = NULL; /* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    /* Get file access property list */
+    if(NULL == (plist = (H5P_genplist_t *)H5I_object(fapl_id))) {
+        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL,
+            "not a file access property list");
+    }
+
+    if (H5P_get(plist, H5F_ACS_PAGE_BUFFER_SIZE_NAME, &page_buf_size) < 0)
+        HGOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "can't get page buffer size");
+
+    /* Paged allocation, too, has to be enabled, but the page buffer
+     * initialization (H5PB_create) will detect a conflicting configuration
+     * and return an error.
+     */
+    if (page_buf_size == 0) {
+        HGOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL,
+            "page buffering must be enabled");
+    }
+
+    /* Create the new driver struct (zero-filled by H5FL_CALLOC) */
+    if(NULL == (file = H5FL_CALLOC(H5FD_vfd_swmr_t))) {
+        HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL,
+            "unable to allocate file struct");
+    }
+
+    vfd_swmr_config = &file->config;
+
+    /* Get VFD SWMR configuration (copied directly into file->config) */
+    if(H5P_get(plist, H5F_ACS_VFD_SWMR_CONFIG_NAME, vfd_swmr_config) < 0) {
+        HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL,
+            "can't get VFD SWMR config info");
+    }
+
+    file->md_fd = -1;
+    file->hdf5_file_lf = NULL;
+    file->md_pages_reserved = vfd_swmr_config->md_pages_reserved;
+
+    /* Retain a copy of the name used to open the HDF5 file; the explicit
+     * terminator guards against HDstrncpy leaving the buffer unterminated.
+     */
+    HDstrncpy(file->hdf5_filename, name, sizeof(file->hdf5_filename));
+    file->hdf5_filename[sizeof(file->hdf5_filename) - 1] = '\0';
+
+    /* Retain a copy of the metadata file name */
+    HDstrncpy(file->md_file_path, vfd_swmr_config->md_file_path,
+        sizeof(file->md_file_path));
+    file->md_file_path[sizeof(file->md_file_path) - 1] = '\0';
+
+    file->writer = vfd_swmr_config->writer;
+
+    /* Reader-only setup: open the metadata file and load its header/index */
+    if (!vfd_swmr_config->writer && H5FD__swmr_reader_open(file) < 0) {
+        HGOTO_ERROR(H5E_VFL, H5E_OPENERROR, NULL,
+            "perform reader-specific opening steps failed");
+    }
+
+    /* Hard-wired to open the underlying HDF5 file with SEC2
+     * (H5P_FILE_ACCESS_DEFAULT).
+     */
+    if((file->hdf5_file_lf = H5FD_open(name, flags, H5P_FILE_ACCESS_DEFAULT,
+        maxaddr)) == NULL)
+        HGOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "can't set driver info");
+
+    /* Claim exclusive ownership of the lower file */
+    file->hdf5_file_lf->exc_owner = &file->pub;
+
+    /* set pb_configured to FALSE.  This field should not exist, but
+     * until we modify the file open procedure to create the page buffer
+     * before there is any file I/O when opening a file VFD SWMR reader,
+     * we need to be able to turn off sanity checking in the read function
+     * until the page buffer is enabled.  This field exists for this
+     * purpose, and should be removed when it is no longer necessary.
+     *
+     * JRM -- 1/29/19
+     */
+    file->pb_configured = FALSE;
+
+    /* Set return value */
+    ret_value = &file->pub;
+
+done:
+    /* Handle closing if error: H5FD_vfd_swmr_close releases everything
+     * allocated above, including reader-side state.
+     */
+    if(NULL == ret_value && file) {
+
+        if(H5FD_vfd_swmr_close(&file->pub) < 0)
+
+            HDONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, NULL, "error from closing")
+
+    } /* end if */
+
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_vfd_swmr_open() */
+
+/* Reader-side teardown: log and free the API tick histogram, close the
+ * metadata file, and free the index entries.  Errors are pushed on the
+ * error stack but do not abort the close.
+ */
+static void
+swmr_reader_close(H5FD_vfd_swmr_t *file)
+{
+    vfd_swmr_reader_did_increase_tick_to(0);
+
+    /* Dump the histogram of ticks elapsed inside the API, then free it */
+    if (file->api_elapsed_ticks != NULL) {
+        uint32_t i;
+        for (i = 0; i < file->api_elapsed_nslots; i++) {
+            hlog_fast(swmr_stats,
+                "%s: %" PRIu32 " ticks elapsed in API %" PRIu64 " times",
+                __func__, i, file->api_elapsed_ticks[i]);
+        }
+        free(file->api_elapsed_ticks);
+    }
+
+    /* Close the metadata file */
+    if(file->md_fd >= 0 && HDclose(file->md_fd) < 0) {
+        /* Push error, but keep going */
+        HERROR(H5E_FILE, H5E_CANTCLOSEFILE,
+            "unable to close the metadata file");
+    }
+
+    /* Free the index entries */
+    if(file->md_index.num_entries && file->md_index.entries)
+        file->md_index.entries = H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t,
+            file->md_index.entries);
+}
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_close
+ *
+ * Purpose: Handle closing for VFD SWMR driver
+ * --close the underlying HDF5 file
+ * --close the metadata file if open
+ * --free the index entries if available
+ *
+ * Return: Success: SUCCEED
+ * Failure: FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_vfd_swmr_close(H5FD_t *_file)
+{
+    H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file;
+    herr_t ret_value = SUCCEED; /* Return value; set to FAIL by HDONE_ERROR */
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    if (file->hdf5_file_lf != NULL) {
+        /* Relinquish exclusive ownership of the lower file before closing */
+        if (file->hdf5_file_lf->exc_owner != NULL) {
+            assert(file->hdf5_file_lf->exc_owner == &file->pub);
+            file->hdf5_file_lf->exc_owner = NULL;
+        }
+
+        /* Close the underlying file */
+        if (H5FD_close(file->hdf5_file_lf) < 0)
+            /* Push error, but keep going */
+            HDONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, \
+                "unable to close the HDF5 file")
+    }
+
+    /* Readers also own the metadata file and index; release them */
+    if (!file->writer)
+        (void)swmr_reader_close(file);
+
+    /* Release the driver info */
+    file = H5FL_FREE(H5FD_vfd_swmr_t, file);
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5FD_vfd_swmr_close() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_cmp
+ *
+ * Purpose: Compares two files belonging to this driver using an
+ * arbitrary (but consistent) ordering.
+ *
+ * Return: Success: A value like strcmp()
+ * Failure: never fails (arguments were checked by the
+ * caller).
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+H5FD_vfd_swmr_cmp(const H5FD_t *_f1, const H5FD_t *_f2)
+{
+    const H5FD_vfd_swmr_t *f1 = (const H5FD_vfd_swmr_t *)_f1;
+    const H5FD_vfd_swmr_t *f2 = (const H5FD_vfd_swmr_t *)_f2;
+    int ret_value = 0;
+
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+    /* Two VFD SWMR files compare as their underlying (lower) HDF5 files do */
+    ret_value = H5FD_cmp(f1->hdf5_file_lf, f2->hdf5_file_lf);
+
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_vfd_swmr_cmp() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD_vfd_swmr_dedup
+ *
+ * Purpose:     Decide whether `_self` and `_other` refer to the same file.
+ *
+ * Return:      `_self`  -- duplicate (same lower file, compatible config)
+ *              `_other` -- distinct file
+ *              NULL     -- conflict (same file, incompatible VFD SWMR
+ *                          configuration or exclusive-ownership clash)
+ *
+ *-------------------------------------------------------------------------
+ */
+static H5FD_t *
+H5FD_vfd_swmr_dedup(H5FD_t *_self, H5FD_t *_other, hid_t fapl)
+{
+    H5FD_vfd_swmr_t *self = (H5FD_vfd_swmr_t *)_self;
+
+    assert(_self->driver_id == H5FD_VFD_SWMR_g);
+
+    if (_self->cls == _other->cls) {
+        H5FD_vfd_swmr_t *other = (H5FD_vfd_swmr_t *)_other;
+        H5P_genplist_t *plist;
+        H5F_vfd_swmr_config_t *config;
+        bool equal_configs;
+
+        /* Different lower files: not the same file */
+        if (H5FD_cmp(self->hdf5_file_lf, other->hdf5_file_lf) != 0)
+            return _other;
+
+        /* If fapl == _ANY_VFD, then the match between lower files is
+         * sufficient.
+         */
+        if (fapl == H5P_FILE_ACCESS_ANY_VFD)
+            return _self;
+
+        /* If fapl != _ANY_VFD, then we have either a duplicate or
+         * a conflict.  If the VFD SWMR parameters match, then
+         * return `self` to indicate a duplicate.  Otherwise, return
+         * NULL to indicate a mismatch.
+         */
+        if (NULL == (plist = H5I_object(fapl))) {
+            HERROR(H5E_ARGS, H5E_BADTYPE, "could not get fapl");
+            return NULL;
+        }
+
+        if ((config = malloc(sizeof(*config))) == NULL) {
+            HERROR(H5E_ARGS, H5E_BADTYPE, "could not allocate config");
+            return NULL;
+        }
+        if (H5P_get(plist, H5F_ACS_VFD_SWMR_CONFIG_NAME, config) < 0) {
+            HERROR(H5E_PLIST, H5E_CANTGET, "cannot get VFD SWMR config");
+            /* BUGFIX: free `config` on this error path -- it was leaked */
+            free(config);
+            return NULL;
+        }
+
+        equal_configs = memcmp(&self->config, config, sizeof(*config)) == 0;
+
+        free(config);
+
+        if (equal_configs)
+            return _self;
+
+        HERROR(H5E_PLIST, H5E_CANTGET, "inconsistent VFD SWMR config");
+        return NULL;
+    } else if (H5FD_cmp(self->hdf5_file_lf, _other) == 0) {
+        /* `_other` is our lower file itself: only acceptable when the
+         * caller will take any VFD.
+         */
+        return (fapl == H5P_FILE_ACCESS_ANY_VFD) ? _self : NULL;
+    } else {
+        return _other;
+    }
+}
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_query
+ *
+ * Purpose: Set the flags that this VFL driver is capable of supporting.
+ * (listed in H5FDpublic.h)
+ *
+ * Return: SUCCEED (Can't fail)
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_vfd_swmr_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out */)
+{
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+    /* Set the VFL feature flags that this driver supports; a NULL `flags`
+     * pointer is tolerated (nothing to report into).
+     */
+    if(flags) {
+        *flags = 0;
+        *flags |= H5FD_FEAT_AGGREGATE_METADATA;     /* OK to aggregate */
+                                                    /* metadata allocations */
+
+        *flags |= H5FD_FEAT_ACCUMULATE_METADATA;    /* OK to accumulate */
+                                                    /* metadata for faster */
+                                                    /* writes */
+
+        *flags |= H5FD_FEAT_DATA_SIEVE;             /* OK to perform data */
+                                                    /* sieving for faster */
+                                                    /* raw data reads & */
+                                                    /* writes */
+
+        *flags |= H5FD_FEAT_AGGREGATE_SMALLDATA;    /* OK to aggregate */
+                                                    /* "small" raw data */
+                                                    /* allocations */
+
+        *flags |= H5FD_FEAT_POSIX_COMPAT_HANDLE;    /* get_handle callback */
+                                                    /* returns a POSIX file */
+                                                    /* descriptor */
+
+        *flags |= H5FD_FEAT_SUPPORTS_SWMR_IO;       /* VFD supports the */
+                                                    /* single-writer/ */
+                                                    /* multiple-readers */
+                                                    /* (SWMR) pattern */
+
+        *flags |= H5FD_FEAT_DEFAULT_VFD_COMPATIBLE; /* VFD creates a file */
+                                                    /* which can be opened */
+                                                    /* with the default VFD */
+
+    } /* end if */
+
+    FUNC_LEAVE_NOAPI(SUCCEED)
+} /* end H5FD_vfd_swmr_query() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_get_eoa
+ *
+ * Purpose: Gets the end-of-address marker for the file for the
+ * underlying HDF5 file. The EOA marker is the first address
+ * past the last byte allocated in the format address space.
+ *
+ * Return: The end-of-address marker.
+ *
+ *-------------------------------------------------------------------------
+ */
+static haddr_t
+H5FD_vfd_swmr_get_eoa(const H5FD_t *_file, H5FD_mem_t type)
+{
+    const H5FD_vfd_swmr_t *file = (const H5FD_vfd_swmr_t *)_file;
+    haddr_t ret_value = HADDR_UNDEF;
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    /* Delegate to the lower VFD; the EOA of the VFD SWMR file is the EOA
+     * of the underlying HDF5 file.
+     */
+    if((ret_value = H5FD_get_eoa(file->hdf5_file_lf, type)) == HADDR_UNDEF)
+
+        HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, HADDR_UNDEF, \
+            "unable to get HDF5 file eoa")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_vfd_swmr_get_eoa() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_set_eoa
+ *
+ * Purpose: Set the end-of-address marker for the underlying HDF5 file.
+ * This function is called shortly after an existing HDF5 file
+ * is opened in order to tell the driver where the end of the
+ * HDF5 data is located.
+ *
+ * Return: SUCCEED (Can't fail)
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_vfd_swmr_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr)
+{
+    H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file;
+    herr_t ret_value = SUCCEED;
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    /* Delegate to the lower VFD: the EOA lives in the underlying HDF5 file */
+    if(H5FD_set_eoa(file->hdf5_file_lf, type, addr) < 0)
+
+        HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, FAIL, "unable to set HDF5 file eoa")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_vfd_swmr_set_eoa() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_get_eof
+ *
+ * Purpose: Returns the end-of-file marker, which is the greater of
+ * either the filesystem end-of-file or the HDF5 end-of-address
+ * markers for the underlying HDF5 file
+ *
+ * Return: End of file address, the first address past the end of the
+ * "file", either the filesystem file or the HDF5 file.
+ *
+ *-------------------------------------------------------------------------
+ */
+static haddr_t
+H5FD_vfd_swmr_get_eof(const H5FD_t *_file, H5FD_mem_t type)
+{
+    const H5FD_vfd_swmr_t *file = (const H5FD_vfd_swmr_t *)_file;
+    haddr_t ret_value = HADDR_UNDEF;
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    /* Delegate to the lower VFD.
+     * LATER: need to determine the metadata file or underlying HDF5 file ?
+     */
+    if((ret_value = H5FD_get_eof(file->hdf5_file_lf, type)) == HADDR_UNDEF)
+
+        /* BUGFIX: the message previously said "unable to set file eoa",
+         * which described the wrong operation for a get-EOF failure.
+         */
+        HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, HADDR_UNDEF, \
+            "unable to get HDF5 file eof")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_vfd_swmr_get_eof() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_get_handle
+ *
+ * Purpose: Returns the file handle for the underling HDF5 file
+ *
+ * Returns: SUCCEED/FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_vfd_swmr_get_handle(H5FD_t *_file, hid_t fapl, void **file_handle)
+{
+    H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file;
+    herr_t ret_value = SUCCEED;
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    /* Sanity check: caller must supply somewhere to put the handle */
+    if(!file_handle)
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file handle not valid")
+
+    /* LATER?  H5P_get(plist, H5F_ACS_SWMR_FILE_NAME, &type) */
+
+    /* Return the handle of the underlying HDF5 file, not the metadata file */
+    if((ret_value = H5FD_get_vfd_handle(file->hdf5_file_lf,
+        fapl, file_handle)) < 0)
+
+        HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, \
+            "unable to get handle for HDF5 file")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_vfd_swmr_get_handle() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_read
+ *
+ * Purpose: If the target page or multi-page metadata entry is
+ * defined in the current metadata file index, satisfy
+ * the read from the metadata file. Otherwise, pass the
+ * read through to the underlying VFD.
+ *
+ * Under normal operating conditions, the size of the
+ * read must always match the size supplied in the
+ * metadata file index. However, until we modify the
+ * file open process for VFD SWMR readers to create the
+ * page buffer before any reads, we must allow non
+ * full page / non full multi-page metadata entry reads
+ * until the page buffer is created.
+ *
+ * This is tracked by the pb_configured flag in
+ * H5FD_vfd_swmr_t. If this field is FALSE, the function
+ * must allow reads smaller than the size listed in the
+ * index, and possibly starting anywhere in the page.
+ * Note, however, that these reads must not cross page
+ * boundaries.
+ *
+ * Once we modify the file open code to start up the
+ * page buffer before we attempt any reads, this exception
+ * will no longer be necessary, and should be removed.
+ *
+ * JRM -- 1/29/19
+ *
+ * Return: Success: SUCCEED. Result is stored in caller-supplied
+ * buffer BUF.
+ * Failure: FAIL, Contents of buffer BUF are undefined.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_vfd_swmr_read(H5FD_t *_file, H5FD_mem_t type,
+    hid_t H5_ATTR_UNUSED dxpl_id,
+    const haddr_t addr, size_t size, void * const buf /*out*/)
+{
+    const size_t init_size = size;  /* original request size; `size` is
+                                     * consumed by the read loop below */
+    haddr_t target_page;
+    haddr_t page_offset;
+    H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file;
+    H5FD_vfd_swmr_idx_entry_t *index, *entry;
+    uint32_t num_entries = 0;
+    uint32_t fs_page_size;
+    herr_t ret_value = SUCCEED;
+    char *p = buf;
+
+    /* Writer: pass straight through to the lower VFD.
+     * NOTE(review): this returns before FUNC_ENTER_NOAPI_NOINIT, so the
+     * usual enter/leave pairing is skipped on this path -- confirm that is
+     * intended.
+     */
+    if (file->writer)
+        return H5FD_read(file->hdf5_file_lf, type, addr, size, buf);
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    HDassert(file && file->pub.cls);
+    HDassert(buf);
+
+    index = file->md_index.entries;
+    num_entries = file->md_index.num_entries;
+    fs_page_size = file->md_header.fs_page_size;
+
+    /* Try finding the addr from the index */
+    target_page = addr / fs_page_size;
+
+    entry = vfd_swmr_pageno_to_mdf_idx_entry(index, num_entries, target_page,
+        false);
+
+    hlog_fast(swmr_read, "%s: enter type %d addr %" PRIuHADDR " size %zu "
+        "file %s", __func__, type, addr, size,
+        (entry == NULL) ? "lower" : "shadow");
+
+    if (entry == NULL) {
+        /* Cannot find addr in index, read from the underlying hdf5 file */
+        if(H5FD_read(file->hdf5_file_lf, type, addr, size, buf) < 0) {
+            HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, \
+                "file read request failed");
+        }
+        HGOTO_DONE(SUCCEED);
+    }
+
+    /* Found in index, read from the metadata file */
+    HDassert(addr >= target_page * fs_page_size);
+
+    page_offset = addr - (target_page * fs_page_size);
+
+    /* Reads must start on a page boundary, except before the page buffer
+     * is configured -- then a partial read is allowed as long as it does
+     * not cross a page boundary (see the header comment for this routine).
+     */
+    HDassert( ( page_offset == 0 ) ||
+              ( ( ! file->pb_configured ) &&
+                ( page_offset + size <= fs_page_size ) ) );
+
+    HDassert(entry->hdf5_page_offset * fs_page_size <= addr);
+    HDassert(addr < (entry->hdf5_page_offset + 1) * fs_page_size);
+    HDassert(page_offset + init_size <= entry->length);
+
+    /* Seek to the entry's location in the shadow (metadata) file */
+    if(HDlseek(file->md_fd, (HDoff_t)
+               ((entry->md_file_page_offset * fs_page_size)
+                + page_offset), SEEK_SET) < 0)
+        HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, "unable to seek in metadata file")
+
+    /* Coding borrowed from sec2 read: loop until the full request is
+     * satisfied, retrying on EINTR.
+     */
+    while(size > 0) {
+
+        h5_posix_io_t     bytes_in;    /* # of bytes to read */
+        h5_posix_io_ret_t bytes_read;  /* # of bytes actually read */
+
+        /* Trying to read more bytes than the return type can handle is
+         * undefined behavior in POSIX.
+         */
+        if(size > H5_POSIX_MAX_IO_BYTES)
+            bytes_in = MIN(H5_POSIX_MAX_IO_BYTES, size);
+        else
+            bytes_in = (h5_posix_io_t)size;
+
+        do {
+            bytes_read = HDread(file->md_fd, p, bytes_in);
+        } while (-1 == bytes_read && EINTR == errno);
+
+        if(-1 == bytes_read)
+            HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "error reading the page/multi-page entry from the md file")
+
+        HDassert(0 <= bytes_read && (size_t)bytes_read <= size);
+
+        size -= (size_t)bytes_read;
+        p += bytes_read;
+    }
+
+    /* Verify stored and computed checksums are equal.
+     *
+     * Ignore the checksum if the buffer (buf, size) is not large enough
+     * to hold the entire shadow image.  Assume that the caller will
+     * read the entry fully, later.
+     *
+     * Ignore checksum if the page buffer is not configured---this
+     * is John's hack to allow the library to find the superblock
+     * signature.
+     */
+    if (!file->pb_configured) {
+        hlog_fast(swmr_read_exception,
+            "%s: skipping checksum, page buffer not configured", __func__);
+    } else if (entry->length != init_size) {
+        hlog_fast(swmr_read_exception,
+            "%s: skipping checksum, buffer size != entry size", __func__);
+    } else if (H5_checksum_metadata(buf, entry->length, 0) != entry->chksum) {
+        H5FD_vfd_swmr_md_header tmp_header;
+
+        /* Checksum mismatch: log diagnostics, then re-read the shadow-file
+         * header to report how far the writer has advanced since our copy.
+         */
+        hlog_fast(swmr_read_err, "%s: bad checksum", __func__);
+        hlog_fast(swmr_read_err, "addr %" PRIuHADDR " page %" PRIuHADDR
+            " len %zu type %d ...", addr, addr / fs_page_size, init_size, type);
+        hlog_fast(swmr_read_err, "... index[%" PRId64 "] lower pgno %" PRIu64
+            " shadow pgno %" PRIu64 " len %" PRIu32 " sum %" PRIx32,
+            (int64_t)(entry - index), entry->hdf5_page_offset,
+            entry->md_file_page_offset, entry->length, entry->chksum);
+
+        if (H5FD__vfd_swmr_header_deserialize(file, &tmp_header) != TRUE) {
+            HGOTO_ERROR(H5E_VFL, H5E_CANTLOAD, FAIL,
+                "checksum error in shadow file entry; could not load header");
+        }
+
+        hlog_fast(swmr_read_err, "... header tick last read %" PRIu64
+            " latest %" PRIu64, file->md_header.tick_num, tmp_header.tick_num);
+
+        HGOTO_ERROR(H5E_VFL, H5E_CANTLOAD, FAIL,
+            "checksum error in shadow file entry");
+    }
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_vfd_swmr_read() */
+
+
+/*
+ * Function: H5FD_vfd_swmr_write
+ *
+ * Purpose: Writes SIZE bytes of data to FILE beginning at address ADDR
+ * from buffer BUF according to data transfer properties in
+ * DXPL_ID.
+ *
+ * Return: SUCCEED/FAIL
+ */
+static herr_t
+H5FD_vfd_swmr_write(H5FD_t *_file, H5FD_mem_t type,
+    hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size, const void *buf)
+{
+    H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file;
+
+    /* Only the writer may write; a reader reaching here is a logic error */
+    HDassert(file->writer);
+
+    /* Pass the write straight through to the lower VFD */
+    return H5FD_write(file->hdf5_file_lf, type, addr, size, buf);
+}
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_truncate
+ *
+ * Purpose: Makes sure that the true file size is the same (or larger)
+ * than the end-of-address for the underlying HDF5 file
+ *
+ * Return: SUCCEED/FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_vfd_swmr_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id,
+    hbool_t closing)
+{
+    H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; /* VFD SWMR file struct */
+
+    /* A VFD SWMR reader opens the file R/O, so a reader should never
+     * truncate; for a reader the assertion below is equivalent to
+     * "assert FALSE".
+     *
+     * NOTE(review): for readers this should arguably return an error
+     * rather than assert -- confirm intended behavior.
+     */
+    HDassert(file->writer);
+
+    return H5FD_truncate(file->hdf5_file_lf, closing);
+}
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_lock
+ *
+ * Purpose: To place an advisory lock on the underlying HDF5 file.
+ *
+ * Return: SUCCEED/FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_vfd_swmr_lock(H5FD_t *_file, hbool_t rw)
+{
+    H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; /* VFD SWMR file struct */
+    herr_t ret_value = SUCCEED;                       /* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    HDassert(file);
+
+    /* The advisory lock is placed on the underlying HDF5 file, not the
+     * metadata file.
+     */
+    if(H5FD_lock(file->hdf5_file_lf, rw) < 0)
+
+        HGOTO_ERROR(H5E_IO, H5E_CANTLOCK, FAIL, \
+            "unable to lock the HDF5 file")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_vfd_swmr_lock() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_unlock
+ *
+ * Purpose: To remove the existing lock on the underlying HDF5 file
+ *
+ * Return: SUCCEED/FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_vfd_swmr_unlock(H5FD_t *_file)
+{
+    H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; /* VFD SWMR file struct */
+    herr_t ret_value = SUCCEED;                       /* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    HDassert(file);
+
+    /* Release the advisory lock held on the underlying HDF5 file */
+    if(H5FD_unlock(file->hdf5_file_lf) < 0)
+
+        HGOTO_ERROR(H5E_IO, H5E_CANTUNLOCK, FAIL, \
+            "unable to unlock the HDF5 file")
+
+done:
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5FD_vfd_swmr_unlock() */
+
+
+/*
+ * Function: H5FD__vfd_swmr_load_hdr_and_idx()
+ *
+ * Purpose: Load and decode the header and index in the metadata file
+ *
+ * In H5FD__vfd_swmr_load_hdr_and_idx(), we follow this protocol for reading
+ * the shadow file:
+ *
+ * 0 If the maximum number of retries have been attempted, then exit
+ * with an error.
+ *
+ * 1 Try to read the shadow file *header*. If successful, continue to 2.
+ *
+ * If there is a hard failure, then return an error. If there is a failure
+ * that may be transient, then sleep and retry at 0.
+ *
+ * 2 If the tick number in the header is less than the tick last read by the
+ * VFD, then return an error.
+ *
+ * 3 If the tick number in the header is equal to the last tick read by the
+ * VFD, then exit without doing anything.
+ *
+ * 4 Try to read the shadow file *index*. If successful, continue to 5.
+ *
+ * If there is a hard failure, then return an error. If there is a failure
+ * that may be transient, then sleep and retry at 0.
+ *
+ * 5 If a different tick number was read from the index than from the header,
+ * then continue at 0.
+ *
+ * 6 Try to *re-read* the shadow file *header*. If successful, continue to 7.
+ *
+ * If there is a hard failure, then return an error. If there is a failure
+ * that may be transient, then sleep and retry at 0.
+ *
+ * 7 Compare the header that was read previously with the new header. If
+ * the new header is different than the old, then we may not have read
+ * the index at the right shadow-file offset, or the index may have been
+ * read in an inconsistent state, so sleep and retry at 0. Otherwise,
+ * return success.
+ *
+ * Return: Success: SUCCEED
+ * Failure: FAIL
+ *
+ */
+static herr_t
+H5FD__vfd_swmr_load_hdr_and_idx(H5FD_vfd_swmr_t *file, hbool_t open)
+{
+    bool do_try;
+    h5_retry_t retry;
+    H5FD_vfd_swmr_md_header md_header;      /* Metadata file header, take 1 */
+    H5FD_vfd_swmr_md_header md_header_two;  /* Metadata file header, take 2 */
+    H5FD_vfd_swmr_md_index md_index;        /* Metadata file index */
+    herr_t ret_value = SUCCEED;             /* Return value */
+    htri_t rc;
+    /* NOTE(review): function-local static state makes this routine
+     * non-reentrant and effectively assumes a single VFD SWMR shadow file
+     * per process; it is only used for change logging -- confirm that is
+     * acceptable.
+     */
+    static uint64_t last_index_offset = 0;
+
+    FUNC_ENTER_STATIC
+
+    for (do_try = h5_retry_init(&retry, H5FD_VFD_SWMR_MD_LOAD_RETRY_MAX,
+                 H5_RETRY_ONE_SECOND / 10, H5_RETRY_ONE_SECOND);
+         do_try;
+         do_try = h5_retry_next(&retry)) {
+
+        /* Load and decode the header. Go around again on a temporary
+         * failure (FALSE). Bail on an irrecoverable failure (FAIL).
+         */
+        rc = H5FD__vfd_swmr_header_deserialize(file, &md_header);
+
+        /* Temporary failure, try again. */
+        if (rc == FALSE)
+            continue;
+
+        if (rc != TRUE)
+            HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "could not read header");
+
+        /* Log only when the index has moved since the last load */
+        if (md_header.index_offset != last_index_offset) {
+            hlog_fast(index_motion, "index offset changed %" PRIu64 "\n",
+                md_header.index_offset);
+            last_index_offset = md_header.index_offset;
+        }
+
+        if (open)
+            ; // ignore tick number on open
+        else if (md_header.tick_num == file->md_header.tick_num) {
+            /* If the tick number in the header hasn't increased since last
+             * time, then there is not a complete new index to read, so
+             * get out.
+             */
+            HGOTO_DONE(SUCCEED);
+        } else if (md_header.tick_num < file->md_header.tick_num) {
+            /* The tick number must not move backward. */
+            HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL,
+                "tick number in header moved backwards");
+        }
+
+        HDassert(md_header.tick_num > file->md_header.tick_num || open);
+
+        /* Load and decode the index. Go around again on a temporary
+         * failure (FALSE). Bail on an irrecoverable failure (FAIL).
+         */
+        rc = H5FD__vfd_swmr_index_deserialize(file, &md_index, &md_header);
+
+        if (rc == FALSE)
+            continue;
+
+        if (rc != TRUE)
+            HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "could not read index");
+
+        /* If the tick_num is the same in both header and index,
+         * and the header reads the same the second time as the first time,
+         * then we should have a consistent picture of the index.
+         */
+        if (md_header.tick_num == md_index.tick_num &&
+            (rc = H5FD__vfd_swmr_header_deserialize(file,
+                &md_header_two)) == TRUE &&
+            md_header.tick_num == md_header_two.tick_num &&
+            md_header.index_length == md_header_two.index_length &&
+            md_header.index_offset == md_header_two.index_offset)
+            break;
+
+        /* Inconsistent snapshot: discard this index before retrying */
+        if (md_index.entries != NULL) {
+
+            HDassert(md_index.num_entries);
+            md_index.entries = (H5FD_vfd_swmr_idx_entry_t *)
+                H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t,
+                    md_index.entries);
+        }
+
+        if (rc == FAIL) {
+            HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL,
+                "could not re-read header");
+        }
+    }
+
+    /* Exhaust all retries for loading and decoding the md file header
+     * and index
+     */
+    if (!do_try) {
+        HGOTO_ERROR(H5E_VFL, H5E_CANTLOAD, FAIL, \
+            "error in loading/decoding the metadata file header and index")
+    }
+
+    /* Free VFD local entries */
+    if (file->md_index.entries != NULL) {
+
+        HDassert(file->md_index.num_entries);
+
+        file->md_index.entries = (H5FD_vfd_swmr_idx_entry_t *)
+            H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t, file->md_index.entries);
+    }
+
+    /* Copy header and index to VFD */
+    file->md_header = md_header;
+    file->md_index = md_index;
+    /* Ownership of the entries array now belongs to file->md_index; clear
+     * the (dead) local reference so it is not mistaken for a live one.
+     */
+    md_index.entries = NULL;
+
+done:
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5FD__vfd_swmr_load_hdr_and_idx() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD__vfd_swmr_header_deserialize()
+ *
+ * Purpose: To load and decode the header in the metadata file
+ * --Retry to get a file with size at least the size of the header
+ * --Retry on loading the valid magic and checksum for the header
+ * --Decode the header
+ *
+ * Return: Success: TRUE
+ * Retry: FALSE (transient condition -- caller should retry)
+ * Failure: FAIL
+ *
+ * Programmer: Vailin Choi
+ *
+ *-------------------------------------------------------------------------
+ */
+static htri_t
+H5FD__vfd_swmr_header_deserialize(H5FD_vfd_swmr_t *file,
+    H5FD_vfd_swmr_md_header *md_header)
+{
+    uint8_t image[H5FD_MD_HEADER_SIZE]; /* Buffer for element data */
+    uint32_t stored_chksum;             /* Stored metadata checksum */
+    uint32_t computed_chksum;           /* Computed metadata checksum */
+    uint8_t *p;
+    htri_t ret_value = FAIL;
+    uint64_t index_length;
+    ssize_t nread;
+
+    FUNC_ENTER_STATIC
+
+    /* Set file pointer to the beginning the file */
+    if (lseek(file->md_fd, H5FD_MD_HEADER_OFF, SEEK_SET) < 0) {
+        HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, \
+            "unable to seek in metadata file");
+    }
+
+    /* Read the header */
+    nread = read(file->md_fd, image, H5FD_MD_HEADER_SIZE);
+
+    /* Try again if a signal interrupted the read. */
+    if (nread == -1 && errno == EINTR)
+        HGOTO_DONE(FALSE);
+
+    /* We cannot recover from any other error by trying again,
+     * so bail out.
+     */
+    if (nread == -1) {
+        HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL,
+            "error in reading the shadow header");
+    }
+
+    /* A short read means a full header is not yet visible; treat it as
+     * transient and let the caller retry.
+     */
+    if ((uint64_t)nread < H5FD_MD_HEADER_SIZE)
+        HGOTO_DONE(FALSE);
+
+    /* Verify magic number */
+    if (memcmp(image, H5FD_MD_HEADER_MAGIC, H5_SIZEOF_MAGIC) != 0)
+        HGOTO_DONE(FALSE);
+
+    /* Verify stored and computed checksums are equal */
+    H5F_get_checksums(image, H5FD_MD_HEADER_SIZE, &stored_chksum,
+        &computed_chksum);
+
+    if (stored_chksum != computed_chksum)
+        HGOTO_DONE(FALSE);
+
+    /* Header magic is already valid */
+    p = image + H5_SIZEOF_MAGIC;
+
+    /* Deserialize page size, tick number, index offset, index length */
+    UINT32DECODE(p, md_header->fs_page_size);
+    UINT64DECODE(p, md_header->tick_num);
+    UINT64DECODE(p, md_header->index_offset);
+    /* Guards 32-bit builds; on platforms where SIZE_MAX == UINT64_MAX this
+     * check can never fire.
+     */
+    if ((index_length = uint64_decode(&p)) > SIZE_MAX) {
+        HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL,
+            "index is too large to hold in core");
+    }
+
+    md_header->index_length = (size_t)index_length;
+
+    /* Checksum is already valid */
+    UINT32DECODE(p, stored_chksum);
+
+    /* Sanity check */
+    HDassert((size_t)(p - image) <= H5FD_MD_HEADER_SIZE);
+
+#if 0 /* JRM */
+    HDfprintf(stderr,
+        "---read header ps/tick/idx_off/idx_len = %d / %lld / %lld / %lld\n",
+        md_header->fs_page_size, md_header->tick_num,
+        md_header->index_offset, md_header->index_length);
+#endif /* JRM */
+
+    ret_value = TRUE;
+
+done:
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5FD__vfd_swmr_header_deserialize() */
+
+
+
+/*
+ * Function: H5FD__vfd_swmr_index_deserialize()
+ *
+ * Purpose: Load and decode the index in the metadata file
+ * --Retry to get a file with size at least the size of the
+ * (header+index)
+ * --Retry on loading the valid magic and checksum for the index
+ * --Decode the index
+ * --Decode the index entries if the tick number in the header and
+ * the index match
+ *
+ * On success, ownership of md_index->entries (an H5FL sequence) passes
+ * to the caller.
+ *
+ * Return: Success: TRUE
+ * Failure: FAIL
+ * Retry: FALSE
+ *
+ */
+static htri_t
+H5FD__vfd_swmr_index_deserialize(const H5FD_vfd_swmr_t *file,
+    H5FD_vfd_swmr_md_index *md_index, const H5FD_vfd_swmr_md_header *md_header)
+{
+    uint8_t *image = NULL;      /* Buffer */
+    uint8_t *p = NULL;          /* Pointer to buffer */
+    uint32_t stored_chksum;     /* Stored metadata checksum value */
+    uint32_t computed_chksum;   /* Computed metadata checksum value */
+    unsigned i;                 /* Local index variable */
+    htri_t ret_value = TRUE;
+    ssize_t nread;
+
+    FUNC_ENTER_STATIC
+
+    /* Initialize the caller's index up front so the cleanup code at "done"
+     * never inspects indeterminate fields.  Previously an early failure
+     * (seek/read/alloc) reached "done" with md_index->entries still
+     * uninitialized, and the error path could free a wild pointer.
+     */
+    md_index->entries = NULL;
+    md_index->num_entries = 0;
+
+    /* Allocate buffer for reading index */
+    if (NULL == (image = H5MM_malloc(md_header->index_length))) {
+        HGOTO_ERROR(H5E_VFL, H5E_CANTALLOC, FAIL,
+            "memory allocation failed for index's on disk image buffer");
+    }
+
+    /* We may seek past EOF. That's ok, the read(2) will catch that. */
+    if (lseek(file->md_fd, (HDoff_t)md_header->index_offset, SEEK_SET) < 0){
+        HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL,
+            "unable to seek in metadata file");
+    }
+
+    nread = read(file->md_fd, image, md_header->index_length);
+
+    /* Try again if a signal interrupted the read. */
+    if (nread == -1 && errno == EINTR)
+        HGOTO_DONE(FALSE);
+
+    /* We cannot recover from any other error by trying again,
+     * so bail out.
+     */
+    if (nread == -1) {
+        HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL,
+            "error in reading the header in metadata file");
+    }
+
+    /* Try again if the read was not full.
+     *
+     * XXX XXX XXX
+     * A short read should not be possible under the protocol that
+     * I intend to adopt: the writer will write(2) the new index.
+     * In a second write(2), the header describing that index
+     * will be written. POSIX will guarantee that the former
+     * write is visible before the latter. Under the protocol,
+     * there should always be `index_length` bytes available to
+     * read at `index_offset`. If not, the reader should treat it
+     * like an unrecoverable error instead of retrying.
+     */
+    if ((size_t)nread < md_header->index_length)
+        HGOTO_DONE(FALSE);
+
+    /* If the index magic is incorrect, then assume that is a
+     * temporary error and try again.
+     *
+     * XXX XXX XXX
+     * Under the new protocol, where the index is written in
+     * one write(2), and the header is written in a distinct
+     * second write(2), and the header and index are read in
+     * the reverse order, the index magic usually will be intact.
+     *
+     * It is possible under the new protocol that we read
+     * the header on tick `t`, then an arbitrary delay
+     * occurs (the user taps Control-Z, say), and then we
+     * read the index on tick `t + max_lag + 1` or later.
+     * In the mean time, the index may have moved, and its
+     * storage may have been reused. In that case, we could
+     * read bad magic. It's possible to recover by
+     * re-reading the header.
+     */
+    if (memcmp(image, H5FD_MD_INDEX_MAGIC, H5_SIZEOF_MAGIC) != 0)
+        HGOTO_DONE(FALSE);
+
+    /* Verify stored and computed checksums are equal */
+    H5F_get_checksums(image, md_header->index_length, &stored_chksum,
+        &computed_chksum);
+
+    if (stored_chksum != computed_chksum)
+        HGOTO_DONE(FALSE);
+
+    p = image + H5_SIZEOF_MAGIC;
+
+    /* Deserialize the index info: tick number, number of entries, entries,
+     * checksum
+     */
+    UINT64DECODE(p, md_index->tick_num);
+    UINT32DECODE(p, md_index->num_entries);
+
+    /* Read index entries */
+    if(md_index->num_entries) {
+        /* Allocate memory for index entries */
+        md_index->entries = H5FL_SEQ_CALLOC(H5FD_vfd_swmr_idx_entry_t,
+            md_index->num_entries);
+        if (NULL == md_index->entries) {
+            HGOTO_ERROR(H5E_VFL, H5E_CANTALLOC, FAIL,
+                "memory allocation failed for index entries");
+        }
+
+        /* Decode index entries.
+         * NOTE(review): offsets/lengths are decoded as 32-bit quantities
+         * while the header's index_offset is 64-bit -- confirm against the
+         * on-disk entry layout.
+         */
+        for (i = 0; i < md_index->num_entries; i++) {
+            UINT32DECODE(p, md_index->entries[i].hdf5_page_offset);
+            UINT32DECODE(p, md_index->entries[i].md_file_page_offset);
+            UINT32DECODE(p, md_index->entries[i].length);
+            UINT32DECODE(p, md_index->entries[i].chksum);
+        }
+    }
+    /* else: md_index->entries already initialized to NULL above */
+
+    /* Checksum is already valid */
+    UINT32DECODE(p, stored_chksum);
+
+    /* Sanity check */
+    HDassert((size_t)(p - image) <= md_header->index_length);
+
+#if 0 /* JRM */
+    HDfprintf(stderr,
+        " ---- read index tick/num_entries = %lld / %d \n",
+        md_index->tick_num, md_index->num_entries);
+#endif /* JRM */
+
+
+done:
+    if (image != NULL)
+        image = H5MM_xfree(image);
+
+    /* On a hard failure, release any partially-constructed entry array */
+    if (ret_value == FAIL && md_index->entries != NULL) {
+
+        HDassert(md_index->num_entries != 0);
+
+        md_index->entries =
+            H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t, md_index->entries);
+    }
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5FD__vfd_swmr_index_deserialize() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_get_tick_and_idx()
+ *
+ * Purpose: Retrieve tick_num, num_entries and index from the metadata
+ * file
+ *
+ * --If the parameter "reload_hdr_and_index" is true, load and
+ * decode the header and index via
+ * H5FD__vfd_swmr_load_hdr_and_idx(), which may replace the
+ * VFD's local copies of header and index with the
+ * latest info read.
+ *
+ * --Return tick_num, num_entries and index from the VFD's
+ * local copies.
+ *
+ * If index is non-NULL, *num_entries_ptr is an in/out parameter: on
+ * entry it gives the capacity of index[], on exit the entry count.
+ *
+ * Return: Success: SUCCEED
+ * Failure: FAIL
+ *
+ * Programmer: Vailin Choi
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5FD_vfd_swmr_get_tick_and_idx(H5FD_t *_file, hbool_t reload_hdr_and_index,
+    uint64_t *tick_ptr, uint32_t *num_entries_ptr,
+    H5FD_vfd_swmr_idx_entry_t index[])
+{
+    H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; /* VFD SWMR file struct */
+    herr_t ret_value = SUCCEED;                       /* Return value */
+
+    FUNC_ENTER_NOAPI(FAIL)
+
+    /* A caller-supplied index buffer is useless without its capacity.
+     * (HDassert placed after FUNC_ENTER_NOAPI -- no statement may precede
+     * the function-enter macro, and HDassert matches the rest of the file.)
+     */
+    HDassert(index == NULL || num_entries_ptr != NULL);
+
+    /* Load and decode the header and index as indicated */
+    if (reload_hdr_and_index &&
+        H5FD__vfd_swmr_load_hdr_and_idx(file, FALSE) < 0) {
+        HGOTO_ERROR(H5E_VFL, H5E_CANTLOAD, FAIL,
+            "unable to load/decode md header and index")
+    }
+
+    /* Return tick_num */
+    if(tick_ptr != NULL)
+        *tick_ptr = file->md_header.tick_num;
+
+    if (index != NULL) {
+
+        if (*num_entries_ptr < file->md_index.num_entries) {
+            HGOTO_ERROR(H5E_VFL, H5E_CANTLOAD, FAIL,
+                "not enough space to copy index");
+        }
+
+        HDmemcpy(index, file->md_index.entries,
+            (file->md_index.num_entries *
+             sizeof(file->md_index.entries[0])));
+    }
+
+    if(num_entries_ptr != NULL)
+        *num_entries_ptr = file->md_index.num_entries;
+
+done:
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5FD_vfd_swmr_get_tick_and_idx() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_dump_status
+ *
+ * Purpose: Dump a variety of information about the vfd swmr reader
+ * vfd to stderr for debugging purposes.
+ *
+ * Return: void
+ *
+ *-------------------------------------------------------------------------
+ */
+void
+H5FD_vfd_swmr_dump_status(H5FD_t *_file, uint64_t page)
+{
+    hbool_t in_index = FALSE;
+    int i = 0;
+    uint32_t num_entries;
+    H5FD_vfd_swmr_idx_entry_t *index;
+    H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; /* VFD SWMR file struct */
+
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+    HDassert(file);
+
+    index = file->md_index.entries;
+    num_entries = file->md_index.num_entries;
+
+    /* Linear scan for `page`, stopping at the first match */
+    while ( ( ! in_index ) && ( i < (int)num_entries ) ) {
+
+        if ( index[i].hdf5_page_offset == page ) {
+
+            in_index = TRUE;
+        }
+
+        /* The index is expected to be sorted by ascending HDF5 page offset */
+        HDassert( ( i == 0 ) ||
+                  ( index[i-1].hdf5_page_offset < index[i].hdf5_page_offset ) );
+
+        i++;
+    }
+
+    HDfprintf(stderr,
+        "fd: tick = %" PRIu64 ", index_len = %" PRIu32 ", page %" PRIu64
+        " in index = %s.\n",
+        file->md_index.tick_num, num_entries, page,
+        in_index ? "true" : "false");
+
+    FUNC_LEAVE_NOAPI_VOID
+
+} /* H5FD_vfd_swmr_dump_status() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_set_pb_configured
+ *
+ * Purpose: Mark the page buffer as configured for this VFD.
+ *
+ *          Tells the VFD that the page buffer now exists, so all
+ *          subsequent reads of the metadata file should fetch complete
+ *          pages or multi-page metadata entries.
+ *
+ *          Exists only because file open does not yet configure the
+ *          page buffer before performing I/O when opening a file as a
+ *          VFD SWMR reader; remove once that is fixed.
+ *
+ * Return: void
+ *
+ * Programmer: JRM -- 1/29/19
+ *
+ *-------------------------------------------------------------------------
+ */
+void
+H5FD_vfd_swmr_set_pb_configured(H5FD_t *_file)
+{
+    H5FD_vfd_swmr_t *swmr_file = (H5FD_vfd_swmr_t *)_file;  /* VFD SWMR file struct */
+
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+    HDassert(swmr_file);
+
+    swmr_file->pb_configured = TRUE;
+
+    FUNC_LEAVE_NOAPI_VOID
+} /* end H5FD_vfd_swmr_set_pb_configured() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_vfd_swmr_record_elapsed_ticks
+ *
+ * Purpose: Update the histogram of ticks elapsed inside VFD SWMR API
+ *          calls: increment the bucket for `elapsed`, accumulating all
+ *          out-of-range values in the final bucket.
+ *
+ * Return: void
+ *
+ *-------------------------------------------------------------------------
+ */
+void
+H5FD_vfd_swmr_record_elapsed_ticks(H5FD_t *_file, uint64_t elapsed)
+{
+    H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file;
+
+    HDassert(file);
+
+    /* Clamp to the last valid slot: the previous MIN(elapsed,
+     * api_elapsed_nslots) yielded api_elapsed_nslots itself for large
+     * `elapsed`, indexing one past the end of the histogram.  (Assumes
+     * api_elapsed_ticks holds exactly api_elapsed_nslots buckets --
+     * confirm at the allocation site.)
+     */
+    uint32_t elapsed_idx = MIN(elapsed, file->api_elapsed_nslots - 1);
+
+    file->api_elapsed_ticks[elapsed_idx]++;
+}
diff --git a/src/H5FDvfd_swmr.h b/src/H5FDvfd_swmr.h
new file mode 100644
index 0000000..86e9d0f
--- /dev/null
+++ b/src/H5FDvfd_swmr.h
@@ -0,0 +1,38 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*
+ * The public header file for the VFD SWMR driver.
+ */
+#ifndef H5FDvfd_swmr_H
+#define H5FDvfd_swmr_H
+
+#include "H5api_adpt.h" /* H5_DLL */
+#include "H5public.h"   /* uint64_t */
+#include "H5Ipublic.h"  /* hid_t */
+
+/* The identifier for the VFD SWMR driver */
+#define H5FD_VFD_SWMR (H5FD_vfd_swmr_init())
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Initialize the VFD SWMR driver and return its ID */
+H5_DLL hid_t H5FD_vfd_swmr_init(void);
+/* Configure a file access property list to use the VFD SWMR driver */
+H5_DLL herr_t H5Pset_fapl_vfd_swmr(hid_t fapl_id);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* H5FDvfd_swmr_H */
+
diff --git a/src/H5FDvfd_swmr_instr.c b/src/H5FDvfd_swmr_instr.c
new file mode 100644
index 0000000..865483c
--- /dev/null
+++ b/src/H5FDvfd_swmr_instr.c
@@ -0,0 +1,28 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#include "H5private.h" /* H5_ATTR_UNUSED */
+#include "H5Fpublic.h"
+#include "H5FDvfd_swmr.h"
+
+/* Instrumentation stub: always permit the writer to advance to `tick_num`;
+ * the `wait_for_reader` request is ignored.
+ */
+bool
+vfd_swmr_writer_may_increase_tick_to(uint64_t H5_ATTR_UNUSED tick_num,
+    bool H5_ATTR_UNUSED wait_for_reader)
+{
+    return true;
+}
+
+/* Instrumentation stub: notification that the reader reached `tick_num`;
+ * intentionally a no-op.
+ */
+void
+vfd_swmr_reader_did_increase_tick_to(uint64_t H5_ATTR_UNUSED tick_num)
+{
+}
diff --git a/src/H5FDvfd_swmr_private.h b/src/H5FDvfd_swmr_private.h
new file mode 100644
index 0000000..67a72ec
--- /dev/null
+++ b/src/H5FDvfd_swmr_private.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019 The HDF Group. All rights reserved.
+ *
+ * This file is part of HDF5. The full HDF5 copyright notice, including
+ * terms governing use, modification, and redistribution, is contained in
+ * the COPYING file, which can be found at the root of the source code
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases.
+ * If you do not have access to either file, you may request a copy from
+ * help@hdfgroup.org.
+ */
+
+#ifndef _H5FDvfd_swmr_private_H
+#define _H5FDvfd_swmr_private_H
+
+#include "H5queue.h" /* for TAILQ_* */
+#include "hlog.h" /* for HLOG_OUTLET_DECL() */
+
+/* Forward declaration */
+struct H5F_t;
+struct H5F_shared_t;
+struct H5FD_vfd_swmr_idx_entry_t;
+
+/*
+ * struct eot_queue_entry_t
+ *
+ * This is the structure for an entry on the end-of-tick queue (EOT queue) of files
+ * opened in either VFD SWMR write or VFD SWMR read mode. This queue is maintained
+ * in increasing end of tick time order.
+ * The structure contains all information required to determine whether the end
+ * of tick has arrived for the specified file, and to initiate end of tick processing
+ * if it has.
+ *
+ * The fields of eot_queue_entry_t are discussed below:
+ *
+ * vfd_swmr_file: Pointer to the instance of H5F_file_t containing the shared
+ * fields of the associated file that has been opened in VFD SWMR mode
+ * NOTE: for the time being use H5F_t instead of H5F_file_t
+ *
+ * vfd_swmr_writer: Boolean flag that is set to TRUE if the associated file
+ * has been opened in VFD SWMR writer mode, and FALSE if it has been
+ * opened in VFD SWMR reader mode.
+ *
+ * tick_num: Number of the current tick of the target file.
+ *
+ * end_of_tick: Expiration time of the current tick of the target file.
+ *
+ * link: Forward and backward linkage between the next element and the previous
+ * element (or the queue head). Note that if there is a following entry,
+ * `next`, then `next->end_of_tick` must be greater than or equal to
+ * `end_of_tick`.
+ */
+typedef struct eot_queue_entry {
+    hbool_t vfd_swmr_writer;
+    uint64_t tick_num;
+    struct timespec end_of_tick;
+    struct H5F_t *vfd_swmr_file; /* NOTE: for the time being use H5F_t instead of H5F_file_t */
+    TAILQ_ENTRY(eot_queue_entry) link;
+} eot_queue_entry_t;
+
+/* presumably a count of in-progress VFD SWMR API entry points -- verify at
+ * the definition site
+ */
+extern unsigned int vfd_swmr_api_entries_g;
+
+/* The head of the EOT queue */
+typedef TAILQ_HEAD(eot_queue, eot_queue_entry) eot_queue_t;
+
+extern eot_queue_t eot_queue_g;
+
+/* Logging outlets for the VFD SWMR subsystem */
+HLOG_OUTLET_DECL(swmr);
+HLOG_OUTLET_DECL(pbwr);
+HLOG_OUTLET_DECL(shadow_index_reclaim);
+HLOG_OUTLET_DECL(mdc_invalidation);
+
+/***************************************/
+/* Library-private Function Prototypes */
+/***************************************/
+
+H5_DLL herr_t H5F_vfd_swmr_init(struct H5F_t *f, hbool_t file_create);
+H5_DLL herr_t H5F_vfd_swmr_close_or_flush(struct H5F_t *f, hbool_t closing);
+H5_DLL herr_t H5F_update_vfd_swmr_metadata_file(struct H5F_t *f,
+    uint32_t index_len, struct H5FD_vfd_swmr_idx_entry_t *index);
+H5_DLL herr_t H5F_vfd_swmr_writer__delay_write(struct H5F_shared_t *, uint64_t,
+    uint64_t *);
+H5_DLL herr_t H5F_vfd_swmr_writer__prep_for_flush_or_close(struct H5F_t *f);
+/* NOTE(review): missing the H5_DLL decoration its siblings carry -- confirm */
+herr_t H5F_vfd_swmr_process_eot_queue(bool);
+H5_DLL herr_t H5F_vfd_swmr_writer_end_of_tick(struct H5F_t *f, bool);
+H5_DLL herr_t H5F_vfd_swmr_writer__dump_index(struct H5F_shared_t *);
+H5_DLL herr_t H5F_vfd_swmr_reader_end_of_tick(struct H5F_t *f, bool);
+
+H5_DLL herr_t H5F_vfd_swmr_remove_entry_eot(struct H5F_t *f);
+H5_DLL herr_t H5F_vfd_swmr_insert_entry_eot(struct H5F_t *f);
+H5_DLL void H5F_vfd_swmr_update_entry_eot(eot_queue_entry_t *);
+H5_DLL herr_t H5F_dump_eot_queue(void);
+
+#endif /* _H5FDvfd_swmr_private_H */
diff --git a/src/H5FScache.c b/src/H5FScache.c
index c3e3998..214e6aa 100644
--- a/src/H5FScache.c
+++ b/src/H5FScache.c
@@ -123,6 +123,7 @@ const H5AC_class_t H5AC_FSPACE_HDR[1] = {{
H5FS__cache_hdr_notify, /* 'notify' callback */
H5FS__cache_hdr_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* H5FS section info inherits cache-like properties from H5AC */
@@ -141,6 +142,7 @@ const H5AC_class_t H5AC_FSPACE_SINFO[1] = {{
H5FS__cache_sinfo_notify, /* 'notify' callback */
H5FS__cache_sinfo_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
diff --git a/src/H5FSprivate.h b/src/H5FSprivate.h
index d2e1f90..07152ba 100644
--- a/src/H5FSprivate.h
+++ b/src/H5FSprivate.h
@@ -130,6 +130,7 @@ struct H5FS_section_info_t {
typedef enum H5FS_client_t {
H5FS_CLIENT_FHEAP_ID = 0, /* Free space is used by fractal heap */
H5FS_CLIENT_FILE_ID, /* Free space is used by file */
+ H5FS_CLIENT_MD_VFD_ID, /* Free space is used by the metadata file for VFD SWMR */
H5FS_NUM_CLIENT_ID /* Number of free space client IDs (must be last) */
} H5FS_client_t;
diff --git a/src/H5FSsection.c b/src/H5FSsection.c
index d783901..77e7a89 100644
--- a/src/H5FSsection.c
+++ b/src/H5FSsection.c
@@ -1653,7 +1653,7 @@ H5FS_sect_try_merge(H5F_t *f, H5FS_t *fspace, H5FS_section_info_t *sect,
} /* end if */
else {
/* Check if section is merged */
- if(sect->size > saved_fs_size) {
+ if(sect->size != saved_fs_size) {
if(H5FS__sect_link(fspace, sect, flags) < 0)
HGOTO_ERROR(H5E_FSPACE, H5E_CANTINSERT, FAIL, "can't insert free space section into skip list")
sinfo_modified = TRUE;
diff --git a/src/H5Fint.c b/src/H5Fint.c
index 0bda894..0531223 100644
--- a/src/H5Fint.c
+++ b/src/H5Fint.c
@@ -33,6 +33,7 @@
#include "H5Iprivate.h" /* IDs */
#include "H5Lprivate.h" /* Links */
#include "H5MFprivate.h" /* File memory management */
+#include "H5MVprivate.h" /* File memory management for VFD SWMR */
#include "H5MMprivate.h" /* Memory management */
#include "H5Pprivate.h" /* Property lists */
#include "H5SMprivate.h" /* Shared Object Header Messages */
@@ -86,12 +87,10 @@ static herr_t H5F__build_actual_name(const H5F_t *f, const H5P_genplist_t *fapl,
static herr_t H5F__flush_phase1(H5F_t *f);
static herr_t H5F__flush_phase2(H5F_t *f, hbool_t closing);
-
/*********************/
/* Package Variables */
/*********************/
-
/*****************************/
/* Library Private Variables */
/*****************************/
@@ -233,14 +232,18 @@ H5F_get_access_plist(H5F_t *f, hbool_t app_ref)
efc_size = H5F__efc_max_nfiles(f->shared->efc);
if(H5P_set(new_plist, H5F_ACS_EFC_SIZE_NAME, &efc_size) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, H5I_INVALID_HID, "can't set elink file cache size")
- if(f->shared->page_buf != NULL) {
- if(H5P_set(new_plist, H5F_ACS_PAGE_BUFFER_SIZE_NAME, &(f->shared->page_buf->max_size)) < 0)
+ if(f->shared->pb_ptr != NULL) {
+ if(H5P_set(new_plist, H5F_ACS_PAGE_BUFFER_SIZE_NAME, &(f->shared->pb_ptr->max_size)) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, H5I_INVALID_HID, "can't set page buffer size")
- if(H5P_set(new_plist, H5F_ACS_PAGE_BUFFER_MIN_META_PERC_NAME, &(f->shared->page_buf->min_meta_perc)) < 0)
+ if(H5P_set(new_plist, H5F_ACS_PAGE_BUFFER_MIN_META_PERC_NAME, &(f->shared->pb_ptr->min_meta_perc)) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, H5I_INVALID_HID, "can't set minimum metadata fraction of page buffer")
- if(H5P_set(new_plist, H5F_ACS_PAGE_BUFFER_MIN_RAW_PERC_NAME, &(f->shared->page_buf->min_raw_perc)) < 0)
+ if(H5P_set(new_plist, H5F_ACS_PAGE_BUFFER_MIN_RAW_PERC_NAME, &(f->shared->pb_ptr->min_raw_perc)) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, H5I_INVALID_HID, "can't set minimum raw data fraction of page buffer")
} /* end if */
+
+ if(H5P_set(new_plist, H5F_ACS_VFD_SWMR_CONFIG_NAME, &(f->shared->vfd_swmr_config)) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set initial metadata cache resize config.")
+
#ifdef H5_HAVE_PARALLEL
if(H5P_set(new_plist, H5_COLL_MD_READ_FLAG_NAME, &(f->shared->coll_md_read)) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, H5I_INVALID_HID, "can't set collective metadata read flag")
@@ -1097,6 +1100,25 @@ H5F__new(H5F_shared_t *shared, unsigned flags, hid_t fcpl_id, hid_t fapl_id, H5F
if(H5P_get(plist, H5F_ACS_OBJECT_FLUSH_CB_NAME, &(f->shared->object_flush)) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "can't get object flush cb info")
+ /* Get VFD SWMR configuration */
+ if(H5P_get(plist, H5F_ACS_VFD_SWMR_CONFIG_NAME, &(f->shared->vfd_swmr_config)) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get VFD SWMR config info")
+
+ /* Initialization for VFD SWMR */
+ f->shared->vfd_swmr = FALSE;
+ f->shared->vfd_swmr_writer = FALSE;
+ f->shared->tick_num = 0;
+ f->shared->mdf_idx = NULL;
+ f->shared->mdf_idx_len = 0;
+ f->shared->mdf_idx_entries_used = 0;
+ f->shared->old_mdf_idx = NULL;
+ f->shared->old_mdf_idx_len = 0;
+ f->shared->old_mdf_idx_entries_used = 0;
+
+ f->shared->vfd_swmr_md_fd = -1;
+ f->shared->fs_man_md = NULL;
+ TAILQ_INIT(&f->shared->shadow_defrees);
+
/* Get the VOL connector info */
if(H5F__set_vol_conn(f) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, NULL, "can't cache VOL connector info")
@@ -1329,6 +1351,12 @@ H5F__dest(H5F_t *f, hbool_t flush)
/* Push error, but keep going*/
HDONE_ERROR(H5E_FILE, H5E_CANTRELEASE, FAIL, "problems closing file")
+ /* If this is a VFD SWMR writer, prep for flush or close */
+ if((f->shared->vfd_swmr) && (f->shared->vfd_swmr_writer) &&
+ (H5F_vfd_swmr_writer__prep_for_flush_or_close(f) < 0))
+ /* Push error, but keep going*/
+ HDONE_ERROR(H5E_IO, H5E_CANTFLUSH, FAIL, "vfd swmr prep for flush or close failed")
+
/* Shutdown the page buffer cache */
if(H5PB_dest(f->shared) < 0)
/* Push error, but keep going*/
@@ -1370,6 +1398,17 @@ H5F__dest(H5F_t *f, hbool_t flush)
/* Push error, but keep going*/
HDONE_ERROR(H5E_FILE, H5E_CANTDEC, FAIL, "can't close property list")
+ /* VFD SWMR: closing down */
+ if(H5F_ACC_RDWR & H5F_INTENT(f) && f->shared->vfd_swmr_md_fd >= 0) {
+ if(H5F_vfd_swmr_close_or_flush(f, TRUE) < 0)
+ HDONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "unable to close the metadata file")
+ }
+
+ if(f->shared->vfd_swmr) {
+ if(H5F_vfd_swmr_remove_entry_eot(f) < 0)
+ HDONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "unable to remove entry from EOT queue")
+ }
+
/* Clean up the cached VOL connector ID & info */
if(f->shared->vol_info)
if(H5VL_free_connector_info(f->shared->vol_id, f->shared->vol_info) < 0)
@@ -1386,6 +1425,15 @@ H5F__dest(H5F_t *f, hbool_t flush)
/* Push error, but keep going*/
HDONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "unable to close file")
+ /* A VFD SWMR reader may still have a metadata index at this stage.
+ * If so, free it.
+ */
+ if (f->shared->vfd_swmr && f->shared->mdf_idx != NULL) {
+ HDfree(f->shared->mdf_idx);
+ f->shared->mdf_idx = NULL;
+ f->shared->mdf_idx_len = 0;
+ }
+
/* Free mount table */
f->shared->mtab.child = (H5F_mount_t *)H5MM_xfree(f->shared->mtab.child);
f->shared->mtab.nalloc = 0;
@@ -1408,6 +1456,11 @@ H5F__dest(H5F_t *f, hbool_t flush)
* Only decrement the reference count.
*/
--f->shared->nrefs;
+
+ if(f->shared->vfd_swmr) {
+ if(H5F_vfd_swmr_remove_entry_eot(f) < 0)
+ HDONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "unable to remove entry from EOT queue")
+ }
}
/* Free the non-shared part of the file */
@@ -1518,10 +1571,33 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id)
hbool_t use_file_locking; /*read from env var */
hbool_t ci_load = FALSE; /* whether MDC ci load requested */
hbool_t ci_write = FALSE; /* whether MDC CI write requested */
- H5F_t *ret_value = NULL; /*actual return value */
+ hbool_t file_create = FALSE; /* creating a new file or not */
+ H5F_vfd_swmr_config_t *vfd_swmr_config_ptr = NULL; /* Points to VFD SMWR config info */
+ H5F_t *ret_value = NULL; /*actual return value */
FUNC_ENTER_NOAPI(NULL)
+ /* Get the file access property list, for future queries */
+ if(NULL == (a_plist = (H5P_genplist_t *)H5I_object(fapl_id)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not file access property list")
+
+ /* Allocate space for VFD SWMR configuration info */
+ if(NULL == (vfd_swmr_config_ptr = H5MM_calloc(sizeof(H5F_vfd_swmr_config_t))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "can't allocate memory for mdc log file name")
+
+ /* Get VFD SWMR configuration */
+ if(H5P_get(a_plist, H5F_ACS_VFD_SWMR_CONFIG_NAME, vfd_swmr_config_ptr) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get VFD SWMR config info")
+
+ /* When configured with VFD SWMR */
+ if(vfd_swmr_config_ptr->version) {
+        /* Verify that file access flags are consistent with VFD SWMR configuration */
+ if((flags & H5F_ACC_RDWR) && !vfd_swmr_config_ptr->writer)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "file access is writer but VFD SWMR config is reader")
+ if((flags & H5F_ACC_RDWR) == 0 && vfd_swmr_config_ptr->writer)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "file access is reader but VFD SWMR config is writer")
+ }
+
/*
* If the driver has a `cmp' method then the driver is capable of
* determining when two file handles refer to the same file and the
@@ -1568,11 +1644,19 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id)
HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file: name = '%s', tent_flags = %x", name, tent_flags)
} /* end if */
+ /* Avoid reusing a virtual file opened exclusively by a second virtual
+ * file, or opening the same file twice with different parameters.
+ */
+ if ((lf = H5FD_deduplicate(lf, fapl_id)) == NULL) {
+ HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL,
+ "an already-open file conflicts with '%s'", name);
+ }
+
/* Is the file already open? */
if((shared = H5F__sfile_search(lf)) != NULL) {
/*
- * The file is already open, so use that one instead of the one we
- * just opened. We only one one H5FD_t* per file so one doesn't
+ * The file is already open, so use the corresponding H5F_shared_t.
+ * We only allow one H5FD_t* per file so one doesn't
* confuse the other. But fail if this request was to truncate the
* file (since we can't do that while the file is open), or if the
* request was to create a non-existent file (since the file already
@@ -1580,8 +1664,6 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id)
* readers don't expect the file to change under them), or if the
* SWMR write/read access flags don't agree.
*/
- if(H5FD_close(lf) < 0)
- HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to close low-level file info")
if(flags & H5F_ACC_TRUNC)
HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to truncate a file which is already open")
if(flags & H5F_ACC_EXCL)
@@ -1651,10 +1733,6 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id)
shared = file->shared;
lf = shared->lf;
- /* Get the file access property list, for future queries */
- if(NULL == (a_plist = (H5P_genplist_t *)H5I_object(fapl_id)))
- HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not file access property list")
-
/* Check if page buffering is enabled */
if(H5P_get(a_plist, H5F_ACS_PAGE_BUFFER_SIZE_NAME, &page_buf_size) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "can't get page buffer size")
@@ -1674,6 +1752,7 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "can't get minimum raw data fraction of page buffer")
} /* end if */
+
/*
* Read or write the file superblock, depending on whether the file is
* empty or not.
@@ -1700,6 +1779,9 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id)
*/
if(H5G_mkroot(file, TRUE) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, NULL, "unable to create/open root group")
+
+ file_create = TRUE;
+
} /* end if */
else if(1 == shared->nrefs) {
/* Read the superblock if it hasn't been read before. */
@@ -1714,8 +1796,22 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id)
/* Open the root group */
if(H5G_mkroot(file, FALSE) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to read root group")
+
} /* end if */
+ /* Check if configured for VFD SWMR */
+ if(H5F_VFD_SWMR_CONFIG(file)) {
+ /* Initialization for VFD SWMR writer and reader */
+ if(1 == shared->nrefs) {
+ if(H5F_vfd_swmr_init(file, file_create) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, NULL, "file open fail with initialization for VFD SWMR")
+ }
+
+ /* Insert the entry that corresponds to file onto the EOT queue */
+ if(H5F_vfd_swmr_insert_entry_eot(file) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, NULL, "unable to insert entry into the EOT queue")
+ }
+
/*
* Decide the file close degree. If it's the first time to open the
* file, set the degree to access property list value; if it's the
@@ -1781,7 +1877,7 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id)
} /* version 3 superblock */
file->shared->sblock->status_flags |= H5F_SUPER_WRITE_ACCESS;
- if(H5F_INTENT(file) & H5F_ACC_SWMR_WRITE)
+ if(H5F_INTENT(file) & H5F_ACC_SWMR_WRITE || H5F_USE_VFD_SWMR(file))
file->shared->sblock->status_flags |= H5F_SUPER_SWMR_WRITE_ACCESS;
/* Flush the superblock & superblock extension */
@@ -1793,7 +1889,7 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id)
HGOTO_ERROR(H5E_FILE, H5E_CANTFLUSH, NULL, "unable to flush superblock extension")
/* Remove the file lock for SWMR_WRITE */
- if(use_file_locking && (H5F_INTENT(file) & H5F_ACC_SWMR_WRITE)) {
+ if(use_file_locking && ((H5F_INTENT(file) & H5F_ACC_SWMR_WRITE) || H5F_USE_VFD_SWMR(file))) {
if(H5FD_unlock(file->shared->lf) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to unlock the file")
} /* end if */
@@ -1801,7 +1897,7 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id)
else { /* H5F_ACC_RDONLY: check consistency of status_flags */
/* Skip check of status_flags for file with < superblock version 3 */
if(file->shared->sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_3) {
- if(H5F_INTENT(file) & H5F_ACC_SWMR_READ) {
+ if(H5F_INTENT(file) & H5F_ACC_SWMR_READ || H5F_USE_VFD_SWMR(file)) {
if((file->shared->sblock->status_flags & H5F_SUPER_WRITE_ACCESS &&
!(file->shared->sblock->status_flags & H5F_SUPER_SWMR_WRITE_ACCESS))
||
@@ -1820,9 +1916,17 @@ H5F_open(const char *name, unsigned flags, hid_t fcpl_id, hid_t fapl_id)
ret_value = file;
done:
- if((NULL == ret_value) && file)
+ if((NULL == ret_value) && file) {
+ if(file->shared->root_grp && file->shared->nrefs == 1) {
+ if(H5AC_expunge_tag_type_metadata(file, H5G_oloc(file->shared->root_grp)->addr, H5AC_OHDR_ID, H5AC__NO_FLAGS_SET, FALSE) < 0)
+ HDONE_ERROR(H5E_FILE, H5E_CANTEXPUNGE, NULL, "unable to expunge root group tagged entries")
+ }
+
if(H5F__dest(file, FALSE) < 0)
HDONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, NULL, "problems closing file")
+ }
+ if(vfd_swmr_config_ptr)
+ H5MM_free(vfd_swmr_config_ptr);
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5F_open() */
@@ -1951,6 +2055,12 @@ H5F__flush_phase2(H5F_t *f, hbool_t closing)
/* Push error, but keep going*/
HDONE_ERROR(H5E_IO, H5E_CANTFLUSH, FAIL, "unable to flush metadata accumulator")
+ /* If this is a VFD SWMR writer, prep for flush or close */
+ if((f->shared->vfd_swmr) && (f->shared->vfd_swmr_writer) &&
+ (H5F_vfd_swmr_writer__prep_for_flush_or_close(f) < 0))
+ /* Push error, but keep going*/
+ HDONE_ERROR(H5E_IO, H5E_CANTFLUSH, FAIL, "vfd swmr prep for flush or close failed")
+
/* Flush the page buffer */
if(H5PB_flush(f->shared) < 0)
/* Push error, but keep going*/
@@ -1994,6 +2104,14 @@ H5F__flush(H5F_t *f)
/* Push error, but keep going*/
HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush file data")
+ /* VFD SWMR when flushing the HDF5 file */
+ if(f->shared->nrefs == 1 && f->shared->vfd_swmr_writer && f->shared->vfd_swmr_md_fd >= 0) {
+ HDassert(H5F_ACC_RDWR & H5F_INTENT(f));
+
+ if(H5F_vfd_swmr_close_or_flush(f, FALSE) < 0)
+ HDONE_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "unable to encode and write to the metadata file")
+ }
+
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5F__flush() */
@@ -3469,7 +3587,16 @@ H5F__start_swmr_write(H5F_t *f)
/* Refresh (reopen) the objects (groups & datasets) in the file */
for(u = 0; u < grp_dset_count; u++)
- if(H5O_refresh_metadata_reopen(obj_ids[u], &obj_glocs[u], vol_connector, TRUE) < 0)
+ /* XXX This routine probably should prepare legitimate
+ * H5O_refresh_state_t's, above, and pass those instead of NULL, so
+ * that non-persistent object properties---e.g., dataset access
+ * properties---are copied from old objects to reopened objects.
+ *
+ * Passing NULL here shouldn't introduce any bugs in Legacy SWMR,
+ * however, and it lets VFD SWMR development proceed, so I'm not
+ * going to sweat it, now.
+ */
+ if(H5O_refresh_metadata_reopen(obj_ids[u], &obj_glocs[u], NULL, vol_connector, TRUE) < 0)
HGOTO_ERROR(H5E_ATOM, H5E_CLOSEERROR, FAIL, "can't refresh-close object")
/* Unlock the file */
@@ -3666,3 +3793,142 @@ H5F_set_min_dset_ohdr(H5F_t *f, hbool_t minimize)
FUNC_LEAVE_NOAPI(SUCCEED)
} /* H5F_set_min_dset_ohdr() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5F__vfd_swmr_end_tick()
+ *
+ * Purpose: To trigger end of tick processing
+ *
+ * Return: Non-negative on success/Negative on errors
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F__vfd_swmr_end_tick(H5F_t *f)
+{
+ eot_queue_entry_t *curr;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Sanity check */
+ HDassert(f);
+ HDassert(f->shared);
+
+ /* The file should be opened with VFD SWMR configured.*/
+ if(!(H5F_USE_VFD_SWMR(f)))
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "must have VFD SWMR configured for this public routine")
+
+ /* Search EOT queue */
+ TAILQ_FOREACH(curr, &eot_queue_g, link) {
+ if (curr->vfd_swmr_file == f)
+ break;
+ }
+
+ /* If the file does not exist on the EOT queue, flag an error */
+ if(curr == NULL)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "EOT for the file has been disabled")
+
+ if (f->shared->vfd_swmr_writer) {
+ if (H5F_vfd_swmr_writer_end_of_tick(f, true) < 0)
+ HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, FAIL,
+ "end of tick error for VFD SWMR writer");
+ } else if (H5F_vfd_swmr_reader_end_of_tick(f, true) < 0) {
+ HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, FAIL,
+ "end of tick error for VFD SWMR reader");
+ }
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5F__vfd_swmr_end_tick() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5F__vfd_swmr_disable_end_of_tick()
+ *
+ * Purpose: To disable end of tick processing
+ *
+ * Return: Non-negative on success/Negative on errors
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F__vfd_swmr_disable_end_of_tick(H5F_t *f)
+{
+ eot_queue_entry_t *curr;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Sanity check */
+ HDassert(f);
+ HDassert(f->shared);
+
+ /* The file should be opened with VFD SWMR configured.*/
+ if(!(H5F_USE_VFD_SWMR(f)))
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "must have VFD SWMR configured for this public routine")
+
+ /* Search EOT queue */
+ TAILQ_FOREACH(curr, &eot_queue_g, link) {
+ if (curr->vfd_swmr_file == f)
+ break;
+ }
+
+ /* If the file does not exist on the EOT queue, flag an error */
+ if(curr == NULL)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "EOT for the file has already been disabled")
+
+ /* Remove the entry that corresponds to "f" from the EOT queue */
+ if(H5F_vfd_swmr_remove_entry_eot(f) < 0)
+ HDONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "unable to remove entry from EOT queue")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5F__vfd_swmr_disable_end_of_tick() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5F__vfd_swmr_enable_end_of_tick()
+ *
+ * Purpose: To enable end of tick processing
+ *
+ * Return: Non-negative on success/Negative on errors
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F__vfd_swmr_enable_end_of_tick(H5F_t *f)
+{
+ eot_queue_entry_t *curr;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Sanity check */
+ HDassert(f);
+ HDassert(f->shared);
+
+ /* The file should be opened with VFD SWMR configured.*/
+ if(!(H5F_USE_VFD_SWMR(f)))
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "must have VFD SWMR configured for this public routine")
+
+ /* Search EOT queue */
+ TAILQ_FOREACH(curr, &eot_queue_g, link) {
+ if (curr->vfd_swmr_file == f)
+ break;
+ }
+
+ /* If the file already exists on the EOT queue, flag an error */
+ if(curr != NULL)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "EOT for the file has already been enabled")
+
+ /* Insert the entry that corresponds to "f" onto the EOT queue */
+ if(H5F_vfd_swmr_insert_entry_eot(f) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "unable to insert entry into the EOT queue")
+
+ /* Check if the tick has expired, if so call end of tick processing */
+ if(H5F_vfd_swmr_process_eot_queue(true) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "error processing EOT queue")
+
+ /* FUNC_LEAVE_API could do the check, but not so for reader_end_of_tick() */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5F__vfd_swmr_enable_end_of_tick() */
diff --git a/src/H5Fio.c b/src/H5Fio.c
index 34dd0d6..592d5f1 100644
--- a/src/H5Fio.c
+++ b/src/H5Fio.c
@@ -84,17 +84,11 @@
* address for the file.
*
* Return: Non-negative on success/Negative on failure
- *
- * Programmer: Robb Matzke
- * matzke@llnl.gov
- * Jul 10 1997
- *
*-------------------------------------------------------------------------
*/
herr_t
H5F_shared_block_read(H5F_shared_t *f_sh, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*out*/)
{
- H5FD_mem_t map_type; /* Mapped memory type */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(FAIL)
@@ -108,11 +102,8 @@ H5F_shared_block_read(H5F_shared_t *f_sh, H5FD_mem_t type, haddr_t addr, size_t
if(H5F_addr_le(f_sh->tmp_addr, (addr + size)))
HGOTO_ERROR(H5E_IO, H5E_BADRANGE, FAIL, "attempting I/O in temporary file space")
- /* Treat global heap as raw data */
- map_type = (type == H5FD_MEM_GHEAP) ? H5FD_MEM_DRAW : type;
-
/* Pass through page buffer layer */
- if(H5PB_read(f_sh, map_type, addr, size, buf) < 0)
+ if(H5PB_read(f_sh, type, addr, size, buf) < 0)
HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "read through page buffer failed")
done:
@@ -128,40 +119,12 @@ done:
* address for the file.
*
* Return: Non-negative on success/Negative on failure
- *
- * Programmer: Robb Matzke
- * matzke@llnl.gov
- * Jul 10 1997
- *
*-------------------------------------------------------------------------
*/
herr_t
H5F_block_read(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*out*/)
{
- H5FD_mem_t map_type; /* Mapped memory type */
- herr_t ret_value = SUCCEED; /* Return value */
-
- FUNC_ENTER_NOAPI(FAIL)
-
- /* Sanity checks */
- HDassert(f);
- HDassert(f->shared);
- HDassert(buf);
- HDassert(H5F_addr_defined(addr));
-
- /* Check for attempting I/O on 'temporary' file address */
- if(H5F_addr_le(f->shared->tmp_addr, (addr + size)))
- HGOTO_ERROR(H5E_IO, H5E_BADRANGE, FAIL, "attempting I/O in temporary file space")
-
- /* Treat global heap as raw data */
- map_type = (type == H5FD_MEM_GHEAP) ? H5FD_MEM_DRAW : type;
-
- /* Pass through page buffer layer */
- if(H5PB_read(f->shared, map_type, addr, size, buf) < 0)
- HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "read through page buffer failed")
-
-done:
- FUNC_LEAVE_NOAPI(ret_value)
+ return H5F_shared_block_read(f->shared, type, addr, size, buf);
} /* end H5F_block_read() */
@@ -173,17 +136,11 @@ done:
* address.
*
* Return: Non-negative on success/Negative on failure
- *
- * Programmer: Robb Matzke
- * matzke@llnl.gov
- * Jul 10 1997
- *
*-------------------------------------------------------------------------
*/
herr_t
H5F_shared_block_write(H5F_shared_t *f_sh, H5FD_mem_t type, haddr_t addr, size_t size, const void *buf)
{
- H5FD_mem_t map_type; /* Mapped memory type */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(FAIL)
@@ -198,11 +155,8 @@ H5F_shared_block_write(H5F_shared_t *f_sh, H5FD_mem_t type, haddr_t addr, size_t
if(H5F_addr_le(f_sh->tmp_addr, (addr + size)))
HGOTO_ERROR(H5E_IO, H5E_BADRANGE, FAIL, "attempting I/O in temporary file space")
- /* Treat global heap as raw data */
- map_type = (type == H5FD_MEM_GHEAP) ? H5FD_MEM_DRAW : type;
-
/* Pass through page buffer layer */
- if(H5PB_write(f_sh, map_type, addr, size, buf) < 0)
+ if(H5PB_write(f_sh, type, addr, size, buf) < 0)
HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "write through page buffer failed")
done:
@@ -218,41 +172,12 @@ done:
* address.
*
* Return: Non-negative on success/Negative on failure
- *
- * Programmer: Robb Matzke
- * matzke@llnl.gov
- * Jul 10 1997
- *
*-------------------------------------------------------------------------
*/
herr_t
H5F_block_write(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, const void *buf)
{
- H5FD_mem_t map_type; /* Mapped memory type */
- herr_t ret_value = SUCCEED; /* Return value */
-
- FUNC_ENTER_NOAPI(FAIL)
-
- /* Sanity checks */
- HDassert(f);
- HDassert(f->shared);
- HDassert(H5F_INTENT(f) & H5F_ACC_RDWR);
- HDassert(buf);
- HDassert(H5F_addr_defined(addr));
-
- /* Check for attempting I/O on 'temporary' file address */
- if(H5F_addr_le(f->shared->tmp_addr, (addr + size)))
- HGOTO_ERROR(H5E_IO, H5E_BADRANGE, FAIL, "attempting I/O in temporary file space")
-
- /* Treat global heap as raw data */
- map_type = (type == H5FD_MEM_GHEAP) ? H5FD_MEM_DRAW : type;
-
- /* Pass through page buffer layer */
- if(H5PB_write(f->shared, map_type, addr, size, buf) < 0)
- HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "write through page buffer failed")
-
-done:
- FUNC_LEAVE_NOAPI(ret_value)
+ return H5F_shared_block_write(f->shared, type, addr, size, buf);
} /* end H5F_block_write() */
diff --git a/src/H5Fpkg.h b/src/H5Fpkg.h
index 46185ff..1136cb4 100644
--- a/src/H5Fpkg.h
+++ b/src/H5Fpkg.h
@@ -26,12 +26,16 @@
#ifndef _H5Fpkg_H
#define _H5Fpkg_H
+/* BSD queue macros */
+#include "H5queue.h"
+
/* Get package's private header */
#include "H5Fprivate.h"
/* Other private headers needed by this file */
#include "H5private.h" /* Generic Functions */
#include "H5ACprivate.h" /* Metadata cache */
+#include "H5FDprivate.h" /* VFD -- for VFD SWMR */
#include "H5Bprivate.h" /* B-trees */
#include "H5FLprivate.h" /* Free Lists */
#include "H5FOprivate.h" /* File objects */
@@ -213,6 +217,26 @@ typedef struct H5F_mtab_t {
H5F_mount_t *child; /* An array of mount records */
} H5F_mtab_t;
+/*
+ * VFD SWMR: Entry for the delayed free space release doubly linked list
+ *
+ * offset: Unsigned 64-bit value containing the base address
+ * of the metadata page, or multi page metadata entry
+ * in the metadata file IN PAGES.
+ * To obtain byte offset, multiply this value by the
+ * page size.
+ * length: The length of the metadata page or multi page
+ * metadata entry in BYTES.
+ * tick_num: Sequence # of the current tick
+ * link: tailqueue linkage
+ */
+typedef struct shadow_defree {
+ uint64_t offset;
+ uint32_t length;
+ uint64_t tick_num;
+ TAILQ_ENTRY(shadow_defree) link;
+} shadow_defree_t;
+
/* Structure specifically to store superblock. This was originally
* maintained entirely within H5F_shared_t, but is now extracted
* here because the superblock is now handled by the cache */
@@ -232,6 +256,19 @@ typedef struct H5F_super_t {
H5G_entry_t *root_ent; /* Root group symbol table entry */
} H5F_super_t;
+/* VFD SWMR: deferred free on the lower VFD. */
+typedef struct lower_defree {
+ SIMPLEQ_ENTRY(lower_defree) link;
+ H5FD_mem_t alloc_type;
+ haddr_t addr;
+ hsize_t size;
+ uint64_t free_after_tick;
+} lower_defree_t;
+
+typedef SIMPLEQ_HEAD(lower_defree_queue, lower_defree) lower_defree_queue_t;
+
+typedef TAILQ_HEAD(shadow_defree_queue, shadow_defree) shadow_defree_queue_t;
+
/*
* Define the structure to store the file information for HDF5 files. One of
* these structures is allocated per file, not per H5Fopen(). That is, set of
@@ -271,7 +308,9 @@ struct H5F_shared_t {
unsigned long feature_flags; /* VFL Driver feature Flags */
haddr_t maxaddr; /* Maximum address for file */
- H5PB_t *page_buf; /* The page buffer cache */
+ H5PB_t *pb_ptr; /* pointer to the page buffer, or NULL */
+ /* if the page buffer is disabled. */
+
H5AC_t *cache; /* The object cache */
H5AC_cache_config_t
mdc_initCacheCfg; /* initial configuration for the */
@@ -357,6 +396,81 @@ struct H5F_shared_t {
H5F_object_flush_t object_flush; /* Information for object flush callback */
hbool_t crt_dset_min_ohdr_flag; /* flag to minimize created dataset object header */
+ /* VFD SWMR */
+
+ /* Configuration info */
+ H5F_vfd_swmr_config_t vfd_swmr_config; /* Copy of the VFD SWMR
+ * configuration from the
+ * FAPL used to open the file
+ */
+ haddr_t writer_index_offset;
+ hbool_t vfd_swmr; /* The file is opened with VFD
+ * SWMR configured or not
+ */
+ hbool_t vfd_swmr_writer; /* This is the VFD SWMR writer or
+ * not
+ */
+ uint64_t tick_num; /* Number of the current tick */
+ struct timespec end_of_tick; /* End time of the current tick */
+
+ lower_defree_queue_t lower_defrees; /* For use by VFD SWMR writers. */
+ /* VFD SWMR metadata file index */
+ H5FD_vfd_swmr_idx_entry_t * mdf_idx; /* pointer to an array of instance
+ * of H5FD_vfd_swmr_idx_entry_t of
+ * length mdf_idx_len. This array
+ * is used by the vfd swmr writer
+ * to assemble the metadata file
+ * index at the end of each tick,
+ * and by the vfd swmr readers to
+ * track changes in the index.
+ * With one brief exception during
+ * writer end of tick processing,
+ * this index will always be sorted
+ * in increasing HDF5 file page
+ * offset order.
+ *
+ * This field should be NULL unless
+ * the index is defined.
+ */
+ uint32_t mdf_idx_len; /* number of entries in the array
+ * of instances of
+ * H5FD_vfd_swmr_idx_entry_t pointed
+ * to by mdf_idx above. Note that
+ * not all entries in the index
+ * need be used.
+ */
+ uint32_t mdf_idx_entries_used; /* Number of entries in *mdf_idx
+ * that are in use -- these will
+ * be contiguous at indices 0
+ * through mdf_idx_entries_used - 1.
+ */
+
+ /* Old VFD SWMR metadata file index. These fields are used only
+ * by the VFD SWMR reader to store the previous version of the
+ * metadata file index so that it can be compared with the current
+ * version to identify page buffer and metadata cache entries that
+ * must be evicted or refreshed to avoid message from the past bugs.
+ */
+ H5FD_vfd_swmr_idx_entry_t * old_mdf_idx;
+ uint32_t old_mdf_idx_len;
+ uint32_t old_mdf_idx_entries_used;
+
+ /* Metadata file for VFD SWMR writer */
+ int vfd_swmr_md_fd; /* POSIX: file descriptor for the
+ * metadata file
+ */
+ haddr_t vfd_swmr_md_eoa; /* POSIX: eoa for the metadata
+ * file
+ */
+
+ /* Free space manager for the metadata file */
+ H5FS_t *fs_man_md; /* Free-space manager */
+ H5F_fs_state_t fs_state_md; /* State of the free space
+ * manager
+ */
+
+ /* Delayed free space release doubly linked list */
+ shadow_defree_queue_t shadow_defrees;
char *extpath; /* Path for searching target external link file */
#ifdef H5_HAVE_PARALLEL
@@ -366,6 +480,7 @@ struct H5F_shared_t {
};
+
/*
* This is the top-level file descriptor. One of these structures is
* allocated every time H5Fopen() is called although they may contain pointers
@@ -412,6 +527,9 @@ H5_DLL herr_t H5F__start_swmr_write(H5F_t *f);
H5_DLL herr_t H5F__close(H5F_t *f);
H5_DLL herr_t H5F__set_libver_bounds(H5F_t *f, H5F_libver_t low, H5F_libver_t high);
H5_DLL herr_t H5F__get_cont_info(const H5F_t *f, H5VL_file_cont_info_t *info);
+H5_DLL herr_t H5F__vfd_swmr_end_tick(H5F_t *f);
+H5_DLL herr_t H5F__vfd_swmr_disable_end_of_tick(H5F_t *f);
+H5_DLL herr_t H5F__vfd_swmr_enable_end_of_tick(H5F_t *f);
/* File mount related routines */
H5_DLL herr_t H5F__mount(H5G_loc_t *loc, const char *name, H5F_t *child, hid_t plist_id);
@@ -471,6 +589,12 @@ H5_DLL herr_t H5F__get_sohm_mesg_count_test(hid_t fid, unsigned type_id, size_t
H5_DLL herr_t H5F__check_cached_stab_test(hid_t file_id);
H5_DLL herr_t H5F__get_maxaddr_test(hid_t file_id, haddr_t *maxaddr);
H5_DLL herr_t H5F__get_sbe_addr_test(hid_t file_id, haddr_t *sbe_addr);
+
+/* VFD SWMR testing routines */
+H5_DLL herr_t H5F__vfd_swmr_writer_create_open_flush_test(hid_t file_id, hbool_t create);
+H5_DLL herr_t H5F__vfd_swmr_writer_md_test(hid_t, unsigned,
+ struct H5FD_vfd_swmr_idx_entry_t *, unsigned);
+
H5_DLL htri_t H5F__same_file_test(hid_t file_id1, hid_t file_id2);
#endif /* H5F_TESTING */
diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h
index c5d4c89..fdacecd 100644
--- a/src/H5Fprivate.h
+++ b/src/H5Fprivate.h
@@ -203,6 +203,20 @@ typedef struct H5F_t H5F_t;
(p) += 8; \
}
+static inline uint64_t
+uint64_decode(uint8_t **pp)
+{
+ int i;
+ uint8_t *p = *pp;
+ uint64_t v = 0;
+
+ for (i = 0; i < 8; i++) {
+ v = (v << 8) | p[7 - i];
+ }
+ *pp += 8;
+ return v;
+}
+
# define UINT64DECODE(p, n) { \
/* WE DON'T CHECK FOR OVERFLOW! */ \
size_t _i; \
@@ -333,6 +347,10 @@ typedef struct H5F_t H5F_t;
#define H5F_THRESHOLD(F) ((F)->shared->threshold)
#define H5F_PGEND_META_THRES(F) ((F)->shared->fs.pgend_meta_thres)
#define H5F_POINT_OF_NO_RETURN(F) ((F)->shared->fs.point_of_no_return)
+#define H5F_FIRST_ALLOC_DEALLOC(F) ((F)->shared->first_alloc_dealloc)
+#define H5F_EOA_PRE_FSM_FSALLOC(F) ((F)->shared->eoa_pre_fsm_fsalloc)
+#define H5F_USE_VFD_SWMR(F) ((F)->shared->vfd_swmr)
+#define H5F_VFD_SWMR_MD_EOA(F) ((F)->shared->vfd_swmr_md_eoa)
#define H5F_NULL_FSM_ADDR(F) ((F)->shared->null_fsm_addr)
#define H5F_GET_MIN_DSET_OHDR(F) ((F)->shared->crt_dset_min_ohdr_flag)
#define H5F_SET_MIN_DSET_OHDR(F, V) ((F)->shared->crt_dset_min_ohdr_flag = (V))
@@ -395,6 +413,9 @@ typedef struct H5F_t H5F_t;
#define H5F_THRESHOLD(F) (H5F_get_threshold(F))
#define H5F_PGEND_META_THRES(F) (H5F_get_pgend_meta_thres(F))
#define H5F_POINT_OF_NO_RETURN(F) (H5F_get_point_of_no_return(F))
+#define H5F_FIRST_ALLOC_DEALLOC(F) (H5F_get_first_alloc_dealloc(F))
+#define H5F_EOA_PRE_FSM_FSALLOC(F) (H5F_get_eoa_pre_fsm_fsalloc(F))
+#define H5F_USE_VFD_SWMR(F) (H5F_use_vfd_swmr(F))
#define H5F_NULL_FSM_ADDR(F) (H5F_get_null_fsm_addr(F))
#define H5F_GET_MIN_DSET_OHDR(F) (H5F_get_min_dset_ohdr(F))
#define H5F_SET_MIN_DSET_OHDR(F, V) (H5F_set_min_dset_ohdr((F), (V)))
@@ -523,6 +544,21 @@ typedef struct H5F_t H5F_t;
#define H5F_ACS_MPI_PARAMS_INFO_NAME "mpi_params_info" /* the MPI info struct */
#endif /* H5_HAVE_PARALLEL */
+/* Default configuration for VFD SWMR: not configured */
+#define H5F_ACS_VFD_SWMR_CONFIG_NAME "vfd_swmr_config"
+#define H5F__DEFAULT_VFD_SWMR_CONFIG \
+{ \
+ /* int32_t version = */ 0, \
+ /* uint32_t tick_len = */ 0, \
+ /* uint32_t max_lag = */ 0, \
+ /* hbool_t writer = */ FALSE, \
+ /* hbool_t flush_raw_data = */ FALSE, \
+ /* uint32_t md_pages_reserved = */ 0, \
+ /* uint32_t pb_expansion_threshold = */ 0, \
+ /* char md_file_path[] = */ "", \
+ /* char log_file_path[] = */ "" \
+}
+
/* ======================== File Mount properties ====================*/
#define H5F_MNT_SYM_LOCAL_NAME "local" /* Whether absolute symlinks local to file. */
@@ -591,6 +627,10 @@ typedef struct H5F_t H5F_t;
#define H5F_SHARED_PAGED_AGGR(F_SH) ((F_SH)->fs_strategy == H5F_FSPACE_STRATEGY_PAGE && (F_SH)->fs_page_size)
#define H5F_PAGED_AGGR(F) (F->shared->fs_strategy == H5F_FSPACE_STRATEGY_PAGE && F->shared->fs_page_size)
+/* Check for file configured with VFD SWMR */
+#define H5F_SHARED_VFD_SWMR_CONFIG(S) (S->vfd_swmr_config.version >= H5F__CURR_VFD_SWMR_CONFIG_VERSION)
+#define H5F_VFD_SWMR_CONFIG(F) H5F_SHARED_VFD_SWMR_CONFIG(F->shared)
+
/* Metadata read attempt values */
#define H5F_METADATA_READ_ATTEMPTS 1 /* Default # of read attempts for non-SWMR access */
#define H5F_SWMR_METADATA_READ_ATTEMPTS 100 /* Default # of read attempts for SWMR access */
@@ -882,5 +922,7 @@ H5_DLL herr_t H5F_cwfs_remove_heap(H5F_shared_t *shared, struct H5HG_heap_t *hea
/* Debugging functions */
H5_DLL herr_t H5F_debug(H5F_t *f, FILE * stream, int indent, int fwidth);
+H5_DLL hbool_t H5F_use_vfd_swmr(const H5F_t *f);
+
#endif /* _H5Fprivate_H */
diff --git a/src/H5Fpublic.h b/src/H5Fpublic.h
index 02568c9..d4f6341 100644
--- a/src/H5Fpublic.h
+++ b/src/H5Fpublic.h
@@ -219,6 +219,96 @@ typedef struct H5F_retry_info_t {
/* Callback for H5Pset_object_flush_cb() in a file access property list */
typedef herr_t (*H5F_flush_cb_t)(hid_t object_id, void *udata);
+/* VFD SWMR configuration data used by H5Pset/get_vfd_swmr_config */
+#define H5F__CURR_VFD_SWMR_CONFIG_VERSION 1
+#define H5F__MAX_VFD_SWMR_FILE_NAME_LEN 1024
+#define H5F__MAX_PB_EXPANSION_THRESHOLD 100
+/*
+ * struct H5F_vfd_swmr_config_t
+ *
+ * Instances of H5F_vfd_swmr_config_t are used by VFD SWMR writers and readers
+ * to pass necessary configuration data to the HDF5 library on file open (or
+ * creation, in the case of writers).
+ *
+ * The fields of the structure are discussed below:
+ * version:
+ * An integer field indicating the version of the H5F_vfd_swmr_config
+ * structure used. This field must always be set to a known version
+ * number. The most recent version of the structure will always be
+ * H5F__CURR_VFD_SWMR_CONFIG_VERSION.
+ *
+ * tick_len:
+ * An integer field containing the length of a tick in tenths of
+ * a second. If tick_len is zero, end of tick processing may only be
+ * triggered manually via the H5Fvfd_swmr_end_tick() function.
+ *
+ * max_lag:
+ * An integer field indicating the maximum expected lag (in ticks)
+ * between the writer and the readers. This value must be at least 3,
+ * with 10 being the recommended minimum value.
+ *
+ * writer:
+ * A boolean flag indicating whether the file opened with this FAPL entry
+ * will be opened R/W. (i.e. as a VFD SWMR writer)
+ *
+ * flush_raw_data:
+ * A boolean flag indicating whether raw data should be flushed
+ * as part of the end of tick processing. If set to TRUE, raw
+ * data will be flushed and thus be consistent with the metadata file.
+ * However, this will also greatly increase end of tick I/O, and will
+ * likely break any real time guarantees unless a very large tick_len
+ * is selected.
+ *
+ * md_pages_reserved:
+ * An integer field indicating the number of pages reserved
+ * at the head of the metadata file. This value must be greater than
+ * or equal to 1.
+ * When the metadata file is created, the specified number of pages is
+ * reserved at the head of the metadata file. In the current
+ * implementation, the size of the metadata file header plus the
+ * index is limited to this size.
+ * Further, in the POSIX case, when readers check for an updated index,
+ * this check will start with a read of md_pages_reserved pages from
+ * the head of the metadata file.
+ *
+ * pb_expansion_threshold:
+ * An integer field indicating the threshold for the page buffer size.
+ * During a tick, the page buffer must expand as necessary to retain copies
+ * of all modified metadata pages and multi-page metadata entries.
+ * If the page buffer size exceeds this threshold, an early end of tick
+ * will be triggered.
+ * Note that this is not a limit on the maximum page buffer size, as the
+ * metadata cache is flushed as part of end of tick processing.
+ * This threshold must be in the range [0, 100]. If the threshold is 0,
+ * the feature is disabled. For all other values, the page buffer size is
+ * multiplied by this threshold. If this value is exceeded, an early end
+ * of tick is triggered.
+ *
+ * md_file_path:
+ * POSIX: this field contains the path of the metadata file.
+ * NFS: it contains the path and base name of the metadata file
+ * updater files.
+ * Object store: it contains the base URL for the objects used
+ * to store metadata file updater objects.
+ *
+ * log_file_path:
+ * This field contains the path to the log file. If defined, this path should
+ * be unique to each process. If this field contains the empty string, a log
+ * file will not be created.
+ *
+ */
+typedef struct H5F_vfd_swmr_config_t {
+ int32_t version;
+ uint32_t tick_len;
+ uint32_t max_lag;
+ hbool_t writer;
+ hbool_t flush_raw_data;
+ uint32_t md_pages_reserved;
+ uint32_t pb_expansion_threshold;
+ char md_file_path[H5F__MAX_VFD_SWMR_FILE_NAME_LEN + 1];
+ char log_file_path[H5F__MAX_VFD_SWMR_FILE_NAME_LEN + 1];
+} H5F_vfd_swmr_config_t;
+
/*********************/
/* Public Prototypes */
/*********************/
@@ -281,6 +371,13 @@ H5_DLL herr_t H5Fget_mdc_image_info(hid_t file_id, haddr_t *image_addr, hsize_t
H5_DLL herr_t H5Fget_dset_no_attrs_hint(hid_t file_id, hbool_t *minimize);
H5_DLL herr_t H5Fset_dset_no_attrs_hint(hid_t file_id, hbool_t minimize);
+/* VFD SWMR */
+H5_DLL herr_t H5Fvfd_swmr_end_tick(hid_t file_id);
+H5_DLL herr_t H5Fvfd_swmr_disable_end_of_tick(hid_t file_id);
+H5_DLL herr_t H5Fvfd_swmr_enable_end_of_tick(hid_t file_id);
+H5_DLL bool vfd_swmr_writer_may_increase_tick_to(uint64_t, bool);
+H5_DLL void vfd_swmr_reader_did_increase_tick_to(uint64_t);
+
#ifdef H5_HAVE_PARALLEL
H5_DLL herr_t H5Fset_mpi_atomicity(hid_t file_id, hbool_t flag);
H5_DLL herr_t H5Fget_mpi_atomicity(hid_t file_id, hbool_t *flag);
diff --git a/src/H5Fquery.c b/src/H5Fquery.c
index e1b11c8..b565f22 100644
--- a/src/H5Fquery.c
+++ b/src/H5Fquery.c
@@ -1364,3 +1364,26 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5F_get_cont_info */
+
+/*-------------------------------------------------------------------------
+ * Function: H5F_use_vfd_swmr
+ *
+ * Purpose: Quick and dirty routine to determine if VFD SWMR is
+ * enabled for this file.
+ * (Mainly added to stop non-file routines from poking about in the
+ * H5F_t data structure)
+ *
+ * Return: TRUE/FALSE on success/abort on failure (shouldn't fail)
+ *-------------------------------------------------------------------------
+ */
+hbool_t
+H5F_use_vfd_swmr(const H5F_t *f)
+{
+ /* Use FUNC_ENTER_NOAPI_NOINIT_NOERR here to avoid performance issues */
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ HDassert(f);
+ HDassert(f->shared);
+
+ FUNC_LEAVE_NOAPI(f->shared->vfd_swmr)
+} /* end H5F_use_vfd_swmr() */
diff --git a/src/H5Fsfile.c b/src/H5Fsfile.c
index 9a9bbab..3cbd490 100644
--- a/src/H5Fsfile.c
+++ b/src/H5Fsfile.c
@@ -138,28 +138,14 @@ done:
H5F_shared_t *
H5F__sfile_search(H5FD_t *lf)
{
- H5F_sfile_node_t *curr; /* Current shared file node */
- H5F_shared_t *ret_value = NULL; /* Return value */
-
- FUNC_ENTER_PACKAGE_NOERR
-
- /* Sanity check */
- HDassert(lf);
-
- /* Iterate through low-level files for matching low-level file info */
- curr = H5F_sfile_head_g;
- while(curr) {
- /* Check for match */
- if(0 == H5FD_cmp(curr->shared->lf, lf))
- HGOTO_DONE(curr->shared)
-
- /* Advance to next shared file node */
- curr = curr->next;
- } /* end while */
-
-done:
- FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5F__sfile_search() */
+ H5F_sfile_node_t *curr;
+
+ for (curr = H5F_sfile_head_g; curr != NULL; curr = curr->next) {
+ if(curr->shared->lf == lf)
+ return curr->shared;
+ }
+ return NULL;
+}
/*-------------------------------------------------------------------------
diff --git a/src/H5Fspace.c b/src/H5Fspace.c
index 6baf163..7de8b20 100644
--- a/src/H5Fspace.c
+++ b/src/H5Fspace.c
@@ -43,6 +43,7 @@
/****************/
+
/******************/
/* Local Typedefs */
/******************/
diff --git a/src/H5Fsuper_cache.c b/src/H5Fsuper_cache.c
index 119548c..d1227d0 100644
--- a/src/H5Fsuper_cache.c
+++ b/src/H5Fsuper_cache.c
@@ -84,6 +84,8 @@ static herr_t H5F__cache_drvrinfo_image_len(const void *thing, size_t *image_len
static herr_t H5F__cache_drvrinfo_serialize(const H5F_t *f, void *image, size_t len,
void *thing);
static herr_t H5F__cache_drvrinfo_free_icr(void *thing);
+static herr_t H5F__cache_superblock_refresh(H5F_t *f, void * _thing, const void * _image,
+ size_t * len_ptr);
/* Local encode/decode routines */
static herr_t H5F__superblock_prefix_decode(H5F_super_t *sblock,
@@ -114,6 +116,7 @@ const H5AC_class_t H5AC_SUPERBLOCK[1] = {{
NULL, /* 'notify' callback */
H5F__cache_superblock_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ H5F__cache_superblock_refresh, /* VFD SWMR 'refresh' callback */
}};
/* H5F driver info block inherits cache-like properties from H5AC */
@@ -132,6 +135,7 @@ const H5AC_class_t H5AC_DRVRINFO[1] = {{
NULL, /* 'notify' callback */
H5F__cache_drvrinfo_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
@@ -1087,3 +1091,268 @@ H5F__cache_drvrinfo_free_icr(void *_thing)
FUNC_LEAVE_NOAPI(SUCCEED)
} /* H5F__cache_drvrinfo_free_icr() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5F__cache_superblock_refresh
+ *
+ * Purpose: Examine the supplied image buffer, and update the
+ * superblock accordingly.
+ *
+ * This function is only called when the file is opened in
+ * VFD SWMR reader mode -- which implies that the file has
+ * been opened R/O. Thus the internal representation of
+ * the superblock must be clean, and may be modified without
+ * concern for local changes.
+ *
+ * Further, most of the superblock is fixed once the file
+ * is created, for the most part, this function simply
+ * verifies the expected values.
+ *
+ * Return: Success: SUCCEED
+ * Failure: FAIL
+ *
+ * Programmer: John Mainzer
+ * 12/21/19
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5F__cache_superblock_refresh(H5F_t *f, void * _thing, const void * _image,
+ size_t * len_ptr)
+{
+ H5F_super_t *sblock = (H5F_super_t *)_thing;
+ const uint8_t *image = (const uint8_t *)_image;
+ size_t expected_image_len;
+ unsigned super_vers; /* Superblock version */
+ uint8_t sizeof_addr; /* Size of addresses in file */
+ uint8_t sizeof_size; /* Size of offsets in file */
+ uint32_t status_flags; /* File status flags */
+ unsigned sym_leaf_k; /* Size of leaves in symbol tables */
+ haddr_t base_addr; /* Absolute base address for rel.addrs. */
+ /* (superblock for file is at this offset) */
+ haddr_t stored_eof;
+ haddr_t ext_addr; /* Relative address of superblock extension */
+ haddr_t driver_addr; /* File driver information block address */
+ haddr_t root_addr; /* Root group address */
+ H5G_entry_t root_ent; /* Root group symbol table entry */
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+ /* sanity checks */
+ HDassert(f);
+ HDassert(sblock);
+ HDassert(sblock == f->shared->sblock);
+ HDassert(image);
+ HDassert(len_ptr);
+ HDassert(*len_ptr >= H5F_SUPERBLOCK_FIXED_SIZE + 6);
+
+ /* skip the signature */
+ image += H5F_SIGNATURE_LEN;
+
+ /* get the superblock version */
+ super_vers = *image++;
+
+ if ( sblock->super_vers != super_vers )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected superblock vers")
+
+ /* verify sizes of addresses and offsets */
+ if(super_vers < HDF5_SUPERBLOCK_VERSION_2) {
+ sizeof_addr = image[4];
+ sizeof_size = image[5];
+ } /* end if */
+ else {
+ sizeof_addr = image[0];
+ sizeof_size = image[1];
+ } /* end else */
+
+ if ( sblock->sizeof_addr != sizeof_addr )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected sizeof_addr")
+
+ if ( sblock->sizeof_size != sizeof_size )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected sizeof_size")
+
+ /* compute expected image len */
+ expected_image_len = H5F_SUPERBLOCK_FIXED_SIZE +
+ (size_t)H5F_SUPERBLOCK_VARLEN_SIZE(super_vers, sizeof_addr, sizeof_size);
+
+ if ( expected_image_len != *len_ptr ) {
+
+ *len_ptr = expected_image_len;
+ HGOTO_DONE(SUCCEED)
+ }
+
+ /* at this point, we know that the supplied image is of
+ * the correct length.
+ */
+
+ /* validate the older version of the superblock */
+ if(sblock->super_vers < HDF5_SUPERBLOCK_VERSION_2) {
+
+ unsigned snode_btree_k; /* B-tree symbol table internal node 'K' value */
+ unsigned chunk_btree_k; /* B-tree chunk internal node 'K' value */
+
+ /* Freespace version (hard-wired) */
+ if(HDF5_FREESPACE_VERSION != *image++)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "bad free space version number")
+
+ /* Root group version number (hard-wired) */
+ if(HDF5_OBJECTDIR_VERSION != *image++)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "bad object directory version number")
+
+ /* Skip over reserved byte */
+ image++;
+
+ /* Shared header version number (hard-wired) */
+ if(HDF5_SHAREDHEADER_VERSION != *image++)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "bad shared-header format version number")
+
+ /* Skip over size of file addresses (already decoded and checked) */
+ image++;
+
+ /* Skip over size of file sizes (already decoded and checked) */
+ image++;
+
+ /* Skip over reserved byte */
+ image++;
+
+ /* Various B-tree sizes */
+ UINT16DECODE(image, sym_leaf_k);
+ if ( sym_leaf_k != sblock->sym_leaf_k )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected sym_leaf_k")
+
+ /* Need 'get' call to set other array values */
+ UINT16DECODE(image, snode_btree_k);
+ if ( snode_btree_k != sblock->btree_k[H5B_SNODE_ID] )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected snode_btree_k")
+
+ /* File status flags (not really used yet) */
+ /* If the file has closed, the status flags will be zero.
+ * Allow this.
+ */
+ UINT32DECODE(image, status_flags);
+ if ( ( status_flags != sblock->status_flags ) &&
+ ( status_flags != 0 ) )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected status_flags")
+
+ /*
+ * If the superblock version # is greater than 0, read in the indexed
+ * storage B-tree internal 'K' value
+ */
+ if(sblock->super_vers > HDF5_SUPERBLOCK_VERSION_DEF) {
+ UINT16DECODE(image, chunk_btree_k);
+
+ if ( chunk_btree_k != sblock->btree_k[H5B_CHUNK_ID] )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected chunk_btree_k")
+
+ /* Reserved bytes are present only in version 1 */
+ if(sblock->super_vers == HDF5_SUPERBLOCK_VERSION_1)
+ image += 2; /* reserved */
+ } /* end if */
+
+ /* Remainder of "variable-sized" portion of superblock */
+ H5F_addr_decode(f, (const uint8_t **)&image, &base_addr/*out*/);
+ H5F_addr_decode(f, (const uint8_t **)&image, &ext_addr/*out*/);
+ H5F_addr_decode(f, (const uint8_t **)&image, &stored_eof/*out*/);
+ H5F_addr_decode(f, (const uint8_t **)&image, &driver_addr/*out*/);
+
+ if ( base_addr != sblock->base_addr )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected base_addr")
+
+ if ( ext_addr != sblock->ext_addr )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected ext_addr")
+
+ /* use stored_eof to update EOA below */
+
+ if ( driver_addr != sblock->driver_addr )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected driver_addr")
+
+ /* decode the root group symbol table entry */
+ if(H5G_ent_decode(f, (const uint8_t **)&image, &root_ent) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTDECODE, FAIL, "can't decode root group symbol table entry")
+
+ /* Set the root group address to the correct value */
+ root_addr = root_ent.header;
+
+ if ( root_addr != sblock->root_addr )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected root_addr")
+
+ HDassert(root_ent.type == H5G_CACHED_STAB);
+
+ if ( ( root_ent.type != sblock->root_ent->type ) ||
+ ( root_ent.cache.stab.btree_addr !=
+ sblock->root_ent->cache.stab.btree_addr ) ||
+ ( root_ent.cache.stab.heap_addr !=
+ sblock->root_ent->cache.stab.heap_addr ) )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected root_ent data")
+
+
+ /* NOTE: Driver info block is decoded separately, later */
+
+ } /* end if */
+ else {
+ uint32_t read_chksum;
+ uint32_t computed_chksum;
+
+ /* Skip over size of file addresses (already decoded and checked) */
+ image++;
+
+ /* Skip over size of file sizes (already decoded and checked) */
+ image++;
+
+ /* File status flags (not really used yet) */
+ status_flags = *image++;
+
+ /* If the file has closed, the status flags will be zero.
+ * Allow this.
+ */
+ if ( ( status_flags != sblock->status_flags ) &&
+ ( status_flags != 0 ) )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected status_flags")
+
+ /* Base, superblock extension, end of file & root group object
+ * header addresses
+ */
+ H5F_addr_decode(f, (const uint8_t **)&image, &base_addr/*out*/);
+ H5F_addr_decode(f, (const uint8_t **)&image, &ext_addr/*out*/);
+ H5F_addr_decode(f, (const uint8_t **)&image, &stored_eof/*out*/);
+ H5F_addr_decode(f, (const uint8_t **)&image, &root_addr/*out*/);
+
+ if ( base_addr != sblock->base_addr )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected base_addr")
+
+ if ( ext_addr != sblock->ext_addr )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected ext_addr")
+
+ if ( root_addr != sblock->root_addr )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected root_addr")
+
+ /* use stored_eof to update EOA below */
+
+ /* Decode checksum */
+ UINT32DECODE(image, read_chksum);
+
+ if ( H5F_get_checksums((const uint8_t *)_image,
+ (size_t)(image - (const uint8_t *)_image),
+ NULL, &computed_chksum) < 0 )
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't compute chksum")
+
+ if ( read_chksum != computed_chksum )
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "unexpected checksum")
+
+ } /* end else */
+
+ /* Sanity check */
+ HDassert((size_t)(image - (const uint8_t *)_image) <= *len_ptr);
+
+ /* update the EOA */
+ if(H5F__set_eoa(f, H5FD_MEM_DEFAULT, stored_eof - base_addr) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "unable to update EOA")
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5F__cache_superblock_refresh() */
diff --git a/src/H5Ftest.c b/src/H5Ftest.c
index 49a2a22..f249d45 100644
--- a/src/H5Ftest.c
+++ b/src/H5Ftest.c
@@ -38,12 +38,14 @@
/* Headers */
/***********/
#include "H5private.h" /* Generic Functions */
+#include "H5FDprivate.h" /* File Drivers */
#include "H5CXprivate.h" /* API Contexts */
#include "H5Eprivate.h" /* Error handling */
#include "H5Fpkg.h" /* File access */
#include "H5Gpkg.h" /* Groups */
#include "H5Iprivate.h" /* IDs */
#include "H5SMpkg.h" /* Shared object header messages */
+#include "H5MMprivate.h" /* Memory management */
#include "H5VLprivate.h" /* Virtual Object Layer */
@@ -65,6 +67,11 @@
/********************/
/* Local Prototypes */
/********************/
+static herr_t H5F__vfd_swmr_decode_md_hdr(int md_fd, H5FD_vfd_swmr_md_header *md_hdr);
+static herr_t H5F__vfd_swmr_decode_md_idx(int md_fd, H5FD_vfd_swmr_md_header *md_hdr, H5FD_vfd_swmr_md_index *md_idx);
+static herr_t H5F__vfd_swmr_verify_md_hdr_and_idx(H5F_t *f,
+ H5FD_vfd_swmr_md_header *md_hdr, H5FD_vfd_swmr_md_index *md_idx,
+ unsigned num_entries, H5FD_vfd_swmr_idx_entry_t *index);
/*********************/
@@ -75,6 +82,8 @@
/*****************************/
/* Library Private Variables */
/*****************************/
+/* Declare external the free list for H5FD_vfd_swmr_idx_entry_t */
+H5FL_SEQ_EXTERN(H5FD_vfd_swmr_idx_entry_t);
/*******************/
@@ -233,6 +242,373 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5F__get_sbe_addr_test() */
+/*
+ * VFD SWMR tests
+ */
+
+/*-------------------------------------------------------------------------
+ * Function: H5F__vfd_swmr_writer_create_open_flush_test
+ *
+ * Purpose: Verify info in the header and index when:
+ * (1) creating an HDF5 file
+ * (2) opening an existing HDF5 file
+ * (3) flushing an HDF5 file
+ *
+ * Open the metadata file
+ * Verify the file size is as expected (md_pages_reserved)
+ * For file create:
+ * --No header magic is found
+ * For file open or file flush:
+ * --Read and decode the header and index in the metadata file
+ * --Verify info in the header and index read from
+ * the metadata file is as expected (empty index)
+ *
+ * Return: SUCCEED/FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F__vfd_swmr_writer_create_open_flush_test(hid_t file_id, hbool_t file_create)
+{
+ H5F_t *f; /* File pointer */
+ h5_stat_t stat_buf; /* Buffer for stat info */
+ H5FD_vfd_swmr_md_header md_hdr; /* Header for the metadata file */
+ H5FD_vfd_swmr_md_index md_idx; /* Index for the metadata file */
+ int md_fd = -1; /* The metadata file descriptor */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Check arguments */
+ if(NULL == (f = (H5F_t *)H5VL_object_verify(file_id, H5I_FILE)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file")
+
+ /* Open the metadata file */
+ if((md_fd = HDopen(f->shared->vfd_swmr_config.md_file_path, O_RDONLY)) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "error opening metadata file")
+
+ /* Verify the minimum size for the metadata file */
+ if(HDstat(f->shared->vfd_swmr_config.md_file_path, &stat_buf) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_BADFILE, FAIL, "unable to stat the metadata file")
+ if(stat_buf.st_size < (HDoff_t)((hsize_t)f->shared->vfd_swmr_config.md_pages_reserved * f->shared->fs_page_size))
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect metadata file size")
+
+ if(file_create) { /* Creating file */
+ uint32_t hdr_magic;
+
+ /* Seek to the beginning of the file */
+ if(HDlseek(md_fd, (HDoff_t)H5FD_MD_HEADER_OFF, SEEK_SET) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL, "error seeking metadata file")
+
+ /* Try to read the magic for header */
+ if(HDread(md_fd, &hdr_magic, H5_SIZEOF_MAGIC) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_READERROR, FAIL, "error reading metadata file")
+
+ /* Verify that there is no header magic in the metadata file */
+ if(HDmemcmp(&hdr_magic, H5FD_MD_HEADER_MAGIC, (size_t)H5_SIZEOF_MAGIC) == 0)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "error finding header magic in the metadata file")
+
+ } else { /* Opening or flushing the file */
+
+ HDmemset(&md_hdr, 0, sizeof(H5FD_vfd_swmr_md_header));
+ HDmemset(&md_idx, 0, sizeof(H5FD_vfd_swmr_md_index));
+
+ /* Decode the header */
+ if(H5F__vfd_swmr_decode_md_hdr(md_fd, &md_hdr) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTDECODE, FAIL, "error decoding header in the metadata file")
+
+ /* Decode the index */
+ if(H5F__vfd_swmr_decode_md_idx(md_fd, &md_hdr, &md_idx) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTDECODE, FAIL, "error decoding index in the metadata file")
+
+ /* Verify info in header and index read from the metadata file */
+ if(H5F__vfd_swmr_verify_md_hdr_and_idx(f, &md_hdr, &md_idx, 0, NULL) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect info found in header and index of the metadata file")
+ }
+
+done:
+ /* Free the index entries */
+ if(!file_create && md_idx.entries) {
+ HDassert(md_idx.num_entries);
+ H5MM_free(md_idx.entries);
+ }
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5F__vfd_swmr_writer_create_open_flush_test() */
+
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5F__vfd_swmr_decode_md_hdr
+ *
+ * Purpose: Decode header and verify header magic
+ *
+ * Return: SUCCEED/FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5F__vfd_swmr_decode_md_hdr(int md_fd, H5FD_vfd_swmr_md_header *md_hdr)
+{
+ uint64_t index_length;
+ uint8_t image[H5FD_MD_HEADER_SIZE]; /* Buffer for the header image */
+ uint8_t *p = NULL; /* Points to the image */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ p = image;
+
+ /* Seek to the beginning of the file */
+ if(HDlseek(md_fd, (HDoff_t)H5FD_MD_HEADER_OFF, SEEK_SET) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL, "error seeking metadata file")
+
+ /* Read the header */
+ if(HDread(md_fd, image, H5FD_MD_HEADER_SIZE) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_READERROR, FAIL, "error reading metadata file")
+
+ /* Verify magic for header */
+ if(HDmemcmp(p, H5FD_MD_HEADER_MAGIC, (size_t)H5_SIZEOF_MAGIC) != 0)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "does not find header magic in the metadata file")
+
+ p += H5_SIZEOF_MAGIC;
+
+ /* Deserialize fs_page_size, tick_num, index_offset, index_length */
+ UINT32DECODE(p, md_hdr->fs_page_size);
+ UINT64DECODE(p, md_hdr->tick_num);
+ UINT64DECODE(p, md_hdr->index_offset);
+ if ((index_length = uint64_decode(&p)) > SIZE_MAX) {
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "index is too long")
+ }
+ md_hdr->index_length = (size_t)index_length;
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5F__vfd_swmr_decode_md_hdr() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5F__vfd_swmr_decode_md_idx
+ *
+ * Purpose: Decode index and verify index magic
+ *
+ * Return: SUCCEED/FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5F__vfd_swmr_decode_md_idx(int md_fd, H5FD_vfd_swmr_md_header *md_hdr, H5FD_vfd_swmr_md_index *md_idx)
+{
+ uint8_t *image = NULL; /* Points to the buffer for the index image */
+ uint8_t *p = NULL; /* Points to the image */
+ unsigned i; /* Local index variable */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Allocate buffer for the index image */
+ if(NULL == (image = H5MM_malloc(md_hdr->index_length)))
+ HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, "memory allocation failed for index on disk buffer")
+
+ p = image;
+
+ /* Seek to the position of the index */
+ if(HDlseek(md_fd, (HDoff_t)md_hdr->index_offset, SEEK_SET) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL, "unable to seek in metadata file")
+
+ /* Read the index */
+ if(HDread(md_fd, image, md_hdr->index_length) < (int64_t)md_hdr->index_length)
+ HGOTO_ERROR(H5E_FILE, H5E_READERROR, FAIL, "error in reading the index in metadata file")
+
+ /* Verify magic for index */
+ if(HDmemcmp(p, H5FD_MD_INDEX_MAGIC, H5_SIZEOF_MAGIC) != 0)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "no index magic in the metadata file")
+
+ p += H5_SIZEOF_MAGIC;
+
+ /* Deserialize tick_num and num_entries */
+ UINT64DECODE(p, md_idx->tick_num);
+ UINT32DECODE(p, md_idx->num_entries);
+
+ /* Deserialize index entries */
+ if(md_idx->num_entries) {
+ md_idx->entries =
+ H5MM_calloc(md_idx->num_entries * sizeof(md_idx->entries[0]));
+ /* Allocate memory for the index entries */
+ if(NULL == md_idx->entries)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, "memory allocation failed for index entries")
+
+ /* Decode index entries */
+ for(i = 0; i < md_idx->num_entries; i++) {
+ UINT32DECODE(p, md_idx->entries[i].hdf5_page_offset);
+ UINT32DECODE(p, md_idx->entries[i].md_file_page_offset);
+ UINT32DECODE(p, md_idx->entries[i].length);
+ UINT32DECODE(p, md_idx->entries[i].chksum);
+ } /* end for */
+
+ } /* end if */
+
+done:
+ /* Free the buffer */
+ if(image)
+ H5MM_free(image);
+ if(ret_value < 0) {
+ /* Free the index entries */
+ if(md_idx->entries) {
+ HDassert(md_idx->num_entries);
+ H5MM_free(md_idx->entries);
+ }
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5F__vfd_swmr_decode_md_idx() */
+
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5F__vfd_swmr_verify_md_hdr_and_idx
+ *
+ * Purpose: Verify the header and index in the metadata file:
+ * --fs_page_size in md header is the same as that stored in "f"
+ * --index_length in md header is as indicated by num_entries
+ * --index_offset in md header is right after the header
+ * --number of entries in md index is num_entries
+ * --entries in md index is as indicated by num_entries and index
+ *
+ * Return: SUCCEED/FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5F__vfd_swmr_verify_md_hdr_and_idx(H5F_t *f,
+ H5FD_vfd_swmr_md_header *md_hdr, H5FD_vfd_swmr_md_index *md_idx,
+ unsigned num_entries, H5FD_vfd_swmr_idx_entry_t *index)
+{
+ unsigned i; /* Local index variable */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Verify fs_page_size read from header in the metadata file is fs_page_size in f */
+ if(md_hdr->fs_page_size != f->shared->fs_page_size)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect fs_page_size read from metadata file")
+
+ /* Verify index_length read from header in the metadata file is the size of num_entries index */
+ if(md_hdr->index_length != H5FD_MD_INDEX_SIZE(num_entries))
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect index_length read from metadata file")
+
+ /* Verify index_offset read from header in the metadata file is the size of md header */
+ if(md_hdr->index_offset != f->shared->fs_page_size)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect index_offset read from metadata file")
+
+ /* Verify num_entries read from index in the metadata file is num_entries */
+ if(md_idx->num_entries != num_entries)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect num_entries read from metadata file")
+
+ /* Verify empty/non-empty index entries */
+ if(num_entries == 0) {
+ /* Verify the index is empty */
+ if(md_idx->entries != NULL)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect entries in index")
+ } else {
+ /* Verify entries */
+ for(i = 0; i < num_entries; i++) {
+ if(md_idx->entries[i].length != index[i].length)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect length read from metadata file")
+
+ if(md_idx->entries[i].hdf5_page_offset != index[i].hdf5_page_offset)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect hdf5_page_offset read from metadata file")
+
+ if(md_idx->entries[i].md_file_page_offset != index[i].md_file_page_offset)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect md_file_page_offset read from metadata file")
+
+ if(md_idx->entries[i].chksum != index[i].chksum)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect chksum read from metadata file")
+ }
+ }
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5F__vfd_swmr_verify_md_hdr_and_idx() */
+
+static unsigned
+count_shadow_defrees(shadow_defree_queue_t *shadow_defrees)
+{
+ shadow_defree_t *shadow_defree;
+ unsigned count = 0;
+
+ TAILQ_FOREACH(shadow_defree, shadow_defrees, link)
+ count++;
+
+ return count;
+}
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5F__vfd_swmr_writer_md_test
+ *
+ * Purpose: Update the metadata file with the input index and verify
+ * the following:
+ * --info read from the metadata file is as indicated by
+ * the input: num_entries, index
+ * --# of entries on the delayed list is as indicated by
+ * the input: nshadow_defrees
+ *
+ * Return: SUCCEED/FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F__vfd_swmr_writer_md_test(hid_t file_id, unsigned num_entries,
+ H5FD_vfd_swmr_idx_entry_t *index, unsigned nshadow_defrees)
+{
+ H5F_t *f; /* File pointer */
+ int md_fd = -1; /* The metadata file descriptor */
+ H5FD_vfd_swmr_md_header md_hdr; /* Header for the metadata file */
+ H5FD_vfd_swmr_md_index md_idx; /* Index for the metadata file */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ HDmemset(&md_hdr, 0, sizeof(H5FD_vfd_swmr_md_header));
+ HDmemset(&md_idx, 0, sizeof(H5FD_vfd_swmr_md_index));
+
+ /* Check arguments */
+ if(NULL == (f = (H5F_t *)H5VL_object_verify(file_id, H5I_FILE)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file")
+
+ /* Update the metadata file with the input index */
+ if(H5F_update_vfd_swmr_metadata_file(f, num_entries, index) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, "error updating the md file with the index")
+
+ /* Verify the number of entries in the delayed list is as expected */
+ if(count_shadow_defrees(&f->shared->shadow_defrees) != nshadow_defrees)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect # of entries in the delayed list")
+
+ /* Open the metadata file */
+ if((md_fd = HDopen(f->shared->vfd_swmr_config.md_file_path, O_RDONLY)) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "error opening metadata file")
+
+ /* Decode the header in the metadata file */
+ if(H5F__vfd_swmr_decode_md_hdr(md_fd, &md_hdr) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTDECODE, FAIL, "error decoding header in the metadata file")
+
+ /* Decode the index in the metadata file */
+ if(H5F__vfd_swmr_decode_md_idx(md_fd, &md_hdr, &md_idx) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "error decoding index in the metadata file")
+
+ /* Verify info read from the metadata file is the same as the input index */
+ if(H5F__vfd_swmr_verify_md_hdr_and_idx(f, &md_hdr, &md_idx, num_entries, index) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect info found in header and index of the metadata file")
+
+done:
+ /* Free the index entries */
+ if(md_idx.entries) {
+ HDassert(md_idx.num_entries);
+ H5MM_free(md_idx.entries);
+ }
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5F__vfd_swmr_writer_md_test() */
+
/*-------------------------------------------------------------------------
* Function: H5F__same_file_test
diff --git a/src/H5Fvfd_swmr.c b/src/H5Fvfd_swmr.c
new file mode 100644
index 0000000..376fa38
--- /dev/null
+++ b/src/H5Fvfd_swmr.c
@@ -0,0 +1,2111 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*-------------------------------------------------------------------------
+ *
+ * Created: H5Fvfd_swmr.c
+ * Oct 10 2019
+ *
+ * Purpose: Functions for VFD SWMR.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/****************/
+/* Module Setup */
+/****************/
+
+#include "H5Fmodule.h" /* This source code file is part of the H5F module */
+
+
+/***********/
+/* Headers */
+/***********/
+#include "H5private.h" /* Generic Functions */
+#include "H5Aprivate.h" /* Attributes */
+#include "H5ACprivate.h" /* Metadata cache */
+#include "H5CXprivate.h" /* API Contexts */
+#include "H5Dprivate.h" /* Datasets */
+#include "H5Eprivate.h" /* Error handling */
+#include "H5Fpkg.h" /* File access */
+#include "H5FDprivate.h" /* File drivers */
+#include "H5Gprivate.h" /* Groups */
+#include "H5Iprivate.h" /* IDs */
+#include "H5Lprivate.h" /* Links */
+#include "H5MFprivate.h" /* File memory management */
+#include "H5MVprivate.h" /* File memory management for VFD SWMR */
+#include "H5MMprivate.h" /* Memory management */
+#include "H5Pprivate.h" /* Property lists */
+#include "H5SMprivate.h" /* Shared Object Header Messages */
+#include "H5Tprivate.h" /* Datatypes */
+#include "hlog.h"
+
+/****************/
+/* Local Macros */
+/****************/
+
+#define nanosecs_per_second 1000000000 /* nanoseconds per second */
+#define nanosecs_per_tenth_sec 100000000 /* nanoseconds per 0.1 second */
+
+/********************/
+/* Local Prototypes */
+/********************/
+
+static herr_t H5F__vfd_swmr_update_end_of_tick_and_tick_num(H5F_shared_t *,
+ hbool_t);
+static herr_t H5F__vfd_swmr_construct_write_md_hdr(H5F_shared_t *, uint32_t);
+static herr_t H5F__vfd_swmr_construct_write_md_idx(H5F_shared_t *, uint32_t,
+ struct H5FD_vfd_swmr_idx_entry_t[]);
+static herr_t H5F__idx_entry_cmp(const void *_entry1, const void *_entry2);
+static herr_t H5F__vfd_swmr_create_index(H5F_shared_t *);
+static herr_t H5F__vfd_swmr_writer__wait_a_tick(H5F_t *);
+
+/*********************/
+/* Package Variables */
+/*********************/
+
+/*
+ * Globals for VFD SWMR
+ */
+
+unsigned int vfd_swmr_api_entries_g = 0;/* Times the library was entered
+ * and re-entered minus the times
+ * it was exited. We only perform
+ * the end-of-tick processing
+ * on the 0->1 and 1->0
+ * transitions.
+ */
+HLOG_OUTLET_SHORT_DEFN(swmr, all);
+HLOG_OUTLET_SHORT_DEFN(eot, swmr);
+HLOG_OUTLET_SHORT_DEFN(eotq, eot);
+HLOG_OUTLET_SHORT_DEFN(shadow_defrees, swmr);
+HLOG_OUTLET_MEDIUM_DEFN(noisy_shadow_defrees, shadow_defrees,
+ HLOG_OUTLET_S_OFF);
+HLOG_OUTLET_SHORT_DEFN(shadow_index_enlarge, swmr);
+HLOG_OUTLET_SHORT_DEFN(shadow_index_reclaim, swmr);
+HLOG_OUTLET_SHORT_DEFN(shadow_index_update, swmr);
+HLOG_OUTLET_SHORT_DEFN(tick, swmr);
+HLOG_OUTLET_SHORT_DEFN(mdc_invalidation, swmr);
+
+/*
+ * The head of the end of tick queue (EOT queue) for files opened in either
+ * VFD SWMR write or VFD SWMR read mode
+ */
+eot_queue_t eot_queue_g = TAILQ_HEAD_INITIALIZER(eot_queue_g);
+
+/*******************/
+/* Local Variables */
+/*******************/
+
+/* Declare a free list to manage the shadow_defree_t struct */
+H5FL_DEFINE(shadow_defree_t);
+
+/* Declare a free list to manage the eot_queue_entry_t struct */
+H5FL_DEFINE(eot_queue_entry_t);
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F_vfd_swmr_init
+ *
+ * Purpose: Initialize globals and the corresponding fields in
+ * file pointer.
+ *
+ * For both VFD SWMR writer and reader:
+ *
+ * --set end_of_tick to the current time + tick length
+ *
+ * For VFD SWMR writer:
+ *
+ * --set f->shared->tick_num to 1
+ * --create the metadata file
+ * --when opening an existing HDF5 file, write header and
+ * empty index in the metadata file
+ *
+ * For VFD SWMR reader:
+ *
+ * --set f->shared->tick_num to the current tick read from the
+ * metadata file
+ *
+ * Return: Success: SUCCEED
+ * Failure: FAIL
+ *
+ * Programmer: Vailin Choi -- 11/??/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_vfd_swmr_init(H5F_t *f, hbool_t file_create)
+{
+ hsize_t md_size; /* Size of the metadata file */
+ haddr_t hdr_addr, idx_addr; /* Addresses returned from H5MV_alloc() */
+ herr_t ret_value = SUCCEED; /* Return value */
+ H5F_shared_t *shared = f->shared;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ HDassert(H5F_SHARED_VFD_SWMR_CONFIG(shared));
+
+ shared->vfd_swmr = TRUE;
+
+ /* RDWR intent selects the writer path; a read-only open takes the
+ * reader path below.
+ */
+ if(H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR) {
+
+ HDassert(shared->vfd_swmr_config.writer);
+
+ SIMPLEQ_INIT(&shared->lower_defrees);
+ shared->vfd_swmr_writer = TRUE;
+ shared->tick_num = 1;
+
+ if ( H5PB_vfd_swmr__set_tick(shared) < 0 )
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, \
+ "Can't update page buffer current tick")
+
+ /* Create the metadata file */
+ if ( ((shared->vfd_swmr_md_fd =
+ HDopen(shared->vfd_swmr_config.md_file_path, O_CREAT|O_RDWR,
+ H5_POSIX_CREATE_MODE_RW))) < 0 )
+
+ HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, \
+ "unable to create the metadata file")
+
+ /* Reserve md_pages_reserved pages in the shadow (metadata) file:
+ * one page for the header, the remainder for the index.
+ */
+ md_size = (hsize_t)shared->vfd_swmr_config.md_pages_reserved *
+ shared->fs_page_size;
+
+ assert(shared->fs_page_size >= H5FD_MD_HEADER_SIZE);
+
+ /* Allocate an entire page from the shadow file for the header. */
+ if ((hdr_addr = H5MV_alloc(f, shared->fs_page_size)) == HADDR_UNDEF){
+ HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL,
+ "error allocating shadow-file header");
+ }
+ HDassert(H5F_addr_eq(hdr_addr, H5FD_MD_HEADER_OFF));
+
+ idx_addr = H5MV_alloc(f, md_size - shared->fs_page_size);
+ if (idx_addr == HADDR_UNDEF) {
+ HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL,
+ "error allocating shadow-file index");
+ }
+
+ /* The index must land immediately after the header page. */
+ HDassert(H5F_addr_eq(idx_addr, shared->fs_page_size));
+
+ shared->writer_index_offset = idx_addr;
+
+ /* Set the metadata file size to md_pages_reserved */
+ if ( -1 == HDftruncate(shared->vfd_swmr_md_fd, (HDoff_t)md_size) )
+ HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL,
+ "truncate fail for the metadata file");
+
+ /* Set eof for metadata file to md_pages_reserved */
+ shared->vfd_swmr_md_eoa = (haddr_t)md_size;
+
+ /* When opening an existing HDF5 file, create header and empty
+ * index in the metadata file
+ */
+ if (!file_create) {
+
+ if (H5F__vfd_swmr_construct_write_md_idx(shared, 0, NULL) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL,
+ "fail to create index in md");
+
+ if (H5F__vfd_swmr_construct_write_md_hdr(shared, 0) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL,
+ "fail to create header in md");
+ }
+
+ } else { /* VFD SWMR reader */
+
+ HDassert(!shared->vfd_swmr_config.writer);
+
+ shared->vfd_swmr_writer = FALSE;
+
+ HDassert(shared->mdf_idx == NULL);
+
+ /* allocate an index to save the initial index */
+ if (H5F__vfd_swmr_create_index(shared) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL,
+ "unable to allocate metadata file index");
+
+ /* Set tick_num to the current tick read from the metadata file */
+ /* NOTE(review): mdf_idx_entries_used is seeded with the index
+ * capacity and presumably updated to the actual entry count by
+ * the driver call below -- confirm against H5FDvfd_swmr.c.
+ */
+ shared->mdf_idx_entries_used = shared->mdf_idx_len;
+ if (H5FD_vfd_swmr_get_tick_and_idx(shared->lf, FALSE,
+ &shared->tick_num,
+ &(shared->mdf_idx_entries_used),
+ shared->mdf_idx) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTLOAD, FAIL,
+ "unable to load/decode metadata file");
+
+ assert(shared->tick_num != 0);
+ vfd_swmr_reader_did_increase_tick_to(shared->tick_num);
+
+ hlog_fast(tick, "%s first tick %" PRIu64,
+ __func__, shared->tick_num);
+
+#if 0 /* JRM */
+ HDfprintf(stderr,
+ "##### initialized index: tick/used/len = %lld/%d/%d #####\n",
+ shared->tick_num, shared->mdf_idx_entries_used,
+ shared->mdf_idx_len);
+#endif /* JRM */
+ }
+
+ /* Update end_of_tick */
+ if (H5F__vfd_swmr_update_end_of_tick_and_tick_num(shared, FALSE) < 0) {
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL,
+ "unable to update end of tick");
+ }
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5F_vfd_swmr_init() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F_vfd_swmr_close_or_flush
+ *
+ * Purpose: Used by the VFD SWMR writer when the HDF5 file is closed
+ * or flushed:
+ *
+ * 1) For file close:
+ * --write header and an empty index to the metadata file
+ * --increment tick_num
+ * --close the metadata file
+ * --unlink the metadata file
+ * --close the free-space manager for the metadata file
+ *
+ * 2) For file flush:
+ * --write header and an empty index to the metadata file
+ * --increment tick_num
+ * --start a new tick (??check with JM for sure)
+ * ??update end_of_tick
+ *
+ * Return: Success: SUCCEED
+ * Failure: FAIL
+ *
+ * Programmer: Vailin Choi -- 11/??/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_vfd_swmr_close_or_flush(H5F_t *f, hbool_t closing)
+{
+ H5F_shared_t *shared = f->shared;
+ shadow_defree_t *curr;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ HDassert(shared->vfd_swmr_writer);
+ HDassert(shared->vfd_swmr_md_fd >= 0);
+
+ /* Write empty index to the md file */
+ if (H5F__vfd_swmr_construct_write_md_idx(shared, 0, NULL) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "fail to create index in md");
+
+ /* Write header to the md file */
+ if (H5F__vfd_swmr_construct_write_md_hdr(shared, 0) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "fail to create header in md");
+
+ if ( closing ) { /* For file close */
+
+ /* NOTE(review): the tick is advanced once more on close,
+ * presumably so the empty header/index just written is the
+ * newest state any reader can observe -- confirm.
+ */
+ ++shared->tick_num;
+
+ /* Close the md file */
+ if(HDclose(shared->vfd_swmr_md_fd) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL,
+ "unable to close the metadata file");
+ shared->vfd_swmr_md_fd = -1;
+
+ /* Unlink the md file */
+ if ( HDunlink(shared->vfd_swmr_config.md_file_path) < 0 )
+ HGOTO_ERROR(H5E_FILE, H5E_CANTREMOVE, FAIL,
+ "unable to unlink the metadata file");
+
+ /* Close the free-space manager for the metadata file */
+ if ( H5MV_close(f) < 0 )
+ HGOTO_ERROR(H5E_FILE, H5E_CANTRELEASE, FAIL,
+ "unable to close the free-space manager for the metadata file");
+
+ /* Free the delayed list */
+ while ((curr = TAILQ_FIRST(&shared->shadow_defrees)) != NULL) {
+ TAILQ_REMOVE(&shared->shadow_defrees, curr, link);
+ H5FL_FREE(shadow_defree_t, curr);
+ }
+ hlog_fast(shadow_defrees, "Emptied deferred shadow frees.");
+
+ assert(TAILQ_EMPTY(&shared->shadow_defrees));
+ } else { /* For file flush */
+ /* Update end_of_tick */
+ if (H5F__vfd_swmr_update_end_of_tick_and_tick_num(shared, TRUE) < 0)
+ HDONE_ERROR(H5E_FILE, H5E_CANTSET, FAIL,
+ "unable to update end of tick");
+ }
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
+
+/* Record the shadow-file byte range [offset, offset + length) for deferred
+ * freeing: the range is queued (tagged with the current tick) at the head
+ * of shared->shadow_defrees, to be released to the shadow file's free-space
+ * manager once it has aged max_lag ticks (see
+ * H5F_update_vfd_swmr_metadata_file()).
+ *
+ * Returns 0 on success, -1 if the queue record cannot be allocated.
+ */
+static int
+shadow_range_defer_free(H5F_shared_t *shared, uint64_t offset, uint32_t length)
+{
+ shadow_defree_t *shadow_defree;
+
+ if (NULL == (shadow_defree = H5FL_CALLOC(shadow_defree_t)))
+ return -1;
+
+ shadow_defree->offset = offset;
+ shadow_defree->length = length;
+ shadow_defree->tick_num = shared->tick_num;
+
+ if (TAILQ_EMPTY(&shared->shadow_defrees))
+ hlog_fast(shadow_defrees, "Adding first deferred shadow free.");
+
+ /* Newest records live at the head; reclamation scans from the tail. */
+ TAILQ_INSERT_HEAD(&shared->shadow_defrees, shadow_defree, link);
+ return 0;
+}
+
+/* Defer the free of the shadow-file image referenced by index entry `entry`:
+ * converts the entry's shadow-file page number to a byte offset and queues
+ * the range via shadow_range_defer_free(). Returns 0 on success, -1 on
+ * allocation failure.
+ */
+int
+shadow_image_defer_free(H5F_shared_t *shared,
+ const H5FD_vfd_swmr_idx_entry_t *entry)
+{
+ return shadow_range_defer_free(shared,
+ entry->md_file_page_offset * shared->fs_page_size, entry->length);
+}
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F_update_vfd_swmr_metadata_file()
+ *
+ * Purpose: Update the metadata file with the input index
+ *
+ * --Sort index
+ *
+ * --For each non-null entry_ptr in the index entries:
+ * --Insert previous image of the entry onto the delayed list
+ * --Allocate space for the entry in the metadata file
+ * --Compute checksum
+ * --Update index entry
+ * --Write the entry to the metadata file
+ * --Set entry_ptr to NULL
+ *
+ * --Construct on disk image of the index and write index to the
+ * metadata file
+ *
+ * --Construct on disk image of the header and write header to
+ * the metadata file
+ *
+ * --Release time out entries from the delayed list to the
+ * free-space manager
+ *
+ * Return: SUCCEED/FAIL
+ *
+ * Programmer: Vailin Choi 11/??/18
+ *
+ * Changes: None.
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_update_vfd_swmr_metadata_file(H5F_t *f, uint32_t num_entries,
+ H5FD_vfd_swmr_idx_entry_t *index)
+{
+ H5F_shared_t *shared = f->shared;
+ shadow_defree_t *prev;
+ shadow_defree_t *shadow_defree;
+ haddr_t md_addr; /* Address in the metadata file */
+ uint32_t i; /* Local index variable */
+ herr_t ret_value = SUCCEED; /* Return value */
+ bool queue_was_nonempty;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Sort index entries by increasing offset in the HDF5 file */
+ if (num_entries > 0) {
+ HDqsort(index, num_entries, sizeof(*index), H5F__idx_entry_cmp);
+ /* Assert that no HDF5 page offsets are duplicated. */
+ for (i = 1; i < num_entries; i++)
+ assert(index[i - 1].hdf5_page_offset < index[i].hdf5_page_offset);
+ }
+
+ /* For each non-null entry_ptr in the index:
+ *
+ * --Insert previous image of the entry (if exists) to the
+ * beginning of the delayed list
+ *
+ * --Allocate space for the entry in the metadata file
+ *
+ * --Compute checksum, update the index entry, write entry to
+ * the metadata file
+ *
+ * --Set entry_ptr to NULL
+ */
+ for (i = 0; i < num_entries; i++) {
+
+ /* A NULL entry_ptr means the entry was not modified this tick;
+ * its existing shadow-file image remains valid.
+ */
+ if (index[i].entry_ptr == NULL)
+ continue;
+
+ /* Prepend previous image of the entry to the delayed list */
+ if ( index[i].md_file_page_offset ) {
+ if (shadow_image_defer_free(shared, &index[i]) == -1) {
+ HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, \
+ "unable to allocate the delayed entry")
+ }
+ }
+
+ /* Allocate space for the entry in the metadata file */
+ if((md_addr = H5MV_alloc(f, index[i].length)) == HADDR_UNDEF)
+ HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \
+ "error in allocating space from the metadata file")
+
+ hlog_fast(noisy_shadow_defrees,
+ "shadow index %" PRIu32 " page offset %" PRIu64 " -> %" PRIuHADDR,
+ i, index[i].md_file_page_offset * shared->fs_page_size, md_addr);
+
+ /* Shadow-file allocations must be page aligned. */
+ HDassert(md_addr % shared->fs_page_size == 0);
+
+ /* Compute checksum and update the index entry */
+ index[i].md_file_page_offset = md_addr / shared->fs_page_size;
+ index[i].chksum = H5_checksum_metadata(index[i].entry_ptr,
+ index[i].length, 0);
+
+#if 0 /* JRM */
+ HDfprintf(stderr,
+ "writing index[%d] fo/mdfo/l/chksum/fc/lc = %lld/%lld/%ld/%lx/%lx/%lx\n",
+ i,
+ index[i].hdf5_page_offset,
+ index[i].md_file_page_offset,
+ index[i].length,
+ index[i].chksum,
+ (((char*)(index[i].entry_ptr))[0]),
+ (((char*)(index[i].entry_ptr))[4095]));
+
+ HDassert(md_addr == index[i].md_file_page_offset *
+ shared->fs_page_size);
+ HDassert(shared->fs_page_size == 4096);
+#endif /* JRM */
+
+ /* Seek and write the entry to the metadata file */
+ if ( HDlseek(shared->vfd_swmr_md_fd, (HDoff_t)md_addr,
+ SEEK_SET) < 0)
+
+ HGOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL, \
+ "unable to seek in the metadata file")
+
+ if ( HDwrite(shared->vfd_swmr_md_fd, index[i].entry_ptr,
+ index[i].length) != (ssize_t)index[i].length )
+
+ HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \
+ "error in writing the page/multi-page entry to metadata file")
+
+ index[i].entry_ptr = NULL;
+ }
+
+ /* Construct and write index to the metadata file */
+ if (H5F__vfd_swmr_construct_write_md_idx(shared, num_entries, index) < 0)
+
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, \
+ "fail to construct & write index to md")
+
+ /* Construct and write header to the md file */
+ if (H5F__vfd_swmr_construct_write_md_hdr(shared, num_entries) < 0)
+
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, \
+ "fail to construct & write header to md")
+
+ /* Remember whether the defer-free queue held entries, so the
+ * transition to empty can be logged below.
+ */
+ queue_was_nonempty = !TAILQ_EMPTY(&shared->shadow_defrees);
+
+ /*
+ * Release time out entries from the delayed list by scanning the
+ * list from the bottom up:
+ *
+ * --release to the metadata file free space manager all index
+ * entries that have resided on the list for more than
+ * max_lag ticks
+ *
+ * --remove the associated entries from the list
+ */
+
+ if (shared->tick_num <= shared->vfd_swmr_config.max_lag)
+ goto done; // It is too early for any reclamations to be due.
+
+ /* Oldest records are at the tail, so scan in reverse and stop at
+ * the first record that is not yet due.
+ */
+ TAILQ_FOREACH_REVERSE_SAFE(shadow_defree, &shared->shadow_defrees,
+ shadow_defree_queue, link, prev) {
+
+ if (shadow_defree->tick_num + shared->vfd_swmr_config.max_lag >
+ shared->tick_num) {
+ break; // No more entries are due for reclamation.
+ }
+
+ if (H5MV_free(f, shadow_defree->offset, shadow_defree->length) < 0) {
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL,
+ "unable to flush clean entry");
+ }
+
+ hlog_fast(noisy_shadow_defrees,
+ "released %" PRIu32 " bytes at %" PRIu64,
+ shadow_defree->length, shadow_defree->offset);
+
+ TAILQ_REMOVE(&shared->shadow_defrees, shadow_defree, link);
+
+ H5FL_FREE(shadow_defree_t, shadow_defree);
+ }
+
+ if (queue_was_nonempty && TAILQ_EMPTY(&shared->shadow_defrees))
+ hlog_fast(shadow_defrees, "Removed last deferred shadow free.");
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5F_update_vfd_swmr_metadata_file() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F_vfd_swmr_writer__delay_write
+ *
+ * Purpose: Given the base address of a page of metadata, or of a multi-
+ * page metadata entry, determine whether the write must be
+ * delayed.
+ *
+ * At the conceptual level, the VFD SWMR writer must delay the
+ * write of any metadata page or multi-page metadata that
+ * overwrites an existing metadata page or multi-page metadata
+ * entry until it has appeared in the metadata file index for
+ * at least max_lag ticks. Since the VFD SWMR reader goes
+ * to the HDF5 file for any piece of metadata not listed in
+ * the metadata file index, failure to delay such writes can
+ * result in "message from the future" bugs.
+ *
+ * The easy case is pages or multi-page metadata entries
+ * have just been allocated. Obviously, these can be written
+ * immediately. This case is tracked and tested by the page
+ * buffer proper.
+ *
+ * This routine looks up the supplied page in the metadata file
+ * index.
+ *
+ * If the entry doesn't exist, the function sets
+ * *untilp to the current tick plus max_lag.
+ *
+ * If the entry exists, the function sets *untilp
+ * equal to the entries delayed flush field if it is greater than
+ * or equal to the current tick, or zero otherwise.
+ *
+ * Return: SUCCEED/FAIL
+ *
+ * Programmer: John Mainzer 11/4/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_vfd_swmr_writer__delay_write(H5F_shared_t *shared, uint64_t page,
+ uint64_t *untilp)
+{
+ uint64_t until;
+ H5FD_vfd_swmr_idx_entry_t *ie_ptr;
+ H5FD_vfd_swmr_idx_entry_t *idx;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ HDassert(shared);
+ HDassert(shared->vfd_swmr);
+ HDassert(shared->vfd_swmr_writer);
+
+ idx = shared->mdf_idx;
+
+ /* The index is only created at the end of the first tick, so it may
+ * legitimately be absent while tick_num <= 1.
+ */
+ HDassert(idx != NULL || shared->tick_num <= 1);
+
+ /* do a binary search on the metadata file index to see if
+ * it already contains an entry for `page`.
+ */
+
+ if (idx == NULL) {
+ ie_ptr = NULL;
+ } else {
+ ie_ptr = vfd_swmr_pageno_to_mdf_idx_entry(idx,
+ shared->mdf_idx_entries_used, page, false);
+ }
+
+ /* Map the lookup result to a delay: an unknown page gets the maximum
+ * delay (current tick + max_lag); a known page keeps its recorded
+ * delayed_flush tick if that is still in the future; otherwise no
+ * delay (0) is needed.
+ */
+ if (ie_ptr == NULL)
+ until = shared->tick_num + shared->vfd_swmr_config.max_lag;
+ else if (ie_ptr->delayed_flush >= shared->tick_num)
+ until = ie_ptr->delayed_flush;
+ else
+ until = 0;
+
+ /* Sanity check: any nonzero delay must lie within (tick_num,
+ * tick_num + max_lag].
+ */
+ if (until != 0 &&
+ (until < shared->tick_num ||
+ shared->tick_num + shared->vfd_swmr_config.max_lag < until))
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "VFD SWMR write delay out of range")
+
+ *untilp = until;
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5F_vfd_swmr_writer__delay_write() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F_vfd_swmr_writer__prep_for_flush_or_close
+ *
+ * Purpose: In the context of the VFD SWMR writer, two issues must be
+ * addressed before the page buffer can be flushed -- as is
+ * necessary on both HDF5 file flush or close:
+ *
+ * 1) We must force an end of tick so as to clean the tick list
+ * in the page buffer.
+ *
+ * 2) If the page buffer delayed write list is not empty, we
+ * must repeatedly wait a tick and then run the writer end
+ * of tick function until the delayed write list drains.
+ *
+ * This function manages these details.
+ *
+ * Return: SUCCEED/FAIL
+ *
+ * Programmer: John Mainzer 11/27/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_vfd_swmr_writer__prep_for_flush_or_close(H5F_t *f)
+{
+ herr_t ret_value = SUCCEED; /* Return value */
+ H5F_shared_t *shared = f->shared;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ HDassert(shared->vfd_swmr);
+ HDassert(shared->vfd_swmr_writer);
+ HDassert(shared->pb_ptr);
+
+ /* since we are about to flush the page buffer, force and end of
+ * tick so as to avoid attempts to flush entries on the page buffer
+ * tick list that were modified during the current tick.
+ */
+ if ( H5F_vfd_swmr_writer_end_of_tick(f, true) < 0 )
+
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, \
+ "H5F_vfd_swmr_writer_end_of_tick() failed.")
+
+ /* Wait out whole ticks until the page buffer's delayed write list
+ * (dwl) drains; each wait runs another end-of-tick cycle.
+ */
+ while(shared->pb_ptr->dwl_len > 0) {
+
+ if(H5F__vfd_swmr_writer__wait_a_tick(f) < 0)
+
+ HGOTO_ERROR(H5E_FILE, H5E_CANTFLUSH, FAIL, "wait a tick failed.")
+ }
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5F_vfd_swmr_writer__prep_for_flush_or_close() */
+
+/* Prune the shadow index in place: drop every entry that has stayed clean
+ * for more than max_lag ticks, deferring the free of its shadow-file page
+ * (if any). Surviving entries are compacted toward the front of idx[],
+ * preserving their order.
+ *
+ * On success returns 0 and stores the number of removed entries in
+ * *ndeletedp; returns -1 if a deferred-free record cannot be allocated.
+ */
+static int
+clean_shadow_index(H5F_t *f, uint32_t nentries,
+ H5FD_vfd_swmr_idx_entry_t *idx, uint32_t *ndeletedp)
+{
+ H5F_shared_t *shared = f->shared;
+ uint32_t i, j, ndeleted, max_lag = shared->vfd_swmr_config.max_lag;
+ uint64_t tick_num = shared->tick_num;
+ H5FD_vfd_swmr_idx_entry_t *ie;
+
+ /* i scans all entries; j is the write cursor for survivors. */
+ for (i = j = ndeleted = 0; i < nentries; i++) {
+ ie = &idx[i];
+
+ if (ie->clean) {
+ hlog_fast(shadow_index_reclaim,
+ "Visiting clean shadow index slot %" PRIu32
+ " lower page %" PRIu64 " last flush %" PRIu64 " ticks ago",
+ i, ie->hdf5_page_offset, tick_num - ie->tick_of_last_flush);
+ }
+
+ if (ie->clean && ie->tick_of_last_flush + max_lag < tick_num) {
+
+ assert(!ie->garbage);
+ assert(ie->entry_ptr == NULL);
+
+ hlog_fast(shadow_index_reclaim,
+ "Reclaiming shadow index slot %" PRIu32
+ " lower page %" PRIu64, i, ie->hdf5_page_offset);
+
+ /* Defer the free of the entry's shadow-file image before
+ * forgetting where it lived.
+ */
+ if (ie->md_file_page_offset != 0) {
+ if (shadow_image_defer_free(shared, ie) == -1)
+ return -1;
+ ie->md_file_page_offset = 0;
+ }
+ ndeleted++;
+ continue;
+ }
+ if (j != i)
+ idx[j] = *ie;
+ j++;
+ }
+ *ndeletedp = ndeleted;
+ return 0;
+}
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F_vfd_swmr_writer_end_of_tick
+ *
+ * Purpose: Main routine for managing the end of tick for the VFD
+ * SWMR writer.
+ *
+ * This function performs all end of tick operations for the
+ * writer -- specifically:
+ *
+ * 1) If requested, flush all raw data to the HDF5 file.
+ *
+ * (Not for first cut.)
+ *
+ * 2) Flush the metadata cache to the page buffer.
+ *
+ * Note that we must run a tick after the destruction
+ * of the metadata cache, since this operation will usually
+ * dirty the first page in the HDF5 file. However, the
+ * metadata cache will no longer exist at this point.
+ *
+ * Thus, we must check for the existence of the metadata
+ * cache, and only attempt to flush it if it exists.
+ *
+ * 3) If this is the first tick (i.e. tick == 1), create the
+ * in memory version of the metadata file index.
+ *
+ * 4) Scan the page buffer tick list, and use it to update
+ * the metadata file index, adding or modifying entries as
+ * appropriate.
+ *
+ * 5) Scan the metadata file index for entries that can be
+ * removed -- specifically entries that have been written
+ * to the HDF5 file more than max_lag ticks ago, and haven't
+ * been modified since.
+ *
+ * (This is an optimization -- address it later)
+ *
+ * 6) Update the metadata file. Must do this before we
+ * release the tick list, as otherwise the page buffer
+ * entry images may not be available.
+ *
+ * 7) Release the page buffer tick list.
+ *
+ * 8) Release any delayed writes whose delay has expired.
+ *
+ * 9) Increment the tick, and update the end of tick.
+ *
+ * In passing, generate log entries as appropriate.
+ *
+ * Return: SUCCEED/FAIL
+ *
+ * Programmer: John Mainzer 11/4/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_vfd_swmr_writer_end_of_tick(H5F_t *f, bool wait_for_reader)
+{
+ H5F_shared_t *shared = f->shared;
+ uint32_t idx_entries_added = 0;
+ uint32_t idx_entries_modified = 0;
+ uint32_t idx_entries_removed = 0;
+ uint32_t idx_ent_not_in_tl = 0;
+ uint32_t idx_ent_not_in_tl_flushed = 0;
+ herr_t ret_value = SUCCEED; /* Return value */
+ bool incr_tick = false;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ HDassert(shared);
+ HDassert(shared->pb_ptr);
+ HDassert(shared->vfd_swmr_writer);
+
+ /* If we may not advance the tick yet, skip the flush/index work and
+ * only refresh the end-of-tick time at update_eot below.
+ */
+ if (!vfd_swmr_writer_may_increase_tick_to(shared->tick_num + 1,
+ wait_for_reader))
+ goto update_eot;
+
+ incr_tick = true;
+
+ /* 1) If requested, flush all raw data to the HDF5 file.
+ *
+ * (Not for first cut.)
+ */
+ HDassert(!shared->vfd_swmr_config.flush_raw_data);
+
+#if 1
+ /* Test to see if b-tree corruption seen in VFD SWMR tests
+ * is caused by client hiding data from the metadata cache. Do
+ * this by calling H5D_flush_all(), which flushes any cached
+ * dataset storage. Eventually, we will do this regardless
+ * when the above flush_raw_data flag is set.
+ */
+
+ if ( H5D_flush_all(f) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \
+ "unable to flush dataset cache")
+
+
+ if(H5MF_free_aggrs(f) < 0)
+
+ HGOTO_ERROR(H5E_FILE, H5E_CANTRELEASE, FAIL, "can't release file space")
+
+
+ if ( shared->cache ) {
+
+ if ( H5AC_flush(f) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \
+ "Can't flush metadata cache to the page buffer")
+ }
+
+
+
+ if ( H5FD_truncate(shared->lf, FALSE) < 0 )
+
+ HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "low level truncate failed")
+#endif
+
+ /* 2) If it exists, flush the metadata cache to the page buffer. */
+ /* NOTE(review): H5AC_flush() was already called inside the #if 1
+ * experiment above, so this is a second flush when the cache
+ * exists -- presumably harmless but confirm it is intentional.
+ */
+ if ( shared->cache ) {
+
+ if ( H5AC_flush(f) < 0 )
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \
+ "Can't flush metadata cache to the page buffer")
+ }
+
+
+ /* 3) If this is the first tick (i.e. tick == 1), create the
+ * in memory version of the metadata file index.
+ */
+ if ( ( shared->tick_num == 1 ) &&
+ ( H5F__vfd_swmr_create_index(shared) < 0 ) )
+
+ HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, \
+ "unable to allocate metadata file index")
+
+
+ /* 4) Scan the page buffer tick list, and use it to update
+ * the metadata file index, adding or modifying entries as
+ * appropriate.
+ */
+ if ( H5PB_vfd_swmr__update_index(f, &idx_entries_added,
+ &idx_entries_modified,
+ &idx_ent_not_in_tl,
+ &idx_ent_not_in_tl_flushed) < 0 )
+
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't update MD file index")
+
+
+ /* 5) Scan the metadata file index for entries that can be
+ * removed -- specifically entries that have been written
+ * to the HDF5 file more than max_lag ticks ago, and haven't
+ * been modified since.
+ */
+ if (clean_shadow_index(f,
+ shared->mdf_idx_entries_used + idx_entries_added,
+ shared->mdf_idx, &idx_entries_removed) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't clean shadow file index")
+
+ /* 6) Update the metadata file. Must do this before we
+ * release the tick list, as otherwise the page buffer
+ * entry images may not be available.
+ *
+ * Note that this operation will restore the index to
+ * sorted order.
+ */
+ if (H5F_update_vfd_swmr_metadata_file(f,
+ shared->mdf_idx_entries_used + idx_entries_added -
+ idx_entries_removed,
+ shared->mdf_idx) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't update MD file")
+
+ /* at this point the metadata file index should be sorted -- update
+ * shared->mdf_idx_entries_used.
+ */
+ shared->mdf_idx_entries_used += idx_entries_added;
+ shared->mdf_idx_entries_used -= idx_entries_removed;
+
+ HDassert(shared->mdf_idx_entries_used <= shared->mdf_idx_len);
+
+#if 0 /* JRM */
+ H5F__vfd_swmr_writer__dump_index(f);
+#endif /* JRM */
+
+ /* 7) Release the page buffer tick list. */
+ if ( H5PB_vfd_swmr__release_tick_list(shared) < 0 )
+
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't release tick list")
+
+
+ /* 8) Release any delayed writes whose delay has expired */
+ if ( H5PB_vfd_swmr__release_delayed_writes(shared) < 0 )
+
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't release delayed writes")
+
+
+update_eot:
+
+ /* 9) Increment the tick, and update the end of tick. */
+
+ /* Update end_of_tick */
+ if (H5F__vfd_swmr_update_end_of_tick_and_tick_num(shared, incr_tick) < 0)
+
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, \
+ "unable to update end of tick")
+
+ /* Remove the entry from the EOT queue */
+ if(H5F_vfd_swmr_remove_entry_eot(f) < 0)
+ HDONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "unable to remove entry from EOT queue")
+
+ /* Re-insert the entry that corresponds to f onto the EOT queue */
+ if(H5F_vfd_swmr_insert_entry_eot(f) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "unable to insert entry into the EOT queue")
+
+ hlog_fast(eot, "%s leave tick %" PRIu64 " idx len %" PRIu32,
+ __func__, shared->tick_num, shared->mdf_idx_entries_used);
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F_vfd_swmr_writer__dump_index
+ *
+ * Purpose: Dump a summary of the metadata file index.
+ *
+ * Return: SUCCEED/FAIL
+ *
+ * Programmer: John Mainzer 12/14/19
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_vfd_swmr_writer__dump_index(H5F_shared_t *shared)
+{
+ unsigned int i;
+ uint32_t mdf_idx_len;
+ uint32_t mdf_idx_entries_used;
+ H5FD_vfd_swmr_idx_entry_t * index = NULL;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ HDassert(shared);
+ HDassert(shared->vfd_swmr);
+ HDassert(shared->mdf_idx);
+
+
+ index = shared->mdf_idx;
+ mdf_idx_len = shared->mdf_idx_len;
+ mdf_idx_entries_used = shared->mdf_idx_entries_used;
+
+ /* Dump to stderr: a header line, then one line per in-use entry
+ * (HDF5 page offset, shadow-file page offset, length).
+ */
+ HDfprintf(stderr, "\n\nDumping Index:\n\n");
+ HDfprintf(stderr,
+ "index len / entries used = %" PRIu32 " / %" PRIu32 "\n\n",
+ mdf_idx_len, mdf_idx_entries_used);
+
+ for ( i = 0; i < mdf_idx_entries_used; i++ ) {
+
+ HDfprintf(stderr, "%u: %" PRIu64 " %" PRIu64 " %" PRIu32 "\n",
+ i, index[i].hdf5_page_offset, index[i].md_file_page_offset,
+ index[i].length);
+ }
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5F_vfd_swmr_writer__dump_index() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5F_vfd_swmr_reader_end_of_tick
+ *
+ * Purpose: Main routine for VFD SWMR reader end of tick operations.
+ * The following operations must be performed:
+ *
+ * 1) Direct the VFD SWMR reader VFD to load the current header
+ * from the metadata file, and report the current tick.
+ *
+ * If the tick reported has not increased since the last
+ * call, do nothing and exit.
+ *
+ * 2) If the tick has increased, obtain a copy of the new
+ * index from the VFD SWMR reader VFD, and compare it with
+ * the old index to identify all pages that have been updated
+ * in the previous tick.
+ *
 *              If any such pages or multi-page metadata entries are found:
 *
 *                 a) direct the page buffer to evict any such superseded
 *                    pages, and
 *
 *                 b) direct the metadata cache to either evict or refresh
 *                    any entries residing in the superseded pages.
 *
 *              Note that this operation MUST be performed in this order,
 *              as the metadata cache will refer to the page buffer
 *              when refreshing entries.
 *
 *           3) Increment the tick, and update the end of tick.
+ *
+ * Return: SUCCEED/FAIL
+ *
+ * Programmer: John Mainzer 12/29/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
herr_t
H5F_vfd_swmr_reader_end_of_tick(H5F_t *f, bool entering_api)
{
    uint64_t tmp_tick_num = 0;
    H5FD_vfd_swmr_idx_entry_t * tmp_mdf_idx;
    uint32_t entries_added = 0;
    uint32_t entries_removed = 0;
    uint32_t entries_moved = 0;
    uint32_t tmp_mdf_idx_len;
    uint32_t tmp_mdf_idx_entries_used;
    uint32_t mdf_idx_entries_used;
    H5F_shared_t *shared = f->shared;
    /* Scratch list of changed pages.  It is filled while diffing the old
     * and new indexes, then replayed twice: first to evict page-buffer
     * entries, then to evict/refresh metadata-cache entries (the MDC pass
     * may read from the page buffer, so the PB pass must come first).
     */
    struct {
        uint64_t pgno;
        uint32_t length;
    } *change = NULL;
    herr_t ret_value = SUCCEED;
    uint32_t i, j, nchanges;
    H5FD_t *file = shared->lf;

    FUNC_ENTER_NOAPI(FAIL)

    HDassert(shared->pb_ptr);
    HDassert(shared->vfd_swmr);
    HDassert(!shared->vfd_swmr_writer);
    HDassert(file);

    hlog_fast(eot, "%s enter file %p index len %" PRIu32 " used %" PRIu32,
        __func__, (void *)file,
        shared->mdf_idx_len, shared->mdf_idx_entries_used);

    /* 1) Direct the VFD SWMR reader VFD to load the current header
     *    from the metadata file, and report the current tick.
     *
     *    If the tick reported has not increased since the last
     *    call, do nothing and exit.
     */
    if ( H5FD_vfd_swmr_get_tick_and_idx(file, TRUE, &tmp_tick_num,
                                        NULL, NULL) < 0 )

        HGOTO_ERROR(H5E_ARGS, H5E_CANTGET, FAIL, \
                    "error in retrieving tick_num from driver")

    hlog_fast(tick,
        "%s last tick %" PRIu64 " new tick %" PRIu64,
        __func__, shared->tick_num, tmp_tick_num);

    /* This is ok if we're entering the API, but it should
     * not happen if we're exiting the API.
     */
    assert(entering_api || tmp_tick_num <
           shared->tick_num + shared->vfd_swmr_config.max_lag);

    if (!entering_api) {
        H5FD_vfd_swmr_record_elapsed_ticks(shared->lf,
            tmp_tick_num - shared->tick_num);
    }

    if ( tmp_tick_num != shared->tick_num ) {
        const H5FD_vfd_swmr_idx_entry_t *new_mdf_idx;
        const H5FD_vfd_swmr_idx_entry_t *old_mdf_idx;
        uint32_t new_mdf_idx_entries_used;
        uint32_t old_mdf_idx_entries_used;

        /* swap the old and new metadata file indexes */

        tmp_mdf_idx = shared->old_mdf_idx;
        tmp_mdf_idx_len = shared->old_mdf_idx_len;
        tmp_mdf_idx_entries_used = shared->old_mdf_idx_entries_used;

        shared->old_mdf_idx = shared->mdf_idx;
        shared->old_mdf_idx_len = shared->mdf_idx_len;
        shared->old_mdf_idx_entries_used = shared->mdf_idx_entries_used;

        shared->mdf_idx = tmp_mdf_idx;
        shared->mdf_idx_len = tmp_mdf_idx_len;
        shared->mdf_idx_entries_used = tmp_mdf_idx_entries_used;

        /* if shared->mdf_idx is NULL, allocate an index */
        if (shared->mdf_idx == NULL &&
            H5F__vfd_swmr_create_index(shared) < 0)
            HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL,
                        "unable to allocate metadata file index");

        /* Pass the index capacity in; the driver overwrites this with the
         * number of entries actually used on the way out.
         */
        mdf_idx_entries_used = shared->mdf_idx_len;

#if 0 /* JRM */
        HDfprintf(stderr, "--- reader EOT mdf_idx_entries_used = %d ---\n",
                  mdf_idx_entries_used);
#endif /* JRM */

        /* 2) Load the new index from the shadow file */
        if (H5FD_vfd_swmr_get_tick_and_idx(file, FALSE, NULL,
                                           &mdf_idx_entries_used,
                                           shared->mdf_idx) < 0)
            HGOTO_ERROR(H5E_ARGS, H5E_CANTGET, FAIL,
                        "error in retrieving tick_num from driver");

        HDassert(mdf_idx_entries_used <= shared->mdf_idx_len);

        shared->mdf_idx_entries_used = mdf_idx_entries_used;

#if 0 /* JRM */
        HDfprintf(stderr,
            "--- reader EOT index used / len = %" PRIu32 "/%" PRIu32 " ---\n",
            shared->mdf_idx_entries_used, shared->mdf_idx_len);
#endif /* JRM */

        new_mdf_idx = shared->mdf_idx;
        old_mdf_idx = shared->old_mdf_idx;
        new_mdf_idx_entries_used = shared->mdf_idx_entries_used;
        old_mdf_idx_entries_used = shared->old_mdf_idx_entries_used;

        /* Worst case: every old entry removed and every new entry added,
         * so the sum of the two lengths bounds the change list.
         * NOTE(review): bare malloc/free here, HDmalloc/HDfree elsewhere
         * in this file -- consider unifying.
         */
        change = malloc(sizeof(change[0]) *
            (old_mdf_idx_entries_used + new_mdf_idx_entries_used));

        if (change == NULL) {
            HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL,
                        "unable to allocate removed pages list");
        }

        /* If an old metadata file index exists, compare it with the
         * new index and evict any modified, new, or deleted pages
         * and any associated metadata cache entries.
         *
         * Note that we must evict in two passes---page buffer first,
         * and then metadata cache.  This is necessary as the metadata
         * cache may attempt to refresh entries rather than evict them,
         * in which case it may access an entry in the page buffer.
         */

        /* Merge-walk the two sorted indexes, collecting changed pages */
        for (i = j = nchanges = 0;
             i < old_mdf_idx_entries_used &&
             j < new_mdf_idx_entries_used; ) {
            const H5FD_vfd_swmr_idx_entry_t *oent = &old_mdf_idx[i],
                                            *nent = &new_mdf_idx[j];

            /* Verify that the old and new indices are sorted as expected. */
            HDassert(i == 0 ||
                     oent[-1].hdf5_page_offset < oent[0].hdf5_page_offset);

            HDassert(j == 0 ||
                     nent[-1].hdf5_page_offset < nent[0].hdf5_page_offset);

            if (oent->hdf5_page_offset == nent->hdf5_page_offset) {

                if (oent->md_file_page_offset != nent->md_file_page_offset) {

                    /* It's ok if the length changes, I think, but I need
                     * to think about how to perform MDC invalidation in the
                     * case where the new entry is *longer*, because the
                     * extension could overlap with a second entry.
                     */
                    assert(oent->length == nent->length);

                    hlog_fast(shadow_index_update,
                        "shadow page for slot %" PRIu32 " lower page %" PRIu64
                        " moved, %" PRIu64 " -> %" PRIu64, i,
                        oent->hdf5_page_offset,
                        oent->md_file_page_offset,
                        nent->md_file_page_offset);

                    /* the page has been altered -- evict it and
                     * any contained metadata cache entries.
                     */
                    change[nchanges].pgno = oent->hdf5_page_offset;
                    change[nchanges].length = oent->length;
                    nchanges++;
                    entries_moved++;
                }
                i++;
                j++;

            } else if (oent->hdf5_page_offset < nent->hdf5_page_offset) {
                /* the page has been removed from the new version
                 * of the index.  Evict it and any contained metadata
                 * cache entries.
                 *
                 * If we are careful about removing entries from the
                 * the index so as to ensure that they haven't changed
                 * for several ticks, we can probably omit this.  However,
                 * lets not worry about this for the first cut.
                 */
                hlog_fast(shadow_index_update,
                    "writer removed shadow index slot %" PRIu32
                    " for page %" PRIu64, i, oent->hdf5_page_offset);

                change[nchanges].pgno = oent->hdf5_page_offset;
                change[nchanges].length = oent->length;
                nchanges++;
                entries_removed++;
                i++;

            } else { /* oent->hdf5_page_offset >
                      * nent->hdf5_page_offset
                      */

                hlog_fast(shadow_index_update,
                    "writer added shadow index slot %" PRIu32
                    " for page %" PRIu64, j, nent->hdf5_page_offset);

                /* The page has been added to the index. */
                change[nchanges].pgno = nent->hdf5_page_offset;
                change[nchanges].length = nent->length;
                nchanges++;
                entries_added++;
                j++;
            }
        }

        /* Tail of the new index: entries past the end of the old index
         * are all additions.
         */
        for (; j < new_mdf_idx_entries_used; j++) {
            const H5FD_vfd_swmr_idx_entry_t *nent = &new_mdf_idx[j];
            hlog_fast(shadow_index_update,
                "writer added shadow index slot %" PRIu32
                " for page %" PRIu64, j, nent->hdf5_page_offset);
            change[nchanges].pgno = nent->hdf5_page_offset;
            change[nchanges].length = nent->length;
            nchanges++;
            entries_added++;
        }

        /* cleanup any left overs in the old index */
        for (; i < old_mdf_idx_entries_used; i++) {
            const H5FD_vfd_swmr_idx_entry_t *oent = &old_mdf_idx[i];

            /* the page has been removed from the new version of the
             * index.  Evict it from the page buffer and also evict any
             * contained metadata cache entries
             */

            hlog_fast(shadow_index_update,
                "writer removed shadow index slot %" PRIu32
                " for page %" PRIu64, i, oent->hdf5_page_offset);

            change[nchanges].pgno = oent->hdf5_page_offset;
            change[nchanges].length = oent->length;
            nchanges++;
            entries_removed++;
        }
        /* Pass 1: evict all changed pages from the page buffer */
        for (i = 0; i < nchanges; i++) {
            haddr_t page_addr =
                (haddr_t)(change[i].pgno * shared->pb_ptr->page_size);
            if (H5PB_remove_entry(shared, page_addr) < 0) {
                HGOTO_ERROR(H5E_FILE, H5E_CANTFLUSH, FAIL,
                            "remove page buffer entry failed");
            }
        }
        /* Pass 2: evict or refresh metadata cache entries on those pages */
        for (i = 0; i < nchanges; i++) {
            hlog_fast(mdc_invalidation,
                "invalidating MDC entries at page %" PRIu64
                " length %" PRIu32 " tick %" PRIu64,
                change[i].pgno, change[i].length, tmp_tick_num);
            if (H5C_evict_or_refresh_all_entries_in_page(f,
                    change[i].pgno, change[i].length,
                    tmp_tick_num) < 0) {
                HGOTO_ERROR(H5E_FILE, H5E_CANTFLUSH, FAIL,
                            "evict or refresh stale MDC entries failed");
            }
        }

#if 0 /* JRM */
        HDfprintf(stderr, "--- reader EOT pre new tick index "
            "used/len = %" PRIu32 "/ %" PRIu32 " ---\n",
            shared->mdf_idx_entries_used, shared->mdf_idx_len);
#endif /* JRM */
        /* At this point, we should have evicted or refreshed all stale
         * page buffer and metadata cache entries.
         *
         * Start the next tick.
         */
        shared->tick_num = tmp_tick_num;

        vfd_swmr_reader_did_increase_tick_to(tmp_tick_num);

        /* Update end_of_tick */
        if (H5F__vfd_swmr_update_end_of_tick_and_tick_num(shared,
                FALSE) < 0) {
            HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL,
                        "unable to update end of tick");
        }
    }

    /* Remove the entry from the EOT queue.
     * NOTE(review): this uses HDONE_ERROR (records the error but falls
     * through) where the insert below uses HGOTO_ERROR -- confirm the
     * fall-through to the re-insert is intentional on failure.
     */
    if(H5F_vfd_swmr_remove_entry_eot(f) < 0) {
        HDONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL,
                    "unable to remove entry from EOT queue")
    }

    /* Re-insert the entry that corresponds to f onto the EOT queue */
    if(H5F_vfd_swmr_insert_entry_eot(f) < 0) {
        HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL,
                    "unable to insert entry into the EOT queue")
    }

done:

    hlog_fast(eot, "%s exit tick %" PRIu64
        " len %" PRIu32 " -> %" PRIu32
        " used %" PRIu32 " -> %" PRIu32
        " added %" PRIu32 " removed %" PRIu32 " moved %" PRIu32 " %s",
        __func__, shared->tick_num,
        shared->old_mdf_idx_len, shared->mdf_idx_len,
        shared->old_mdf_idx_entries_used, shared->mdf_idx_entries_used,
        entries_added, entries_removed, entries_moved,
        (ret_value == SUCCEED) ? "success" : "failure");

    if (change != NULL)
        free(change);

    FUNC_LEAVE_NOAPI(ret_value)

} /* end H5F_vfd_swmr_reader_end_of_tick() */
+
+static void
+insert_eot_entry(eot_queue_entry_t *entry_ptr)
+{
+ eot_queue_entry_t *prec_ptr; /* The predecessor entry on the EOT end of tick queue */
+
+ /* Find the insertion point for the entry on the EOT queue */
+ TAILQ_FOREACH_REVERSE(prec_ptr, &eot_queue_g, eot_queue, link) {
+ if (timespeccmp(&prec_ptr->end_of_tick, &entry_ptr->end_of_tick, <=))
+ break;
+ }
+
+ hlog_fast(eotq, "%s: entry %p after %p file %p "
+ "tick %" PRIu64 " ending %jd.%09ld", __func__,
+ (void *)entry_ptr, (void *)prec_ptr, (void *)entry_ptr->vfd_swmr_file,
+ entry_ptr->tick_num, (intmax_t)entry_ptr->end_of_tick.tv_sec,
+ entry_ptr->end_of_tick.tv_nsec);
+
+ /* Insert the entry onto the EOT queue */
+ if (prec_ptr != NULL)
+ TAILQ_INSERT_AFTER(&eot_queue_g, prec_ptr, entry_ptr, link);
+ else
+ TAILQ_INSERT_HEAD(&eot_queue_g, entry_ptr, link);
+}
+
+
+/* Update an entry on the EOT queue and move it to its proper place.
+ */
+void
+H5F_vfd_swmr_update_entry_eot(eot_queue_entry_t *entry)
+{
+ H5F_t *f = entry->vfd_swmr_file;
+ H5F_shared_t *shared = f->shared;
+
+ /* Free the entry on the EOT queue that corresponds to f */
+
+ TAILQ_REMOVE(&eot_queue_g, entry, link);
+
+ hlog_fast(eotq, "%s: updating entry %p file %p "
+ "tick %" PRIu64 " ending %jd.%09ld", __func__,
+ (void *)entry, (void *)entry->vfd_swmr_file,
+ entry->tick_num, (intmax_t)entry->end_of_tick.tv_sec,
+ entry->end_of_tick.tv_nsec);
+
+ assert(entry->vfd_swmr_writer == shared->vfd_swmr_writer);
+ entry->tick_num = shared->tick_num;
+ entry->end_of_tick = shared->end_of_tick;
+
+ hlog_fast(eotq, "%s: ... to tick %" PRIu64 " ending %jd.%09ld", __func__,
+ entry->tick_num, (intmax_t)entry->end_of_tick.tv_sec,
+ entry->end_of_tick.tv_nsec);
+
+ insert_eot_entry(entry);
+}
+
+
+/*-------------------------------------------------------------------------
+ *
 * Function:    H5F_vfd_swmr_remove_entry_eot
+ *
+ * Purpose: Remove an entry from the EOT queue
+ *
+ * Return: Success: SUCCEED
+ * Failure: FAIL
+ *
+ * Programmer: Vailin Choi -- 11/18/2019
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_vfd_swmr_remove_entry_eot(H5F_t *f)
+{
+ eot_queue_entry_t *curr;
+
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ /* Free the entry on the EOT queue that corresponds to f */
+
+ TAILQ_FOREACH(curr, &eot_queue_g, link) {
+ if (curr->vfd_swmr_file == f)
+ break;
+ }
+
+ if (curr != NULL) {
+ hlog_fast(eotq, "%s: entry %p file %p "
+ "tick %" PRIu64 " ending %jd.%09ld", __func__,
+ (void *)curr, (void *)curr->vfd_swmr_file, curr->tick_num,
+ (intmax_t)curr->end_of_tick.tv_sec,
+ curr->end_of_tick.tv_nsec);
+ TAILQ_REMOVE(&eot_queue_g, curr, link);
+ curr = H5FL_FREE(eot_queue_entry_t, curr);
+ }
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+} /* H5F_vfd_swmr_remove_entry_eot() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F_vfd_swmr_insert_entry_eot
+ *
+ * Purpose: Insert an entry onto the EOT queue
+ *
+ * Return: Success: SUCCEED
+ * Failure: FAIL
+ *
+ * Programmer: Vailin Choi -- 11/18/2019
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_vfd_swmr_insert_entry_eot(H5F_t *f)
+{
+ H5F_shared_t *shared = f->shared;
+ eot_queue_entry_t *entry_ptr; /* An entry on the EOT end of tick queue */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Allocate an entry to be inserted onto the EOT queue */
+ if (NULL == (entry_ptr = H5FL_CALLOC(eot_queue_entry_t)))
+ HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, "unable to allocate the end of tick queue entry")
+
+ /* Initialize the entry */
+ entry_ptr->vfd_swmr_writer = shared->vfd_swmr_writer;
+ entry_ptr->tick_num = shared->tick_num;
+ entry_ptr->end_of_tick = shared->end_of_tick;
+ entry_ptr->vfd_swmr_file = f;
+
+ insert_eot_entry(entry_ptr);
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5F_vfd_swmr_insert_entry_eot() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F_dump_eot_queue()
+ *
+ * Purpose: Dump the contents of the EOT queue
+ *
+ * Return: Success: SUCCEED
+ * Failure: FAIL
+ *
+ * Programmer: Vailin Choi -- 11/18/2019
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_dump_eot_queue(void)
+{
+ int i;
+ eot_queue_entry_t *curr;
+
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ for (curr = TAILQ_FIRST(&eot_queue_g), i = 0;
+ curr != NULL;
+ curr = TAILQ_NEXT(curr, link), i++) {
+ HDfprintf(stderr, "%d: %s tick_num %" PRIu64
+ ", end_of_tick %jd.%09ld, vfd_swmr_file %p\n",
+ i, curr->vfd_swmr_writer ? "writer" : "not writer",
+ curr->tick_num,
+ curr->end_of_tick.tv_sec, curr->end_of_tick.tv_nsec,
+ curr->vfd_swmr_file);
+ }
+
+ if(i == 0)
+ HDfprintf(stderr, "EOT head is null\n");
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+
+} /* H5F_dump_eot_queue() */
+
+/*
+ * Beginning of static functions
+ */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F__vfd_swmr_update_end_of_tick_and_tick_num
+ *
+ * Purpose: Update end_of_tick (shared->end_of_tick)
+ * Update tick_num (shared->tick_num)
+ *
+ * Return: Success: SUCCEED
+ * Failure: FAIL
+ *
+ * Programmer: Vailin Choi -- 11/??/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
static herr_t
H5F__vfd_swmr_update_end_of_tick_and_tick_num(H5F_shared_t *shared,
    hbool_t incr_tick_num)
{
    struct timespec curr;               /* Current time in struct timespec */
    struct timespec new_end_of_tick;    /* new end_of_tick in struct timespec */
    int64_t curr_nsecs;                 /* current time in nanoseconds */
    int64_t tlen_nsecs;                 /* tick_len in nanoseconds */
    int64_t new_end_nsecs;              /* new end_of_tick in nanoseconds */
    herr_t ret_value = SUCCEED;         /* Return value */

    FUNC_ENTER_STATIC

    /* Get current time in struct timespec.  CLOCK_MONOTONIC is used so
     * that the end-of-tick deadline is immune to wall-clock adjustments.
     */
    if ( HDclock_gettime(CLOCK_MONOTONIC, &curr) < 0 )

        HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, \
                    "can't get time via clock_gettime")

    /* Convert curr to nsecs */
    curr_nsecs = curr.tv_sec * nanosecs_per_second + curr.tv_nsec;

    /* Convert tick_len (expressed in tenths of a second) to nanosecs */
    tlen_nsecs = shared->vfd_swmr_config.tick_len * nanosecs_per_tenth_sec;

    /*
     * Update shared->tick_num (only when the caller asks for it --
     * readers advance the tick elsewhere and pass FALSE)
     */
    if ( incr_tick_num ) {

        shared->tick_num++;

        hlog_fast(tick, "%s tick %" PRIu64 " -> %" PRIu64,
            __func__, shared->tick_num - 1, shared->tick_num);

        /* Keep the page buffer's notion of the current tick in sync */
        if ( H5PB_vfd_swmr__set_tick(shared) < 0 )

            HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, \
                        "Can't update page buffer current tick")
    }

    /*
     * Update shared->end_of_tick
     */
    /* Calculate new end_of_tick = now + one tick length */

    /* TODO: The modulo operation is very expensive on most machines --
     *       re-work this code so as to avoid it.
     *
     *                                    JRM -- 11/12/18
     */

    new_end_nsecs = curr_nsecs + tlen_nsecs;
    new_end_of_tick.tv_nsec = (long)(new_end_nsecs % nanosecs_per_second);
    new_end_of_tick.tv_sec = new_end_nsecs / nanosecs_per_second;

    shared->end_of_tick = new_end_of_tick;

done:

    FUNC_LEAVE_NOAPI(ret_value)

} /* H5F__vfd_swmr_update_end_of_tick_and_tick_num() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F__vfd_swmr_construct_write_md_hdr
+ *
+ * Purpose: Encode and write header to the metadata file.
+ *
+ * This is used by the VFD SWMR writer:
+ *
+ * --when opening an existing HDF5 file
+ * --when closing the HDF5 file
+ * --after flushing an HDF5 file
+ * --when updating the metadata file
+ *
+ * Return: Success: SUCCEED
+ * Failure: FAIL
+ *
+ * Programmer: Vailin Choi -- 11/??/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5F__vfd_swmr_construct_write_md_hdr(H5F_shared_t *shared, uint32_t num_entries)
+{
+ uint8_t image[H5FD_MD_HEADER_SIZE]; /* Buffer for header */
+ uint8_t *p = NULL; /* Pointer to buffer */
+ uint32_t metadata_chksum; /* Computed metadata checksum value */
+ /* Size of header and index */
+ const size_t hdr_size = H5FD_MD_HEADER_SIZE;
+ ssize_t nwritten;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /*
+ * Encode metadata file header
+ */
+ p = image;
+
+ /* Encode magic for header */
+ HDmemcpy(p, H5FD_MD_HEADER_MAGIC, (size_t)H5_SIZEOF_MAGIC);
+ p += H5_SIZEOF_MAGIC;
+
+ /* Encode page size, tick number, index offset, index length */
+ UINT32ENCODE(p, shared->fs_page_size);
+ UINT64ENCODE(p, shared->tick_num);
+ UINT64ENCODE(p, shared->writer_index_offset);
+ UINT64ENCODE(p, H5FD_MD_INDEX_SIZE(num_entries));
+
+ /* Calculate checksum for header */
+ metadata_chksum = H5_checksum_metadata(image, (size_t)(p - image), 0);
+
+ /* Encode checksum for header */
+ UINT32ENCODE(p, metadata_chksum);
+
+ /* Sanity checks on header */
+ HDassert(p - image == (ptrdiff_t)hdr_size);
+
+ /* Set to beginning of the file */
+ if ( HDlseek(shared->vfd_swmr_md_fd, H5FD_MD_HEADER_OFF, SEEK_SET) < 0 )
+
+ HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, \
+ "unable to seek in metadata file")
+
+ nwritten = HDwrite(shared->vfd_swmr_md_fd, image, hdr_size);
+ /* Write header to the metadata file */
+ if (nwritten != (ssize_t)hdr_size) {
+ HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \
+ "error in writing header to metadata file")
+ }
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5F__vfd_swmr_construct_write_md_hdr() */
+
+
/*-------------------------------------------------------------------------
 *
 * Function:    H5F__vfd_swmr_construct_write_md_idx
+ *
+ * Purpose: Encode and write index to the metadata file.
+ *
+ * This is used by the VFD SWMR writer:
+ *
+ * --when opening an existing HDF5 file
+ * --when closing the HDF5 file
+ * --after flushing an HDF5 file
+ * --when updating the metadata file
+ *
+ * Return: Success: SUCCEED
+ * Failure: FAIL
+ *
+ * Programmer: Vailin Choi -- 11/??/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
static herr_t
H5F__vfd_swmr_construct_write_md_idx(H5F_shared_t *shared,
    uint32_t num_entries, struct H5FD_vfd_swmr_idx_entry_t index[])
{
    uint8_t *image = NULL;          /* Buffer for the encoded index */
    uint8_t *p = NULL;              /* Cursor into the buffer */
    uint32_t metadata_chksum;       /* Computed metadata checksum value */
    /* Size of index */
    const size_t idx_size = H5FD_MD_INDEX_SIZE(num_entries);
    ssize_t nwritten;               /* Bytes actually written */
    unsigned i;                     /* Local index variable */
    herr_t ret_value = SUCCEED;     /* Return value */

    FUNC_ENTER_STATIC

    HDassert(num_entries == 0 || index != NULL);

    /* Allocate space for the buffer to hold the index */
    if ( (image = HDmalloc(idx_size)) == NULL )

        HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, \
                    "memory allocation failed for md index")

    /*
     * Encode metadata file index
     */
    p = image;

    /* Encode magic for index */
    HDmemcpy(p, H5FD_MD_INDEX_MAGIC, H5_SIZEOF_MAGIC);
    p += H5_SIZEOF_MAGIC;

    /* Encode tick number */
    UINT64ENCODE(p, shared->tick_num);

    /* Encode number of entries in index */
    UINT32ENCODE(p, num_entries);

    /* Encode the index entries.
     *
     * NOTE(review): hdf5_page_offset and md_file_page_offset are printed
     * elsewhere in this file with PRIu64, yet are encoded here with
     * UINT32ENCODE, which stores only the low 32 bits.  This presumably
     * matches the fixed-width on-disk entry layout implied by
     * H5FD_MD_INDEX_ENTRY_SIZE -- TODO confirm against the decode path
     * that page offsets can never exceed 32 bits.
     */
    for(i = 0; i < num_entries; i++) {
        UINT32ENCODE(p, index[i].hdf5_page_offset);
        UINT32ENCODE(p, index[i].md_file_page_offset);
        UINT32ENCODE(p, index[i].length);
        UINT32ENCODE(p, index[i].chksum);
    }

    /* Calculate checksum for index (covers everything encoded so far) */
    metadata_chksum = H5_checksum_metadata(image, (size_t)(p - image), 0);

    /* Encode checksum for index */
    UINT32ENCODE(p, metadata_chksum);

    /* Sanity checks on index: the cursor must have advanced exactly
     * idx_size bytes
     */
    HDassert(p - image == (ptrdiff_t)idx_size);

    /* Verify the md file descriptor exists */
    HDassert(shared->vfd_swmr_md_fd >= 0);

    /* The index is written at the offset recorded in the header */
    if (HDlseek(shared->vfd_swmr_md_fd,
            (HDoff_t)shared->writer_index_offset, SEEK_SET) < 0)
        HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, \
                    "unable to seek in metadata file")

    nwritten = HDwrite(shared->vfd_swmr_md_fd, image, idx_size);
    /* Write index to the metadata file */
    if (nwritten != (ssize_t)idx_size) {
        HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \
                    "error in writing index to metadata file")
    }

done:

    if ( image ) {

        HDfree(image);
    }

    FUNC_LEAVE_NOAPI(ret_value)

} /* H5F__vfd_swmr_construct_write_md_idx() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5F__idx_entry_cmp()
+ *
+ * Purpose: Callback used by HDqsort to sort entries in the index
+ *
+ * Return: 0 if the entries are the same
+ * -1 if entry1's offset is less than that of entry2
+ * 1 if entry1's offset is greater than that of entry2
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5F__idx_entry_cmp(const void *_entry1, const void *_entry2)
+{
+ const H5FD_vfd_swmr_idx_entry_t *entry1 = _entry1;
+ const H5FD_vfd_swmr_idx_entry_t *entry2 = _entry2;
+
+ int ret_value = 0; /* Return value */
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Sanity checks */
+ HDassert(entry1);
+ HDassert(entry2);
+
+ if(entry1->hdf5_page_offset < entry2->hdf5_page_offset)
+ ret_value = -1;
+ else if(entry1->hdf5_page_offset > entry2->hdf5_page_offset)
+ ret_value = 1;
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5F__idx_entry_cmp() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F__vfd_swmr_create_index
+ *
+ * Purpose: Allocate and initialize the index for the VFD SWMR metadata
+ * file.
+ *
+ * Return: SUCCEED/FAIL
+ *
+ * Programmer: John Mainzer 11/5/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
static herr_t
H5F__vfd_swmr_create_index(H5F_shared_t *shared)
{
    size_t bytes_available;         /* Shadow-file bytes reserved for the index */
    size_t entries_in_index;        /* Number of entry slots that space allows */
    H5FD_vfd_swmr_idx_entry_t * index;
    herr_t ret_value = SUCCEED;

    FUNC_ENTER_STATIC

    HDassert(shared->vfd_swmr);
    HDassert(shared->mdf_idx == NULL);
    HDassert(shared->mdf_idx_len == 0);
    HDassert(shared->mdf_idx_entries_used == 0);

    /* All reserved metadata-file pages except one (the header page) are
     * available to hold the index.
     */
    bytes_available =
        (size_t)shared->fs_page_size *
        (size_t)(shared->vfd_swmr_config.md_pages_reserved - 1);

    HDassert(bytes_available > 0);

    /* Subtract the fixed per-index overhead (an empty index), then divide
     * by the on-disk size of one entry to get the slot count.
     */
    entries_in_index =
        (bytes_available - H5FD_MD_INDEX_SIZE(0)) / H5FD_MD_INDEX_ENTRY_SIZE;

    HDassert(entries_in_index > 0);

    /* Zero-fill the allocation so that unused slots read as empty */
    index = HDcalloc(entries_in_index, sizeof(index[0]));

    if (index == NULL) {
        HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL,
                    "memory allocation failed for md index")
    }

    /* mdf_idx_len is 32 bits wide; the computed count must fit */
    HDassert(entries_in_index <= UINT32_MAX);

    shared->mdf_idx = index;
    shared->mdf_idx_len = (uint32_t)entries_in_index;
    shared->mdf_idx_entries_used = 0;
done:
    FUNC_LEAVE_NOAPI(ret_value)
}
+
+H5FD_vfd_swmr_idx_entry_t *
+vfd_swmr_enlarge_shadow_index(H5F_t *f)
+{
+ H5F_shared_t *shared = f->shared;
+ H5FD_vfd_swmr_idx_entry_t *ret_value = NULL;
+ haddr_t idx_addr;
+ hsize_t idx_size;
+ H5FD_vfd_swmr_idx_entry_t *new_mdf_idx = NULL, *old_mdf_idx;
+ uint32_t new_mdf_idx_len, old_mdf_idx_len;
+
+ FUNC_ENTER_NOAPI(NULL)
+
+ hlog_fast(shadow_index_enlarge, "Enlarging shadow index.");
+
+ old_mdf_idx = shared->mdf_idx;
+ old_mdf_idx_len = shared->mdf_idx_len;
+
+ if (UINT32_MAX - old_mdf_idx_len >= old_mdf_idx_len)
+ new_mdf_idx_len = old_mdf_idx_len * 2;
+ else
+ new_mdf_idx_len = UINT32_MAX;
+
+ idx_size = H5FD_MD_INDEX_SIZE(new_mdf_idx_len);
+
+ idx_addr = H5MV_alloc(f, idx_size);
+
+ if (idx_addr == HADDR_UNDEF) {
+ HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL,
+ "shadow-file allocation failed for index")
+ }
+
+ new_mdf_idx = HDmalloc(new_mdf_idx_len * sizeof(new_mdf_idx[0]));
+
+ if (new_mdf_idx == NULL) {
+ (void)H5MV_free(f, idx_addr, idx_size);
+ HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL,
+ "memory allocation failed for md index")
+ }
+
+ /* Copy the old index in its entirety to the new, instead of copying
+ * just the _entries_used, because the caller may have been in the
+ * process of adding entries, and some callers may not update
+ * _entries_used immediately.
+ */
+ memcpy(new_mdf_idx, old_mdf_idx, sizeof(new_mdf_idx[0]) * old_mdf_idx_len);
+
+ shared->writer_index_offset = idx_addr;
+ ret_value = shared->mdf_idx = new_mdf_idx;
+ shared->mdf_idx_len = new_mdf_idx_len;
+
+ /* Postpone reclamation of the old index until max_lag ticks from now.
+ * It's only necessary to wait until after the new index is in place,
+ * so it's possible that some disused shadow storage will build up
+ * past what is strictly necessary, but it seems like a reasonable
+ * trade-off for simplicity.
+ */
+ if (shadow_range_defer_free(shared, shared->writer_index_offset,
+ H5FD_MD_INDEX_SIZE(old_mdf_idx_len)) == -1) {
+ HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL,
+ "could not schedule index reclamation");
+ }
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F__vfd_swmr_writer__wait_a_tick
+ *
 * Purpose:     Before a file that has been opened by a VFD SWMR writer
 *              may be flushed or closed, all pending delayed writes must
 *              be allowed to drain.
+ *
+ * This function facilitates this by sleeping for a tick, and
+ * then running the writer end of tick function.
+ *
+ * It should only be called as part the flush or close operations.
+ *
+ * Return: SUCCEED/FAIL
+ *
+ * Programmer: John Mainzer 11/23/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5F__vfd_swmr_writer__wait_a_tick(H5F_t *f)
+{
+ int result;
+ struct timespec req;
+ struct timespec rem;
+ uint64_t tick_in_nsec;
+ H5F_shared_t *shared = f->shared;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ HDassert(shared->vfd_swmr);
+ HDassert(shared->vfd_swmr_writer);
+
+ tick_in_nsec = shared->vfd_swmr_config.tick_len * nanosecs_per_tenth_sec;
+ req.tv_nsec = (long)(tick_in_nsec % nanosecs_per_second);
+ req.tv_sec = (time_t)(tick_in_nsec / nanosecs_per_second);
+
+ result = HDnanosleep(&req, &rem);
+
+ while ( result == -1 ) {
+
+ req = rem;
+ result = HDnanosleep(&req, &rem);
+ }
+
+ if ( result != 0 )
+
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "HDnanosleep() failed.")
+
+ if ( H5F_vfd_swmr_writer_end_of_tick(f, false) < 0 )
+
+ HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, \
+ "H5F_vfd_swmr_writer_end_of_tick() failed.")
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5F__vfd_swmr_writer__wait_a_tick() */
+
+herr_t
+H5F_vfd_swmr_process_eot_queue(bool entering_api)
+{
+ struct timespec now;
+ eot_queue_entry_t *first_head, *head;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ first_head = head = TAILQ_FIRST(&eot_queue_g);
+
+ do {
+ H5F_t *f = head->vfd_swmr_file;
+ H5F_shared_t *shared = f->shared;
+
+ if(HDclock_gettime(CLOCK_MONOTONIC, &now) < 0) {
+ HGOTO_ERROR(H5E_FUNC, H5E_CANTGET, FAIL,
+ "can't get time via clock_gettime");
+ }
+ if(timespeccmp(&now, &head->end_of_tick, <))
+ break;
+ /* If the H5F_shared_t is labeled with a later EOT time than
+ * the queue entry is, then we have already performed the
+ * H5F_shared_t's EOT processing. That can happen if
+ * multiple H5F_t share the H5F_shared_t. Just update the
+ * EOT queue entry and move to the next.
+ */
+ if (timespeccmp(&head->end_of_tick, &shared->end_of_tick, <)) {
+ H5F_vfd_swmr_update_entry_eot(head);
+ } else if (shared->vfd_swmr_writer) {
+ if (H5F_vfd_swmr_writer_end_of_tick(f, false) < 0)
+ HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, FAIL,
+ "end of tick error for VFD SWMR writer");
+ } else if (H5F_vfd_swmr_reader_end_of_tick(f, entering_api) < 0) {
+ HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, FAIL,
+ "end of tick error for VFD SWMR reader");
+ }
+ } while ((head = TAILQ_FIRST(&eot_queue_g)) != NULL && head != first_head);
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
diff --git a/src/H5Gcache.c b/src/H5Gcache.c
index 13a33a3..ca67d56 100644
--- a/src/H5Gcache.c
+++ b/src/H5Gcache.c
@@ -101,6 +101,7 @@ const H5AC_class_t H5AC_SNODE[1] = {{
NULL, /* 'notify' callback */
H5G__cache_node_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
diff --git a/src/H5HFcache.c b/src/H5HFcache.c
index 8dbdf25..22a2623 100644
--- a/src/H5HFcache.c
+++ b/src/H5HFcache.c
@@ -146,6 +146,7 @@ const H5AC_class_t H5AC_FHEAP_HDR[1] = {{
NULL, /* 'notify' callback */
H5HF__cache_hdr_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* H5HF indirect block inherits cache-like properties from H5AC */
@@ -164,6 +165,7 @@ const H5AC_class_t H5AC_FHEAP_IBLOCK[1] = {{
H5HF__cache_iblock_notify, /* 'notify' callback */
H5HF__cache_iblock_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* H5HF direct block inherits cache-like properties from H5AC */
@@ -182,6 +184,7 @@ const H5AC_class_t H5AC_FHEAP_DBLOCK[1] = {{
H5HF__cache_dblock_notify, /* 'notify' callback */
H5HF__cache_dblock_free_icr, /* 'free_icr' callback */
H5HF__cache_dblock_fsf_size, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
diff --git a/src/H5HG.c b/src/H5HG.c
index f9d780c..8177dc3 100644
--- a/src/H5HG.c
+++ b/src/H5HG.c
@@ -610,6 +610,9 @@ H5HG_read(H5F_t *f, H5HG_t *hobj, void *object/*out*/, size_t *buf_size)
if(NULL == (heap = H5HG__protect(f, hobj->addr, H5AC__READ_ONLY_FLAG)))
HGOTO_ERROR(H5E_HEAP, H5E_CANTPROTECT, NULL, "unable to protect global heap")
+ if (hobj->idx >= heap->nused && H5HG_trap("out of bounds"))
+ HGOTO_ERROR(H5E_HEAP, H5E_BADRANGE, NULL, "address out of bounds")
+
HDassert(hobj->idx < heap->nused);
HDassert(heap->obj[hobj->idx].begin);
size = heap->obj[hobj->idx].size;
diff --git a/src/H5HGcache.c b/src/H5HGcache.c
index 7485aad..938c575 100644
--- a/src/H5HGcache.c
+++ b/src/H5HGcache.c
@@ -95,6 +95,7 @@ const H5AC_class_t H5AC_GHEAP[1] = {{
NULL, /* 'notify' callback */
H5HG__cache_heap_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
diff --git a/src/H5HGprivate.h b/src/H5HGprivate.h
index 573ef39..760c0bf 100644
--- a/src/H5HGprivate.h
+++ b/src/H5HGprivate.h
@@ -73,5 +73,7 @@ H5_DLL size_t H5HG_get_free_size(const H5HG_heap_t *h);
H5_DLL herr_t H5HG_debug(H5F_t *f, haddr_t addr, FILE *stream, int indent,
int fwidth);
+bool H5HG_trap(const char *);
+
#endif /* _H5HGprivate_H */
diff --git a/src/H5HGtrap.c b/src/H5HGtrap.c
new file mode 100644
index 0000000..2f09d48
--- /dev/null
+++ b/src/H5HGtrap.c
@@ -0,0 +1,30 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+/****************/
+/* Module Setup */
+/****************/
+
+#include "H5HGmodule.h" /* This source code file is part of the H5HG module */
+
+/*
+ * Headers
+ */
+#include "H5private.h" /* Generic Functions */
+#include "H5Eprivate.h" /* Error handling */
+#include "H5HGpkg.h" /* Global heaps */
+
+/* Diagnostic trap hook for global-heap consistency violations.
+ *
+ * Called with a short `reason` string when a suspicious condition is
+ * detected (e.g. the out-of-bounds heap-object index check added to
+ * H5HG_read()); returning true tells the caller to fail with an HDF5
+ * error instead of falling through to the HDassert().  This default
+ * implementation ignores the reason and declines the trap; it exists
+ * as a place to set a breakpoint or to override during debugging.
+ *
+ * `reason` is deliberately unused here; H5_ATTR_UNUSED keeps
+ * warning-clean builds (-Wunused-parameter) quiet.
+ */
+bool
+H5HG_trap(const char H5_ATTR_UNUSED *reason)
+{
+    return false;
+}
diff --git a/src/H5HLcache.c b/src/H5HLcache.c
index 734ec5c..5a7321f 100644
--- a/src/H5HLcache.c
+++ b/src/H5HLcache.c
@@ -118,6 +118,7 @@ const H5AC_class_t H5AC_LHEAP_PRFX[1] = {{
NULL, /* 'notify' callback */
H5HL__cache_prefix_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
const H5AC_class_t H5AC_LHEAP_DBLK[1] = {{
@@ -135,6 +136,7 @@ const H5AC_class_t H5AC_LHEAP_DBLK[1] = {{
H5HL__cache_datablock_notify, /* 'notify' callback */
H5HL__cache_datablock_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
diff --git a/src/H5MF.c b/src/H5MF.c
index fac6620..22438d3 100644
--- a/src/H5MF.c
+++ b/src/H5MF.c
@@ -22,6 +22,9 @@
*-------------------------------------------------------------------------
*/
+#include "H5queue.h"
+#include "hlog.h"
+
/****************/
/* Module Setup */
/****************/
@@ -43,6 +46,8 @@
#include "H5VMprivate.h" /* Vectors and arrays */
+#include "hlog.h"
+
/****************/
/* Local Macros */
/****************/
@@ -88,6 +93,7 @@ typedef struct {
/* Local Prototypes */
/********************/
+static herr_t H5MF__xfree_impl(H5F_t *, H5FD_mem_t, haddr_t, hsize_t);
/* Allocator routines */
static haddr_t H5MF__alloc_pagefs(H5F_t *f, H5FD_mem_t alloc_type, hsize_t size);
@@ -125,11 +131,92 @@ hbool_t H5_PKG_INIT_VAR = FALSE;
/* Library Private Variables */
/*****************************/
-
/*******************/
/* Local Variables */
/*******************/
+HLOG_OUTLET_DECL(h5mf);
+HLOG_OUTLET_SHORT_DEFN(h5mf, all);
+HLOG_OUTLET_SHORT_DEFN(h5mf_defer, h5mf);
+HLOG_OUTLET_SHORT_DEFN(h5mf_free, h5mf);
+HLOG_OUTLET_SHORT_DEFN(h5mf_alloc, h5mf);
+HLOG_OUTLET_MEDIUM_DEFN(noisy_h5mf_alloc, h5mf_alloc, HLOG_OUTLET_S_OFF);
+HLOG_OUTLET_SHORT_DEFN(h5mf_extend, h5mf);
+HLOG_OUTLET_SHORT_DEFN(h5mf_shrink, h5mf);
+
+/* Record a deferred free of file space for a VFD SWMR writer.
+ *
+ * The block of `size` bytes at `addr` (allocation type `alloc_type`)
+ * is queued on `shared->lower_defrees` instead of being released
+ * immediately; the record carries the first tick after which the real
+ * free may proceed (current tick + max_lag).
+ *
+ * Return: SUCCEED, or FAIL if the record could not be allocated.
+ *
+ * NOTE(review): the record is allocated with plain malloc() rather
+ * than the library's usual H5MM routines; this pairs with the free()
+ * in H5MF_process_deferred_frees() -- confirm this is intentional.
+ */
+static herr_t
+defer_free(H5F_shared_t *shared, H5FD_mem_t alloc_type, haddr_t addr,
+    hsize_t size)
+{
+    lower_defree_t *df;
+
+    if ((df = malloc(sizeof(*df))) == NULL)
+        return FAIL;
+
+    df->alloc_type = alloc_type;
+    df->addr = addr;
+    df->size = size;
+    /* Earliest tick after which the space may actually be released:
+     * the current tick plus the configured maximum reader lag.
+     */
+    df->free_after_tick = shared->tick_num + shared->vfd_swmr_config.max_lag;
+
+    hlog_fast(h5mf_defer,
+        "%s.%d: deferred free at %" PRIuHADDR ", %" PRIuHSIZE
+        " bytes until tick %" PRIu64, __FILE__, __LINE__, addr, size,
+        df->free_after_tick);
+
+    SIMPLEQ_INSERT_TAIL(&shared->lower_defrees, df, link);
+
+    return SUCCEED;
+}
+
+/* Return the total number of bytes currently awaiting deferred free
+ * on `shared`'s lower_defrees queue (diagnostic; logged at close).
+ */
+static uint64_t
+H5MF_total_deferred_frees(H5F_shared_t *shared)
+{
+    uint64_t nbytes = 0;
+    lower_defree_t *item;
+
+    SIMPLEQ_FOREACH(item, &shared->lower_defrees, link)
+        nbytes += item->size;
+
+    return nbytes;
+}
+
+/* Release any deferred frees whose waiting period has expired.
+ *
+ * Walks the deferred-free queue on f->shared and, for every record
+ * whose `free_after_tick` is strictly less than `tick_num`, performs
+ * the real deallocation via H5MF__xfree_impl() and discards the
+ * record.  Callers pass UINT64_MAX to drain the queue unconditionally
+ * (see H5MF_free_aggrs() at file close).
+ *
+ * Return: SUCCEED, or FAIL if any individual deallocation failed.
+ * A record whose deallocation fails is dropped (logged), not retried;
+ * unexpired records are re-queued for a later call.
+ */
+herr_t
+H5MF_process_deferred_frees(H5F_t *f, const uint64_t tick_num)
+{
+    lower_defree_t *df;
+    herr_t err = SUCCEED;
+    H5F_shared_t *shared = f->shared;
+    lower_defree_queue_t defrees = SIMPLEQ_HEAD_INITIALIZER(defrees);
+
+    /* Have to empty the queue before processing it because we
+     * could re-enter this routine through H5MF__xfree_impl.  If
+     * items were still on the queue, we would enter
+     * H5MF_process_deferred_frees() recursively until the queue was empty.
+     */
+    SIMPLEQ_CONCAT(&defrees, &shared->lower_defrees);
+
+    while ((df = SIMPLEQ_FIRST(&defrees)) != NULL) {
+        /* Queue is FIFO and records are appended with monotonically
+         * computed free_after_tick values (see defer_free()), so the
+         * first unexpired record ends the scan -- TODO confirm
+         * tick_num is non-decreasing across calls.
+         */
+        if (tick_num <= df->free_after_tick)
+            break;
+        hlog_fast(h5mf_defer,
+            "%s.%d: processing free at %" PRIuHADDR ", %" PRIuHSIZE " bytes",
+            __FILE__, __LINE__, df->addr, df->size);
+        SIMPLEQ_REMOVE_HEAD(&defrees, link);
+        if (H5MF__xfree_impl(f, df->alloc_type, df->addr, df->size) < 0)
+            err = FAIL;
+        /* Record is released whether or not the free succeeded. */
+        free(df);
+    }
+
+    if (err != SUCCEED) {
+        hlog_fast(h5mf_defer, "%s.%d: error: dropped entries on the floor",
+            __FILE__, __LINE__);
+    }
+
+    /* Save remaining entries for processing, later. */
+    SIMPLEQ_CONCAT(&shared->lower_defrees, &defrees);
+
+    return err;
+}
/*-------------------------------------------------------------------------
@@ -795,9 +882,11 @@ H5MF_alloc(H5F_t *f, H5FD_mem_t alloc_type, hsize_t size)
haddr_t ret_value = HADDR_UNDEF; /* Return value */
FUNC_ENTER_NOAPI_TAG(H5AC__FREESPACE_TAG, HADDR_UNDEF)
-#ifdef H5MF_ALLOC_DEBUG
-HDfprintf(stderr, "%s: alloc_type = %u, size = %Hu\n", FUNC, (unsigned)alloc_type, size);
-#endif /* H5MF_ALLOC_DEBUG */
+
+ hlog_fast(h5mf_alloc,
+ "%s: enter %p type %u size %" PRIuHSIZE " tick %" PRIu64,
+ __func__, (void *)f->shared, (unsigned)alloc_type, size,
+ f->shared->vfd_swmr_writer ? f->shared->tick_num : 0);
/* check arguments */
HDassert(f);
@@ -805,11 +894,15 @@ HDfprintf(stderr, "%s: alloc_type = %u, size = %Hu\n", FUNC, (unsigned)alloc_typ
HDassert(f->shared->lf);
HDassert(size > 0);
+ if (!f->shared->vfd_swmr_writer)
+ ; // not a VFD SWMR writer, do not process deferrals
+ else if (H5MF_process_deferred_frees(f, f->shared->tick_num) < 0) {
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTGC, HADDR_UNDEF,
+ "could not process deferrals")
+ }
H5MF__alloc_to_fs_type(f->shared, alloc_type, size, &fs_type);
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: Check 1.0\n", FUNC);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
+ hlog_fast(noisy_h5mf_alloc, "%s: Check 1.0", __func__);
/* Set the ring type in the API context */
if(H5MF__fsm_type_is_self_referential(f->shared, fs_type))
@@ -843,9 +936,9 @@ HDfprintf(stderr, "%s: Check 1.0\n", FUNC);
/* If no space is found from the free-space manager, continue further action */
if(!H5F_addr_defined(ret_value)) {
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: Check 2.0\n", FUNC);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
+
+ hlog_fast(noisy_h5mf_alloc, "%s: Check 2.0", __func__);
+
if(f->shared->fs_strategy == H5F_FSPACE_STRATEGY_PAGE) {
HDassert(f->shared->fs_page_size >= H5F_FILE_SPACE_PAGE_SIZE_MIN);
if(HADDR_UNDEF == (ret_value = H5MF__alloc_pagefs(f, alloc_type, size)))
@@ -857,18 +950,17 @@ HDfprintf(stderr, "%s: Check 2.0\n", FUNC);
} /* end else */
} /* end if */
HDassert(H5F_addr_defined(ret_value));
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: Check 3.0\n", FUNC);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
+
+    hlog_fast(noisy_h5mf_alloc, "%s: Check 3.0", __func__);
done:
/* Reset the ring in the API context */
if(orig_ring != H5AC_RING_INV)
H5AC_set_ring(orig_ring, NULL);
-#ifdef H5MF_ALLOC_DEBUG
-HDfprintf(stderr, "%s: Leaving: ret_value = %a, size = %Hu\n", FUNC, ret_value, size);
-#endif /* H5MF_ALLOC_DEBUG */
+ hlog_fast(h5mf_alloc,
+ "%s: leave %p type %u addr %" PRIuHADDR " size %" PRIuHSIZE,
+ __func__, (void *)f->shared, (unsigned)alloc_type, ret_value, size);
#ifdef H5MF_ALLOC_DEBUG_DUMP
H5MF__sects_dump(f, stderr);
#endif /* H5MF_ALLOC_DEBUG_DUMP */
@@ -986,7 +1078,7 @@ HDfprintf(stderr, "%s: alloc_type = %u, size = %Hu\n", FUNC, (unsigned)alloc_typ
/* Insert the new page into the Page Buffer list of new pages so
we don't read an empty page from disk */
- if(f->shared->page_buf != NULL && H5PB_add_new_page(f->shared, alloc_type, new_page) < 0)
+ if(f->shared->pb_ptr != NULL && H5PB_add_new_page(f->shared, alloc_type, new_page) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINSERT, HADDR_UNDEF, "can't add new page to Page Buffer new page list")
ret_value = new_page;
@@ -1094,17 +1186,14 @@ done:
herr_t
H5MF_xfree(H5F_t *f, H5FD_mem_t alloc_type, haddr_t addr, hsize_t size)
{
- H5F_mem_page_t fs_type; /* Free space type (mapped from allocation type) */
- H5MF_free_section_t *node = NULL; /* Free space section pointer */
- unsigned ctype; /* section class type */
- H5AC_ring_t orig_ring = H5AC_RING_INV; /* Original ring value */
- H5AC_ring_t fsm_ring; /* Ring of FSM */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI_TAG(H5AC__FREESPACE_TAG, FAIL)
-#ifdef H5MF_ALLOC_DEBUG
-HDfprintf(stderr, "%s: Entering - alloc_type = %u, addr = %a, size = %Hu\n", FUNC, (unsigned)alloc_type, addr, size);
-#endif /* H5MF_ALLOC_DEBUG */
+
+ hlog_fast(h5mf_free, "%s: Entering - alloc_type %u addr %" PRIuHADDR
+ " size %" PRIuHSIZE " tick %" PRIu64,
+ __func__, (unsigned)alloc_type, addr, size,
+ f->shared->vfd_swmr_writer ? f->shared->tick_num : 0);
/* check arguments */
HDassert(f);
@@ -1112,6 +1201,39 @@ HDfprintf(stderr, "%s: Entering - alloc_type = %u, addr = %a, size = %Hu\n", FUN
HGOTO_DONE(SUCCEED)
HDassert(addr != 0); /* Can't deallocate the superblock :-) */
+ if (!f->shared->vfd_swmr_writer || f->shared->closing ||
+ alloc_type != H5FD_MEM_DRAW) {
+        /* VFD SWMR writers defer raw-data deallocations until the
+         * file starts to close.
+         */
+ ret_value = H5MF__xfree_impl(f, alloc_type, addr, size);
+ } else if (defer_free(f->shared, alloc_type, addr, size) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTFREE, FAIL, "could not defer")
+ else if (H5MF_process_deferred_frees(f, f->shared->tick_num) < 0) {
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTGC, FAIL,
+ "could not process deferrals")
+ }
+
+done:
+ FUNC_LEAVE_NOAPI_TAG(ret_value)
+}
+
+static herr_t
+H5MF__xfree_inner_impl(H5F_t *f, H5FD_mem_t alloc_type, haddr_t addr, hsize_t size)
+{
+ H5F_mem_page_t fs_type; /* Free space type (mapped from allocation type) */
+ H5MF_free_section_t *node = NULL; /* Free space section pointer */
+ unsigned ctype; /* section class type */
+ H5AC_ring_t orig_ring = H5AC_RING_INV; /* Original ring value */
+ H5AC_ring_t fsm_ring; /* Ring of FSM */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ hlog_fast(h5mf_free,
+ "%s: enter %p type %u addr %" PRIuHADDR " size %" PRIuHSIZE,
+ __func__, (void *)f->shared, (unsigned)alloc_type, addr, size);
+
+ FUNC_ENTER_STATIC_TAG(H5AC__FREESPACE_TAG)
+
H5MF__alloc_to_fs_type(f->shared, alloc_type, size, &fs_type);
/* Set the ring type in the API context */
@@ -1149,28 +1271,27 @@ HDfprintf(stderr, "%s: Entering - alloc_type = %u, addr = %a, size = %Hu\n", FUN
* see if we can avoid creating one by checking if the freed
* space is at the end of the file
*/
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: fs_addr = %a\n", FUNC, f->shared->fs_addr[fs_type]);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
+
+ hlog_fast(h5mf_free, "%s: fs_addr %" PRIuHADDR, __func__,
+ f->shared->fs_addr[fs_type]);
+
if(!H5F_addr_defined(f->shared->fs_addr[fs_type])) {
htri_t status; /* "can absorb" status for section into */
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: Trying to avoid starting up free space manager\n", FUNC);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
+ hlog_fast(h5mf_free, "%s: Trying to avoid starting up free space manager", __func__);
+
/* Try to shrink the file or absorb the block into a block aggregator */
if((status = H5MF_try_shrink(f, alloc_type, addr, size)) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTMERGE, FAIL, "can't check for absorbing block")
else if(status > 0)
- /* Indicate success */
HGOTO_DONE(SUCCEED)
else if(size < f->shared->fs_threshold) {
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: dropping addr = %a, size = %Hu, on the floor!\n", FUNC, addr, size);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
+ hlog_fast(h5mf_free, "%s: dropping addr %" PRIuHADDR
+ " size %" PRIuHSIZE " on the floor!",
+ __func__, addr, size);
HGOTO_DONE(SUCCEED)
- } /* end else-if */
- } /* end if */
+ }
+ }
/* If we are deleting the free space manager, leave now, to avoid
* [re-]starting it.
@@ -1183,11 +1304,10 @@ HDfprintf(stderr, "%s: dropping addr = %a, size = %Hu, on the floor!\n", FUNC, a
*/
if(f->shared->fs_state[fs_type] == H5F_FS_STATE_DELETING ||
!H5F_HAVE_FREE_SPACE_MANAGER(f)) {
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: dropping addr = %a, size = %Hu, on the floor!\n", FUNC, addr, size);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
+ hlog_fast(h5mf_free, "%s: dropping addr %" PRIuHADDR
+ " size %" PRIuHSIZE " on the floor!", __func__, addr, size);
HGOTO_DONE(SUCCEED)
- } /* end if */
+ }
/* There's either already a free space manager, or the freed
* space isn't at the end of the file, so start up (or create)
@@ -1195,7 +1315,7 @@ HDfprintf(stderr, "%s: dropping addr = %a, size = %Hu, on the floor!\n", FUNC, a
*/
if(H5MF__start_fstype(f, fs_type) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINIT, FAIL, "can't initialize file free space")
- } /* end if */
+ }
/* Create the free-space section for the freed section */
ctype = H5MF_SECT_CLASS_TYPE(f, size);
@@ -1206,20 +1326,15 @@ HDfprintf(stderr, "%s: dropping addr = %a, size = %Hu, on the floor!\n", FUNC, a
if(size >= f->shared->fs_threshold) {
HDassert(f->shared->fs_man[fs_type]);
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: Before H5FS_sect_add()\n", FUNC);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
+ hlog_fast(h5mf_free, "%s: Before H5FS_sect_add()", __func__);
/* Add to the free space for the file */
if(H5MF__add_sect(f, alloc_type, f->shared->fs_man[fs_type], node) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINSERT, FAIL, "can't add section to file free space")
node = NULL;
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: After H5FS_sect_add()\n", FUNC);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
- } /* end if */
- else {
+ hlog_fast(h5mf_free, "%s: After H5FS_sect_add()", __func__);
+ } else {
htri_t merged; /* Whether node was merged */
H5MF_sect_ud_t udata; /* User data for callback */
@@ -1235,7 +1350,7 @@ HDfprintf(stderr, "%s: After H5FS_sect_add()\n", FUNC);
else if(merged == TRUE) /* successfully merged */
/* Indicate that the node was used */
node = NULL;
- } /* end else */
+ }
done:
/* Reset the ring in the API context */
@@ -1247,14 +1362,65 @@ done:
if(H5MF__sect_free((H5FS_section_info_t *)node) < 0)
HDONE_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL, "can't free simple section node")
-#ifdef H5MF_ALLOC_DEBUG
-HDfprintf(stderr, "%s: Leaving, ret_value = %d\n", FUNC, ret_value);
-#endif /* H5MF_ALLOC_DEBUG */
+ hlog_fast(h5mf_free,
+ "%s: %p leave %d", __func__, (void *)f->shared, ret_value);
+
#ifdef H5MF_ALLOC_DEBUG_DUMP
H5MF__sects_dump(f, stderr);
#endif /* H5MF_ALLOC_DEBUG_DUMP */
FUNC_LEAVE_NOAPI_TAG(ret_value)
-} /* end H5MF_xfree() */
+}
+
+/* Worker for H5MF_xfree() and the deferred-free machinery: perform the
+ * real deallocation.
+ *
+ * Frees the file space via H5MF__xfree_inner_impl() and then, when a
+ * page buffer is configured and the freed region is at least one page
+ * in size, removes any page-buffer entries lying in the freed region.
+ *
+ * Return: SUCCEED/FAIL
+ */
+static herr_t
+H5MF__xfree_impl(H5F_t *f, H5FD_mem_t alloc_type, haddr_t addr, hsize_t size)
+{
+    herr_t ret_value = SUCCEED;
+
+    FUNC_ENTER_STATIC_TAG(H5AC__FREESPACE_TAG)
+
+    ret_value = H5MF__xfree_inner_impl(f, alloc_type, addr, size);
+
+    /* If the page buffer is enabled, notify it so that it can remove any
+     * pages that lie in the freed region.
+     *
+     * This is necessary in normal (AKA non VFD SWMR mode) as if single large
+     * metadata entry is allocated out of the freed space, writes to the
+     * entry will by-pass the page buffer.  If a dirty intersecting
+     * entry is left in the page buffer, it could introduce corruption
+     * if it is flushed after the large metadata entry is written.
+     *
+     * Further, in the VFD SWMR case, the large metadata entry will typically
+     * be buffered in the page buffer.  If an intersecting entry is left
+     * in the page buffer, in addition to causing potential corruption in
+     * the HDF5 file, it may also result in overlapping entries in the page
+     * buffer and metadata file index.
+     *
+     * It's ok to remove the page from the PB without flushing to
+     * the shadow file or to the underlying HDF5 file because any
+     * writes to the page in this tick have not yet become visible
+     * to the reader, and any writes to the page in previous ticks are
+     * recorded in the shadow file.
+     *
+     * Note: This is not the correct place for this call, as it is
+     * sometimes bypassed by a HGOTO_DONE earlier in the function.
+     * This causes the assertion failure in fheap when run at
+     * express test level 0.  Discuss with Vailin.
+     *
+     * JRM -- 4/28/20
+     */
+    if (ret_value == SUCCEED && f->shared->pb_ptr &&
+        size >= f->shared->fs_page_size) {
+
+        HDassert(H5F_SHARED_PAGED_AGGR(f->shared));
+
+        if (H5PB_remove_entries(f->shared, addr, size) < 0) {
+            HGOTO_ERROR(H5E_RESOURCE, H5E_CANTFREE, FAIL,
+                "can't remove entries from page buffer")
+        }
+    }
+done:
+    FUNC_LEAVE_NOAPI_TAG(ret_value)
+}
/*-------------------------------------------------------------------------
@@ -1293,9 +1459,10 @@ H5MF_try_extend(H5F_t *f, H5FD_mem_t alloc_type, haddr_t addr, hsize_t size,
htri_t ret_value = FALSE; /* Return value */
FUNC_ENTER_NOAPI_TAG(H5AC__FREESPACE_TAG, FAIL)
-#ifdef H5MF_ALLOC_DEBUG
-HDfprintf(stderr, "%s: Entering: alloc_type = %u, addr = %a, size = %Hu, extra_requested = %Hu\n", FUNC, (unsigned)alloc_type, addr, size, extra_requested);
-#endif /* H5MF_ALLOC_DEBUG */
+
+ hlog_fast(h5mf_extend, "%s: Entering: alloc_type %u addr %" PRIuHADDR
+ " size %" PRIuHSIZE " extra_requested %" PRIuHSIZE, __func__,
+ (unsigned)alloc_type, addr, size, extra_requested);
/* Sanity check */
HDassert(f);
@@ -1343,9 +1510,9 @@ HDfprintf(stderr, "%s: Entering: alloc_type = %u, addr = %a, size = %Hu, extra_r
/* Try extending the block at EOA */
if((ret_value = H5F__try_extend(f, map_type, end, extra_requested + frag_size)) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTEXTEND, FAIL, "error extending file")
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: extended = %t\n", FUNC, ret_value);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
+
+ hlog_fast(h5mf_extend, "%s: extended %s", __func__,
+ htri_to_string(ret_value));
/* If extending at EOA succeeds: */
/* for paged aggregation, put the fragment into the large-sized free-space manager */
@@ -1381,10 +1548,9 @@ HDfprintf(stderr, "%s: extended = %t\n", FUNC, ret_value);
if((ret_value = H5MF__aggr_try_extend(f, aggr, map_type, end, extra_requested)) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTEXTEND, FAIL, "error extending aggregation block")
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: H5MF__aggr_try_extend = %t\n", FUNC, ret_value);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
- } /* end if */
+ hlog_fast(h5mf_extend, "%s: H5MF__aggr_try_extend %s", __func__,
+ htri_to_string(ret_value));
+ }
/* If no extension so far, try to extend into a free-space section */
if(ret_value == FALSE && ((f->shared->fs_strategy == H5F_FSPACE_STRATEGY_FSM_AGGR) ||
@@ -1405,10 +1571,10 @@ HDfprintf(stderr, "%s: H5MF__aggr_try_extend = %t\n", FUNC, ret_value);
if(f->shared->fs_man[fs_type]) {
if((ret_value = H5FS_sect_try_extend(f, f->shared->fs_man[fs_type], addr, size, extra_requested, H5FS_ADD_RETURNED_SPACE, &udata)) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTEXTEND, FAIL, "error extending block in free space manager")
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: Try to H5FS_sect_try_extend = %t\n", FUNC, ret_value);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
- } /* end if */
+
+ hlog_fast(h5mf_extend, "%s: Try to H5FS_sect_try_extend %s",
+ __func__, htri_to_string(ret_value));
+ }
/* For paged aggregation and a metadata block: try to extend into page end threshold */
if(ret_value == FALSE && H5F_PAGED_AGGR(f) && map_type != H5FD_MEM_DRAW) {
@@ -1416,21 +1582,20 @@ HDfprintf(stderr, "%s: Try to H5FS_sect_try_extend = %t\n", FUNC, ret_value);
if(frag_size <= H5F_PGEND_META_THRES(f) && extra_requested <= frag_size)
ret_value = TRUE;
-#ifdef H5MF_ALLOC_DEBUG_MORE
-HDfprintf(stderr, "%s: Try to extend into the page end threshold = %t\n", FUNC, ret_value);
-#endif /* H5MF_ALLOC_DEBUG_MORE */
- } /* end if */
- } /* end if */
- } /* allow_extend */
+
+ hlog_fast(h5mf_extend, "%s: Try to extend into the page end threshold %s", __func__, htri_to_string(ret_value));
+ }
+ }
+ }
done:
/* Reset the ring in the API context */
if(orig_ring != H5AC_RING_INV)
H5AC_set_ring(orig_ring, NULL);
-#ifdef H5MF_ALLOC_DEBUG
-HDfprintf(stderr, "%s: Leaving: ret_value = %t\n", FUNC, ret_value);
-#endif /* H5MF_ALLOC_DEBUG */
+
+ hlog_fast(h5mf_extend, "%s: Leaving: ret_value %s", __func__, htri_to_string(ret_value));
+
#ifdef H5MF_ALLOC_DEBUG_DUMP
H5MF__sects_dump(f, stderr);
#endif /* H5MF_ALLOC_DEBUG_DUMP */
@@ -1465,9 +1630,9 @@ H5MF_try_shrink(H5F_t *f, H5FD_mem_t alloc_type, haddr_t addr, hsize_t size)
htri_t ret_value = FALSE; /* Return value */
FUNC_ENTER_NOAPI_TAG(H5AC__FREESPACE_TAG, FAIL)
-#ifdef H5MF_ALLOC_DEBUG
-HDfprintf(stderr, "%s: Entering - alloc_type = %u, addr = %a, size = %Hu\n", FUNC, (unsigned)alloc_type, addr, size);
-#endif /* H5MF_ALLOC_DEBUG */
+
+ hlog_fast(h5mf_shrink, "%s: Entering - alloc_type %u addr %" PRIuHADDR
+ " size %" PRIuHSIZE, __func__, (unsigned)alloc_type, addr, size);
/* check arguments */
HDassert(f);
@@ -1521,9 +1686,9 @@ done:
if(node && H5MF__sect_free((H5FS_section_info_t *)node) < 0)
HDONE_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL, "can't free simple section node")
-#ifdef H5MF_ALLOC_DEBUG
-HDfprintf(stderr, "%s: Leaving, ret_value = %d\n", FUNC, ret_value);
-#endif /* H5MF_ALLOC_DEBUG */
+ hlog_fast(h5mf_shrink, "%s: Leaving, ret_value %d", __func__,
+ ret_value);
+
FUNC_LEAVE_NOAPI_TAG(ret_value)
} /* end H5MF_try_shrink() */
@@ -1546,27 +1711,27 @@ H5MF_close(H5F_t *f)
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI_TAG(H5AC__FREESPACE_TAG, FAIL)
-#ifdef H5MF_ALLOC_DEBUG
-HDfprintf(stderr, "%s: Entering\n", FUNC);
-#endif /* H5MF_ALLOC_DEBUG */
+
+ hlog_fast(h5mf, "%s: entering", __func__);
/* check args */
HDassert(f);
HDassert(f->shared);
+ hlog_fast(h5mf, "%s: total deferred frees %" PRIu64, __func__,
+ H5MF_total_deferred_frees(f->shared));
+
if(H5F_PAGED_AGGR(f)) {
if((ret_value = H5MF__close_pagefs(f)) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTFREE, FAIL, "can't close free-space managers for 'page' file space")
- } /* end if */
- else {
+ } else {
if((ret_value = H5MF__close_aggrfs(f)) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTFREE, FAIL, "can't close free-space managers for 'aggr' file space")
} /* end else */
done:
-#ifdef H5MF_ALLOC_DEBUG
-HDfprintf(stderr, "%s: Leaving\n", FUNC);
-#endif /* H5MF_ALLOC_DEBUG */
+ hlog_fast(h5mf, "%s: leaving", __func__);
+
FUNC_LEAVE_NOAPI_TAG(ret_value)
} /* end H5MF_close() */
diff --git a/src/H5MFaggr.c b/src/H5MFaggr.c
index 0124555..476dacd 100644
--- a/src/H5MFaggr.c
+++ b/src/H5MFaggr.c
@@ -753,6 +753,9 @@ H5MF_free_aggrs(H5F_t *f)
HDassert(f->shared);
HDassert(f->shared->lf);
+ if (f->shared->closing && H5MF_process_deferred_frees(f, UINT64_MAX) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTFREE, FAIL, "could not process deferrals")
+
/* Retrieve metadata aggregator info, if available */
if(H5MF__aggr_query(f, &(f->shared->meta_aggr), &ma_addr, &ma_size) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTGET, FAIL, "can't query metadata aggregator stats")
diff --git a/src/H5MFprivate.h b/src/H5MFprivate.h
index acd773b..c7e9e6e 100644
--- a/src/H5MFprivate.h
+++ b/src/H5MFprivate.h
@@ -67,6 +67,8 @@ H5_DLL ssize_t H5MF_get_free_sections(H5F_t *f, H5FD_mem_t type, size_t nsects,
/* File 'temporary' space allocation routines */
H5_DLL haddr_t H5MF_alloc_tmp(H5F_t *f, hsize_t size);
+herr_t H5MF_process_deferred_frees(H5F_t *, uint64_t);
+
/* 'block aggregator' routines */
H5_DLL herr_t H5MF_free_aggrs(H5F_t *f);
H5_DLL htri_t H5MF_aggrs_try_shrink_eoa(H5F_t *f);
diff --git a/src/H5MFsection.c b/src/H5MFsection.c
index 715ece4..1b9e756 100644
--- a/src/H5MFsection.c
+++ b/src/H5MFsection.c
@@ -771,13 +771,16 @@ H5MF__sect_small_merge(H5FS_section_info_t **_sect1, H5FS_section_info_t *_sect2
if(H5MF_xfree(udata->f, udata->alloc_type, (*sect1)->sect_info.addr, (*sect1)->sect_info.size) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTFREE, FAIL, "can't free merged section")
- /* Need to free possible metadata page in the PB cache */
- /* This is in response to the data corruption bug from fheap.c with page buffering + page strategy */
- /* Note: Large metadata page bypasses the PB cache */
- /* Note: Update of raw data page (large or small sized) is handled by the PB cache */
- if(udata->f->shared->page_buf != NULL && udata->alloc_type != H5FD_MEM_DRAW)
+ /* Need to free possible raw/metadata page in the page buffer.
+ * This is in response to the data corruption bug from test/fheap.c
+ * when page buffering + page aggregation strategy are used.
+ * Note: Large raw/metadata page bypasses the page buffer.
+ * Note: Update of raw data page (large or small sized) is handled
+ * by the PB cache
+ */
+ if(udata->f->shared->pb_ptr != NULL)
if(H5PB_remove_entry(udata->f->shared, (*sect1)->sect_info.addr) < 0)
- HGOTO_ERROR(H5E_RESOURCE, H5E_CANTFREE, FAIL, "can't free merged section")
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTFREE, FAIL, "can't free merged section from page buffer")
if(H5MF__sect_free((H5FS_section_info_t *)(*sect1)) < 0)
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL, "can't free section node")
@@ -819,12 +822,21 @@ H5MF__sect_large_can_merge(const H5FS_section_info_t *_sect1,
const H5MF_free_section_t *sect2 = (const H5MF_free_section_t *)_sect2; /* File free section */
htri_t ret_value = FALSE; /* Return value */
- FUNC_ENTER_STATIC_NOERR
+ FUNC_ENTER_STATIC
/* Check arguments. */
HDassert(sect1);
HDassert(sect2);
HDassert(sect1->sect_info.type == sect2->sect_info.type); /* Checks "MERGE_SYM" flag */
+ if (!H5F_addr_lt(sect1->sect_info.addr, sect2->sect_info.addr)) {
+ fprintf(stderr, "%s.%d: sect1->sect_info.addr %" PRIuHADDR
+ ", sect2->sect_info.addr %" PRIuHADDR "\n", __func__, __LINE__,
+ sect1->sect_info.addr, sect2->sect_info.addr);
+ fprintf(stderr, "%s.%d: sect1->sect_info.size %" PRIuHSIZE
+ ", sect2->sect_info.size %" PRIuHSIZE "\n", __func__, __LINE__,
+ sect1->sect_info.size, sect2->sect_info.size);
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL, "can't merge")
+ }
HDassert(H5F_addr_lt(sect1->sect_info.addr, sect2->sect_info.addr));
ret_value = H5F_addr_eq(sect1->sect_info.addr + sect1->sect_info.size, sect2->sect_info.addr);
@@ -833,6 +845,7 @@ H5MF__sect_large_can_merge(const H5FS_section_info_t *_sect1,
HDfprintf(stderr, "%s: Leaving: ret_value = %t\n", FUNC, ret_value);
#endif /* H5MF_ALLOC_DEBUG_MORE */
+done:
FUNC_LEAVE_NOAPI(ret_value)
} /* H5MF__sect_large_can_merge() */
diff --git a/src/H5MV.c b/src/H5MV.c
new file mode 100644
index 0000000..5b73b9a
--- /dev/null
+++ b/src/H5MV.c
@@ -0,0 +1,721 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*-------------------------------------------------------------------------
+ *
+ * Created: H5MV.c
+ *
+ * Purpose: Free-space manager for VFD SWMR's metadata file
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/****************/
+/* Module Setup */
+/****************/
+
+#define H5F_FRIEND /*suppress error about including H5Fpkg */
+#define H5FS_FRIEND /*suppress error about including H5Fpkg */
+#include "H5MVmodule.h" /* This source code file is part of the H5MV module */
+
+
+/***********/
+/* Headers */
+/***********/
+#include "H5private.h" /* Generic Functions */
+#include "H5Eprivate.h" /* Error handling */
+#include "H5Fpkg.h" /* File access */
+#include "H5FSpkg.h" /* File free space */
+#include "H5Iprivate.h" /* IDs */
+#include "H5MVpkg.h" /* File memory management */
+#include "H5VMprivate.h" /* Vectors and arrays */
+
+#include "hlog.h"
+
+/****************/
+/* Local Macros */
+/****************/
+
+/* Define this to display debugging information for VFD SWMR */
+/* #define H5MV_VFD_SWMR_DEBUG */
+
+#define H5MV_FSPACE_SHRINK 80 /* Percent of "normal" size to shrink serialized free space size */
+#define H5MV_FSPACE_EXPAND 120 /* Percent of "normal" size to expand serialized free space size */
+
+/******************/
+/* Local Typedefs */
+/******************/
+
+/* User data for section info iterator callback for iterating over free space sections */
+typedef struct {
+ H5F_sect_info_t *sects; /* section info to be retrieved */
+ size_t sect_count; /* # of sections requested */
+ size_t sect_idx; /* the current count of sections */
+} H5MV_sect_iter_ud_t;
+
+
+/********************/
+/* Package Typedefs */
+/********************/
+
+
+/********************/
+/* Local Prototypes */
+/********************/
+
+static haddr_t H5MV__extend_md(H5F_shared_t *, hsize_t);
+
+/* Space allocation routines */
+H5_DLL haddr_t H5MV__alloc_md(H5F_shared_t *, hsize_t);
+H5_DLL htri_t H5MV__try_extend_md(H5F_shared_t *, haddr_t, hsize_t);
+
+/*********************/
+/* Package Variables */
+/*********************/
+
+/* Package initialization variable */
+hbool_t H5_PKG_INIT_VAR = FALSE;
+
+
+/*****************************/
+/* Library Private Variables */
+/*****************************/
+
+
+/*******************/
+/* Local Variables */
+/*******************/
+
+HLOG_OUTLET_SHORT_DEFN(h5mv, all);
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__create()
+ *
+ * Purpose: Create free space manager for the metadata file by creating
+ * a free-space structure
+ *
+ * Return: Success: non-negative
+ * Failure: negative
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5MV__create(H5F_t *f)
+{
+ H5F_shared_t *shared = f->shared;
+ /* Free space section classes implemented for file */
+ const H5FS_section_class_t *classes[] = { H5MV_FSPACE_SECT_CLS_SIMPLE };
+ H5FS_create_t fs_create; /* Free space creation parameters */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /*
+ * Check arguments.
+ */
+ HDassert(shared->fs_state_md == H5F_FS_STATE_CLOSED);
+
+ /* Set the free space creation parameters */
+ fs_create.client = H5FS_CLIENT_MD_VFD_ID;
+ fs_create.shrink_percent = H5MV_FSPACE_SHRINK;
+ fs_create.expand_percent = H5MV_FSPACE_EXPAND;
+ fs_create.max_sect_addr = 1 + H5VM_log2_gen((uint64_t)shared->maxaddr);
+ fs_create.max_sect_size = shared->maxaddr;
+
+ if(NULL == (shared->fs_man_md = H5FS_create(f, NULL, &fs_create, NELMTS(classes), classes, f, shared->fs_page_size, shared->fs_page_size)))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINIT, FAIL, "can't initialize free space info")
+
+ /* Set the state for the free space manager to "open", if it is now */
+ if(shared->fs_man_md)
+ shared->fs_state_md = H5F_FS_STATE_OPEN;
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5MV__create() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV_close
+ *
+ * Purpose: Close the free space manager for the metadata file
+ *
+ * Return: Success: non-negative
+ * Failure: negative
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5MV_close(H5F_t *f)
+{
+ H5F_shared_t *shared = f->shared;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /*
+ * Check arguments.
+ */
+ HDassert(shared);
+#ifdef H5MV_VFD_SWMR_DEBUG
+HDfprintf(stderr, "%s: Trying to close free space manager\n", FUNC);
+#endif
+
+ /* Close an existing free space structure for the file */
+ if(shared->fs_man_md) {
+#ifdef H5MV_VFD_SWMR_DEBUG
+HDfprintf(stderr, "%s: Going to close free space manager\n", FUNC);
+#endif
+ HDassert(shared->fs_state_md != H5F_FS_STATE_CLOSED);
+
+ if(H5FS_close(f, shared->fs_man_md) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL, "can't release free space info")
+ }
+
+ shared->fs_man_md = NULL;
+ shared->fs_state_md = H5F_FS_STATE_CLOSED;
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5MV_close() */
+
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__find_sect
+ *
+ * Purpose: To find a section from the specified free-space manager
+ * to fulfill the request.
+ * If found, re-add the left-over space back to the manager.
+ *
+ * Return: TRUE if a section is found to fulfill the request
+ * FALSE if not
+ *
+ *-------------------------------------------------------------------------
+ */
+htri_t
+H5MV__find_sect(H5F_t *f, hsize_t size, H5FS_t *fspace, haddr_t *addr)
+{
+ H5MV_free_section_t *node; /* Free space section pointer */
+ htri_t ret_value = FAIL; /* Whether an existing free list node was found */
+
+ FUNC_ENTER_PACKAGE
+
+ HDassert(f);
+ HDassert(fspace);
+
+ /* Try to get a section from the free space manager */
+ if((ret_value = H5FS_sect_find(f, fspace, size, (H5FS_section_info_t **)&node)) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "error locating free space in file")
+
+#ifdef H5MV_VFD_SWMR_DEBUG
+HDfprintf(stderr, "%s: section found = %t\n", FUNC, ret_value);
+#endif
+
+ /* Check for actually finding section */
+ if(ret_value) {
+ /* Sanity check */
+ HDassert(node);
+
+ /* Retrieve return value */
+ if(addr)
+ *addr = node->sect_info.addr;
+
+ /* Check for eliminating the section */
+ if(node->sect_info.size == size) {
+#ifdef H5MV_VFD_SWMR_DEBUG
+HDfprintf(stderr, "%s: freeing node\n", FUNC);
+#endif
+
+ /* Free section node */
+ if(H5MV__sect_free(&node->sect_info) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL, "can't free simple section node")
+ } /* end if */
+ else {
+ /* Adjust information for section */
+ node->sect_info.addr += size;
+ node->sect_info.size -= size;
+
+#ifdef H5MV_VFD_SWMR_DEBUG
+HDfprintf(stderr, "%s: adding node, node->sect_info.addr = %a, node->sect_info.size = %Hu\n", FUNC, node->sect_info.addr, node->sect_info.size);
+#endif
+
+ /* Re-add the section to the free-space manager */
+ if(H5FS_sect_add(f, fspace, &node->sect_info, H5FS_ADD_RETURNED_SPACE, f) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINSERT, FAIL, "can't re-add section to file free space")
+ } /* end else */
+ } /* end if */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5MV__find_sect() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV_alloc
+ *
+ * Purpose: Allocate SIZE bytes of file memory and return the relative
+ * address where that contiguous chunk of file memory exists.
+ *
+ * Return: Success: The file address of new chunk.
+ * Failure: HADDR_UNDEF
+ *
+ *-------------------------------------------------------------------------
+ */
+haddr_t
+H5MV_alloc(H5F_t *f, hsize_t size)
+{
+ H5F_shared_t *shared = f->shared;
+ haddr_t eoa; /* EOA for the file */
+ hsize_t frag_size = 0; /* Fragment size */
+ hsize_t misalign_size = 0; /* Mis-aligned size */
+ H5MV_free_section_t *node = NULL; /* Free space section pointer */
+ haddr_t ret_value = HADDR_UNDEF; /* Return value */
+
+ FUNC_ENTER_NOAPI(HADDR_UNDEF)
+ hlog_fast(h5mv, "%s: enter size %" PRIuHSIZE, __func__, size);
+
+ /* check arguments */
+ HDassert(shared->vfd_swmr_md_fd >= 0);
+ HDassert(size > 0);
+
+ /* Search for large enough space in the free space manager */
+ if(shared->fs_man_md != NULL) {
+ if(H5MV__find_sect(f, size, shared->fs_man_md, &ret_value) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, HADDR_UNDEF, "error locating a node")
+ }
+
+ /* If no space is found from the free-space manager or no free-space manager, extend md's EOF */
+ if(!H5F_addr_defined(ret_value)) {
+
+ /* Get the EOA for the metadata file */
+ if(HADDR_UNDEF == (eoa = H5MV_get_vfd_swmr_md_eoa(shared)))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTGET, HADDR_UNDEF, "Unable to get eoa for VFD SWMR metadata file")
+
+ /* If EOA is mis-aligned, calculate the fragment size */
+ if(H5F_addr_gt(eoa, 0) && (misalign_size = eoa % shared->fs_page_size))
+ frag_size = shared->fs_page_size - misalign_size;
+
+ /* Allocate from end of file */
+ if(HADDR_UNDEF == (ret_value = H5MV__alloc_md(shared, size + frag_size)))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, HADDR_UNDEF, "allocation failed")
+
+ /* If there is a mis-aligned fragment at EOA */
+ if(frag_size) {
+ /* Start up the free-space manager if not so */
+ if(shared->fs_man_md == NULL) {
+ if(H5MV__create(f) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINIT, HADDR_UNDEF, "can't initialize free space manager")
+ }
+ HDassert(shared->fs_man_md);
+
+ /* Create the free-space section for the fragment */
+ if(NULL == (node = H5MV__sect_new(eoa, frag_size)))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINIT, HADDR_UNDEF, "can't initialize free space section")
+
+ /* Add the section */
+ if(H5FS_sect_add(f, shared->fs_man_md, &node->sect_info, H5FS_ADD_RETURNED_SPACE, f) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINSERT, HADDR_UNDEF, "can't re-add section to file free space")
+
+ node = NULL;
+ }
+ ret_value += frag_size;
+
+ } /* end if */
+
+ HDassert(H5F_addr_defined(ret_value));
+
+done:
+ hlog_fast(h5mv, "%s: leave addr %" PRIuHADDR " size %" PRIuHSIZE,
+ __func__, ret_value, size);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5MV_alloc() */
+
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV_free
+ *
+ * Purpose: Frees part of a file, making that part of the file
+ * available for reuse.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5MV_free(H5F_t *f, haddr_t addr, hsize_t size)
+{
+ H5F_shared_t *shared = f->shared;
+ H5MV_free_section_t *node = NULL; /* Free space section pointer */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ hlog_fast(h5mv, "%s: enter addr %" PRIuHADDR " size %" PRIuHSIZE,
+ __func__, addr, size);
+
+ /* check arguments */
+ HDassert(f);
+ if(!H5F_addr_defined(addr) || 0 == size)
+ HGOTO_DONE(SUCCEED)
+ HDassert(addr != 0);
+
+
+ /* Check if the free space manager for the file has been initialized */
+ if(shared->fs_man_md == NULL) {
+ /* If there's no free space manager for objects of this type,
+ * see if we can avoid creating one by checking if the freed
+ * space is at the end of the file
+ */
+ htri_t status; /* "can absorb" status for section into */
+
+ hlog_fast(h5mv, "%s: trying to avoid starting up free space manager",
+ __func__);
+
+ /* Try to shrink the file */
+ if((status = H5MV_try_shrink(f, addr, size)) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTMERGE, FAIL, "can't check for absorbing block")
+ else if(status > 0)
+ HGOTO_DONE(SUCCEED)
+
+ /* If we are deleting the free space manager, leave now, to avoid
+ * [re-]starting it: dropping free space section on the floor.
+ */
+ if(shared->fs_state_md == H5F_FS_STATE_DELETING) {
+ hlog_fast(h5mv, "%s: dropping addr %" PRIuHADDR
+ " size %" PRIuHSIZE " on the floor!", __func__, addr, size);
+ HGOTO_DONE(SUCCEED)
+ }
+
+ /* There's either already a free space manager, or the freed
+ * space isn't at the end of the file, so start up (or create)
+ * the file space manager
+ */
+ if(H5MV__create(f) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINIT, FAIL, "can't initialize free space manager")
+ }
+
+ /* Create the free-space section for the freed section */
+ if(NULL == (node = H5MV__sect_new(addr, size)))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINIT, FAIL, "can't initialize free space section")
+
+ HDassert(shared->fs_man_md);
+
+ hlog_fast(h5mv, "%s: before H5FS_sect_add, addr %" PRIuHADDR
+ " size %" PRIuHSIZE, __func__, addr, size);
+
+ /* Add the section */
+ if(H5FS_sect_add(f, shared->fs_man_md, &node->sect_info, H5FS_ADD_RETURNED_SPACE, f) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINSERT, FAIL, "can't re-add section to file free space")
+
+ node = NULL;
+
+ hlog_fast(h5mv, "%s: after H5FS_sect_add", __func__);
+
+done:
+ /* Release section node, if allocated and not added to section list or merged */
+ if(node)
+ if(H5MV__sect_free(&node->sect_info) < 0)
+ HDONE_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL, "can't free simple section node")
+
+ hlog_fast(h5mv, "%s: leave %d", __func__, ret_value);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5MV_free() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV_try_extend
+ *
+ * Purpose: Extend a block at EOA in the file if possible.
+ *
+ * Return: Success: TRUE(1) - Block was extended
+ * FALSE(0) - Block could not be extended
+ * Failure: FAIL
+ *
+ *-------------------------------------------------------------------------
+ */
+htri_t
+H5MV_try_extend(H5F_t *f, haddr_t addr, hsize_t size, hsize_t extra_requested)
+{
+ H5F_shared_t *shared = f->shared;
+ haddr_t end; /* End of block to extend */
+ htri_t ret_value = FALSE; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+#ifdef H5MV_VFD_SWMR_DEBUG
+HDfprintf(stderr, "%s: Entering: addr = %a, size = %Hu, extra_requested = %Hu\n", FUNC, addr, size, extra_requested);
+#endif
+
+ /* Sanity check */
+ HDassert(f);
+ HDassert(H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR);
+
+ /* Compute end of block to extend */
+ end = addr + size;
+
+ /* Try extending the block at EOA */
+ if((ret_value = H5MV__try_extend_md(shared, end, extra_requested)) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTEXTEND, FAIL, "error extending file")
+#ifdef H5MV_VFD_SWMR_DEBUG
+HDfprintf(stderr, "%s: extended = %t\n", FUNC, ret_value);
+#endif
+
+ /* If no extension so far, try to extend into a free-space section */
+ if(ret_value == FALSE) {
+
+ /* Try to extend the block into a free-space section */
+ if(shared->fs_man_md) {
+ if((ret_value = H5FS_sect_try_extend(f, shared->fs_man_md, addr, size, extra_requested, H5FS_ADD_RETURNED_SPACE, NULL)) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTEXTEND, FAIL, "error extending block in free space manager")
+#ifdef H5MV_VFD_SWMR_DEBUG
+HDfprintf(stderr, "%s: Try to H5FS_sect_try_extend = %t\n", FUNC, ret_value);
+#endif
+ } /* end if */
+
+ } /* end if */
+
+done:
+#ifdef H5MV_VFD_SWMR_DEBUG
+HDfprintf(stderr, "%s: Leaving: ret_value = %t\n", FUNC, ret_value);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5MV_try_extend() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV_try_shrink
+ *
+ * Purpose: Try to shrink the size of a file with a block
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+htri_t
+H5MV_try_shrink(H5F_t *f, haddr_t addr, hsize_t size)
+{
+ H5F_shared_t *shared = f->shared;
+ H5MV_free_section_t *node = NULL; /* Free space section pointer */
+ htri_t ret_value = FALSE; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+#ifdef H5MV_VFD_SWMR_DEBUG
+HDfprintf(stderr, "%s: Entering - addr = %a, size = %Hu\n", FUNC, addr, size);
+#endif
+
+ /* check arguments */
+ HDassert(shared->lf);
+ HDassert(H5F_addr_defined(addr));
+ HDassert(size > 0);
+
+ /* Create free-space section for block */
+ if(NULL == (node = H5MV__sect_new(addr, size)))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTINIT, FAIL, "can't initialize free space section")
+
+ /* Check if the block can shrink the container */
+ if((ret_value = H5MV__sect_can_shrink(&node->sect_info, f)) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTMERGE, FAIL, "can't check if section can shrink container")
+ else if(ret_value > 0) {
+ /* Shrink or absorb the section */
+ if(H5MV__sect_shrink((H5FS_section_info_t **)&node, f) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTSHRINK, FAIL, "can't shrink container")
+ } /* end if */
+
+done:
+ /* Free section node allocated */
+ if(node && H5MV__sect_free(&node->sect_info) < 0)
+ HDONE_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL, "can't free simple section node")
+
+#ifdef H5MV_VFD_SWMR_DEBUG
+HDfprintf(stderr, "%s: Leaving, ret_value = %d\n", FUNC, ret_value);
+#endif
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5MV_try_shrink() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__free_md
+ *
+ * Purpose: Release space at the end of the metadata file's allocated space
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5MV__free_md(H5F_shared_t *shared, haddr_t addr, hsize_t size)
+{
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Check args */
+ HDassert(shared);
+ HDassert(size > 0);
+
+ /* Sanity checking */
+ if(!H5F_addr_defined(addr))
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid file offset")
+
+ /* XXX what's maxaddr used here for? */
+ if(addr > shared->maxaddr || H5F_addr_overflow(addr, size) || (addr + size) > shared->maxaddr)
+ HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid file free space region to free")
+
+ /* Check if this free block is at the end of file allocated space.
+ * Truncate it if this is true.
+ */
+ if(shared->vfd_swmr_md_eoa == (addr + size))
+ shared->vfd_swmr_md_eoa = addr;
+ else {
+ /* leak memory */
+#ifdef H5MV_VFD_SWMR_DEBUG
+HDfprintf(stderr, "%s: LEAKED MEMORY!!! addr = %a, size = %Hu\n", FUNC, addr, size);
+#endif
+ } /* end else */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5MV__free_md() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__alloc_md
+ *
+ * Purpose: Allocate space at the end of the metadata file
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ *-------------------------------------------------------------------------
+ */
+haddr_t
+H5MV__alloc_md(H5F_shared_t *shared, hsize_t size)
+{
+ haddr_t ret_value = HADDR_UNDEF; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* check args */
+ HDassert(shared);
+ HDassert(size > 0);
+
+ /* Extend the EOA space of the metadata file */
+ ret_value = H5MV__extend_md(shared, size);
+
+ if(!H5F_addr_defined(ret_value))
+ HGOTO_ERROR(H5E_VFL, H5E_NOSPACE, HADDR_UNDEF, "driver eoa update request failed")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5MV__alloc_md() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__try_extend_md
+ *
+ * Purpose: Try to extend a block at the end of the metadata file, if possible.
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ *-------------------------------------------------------------------------
+ */
+htri_t
+H5MV__try_extend_md(H5F_shared_t *shared, haddr_t blk_end, hsize_t extra_requested)
+{
+ htri_t ret_value = FALSE; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* check args */
+ HDassert(shared);
+ HDassert(extra_requested > 0);
+
+ /* Check if the block is exactly at the end of the file */
+ if(H5F_addr_eq(blk_end, shared->vfd_swmr_md_eoa)) {
+
+ /* Extend the EOA space of the metadata file */
+ if(HADDR_UNDEF == H5MV__extend_md(shared, extra_requested))
+ HGOTO_ERROR(H5E_FILE, H5E_CANTEXTEND, FAIL, "driver extend request failed")
+
+ /* Indicate success */
+ ret_value = TRUE;
+ }
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5MV__try_extend_md() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__extend_md
+ *
+ * Purpose: Extend the EOA space of the metadata file.
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ *-------------------------------------------------------------------------
+ */
+static haddr_t
+H5MV__extend_md(H5F_shared_t *shared, hsize_t size)
+{
+ haddr_t eoa;
+ haddr_t ret_value = HADDR_UNDEF; /* Return value */
+
+ FUNC_ENTER_NOAPI_NOINIT
+
+ /* Get current end-of-allocated space address */
+ eoa = shared->vfd_swmr_md_eoa;
+
+ /* Check for overflow when extending */
+ /* XXX why does this check maxaddr? That should have no bearing on
+ * the metadata file.
+ */
+ if(H5F_addr_overflow(eoa, size) || (eoa + size) > shared->maxaddr)
+ HGOTO_ERROR(H5E_FILE, H5E_NOSPACE, HADDR_UNDEF, "file allocation request failed")
+
+ /* Set the address to return */
+ ret_value = eoa;
+
+ /* Extend the end-of-allocated space address */
+ shared->vfd_swmr_md_eoa += size;
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV_get_vfd_swmr_md_eoa
+ *
+ * Purpose: Quick and dirty routine to retrieve the EOA for the metadata file
+ * (Mainly added to stop non-file routines from poking about in the
+ * H5F_t data structure)
+ *
+ * Return: The EOA for the metadata file
+ *-------------------------------------------------------------------------
+ */
+haddr_t
+H5MV_get_vfd_swmr_md_eoa(const H5F_shared_t *shared)
+{
+ /* Use FUNC_ENTER_NOAPI_NOINIT_NOERR here to avoid performance issues */
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ HDassert(shared->vfd_swmr);
+
+ FUNC_LEAVE_NOAPI(shared->vfd_swmr_md_eoa)
+}
diff --git a/src/H5MVmodule.h b/src/H5MVmodule.h
new file mode 100644
index 0000000..5a95767
--- /dev/null
+++ b/src/H5MVmodule.h
@@ -0,0 +1,33 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*
+ * Programmer: Quincey Koziol <koziol@hdfgroup.org>
+ * Saturday, September 12, 2015
+ *
+ * Purpose: This file contains declarations which define macros for the
+ * H5MV package. Including this header means that the source file
+ * is part of the H5MV package.
+ */
+#ifndef _H5MVmodule_H
+#define _H5MVmodule_H
+
+/* Define the proper control macros for the generic FUNC_ENTER/LEAVE and error
+ * reporting macros.
+ */
+#define H5MV_MODULE
+#define H5_MY_PKG H5MV
+#define H5_MY_PKG_ERR H5E_RESOURCE
+#define H5_MY_PKG_INIT NO
+
+#endif /* _H5MVmodule_H */
+
diff --git a/src/H5MVpkg.h b/src/H5MVpkg.h
new file mode 100644
index 0000000..cb29879
--- /dev/null
+++ b/src/H5MVpkg.h
@@ -0,0 +1,85 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*
+ * Programmer: Quincey Koziol <koziol@hdfgroup.org>
+ * Tuesday, January 8, 2008
+ *
+ * Purpose: This file contains declarations which are visible only within
+ * the H5MV package. Source files outside the H5MV package should
+ * include H5MVprivate.h instead.
+ */
+#if !(defined H5MV_FRIEND || defined H5MV_MODULE)
+#error "Do not include this file outside the H5MV package!"
+#endif
+
+#ifndef _H5MVpkg_H
+#define _H5MVpkg_H
+
+/* Get package's private header */
+#include "H5MVprivate.h"
+
+/* Other private headers needed by this file */
+#include "H5FSprivate.h" /* File free space */
+
+
+/**************************/
+/* Package Private Macros */
+/**************************/
+
+/* Define this to display information about file allocations */
+/* #define H5MV_VFD_SWMR_DEBUG */
+
+/* Free-space section types for file */
+/* (values stored in free space data structures in file) */
+#define H5MV_FSPACE_SECT_SIMPLE 0 /* For non-paged aggregation: section is a range of actual bytes in file */
+
+/****************************/
+/* Package Private Typedefs */
+/****************************/
+
+/* File free space section info */
+typedef struct H5MV_free_section_t {
+ H5FS_section_info_t sect_info; /* Free space section information (must be first in struct) */
+} H5MV_free_section_t;
+
+/*****************************/
+/* Package Private Variables */
+/*****************************/
+
+/* H5MF single section inherits serializable properties from H5FS_section_class_t */
+H5_DLLVAR H5FS_section_class_t H5MV_FSPACE_SECT_CLS_SIMPLE[1];
+
+
+/******************************/
+/* Package Private Prototypes */
+/******************************/
+
+H5_DLL htri_t H5MV__find_sect(H5F_t *f, hsize_t size, H5FS_t *fspace, haddr_t *addr);
+H5_DLL herr_t H5MV__create(H5F_t *f);
+
+/* free-space section routines */
+H5_DLL H5MV_free_section_t *H5MV__sect_new(haddr_t sect_off, hsize_t sect_size);
+H5_DLL herr_t H5MV__sect_free(H5FS_section_info_t *sect);
+H5_DLL htri_t H5MV__sect_can_shrink(const H5FS_section_info_t *_sect, void *udata);
+H5_DLL herr_t H5MV__sect_shrink(H5FS_section_info_t **_sect, void *udata);
+H5_DLL haddr_t H5MV_get_vfd_swmr_md_eoa(const H5F_shared_t *);
+H5_DLL herr_t H5MV__free_md(H5F_shared_t *, haddr_t, hsize_t);
+
+
+/* Testing routines */
+#ifdef H5MV_TESTING
+#endif /* H5MV_TESTING */
+
+#endif /* _H5MVpkg_H */
+
diff --git a/src/H5MVprivate.h b/src/H5MVprivate.h
new file mode 100644
index 0000000..2c0e95a
--- /dev/null
+++ b/src/H5MVprivate.h
@@ -0,0 +1,58 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*-------------------------------------------------------------------------
+ *
+ * Created: H5MVprivate.h
+ *
+ * Purpose: Private header file for file memory management.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _H5MVprivate_H
+#define _H5MVprivate_H
+
+/* Private headers needed by this file */
+#include "H5Fprivate.h" /* File access */
+
+/**************************/
+/* Library Private Macros */
+/**************************/
+
+
+/****************************/
+/* Library Private Typedefs */
+/****************************/
+
+
+/*****************************/
+/* Library-private Variables */
+/*****************************/
+
+
+/***************************************/
+/* Library-private Function Prototypes */
+/***************************************/
+
+/* File space manager routines */
+
+H5_DLL herr_t H5MV_close(H5F_t *f);
+
+/* File space allocation routines */
+H5_DLL haddr_t H5MV_alloc(H5F_t *f, hsize_t size);
+H5_DLL herr_t H5MV_free(H5F_t *f, haddr_t addr, hsize_t size);
+H5_DLL htri_t H5MV_try_extend(H5F_t *f, haddr_t addr, hsize_t size, hsize_t extra_requested);
+H5_DLL htri_t H5MV_try_shrink(H5F_t *f, haddr_t addr, hsize_t size);
+
+#endif /* end _H5MVprivate_H */
+
diff --git a/src/H5MVsection.c b/src/H5MVsection.c
new file mode 100644
index 0000000..81f1a00
--- /dev/null
+++ b/src/H5MVsection.c
@@ -0,0 +1,395 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*
+ * Programmer: Quincey Koziol <koziol@hdfgroup.org>
+ * Tuesday, January 8, 2008
+ *
+ * Purpose: Free space section callbacks for file.
+ *
+ */
+
+/****************/
+/* Module Setup */
+/****************/
+
+#define H5F_FRIEND /*suppress error about including H5Fpkg */
+#include "H5MVmodule.h"         /* This source code file is part of the H5MV module */
+
+
+/***********/
+/* Headers */
+/***********/
+#include "H5private.h" /* Generic Functions */
+#include "H5Eprivate.h" /* Error handling */
+#include "H5Fpkg.h" /* File access */
+#include "H5MVpkg.h" /* File memory management */
+
+
+/****************/
+/* Local Macros */
+/****************/
+
+
+/******************/
+/* Local Typedefs */
+/******************/
+
+
+/********************/
+/* Package Typedefs */
+/********************/
+
+
+/********************/
+/* Local Prototypes */
+/********************/
+
+/* 'simple' section callbacks */
+static htri_t H5MV__sect_can_merge(const H5FS_section_info_t *sect1,
+ const H5FS_section_info_t *sect2, void *udata);
+static herr_t H5MV__sect_merge(H5FS_section_info_t **sect1,
+ H5FS_section_info_t *sect2, void *udata);
+static herr_t H5MV__sect_valid(const H5FS_section_class_t *cls,
+ const H5FS_section_info_t *sect);
+static H5FS_section_info_t *H5MV__sect_split(H5FS_section_info_t *sect,
+ hsize_t frag_size);
+
+
+
+/*********************/
+/* Package Variables */
+/*********************/
+
+/* Class info for "simple" free space sections */
+H5FS_section_class_t H5MV_FSPACE_SECT_CLS_SIMPLE[1] = {{
+ /* Class variables */
+ H5MV_FSPACE_SECT_SIMPLE, /* Section type */
+ 0, /* Extra serialized size */
+ H5FS_CLS_MERGE_SYM | H5FS_CLS_ADJUST_OK | H5FS_CLS_GHOST_OBJ, /* Class flags */
+ NULL, /* Class private info */
+
+ /* Class methods */
+ NULL, /* Initialize section class */
+ NULL, /* Terminate section class */
+
+ /* Object methods */
+ NULL, /* Add section */
+ NULL, /* Serialize section */
+ NULL, /* Deserialize section */
+ H5MV__sect_can_merge, /* Can sections merge? */
+ H5MV__sect_merge, /* Merge sections */
+ H5MV__sect_can_shrink, /* Can section shrink container?*/
+ H5MV__sect_shrink, /* Shrink container w/section */
+ H5MV__sect_free, /* Free section */
+ H5MV__sect_valid, /* Check validity of section */
+ H5MV__sect_split, /* Split section node for alignment */
+ NULL, /* Dump debugging for section */
+}};
+
+
+/*****************************/
+/* Library Private Variables */
+/*****************************/
+
+
+/*******************/
+/* Local Variables */
+/*******************/
+
+/* Declare a free list to manage the H5MF_free_section_t struct */
+H5FL_DEFINE(H5MV_free_section_t);
+
+/*
+ * "simple" section callbacks
+ */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__sect_new
+ *
+ * Purpose: Create a new section and return it to the caller
+ *
+ * Return: Pointer to new section on success/NULL on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+H5MV_free_section_t *
+H5MV__sect_new(haddr_t sect_off, hsize_t sect_size)
+{
+ H5MV_free_section_t *sect; /* 'Simple' free space section to add */
+ H5MV_free_section_t *ret_value = NULL; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Check arguments. */
+ HDassert(sect_size);
+
+ /* Create free space section node */
+ if(NULL == (sect = H5FL_MALLOC(H5MV_free_section_t)))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed for free section node")
+
+ /* Set the information passed in */
+ sect->sect_info.addr = sect_off;
+ sect->sect_info.size = sect_size;
+
+ /* Set the section's class & state */
+ sect->sect_info.type = H5MV_FSPACE_SECT_SIMPLE;
+ sect->sect_info.state = H5FS_SECT_LIVE;
+
+ /* Set return value */
+ ret_value = sect;
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5MV__sect_new() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__sect_free
+ *
+ * Purpose: Free a 'simple' section node
+ *
+ * Return: Success: non-negative
+ * Failure: negative
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5MV__sect_free(H5FS_section_info_t *_sect)
+{
+ H5MV_free_section_t *sect = (H5MV_free_section_t *)_sect; /* File free section */
+
+ FUNC_ENTER_PACKAGE_NOERR
+
+ /* Check arguments. */
+ HDassert(sect);
+
+ /* Release the section */
+ sect = H5FL_FREE(H5MV_free_section_t, sect);
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+} /* H5MV__sect_free() */
+
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__sect_can_merge
+ *
+ * Purpose: Can two sections of this type merge?
+ *
+ * Note: Second section must be "after" first section
+ *
+ * Return: Success: non-negative (TRUE/FALSE)
+ * Failure: negative
+ *
+ *-------------------------------------------------------------------------
+ */
+static htri_t
+H5MV__sect_can_merge(const H5FS_section_info_t *_sect1,
+ const H5FS_section_info_t *_sect2, void H5_ATTR_UNUSED *_udata)
+{
+ const H5MV_free_section_t *sect1 = (const H5MV_free_section_t *)_sect1; /* File free section */
+ const H5MV_free_section_t *sect2 = (const H5MV_free_section_t *)_sect2; /* File free section */
+ htri_t ret_value = FAIL; /* Return value */
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Check arguments. */
+ HDassert(sect1);
+ HDassert(sect2);
+ HDassert(sect1->sect_info.type == sect2->sect_info.type); /* Checks "MERGE_SYM" flag */
+ HDassert(H5F_addr_lt(sect1->sect_info.addr, sect2->sect_info.addr));
+
+ /* Check if second section adjoins first section */
+ ret_value = H5F_addr_eq(sect1->sect_info.addr + sect1->sect_info.size, sect2->sect_info.addr);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5MV__sect_can_merge() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__sect_merge
+ *
+ * Purpose: Merge two sections of this type
+ *
+ * Note: Second section always merges into first node
+ *
+ * Return: Success: non-negative
+ * Failure: negative
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5MV__sect_merge(H5FS_section_info_t **_sect1, H5FS_section_info_t *_sect2,
+ void H5_ATTR_UNUSED *_udata)
+{
+ H5MV_free_section_t **sect1 = (H5MV_free_section_t **)_sect1; /* File free section */
+ H5MV_free_section_t *sect2 = (H5MV_free_section_t *)_sect2; /* File free section */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Check arguments. */
+ HDassert(sect1);
+ HDassert((*sect1)->sect_info.type == H5MV_FSPACE_SECT_SIMPLE);
+ HDassert(sect2);
+ HDassert(sect2->sect_info.type == H5MV_FSPACE_SECT_SIMPLE);
+ HDassert(H5F_addr_eq((*sect1)->sect_info.addr + (*sect1)->sect_info.size, sect2->sect_info.addr));
+
+ /* Add second section's size to first section */
+ (*sect1)->sect_info.size += sect2->sect_info.size;
+
+ /* Get rid of second section */
+ if(H5MV__sect_free((H5FS_section_info_t *)sect2) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL, "can't free section node")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5MV__sect_merge() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__sect_can_shrink
+ *
+ * Purpose: Can this section shrink the container?
+ *
+ * Return: Success: non-negative (TRUE/FALSE)
+ * Failure: negative
+ *
+ *-------------------------------------------------------------------------
+ */
+htri_t
+H5MV__sect_can_shrink(const H5FS_section_info_t *_sect, void *_udata)
+{
+ const H5MV_free_section_t *sect = (const H5MV_free_section_t *)_sect; /* File free section */
+ H5F_t *f = (H5F_t *)_udata;
+ H5F_shared_t *shared = f->shared;
+ haddr_t eoa; /* End of address space in the file */
+ haddr_t end; /* End of section to extend */
+ htri_t ret_value = FALSE; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Check arguments. */
+ HDassert(sect);
+
+    /* Retrieve the end of the file's address space */
+ if(HADDR_UNDEF == (eoa = H5MV_get_vfd_swmr_md_eoa(shared)))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTGET, FAIL, "get_eoa request for VFD SWMR metadata file failed")
+
+ /* Compute address of end of section to check */
+ end = sect->sect_info.addr + sect->sect_info.size;
+
+ /* Check if the section is exactly at the end of the allocated space in the file */
+ if(H5F_addr_eq(end, eoa))
+ /* Indicate shrinking can occur */
+ ret_value = TRUE;
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5MV__sect_can_shrink() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__sect_shrink
+ *
+ * Purpose: Shrink container with section
+ *
+ * Return: Success: non-negative
+ * Failure: negative
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5MV__sect_shrink(H5FS_section_info_t **_sect, void *_udata)
+{
+ H5F_t *f = (H5F_t *)_udata;
+ H5F_shared_t *shared = f->shared;
+ H5MV_free_section_t **sect = (H5MV_free_section_t **)_sect; /* File free section */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Check arguments. */
+ HDassert(sect);
+ HDassert(H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR);
+
+ /* Release section's space at EOA */
+ if(H5MV__free_md(shared, (*sect)->sect_info.addr, (*sect)->sect_info.size) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTFREE, FAIL, "free request for VFD SWMR metadata file failed")
+
+ /* Free the section */
+ if(H5MV__sect_free(&(*sect)->sect_info) < 0)
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTRELEASE, FAIL, "can't free simple section node")
+
+ /* Mark section as freed, for free space manager */
+ *sect = NULL;
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5MV__sect_shrink() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__sect_valid
+ *
+ * Purpose: Check the validity of a section
+ *
+ * Return: Success: non-negative
+ * Failure: negative
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5MV__sect_valid(const H5FS_section_class_t H5_ATTR_UNUSED *cls, const H5FS_section_info_t *_sect)
+{
+ const H5MV_free_section_t *sect = (const H5MV_free_section_t *)_sect; /* File free section */
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Check arguments. */
+ HDassert(sect);
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+} /* H5MV__sect_valid() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5MV__sect_split
+ *
+ * Purpose: Split SECT into 2 sections: fragment for alignment & the aligned section
+ * SECT's addr and size are updated to point to the aligned section
+ *
+ * Return: Success: the fragment for aligning sect
+ * Failure: null
+ *
+ *-------------------------------------------------------------------------
+ */
+static H5FS_section_info_t *
+H5MV__sect_split(H5FS_section_info_t *sect, hsize_t frag_size)
+{
+ H5MV_free_section_t *ret_value = NULL; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Allocate space for new section */
+ if(NULL == (ret_value = H5MV__sect_new(sect->addr, frag_size)))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "can't initialize free space section")
+
+ /* Set new section's info */
+ sect->addr += frag_size;
+ sect->size -= frag_size;
+
+done:
+ FUNC_LEAVE_NOAPI((H5FS_section_info_t *)ret_value)
+} /* end H5MV__sect_split() */
diff --git a/src/H5Ocache.c b/src/H5Ocache.c
index 45c55fd..53ae461 100644
--- a/src/H5Ocache.c
+++ b/src/H5Ocache.c
@@ -117,6 +117,7 @@ const H5AC_class_t H5AC_OHDR[1] = {{
H5O__cache_notify, /* 'notify' callback */
H5O__cache_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* H5O object header chunk inherits cache-like properties from H5AC */
@@ -135,6 +136,7 @@ const H5AC_class_t H5AC_OHDR_CHK[1] = {{
H5O__cache_chk_notify, /* 'notify' callback */
H5O__cache_chk_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* VFD SWMR 'refresh' callback */
}};
/* Declare external the free list for H5O_unknown_t's */
diff --git a/src/H5Oflush.c b/src/H5Oflush.c
index 898184b..eeed75e 100644
--- a/src/H5Oflush.c
+++ b/src/H5Oflush.c
@@ -28,6 +28,7 @@
#include "H5Omodule.h" /* This source code file is part of the H5O module */
#define H5T_FRIEND /* Suppress error about including H5Tpkg */
+#define H5D_FRIEND /* Suppress error about including H5Dpkg */
/***********/
/* Headers */
@@ -40,6 +41,7 @@
#include "H5Fprivate.h" /* Files */
#include "H5Gprivate.h" /* Groups */
#include "H5Iprivate.h" /* IDs */
+#include "H5Dpkg.h" /* Datasets */
#include "H5Opkg.h" /* Objects */
#include "H5Tpkg.h" /* Datatypes */
@@ -287,7 +289,6 @@ done:
herr_t
H5O_refresh_metadata(hid_t oid, H5O_loc_t oloc)
{
- H5VL_object_t *vol_obj = NULL; /* VOL object associated with the ID */
hbool_t objs_incr = FALSE; /* Whether the object count in the file was incremented */
herr_t ret_value = SUCCEED; /* Return value */
@@ -298,7 +299,9 @@ H5O_refresh_metadata(hid_t oid, H5O_loc_t oloc)
H5G_loc_t obj_loc;
H5O_loc_t obj_oloc;
H5G_name_t obj_path;
- H5O_shared_t cached_H5O_shared;
+ H5O_refresh_state_t state;
+ H5D_t *ds;
+ const H5VL_object_t *vol_obj;
H5VL_t *connector = NULL;
/* Create empty object location */
@@ -312,11 +315,6 @@ H5O_refresh_metadata(hid_t oid, H5O_loc_t oloc)
H5F_incr_nopen_objs(oloc.file);
objs_incr = TRUE;
- /* Save important datatype state */
- if(H5I_get_type(oid) == H5I_DATATYPE)
- if(H5T_save_refresh_state(oid, &cached_H5O_shared) < 0)
- HGOTO_ERROR(H5E_DATATYPE, H5E_CANTOPENOBJ, FAIL, "unable to save datatype state")
-
/* Get the VOL object from the ID and cache a pointer to the connector.
* The vol_obj will disappear when the underlying object is closed, so
* we can't use that directly.
@@ -325,6 +323,24 @@ H5O_refresh_metadata(hid_t oid, H5O_loc_t oloc)
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "invalid object identifier")
connector = vol_obj->connector;
+ /* Save important datatype state */
+ switch (H5I_get_type(oid)) {
+ case H5I_DATATYPE:
+ if (H5T_save_refresh_state(oid, &state.shared_ohdr_info) < 0)
+ HGOTO_ERROR(H5E_DATATYPE, H5E_CANTOPENOBJ, FAIL,
+ "unable to save datatype state");
+ break;
+ case H5I_DATASET:
+ ds = (H5D_t *)vol_obj->data;
+ state.dapl_id = ds->shared->dapl_id;
+ if (H5I_inc_ref(state.dapl_id, false) < 0)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL,
+ "could not increase refcnt");
+ break;
+ default:
+ break;
+ }
+
/* Bump the number of references on the VOL connector.
* If you don't do this, VDS refreshes can accidentally close the connector.
*/
@@ -335,16 +351,27 @@ H5O_refresh_metadata(hid_t oid, H5O_loc_t oloc)
HGOTO_ERROR(H5E_OHDR, H5E_CANTLOAD, FAIL, "unable to refresh object")
/* Re-open the object, re-fetching its metadata */
- if((H5O_refresh_metadata_reopen(oid, &obj_loc, connector, FALSE)) < 0)
+ if (H5O_refresh_metadata_reopen(oid, &obj_loc, &state, connector,
+ FALSE) < 0)
HGOTO_ERROR(H5E_OHDR, H5E_CANTLOAD, FAIL, "unable to refresh object")
/* Restore the number of references on the VOL connector */
connector->nrefs--;
/* Restore important datatype state */
- if(H5I_get_type(oid) == H5I_DATATYPE)
- if(H5T_restore_refresh_state(oid, &cached_H5O_shared) < 0)
+ switch (H5I_get_type(oid)) {
+ case H5I_DATATYPE:
+ if(H5T_restore_refresh_state(oid, &state.shared_ohdr_info) < 0)
HGOTO_ERROR(H5E_DATATYPE, H5E_CANTOPENOBJ, FAIL, "unable to restore datatype state")
+ break;
+ case H5I_DATASET:
+ if (H5I_dec_ref(state.dapl_id) < 0)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL,
+ "could not decrease refcnt");
+ break;
+ default:
+ break;
+ }
} /* end if */
@@ -442,7 +469,8 @@ done:
*-------------------------------------------------------------------------
*/
herr_t
-H5O_refresh_metadata_reopen(hid_t oid, H5G_loc_t *obj_loc, H5VL_t *vol_connector, hbool_t start_swmr)
+H5O_refresh_metadata_reopen(hid_t oid, H5G_loc_t *obj_loc,
+ const H5O_refresh_state_t *state, H5VL_t *vol_connector, hbool_t start_swmr)
{
void *object = NULL; /* Object for this operation */
H5I_type_t type; /* Type of object for the ID */
@@ -471,9 +499,12 @@ H5O_refresh_metadata_reopen(hid_t oid, H5G_loc_t *obj_loc, H5VL_t *vol_connector
break;
case H5I_DATASET:
- /* Re-open the dataset */
- if(NULL == (object = H5D_open(obj_loc, H5P_DATASET_ACCESS_DEFAULT)))
- HGOTO_ERROR(H5E_DATASET, H5E_CANTOPENOBJ, FAIL, "unable to open dataset")
+ object = H5D_open(obj_loc,
+ (state == NULL) ? H5P_DATASET_ACCESS_DEFAULT : state->dapl_id);
+ if(NULL == object) {
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTOPENOBJ, FAIL,
+ "unable to open dataset");
+ }
if(!start_swmr) /* No need to handle multiple opens when H5Fstart_swmr_write() */
if(H5D_mult_refresh_reopen((H5D_t *)object) < 0)
HGOTO_ERROR(H5E_OHDR, H5E_CANTOPENOBJ, FAIL, "unable to finish refresh for dataset")
diff --git a/src/H5Oprivate.h b/src/H5Oprivate.h
index 0be6d89..9a1c1ed 100644
--- a/src/H5Oprivate.h
+++ b/src/H5Oprivate.h
@@ -275,6 +275,14 @@ typedef struct H5O_shared_t {
} u;
} H5O_shared_t;
+/* Storage for non-persistent (i.e., not stored in the HDF5 file) information
+ * that has to be preserved when an object is closed & reopened by
+ * H5O_refresh_metadata().
+ */
+typedef union _H5O_refresh_state {
+ hid_t dapl_id; // dataset refresh: access plist
+ H5O_shared_t shared_ohdr_info; // datatype refresh
+} H5O_refresh_state_t;
/*
* Link Info Message.
@@ -972,7 +980,8 @@ H5_DLL herr_t H5O_msg_get_flags(const H5O_loc_t *loc, unsigned type_id, uint8_t
H5_DLL herr_t H5O_flush(H5O_loc_t *oloc, hid_t obj_id);
H5_DLL herr_t H5O_flush_common(H5O_loc_t *oloc, hid_t obj_id);
H5_DLL herr_t H5O_refresh_metadata(hid_t oid, H5O_loc_t oloc);
-H5_DLL herr_t H5O_refresh_metadata_reopen(hid_t oid, H5G_loc_t *obj_loc, H5VL_t *vol_driver, hbool_t start_swmr);
+H5_DLL herr_t H5O_refresh_metadata_reopen(hid_t, H5G_loc_t *,
+ const H5O_refresh_state_t *, H5VL_t *, hbool_t);
/* Cache corking functions */
H5_DLL herr_t H5O_disable_mdc_flushes(H5O_loc_t *oloc);
diff --git a/src/H5PB.c b/src/H5PB.c
index 907fe82..1e0bab7 100644
--- a/src/H5PB.c
+++ b/src/H5PB.c
@@ -13,9 +13,11 @@
/*-------------------------------------------------------------------------
*
- * Created: H5PB.c
- *
- * Purpose: Page Buffer routines.
+ * Created: H5PB.c
+ *
+ * Purpose: Re-implementation of the page buffer with added features to
+ * support VFD SWMR.
+ * JRM -- 10/11/18
*
*-------------------------------------------------------------------------
*/
@@ -24,96 +26,50 @@
/* Module Setup */
/****************/
-#define H5F_FRIEND /* Suppress error about including H5Fpkg */
-#include "H5PBmodule.h" /* This source code file is part of the H5PB module */
+#define H5F_FRIEND /* suppress error about including H5Fpkg */
+#include "H5PBmodule.h" /* This source code file is part of the
+ * H5PB module
+ */
/***********/
/* Headers */
/***********/
-#include "H5private.h" /* Generic Functions */
-#include "H5Eprivate.h" /* Error handling */
-#include "H5Fpkg.h" /* Files */
-#include "H5FDprivate.h" /* File drivers */
-#include "H5Iprivate.h" /* IDs */
-#include "H5MMprivate.h" /* Memory management */
-#include "H5PBpkg.h" /* File access */
-#include "H5SLprivate.h" /* Skip List */
+#include "H5private.h" /* Generic Functions */
+#include "H5Eprivate.h" /* Error handling */
+#include "H5Fpkg.h" /* Files */
+#include "H5FDprivate.h" /* File drivers */
+#include "H5Iprivate.h" /* IDs */
+#include "H5FLprivate.h" /* Free lists */
+#include "H5MMprivate.h" /* Memory management */
+#include "H5PBpkg.h" /* File access */
+
+#include "hlog.h"
/****************/
/* Local Macros */
/****************/
-#define H5PB__PREPEND(page_ptr, head_ptr, tail_ptr, len) { \
- if((head_ptr) == NULL) { \
- (head_ptr) = (page_ptr); \
- (tail_ptr) = (page_ptr); \
- } /* end if */ \
- else { \
- (head_ptr)->prev = (page_ptr); \
- (page_ptr)->next = (head_ptr); \
- (head_ptr) = (page_ptr); \
- } /* end else */ \
- (len)++; \
-} /* H5PB__PREPEND() */
-
-#define H5PB__REMOVE(page_ptr, head_ptr, tail_ptr, len) { \
- if((head_ptr) == (page_ptr)) { \
- (head_ptr) = (page_ptr)->next; \
- if((head_ptr) != NULL) \
- (head_ptr)->prev = NULL; \
- } /* end if */ \
- else \
- (page_ptr)->prev->next = (page_ptr)->next; \
- if((tail_ptr) == (page_ptr)) { \
- (tail_ptr) = (page_ptr)->prev; \
- if((tail_ptr) != NULL) \
- (tail_ptr)->next = NULL; \
- } /* end if */ \
- else \
- (page_ptr)->next->prev = (page_ptr)->prev; \
- page_ptr->next = NULL; \
- page_ptr->prev = NULL; \
- (len)--; \
-}
-
-#define H5PB__INSERT_LRU(page_buf, page_ptr) { \
- HDassert(page_buf); \
- HDassert(page_ptr); \
- /* insert the entry at the head of the list. */ \
- H5PB__PREPEND((page_ptr), (page_buf)->LRU_head_ptr, \
- (page_buf)->LRU_tail_ptr, (page_buf)->LRU_list_len) \
-}
-#define H5PB__REMOVE_LRU(page_buf, page_ptr) { \
- HDassert(page_buf); \
- HDassert(page_ptr); \
- /* remove the entry from the list. */ \
- H5PB__REMOVE((page_ptr), (page_buf)->LRU_head_ptr, \
- (page_buf)->LRU_tail_ptr, (page_buf)->LRU_list_len) \
-}
-
-#define H5PB__MOVE_TO_TOP_LRU(page_buf, page_ptr) { \
- HDassert(page_buf); \
- HDassert(page_ptr); \
- /* Remove entry and insert at the head of the list. */ \
- H5PB__REMOVE((page_ptr), (page_buf)->LRU_head_ptr, \
- (page_buf)->LRU_tail_ptr, (page_buf)->LRU_list_len) \
- H5PB__PREPEND((page_ptr), (page_buf)->LRU_head_ptr, \
- (page_buf)->LRU_tail_ptr, (page_buf)->LRU_list_len) \
-}
+/* Round _x down to nearest _size. */
+#ifndef rounddown
+#define rounddown(_x, _size) (((_x) / (_size)) * (_size))
+#endif
+/* Round _x up to nearest _size. */
+#ifndef roundup
+#define roundup(_x, _size) ((((_x) + (_size) - 1) / (_size)) * (_size))
+#endif
/******************/
/* Local Typedefs */
/******************/
-/* Iteration context for destroying page buffer */
-typedef struct {
- H5PB_t *page_buf;
- hbool_t actual_slist;
-} H5PB_ud1_t;
-
+typedef struct _metadata_section {
+ haddr_t addr;
+ size_t len;
+ const char *buf;
+} metadata_section_t;
/********************/
/* Package Typedefs */
@@ -123,10 +79,50 @@ typedef struct {
/********************/
/* Local Prototypes */
/********************/
-static herr_t H5PB__insert_entry(H5PB_t *page_buf, H5PB_entry_t *page_entry);
-static htri_t H5PB__make_space(H5F_shared_t *f_sh, H5PB_t *page_buf, H5FD_mem_t inserted_type);
-static herr_t H5PB__write_entry(H5F_shared_t *f_sh, H5PB_entry_t *page_entry);
+static H5PB_entry_t * H5PB__allocate_page(H5PB_t *pb_ptr, size_t buf_size,
+ hbool_t clean_image);
+
+static herr_t H5PB__create_new_page(H5PB_t *pb_ptr, haddr_t addr, size_t size,
+ H5FD_mem_t type, hbool_t clean_image, H5PB_entry_t **entry_ptr_ptr);
+
+static void H5PB__deallocate_page(H5PB_entry_t *entry_ptr);
+
+static herr_t H5PB__evict_entry(H5F_shared_t *, H5PB_entry_t *, bool, bool);
+
+static herr_t H5PB__flush_entry(H5F_shared_t *, H5PB_t *, H5PB_entry_t *);
+
+static herr_t H5PB__load_page(H5F_shared_t *, H5PB_t *, haddr_t,
+ H5FD_mem_t, H5PB_entry_t **);
+
+static herr_t H5PB__make_space(H5F_shared_t *, H5PB_t *, H5FD_mem_t);
+
+static herr_t H5PB__mark_entry_clean(H5PB_t *, H5PB_entry_t *);
+
+static herr_t H5PB__mark_entry_dirty(H5F_shared_t *, H5PB_t *, H5PB_entry_t *);
+
+static herr_t H5PB__read_meta(H5F_shared_t *, H5FD_mem_t, haddr_t,
+ size_t, void *);
+
+static herr_t H5PB__read_raw(H5F_shared_t *, H5FD_mem_t, haddr_t,
+ size_t, void *);
+
+static herr_t H5PB__write_meta(H5F_shared_t *, H5FD_mem_t, haddr_t,
+ size_t, const void *);
+
+static herr_t H5PB__write_raw(H5F_shared_t *, H5FD_mem_t, haddr_t,
+ size_t, const void *);
+
+static void metadata_section_split(size_t, haddr_t, size_t, const void *,
+ metadata_section_t *);
+
+static herr_t metadata_multipart_read(H5F_shared_t *, H5FD_mem_t, haddr_t,
+ size_t, void *);
+
+static herr_t metadata_multipart_write(H5F_shared_t *, H5FD_mem_t, haddr_t,
+ size_t, const void *);
+
+static void H5PB_log_access_by_size_counts(const H5PB_t *);
/*********************/
/* Package Variables */
@@ -144,1374 +140,4288 @@ hbool_t H5_PKG_INIT_VAR = FALSE;
/*******************/
/* Local Variables */
/*******************/
+
+
/* Declare a free list to manage the H5PB_t struct */
H5FL_DEFINE_STATIC(H5PB_t);
/* Declare a free list to manage the H5PB_entry_t struct */
H5FL_DEFINE_STATIC(H5PB_entry_t);
+HLOG_OUTLET_DECL(pagebuffer);
+HLOG_OUTLET_SHORT_DEFN(pagebuffer, all);
+HLOG_OUTLET_SHORT_DEFN(pb_access_sizes, pagebuffer);
+HLOG_OUTLET_SHORT_DEFN(pbflush, pagebuffer);
+HLOG_OUTLET_SHORT_DEFN(pbflush_entry, pbflush);
+HLOG_OUTLET_SHORT_DEFN(pbio, pagebuffer);
+HLOG_OUTLET_SHORT_DEFN(pbrd, pbio);
+HLOG_OUTLET_SHORT_DEFN(pbwr, pbio);
+HLOG_OUTLET_SHORT_DEFN(lengthen_pbentry, pagebuffer);
+HLOG_OUTLET_SHORT_DEFN(pbrm, pagebuffer);
/*-------------------------------------------------------------------------
- * Function: H5PB_reset_stats
*
- * Purpose: This function was created without documentation.
- * What follows is my best understanding of Mohamad's intent.
+ * Function: H5PB_reset_stats
*
- * Reset statistics collected for the page buffer layer.
+ * Purpose: Reset statistics collected for the page buffer layer.
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer: Mohamad Chaarawi
+ * Programmer: John Mainzer -- 10/12/18
+ *
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
herr_t
-H5PB_reset_stats(H5PB_t *page_buf)
+H5PB_reset_stats(H5PB_t *pb_ptr)
{
+ int i;
+
FUNC_ENTER_NOAPI_NOERR
/* Sanity checks */
- HDassert(page_buf);
-
- page_buf->accesses[0] = 0;
- page_buf->accesses[1] = 0;
- page_buf->hits[0] = 0;
- page_buf->hits[1] = 0;
- page_buf->misses[0] = 0;
- page_buf->misses[1] = 0;
- page_buf->evictions[0] = 0;
- page_buf->evictions[1] = 0;
- page_buf->bypasses[0] = 0;
- page_buf->bypasses[1] = 0;
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+
+ for ( i = 0; i < H5PB__NUM_STAT_TYPES; i++ ) {
+
+ pb_ptr->bypasses[i] = 0;
+ pb_ptr->accesses[i] = 0;
+ pb_ptr->hits[i] = 0;
+ pb_ptr->misses[i] = 0;
+ pb_ptr->loads[i] = 0;
+ pb_ptr->insertions[i] = 0;
+ pb_ptr->flushes[i] = 0;
+ pb_ptr->evictions[i] = 0;
+ pb_ptr->clears[i] = 0;
+ }
+
+ pb_ptr->max_lru_len = 0;
+ pb_ptr->max_lru_size = 0;
+ pb_ptr->lru_md_skips = 0;
+ pb_ptr->lru_rd_skips = 0;
+ pb_ptr->total_ht_insertions = 0;
+ pb_ptr->total_ht_deletions = 0;
+ pb_ptr->successful_ht_searches = 0;
+ pb_ptr->total_successful_ht_search_depth = 0;
+ pb_ptr->failed_ht_searches = 0;
+ pb_ptr->total_failed_ht_search_depth = 0;
+ pb_ptr->max_index_len = 0;
+ pb_ptr->max_clean_index_len = 0;
+ pb_ptr->max_dirty_index_len = 0;
+ pb_ptr->max_clean_index_size = 0;
+ pb_ptr->max_dirty_index_size = 0;
+ pb_ptr->max_index_size = 0;
+ pb_ptr->max_rd_pages = 0;
+ pb_ptr->max_md_pages = 0;
+ pb_ptr->max_mpmde_count = 0;
+ pb_ptr->lru_tl_skips = 0;
+ pb_ptr->max_tl_len = 0;
+ pb_ptr->max_tl_size = 0;
+ pb_ptr->delayed_writes = 0;
+ pb_ptr->total_delay = 0;
+ pb_ptr->max_dwl_len = 0;
+ pb_ptr->max_dwl_size = 0;
+ pb_ptr->total_dwl_ins_depth = 0;
FUNC_LEAVE_NOAPI(SUCCEED)
+
} /* H5PB_reset_stats() */
/*-------------------------------------------------------------------------
- * Function: H5PB_get_stats
+ * Function: H5PB_get_stats
*
* Purpose: This function was created without documentation.
* What follows is my best understanding of Mohamad's intent.
*
- * Retrieve statistics collected about page accesses for the page buffer layer.
- * --accesses: the number of metadata and raw data accesses to the page buffer layer
- * --hits: the number of metadata and raw data hits in the page buffer layer
- * --misses: the number of metadata and raw data misses in the page buffer layer
- * --evictions: the number of metadata and raw data evictions from the page buffer layer
- * --bypasses: the number of metadata and raw data accesses that bypass the page buffer layer
+ * Retrieve statistics collected about page accesses for the
+ * page buffer layer.
*
- * Return: Non-negative on success/Negative on failure
+ * --accesses: the number of metadata and raw data accesses
+ * to the page buffer layer
+ *
+ * --hits: the number of metadata and raw data hits in
+ * the page buffer layer
+ *
+ * --misses: the number of metadata and raw data misses in
+ * the page buffer layer
+ *
+ * --evictions: the number of metadata and raw data evictions
+ * from the page buffer layer
*
- * Programmer: Mohamad Chaarawi
+ * --bypasses: the number of metadata and raw data accesses
+ * that bypass the page buffer layer
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Mohamad Chaarawi
*
*-------------------------------------------------------------------------
*/
herr_t
-H5PB_get_stats(const H5PB_t *page_buf, unsigned accesses[2], unsigned hits[2],
-    unsigned misses[2], unsigned evictions[2], unsigned bypasses[2])
+H5PB_get_stats(const H5PB_t *pb_ptr, unsigned accesses[3], unsigned hits[3],
+    unsigned misses[3], unsigned evictions[3], unsigned bypasses[3])
{
    FUNC_ENTER_NOAPI_NOERR
    /* Sanity checks */
-    HDassert(page_buf);
-
-    accesses[0] = page_buf->accesses[0];
-    accesses[1] = page_buf->accesses[1];
-    hits[0] = page_buf->hits[0];
-    hits[1] = page_buf->hits[1];
-    misses[0] = page_buf->misses[0];
-    misses[1] = page_buf->misses[1];
-    evictions[0] = page_buf->evictions[0];
-    evictions[1] = page_buf->evictions[1];
-    bypasses[0] = page_buf->bypasses[0];
-    bypasses[1] = page_buf->bypasses[1];
+    HDassert(pb_ptr);
+
+    /* Each stats array now carries three slots -- [0] raw data, [1]
+     * metadata, [2] multi-page metadata entries (mpmde).  The parameter
+     * declarations above must advertise three elements: writing slot [2]
+     * into caller arrays that are only two elements long is an
+     * out-of-bounds store.
+     *
+     * NOTE(review): update the prototype in H5PBprivate.h and any
+     * callers still passing two-element arrays to match -- confirm.
+     */
+    accesses[0] = (unsigned)pb_ptr->accesses[0];
+    accesses[1] = (unsigned)pb_ptr->accesses[1];
+    accesses[2] = (unsigned)pb_ptr->accesses[2];
+    hits[0] = (unsigned)pb_ptr->hits[0];
+    hits[1] = (unsigned)pb_ptr->hits[1];
+    hits[2] = (unsigned)pb_ptr->hits[2];
+    misses[0] = (unsigned)pb_ptr->misses[0];
+    misses[1] = (unsigned)pb_ptr->misses[1];
+    misses[2] = (unsigned)pb_ptr->misses[2];
+    evictions[0] = (unsigned)pb_ptr->evictions[0];
+    evictions[1] = (unsigned)pb_ptr->evictions[1];
+    evictions[2] = (unsigned)pb_ptr->evictions[2];
+    bypasses[0] = (unsigned)pb_ptr->bypasses[0];
+    bypasses[1] = (unsigned)pb_ptr->bypasses[1];
+    bypasses[2] = (unsigned)pb_ptr->bypasses[2];
    FUNC_LEAVE_NOAPI(SUCCEED)
} /* H5PB_get_stats */
/*-------------------------------------------------------------------------
- * Function: H5PB_print_stats()
*
- * Purpose: This function was created without documentation.
- * What follows is my best understanding of Mohamad's intent.
+ * Function: H5PB_print_stats()
*
- * Print out statistics collected for the page buffer layer.
+ * Purpose: Print out statistics collected for the page buffer layer.
*
- * Return: Non-negative on success/Negative on failure
+ * Return: Non-negative on success/Negative on failure
*
- * Programmer: Mohamad Chaarawi
+ * Programmer: John Mainzer -- 10/12/18
+ *
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
herr_t
-H5PB_print_stats(const H5PB_t *page_buf)
+H5PB_print_stats(const H5PB_t *pb_ptr)
{
+    /* plain 0.0 -- these are doubles; a long double literal (0.0L) only
+     * forces a needless narrowing conversion
+     */
+    double ave_succ_search_depth = 0.0;
+    double ave_failed_search_depth = 0.0;
+    double ave_delayed_write = 0.0;
+    double ave_delayed_write_ins_depth = 0.0;
+
    FUNC_ENTER_NOAPI_NOINIT_NOERR
-    HDassert(page_buf);
+    HDassert(pb_ptr);
+    HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+
+    /* NOTE(review): the %lld conversions below assume the stats fields
+     * are long long (or int64_t with long long == int64_t) -- confirm,
+     * or switch to PRId64.
+     */
+    HDfprintf(stdout, "\n\nPage Buffer Statistics (raw/meta/mpmde): \n\n");
+
+    HDfprintf(stdout, "bypasses = %lld (%lld/%lld/%lld)\n",
+        (pb_ptr->bypasses[0] + pb_ptr->bypasses[1] + pb_ptr->bypasses[2]),
+        pb_ptr->bypasses[0], pb_ptr->bypasses[1], pb_ptr->bypasses[2]);
+
+    HDfprintf(stdout, "accesses = %lld (%lld/%lld/%lld)\n",
+        (pb_ptr->accesses[0] + pb_ptr->accesses[1] + pb_ptr->accesses[2]),
+        pb_ptr->accesses[0], pb_ptr->accesses[1], pb_ptr->accesses[2]);
+
+    HDfprintf(stdout, "hits = %lld (%lld/%lld/%lld)\n",
+        (pb_ptr->hits[0] + pb_ptr->hits[1] + pb_ptr->hits[2]),
+        pb_ptr->hits[0], pb_ptr->hits[1], pb_ptr->hits[2]);
+
+    HDfprintf(stdout, "misses = %lld (%lld/%lld/%lld)\n",
+        (pb_ptr->misses[0] + pb_ptr->misses[1] + pb_ptr->misses[2]),
+        pb_ptr->misses[0], pb_ptr->misses[1], pb_ptr->misses[2]);
+
+    HDfprintf(stdout, "loads = %lld (%lld/%lld/%lld)\n",
+        (pb_ptr->loads[0] + pb_ptr->loads[1] + pb_ptr->loads[2]),
+        pb_ptr->loads[0], pb_ptr->loads[1], pb_ptr->loads[2]);
+
+    HDfprintf(stdout, "insertions = %lld (%lld/%lld/%lld)\n",
+        (pb_ptr->insertions[0] + pb_ptr->insertions[1] +
+         pb_ptr->insertions[2]),
+        pb_ptr->insertions[0], pb_ptr->insertions[1],
+        pb_ptr->insertions[2]);
+
+    HDfprintf(stdout, "flushes = %lld (%lld/%lld/%lld)\n",
+        (pb_ptr->flushes[0] + pb_ptr->flushes[1] + pb_ptr->flushes[2]),
+        pb_ptr->flushes[0], pb_ptr->flushes[1], pb_ptr->flushes[2]);
+
+    HDfprintf(stdout, "evictions = %lld (%lld/%lld/%lld)\n",
+        (pb_ptr->evictions[0] + pb_ptr->evictions[1] +
+         pb_ptr->evictions[2]),
+        pb_ptr->evictions[0], pb_ptr->evictions[1], pb_ptr->evictions[2]);
+
+    HDfprintf(stdout, "clears = %lld (%lld/%lld/%lld)\n",
+        (pb_ptr->clears[0] + pb_ptr->clears[1] + pb_ptr->clears[2]),
+        pb_ptr->clears[0], pb_ptr->clears[1], pb_ptr->clears[2]);
+
+    HDfprintf(stdout, "max LRU len / size = %lld / %lld\n",
+        pb_ptr->max_lru_len, pb_ptr->max_lru_size);
+
+    HDfprintf(stdout,
+        "LRU make space md/rd/tl skips = %lld/%lld/%lld\n",
+        pb_ptr->lru_md_skips, pb_ptr->lru_rd_skips,
+        pb_ptr->lru_tl_skips);
+
+    HDfprintf(stdout, "hash table insertions / deletions = %lld / %lld\n",
+        pb_ptr->total_ht_insertions, pb_ptr->total_ht_deletions);
-    HDprintf("PAGE BUFFER STATISTICS:\n");
+    if ( pb_ptr->successful_ht_searches > 0 ) {
-    HDprintf("******* METADATA\n");
-    HDprintf("\t Total Accesses: %u\n", page_buf->accesses[0]);
-    HDprintf("\t Hits: %u\n", page_buf->hits[0]);
-    HDprintf("\t Misses: %u\n", page_buf->misses[0]);
-    HDprintf("\t Evictions: %u\n", page_buf->evictions[0]);
-    HDprintf("\t Bypasses: %u\n", page_buf->bypasses[0]);
-    HDprintf("\t Hit Rate = %f%%\n", ((double)page_buf->hits[0]/(page_buf->accesses[0] - page_buf->bypasses[0]))*100);
-    HDprintf("*****************\n\n");
+        ave_succ_search_depth =
+            (double)(pb_ptr->total_successful_ht_search_depth) /
+            (double)(pb_ptr->successful_ht_searches);
+    }
+    /* %f (not the invalid %llf) -- the averages are doubles */
+    HDfprintf(stdout, "successful ht searches / ave depth = %lld / %f\n",
+        pb_ptr->successful_ht_searches, ave_succ_search_depth);
-    HDprintf("******* RAWDATA\n");
-    HDprintf("\t Total Accesses: %u\n", page_buf->accesses[1]);
-    HDprintf("\t Hits: %u\n", page_buf->hits[1]);
-    HDprintf("\t Misses: %u\n", page_buf->misses[1]);
-    HDprintf("\t Evictions: %u\n", page_buf->evictions[1]);
-    HDprintf("\t Bypasses: %u\n", page_buf->bypasses[1]);
-    HDprintf("\t Hit Rate = %f%%\n", ((double)page_buf->hits[1]/(page_buf->accesses[1]-page_buf->bypasses[0]))*100);
-    HDprintf("*****************\n\n");
+    if ( pb_ptr->failed_ht_searches > 0 ) {
+
+        ave_failed_search_depth =
+            (double)(pb_ptr->total_failed_ht_search_depth) /
+            (double)(pb_ptr->failed_ht_searches);
+    }
+    HDfprintf(stdout, "failed ht searches / ave depth = %lld / %f\n",
+        pb_ptr->failed_ht_searches, ave_failed_search_depth);
+
+    HDfprintf(stdout, "max index length / size = %lld / %lld\n",
+        pb_ptr->max_index_len, pb_ptr->max_index_size);
+
+    HDfprintf(stdout, "max rd / md / mpmde entries = %lld / %lld / %lld\n",
+        pb_ptr->max_rd_pages, pb_ptr->max_md_pages,
+        pb_ptr->max_mpmde_count);
+
+    HDfprintf(stdout, "tick list max len / size = %lld / %lld\n",
+        pb_ptr->max_tl_len, pb_ptr->max_tl_size);
+
+    HDfprintf(stdout, "delayed write list max len / size = %lld / %lld\n",
+        pb_ptr->max_dwl_len, pb_ptr->max_dwl_size);
+
+    if ( pb_ptr->delayed_writes > 0 ) {
+
+        ave_delayed_write = (double)(pb_ptr->total_delay) /
+            (double)(pb_ptr->delayed_writes);
+        ave_delayed_write_ins_depth = (double)(pb_ptr->total_dwl_ins_depth) /
+            (double)(pb_ptr->delayed_writes);
+    }
+    HDfprintf(stdout,
+        "delayed writes / ave delay / ave ins depth = %lld / %f / %f\n",
+        pb_ptr->delayed_writes, ave_delayed_write, ave_delayed_write_ins_depth);
    FUNC_LEAVE_NOAPI(SUCCEED)
+
} /* H5PB_print_stats */
/*-------------------------------------------------------------------------
- * Function: H5PB_create
*
- * Purpose: Create and setup the PB on the file.
+ * Function: H5PB_add_new_page
*
- * Return: Non-negative on success/Negative on failure
+ * Purpose: Insert a new blank page to the page buffer if the page
+ * buffer is configured to allow pages of the specified
+ * type.
+ *
+ * This function is called by the MF layer when a new page
+ * is allocated to indicate to the page buffer layer that
+ * a read of the page from the file is not necessary since
+ * it's an empty page.
+ *
+ * For purposes of the VFD SWMR writer, we also track pages
+ * that are inserted via this call, as the fact that the
+ * page was allocated implies that an earlier version does
+ * not exist in the HDF5 file, and thus we need not concern
+ * ourselves with delaying the write of this pages to avoid
+ * messages from the future on the reader.
+ *
+ * Note that this function inserts the new page without
+ * attempting to make space. This can result in the page
+ * buffer exceeding its maximum size.
+ *
+ * Note also that it is possible that the page (marked clean)
+ * will be evicted before its first use.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/12/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5PB_add_new_page(H5F_shared_t *shared, H5FD_mem_t type, haddr_t page_addr)
+{
+    hbool_t can_insert = TRUE;
+    H5PB_t *pb_ptr = NULL;
+    H5PB_entry_t *entry_ptr = NULL;
+    herr_t ret_value = SUCCEED;            /* Return value */
+
+    FUNC_ENTER_NOAPI(FAIL)
+
+    /* Sanity checks */
+    HDassert(shared);
+    HDassert(shared->pb_ptr);
+
+    pb_ptr = shared->pb_ptr;
+
+    HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+
+    /* min_md_pages == max_pages means the page buffer is configured for
+     * metadata only, so raw data pages may not be inserted.  Similarly,
+     * min_rd_pages == max_pages means raw data only, so metadata pages
+     * may not be inserted.  In either case the new page is silently
+     * skipped (not an error).
+     */
+    if ( H5FD_MEM_DRAW == type ) { /* raw data page insertion */
+
+        if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) {
+
+            can_insert = FALSE;
+
+        }
+    } else { /* metadata page insertion */
+
+        if ( pb_ptr->min_rd_pages == pb_ptr->max_pages ) {
+
+            can_insert = FALSE;
+        }
+    }
+
+    if ( can_insert ) {
+
+        /* create the new page entry -- per the header comment, no read
+         * from the file is performed for a freshly allocated page
+         */
+        if ( H5PB__create_new_page(pb_ptr, page_addr,
+                                   (size_t)(pb_ptr->page_size),
+                                   type, TRUE, &entry_ptr) < 0 )
+
+            HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+                        "new page buffer page creation failed.")
+
+        /* make note that this page was allocated, not loaded from file */
+        entry_ptr->loaded = FALSE;
+
+        /* update stats */
+        H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr);
+    }
+
+done:
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5PB_add_new_page */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB_create
*
- * Programmer: Mohamad Chaarawi
+ * Purpose: Setup a page buffer for the supplied file.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/11/18
+ *
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
herr_t
-H5PB_create(H5F_shared_t *f_sh, size_t size, unsigned page_buf_min_meta_perc, unsigned page_buf_min_raw_perc)
+H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc,
+    unsigned page_buf_min_raw_perc)
{
-    H5PB_t *page_buf = NULL;
+    hbool_t vfd_swmr_writer = FALSE;
+    int i;
+    int32_t min_md_pages;
+    int32_t min_rd_pages;
+    H5PB_t *pb_ptr = NULL;
    herr_t ret_value = SUCCEED; /* Return value */
    FUNC_ENTER_NOAPI(FAIL)
    /* Sanity checks */
-    HDassert(f_sh);
+    HDassert(shared);
+    HDassert(page_buf_min_meta_perc <= 100);
+    HDassert(page_buf_min_raw_perc <= 100);
+    HDassert((page_buf_min_meta_perc + page_buf_min_raw_perc) <= 100);
    /* Check args */
-    if(f_sh->fs_strategy != H5F_FSPACE_STRATEGY_PAGE)
-        HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, FAIL, "Enabling Page Buffering requires PAGE file space strategy")
-    /* round down the size if it is larger than the page size */
-    else if(size > f_sh->fs_page_size) {
+    if ( shared->fs_strategy != H5F_FSPACE_STRATEGY_PAGE )
+
+        HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, FAIL, \
+                    "Enabling Page Buffering requires PAGE file space strategy")
+
+    else if ( size > shared->fs_page_size ) {
+
+        /* round size down to the next multiple of fs_page_size */
+
        hsize_t temp_size;
-        temp_size = (size / f_sh->fs_page_size) * f_sh->fs_page_size;
+        temp_size = (size / shared->fs_page_size) * shared->fs_page_size;
+
        H5_CHECKED_ASSIGN(size, size_t, temp_size, hsize_t);
-    } /* end if */
-    else if(0 != size % f_sh->fs_page_size)
-        HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTINIT, FAIL, "Page Buffer size must be >= to the page size")
-    /* Allocate the new page buffering structure */
-    if(NULL == (page_buf = H5FL_CALLOC(H5PB_t)))
-        HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, "memory allocation failed")
+    } /* end if */
+    else if ( 0 != size % shared->fs_page_size )
-    page_buf->max_size = size;
-    H5_CHECKED_ASSIGN(page_buf->page_size, size_t, f_sh->fs_page_size, hsize_t);
-    page_buf->min_meta_perc = page_buf_min_meta_perc;
-    page_buf->min_raw_perc = page_buf_min_raw_perc;
+        HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTINIT, FAIL, \
+                    "Page Buffer size must be >= to the page size")
    /* Calculate the minimum page count for metadata and raw data
-     * based on the fractions provided
+     * based on the fractions provided
     */
-    page_buf->min_meta_count = (unsigned)((size * page_buf_min_meta_perc) / (f_sh->fs_page_size * 100));
-    page_buf->min_raw_count = (unsigned)((size * page_buf_min_raw_perc) / (f_sh->fs_page_size * 100));
+    min_md_pages = (int32_t)((size * page_buf_min_meta_perc) /
+                             (shared->fs_page_size * 100));
+    min_rd_pages = (int32_t)((size * page_buf_min_raw_perc) /
+                             (shared->fs_page_size * 100));
+    HDassert(min_md_pages >= 0);
+    HDassert(min_rd_pages >= 0);
+    HDassert((min_md_pages + min_rd_pages) <=
+             (int32_t)(size / shared->fs_page_size));
+
+
+    /* compute vfd_swmr_writer */
+    if ( ( H5F_SHARED_VFD_SWMR_CONFIG(shared) ) && ( H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR ) ) {
-    if(NULL == (page_buf->slist_ptr = H5SL_create(H5SL_TYPE_HADDR, NULL)))
-        HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTCREATE, FAIL, "can't create skip list")
-    if(NULL == (page_buf->mf_slist_ptr = H5SL_create(H5SL_TYPE_HADDR, NULL)))
-        HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTCREATE, FAIL, "can't create skip list")
+        HDassert(shared->vfd_swmr_config.writer);
+        vfd_swmr_writer = TRUE;
+    }
-    if(NULL == (page_buf->page_fac = H5FL_fac_init(page_buf->page_size)))
-        HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTINIT, FAIL, "can't create page factory")
-    f_sh->page_buf = page_buf;
+    /* Allocate the new page buffering structure */
+    if(NULL == (pb_ptr = H5FL_MALLOC(H5PB_t)))
+
+        HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, "memory allocation failed")
+
+    /* initialize the new instance of H5PB_t */
+
+    pb_ptr->magic = H5PB__H5PB_T_MAGIC;
+    /* page_size is size_t while fs_page_size is hsize_t -- assign once,
+     * via the checked macro, so any narrowing is caught.  (A plain
+     * unchecked assignment duplicating this line was removed.)
+     */
+    H5_CHECKED_ASSIGN(pb_ptr->page_size, size_t, \
+                      shared->fs_page_size, hsize_t);
+    pb_ptr->max_pages = (int32_t)(size / shared->fs_page_size);
+    pb_ptr->curr_pages = 0;
+    pb_ptr->curr_md_pages = 0;
+    pb_ptr->curr_rd_pages = 0;
+    pb_ptr->min_md_pages = min_md_pages;
+    pb_ptr->min_rd_pages = min_rd_pages;
+
+    pb_ptr->max_size = size;
+    pb_ptr->min_meta_perc = page_buf_min_meta_perc;
+    pb_ptr->min_raw_perc = page_buf_min_raw_perc;
+
+    /* index */
+    for ( i = 0; i < H5PB__HASH_TABLE_LEN; i++ )
+        pb_ptr->ht[i] = NULL;
+    pb_ptr->index_len = 0;
+    pb_ptr->clean_index_len = 0;
+    pb_ptr->dirty_index_len = 0;
+    pb_ptr->index_size = 0;
+    pb_ptr->clean_index_size = 0;
+    pb_ptr->dirty_index_size = 0;
+    pb_ptr->il_len = 0;
+    pb_ptr->il_size = 0;
+    pb_ptr->il_head = NULL;
+    pb_ptr->il_tail = NULL;
+
+    /* LRU */
+    pb_ptr->LRU_len = 0;
+    pb_ptr->LRU_size = 0;
+    pb_ptr->LRU_head_ptr = NULL;
+    pb_ptr->LRU_tail_ptr = NULL;
+
+
+    /* VFD SWMR specific fields.
+     * The following fields are defined iff vfd_swmr_writer is TRUE.
+     */
+    pb_ptr->vfd_swmr_writer = vfd_swmr_writer;
+    pb_ptr->mpmde_count = 0;
+    pb_ptr->cur_tick = 0;
+
+    /* delayed write list */
+    pb_ptr->max_delay = 0;
+    pb_ptr->dwl_len = 0;
+    pb_ptr->dwl_size = 0;
+    pb_ptr->dwl_head_ptr = NULL;
+    pb_ptr->dwl_tail_ptr = NULL;
+
+    /* tick list */
+    pb_ptr->tl_len = 0;
+    pb_ptr->tl_size = 0;
+    pb_ptr->tl_head_ptr = NULL;
+    pb_ptr->tl_tail_ptr = NULL;
+
+    /* NOTE(review): pb_ptr comes from H5FL_MALLOC (not CALLOC), so every
+     * remaining field -- the statistics, including the access_size_count
+     * histogram -- must be zeroed by H5PB_reset_stats(); confirm it
+     * covers them all.
+     */
+    H5PB_reset_stats(pb_ptr);
+
+    shared->pb_ptr = pb_ptr;
+
+    /* if this is a VFD SWMR reader, inform the reader VFD that the
+     * page buffer is configured.  Note that this is for sanity
+     * checking, and only needed until we modify the file open
+     * code to create the page buffer before any file reads in
+     * the VFD SWMR reader case.  After that, this code should be
+     * removed.
+     *                                          JRM -- 1/29/19
+     */
+    if ( ( H5F_SHARED_VFD_SWMR_CONFIG(shared) ) &&
+         ( 0 == (H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR) ) ) {
+
+        HDassert(shared->lf);
+        HDassert(! shared->vfd_swmr_config.writer);
+
+        H5FD_vfd_swmr_set_pb_configured(shared->lf);
+    }
done:
-    if(ret_value < 0) {
-        if(page_buf != NULL) {
-            if(page_buf->slist_ptr != NULL)
-                H5SL_close(page_buf->slist_ptr);
-            if(page_buf->mf_slist_ptr != NULL)
-                H5SL_close(page_buf->mf_slist_ptr);
-            if(page_buf->page_fac != NULL)
-                H5FL_fac_term(page_buf->page_fac);
-            page_buf = H5FL_FREE(H5PB_t, page_buf);
-        } /* end if */
-    } /* end if */
+
+    if ( ret_value < 0 ) {
+
+        if ( pb_ptr != NULL ) {
+
+            pb_ptr = H5FL_FREE(H5PB_t, pb_ptr);
+
+        }
+    }
    FUNC_LEAVE_NOAPI(ret_value)
+
} /* H5PB_create */
/*-------------------------------------------------------------------------
- * Function: H5PB__flush_cb
*
- * Purpose: Callback to flush PB skiplist entries.
+ * Function: H5PB_dest
*
- * Return: Non-negative on success/Negative on failure
+ * Purpose: Flush (if necessary) and evict all entries in the page
+ * buffer, and then discard the page buffer.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/22/18
*
- * Programmer: Mohamad Chaarawi
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
-static herr_t
-H5PB__flush_cb(void *item, void H5_ATTR_UNUSED *key, void *_op_data)
+herr_t
+H5PB_dest(H5F_shared_t *shared)
{
-    H5PB_entry_t *page_entry = (H5PB_entry_t *)item; /* Pointer to page entry node */
-    H5F_shared_t *f_sh = (H5F_shared_t *)_op_data;
+    int i;
+    H5PB_t *pb_ptr = NULL;
+    H5PB_entry_t *entry_ptr = NULL;
+    H5PB_entry_t *evict_ptr = NULL;
    herr_t ret_value = SUCCEED; /* Return value */
-    FUNC_ENTER_STATIC
+    FUNC_ENTER_NOAPI(FAIL)
-    /* Sanity checks */
-    HDassert(page_entry);
-    HDassert(f_sh);
+    /* Sanity check */
+    HDassert(shared);
+
+    /* flush and destroy the page buffer, if it exists */
+    if ( shared->pb_ptr ) {
+
+        pb_ptr = shared->pb_ptr;
+
+        /* validate the page buffer before first use -- previously the
+         * magic was only checked after H5PB_log_access_by_size_counts()
+         * had already dereferenced pb_ptr
+         */
+        HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+
+        H5PB_log_access_by_size_counts(pb_ptr);
+
+        /* the current implementation is very inefficient, and will
+         * fail if there are any outstanding delayed writes -- must fix this
+         */
+        for ( i = 0; i < H5PB__HASH_TABLE_LEN; i++ ) {
+
+            entry_ptr = pb_ptr->ht[i];
+
+            while ( entry_ptr ) {
+
+                HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+
+                evict_ptr = entry_ptr;
+                entry_ptr = entry_ptr->ht_next;
-    /* Flush the page if it's dirty */
-    if(page_entry->is_dirty)
-        if(H5PB__write_entry(f_sh, page_entry) < 0)
-            HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, "file write failed")
+                if ( evict_ptr->is_dirty ) {
+
+                    if ( H5PB__flush_entry(shared, pb_ptr, evict_ptr) < 0 )
+
+                        HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+                                    "Can't flush entry")
+                }
+
+                /* force the eviction; only_mark is TRUE (was spelled with
+                 * the lowercase C99 'true' -- same value, now consistent
+                 * with the hbool_t convention used elsewhere)
+                 */
+                if ( H5PB__evict_entry(shared, evict_ptr, TRUE, TRUE) < 0 )
+
+                    HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+                                "forced eviction failed")
+
+                entry_ptr = pb_ptr->ht[i];
+            }
+        }
+
+        /* regular operations fields */
+        HDassert(pb_ptr->curr_pages == 0);
+        HDassert(pb_ptr->curr_md_pages == 0);
+        HDassert(pb_ptr->curr_rd_pages == 0);
+        HDassert(pb_ptr->index_len == 0);
+        HDassert(pb_ptr->index_size == 0);
+        HDassert(pb_ptr->LRU_len == 0);
+        HDassert(pb_ptr->LRU_size == 0);
+        HDassert(pb_ptr->LRU_head_ptr == NULL);
+        HDassert(pb_ptr->LRU_tail_ptr == NULL);
+
+        /* VFD SWMR fields */
+        HDassert(pb_ptr->dwl_len == 0);
+        HDassert(pb_ptr->dwl_size == 0);
+        HDassert(pb_ptr->dwl_head_ptr == NULL);
+        HDassert(pb_ptr->dwl_tail_ptr == NULL);
+
+        HDassert(pb_ptr->tl_len == 0);
+        HDassert(pb_ptr->tl_size == 0);
+        HDassert(pb_ptr->tl_head_ptr == NULL);
+        HDassert(pb_ptr->tl_tail_ptr == NULL);
+
+        /* invalidate the magic before freeing to catch stale pointers */
+        pb_ptr->magic = 0;
+        shared->pb_ptr = H5FL_FREE(H5PB_t, pb_ptr);
+    }
done:
+
    FUNC_LEAVE_NOAPI(ret_value)
-} /* H5PB__flush_cb() */
+
+} /* H5PB_dest */
/*-------------------------------------------------------------------------
- * Function: H5PB_flush
*
- * Purpose: Flush/Free all the PB entries to the file.
+ * Function: H5PB_flush
*
- * Return: Non-negative on success/Negative on failure
+ * Purpose: If the page buffer is defined, flush all entries.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/22/18
*
- * Programmer: Mohamad Chaarawi
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
herr_t
-H5PB_flush(H5F_shared_t *f_sh)
+H5PB_flush(H5F_shared_t *shared)
{
+    int i;
+    H5PB_t *pb_ptr = NULL;
+    H5PB_entry_t *entry_ptr = NULL;
+    H5PB_entry_t *flush_ptr = NULL;
    herr_t ret_value = SUCCEED; /* Return value */
    FUNC_ENTER_NOAPI(FAIL)
    /* Sanity check */
-    HDassert(f_sh);
+    HDassert(shared);
-    /* Flush all the entries in the PB skiplist, if we have write access on the file */
-    if(f_sh->page_buf && (H5F_ACC_RDWR & H5F_SHARED_INTENT(f_sh))) {
-        H5PB_t *page_buf = f_sh->page_buf;
+    pb_ptr = shared->pb_ptr;
-    /* Iterate over all entries in page buffer skip list */
-    if(H5SL_iterate(page_buf->slist_ptr, H5PB__flush_cb, f_sh))
-        HGOTO_ERROR(H5E_PAGEBUF, H5E_BADITER, FAIL, "can't flush page buffer skip list")
-    } /* end if */
+    if ( pb_ptr ) {
+
+        HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+
+        /* the current implementation is very inefficient, and will
+         * fail if there are any delayed writes -- must fix this
+         */
+        for ( i = 0; i < H5PB__HASH_TABLE_LEN; i++ ) {
+
+            entry_ptr = pb_ptr->ht[i];
+
+            while ( entry_ptr ) {
+
+                HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+
+                /* capture the entry before advancing, since flushing may
+                 * modify it
+                 */
+                flush_ptr = entry_ptr;
+                entry_ptr = entry_ptr->ht_next;
+                hlog_fast(pbflush, "%s: visiting %zu-byte page %" PRIu64,
+                          __func__, flush_ptr->size, flush_ptr->page);
+
+                if ( flush_ptr->is_dirty ) {
+
+                    /* NOTE(review): a dirty page whose write is delayed
+                     * (delay_write_until != 0, VFD SWMR) is skipped here
+                     * and thus remains dirty when H5PB_flush() returns --
+                     * confirm callers expect that.
+                     */
+                    if (flush_ptr->delay_write_until != 0) {
+                        hlog_fast(pbflush, "%s: delaying %zu-byte page %" PRIu64
+                                  " until %" PRIu64 " (now %" PRIu64 ")",
+                                  __func__, flush_ptr->size, flush_ptr->page,
+                                  flush_ptr->delay_write_until,
+                                  shared->tick_num);
+                        continue;
+                    }
+
+                    if ( H5PB__flush_entry(shared, pb_ptr, flush_ptr) < 0 )
+
+                        HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+                                    "Can't flush entry")
+                }
+            }
+        }
+    }
done:
+
    FUNC_LEAVE_NOAPI(ret_value)
+
} /* H5PB_flush */
/*-------------------------------------------------------------------------
- * Function: H5PB__dest_cb
*
- * Purpose: Callback to free PB skiplist entries.
+ * Function: H5PB_page_exists
*
- * Return: Non-negative on success/Negative on failure
+ * Purpose: Test to see if a page buffer page exists at the specified
+ * address. Set *page_exists_ptr to TRUE or FALSE accordingly.
+ *
+ * This function exists for the convenience of the test
+ * code
+ *
+ * Return: Non-negative on success/Negative on failure
*
- * Programmer: Mohamad Chaarawi
+ * Programmer: John Mainzer -- 10/22/18
+ *
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
-static herr_t
-H5PB__dest_cb(void *item, void H5_ATTR_UNUSED *key, void *_op_data)
+herr_t
+H5PB_page_exists(H5F_shared_t *shared, haddr_t addr, hbool_t *page_exists_ptr)
{
-    H5PB_entry_t *page_entry = (H5PB_entry_t *)item; /* Pointer to page entry node */
-    H5PB_ud1_t *op_data = (H5PB_ud1_t *)_op_data;
+    uint64_t page;
+    H5PB_t *pb_ptr = NULL;
+    H5PB_entry_t *entry_ptr = NULL;
+    herr_t ret_value = SUCCEED; /* Return value */
+
+    FUNC_ENTER_NOAPI(FAIL)
-    FUNC_ENTER_STATIC_NOERR
+    /* Sanity check */
+    HDassert(shared);
+    HDassert(shared->pb_ptr);
-    /* Sanity checking */
-    HDassert(page_entry);
-    HDassert(op_data);
-    HDassert(op_data->page_buf);
+    pb_ptr = shared->pb_ptr;
-    /* Remove entry from LRU list */
-    if(op_data->actual_slist) {
-        H5PB__REMOVE_LRU(op_data->page_buf, page_entry)
-        page_entry->page_buf_ptr = H5FL_FAC_FREE(op_data->page_buf->page_fac, page_entry->page_buf_ptr);
-    } /* end if */
+    HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+    HDassert(page_exists_ptr);
-    /* Free page entry */
-    page_entry = H5FL_FREE(H5PB_entry_t, page_entry);
+    /* Calculate the page offset */
+    page = (addr / pb_ptr->page_size);
-    FUNC_LEAVE_NOAPI(SUCCEED)
-} /* H5PB__dest_cb() */
+    /* the supplied address should be page aligned */
+    HDassert(addr == page * pb_ptr->page_size);
+
+    /* Search for page in the hash table.
+     * NOTE(review): H5PB__SEARCH_INDEX presumably updates the hash table
+     * search statistics as a side effect -- confirm that is acceptable
+     * for this test-support routine.
+     */
+    H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL)
+
+    /* any entry with a matching page number must live at addr */
+    HDassert((NULL == entry_ptr) || (entry_ptr->addr == addr));
+
+    *page_exists_ptr = ( entry_ptr != NULL );
+
+done:
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5PB_page_exists */
+
+/* Record one metadata access of `size` bytes in the pb->access_size_count
+ * histogram.  Bucket i counts accesses of size at most page_size * 2^i
+ * bytes; the final bucket catches everything larger.
+ */
+static void
+H5PB_count_meta_access_by_size(H5PB_t *pb, size_t size)
+{
+    const size_t nslots = NELMTS(pb->access_size_count);
+    size_t i, hi;
+
+    /* find the first bucket whose upper bound (hi) covers `size`;
+     * fall through to the last slot if none does
+     */
+    for (hi = pb->page_size, i = 0; i < nslots - 1; i++, hi *= 2){
+        if (size <= hi)
+            break;
+    }
+    pb->access_size_count[i]++;
+}
+
+/* Emit (via hlog_fast) the histogram of metadata access sizes collected
+ * in pb->access_size_count by H5PB_count_meta_access_by_size().  Bucket
+ * boundaries are reconstructed here and must stay in sync with that
+ * function.
+ */
+static void
+H5PB_log_access_by_size_counts(const H5PB_t *pb)
+{
+    const size_t nslots = NELMTS(pb->access_size_count);
+    size_t i, lo, hi;
+
+    hlog_fast(pb_access_sizes, "page buffer %p metadata accesses by size:",
+              (const void *)pb);
+
+    for (lo = 0, hi = pb->page_size, i = 0;
+         i < nslots - 1;
+         i++, lo = hi + 1, hi *= 2) {
+        hlog_fast(pb_access_sizes,
+                  "%16" PRIu64 " accesses %8zu - %8zu bytes long",
+                  pb->access_size_count[i], lo, hi);
+    }
+
+    /* last bucket is open-ended */
+    hlog_fast(pb_access_sizes,
+              "%16" PRIu64 " accesses %8zu - greater bytes long",
+              pb->access_size_count[i], lo);
+}
/*-------------------------------------------------------------------------
- * Function: H5PB_dest
*
- * Purpose: Flush and destroy the PB on the file if it exists.
+ * Function: H5PB_read
*
- * Return: Non-negative on success/Negative on failure
+ * Purpose: Satisfy the read from the page buffer if possible.
+ *
+ * 1) If the page buffer is disabled, simply read from the
+ * HDF5 file and return.
+ *
+ * 2) If the read is for raw data, and the page buffer is
+ * configured for metadata only (i.e. min_md_pages ==
+ * max_pages), simply read from the HDF5 file and return.
+ *
+ * 3) If the read is for raw data, and it is of page size or
+ * larger, read it directly from the HDF5 file.
+ *
+ * It is possible that the page buffer contains dirty pages
+ * that intersect with the read -- test for this and update
+ * the read buffer from the page buffer if any such pages
+ * exist.
+ *
+ * Note that no pages are inserted into the page buffer in
+ * this case.
+ *
+ * 4) If the read is for raw data, and it is of size less
+ * than the page size, satisfy the read from the page
+ * buffer, loading and inserting pages into the
+ * page buffer as necessary
+ *
+ * 5) If the read is for metadata, and the page buffer is
+ * configured for raw data only (i.e. min_rd_pages ==
+ * max_pages), simply read from the HDF5 file and return.
+ *
+ * The free space manager guarantees that allocations larger
+ * than one page will be page aligned, and that allocations
+ * of size less than or equal to page size will not cross page
+ * boundaries. Further, unlike raw data, metadata is always
+ * written and read atomically.
+ *
+ * In principle, this should make it easy to discriminate
+ * between small and multi-page metadata entries so that
+ * pages containing the former will be buffered and the
+ * latter be read directly from file.
+ *
+ * Unfortunately, the metadata cache does not always know the
+ * size of metadata entries when it tries to read them. In
+ * such cases, it issues speculative reads that may be either
+ * smaller or larger than the actual size of the piece of
+ * metadata that is finally read.
+ *
+ * Since we are guaranteed that all metadata allocations larger
+ * that one page are page aligned, we can safely clip at the
+ * page boundary any non page aligned metadata read that crosses
+ * page boundaries.
+ *
+ * However, page aligned reads could wind up being either
+ * small or multi-page. This results in two scenarios that
+ * we must handle:
+ *
+ * a) A page aligned read of size less than one page
+ * turns out to be multi-page.
+ *
+ * In this case, the initial speculative read will
+ * result in a page load and insertion into the page
+ * buffer. This page must be evicted on the subsequent
+ * read of size greater than page size.
+ *
+ * In the context of VFD SWMR, it is also possible that
+ * that the multi-page metadata entry is already in the
+ * page buffer -- in which case the initial read should
+ * be satisfied from the multi-page page buffer entry.
+ *
+ * b) A page aligned, larger than one page read turns out
+ * to be small (less than one page).
+ *
+ * If there is already a page in the page buffer with
+ * same address, we can safely clip the original
+ * read to page size
+ *
+ * The above considerations resolve into the following cases:
+ *
+ * 6) If the read is for metadata and not page aligned, clip
+ * the read to the end of the current page if necessary.
+ * Load the relevant page if necessary and satisfy the
+ * read from the page buffer. Note that it there is an
+ * existing page, it must not be a multi-page metadata
+ * entry. If it is, flag an error.
+ *
+ * 7) If the read is for metadata, is page aligned, is larger
+ * than one page, and there is no entry in the page buffer,
+ * satisfy the read from the file
+ *
+ * 8) If the read is for metadata, is page aligned, is larger
+ * than one page, and there is a regular entry at the target
+ * page address, test to see if the last read was for the
+ * same address.
*
- * Programmer: Mohamad Chaarawi
+ * If was, evict the page, and satisfy the read from file.
+ * Flag an error if the page was dirty.
+ *
+ * If the last read was for a different page, clip the read
+ * to one page, and satisfy the read from the existing
+ * regular entry.
+ *
+ * 9) If the read is for metadata, is page aligned, is larger
+ * than one page, and there is a multi-page metadata entry
+ * at the target page address, test to see if
+ * pb_ptr->vfd_swmr_writer is TRUE.
+ *
+ * If it is, satisfy the read from the multi-page metadata
+ * entry, clipping the read if necessary.
+ *
+ * if pb_ptr->vfd_swmr_writer is FALSE, flag an error.
+ *
+ * 10) If the read is for metadata, is page aligned, is no
+ * larger than a page, test to see if the page buffer
+ * contains a page at the target address.
+ *
+ * If it doesn't, load the page and satisfy the read
+ * from it.
+ *
+ * If it contains a regular page entry, satisfy the read
+ * from it.
+ *
+ * If it contains a multipage metadata entry at the target
+ * address, satisfy the read from the multi-page metadata
+ * entry if pb_ptr->vfd_swmr_writer is TRUE, and flag an
+ * error otherwise.
+ *
+ * Observe that this function handles cases 1, 2, and 5
+ * directly, calls H5PB__read_raw() for cases 3 & 4, and
+ * dispatches to metadata_multipart_read() for cases 6, 7,
+ * 8, 9, and 10.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/11/18
+ *
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
+/* TBD Add optional raw-data bypass here and at H5PB_write when we
+ * are operating in parallel mode.
+ */
herr_t
-H5PB_dest(H5F_shared_t *f_sh)
+H5PB_read(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
+    void *buf/*out*/)
{
-    herr_t ret_value = SUCCEED; /* Return value */
+    H5PB_t *pb_ptr;                    /* Page buffer for this file */
+    herr_t ret_value = SUCCEED;        /* Return value */
    FUNC_ENTER_NOAPI(FAIL)
-    /* Sanity checks */
-    HDassert(f_sh);
+    hlog_fast(pbrd, "%s %p type %d %" PRIuHADDR " size %zu",
+        __func__, (void *)shared, type, addr, size);
-    /* flush and destroy the page buffer, if it exists */
-    if(f_sh->page_buf) {
-        H5PB_t *page_buf = f_sh->page_buf;
-        H5PB_ud1_t op_data; /* Iteration context */
+    pb_ptr = shared->pb_ptr;
+
+    /* every metadata access is counted in the size histogram, including
+     * accesses that take the bypass path below
+     */
+    if (pb_ptr != NULL && type != H5FD_MEM_DRAW)
+        H5PB_count_meta_access_by_size(pb_ptr, size);
-    if(H5PB_flush(f_sh) < 0)
-        HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTFLUSH, FAIL, "can't flush page buffer")
+    HDassert(pb_ptr == NULL || pb_ptr->magic == H5PB__H5PB_T_MAGIC);
-    /* Set up context info */
-    op_data.page_buf = page_buf;
+    /* Bypass the page buffer when:
+     *  1) the page buffer is disabled,
+     *  -) MPI I/O is enabled (unnumbered -- see the TBD note above),
+     *  2) the page buffer is configured for metadata only and this is
+     *     a raw data access, or
+     *  5) the page buffer is configured for raw data only and this is
+     *     a metadata access.
+     * (case numbers refer to the function header comment)
+     */
+    if (pb_ptr == NULL || H5F_SHARED_HAS_FEATURE(shared, H5FD_FEAT_HAS_MPI) ||
+        (H5FD_MEM_DRAW == type && pb_ptr->min_md_pages == pb_ptr->max_pages) ||
+        (H5FD_MEM_DRAW != type && pb_ptr->min_rd_pages == pb_ptr->max_pages)) {
-    /* Destroy the skip list containing all the entries in the PB */
-    op_data.actual_slist = TRUE;
-    if(H5SL_destroy(page_buf->slist_ptr, H5PB__dest_cb, &op_data))
-        HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTCLOSEOBJ, FAIL, "can't destroy page buffer skip list")
+        if (H5FD_read(shared->lf, type, addr, size, buf) < 0) {
+            HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL,
+                "read through lower VFD failed");
+        }
-    /* Destroy the skip list containing the new entries */
-    op_data.actual_slist = FALSE;
-    if(H5SL_destroy(page_buf->mf_slist_ptr, H5PB__dest_cb, &op_data))
-        HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTCLOSEOBJ, FAIL, "can't destroy page buffer skip list")
+        if (pb_ptr != NULL)
+            H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size);
+        HGOTO_DONE(SUCCEED);
+    }
-    /* Destroy the page factory */
-    if(H5FL_fac_term(page_buf->page_fac) < 0)
-        HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTRELEASE, FAIL, "can't destroy page buffer page factory")
+    if (H5FD_MEM_DRAW == type) { /* cases 3 and 4 */
+        if (H5PB__read_raw(shared, type, addr, size, buf) < 0)
+            HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "raw read failed");
+    } else if (metadata_multipart_read(shared, type, addr, size, buf) < 0)
+        HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "meta read failed");
-    f_sh->page_buf = H5FL_FREE(H5PB_t, page_buf);
-    } /* end if */
+    /* count this access -- not reached on the bypass path above */
+    H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
done:
    FUNC_LEAVE_NOAPI(ret_value)
-} /* H5PB_dest */
+}
+
+/* Remove the entry corresponding to lower-file page number `page`.
+ * Return 0 if there was no such entry or if the entry was removed
+ * successfully. Return -1 on error.
+ *
+ * If `only_mark` is true, then the corresponding shadow-index
+ * entry is not removed. Instead, it is marked as garbage. This is
+ * a stop-gap fix for a performance problem in H5PB_dest(): deleting
+ * all of the index entries took time quadratic in their number because
+ * this routine performs an O(n) copy of index entries.
+ */
+static int
+shadow_idx_entry_remove(H5F_shared_t *shared, uint64_t page, bool only_mark)
+{
+ ptrdiff_t i;
+ H5FD_vfd_swmr_idx_entry_t *entry;
+
+ entry = vfd_swmr_pageno_to_mdf_idx_entry(shared->mdf_idx,
+ shared->mdf_idx_entries_used, page, false);
+
+ if (entry == NULL)
+ return 0;
+
+ if (shared->vfd_swmr_writer && entry->md_file_page_offset != 0) {
+ if (shadow_image_defer_free(shared, entry) != 0)
+ return -1;
+ entry->md_file_page_offset = 0;
+ }
+
+ if (only_mark) {
+ entry->garbage = true;
+ return 0;
+ }
+
+ i = entry - shared->mdf_idx;
+
+ if (shared->mdf_idx_entries_used > i + 1) {
+ const size_t ntocopy =
+ (size_t)(shared->mdf_idx_entries_used - (i + 1));
+ memmove(&shared->mdf_idx[i],
+ &shared->mdf_idx[i + 1],
+ ntocopy * sizeof(shared->mdf_idx[i + 1]));
+ }
+ shared->mdf_idx_entries_used--;
+ return 0;
+}
/*-------------------------------------------------------------------------
- * Function: H5PB_add_new_page
*
- * Purpose: Add a new page to the new page skip list. This is called
- * from the MF layer when a new page is allocated to
- * indicate to the page buffer layer that a read of the page
- * from the file is not necessary since it's an empty page.
+ * Function: H5PB_remove_entry
+ *
+ * Purpose: Remove possible metadata entry with ADDR from the PB cache.
+ *
+ * This is in response to the data corruption bug from fheap.c
+ * with page buffering + page strategy.
+ *
+ * Note: Large metadata page bypasses the PB cache.
+ *
+ * Note: Update of raw data page (large or small sized) is
+ * handled by the PB cache.
*
* Return: Non-negative on success/Negative on failure
*
- * Programmer: Mohamad Chaarawi
+ * Programmer: Vailin Choi; Feb 2017
+ *
+ * Changes: Reworked function for re-implementation of the page buffer.
+ *
+ * In the context of VFD SWMR, it is possible that the
+ * discarded page or multi-page metadata entry has been
+ * modified during the current tick and/or is subject to a
+ * delayed write. We must detect this, and remove the entry
+ * from the tick list and/or delayed write list before it is
+ * evicted.
+ *
+ * Vailin: I think we need to do this for raw data as well.
+ *
+ * JRM -- 10/23/18
+ *
+ * We also need to evict modified pages from the page
+ * buffer in the VFD SWMR reader case to avoid
+ * message-from-the-past bugs. This function will serve for this for
+ * now, but for efficiency, we may want a version that takes
+ * a list of pages instead.
+ *
+ * JRM -- 12/30/18
*
*-------------------------------------------------------------------------
*/
herr_t
-H5PB_add_new_page(H5F_shared_t *f_sh, H5FD_mem_t type, haddr_t page_addr)
+H5PB_remove_entry(H5F_shared_t *shared, haddr_t addr)
{
- H5PB_t *page_buf; /* Page buffer to operate on */
- H5PB_entry_t *page_entry = NULL; /* Pointer to the corresponding page entry */
- herr_t ret_value = SUCCEED; /* Return value */
+ uint64_t page;
+ H5PB_t *pb_ptr;
+ H5PB_entry_t *entry_ptr = NULL;
+ herr_t ret_value = SUCCEED;
FUNC_ENTER_NOAPI(FAIL)
- /* Sanity checks */
- HDassert(f_sh);
- page_buf = f_sh->page_buf;
- HDassert(page_buf);
+ pb_ptr = shared->pb_ptr;
- /* If there is an existing page, this means that at some point the
- * file free space manager freed and re-allocated a page at the same
- * address. No need to do anything here then...
- */
- /* MSC - to be safe, might want to dig in the MF layer and remove
- * the page when it is freed from this list if it still exists and
- * remove this check
- */
- if(NULL == H5SL_search(page_buf->mf_slist_ptr, &(page_addr))) {
- /* Create the new PB entry */
- if(NULL == (page_entry = H5FL_CALLOC(H5PB_entry_t)))
- HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, "memory allocation failed")
-
- /* Initialize page fields */
- page_entry->addr = page_addr;
- page_entry->type = (H5F_mem_page_t)type;
- page_entry->is_dirty = FALSE;
-
- /* Insert entry in skip list */
- if(H5SL_insert(page_buf->mf_slist_ptr, page_entry, &(page_entry->addr)) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_BADVALUE, FAIL, "Can't insert entry in skip list")
- } /* end if */
+ /* Calculate the page offset */
+ page = (addr / pb_ptr->page_size);
+
+ HDassert(addr == page * pb_ptr->page_size);
+
+ /* Search for page in the hash table */
+ H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL)
+
+ if ( entry_ptr ) {
+
+ HDassert(entry_ptr->addr == addr);
+
+ /* A page or a metadata multi-page with vfd_swmr_writer (case 7) */
+ HDassert( (entry_ptr->size == pb_ptr->page_size) ||
+ (entry_ptr->size > pb_ptr->page_size &&
+ entry_ptr->mem_type != H5FD_MEM_DRAW &&
+ pb_ptr->vfd_swmr_writer) );
+
+ if ( entry_ptr->modified_this_tick ) {
+
+ H5PB__REMOVE_FROM_TL(pb_ptr, entry_ptr, FAIL);
+
+ entry_ptr->modified_this_tick = FALSE;
+ }
+
+ if ( entry_ptr->delay_write_until > 0 ) {
+
+ entry_ptr->delay_write_until = 0;
+
+ H5PB__REMOVE_FROM_DWL(pb_ptr, entry_ptr, FAIL)
+
+ if ( ! ( entry_ptr->is_mpmde ) ) {
+
+ H5PB__UPDATE_RP_FOR_INSERTION(pb_ptr, entry_ptr, FAIL);
+ }
+ }
+
+ /* if the entry is dirty, mark it clean before we evict */
+ if ( ( entry_ptr->is_dirty ) &&
+ ( H5PB__mark_entry_clean(pb_ptr, entry_ptr) < 0 ) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "mark entry clean failed")
+
+ if ( H5PB__evict_entry(shared, entry_ptr, TRUE, false) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "forced eviction failed")
+
+ assert(!shared->vfd_swmr_writer || vfd_swmr_pageno_to_mdf_idx_entry(shared->mdf_idx, shared->mdf_idx_entries_used, page, false) == NULL);
+ }
done:
- if(ret_value < 0)
- if(page_entry)
- page_entry = H5FL_FREE(H5PB_entry_t, page_entry);
FUNC_LEAVE_NOAPI(ret_value)
-} /* H5PB_add_new_page */
+
+} /* H5PB_remove_entry */
+
+herr_t
+H5PB_remove_entries(H5F_shared_t *shared, haddr_t addr, hsize_t size)
+{
+ H5PB_t *pb_ptr;
+ H5PB_entry_t *entry_ptr;
+ herr_t ret_value = SUCCEED;
+ metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}};
+ int i;
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ pb_ptr = shared->pb_ptr;
+
+ HDassert(addr % pb_ptr->page_size == 0);
+
+ if (size > pb_ptr->page_size) {
+ hlog_fast(pbrm,
+ "removing multipage region [%" PRIuHADDR ", %" PRIuHADDR ")",
+ addr, addr + size);
+ }
+
+ metadata_section_split(pb_ptr->page_size, addr, size, NULL, section);
+
+ for (i = 0; i < 3; i++) {
+ metadata_section_t *iter = &section[i];
+
+ if (iter->len == 0)
+ continue;
+
+ if (iter->len < size) {
+ hlog_fast(pbrm, "removing entry [%" PRIuHADDR ", %" PRIuHADDR ") "
+ "for split region [%" PRIuHADDR ", %" PRIuHADDR ")",
+ iter->addr, iter->addr + iter->len, addr, addr + size);
+ }
+
+ assert(iter->addr % pb_ptr->page_size == 0);
+
+ if (H5PB_remove_entry(shared, iter->addr) < 0)
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "forced eviction failed")
+ }
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
/*-------------------------------------------------------------------------
- * Function: H5PB_update_entry
*
- * Purpose: In PHDF5, entries that are written by other processes and just
- * marked clean by this process have to have their corresponding
- * pages updated if they exist in the page buffer.
- * This routine checks and update the pages.
+ * Function: H5PB_update_entry
*
- * Return: Non-negative on success/Negative on failure
+ * Purpose: In PHDF5, metadata cache entries that are written by other
+ * processes are simply marked clean in the current process.
+ * However, if the page buffer is enabled, entries marked
+ * clean must still be written to the page buffer so as to
+ * keep the contents of metadata pages consistent on all
+ * processes.
+ *
+ * Do this as follows:
+ *
+ * 1) Test to see if the page buffer is configured to accept
+ * metadata pages. If it isn't, return.
+ *
+ * 2) Test to see if the page buffer contains the page that
+ * contains the supplied metadata cache entry. If it
+ * doesn't, return.
+ *
+ * 3) Write the supplied buffer to page at the appropriate
+ * offset.
*
- * Programmer: Mohamad Chaarawi
+ * Note that at present, page buffering is disabled in the
+ * parallel case. Thus this function has not been tested.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/23/18
+ *
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
-herr_t
-H5PB_update_entry(H5PB_t *page_buf, haddr_t addr, size_t size, const void *buf)
+herr_t
+H5PB_update_entry(H5PB_t *pb_ptr, haddr_t addr, size_t size, const void *buf)
{
- H5PB_entry_t *page_entry; /* Pointer to the corresponding page entry */
+ uint64_t page;
+ size_t offset;
+ H5PB_entry_t *entry_ptr = NULL;
haddr_t page_addr;
+ herr_t ret_value = SUCCEED; /* Return value */
- FUNC_ENTER_NOAPI_NOERR
+ FUNC_ENTER_NOAPI(FAIL)
/* Sanity checks */
- HDassert(page_buf);
- HDassert(size <= page_buf->page_size);
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(size > 0);
+ HDassert(size <= pb_ptr->page_size);
HDassert(buf);
- /* calculate the aligned address of the first page */
- page_addr = (addr / page_buf->page_size) * page_buf->page_size;
+ if ( pb_ptr->min_rd_pages < pb_ptr->max_pages ) {
- /* search for the page and update if found */
- page_entry = (H5PB_entry_t *)H5SL_search(page_buf->slist_ptr, (void *)(&page_addr));
- if(page_entry) {
- haddr_t offset;
+ /* page buffer is configured to accept metadata pages */
- HDassert(addr + size <= page_addr + page_buf->page_size);
- offset = addr - page_addr;
- H5MM_memcpy((uint8_t *)page_entry->page_buf_ptr + offset, buf, size);
+ /* Calculate the aligned address of the containing page */
+ page = (addr / pb_ptr->page_size);
+ page_addr = page * pb_ptr->page_size;
- /* move to top of LRU list */
- H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry)
- } /* end if */
+ H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL)
+
+ if ( entry_ptr ) {
+
+ HDassert( entry_ptr->is_metadata );
+ HDassert( ! (entry_ptr->is_mpmde) );
+ HDassert(addr + size <= page_addr + pb_ptr->page_size);
+
+ offset = addr - page_addr;
+
+ HDmemcpy(((uint8_t *)(entry_ptr->image_ptr) + offset),
+ buf, size);
+
+ /* should we mark the page dirty? If so, replace the following
+ * with a call to H5PB__mark_entry_dirty()
+ */
+ H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL)
+ }
+ }
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
- FUNC_LEAVE_NOAPI(SUCCEED)
} /* H5PB_update_entry */
/*-------------------------------------------------------------------------
- * Function: H5PB_remove_entry
*
- * Purpose: Remove possible metadata entry with ADDR from the PB cache.
- * This is in response to the data corruption bug from fheap.c
- * with page buffering + page strategy.
- * Note: Large metadata page bypasses the PB cache.
- * Note: Update of raw data page (large or small sized) is handled by the PB cache.
+ * Function: H5PB_vfd_swmr__release_delayed_writes
*
- * Return: Non-negative on success/Negative on failure
+ * Purpose: After the tick list has been released, and before the
+ * beginning of the next tick, we must scan the delayed
+ * write list, and release those entries whose delays have
+ * expired.
*
- * Programmer: Vailin Choi; Feb 2017
+ * Note that pages of metadata, and multi-page metadata entries
+ * are handled differently.
+ *
+ * Regular pages are removed from the delayed write list and
+ * inserted in the replacement policy
+ *
+ * In contrast, multi-page metadata entries are simply
+ * flushed and evicted.
+ *
+ * Since the delayed write list is sorted in decreasing
+ * delay_write_until order, we start our scan at the bottom
+ * of the delayed write list and continue upwards until no
+ * expired entries remain.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 11/15/18
+ *
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
-herr_t
-H5PB_remove_entry(const H5F_shared_t *f_sh, haddr_t addr)
+herr_t
+H5PB_vfd_swmr__release_delayed_writes(H5F_shared_t *shared)
{
- H5PB_t *page_buf; /* Page buffer to operate on */
- H5PB_entry_t *page_entry = NULL; /* Pointer to the page entry being searched */
+ H5PB_t * pb_ptr = NULL;
+ H5PB_entry_t *entry_ptr = NULL;
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(FAIL)
/* Sanity checks */
- HDassert(f_sh);
- page_buf = f_sh->page_buf;
- HDassert(page_buf);
+ HDassert(shared);
+ HDassert(shared->vfd_swmr);
+ HDassert(shared->vfd_swmr_writer);
- /* Search for address in the skip list */
- page_entry = (H5PB_entry_t *)H5SL_search(page_buf->slist_ptr, (void *)(&addr));
+ pb_ptr = shared->pb_ptr;
- /* If found, remove the entry from the PB cache */
- if(page_entry) {
- HDassert(page_entry->type != H5F_MEM_PAGE_DRAW);
- if(NULL == H5SL_remove(page_buf->slist_ptr, &(page_entry->addr)))
- HGOTO_ERROR(H5E_CACHE, H5E_BADVALUE, FAIL, "Page Entry is not in skip list")
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(pb_ptr->vfd_swmr_writer);
- /* Remove from LRU list */
- H5PB__REMOVE_LRU(page_buf, page_entry)
- HDassert(H5SL_count(page_buf->slist_ptr) == page_buf->LRU_list_len);
+ while (pb_ptr->dwl_tail_ptr &&
+ pb_ptr->dwl_tail_ptr->delay_write_until <= shared->tick_num) {
- page_buf->meta_count--;
+ entry_ptr = pb_ptr->dwl_tail_ptr;
- page_entry->page_buf_ptr = H5FL_FAC_FREE(page_buf->page_fac, page_entry->page_buf_ptr);
- page_entry = H5FL_FREE(H5PB_entry_t, page_entry);
- } /* end if */
+ HDassert(entry_ptr->is_dirty);
+
+ entry_ptr->delay_write_until = 0;
+
+ H5PB__REMOVE_FROM_DWL(pb_ptr, entry_ptr, FAIL)
+
+ if ( entry_ptr->is_mpmde ) { /* flush and evict now */
+
+ if ( H5PB__flush_entry(shared, pb_ptr, entry_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+ "flush of mpmde failed")
+
+ if ( H5PB__evict_entry(shared, entry_ptr, TRUE, false) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "eviction of mpmde failed")
+
+ } else { /* insert it in the replacement policy */
+
+ H5PB__UPDATE_RP_FOR_INSERT_APPEND(pb_ptr, entry_ptr, FAIL)
+ }
+ }
done:
+
FUNC_LEAVE_NOAPI(ret_value)
-} /* H5PB_remove_entry */
+
+} /* H5PB_vfd_swmr__release_delayed_writes() */
/*-------------------------------------------------------------------------
- * Function: H5PB_read
*
- * Purpose: Reads in the data from the page containing it if it exists
- * in the PB cache; otherwise reads in the page through the VFD.
+ * Function: H5PB_vfd_swmr__release_tick_list
*
- * Return: Non-negative on success/Negative on failure
+ * Purpose: After the metadata file has been updated, and before the
+ * beginning of the next tick, we must release the tick list.
+ *
+ * This function performs that release.
+ *
+ * In passing, flush and evict any multi-page metadata entries
+ * that are not subject to a delayed write.
*
- * Programmer: Mohamad Chaarawi
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 11/12/18
+ *
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
-herr_t
-H5PB_read(H5F_shared_t *f_sh, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*out*/)
+herr_t
+H5PB_vfd_swmr__release_tick_list(H5F_shared_t *shared)
{
- H5PB_t *page_buf; /* Page buffering info for this file */
- H5PB_entry_t *page_entry; /* Pointer to the corresponding page entry */
- H5FD_t *file; /* File driver pointer */
- haddr_t first_page_addr, last_page_addr; /* Addresses of the first and last pages covered by I/O */
- haddr_t offset;
- haddr_t search_addr; /* Address of current page */
- hsize_t num_touched_pages; /* Number of pages accessed */
- size_t access_size;
- hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */
- hsize_t i; /* Local index variable */
+ H5PB_t * pb_ptr = NULL;
+ H5PB_entry_t *entry_ptr = NULL;
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(FAIL)
/* Sanity checks */
- HDassert(f_sh);
- HDassert(type != H5FD_MEM_GHEAP);
+ HDassert(shared);
+ HDassert(shared->vfd_swmr);
+ HDassert(shared->vfd_swmr_writer);
- /* Get pointer to page buffer info for this file */
- page_buf = f_sh->page_buf;
+ pb_ptr = shared->pb_ptr;
-#ifdef H5_HAVE_PARALLEL
- if(H5F_SHARED_HAS_FEATURE(f_sh, H5FD_FEAT_HAS_MPI)) {
-#if 1
- bypass_pb = TRUE;
-#else
- /* MSC - why this stopped working ? */
- int mpi_size;
-
- if((mpi_size = H5F_shared_mpi_get_size(f_sh)) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "can't retrieve MPI communicator size")
- if(1 != mpi_size)
- bypass_pb = TRUE;
-#endif
- } /* end if */
-#endif
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(pb_ptr->vfd_swmr_writer);
- /* If page buffering is disabled, or the I/O size is larger than that of a
- * single page, or if this is a parallel raw data access, bypass page
- * buffering.
- */
- if(NULL == page_buf || size >= page_buf->page_size ||
- (bypass_pb && H5FD_MEM_DRAW == type)) {
- if(H5F__accum_read(f_sh, type, addr, size, buf) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "read through metadata accumulator failed")
+ /* remove all entries from the tick list */
+ while ( pb_ptr->tl_head_ptr ) {
- /* Update statistics */
- if(page_buf) {
- if(type == H5FD_MEM_DRAW)
- page_buf->bypasses[1] ++;
- else
- page_buf->bypasses[0] ++;
- } /* end if */
-
- /* If page buffering is disabled, or if this is a large metadata access,
- * or if this is parallel raw data access, we are done here
- */
- if(NULL == page_buf || (size >= page_buf->page_size && H5FD_MEM_DRAW != type) ||
- (bypass_pb && H5FD_MEM_DRAW == type))
- HGOTO_DONE(SUCCEED)
- } /* end if */
+ entry_ptr = pb_ptr->tl_head_ptr;
- /* Update statistics */
- if(page_buf) {
- if(type == H5FD_MEM_DRAW)
- page_buf->accesses[1]++;
- else
- page_buf->accesses[0]++;
- } /* end if */
+ H5PB__REMOVE_FROM_TL(pb_ptr, entry_ptr, FAIL)
- /* Calculate the aligned address of the first page */
- first_page_addr = (addr / page_buf->page_size) * page_buf->page_size;
+ entry_ptr->modified_this_tick = FALSE;
- /* For Raw data calculate the aligned address of the last page and
- * the number of pages accessed if more than 1 page is accessed
- */
- if(H5FD_MEM_DRAW == type) {
- last_page_addr = ((addr + size - 1) / page_buf->page_size) * page_buf->page_size;
-
- /* How many pages does this write span */
- num_touched_pages = (last_page_addr / page_buf->page_size + 1) -
- (first_page_addr / page_buf->page_size);
- if(first_page_addr == last_page_addr) {
- HDassert(1 == num_touched_pages);
- last_page_addr = HADDR_UNDEF;
- } /* end if */
- } /* end if */
- /* Otherwise set last page addr to HADDR_UNDEF */
- else {
- num_touched_pages = 1;
- last_page_addr = HADDR_UNDEF;
- } /* end else */
+ if ( entry_ptr->is_mpmde ) {
+
+ HDassert(entry_ptr->is_dirty);
+
+ if ( entry_ptr->delay_write_until == 0 ) {
+
+ /* flush and evict the multi-page metadata entry immediately */
+ if ( H5PB__flush_entry(shared, pb_ptr, entry_ptr) < 0 )
- /* Translate to file driver I/O info object */
- file = f_sh->lf;
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+ "flush of mpmde failed")
- /* Copy raw data from dirty pages into the read buffer if the read
- request spans pages in the page buffer*/
- if(H5FD_MEM_DRAW == type && size >= page_buf->page_size) {
- H5SL_node_t *node;
+ if ( H5PB__evict_entry(shared, entry_ptr, TRUE, false) < 0 )
- /* For each touched page in the page buffer, check if it
- * exists in the page Buffer and is dirty. If it does, we
- * update the buffer with what's in the page so we get the up
- * to date data into the buffer after the big read from the file.
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "eviction of mpmde failed")
+ }
+ }
+ /* if the entry is not a multi-page metadata entry, it must already
+ * be on either the replacement policy or the delayed write list.
+ * In either case, it will be flushed when possible and necessary.
*/
- node = H5SL_find(page_buf->slist_ptr, (void *)(&first_page_addr));
- for(i = 0; i < num_touched_pages; i++) {
- search_addr = i*page_buf->page_size + first_page_addr;
+ }
+
+ HDassert(pb_ptr->tl_head_ptr == NULL);
+ HDassert(pb_ptr->tl_tail_ptr == NULL);
+ HDassert(pb_ptr->tl_len == 0);
+ HDassert(pb_ptr->tl_size == 0);
- /* if we still haven't located a starting page, search again */
- if(!node && i!=0)
- node = H5SL_find(page_buf->slist_ptr, (void *)(&search_addr));
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
- /* if the current page is in the Page Buffer, do the updates */
- if(node) {
- page_entry = (H5PB_entry_t *)H5SL_item(node);
+} /* H5PB_vfd_swmr__release_tick_list */
- HDassert(page_entry);
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB_vfd_swmr__set_tick
+ *
+ * Purpose: At the beginning of each tick, the page buffer must be told
+ * to synchronize its copy of the current tick with that of
+ * the file to which the page buffer belongs.
+ *
+ * This function performs that synchronization.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 11/20/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5PB_vfd_swmr__set_tick(H5F_shared_t *shared)
+{
+ H5PB_t * pb_ptr = NULL;
+ herr_t ret_value = SUCCEED; /* Return value */
- /* If the current page address falls out of the access
- block, then there are no more pages to go over */
- if(page_entry->addr >= addr + size)
- break;
+ FUNC_ENTER_NOAPI(FAIL)
- HDassert(page_entry->addr == search_addr);
+ /* Sanity checks */
+ HDassert(shared);
+ HDassert(shared->vfd_swmr);
+ HDassert(shared->vfd_swmr_writer);
- if(page_entry->is_dirty) {
- /* special handling for the first page if it is not a full page access */
- if(i == 0 && first_page_addr != addr) {
- offset = addr - first_page_addr;
- HDassert(page_buf->page_size > offset);
+ pb_ptr = shared->pb_ptr;
- H5MM_memcpy(buf, (uint8_t *)page_entry->page_buf_ptr + offset,
- page_buf->page_size - (size_t)offset);
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(pb_ptr->vfd_swmr_writer);
- /* move to top of LRU list */
- H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry)
- } /* end if */
- /* special handling for the last page if it is not a full page access */
- else if(num_touched_pages > 1 && i == num_touched_pages-1 && search_addr < addr+size) {
- offset = (num_touched_pages-2)*page_buf->page_size +
- (page_buf->page_size - (addr - first_page_addr));
+ /* the tick must always increase by 1 -- verify this */
+ if ( shared->tick_num != pb_ptr->cur_tick + 1 )
- H5MM_memcpy((uint8_t *)buf + offset, page_entry->page_buf_ptr,
- (size_t)((addr + size) - last_page_addr));
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "shared->tick_num (%" PRIu64 ") != (%" PRIu64 ") pb_ptr->cur_tick + 1 ?!?!", shared->tick_num, pb_ptr->cur_tick)
- /* move to top of LRU list */
- H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry)
- } /* end else-if */
- /* copy the entire fully accessed pages */
- else {
- offset = i*page_buf->page_size;
-
- H5MM_memcpy((uint8_t *)buf+(i*page_buf->page_size) , page_entry->page_buf_ptr,
- page_buf->page_size);
- } /* end else */
- } /* end if */
- node = H5SL_next(node);
- } /* end if */
- } /* end for */
- } /* end if */
- else {
- /* A raw data access could span 1 or 2 PB entries at this point so
- we need to handle that */
- HDassert(1 == num_touched_pages || 2 == num_touched_pages);
- for(i = 0 ; i < num_touched_pages; i++) {
- haddr_t buf_offset;
-
- /* Calculate the aligned address of the page to search for it in the skip list */
- search_addr = (0==i ? first_page_addr : last_page_addr);
-
- /* Calculate the access size if the access spans more than 1 page */
- if(1 == num_touched_pages)
- access_size = size;
- else
- access_size = (0 == i ? (size_t)((first_page_addr + page_buf->page_size) - addr) : (size - access_size));
-
- /* Lookup the page in the skip list */
- page_entry = (H5PB_entry_t *)H5SL_search(page_buf->slist_ptr, (void *)(&search_addr));
-
- /* if found */
- if(page_entry) {
- offset = (0 == i ? addr - page_entry->addr : 0);
- buf_offset = (0 == i ? 0 : size - access_size);
-
- /* copy the requested data from the page into the input buffer */
- H5MM_memcpy((uint8_t *)buf + buf_offset, (uint8_t *)page_entry->page_buf_ptr + offset, access_size);
-
- /* Update LRU */
- H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry)
-
- /* Update statistics */
- if(type == H5FD_MEM_DRAW)
- page_buf->hits[1]++;
- else
- page_buf->hits[0]++;
- } /* end if */
- /* if not found */
- else {
- void *new_page_buf = NULL;
- size_t page_size = page_buf->page_size;
- haddr_t eoa;
-
- /* make space for new entry */
- if((H5SL_count(page_buf->slist_ptr) * page_buf->page_size) >= page_buf->max_size) {
- htri_t can_make_space;
-
- /* check if we can make space in page buffer */
- if((can_make_space = H5PB__make_space(f_sh, page_buf, type)) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, "make space in Page buffer Failed")
-
- /* if make_space returns 0, then we can't use the page
- buffer for this I/O and we need to bypass */
- if(0 == can_make_space) {
- /* make space can't return FALSE on second touched page since the first is of the same type */
- HDassert(0 == i);
-
- /* read entire block from VFD and return */
- if(H5FD_read(file, type, addr, size, buf) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "driver read request failed")
-
- /* Break out of loop */
- break;
- } /* end if */
- } /* end if */
-
- /* Read page from VFD */
- if(NULL == (new_page_buf = H5FL_FAC_MALLOC(page_buf->page_fac)))
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTALLOC, FAIL, "memory allocation failed for page buffer entry")
-
- /* Read page through the VFD layer, but make sure we don't read past the EOA. */
-
- /* Retrieve the 'eoa' for the file */
- if(HADDR_UNDEF == (eoa = H5F_shared_get_eoa(f_sh, type)))
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "driver get_eoa request failed")
-
- /* If the entire page falls outside the EOA, then fail */
- if(search_addr > eoa)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_BADVALUE, FAIL, "reading an entire page that is outside the file EOA")
-
- /* Adjust the read size to not go beyond the EOA */
- if(search_addr + page_size > eoa)
- page_size = (size_t)(eoa - search_addr);
-
- /* Read page from VFD */
- if(H5FD_read(file, type, search_addr, page_size, new_page_buf) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "driver read request failed")
-
- /* Copy the requested data from the page into the input buffer */
- offset = (0 == i ? addr - search_addr : 0);
- buf_offset = (0 == i ? 0 : size - access_size);
- H5MM_memcpy((uint8_t *)buf + buf_offset, (uint8_t *)new_page_buf + offset, access_size);
-
- /* Create the new PB entry */
- if(NULL == (page_entry = H5FL_CALLOC(H5PB_entry_t)))
- HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, "memory allocation failed")
-
- page_entry->page_buf_ptr = new_page_buf;
- page_entry->addr = search_addr;
- page_entry->type = (H5F_mem_page_t)type;
- page_entry->is_dirty = FALSE;
-
- /* Insert page into PB */
- if(H5PB__insert_entry(page_buf, page_entry) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTSET, FAIL, "error inserting new page in page buffer")
-
- /* Update statistics */
- if(type == H5FD_MEM_DRAW)
- page_buf->misses[1]++;
- else
- page_buf->misses[0]++;
- } /* end else */
- } /* end for */
- } /* end else */
+ pb_ptr->cur_tick = shared->tick_num;
done:
+
FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5PB_read() */
+
+} /* H5PB_vfd_swmr__set_tick */
/*-------------------------------------------------------------------------
- * Function: H5PB_write
*
- * Purpose: Write data into the Page Buffer. If the page exists in the
- * cache, update it; otherwise read it from disk, update it, and
- * insert into cache.
+ * Function: H5PB_vfd_swmr__update_index
*
- * Return: Non-negative on success/Negative on failure
+ * Purpose: In the VFD SWMR writer, all metadata writes to the page
+ * buffer during a tick are buffered in the page buffer in
+ * the tick list. Further, the metadata cache is flushed
+ * to the page buffer at the end of the tick so that all
+ * metadata changes during the tick are reflected in the
+ * tick list.
+ *
+ * Once this is done, the internal representation of the
+ * metadata file index must be updated from the tick list
+ * so that the metadata file can be updated, and the tick
+ * list can be emptied and prepared to buffer metadata changes
+ * in the next tick.
+ *
+ * This function is called to accomplish this. Its cycle of
+ * operation is as follows:
+ *
+ * 1) Scan the tick list. For each entry (*entry), test
+ * to see if it appears in the index.
+ *
+ * If it does the entry must have been modified in the
+ * past tick. Update the index entry (*ie_ptr) as follows:
+ *
+ * a) Set ie_ptr->entry_ptr = entry->image_ptr. This
+ * is needed to give the metadata file update code
+ * access to the image of the target page or multi-page
+ * multi-date entry. Note that ie_ptr->entry_ptr will
+ * be set to NULL as soon as the metadata file is updated,
+ * so the buffer pointed to by entry->image_ptr can
+ * be safely discarded at any time after the metadata
+ * file update.
+ *
+ * b) Set ie_ptr->tick_of_last_change to the current tick.
+ *
+ * c) If entry->is_dirty, set ie_ptr->clean to FALSE.
+ * If entry->is_dirty is FALSE, set ie_ptr->clean
+ * to TRUE and set ie_ptr->tick_of_last_flush to the
+ * current tick.
+ *
+ * If the tick list entry (*entry) doesn't appear in
+ * the index, allocate a metadata file index entry (*ie_ptr),
+ * and initialize it as follows:
+ *
+ * ie_ptr->hdf5_page_offset = entry->page
+ * ie_ptr->length = entry->size
+ * ie_ptr->delayed_flush = entry->delay_write_until
+ *
+ * and then update the new entry as per the existing entry
+ * case described above.
+ *
+ * 2) Scan the internal representation of the metadata file
+ * index for entries that do not appear in the tick list.
+ * For each such entry (*ie_ptr), proceed as follows:
+ *
+ * 1) If ie_ptr->clean, we are done -- proceed to the
+ * next index entry that doesn't appear in the tick list.
+ *
+ * 2) Test to see if the cognate entry appears in the page
+ * buffer. If it doesn't, it must have been flushed and
+ * evicted in the past tick. Set
+ *
+ * ie_ptr->clean = TRUE, and
+ *
+ * ie_ptr->tick_of_last_flush = current tick
+ *
+ * and proceed to the next index entry that doesn't
+ * appear in the tick list.
+ *
+ * 3) If the cognate entry does appear in the page buffer
+ * and is clean, proceed as per 2) above.
+ *
+ * 4) In all other cases, do nothing, and proceed to the
+ * next index entry that does not appear in the tick list.
+ *
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 11/9/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5PB_vfd_swmr__update_index(H5F_t *f,
+ uint32_t * idx_ent_added_ptr,
+ uint32_t * idx_ent_modified_ptr,
+ uint32_t * idx_ent_not_in_tl_ptr,
+ uint32_t * idx_ent_not_in_tl_flushed_ptr)
+{
+ H5F_shared_t * const shared = f->shared;
+ const uint64_t tick_num = shared->tick_num;
+ uint32_t i;
+ uint32_t idx_ent_added = 0;
+ uint32_t idx_ent_modified = 0;
+ uint32_t idx_ent_not_in_tl = 0;
+ uint32_t idx_ent_not_in_tl_flushed = 0;
+ H5PB_t * pb_ptr = NULL;
+ H5PB_entry_t *entry;
+ H5FD_vfd_swmr_idx_entry_t * ie_ptr = NULL;
+ H5FD_vfd_swmr_idx_entry_t * idx = NULL;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ HDassert(shared->vfd_swmr);
+ HDassert(shared->vfd_swmr_writer);
+
+ idx = shared->mdf_idx;
+
+ HDassert(idx);
+
+ pb_ptr = shared->pb_ptr;
+
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(pb_ptr->vfd_swmr_writer);
+
+ HDassert(idx_ent_added_ptr);
+ HDassert(idx_ent_modified_ptr);
+ HDassert(idx_ent_not_in_tl_ptr);
+ HDassert(idx_ent_not_in_tl_flushed_ptr);
+
+ /* scan the tick list and insert or update metadata file index entries
+ * as appropriate.
+ */
+
+ for (entry = pb_ptr->tl_head_ptr; entry != NULL; entry = entry->tl_next) {
+ uint64_t target_page = entry->page;
+
+ HDassert(entry->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+
+ /* see if the shadow index already contains an entry for *entry. */
+
+ ie_ptr = vfd_swmr_pageno_to_mdf_idx_entry(idx,
+ shared->mdf_idx_entries_used, target_page, false);
+
+ if ( ie_ptr == NULL ) { /* alloc new entry in the metadata file index*/
+ uint32_t new_index_entry_index;
+
+ new_index_entry_index = shared->mdf_idx_entries_used +
+ idx_ent_added++;
+
+ if (new_index_entry_index >= shared->mdf_idx_len &&
+ (idx = vfd_swmr_enlarge_shadow_index(f)) == NULL) {
+ /* Library code must not abort the host application --
+ * report the failure to the caller rather than calling
+ * exit(). The caller can decide how fatal this is.
+ */
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, \
+ "max metadata file index length exceeded and index cannot be enlarged")
+ }
+
+ ie_ptr = idx + new_index_entry_index;
+
+ /* partial initialization of new entry -- rest done later */
+ ie_ptr->hdf5_page_offset = target_page;
+ ie_ptr->md_file_page_offset = 0; /* undefined at this point */
+ ie_ptr->chksum = 0; /* undefined at this point */
+ /* ie_ptr->entry_ptr initialized below */
+ /* ie_ptr->tick_of_last_change initialized below */
+ /* ie_ptr->clean initialized below */
+ /* ie_ptr->tick_of_last_flush initialized below */
+ ie_ptr->delayed_flush = entry->delay_write_until;
+ ie_ptr->moved_to_lower_file = false;
+ ie_ptr->garbage = false;
+ ie_ptr->length = (uint32_t)entry->size;
+
+ } else {
+ /* If entry->size changed, discard the too-small (too-big?)
+ * shadow region and set the shadow-file page number to 0
+ * so that H5F_update_vfd_swmr_metadata_file() will
+ * allocate a new one.
+ */
+ if (ie_ptr->length != (uint32_t)entry->size) {
+ int ret;
+
+ ret = shadow_image_defer_free(shared, ie_ptr);
+ HDassert(ret == 0);
+
+ ie_ptr->md_file_page_offset = 0;
+ ie_ptr->length = (uint32_t)entry->size;
+ }
+
+ idx_ent_modified++;
+ }
+
+ ie_ptr->entry_ptr = entry->image_ptr;
+ ie_ptr->tick_of_last_change = tick_num;
+ /* tick list entries are dirty by construction -- use HDassert for
+ * consistency with the rest of the library.
+ */
+ HDassert(entry->is_dirty);
+ ie_ptr->clean = false;
+ ie_ptr->tick_of_last_flush = 0;
+ }
+
+ /* scan the metadata file index for entries that don't appear in the
+ * tick list. If the index entry is dirty, and either doesn't appear
+ * in the page buffer, or is clean in the page buffer, mark the index
+ * entry clean and as having been flushed in the current tick.
+ */
+ for ( i = 0; i < shared->mdf_idx_entries_used; i++ ) {
+
+ HDassert(i == 0 ||
+ idx[i - 1].hdf5_page_offset < idx[i].hdf5_page_offset);
+
+ ie_ptr = idx + i;
+
+ if (ie_ptr->tick_of_last_change == tick_num)
+ continue;
+
+ idx_ent_not_in_tl++;
+
+ if (ie_ptr->clean)
+ continue;
+
+ H5PB__SEARCH_INDEX(pb_ptr, ie_ptr->hdf5_page_offset, entry, FAIL);
+
+ if (entry == NULL || !entry->is_dirty) {
+ hlog_fast(shadow_index_reclaim,
+ "Marking shadow index slot %" PRIu32 " clean at tick %" PRIu64,
+ i, tick_num);
+ idx_ent_not_in_tl_flushed++;
+ ie_ptr->clean = TRUE;
+ ie_ptr->tick_of_last_flush = tick_num;
+ }
+ }
+
+ HDassert(idx_ent_modified + idx_ent_not_in_tl ==
+ shared->mdf_idx_entries_used);
+
+ HDassert(idx_ent_modified + idx_ent_not_in_tl + idx_ent_added <=
+ shared->mdf_idx_len);
+
+ *idx_ent_added_ptr = idx_ent_added;
+ *idx_ent_modified_ptr = idx_ent_modified;
+ *idx_ent_not_in_tl_ptr = idx_ent_not_in_tl;
+ *idx_ent_not_in_tl_flushed_ptr = idx_ent_not_in_tl_flushed;
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+}
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB_write
*
- * Programmer: Mohamad Chaarawi
+ * Purpose: Write data into the Page Buffer if practical, and to file
+ * otherwise. Specifically:
+ *
+ * 1) If the page buffer is disabled, simply write to the
+ * HDF5 file and return.
+ *
+ * 2) If the write is raw data, and the page buffer is
+ * configured for metadata only (i.e. min_md_pages ==
+ * max_pages), simply write to the HDF5 file and return.
+ *
+ * 3) If the write is raw data, and it is of page size or
+ * larger, write directly to the HDF5 file.
+ *
+ * It is possible that the write intersects one or more
+ * pages in the page buffer -- test for this and update
+ * any partially written pages, and evict any pages
+ * that are completely overwritten.
+ *
+ * Note that no pages are inserted into the page buffer in
+ * this case.
+ *
+ * 4) If the write is of raw data, and it is of size less
+ * than the page size, write the page into the page
+ * buffer, loading and inserting pages into the
+ * page buffer as necessary
+ *
+ * 5) If the write is of metadata, and the page buffer is
+ * configured for raw data only (i.e. min_rd_pages ==
+ * max_pages), simply write to the HDF5 file and return.
+ *
+ * 6) If the write is of metadata, the write is larger than
+ * one page, and vfd_swmr_writer is FALSE, simply write
+ * to the HDF5 file. There is no need to check the
+ * page buffer, as metadata is always written atomically,
+ * and entries of this size are not buffered in the page
+ * buffer.
+ *
+ * 7) If the write is of metadata, the write is larger than
+ * one page, and vfd_swmr_writer is TRUE, the write must
+ * be buffered in the page buffer until the end of the tick.
+ *
+ * If it doesn't exist already, create a multi-page metadata
+ * entry in the page buffer and copy the write into it.
+ * Insert the new entry in the tick list if necessary.
+ *
+ * Test to see if the write of the multi-page metadata
+ * entry must be delayed. If so, place the entry in
+ * the delayed write list. Otherwise, the multi-page
+ * metadata entry will be written to the HDF5 file and
+ * evicted when the tick list is released at the end of
+ * the tick.
+ *
+ *
+ * 8) If the write is of metadata, and the write is of size
+ * less than or equal to the page size, write the data
+ * into the page buffer, loading and inserting a page
+ * if necessary.
+ *
+ * If, in addition, vfd_swmr_writer is TRUE, add the page
+ * touched by the write to the tick list.
+ *
+ * Observe that this function handles cases 1, 2, 5, and 6
+ * directly, calls H5PB__write_raw() for cases 3 & 4, and
+ * calls metadata_multipart_write() for cases 7 and 8.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/11/18
+ *
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
herr_t
-H5PB_write(H5F_shared_t *f_sh, H5FD_mem_t type, haddr_t addr,
- size_t size, const void *buf)
+H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
+ const void *buf)
{
- H5PB_t *page_buf; /* Page buffering info for this file */
- H5PB_entry_t *page_entry; /* Pointer to the corresponding page entry */
- H5FD_t *file; /* File driver pointer */
- haddr_t first_page_addr, last_page_addr; /* Addresses of the first and last pages covered by I/O */
- haddr_t offset;
- haddr_t search_addr; /* Address of current page */
- hsize_t num_touched_pages; /* Number of pages accessed */
- size_t access_size;
+ H5PB_t *pb_ptr; /* Page buffer for this file */
hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */
- hsize_t i; /* Local index variable */
- herr_t ret_value = SUCCEED; /* Return value */
+ herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(FAIL)
- /* Sanity checks */
- HDassert(f_sh);
+ hlog_fast(pbwr, "%s %p type %d addr %" PRIuHADDR " size %zu",
+ __func__, (void *)shared, type, addr, size);
+
+ pb_ptr = shared->pb_ptr;
+
+ /* metadata access-size statistics are kept even for bypassed writes */
+ if (pb_ptr != NULL && type != H5FD_MEM_DRAW)
+ H5PB_count_meta_access_by_size(pb_ptr, size);
+
+ if ( pb_ptr == NULL ) {
+
+ bypass_pb = TRUE; /* case 1) -- page buffer is disabled */
+
+ } else {
+
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+
+ if ( H5FD_MEM_DRAW == type ) { /* raw data write */
- /* Get pointer to page buffer info for this file */
- page_buf = f_sh->page_buf;
+ if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) {
+
+ /* case 2) -- page buffer configured for metadata only */
+ bypass_pb = TRUE;
+
+ }
+ } else { /* metadata write */
+
+ if ( pb_ptr->min_rd_pages == pb_ptr->max_pages ) {
+
+ /* case 5) -- page buffer configured for raw data only */
+ bypass_pb = TRUE;
+
+ } else if ( ( size >= pb_ptr->page_size ) &&
+ ( ! ( pb_ptr->vfd_swmr_writer ) ) ) {
+
+ /* case 6) -- md write larger than one page and
+ * pb_ptr->vfd_swmr_writer is FALSE.
+ */
+ bypass_pb = TRUE;
+ }
+ }
+ }
#ifdef H5_HAVE_PARALLEL
- if(H5F_SHARED_HAS_FEATURE(f_sh, H5FD_FEAT_HAS_MPI)) {
-#if 1
+ /* at present, the page buffer must be disabled in the parallel case.
+ * However, just in case ...
+ */
+ if(H5F_SHARED_HAS_FEATURE(shared, H5FD_FEAT_HAS_MPI)) {
+
bypass_pb = TRUE;
-#else
- /* MSC - why this stopped working ? */
- int mpi_size;
-
- if((mpi_size = H5F_shared_mpi_get_size(f_sh)) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "can't retrieve MPI communicator size")
- if(1 != mpi_size)
- bypass_pb = TRUE;
-#endif
+
} /* end if */
-#endif
+#endif /* H5_HAVE_PARALLEL */
- /* If page buffering is disabled, or the I/O size is larger than that of a
- * single page, or if this is a parallel raw data access, bypass page
- * buffering.
- */
- if(NULL == page_buf || size >= page_buf->page_size || bypass_pb) {
- if(H5F__accum_write(f_sh, type, addr, size, buf) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, "write through metadata accumulator failed")
+ if ( bypass_pb ) { /* cases 1, 2, 5, and 6 */
+
+ if ( H5FD_write(shared->lf, type, addr, size, buf) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL,
+ "write through lower VFD failed")
/* Update statistics */
- if(page_buf) {
- if(type == H5FD_MEM_DRAW || type == H5FD_MEM_GHEAP)
- page_buf->bypasses[1]++;
- else
- page_buf->bypasses[0]++;
- } /* end if */
-
- /* If page buffering is disabled, or if this is a large metadata access,
- * or if this is a parallel raw data access, we are done here
+ if ( pb_ptr ) {
+
+ H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size);
+ }
+ } else {
+
+ if ( H5FD_MEM_DRAW == type ) { /* cases 3 and 4 */
+
+ /* error message must name the function actually called */
+ if ( H5PB__write_raw(shared, type, addr, size, buf) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+ "H5PB__write_raw() failed")
+
+ } else { /* cases 7, and 8 */
+
+ if ( metadata_multipart_write(shared, type, addr, size, buf) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+ "metadata_multipart_write() failed")
+ }
+
+ H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
+ }
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5PB_write() */
+
+
+/**************************************************************************/
+/***************************** STATIC FUNCTIONS ***************************/
+/**************************************************************************/
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB__allocate_page
+ *
+ * Purpose: Allocate an instance of H5PB_entry_t and its associated
+ * buffer. The supplied size must be greater than or
+ * equal to pb_ptr->page_size, and equal to that value if
+ * pb_ptr->vfd_swmr_writer is FALSE.
+ *
+ * The associated buffer is zeroed if clean_image is TRUE.
+ *
+ * Return: Pointer to the newly allocated instance of H5PB_entry_t
+ * on success, and NULL on failure.
+ *
+ * Programmer: John Mainzer -- 10/12/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static H5PB_entry_t *
+H5PB__allocate_page(H5PB_t *pb_ptr, size_t size, hbool_t clean_image)
+{
+ H5PB_entry_t *entry_ptr = NULL;
+ void * image_ptr = NULL;
+ H5PB_entry_t *ret_value = NULL; /* Return value */
+
+ FUNC_ENTER_NOAPI(NULL)
+
+ /* sanity checks */
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(size >= pb_ptr->page_size);
+ HDassert((size == pb_ptr->page_size) || (pb_ptr->vfd_swmr_writer));
+
+ /* allocate the entry and its associated image buffer */
+ if ( NULL == (entry_ptr = H5FL_MALLOC(H5PB_entry_t)))
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, NULL, \
+ "memory allocation for H5PB_entry_t failed")
+
+ /* calloc() when the caller wants a zeroed image (e.g. a page that was
+ * allocated but never written to file); plain malloc() otherwise since
+ * the buffer will be overwritten immediately.
+ */
+ if ( clean_image ) {
+
+ image_ptr = H5MM_calloc(size);
+
+ } else {
+
+ image_ptr = H5MM_malloc(size);
+ }
+
+ if ( NULL == image_ptr )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, NULL, \
+ "memory allocation for page image failed")
+
+ /* initialize the new page buffer entry */
+ entry_ptr->magic = H5PB__H5PB_ENTRY_T_MAGIC;
+ entry_ptr->pb_ptr = pb_ptr;
+ entry_ptr->addr = HADDR_UNDEF;
+ entry_ptr->page = 0;
+ entry_ptr->size = size;
+ entry_ptr->image_ptr = image_ptr;
+ entry_ptr->mem_type = H5FD_MEM_DEFAULT;
+ entry_ptr->is_metadata = FALSE;
+ entry_ptr->is_mpmde = FALSE;
+ entry_ptr->is_dirty = FALSE;
+
+ /* fields supporting the hash table */
+ entry_ptr->ht_prev = NULL;
+ entry_ptr->ht_next = NULL;
+ entry_ptr->il_prev = NULL;
+ entry_ptr->il_next = NULL;
+
+ /* fields supporting replacement policies */
+ entry_ptr->next = NULL;
+ entry_ptr->prev = NULL;
+
+ /* fields supporting VFD SWMR */
+ /* NOTE(review): is_mpmde is assigned FALSE a second time here --
+ * harmless duplicate of the initialization above.
+ */
+ entry_ptr->is_mpmde = FALSE;
+ entry_ptr->loaded = FALSE;
+ entry_ptr->modified_this_tick = FALSE;
+ entry_ptr->delay_write_until = 0;
+ entry_ptr->tl_next = NULL;
+ entry_ptr->tl_prev = NULL;
+
+ ret_value = entry_ptr;
+
+done:
+
+ /* on failure, release whichever of the entry / image was allocated */
+ if ( NULL == ret_value ) {
+
+ if ( entry_ptr ) {
+
+ entry_ptr->magic = 0;
+ entry_ptr = H5FL_FREE(H5PB_entry_t, entry_ptr);
+ }
+
+ if ( image_ptr ) {
+
+ image_ptr = H5MM_xfree(image_ptr);
+ }
+ } /* end if */
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5PB__allocate_page() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB__create_new_page
+ *
+ * Purpose: Create a new page and insert it in the page buffer with
+ * the specified address and type. If entry_ptr_ptr is not
+ * NULL, return a pointer to the new entry in *entry_ptr_ptr.
+ *
+ * Throw an error if a page already exists at the specified
+ * address.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/12/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5PB__create_new_page(H5PB_t *pb_ptr, haddr_t addr, size_t size,
+ H5FD_mem_t type, hbool_t clean_image, H5PB_entry_t **entry_ptr_ptr)
+{
+ hbool_t inserted_in_index = FALSE;
+ hbool_t inserted_in_lru = FALSE;
+ uint64_t page;
+ H5PB_entry_t *entry_ptr = NULL;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Sanity checks */
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+
+ page = (uint64_t)addr / (uint64_t)(pb_ptr->page_size);
+
+ /* addr must be exactly page aligned */
+ HDassert((uint64_t)(addr) == (page * (uint64_t)(pb_ptr->page_size)));
+
+ HDassert(size >= pb_ptr->page_size);
+ /* only a VFD SWMR writer may create multi-page (metadata) entries */
+ HDassert((size == pb_ptr->page_size) ||
+ ((pb_ptr->vfd_swmr_writer) && (type != H5FD_MEM_DRAW)));
+ HDassert((NULL == entry_ptr_ptr) || (NULL == *entry_ptr_ptr));
+
+ H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL);
+
+ if ( entry_ptr != NULL ) {
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "page buffer already contains a page at the specified address")
+ }
+
+ entry_ptr = H5PB__allocate_page(pb_ptr, size, clean_image);
+
+ if ( NULL == entry_ptr )
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, \
+ "Can't allocate new page buffer entry")
+
+ /* perform additional initialization */
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->pb_ptr == pb_ptr);
+ entry_ptr->addr = addr;
+ entry_ptr->page = page;
+ HDassert(entry_ptr->size == size);
+ HDassert(entry_ptr->image_ptr);
+ entry_ptr->mem_type = type;
+ entry_ptr->is_metadata = (type != H5FD_MEM_DRAW);
+ entry_ptr->is_mpmde = ((entry_ptr->is_metadata) &&
+ (size > pb_ptr->page_size));
+ entry_ptr->is_dirty = FALSE;
+
+ /* insert in the hash table */
+ H5PB__INSERT_IN_INDEX(pb_ptr, entry_ptr, FAIL)
+ inserted_in_index = TRUE;
+
+ /* insert at the head of the LRU if it isn't a multi-page metadata entry */
+ if ( ! entry_ptr->is_mpmde ) {
+
+ H5PB__UPDATE_RP_FOR_INSERTION(pb_ptr, entry_ptr, FAIL)
+ inserted_in_lru = TRUE;
+ }
+
+ /* updates stats */
+ H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr);
+
+ if ( entry_ptr_ptr ) {
+
+ *entry_ptr_ptr = entry_ptr;
+ }
+
+done:
+
+ /* on failure, back out whichever insertions succeeded before
+ * deallocating the page, so no dangling pointers remain.
+ */
+ if ( ret_value < 0 ) {
+
+ if ( entry_ptr ) {
+
+ if ( inserted_in_lru ) {
+
+ H5PB__UPDATE_RP_FOR_EVICTION(pb_ptr, entry_ptr, FAIL);
+ }
+
+ if ( inserted_in_index ) {
+
+ H5PB__DELETE_FROM_INDEX(pb_ptr, entry_ptr, FAIL)
+ }
+
+ H5PB__deallocate_page(entry_ptr);
+ entry_ptr = NULL;
+ }
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5PB__create_new_page() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB__deallocate_page
+ *
+ * Purpose: Free the supplied instance of H5PB_entry_t and its
+ * associated buffer. The entry must be clean and removed
+ * from the page buffer before this function is called.
+ *
+ * Return: void
+ *
+ * Programmer: John Mainzer -- 10/12/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static void
+H5PB__deallocate_page(H5PB_entry_t *entry_ptr)
+{
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ /* sanity checks -- the entry must be clean and fully unlinked from
+ * the hash table, index list, replacement policy, and tick list.
+ */
+ HDassert(entry_ptr);
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->size > 0);
+ HDassert(entry_ptr->image_ptr);
+ HDassert(!(entry_ptr->is_dirty));
+ HDassert(entry_ptr->ht_next == NULL);
+ HDassert(entry_ptr->ht_prev == NULL);
+ HDassert(entry_ptr->il_next == NULL);
+ HDassert(entry_ptr->il_prev == NULL);
+ HDassert(entry_ptr->next == NULL);
+ HDassert(entry_ptr->prev == NULL);
+ HDassert(entry_ptr->tl_next == NULL);
+ HDassert(entry_ptr->tl_prev == NULL);
+
+ /* poison the magic first so any stale reference trips a sanity check */
+ entry_ptr->magic = 0;
+ entry_ptr->image_ptr = H5MM_xfree(entry_ptr->image_ptr);
+ entry_ptr = H5FL_FREE(H5PB_entry_t, entry_ptr);
+
+ FUNC_LEAVE_NOAPI_VOID
+
+} /* H5PB__deallocate_page() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB__evict_entry
+ *
+ * Purpose: Evict the target entry from the page buffer, and
+ * de-allocate its associated image and instance of
+ * H5PB_entry_t.
+ *
+ * In general, entries must be clean before they can be
+ * evicted, and the minimum metadata and raw data limits
+ * must be respected. Attempts to evict an entry
+ * that does not respect these constraints will generate
+ * an error unless the force parameter is TRUE, in which
+ * case, these constraints are ignored.
+ *
+ * If `only_mark` is true, then the page-table entry's
+ * corresponding shadow-index entry is not removed. Instead,
+ * it is marked as garbage. This is a stop-gap fix for a
+ * performance problem in H5PB_dest(): deleting all of the
+ * index entries took time quadratic in their number.
+ *
+ * In the context of VFD SWMR, there is also the requirement
+ * that entries to be evicted not be on the tick list, and
+ * also not reside on the delayed write list. In the rare
+ * case in which such a page is discarded by the free space
+ * manager, it must be removed from the tick list and/or the
+ * delayed write list before being evicted by this function.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/14/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5PB__evict_entry(H5F_shared_t *shared, H5PB_entry_t *entry_ptr, bool force,
+ bool only_mark)
+{
+ H5PB_t *pb_ptr = shared->pb_ptr;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* sanity checks */
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(entry_ptr);
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->size > 0);
+ HDassert(entry_ptr->image_ptr);
+ /* entries on either the tick list or the delayed write
+ * list may not be evicted -- verify this.
+ */
+ HDassert(!(entry_ptr->modified_this_tick));
+ HDassert(entry_ptr->delay_write_until == 0);
+
+ if ( ( ! force ) && ( entry_ptr->is_dirty ) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "Attempt to evict a dirty entry");
+
+ if ( ! force ) {
+
+ /* it is OK to evict a metadata page if pb_ptr->curr_md_pages ==
+ * pb_ptr->min_md_pages - 1 if we are about to replace it with another
+ * metadata page.
+ *
+ * Similarly, it is OK to evict a raw data page if
+ * pb_ptr->curr_rd_pages == pb_ptr->min_rd_pages - 1 if we are
+ * about to replace it with another raw data page.
+ *
+ * Assume sanity checks have been made before this call, and
+ * allow the above without testing the intended replacement.
*/
- if(NULL == page_buf || (size >= page_buf->page_size && H5FD_MEM_DRAW != type) ||
- (bypass_pb && H5FD_MEM_DRAW == type))
- HGOTO_DONE(SUCCEED)
+ if ( ( entry_ptr->is_metadata ) &&
+ ( pb_ptr->curr_md_pages < pb_ptr->min_md_pages ) ) {
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "Attempt to violate min_md_pages");
+
+ } else if ( ( ! entry_ptr->is_metadata ) &&
+ ( pb_ptr->curr_rd_pages < pb_ptr->min_rd_pages ) ) {
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "Attempt to violate min_rd_pages");
+ }
+ } else if ( ( entry_ptr->is_dirty ) &&
+ ( H5PB__mark_entry_clean(pb_ptr, entry_ptr) < 0 ) ) {
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "mark entry clean failed")
+ }
+
+ /* if the entry is in the replacement policy, remove it */
+ if ( ! (entry_ptr->is_mpmde) ) {
+
+ H5PB__UPDATE_RP_FOR_EVICTION(pb_ptr, entry_ptr, FAIL)
+ }
+
+ /* remove the entry from the hash table */
+ H5PB__DELETE_FROM_INDEX(pb_ptr, entry_ptr, FAIL)
+
+ /* We need to remove the entry from the shadow file index in
+ * the VFD SWMR case.
+ *
+ * If a multipage metadata entry is deallocated, and a new, single-page
+ * metadata entry is allocated at the same base address, then
+ * the old shadow index entry will still tell the size of the previous
+ * image, which is greater than a page, and a shadow-file flush will
+ * access bytes past the end of the entry's image.
+ *
+ * When we add code to allow entries
+ * to age out of the metadata file index, that may provide
+ * code that we can reuse to perform this invalidation.
+ *
+ * It's also possible (I think) for the index-entry size to be set
+ * to one page, and then for a multipage entry to appear later at that
+ * same index entry. The recorded size will still say the same, but
+ * the image will be bigger. So the shadow file will never see the
+ * entire image written, just the first page of the image.
+ */
+ if (shared->vfd_swmr_writer &&
+ shadow_idx_entry_remove(shared, entry_ptr->page, only_mark) == -1) {
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL,
+ "failed to remove shadow index entry")
+ }
-#ifdef H5_HAVE_PARALLEL
- if(bypass_pb) {
- if(H5PB_update_entry(page_buf, addr, size, buf) > 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTUPDATE, FAIL, "failed to update PB with metadata cache")
- HGOTO_DONE(SUCCEED)
- } /* end if */
+ /* update stats for eviction */
+ H5PB__UPDATE_STATS_FOR_EVICTION(pb_ptr, entry_ptr)
+
+ /* deallocate the page */
+ H5PB__deallocate_page(entry_ptr);
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5PB__evict_entry() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB__flush_entry
+ *
+ * Purpose: Flush the target entry to file.
+ *
+ * Under normal circumstances, the entry will be in the
+ * replacement policy. In this case, also update the replacement
+ * policy for flush.
+ *
+ * If pb_ptr->vfd_swmr_writer, it is possible that the target
+ * is a multi-page metadata entry. In this case, the entry
+ * is not in the replacement policy, and thus the policy
+ * should not be updated.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/14/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5PB__flush_entry(H5F_shared_t *shared, H5PB_t *pb_ptr, H5PB_entry_t *const entry_ptr)
+{
+ haddr_t eoa; /* Current EOA for the file */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* sanity checks */
+ HDassert(shared);
+ HDassert(shared->lf);
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(entry_ptr);
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->size > 0);
+ HDassert(entry_ptr->size >= pb_ptr->page_size);
+ HDassert((entry_ptr->size == pb_ptr->page_size) || (entry_ptr->is_mpmde));
+ HDassert(entry_ptr->image_ptr);
+ HDassert(entry_ptr->is_dirty);
+ /* only a VFD SWMR writer can hold multi-page metadata entries */
+ HDassert((pb_ptr->vfd_swmr_writer) || (!(entry_ptr->is_mpmde)));
+ HDassert(0 == entry_ptr->delay_write_until);
+
+ hlog_fast(pbflush_entry,
+ "%s: flushing %zu-byte page %" PRIu64 " @ %" PRIuHADDR,
+ __func__, entry_ptr->size, entry_ptr->page, entry_ptr->addr);
+
+ /* Retrieve the 'eoa' for the file */
+ if ( HADDR_UNDEF == (eoa = H5FD_get_eoa(shared->lf, entry_ptr->mem_type)) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, \
+ "driver get_eoa request failed")
+
+ /* TODO: update the free space manager to inform the page buffer when
+ * space is de-allocated so that the following assertions will be
+ * true in all cases.
+ */
+
+ /* Verify that the base address of the page is within the EOA. If it
+ * isn't, the associated page has been discarded and should have been
+ * removed from the page buffer. This is a bug in the HDF5 library, so
+ * an assertion is adequate here.
+ */
+ HDassert( eoa > entry_ptr->addr );
+
+ /* Space at the end of the file should be allocated in increments of
+ * pages. Thus the entire page should be within the EOA. Again,
+ * an assertion is adequate here.
+ */
+ HDassert( eoa >= entry_ptr->addr + entry_ptr->size );
+
+ /* flush the entry */
+ if ( H5FD_write(shared->lf, entry_ptr->mem_type, entry_ptr->addr,
+ entry_ptr->size, entry_ptr->image_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, "file write failed")
+
+ /* mark the entry clean */
+ if ( H5PB__mark_entry_clean(pb_ptr, entry_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "mark entry clean failed")
+
+
+ /* if the entry is on the LRU, update the replacement policy */
+ if (!entry_ptr->is_mpmde) {
+ HDassert(entry_ptr->delay_write_until == 0);
+
+ H5PB__UPDATE_RP_FOR_FLUSH(pb_ptr, entry_ptr, FAIL)
+ }
+
+ /* update stats for flush */
+ H5PB__UPDATE_STATS_FOR_FLUSH(pb_ptr, entry_ptr)
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5PB__flush_entry() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB__load_page
+ *
+ * Purpose: Load the page with the specified base address and insert
+ * it into the page buffer. If necessary and possible, make
+ * space for the new page first.
+ *
+ * Note that the size of the page is always pb_ptr->page_size,
+ * even in the VFD SWMR case, as in this context, multi-page
+ * metadata entries are always written in full, and they
+ * may only enter the page buffer as the result of a write.
+ *
+ * In the context of VFD SWMR, when a page is loaded from
+ * file, it is possible that the VFD SWMR writer must delay
+ * writes to the page to avoid the possibility of message from
+ * the future bugs on the VFD SWMR reader. For this reason,
+ * make note of the fact that the entry has been loaded
+ * from file, so that the necessary checks can be made when
+ * writing to the page.
+ *
+ * Return: SUCCEED if no errors are encountered, and
+ * FAIL otherwise.
+ *
+ * Programmer: John Mainzer -- 10/18/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5PB__load_page(H5F_shared_t *shared, H5PB_t *pb_ptr, haddr_t addr,
+ H5FD_mem_t type, H5PB_entry_t **entry_ptr_ptr)
+{
+ hbool_t skip_read = FALSE;
+ haddr_t eof = HADDR_UNDEF;
+ H5PB_entry_t *entry_ptr = NULL;
+ void *image_ptr = NULL;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* sanity checks */
+ HDassert(shared);
+ HDassert(shared->lf);
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert((entry_ptr_ptr == NULL) || (*entry_ptr_ptr == NULL));
+
+#if 0 /* JRM */
+ haddr_t eoa;
+ /* Retrieve the 'eoa' for the file */
+ if ( HADDR_UNDEF == (eoa = H5FD_get_eoa(shared->lf, type)))
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, \
+ "driver get_eoa request failed")
+ if ( addr + ((haddr_t)(pb_ptr->page_size)) > eoa )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "Attempt to load page that extends past EOA")
+#endif /* JRM */
+ if ( HADDR_UNDEF == (eof = H5FD_get_eof(shared->lf, H5FD_MEM_DEFAULT)) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, \
+ "driver get_eof request failed")
+
+#if 0
+ /* It is possible that this page has been allocated but not
+ * written. Skip the read if addr > EOF. In this case, tell
+ * H5PB__create_new_page() to zero the page image.
+ *
+ * Don't set "skip_read = (addr >= eof);" when accumulator is used.
+ */
+ skip_read = (addr >= eof);
#endif
- } /* end if */
- /* Update statistics */
- if(page_buf) {
- if(type == H5FD_MEM_DRAW || type == H5FD_MEM_GHEAP)
- page_buf->accesses[1]++;
- else
- page_buf->accesses[0]++;
- } /* end if */
+ /* make space in the page buffer if necessary */
+ if ( ( pb_ptr->curr_pages >= pb_ptr->max_pages ) &&
+ ( H5PB__make_space(shared, pb_ptr, type) < 0 ) )
- /* Calculate the aligned address of the first page */
- first_page_addr = (addr / page_buf->page_size) * page_buf->page_size;
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "H5PB__make_space() reports an error")
+
+
+ /* Create a new page buffer page and insert it into the page buffer */
+ if ( H5PB__create_new_page(pb_ptr, addr, (size_t)(pb_ptr->page_size),
+ type, skip_read, &entry_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "can't create new page buffer page")
+
+ HDassert(entry_ptr);
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->addr == addr);
+
+ image_ptr = entry_ptr->image_ptr;
+
+ HDassert(image_ptr);
- /* For raw data calculate the aligned address of the last page and
- * the number of pages accessed if more than 1 page is accessed
+ /* Read the contents of the page from file, and store it in the
+ * image buffer associated with the new entry.
 */
- if(H5FD_MEM_DRAW == type) {
- last_page_addr = (addr + size - 1) / page_buf->page_size * page_buf->page_size;
-
- /* how many pages does this write span */
- num_touched_pages = (last_page_addr/page_buf->page_size + 1) -
- (first_page_addr / page_buf->page_size);
- if(first_page_addr == last_page_addr) {
- HDassert(1 == num_touched_pages);
- last_page_addr = HADDR_UNDEF;
- } /* end if */
- } /* end if */
- /* Otherwise set last page addr to HADDR_UNDEF */
- else {
- num_touched_pages = 1;
- last_page_addr = HADDR_UNDEF;
- } /* end else */
+ if ( ( ! skip_read ) &&
+ ( H5FD_read(shared->lf, type, addr, entry_ptr->size, image_ptr) < 0 ) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "driver read request failed")
+
+ /* If in fact the page was read from file, make note of this fact
+ * for purposes of VFD SWMR delayed writes in the VFD SWMR writer.
+ */
+ entry_ptr->loaded = ! skip_read;
+
+ H5PB__UPDATE_STATS_FOR_LOAD(pb_ptr, entry_ptr)
+
+ if ( entry_ptr_ptr ) {
+
+ *entry_ptr_ptr = entry_ptr;
+ }
- /* Translate to file driver I/O info object */
- file = f_sh->lf;
+done:
+
+ /* TODO(review): if H5FD_read() fails after H5PB__create_new_page()
+ * has succeeded, the newly created entry is left in the page buffer
+ * with an unread image -- add cleanup (eviction of entry_ptr) here.
+ */
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5PB__load_page() */
- /* Check if existing pages for raw data need to be updated since raw data access is not atomic */
- if(H5FD_MEM_DRAW == type && size >= page_buf->page_size) {
- /* For each touched page, check if it exists in the page buffer, and
- * update it with the data in the buffer to keep it up to date
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB__make_space
+ *
+ * Purpose: Evict one or more pages from the page buffer so as to
+ * reduce the size of the page buffer to pb_ptr->max_pages - 1,
+ * if possible.
+ *
+ * Note that the function must not be called under
+ * nonsensical conditions -- thus if either
+ *
+ * 1) the inserted type is metadata and min_rd_pages ==
+ * max_pages, or
+ *
+ * 2) the inserted type is raw data and min_md_pages ==
+ * max_pages
+ *
+ * holds, the function has been called in error, and an
+ * assertion failure is appropriate.
+ *
+ * If the page buffer is below its maximum size, we are
+ * done, and the function simply returns.
+ *
+ * Otherwise, scan upwards from the bottom of the LRU list,
+ * examining each entry in turn.
+ *
+ * If the entry is dirty, flush it, move it to the top of the
+ * LRU, and continue with the scan. Note in the VFD SWMR case,
+ * we do not have to concern ourselves with delayed writes in
+ * this context, as all entries which are subject to delayed
+ * writes must reside on the delayed write list, not the LRU list.
+ *
+ * If the entry is:
+ *
+ * 1) clean
+ *
+ * 2) either:
+ *
+ * a) the target entry is metadata and
+ * curr_md_pages > min_md_pages.
+ *
+ * b) the target entry is raw data and
+ * curr_rd_pages > min_rd_pages.
+ *
+ * c) the target entry is metadata, the inserted_type
+ * is metadata, and curr_md_pages == min_md_pages.
+ *
+ * d) the target entry is raw data, the inserted_type
+ * is raw data, and curr_rd_pages == min_rd_pages.
+ *
+ * 3) The entry is not on the tick list (which can only
+ * happen if pb_ptr->vfd_swmr_writer is TRUE).
+ *
+ * evict the entry and test to see if pb_ptr->curr_pages <
+ * pb_ptr->max_pages. If it is, return. Otherwise, continue
+ * the scan until either the above condition is fulfilled,
+ * or the head of the LRU is reached.
+ *
+ * Under normal circumstances, it should always be possible
+ * to reduce the size of the page buffer below pb_ptr->max_pages.
+ * However, due to prohibition on evicting entries on the
+ * tick list, and either flushing or evicting entries on the
+ * delayed write list, this will not in general be the case
+ * if pb_ptr->vfd_swmr_writer is TRUE. In this case, the
+ * page buffer may exceed its maximum size by an arbitrary
+ * amount.
+ *
+ * If this situation occurs with any regularity, we will
+ * need a mechanism to avoid attempts to make space when
+ * it is not possible to do so.
+ *
+ * Return: SUCCEED if no errors are encountered, and
+ * FAIL otherwise.
+ *
+ * Programmer: John Mainzer -- 10/14/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5PB__make_space(H5F_shared_t *shared, H5PB_t *pb_ptr, H5FD_mem_t inserted_type)
+{
+ hbool_t inserting_md;
+ H5PB_entry_t *search_ptr;
+ H5PB_entry_t *flush_ptr;
+ H5PB_entry_t *evict_ptr;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* sanity checks */
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(pb_ptr->min_md_pages + pb_ptr->min_rd_pages <= pb_ptr->max_pages);
+
+ /* everything that is not raw data is treated as metadata */
+ inserting_md = ( H5FD_MEM_DRAW != inserted_type );
+
+ if ( ( inserting_md ) && ( pb_ptr->min_rd_pages == pb_ptr->max_pages ) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL,
+ "can't make space for metadata -- pb config for raw data only")
+
+ if ( ( ! inserting_md ) && ( pb_ptr->min_md_pages == pb_ptr->max_pages ) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL,
+ "can't make space for raw data -- pb config for metadata only")
+
+ /* scan from the LRU tail (least recently used) towards the head */
+ search_ptr = pb_ptr->LRU_tail_ptr;
+
+ while ( ( search_ptr ) && ( pb_ptr->curr_pages >= pb_ptr->max_pages ) ) {
+
+ HDassert(search_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+
+ if ( search_ptr->modified_this_tick ) { /* entry is on tick list */
+
+ search_ptr = search_ptr->prev;
+ H5PB__UPDATE_STATS_FOR_LRU_TL_SKIP(pb_ptr);
+
+ } else if ( ( inserting_md ) &&
+ ( ! (search_ptr->is_metadata) ) &&
+ ( pb_ptr->curr_rd_pages <= pb_ptr->min_rd_pages ) ) {
+
+ /* can't evict raw data below its reserved minimum */
+ search_ptr = search_ptr->prev;
+ H5PB__UPDATE_STATS_FOR_LRU_RD_SKIP(pb_ptr);
+
+ } else if ( ( ! inserting_md ) &&
+ ( search_ptr->is_metadata ) &&
+ ( pb_ptr->curr_md_pages <= pb_ptr->min_md_pages ) ) {
+
+ /* can't evict metadata below its reserved minimum */
+ search_ptr = search_ptr->prev;
+ H5PB__UPDATE_STATS_FOR_LRU_MD_SKIP(pb_ptr);
+
+ } else if ( search_ptr->is_dirty ) {
+
+ /* One can make the argument that we should test for dirty
+ * entries first, instead of skipping potentially dirty
+ * entries in the above clauses. However, I suspect that
+ * this would result in excessive flushes. Lets try it
+ * this way for now.
+ */
+
+ flush_ptr = search_ptr;
+
+ /* if the *search_ptr has a predecessor in the LRU,
+ * set search_ptr equal to search_ptr->prev. Otherwise,
+ * leave search_ptr unchanged, so that it can be examined
+ * on the next pass through the while loop after it has been
+ * flushed.
+ */
+ if ( search_ptr->prev ) {
+
+ search_ptr = search_ptr->prev;
+ }
+
+ if ( H5PB__flush_entry(shared, pb_ptr, flush_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+ "Can't flush entry")
+
+ } else { /* evict the entry */
+
+ evict_ptr = search_ptr;
+ search_ptr = search_ptr->prev;
+ if ( H5PB__evict_entry(shared, evict_ptr, FALSE, false) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+ "Can't evict entry")
+ }
+ }
+
+ /* if the scan reached the head of the LRU without freeing enough
+ * pages, search_ptr is NULL -- see the function header comment on
+ * why this can happen in the VFD SWMR writer case.
+ */
+ HDassert( ( search_ptr == NULL ) ||
+ ( pb_ptr->curr_pages < pb_ptr->max_pages ) );
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5PB__make_space() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB__mark_entry_clean
+ *
+ * Purpose: Mark the target entry clean
+ *
+ * This function is typically used when an entry has been
+ * completely overwritten and is about to be evicted. In
+ * this case, the entry must be marked clean to avoid
+ * sanity check failures on evictions.
+ *
+ * While this function does update the index for the
+ * entry clean, it does not update the replacement policy.
+ * If this is desired, it must be done by the caller.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/14/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5PB__mark_entry_clean(H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr)
+{
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* sanity checks */
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(entry_ptr);
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->size > 0);
+ HDassert(entry_ptr->size >= pb_ptr->page_size);
+ HDassert((entry_ptr->size == pb_ptr->page_size) || (entry_ptr->is_mpmde));
+ HDassert(entry_ptr->image_ptr);
+ HDassert((pb_ptr->vfd_swmr_writer) || (!(entry_ptr->is_mpmde)));
+
+ /* mark the entry clean */
+ entry_ptr->is_dirty = FALSE;
+
+ /* update the index to reflect the entry's new clean status */
+ H5PB__UPDATE_INDEX_FOR_ENTRY_CLEAN(pb_ptr, entry_ptr)
+
+ /* don't update the replacement policy -- this will be done by
+ * the caller if desired.
+ */
+
+ /* NOTE(review): the body has no explicit failure path; only the
+ * FUNC_ENTER_NOAPI error path can reach done: early, so ret_value
+ * is effectively always SUCCEED here.
+ */
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5PB__mark_entry_clean() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB__mark_entry_dirty
+ *
+ * Purpose: Mark the target entry as dirty.
+ *
+ * If pb_ptr->vfd_swmr_writer is FALSE, the entry will be
+ * in the replacement policy. In this case, we simply mark the
+ * entry as dirty, and update the replacement policy for an
+ * access.
+ *
+ * If pb_ptr->vfd_swmr_writer, it is possible that we must
+ * delay writes to the target page or multi-page metadata
+ * entry to avoid message from the future bugs on the VFD
+ * SWMR readers. In such cases we must set the
+ * delay_write_until field and insert the entry on the
+ * delayed write list instead of the replacement policy.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/14/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5PB__mark_entry_dirty(H5F_shared_t *shared, H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr)
+{
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* sanity checks */
+ HDassert(pb_ptr);
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(entry_ptr);
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->size > 0);
+ HDassert(entry_ptr->size >= pb_ptr->page_size);
+ HDassert((entry_ptr->size == pb_ptr->page_size) || (entry_ptr->is_mpmde));
+ HDassert(entry_ptr->image_ptr);
+ HDassert((pb_ptr->vfd_swmr_writer) || (!(entry_ptr->is_mpmde)));
+
+ /* mark the entry dirty if necessary */
+ if ( ! ( entry_ptr->is_dirty ) ) {
+
+ entry_ptr->is_dirty = TRUE;
+
+ H5PB__UPDATE_INDEX_FOR_ENTRY_DIRTY(pb_ptr, entry_ptr)
+
+ /* since the entry was clean, there can be no pending delayed write */
+ HDassert(entry_ptr->delay_write_until == 0);
+
+ /* query for a delayed write only for loaded metadata pages in the
+ * VFD SWMR writer -- raw data pages are never delayed here.
+ */
+ if ( ( pb_ptr->vfd_swmr_writer ) &&
+ ( entry_ptr->loaded ) &&
+ ( entry_ptr->mem_type != H5FD_MEM_DRAW ) &&
+ ( H5F_vfd_swmr_writer__delay_write(shared, entry_ptr->page,
+ &(entry_ptr->delay_write_until)) < 0 ) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "get delayed write request failed")
+
+ if ( entry_ptr->delay_write_until > 0 ) {
+
+ if ( ! ( entry_ptr->is_mpmde ) ) {
+
+ /* remove the entry from the replacement policy */
+
+ H5PB__UPDATE_RP_FOR_REMOVE(pb_ptr, entry_ptr, FAIL)
+ }
+
+ H5PB__INSERT_IN_DWL(pb_ptr, entry_ptr, FAIL)
+
+ } else if ( ! (entry_ptr->is_mpmde) ) {
+
+ H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL)
+
+ } else {
+
+ /* the entry should be a multi-page metadata entry that
+ * has been modified this tick. Thus no action is required.
+ */
+ HDassert(entry_ptr->is_mpmde);
+ HDassert(pb_ptr->vfd_swmr_writer);
+ }
+ } else if ( ( ! (entry_ptr->is_mpmde) ) &&
+ ( entry_ptr->delay_write_until == 0 ) ) {
+
+ /* the entry is dirty and on the replacement policy -- just update
+ * the replacement policy for an access
 */
- for(i = 0; i < num_touched_pages; i++) {
- search_addr = i * page_buf->page_size + first_page_addr;
+ H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL)
+ }
- /* Special handling for the first page if it is not a full page update */
- if(i == 0 && first_page_addr != addr) {
- /* Lookup the page in the skip list */
- page_entry = (H5PB_entry_t *)H5SL_search(page_buf->slist_ptr, (void *)(&search_addr));
- if(page_entry) {
- offset = addr - first_page_addr;
- HDassert(page_buf->page_size > offset);
-
- /* Update page's data */
- H5MM_memcpy((uint8_t *)page_entry->page_buf_ptr + offset, buf, page_buf->page_size - (size_t)offset);
-
- /* Mark page dirty and push to top of LRU */
- page_entry->is_dirty = TRUE;
- H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry)
- } /* end if */
- } /* end if */
- /* Special handling for the last page if it is not a full page update */
- else if(num_touched_pages > 1 && i == (num_touched_pages - 1) &&
- (search_addr + page_buf->page_size) != (addr + size)) {
- HDassert(search_addr+page_buf->page_size > addr+size);
-
- /* Lookup the page in the skip list */
- page_entry = (H5PB_entry_t *)H5SL_search(page_buf->slist_ptr, (void *)(&search_addr));
- if(page_entry) {
- offset = (num_touched_pages - 2) * page_buf->page_size +
- (page_buf->page_size - (addr - first_page_addr));
-
- /* Update page's data */
- H5MM_memcpy(page_entry->page_buf_ptr, (const uint8_t *)buf + offset,
- (size_t)((addr + size) - last_page_addr));
+done:
- /* Mark page dirty and push to top of LRU */
- page_entry->is_dirty = TRUE;
- H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry)
- } /* end if */
- } /* end else-if */
- /* Discard all fully written pages from the page buffer */
- else {
- page_entry = (H5PB_entry_t *)H5SL_remove(page_buf->slist_ptr, (void *)(&search_addr));
- if(page_entry) {
- /* Remove from LRU list */
- H5PB__REMOVE_LRU(page_buf, page_entry)
-
- /* Decrement page count of appropriate type */
- if(H5F_MEM_PAGE_DRAW == page_entry->type || H5F_MEM_PAGE_GHEAP == page_entry->type)
- page_buf->raw_count--;
- else
- page_buf->meta_count--;
-
- /* Free page info */
- page_entry->page_buf_ptr = H5FL_FAC_FREE(page_buf->page_fac, page_entry->page_buf_ptr);
- page_entry = H5FL_FREE(H5PB_entry_t, page_entry);
- } /* end if */
- } /* end else */
- } /* end for */
- } /* end if */
- else {
- /* An access could span 1 or 2 PBs at this point so we need to handle that */
- HDassert(1 == num_touched_pages || 2 == num_touched_pages);
- for(i = 0; i < num_touched_pages; i++) {
- haddr_t buf_offset;
-
- /* Calculate the aligned address of the page to search for it in the skip list */
- search_addr = (0 == i ? first_page_addr : last_page_addr);
-
- /* Calculate the access size if the access spans more than 1 page */
- if(1 == num_touched_pages)
- access_size = size;
- else
- access_size = (0 == i ? (size_t)(first_page_addr + page_buf->page_size - addr) : (size - access_size));
-
- /* Lookup the page in the skip list */
- page_entry = (H5PB_entry_t *)H5SL_search(page_buf->slist_ptr, (void *)(&search_addr));
-
- /* If found */
- if(page_entry) {
- offset = (0 == i ? addr - page_entry->addr : 0);
- buf_offset = (0 == i ? 0 : size - access_size);
-
- /* Copy the requested data from the input buffer into the page */
- H5MM_memcpy((uint8_t *)page_entry->page_buf_ptr + offset, (const uint8_t *)buf + buf_offset, access_size);
-
- /* Mark page dirty and push to top of LRU */
- page_entry->is_dirty = TRUE;
- H5PB__MOVE_TO_TOP_LRU(page_buf, page_entry)
-
- /* Update statistics */
- if(type == H5FD_MEM_DRAW || type == H5FD_MEM_GHEAP)
- page_buf->hits[1]++;
- else
- page_buf->hits[0]++;
- } /* end if */
- /* If not found */
- else {
- void *new_page_buf;
- size_t page_size = page_buf->page_size;
-
- /* Make space for new entry */
- if((H5SL_count(page_buf->slist_ptr) * page_buf->page_size) >= page_buf->max_size) {
- htri_t can_make_space;
-
- /* Check if we can make space in page buffer */
- if((can_make_space = H5PB__make_space(f_sh, page_buf, type)) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_NOSPACE, FAIL, "make space in Page buffer Failed")
-
- /* If make_space returns 0, then we can't use the page
- * buffer for this I/O and we need to bypass
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5PB__mark_entry_dirty() */
+
+/* Split the metadata access [addr, addr + len) into at most three
+ * sections relative to the page grid of size `pgsz`:
+ *
+ *   section[0] (`head`)   -- bytes before the first page boundary,
+ *   section[1] (`middle`) -- zero or more whole pages,
+ *   section[2] (`tail`)   -- bytes after the last page boundary.
+ *
+ * Unused sections are left untouched, so the caller must
+ * zero-initialize `section[3]` before calling (as the callers in
+ * metadata_multipart_read/write do).  `_buf` may be NULL, in which
+ * case the per-section buffer pointers are left NULL as well.
+ *
+ * The trailing loop is a pure self-check: it verifies that the
+ * sections are contiguous and that their lengths sum to `len`.
+ */
+static void
+metadata_section_split(size_t pgsz, haddr_t addr, size_t len, const void *_buf,
+ metadata_section_t *section)
+{
+ int i;
+ size_t totlen = 0;
+ haddr_t whole_pgaddr, tail_pgaddr;
+ const char *buf = _buf;
+ metadata_section_t *head = &section[0], *middle = &section[1],
+ *tail = &section[2];
+
+ /* Try to find the address of the first whole page, and the address of
+ * the page after the last whole page.
+ */
+ whole_pgaddr = roundup(addr, pgsz);
+ tail_pgaddr = rounddown(addr + len, pgsz);
+
+ /* In the degenerate case where the first whole page is "after" the last,
+ * actually the entire access lands between page boundaries.
+ */
+ if (whole_pgaddr > tail_pgaddr) {
+ assert(len < pgsz);
+ head->addr = addr;
+ head->len = len;
+ head->buf = buf;
+ return;
+ }
+
+ /* `head` spans any range beginning before the first page boundary. */
+ if (addr < whole_pgaddr) {
+ head->buf = buf;
+ head->len = pgsz - addr % pgsz;
+ head->addr = addr;
+ }
+
+ /* `middle` spans one or more whole pages in between the end of
+ * `head` and before the beginning of `tail`.
+ */
+ if (whole_pgaddr < tail_pgaddr) {
+ middle->buf = (buf == NULL) ? NULL : &buf[whole_pgaddr - addr];
+ middle->len = tail_pgaddr - whole_pgaddr;
+ middle->addr = whole_pgaddr;
+ }
+
+ /* `tail` spans residual bytes that follow the last page boundary. */
+ if (tail_pgaddr < addr + len) {
+ tail->len = (addr + len) - tail_pgaddr;
+ tail->buf = (buf == NULL) ? NULL : &buf[tail_pgaddr - addr];
+ tail->addr = tail_pgaddr;
+ }
+
+ /* self-check: sections must be contiguous and cover exactly `len` */
+ for (i = 0; i < 3; i++) {
+ metadata_section_t *iter = &section[i];
+ if (iter->len == 0)
+ continue;
+ assert(iter->addr == addr + totlen);
+ assert(iter->buf == ((buf == NULL) ? NULL : &buf[totlen]));
+ totlen += iter->len;
+ }
+
+ assert(totlen == len);
+}
+
+/* Perform a metadata read of [addr, addr + len) by splitting it on
+ * page boundaries and issuing up to three H5PB__read_meta() calls,
+ * so that no single call crosses a page boundary.  Returns the first
+ * failing H5PB__read_meta() result, or SUCCEED.
+ */
+static herr_t
+metadata_multipart_read(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr,
+ size_t len, void *_buf/*out*/)
+{
+ herr_t rc;
+ int i;
+ const size_t pgsz = shared->pb_ptr->page_size;
+ /* must be zero-initialized: metadata_section_split() only fills
+ * the sections that the access actually touches
+ */
+ metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}};
+
+ metadata_section_split(pgsz, addr, len, _buf, section);
+
+ for (i = 0; i < 3; i++) {
+ metadata_section_t *iter = &section[i];
+ if (iter->buf == NULL)
+ continue;
+ /* the cast drops the const that metadata_section_t imposes;
+ * iter->buf points into the caller's writable output buffer
+ */
+ rc = H5PB__read_meta(shared, type, iter->addr, iter->len,
+ (void *)(uintptr_t)iter->buf);
+ if (rc < 0)
+ return rc;
+ }
+
+ return SUCCEED;
+}
+
+/* Perform a metadata write of [addr, addr + len) by splitting it on
+ * page boundaries and issuing up to three H5PB__write_meta() calls,
+ * so that no single call crosses a page boundary.  Returns the first
+ * failing H5PB__write_meta() result, or SUCCEED.
+ */
+static herr_t
+metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
+ haddr_t addr, size_t len, const void *_buf/*in*/)
+{
+ herr_t rc;
+ int i;
+ const size_t pgsz = shared->pb_ptr->page_size;
+ /* must be zero-initialized: metadata_section_split() only fills
+ * the sections that the access actually touches
+ */
+ metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}};
+
+ metadata_section_split(pgsz, addr, len, _buf, section);
+
+ for (i = 0; i < 3; i++) {
+ metadata_section_t *iter = &section[i];
+
+ if (iter->buf == NULL)
+ continue;
+ rc = H5PB__write_meta(shared, type, iter->addr, iter->len, iter->buf);
+ if (rc < 0)
+ return rc;
+ }
+
+ return SUCCEED;
+}
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB__read_meta
+ *
+ * Purpose: Satisfy a metadata read in cases 7, 8, 9, and 10)
+ * H5PB_read(). Specifically:
+ *
+ * 6) If the read is for metadata and not page aligned, clip
+ * the read to the end of the current page if necessary.
+ * Load the relevant page if necessary and satisfy the
+ * read from the page buffer. Note that if there is an
+ * existing page, it must not be a multi-page metadata
+ * entry. If it is, flag an error.
+ *
+ * 7) If the read is for metadata, is page aligned, is larger
+ * than one page, and there is no entry in the page buffer,
+ * satisfy the read from the file
+ *
+ * 8) If the read is for metadata, is page aligned, is larger
+ * than one page, and there is a regular entry at the target
+ * page address, test to see if the last read was for the
+ * same address.
+ *
+ * If was, evict the page, and satisfy the read from file.
+ * Flag an error if the page was dirty.
+ *
+ * If the last read was for a different page, clip the read
+ * to one page, and satisfy the read from the existing
+ * regular entry.
+ *
+ * 9) If the read is for metadata, is page aligned, is larger
+ * than one page, and there is a multi-page metadata entry
+ * at the target page address, test to see if
+ * pb_ptr->vfd_swmr_write is TRUE.
+ *
+ * If it is, satisfy the read from the multi-page metadata
+ * entry, clipping the read if necessary.
+ *
+ * if pb_ptr->vfd_swmr_write is FALSE, flag an error.
+ *
+ * 10) If the read is for metadata, is page aligned, is no
+ * larger than a page, test to see if the page buffer
+ * contains a page at the target address.
+ *
+ * If it doesn't, load the page and satisfy the read
+ * from it.
+ *
+ * If it contains a regular page entry, satisfy the read
+ * from it.
+ *
+ * If it contains a multipage metadata entry at the target
+ * address, satisfy the read from the multi-page metadata
+ * entry if pb_ptr->vfd_swmr_write is TRUE, and flag an
+ * error otherwise.
+ *
+ * The above case analysis may be a bit hard to read. If so,
+ * the table shown below may help to clarify. Here:
+ *
+ * P/A == page aligned
+ * size > PL == size > page length
+ * PA == previous address
+ * A == current address
+ *
+ * In the entry exists column:
+ *
+ * N == no entry
+ * R == regular (1 page) entry
+ * MPMDE == multi-page metadata entry
+ *
+ * | size | entry | VFD | |
+ * P/A: | > PL | exists | SWMR | PA == A | Comments:
+ * ------+------+--------+------+---------+-------------------------------------
+ * N | X | N || R | X | X | Clip read to page boundary if
+ * | | | | | necessary
+ * | | | | | Load entry if necessary
+ * | | | | | Satisfy read from entry (case 6)
+ * ------+------+--------+------+---------+-------------------------------------
+ * N | X | MPMDE | X | X | Error (case 6)
+ * ------+------+--------+------+---------+-------------------------------------
+ * | | | | |
+ * ------+------+--------+------+---------+-------------------------------------
+ * Y | Y | N | X | X | Satisfy read from file (case 7)
+ * ------+------+--------+------+---------+-------------------------------------
+ * Y | Y | R | X | N | Clip read to page boundary
+ * | | | | | Satisfy read from entry (case 8)
+ * ------+------+--------+------+---------+-------------------------------------
+ * Y | Y | R | X | Y | Evict entry
+ * | | | | | (must be clean -- flag error if not)
+ * | | | | | Satisfy read from file (case 8)
+ * ------+------+--------+------+---------+-------------------------------------
+ * Y | Y | MPMDE | N | X | Error (case 9)
+ * ------+------+--------+------+---------+-------------------------------------
+ * Y | Y | MPMDE | Y | X | Clip read to MPE size if required.
+ * | | | | | Satisfy read from MPE (case 9)
+ * ------+------+--------+------+---------+-------------------------------------
+ * | | | | |
+ * ------+------+--------+------+---------+-------------------------------------
+ * Y | N | N | X | X | Load entry
+ * | | | | | Satisfy read from entry (case 10)
+ * ------+------+--------+------+---------+-------------------------------------
+ * Y | N | R | X | X | Satisfy read from entry (case 10)
+ * ------+------+--------+------+---------+-------------------------------------
+ * Y | N | MPMDE | Y | X | Satisfy read from entry (case 10)
+ * ------+------+--------+------+---------+-------------------------------------
+ * Y | N | MPMDE | N | X | Error (case 10)
+ * ------+------+--------+------+---------+-------------------------------------
+ *
+ * Observe that the above cases imply that:
+ *
+ * 1) The page buffer is defined.
+ *
+ * 2) The page buffer has been configured to accept at least
+ * one page of metadata.
+ *
+ * 3) This is a metadata read.
+ *
+ * Note also that if the metadata read is of size
+ * no larger than page size, it may not cross page
+ * boundaries.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/11/18
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
+ void *buf/*out*/)
+{
+ H5PB_t *pb_ptr; /* Page buffer for this file */
+ H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */
+ H5FD_t *file; /* File driver pointer */
+ uint64_t page; /* page offset of addr */
+ haddr_t page_addr; /* page containing addr */
+ static haddr_t prev_addr = HADDR_UNDEF; /* addr of last call */
+ size_t offset; /* offset of read in page */
+ size_t clipped_size; /* possibly clipped size */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Sanity checks */
+ HDassert(shared);
+ HDassert(shared->pb_ptr);
+
+ pb_ptr = shared->pb_ptr;
+
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(pb_ptr->min_rd_pages < pb_ptr->max_pages);
+ HDassert(shared->lf);
+
+ file = shared->lf;
+
+ HDassert(H5FD_MEM_DRAW != type);
+ HDassert(buf);
+
+ /* Calculate the aligned address of the first page */
+ page = (addr / pb_ptr->page_size);
+ page_addr = page * pb_ptr->page_size;
+
+ if ( page_addr != addr ) { /* case 6 */
+
+ /* If the read is for metadata and not page aligned, clip
+ * the read to the end of the current page if necessary.
+ * Load the relevant page if necessary and satisfy the
+ * read from the page buffer. Note that if there is an
+ * existing page, it must not be a multi-page metadata
+ * entry. If it is, flag an error.
+ */
+
+ offset = addr - page_addr;
+
+ if ( (offset + size) <= pb_ptr->page_size ) {
+
+ clipped_size = size;
+
+ } else {
+
+ clipped_size = size - ( (offset + size) - pb_ptr->page_size);
+ }
+
+ HDassert( clipped_size > 0 );
+ HDassert( clipped_size <= size );
+ HDassert( (offset + clipped_size) <= pb_ptr->page_size );
+
+ /* get the containing page */
+ H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL)
+
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, ((entry_ptr) != NULL), \
+ TRUE, FALSE)
+
+ if ( ( NULL == entry_ptr ) &&
+ ( H5PB__load_page(shared, pb_ptr, page_addr, type, &entry_ptr) < 0 ) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "page buffer page load request failed (1)")
+
+ HDassert(entry_ptr);
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->addr == page_addr);
+ HDassert(entry_ptr->is_metadata);
+ HDassert(!(entry_ptr->is_mpmde));
+
+ /* copy data from the page into read buffer */
+ HDmemcpy((uint8_t *)buf, (uint8_t *)(entry_ptr->image_ptr) + offset,
+ clipped_size);
+
+ /* if the entry is on the LRU, update the replacement policy */
+ if ( ( ! (entry_ptr->is_mpmde) ) &&
+ ( entry_ptr->delay_write_until == 0 ) ) {
+
+ H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL)
+ }
+ } else {
+
+ HDassert( page_addr == addr );
+
+ if ( size >= pb_ptr->page_size ) {
+
+ /* search the page buffer for an entry at page */
+ H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL)
+
+
+ if ( entry_ptr == NULL ) { /* case 7 */
+
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, FALSE, TRUE, size > pb_ptr->page_size)
+
+ /* If the read is for metadata, is page aligned, is larger
+ * than one page, and there is no entry in the page buffer,
+ * satisfy the read from the file
+ */
+ if ( H5FD_read(file, type, addr, size, buf) < 0)
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "driver read request failed (1)")
+
+ H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size);
+ } else {
+
+ HDassert( entry_ptr );
+ HDassert( entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC );
+ HDassert( entry_ptr->is_metadata );
+
+ if ( ! ( entry_ptr->is_mpmde ) ) { /* case 8 */
+
+ /* If the read is for metadata, is page aligned, is larger
+ * than one page, and there is a regular entry at the target
+ * page address, test to see if the last read was for the
+ * same address.
+ *
+ * If it was, evict the page, and satisfy the read from
+ * file. Flag an error if the page was dirty.
+ *
+ * If the last read was for a different page, clip the read
+ * to one page, and satisfy the read from the existing
+ * regular entry.
*/
- if(0 == can_make_space) {
- HDassert(0 == i);
-
- /* Write to VFD and return */
- if(H5FD_write(file, type, addr, size, buf) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, "driver write request failed")
-
- /* Break out of loop */
- break;
- } /* end if */
- } /* end if */
-
- /* Don't bother searching if there is no write access */
- if(H5F_ACC_RDWR & H5F_SHARED_INTENT(f_sh))
- /* Lookup & remove the page from the new skip list page if
- * it exists to see if this is a new page from the MF layer
+
+ HDassert( entry_ptr->size == pb_ptr->page_size );
+
+ if ( addr == prev_addr ) {
+
+ /* since this is a second try, don't update
+ * hit rate stats.
+ */
+
+ HDassert( ! ( entry_ptr->is_dirty ) );
+
+ if (H5PB__evict_entry(shared, entry_ptr, TRUE, false) < 0)
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "forced eviction failed (1)")
+ if ( H5FD_read(file, type, addr, size, buf) < 0)
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "driver read request failed (2)")
+
+ H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size);
+ } else {
+
+ HDassert( entry_ptr->image_ptr );
+
+ /* copy data from the page into read buffer */
+ HDmemcpy((uint8_t *)buf,
+ (uint8_t *)(entry_ptr->image_ptr),
+ entry_ptr->size);
+
+ /* if the entry is on the LRU, update the replacement
+ * policy
+ */
+ if ( ( ! (entry_ptr->is_mpmde) ) &&
+ ( entry_ptr->delay_write_until == 0 ) ) {
+
+ H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL)
+ }
+
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, TRUE, TRUE, FALSE)
+ }
+ } else { /* case 9 */
+
+ /* If the read is for metadata, is page aligned, is larger
+ * than one page, and there is a multi-page metadata entry
+ * at the target page address, test to see if
+ * pb_ptr->vfd_swmr_write is TRUE.
+ *
+ * If it is, satisfy the read from the multi-page metadata
+ * entry, clipping the read if necessary.
+ *
+ * if pb_ptr->vfd_swmr_write is FALSE, flag an error.
*/
- page_entry = (H5PB_entry_t *)H5SL_remove(page_buf->mf_slist_ptr, (void *)(&search_addr));
-
- /* Calculate offset into the buffer of the page and the user buffer */
- offset = (0 == i ? addr - search_addr : 0);
- buf_offset = (0 == i ? 0 : size - access_size);
-
- /* If found, then just update the buffer pointer to the newly allocate buffer */
- if(page_entry) {
- /* Allocate space for the page buffer */
- if(NULL == (new_page_buf = H5FL_FAC_MALLOC(page_buf->page_fac)))
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTALLOC, FAIL, "memory allocation failed for page buffer entry")
- HDmemset(new_page_buf, 0, (size_t)offset);
- HDmemset((uint8_t *)new_page_buf + offset + access_size, 0, page_size - ((size_t)offset + access_size));
-
- page_entry->page_buf_ptr = new_page_buf;
-
- /* Update statistics */
- if(type == H5FD_MEM_DRAW || type == H5FD_MEM_GHEAP)
- page_buf->hits[1]++;
- else
- page_buf->hits[0]++;
- } /* end if */
- /* Otherwise read page through the VFD layer, but make sure we don't read past the EOA. */
- else {
- haddr_t eoa, eof = HADDR_UNDEF;
-
- /* Allocate space for the page buffer */
- if(NULL == (new_page_buf = H5FL_FAC_CALLOC(page_buf->page_fac)))
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTALLOC, FAIL, "memory allocation failed for page buffer entry")
-
- /* Create the new loaded PB entry */
- if(NULL == (page_entry = H5FL_CALLOC(H5PB_entry_t)))
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTALLOC, FAIL, "memory allocation failed")
-
- page_entry->page_buf_ptr = new_page_buf;
- page_entry->addr = search_addr;
- page_entry->type = (H5F_mem_page_t)type;
-
- /* Retrieve the 'eoa' for the file */
- if(HADDR_UNDEF == (eoa = H5F_shared_get_eoa(f_sh, type)))
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "driver get_eoa request failed")
-
- /* If the entire page falls outside the EOA, then fail */
- if(search_addr > eoa)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_BADVALUE, FAIL, "writing to a page that is outside the file EOA")
-
- /* Retrieve the 'eof' for the file - The MPI-VFD EOF
- * returned will most likely be HADDR_UNDEF, so skip
- * that check.
+ HDassert( entry_ptr->is_mpmde );
+ HDassert( pb_ptr->vfd_swmr_writer );
+
+ if ( size > entry_ptr->size ) {
+
+ clipped_size = entry_ptr->size;
+
+ } else {
+
+ clipped_size = size;
+ }
+
+ /* copy data from the page into read buffer */
+ HDmemcpy((uint8_t *)buf, (uint8_t *)(entry_ptr->image_ptr),
+ clipped_size);
+
+ /* if the entry is on the LRU, update the replacement
+ * policy
*/
- if(!H5F_SHARED_HAS_FEATURE(f_sh, H5FD_FEAT_HAS_MPI))
- if(HADDR_UNDEF == (eof = H5FD_get_eof(f_sh->lf, H5FD_MEM_DEFAULT)))
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "driver get_eof request failed")
-
- /* Adjust the read size to not go beyond the EOA */
- if(search_addr + page_size > eoa)
- page_size = (size_t)(eoa - search_addr);
-
- if(search_addr < eof) {
- if(H5FD_read(file, type, search_addr, page_size, new_page_buf) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "driver read request failed")
-
- /* Update statistics */
- if(type == H5FD_MEM_DRAW || type == H5FD_MEM_GHEAP)
- page_buf->misses[1]++;
- else
- page_buf->misses[0]++;
- } /* end if */
- } /* end else */
-
- /* Copy the requested data from the page into the input buffer */
- H5MM_memcpy((uint8_t *)new_page_buf + offset, (const uint8_t *)buf+buf_offset, access_size);
-
- /* Page is dirty now */
- page_entry->is_dirty = TRUE;
-
- /* Insert page into PB, evicting other pages as necessary */
- if(H5PB__insert_entry(page_buf, page_entry) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTSET, FAIL, "error inserting new page in page buffer")
- } /* end else */
- } /* end for */
- } /* end else */
+ if ( ( ! (entry_ptr->is_mpmde) ) &&
+ ( entry_ptr->delay_write_until == 0 ) ) {
+
+ H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL)
+ }
+
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, TRUE, TRUE, TRUE)
+ }
+ }
+ } else { /* case 10 */
+
+ /* If the read is for metadata, is page aligned, is no
+ * larger than a page, test to see if the page buffer
+ * contains a page at the target address.
+ *
+ * If it doesn't, load the page and satisfy the read
+ * from it.
+ *
+ * If it contains a regular page entry, satisfy the read
+ * from it.
+ *
+ * If it contains a multipage metadata entry at the target
+ * address, satisfy the read from the multi-page metadata
+ * entry if pb_ptr->vfd_swmr_writer is TRUE, and flag an
+ * error otherwise.
+ */
+ HDassert( size <= pb_ptr->page_size );
+
+ /* get the containing page */
+ H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL)
+
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \
+ TRUE, FALSE)
+
+ if ( ( NULL == entry_ptr ) &&
+ ( H5PB__load_page(shared, pb_ptr, page_addr, type, &entry_ptr) < 0))
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "page buffer page load request failed (2)")
+
+ HDassert( entry_ptr );
+ HDassert( entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC );
+ HDassert( entry_ptr->is_metadata );
+ HDassert( ( ! ( entry_ptr->is_mpmde ) ) ||
+ ( pb_ptr->vfd_swmr_writer) );
+
+ /* copy data from the page into read buffer */
+ HDmemcpy((uint8_t *)buf, (uint8_t *)(entry_ptr->image_ptr), size);
+
+ /* if the entry is on the LRU, update the replacement policy */
+ if ( ( ! (entry_ptr->is_mpmde) ) &&
+ ( entry_ptr->delay_write_until == 0 ) ) {
+
+ H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL)
+ }
+ }
+ }
+
+ prev_addr = addr;
done:
+
FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5PB_write() */
+
+} /* end H5PB__read_meta() */
/*-------------------------------------------------------------------------
- * Function: H5PB__insert_entry()
*
- * Purpose: This function was created without documentation.
- * What follows is my best understanding of Mohamad's intent.
+ * Function: H5PB__read_raw
*
- * Insert the supplied page into the page buffer, both the
- * skip list and the LRU.
+ * Purpose: Satisfy a raw data read in cases 3 and 4 from H5PB_read().
+ * Specifically:
*
- * As best I can tell, this function imposes no limit on the
- * number of entries in the page buffer beyond an assertion
- * failure it the page count exceeds the limit.
+ * 3) If the read is for raw data, and it is larger than the
+ * page size, read it directly from the HDF5 file.
*
- * JRM -- 12/22/16
+ * It is possible that the page buffer contains dirty pages
+ * that intersect with the read -- test for this and update
+ * the read buffer from the page buffer if any such pages
+ * exist.
*
+ * Note that no pages are inserted into the page buffer in
+ * this case.
*
- * Return: Non-negative on success/Negative on failure
+ * 4) If the read is for raw data, and it is of size less
+ * than or equal to the page size, satisfy the read from
+ * the page buffer, loading and inserting pages into the
+ * page buffer as necessary
+ *
+ * Observe that this implies that:
*
- * Programmer: Mohamad Chaarawi
+ * 1) The page buffer is defined.
+ *
+ * 2) The page buffer has been configured to accept at least
+ * one page of raw data.
+ *
+ * 3) This is a raw data read.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/11/18
+ *
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
static herr_t
-H5PB__insert_entry(H5PB_t *page_buf, H5PB_entry_t *page_entry)
+H5PB__read_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
+ void *buf/*out*/)
{
- herr_t ret_value = SUCCEED; /* Return value */
+ H5PB_t *pb_ptr; /* Page buffer for this file */
+ H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */
+ uint64_t first_page; /* page offset of first I/O */
+ uint64_t last_page; /* page offset of last I/O */
+ uint64_t search_page; /* page offset of current page */
+ haddr_t first_page_addr; /* address of first page of I/O */
+ haddr_t last_page_addr; /* address of last page of I/O */
+ haddr_t search_addr; /* Address of current page */
+ hsize_t num_touched_pages; /* Number of pages accessed */
+ size_t offset; /* offset of read in page */
+ size_t length; /* length of read in page */
+ hsize_t i; /* Local index variable */
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Sanity checks */
+ HDassert(shared);
+ HDassert(shared->pb_ptr);
+
+ pb_ptr = shared->pb_ptr;
+
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(pb_ptr->min_md_pages < pb_ptr->max_pages);
+ HDassert(H5FD_MEM_DRAW == type);
+
+
+ /* Calculate the aligned address of the first page */
+ first_page = (addr / pb_ptr->page_size);
+ first_page_addr = first_page * pb_ptr->page_size;
+
+ /* Calculate the aligned address of the last page */
+ last_page = ((addr + size - 1) / pb_ptr->page_size);
+ last_page_addr = last_page * pb_ptr->page_size;
+
+ /* Calculate number of pages that this read spans. */
+ num_touched_pages = last_page - first_page + 1;
+
+ if ( first_page_addr == last_page_addr ) {
+
+ HDassert(1 == num_touched_pages);
+ last_page_addr = HADDR_UNDEF;
+
+ }
+
+ /* case 3) raw data read of page size or greater. */
+ if ( size >= pb_ptr->page_size ) {
+
+ if ( H5FD_read(shared->lf, type, addr, size, buf) < 0)
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "read failed")
+
+
+ H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size);
+
+
+ /* For each page that intersects with the above read, check to see
+ * if it exists in the page buffer, and if so, if it is dirty.
+ *
+ * If it does and is, update the read buffer with the contents
+ * of the page so we get the up to date data into the buffer
+ * after the big read from the file.
+ */
+ search_page = first_page;
+ search_addr = first_page_addr;
+
+ for(i = 0; i < num_touched_pages; i++) {
+
+ H5PB__SEARCH_INDEX(pb_ptr, search_page, entry_ptr, FAIL)
+
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \
+ FALSE, FALSE)
+
+ if ( entry_ptr ) {
+
+ HDassert( entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC );
+ HDassert( ! ( entry_ptr->is_metadata ) );
+ HDassert( entry_ptr->page == search_page );
+ HDassert( entry_ptr->addr == search_addr );
+ HDassert( entry_ptr->size == pb_ptr->page_size );
+ HDassert( entry_ptr->delay_write_until == 0 );
+ /* This page and [addr, addr + size) should NOT be disjoint. */
+ HDassert(!(addr + size <= entry_ptr->addr || entry_ptr->addr + entry_ptr->size <= addr));
+
+ if ( entry_ptr->is_dirty ) {
+
+ if ( i == 0 ) {
+
+ /* handle the possible partial access of the
+ * first page.
+ */
+
+ HDassert( search_addr == first_page_addr );
+ HDassert( search_page == first_page );
+
+ offset = addr - first_page_addr;
+
+ HDassert((( offset == 0 ) && (search_addr == addr )) ||
+ (( offset > 0 ) && ( search_addr < addr )));
+
+ HDassert(pb_ptr->page_size >= offset);
+
+ HDassert( size >= pb_ptr->page_size - (size_t)offset );
+
+ HDmemcpy(buf, (uint8_t *)entry_ptr->image_ptr + offset,
+ pb_ptr->page_size - (size_t)offset);
+
+ } else if ( i == num_touched_pages - 1 ) {
+
+ /* handle the possible partial access of the
+ * last page.
+ */
+ HDassert( i > 0 );
+ HDassert( search_addr == last_page_addr );
+ HDassert( search_page == last_page );
+ HDassert( addr < last_page_addr );
+ HDassert( last_page_addr < addr + size );
+
+ offset = (num_touched_pages - 2) * pb_ptr->page_size +
+ (pb_ptr->page_size - (addr - first_page_addr));
+
+ HDmemcpy((uint8_t *)buf + offset, entry_ptr->image_ptr,
+ (size_t)((addr + size) - last_page_addr));
+
+ } else {
+
+ /* this is an internal page -- copy it in its
+ * entirety.
+ */
+
+ offset = (i - 1) * pb_ptr->page_size +
+ (pb_ptr->page_size - (addr - first_page_addr));
+
+ HDassert ( addr + offset == search_addr );
+ HDassert ( offset + pb_ptr->page_size <= size );
+
+ HDmemcpy((uint8_t *)buf + offset,
+ entry_ptr->image_ptr,
+ pb_ptr->page_size);
+ }
+
+ /* we have touched the entry -- move it to the top
+ * of the LRU if it resides there.
+ *
+ * The entry will be on the LRU if both it is not
+ * a multi-page metadata entry and it is not
+ * subject to a delayed write.
+ *
+ * As this is a raw data page buffer entry, both of
+ * these must be true, and are asserted above.
+ *
+ * Thus, just update the LRU.
+ */
+ H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL)
+
+ } /* if ( entry_ptr->is_dirty ) */
+ } /* if ( entry_ptr ) */
+
+ search_page++;
+ search_addr += pb_ptr->page_size;
+
+ } /* end for */
+ } else {
+ /* case 4: Raw data read of size less than page size.
+ *
+ * In this case, read the desired data from the page buffer, loading
+ * pages if necessary.
+ */
+ HDassert(size < pb_ptr->page_size);
+
+ /* first page */
+ offset = addr - first_page_addr;
+
+ if ( (offset + size) <= pb_ptr->page_size ) {
+
+ HDassert(num_touched_pages == 1);
+ length = size;
+
+ } else {
- FUNC_ENTER_STATIC
+ HDassert(num_touched_pages == 2);
+ length = size - (pb_ptr->page_size - offset);
+ }
- /* Insert entry in skip list */
- if(H5SL_insert(page_buf->slist_ptr, page_entry, &(page_entry->addr)) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTINSERT, FAIL, "can't insert entry in skip list")
- HDassert(H5SL_count(page_buf->slist_ptr) * page_buf->page_size <= page_buf->max_size);
+ /* get the first page */
+ H5PB__SEARCH_INDEX(pb_ptr, first_page, entry_ptr, FAIL)
- /* Increment appropriate page count */
- if(H5F_MEM_PAGE_DRAW == page_entry->type || H5F_MEM_PAGE_GHEAP == page_entry->type)
- page_buf->raw_count++;
- else
- page_buf->meta_count++;
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \
+ FALSE, FALSE)
- /* Insert entry in LRU */
- H5PB__INSERT_LRU(page_buf, page_entry)
+ if ( ( NULL == entry_ptr ) &&
+ ( H5PB__load_page(shared, pb_ptr, first_page_addr,
+ type, &entry_ptr) < 0 ) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "page buffer page load request failed (1)")
+
+ HDassert(entry_ptr);
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->addr == first_page_addr);
+
+
+ /* copy data from first page into read buffer */
+ HDmemcpy((uint8_t *)buf, ((uint8_t *)(entry_ptr->image_ptr) + offset),
+ length);
+
+ H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL)
+
+ /* second page, if it exists */
+ if ( num_touched_pages == 2 ) {
+
+ offset = length;
+ length = size - offset;
+
+ HDassert(offset + length == size);
+
+ /* get the second page */
+ H5PB__SEARCH_INDEX(pb_ptr, last_page, entry_ptr, FAIL)
+
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \
+ FALSE, FALSE)
+
+ if ( ( NULL == entry_ptr ) &&
+ ( H5PB__load_page(shared, pb_ptr, last_page_addr,
+ type, &entry_ptr) < 0 ) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "page buffer page load request failed (2)")
+
+ HDassert(entry_ptr);
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->addr == last_page_addr);
+ HDassert(entry_ptr->page == last_page);
+
+ /* copy data from second page into read buffer */
+ HDmemcpy(((uint8_t *)(buf) + offset),
+ (uint8_t *)(entry_ptr->image_ptr), length);
+
+ H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL)
+ }
+ } /* end else */
done:
+
FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5PB__insert_entry() */
+
+} /* end H5PB__read_raw() */
/*-------------------------------------------------------------------------
- * Function: H5PB__make_space()
*
- * Purpose: This function was created without documentation.
- * What follows is my best understanding of Mohamad's intent.
+ * Function: H5PB__write_meta
*
- * If necessary and if possible, evict a page from the page
- * buffer to make space for the supplied page. Depending on
- * the page buffer configuration and contents, and the page
- * supplied this may or may not be possible.
+ * Purpose: Satisfy a metadata write in cases 7 and 8 from H5PB_write().
+ * Specifically:
*
- * JRM -- 12/22/16
+ * 7) If the write is of metadata, the write is larger than
+ * one page, and vfd_swmr_writer is TRUE, the write must
+ * be buffered in the page buffer until the end of the tick.
*
- * Return: Non-negative on success/Negative on failure
+ * If it doesn't exist already, create a multi-page metadata
+ * entry in the page buffer and copy the write into it.
+ * Insert the new entry in the tick list if necessary.
+ *
+ * Test to see if the write of the multi-page metadata
+ * entry must be delayed. If so, place the entry in
+ * the delayed write list. Otherwise, the multi-page
+ * metadata entry will be written to the HDF5 file and
+ * evicted when the tick list is released at the end of the
+ * tick.
+ *
+ * 8) If the write is of metadata, and the write is of size
+ * less than or equal to the page size, write the data
+ * into the page buffer, loading and inserting a page
+ * if necessary.
+ *
+ * If, in addition, vfd_swmr_writer is TRUE, we must:
+ *
+ * * add the page touched by the write to the tick list
+ * so that it will be buffered until the end of the
+ * tick.
+ *
+ * * test to see if the write must be delayed, and
+ * add the page to the delayed write list if so.
+ *
+ * Observe that this implies that:
+ *
+ * 1) The page buffer is defined.
+ *
+ * 2) The page buffer has been configured to accept at least
+ * one page of metadata.
+ *
+ * 3) This is a metadata write.
*
- * Programmer: Mohamad Chaarawi
+ * Note also that if the metadata write is of size
+ * no larger than page size, it may not cross page
+ * boundaries.
+ *
+ * Further, for writes larger than page size (case 7 only),
+ * the base address must be page aligned.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer -- 10/11/18
+ *
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
-static htri_t
-H5PB__make_space(H5F_shared_t *f_sh, H5PB_t *page_buf, H5FD_mem_t inserted_type)
+static herr_t
+H5PB__write_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr,
+ size_t size, const void *buf/*in*/)
{
- H5PB_entry_t *page_entry; /* Pointer to page eviction candidate */
- htri_t ret_value = TRUE; /* Return value */
+ H5PB_t *pb_ptr; /* Page buffer for this file */
+ H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */
+ uint64_t page; /* page offset of addr */
+ haddr_t page_addr; /* page containing addr */
+ size_t offset; /* offset of write in page */
+ herr_t ret_value = SUCCEED; /* Return value */
- FUNC_ENTER_STATIC
+ FUNC_ENTER_NOAPI(FAIL)
- /* Sanity check */
- HDassert(f_sh);
- HDassert(page_buf);
-
- /* Get oldest entry */
- page_entry = page_buf->LRU_tail_ptr;
-
- if(H5FD_MEM_DRAW == inserted_type) {
- /* If threshould is 100% metadata and page buffer is full of
- metadata, then we can't make space for raw data */
- if(0 == page_buf->raw_count && page_buf->min_meta_count == page_buf->meta_count) {
- HDassert(page_buf->meta_count * page_buf->page_size == page_buf->max_size);
- HGOTO_DONE(FALSE)
- } /* end if */
-
- /* check the metadata threshold before evicting metadata items */
- while(1) {
- if(page_entry->prev && H5F_MEM_PAGE_META == page_entry->type &&
- page_buf->min_meta_count >= page_buf->meta_count)
- page_entry = page_entry->prev;
- else
- break;
- } /* end while */
- } /* end if */
- else {
- /* If threshould is 100% raw data and page buffer is full of
- raw data, then we can't make space for meta data */
- if(0 == page_buf->meta_count && page_buf->min_raw_count == page_buf->raw_count) {
- HDassert(page_buf->raw_count * page_buf->page_size == page_buf->max_size);
- HGOTO_DONE(FALSE)
- } /* end if */
-
- /* check the raw data threshold before evicting raw data items */
- while(1) {
- if(page_entry->prev && (H5F_MEM_PAGE_DRAW == page_entry->type || H5F_MEM_PAGE_GHEAP == page_entry->type) &&
- page_buf->min_raw_count >= page_buf->raw_count)
- page_entry = page_entry->prev;
- else
- break;
- } /* end while */
- } /* end else */
+ /* Sanity checks */
+ HDassert(shared);
+ HDassert(shared->pb_ptr);
+
+ pb_ptr = shared->pb_ptr;
+
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(pb_ptr->min_rd_pages < pb_ptr->max_pages);
+ HDassert(H5FD_MEM_DRAW != type);
+ HDassert(buf);
- /* Remove from page index */
- if(NULL == H5SL_remove(page_buf->slist_ptr, &(page_entry->addr)))
- HGOTO_ERROR(H5E_PAGEBUF, H5E_BADVALUE, FAIL, "Tail Page Entry is not in skip list")
+ /* Calculate the aligned address of the first page */
+ page = (addr / pb_ptr->page_size);
+ page_addr = page * pb_ptr->page_size;
+
+ /* if size > pb_ptr->page_size, addr must be page aligned */
+ HDassert((size <= pb_ptr->page_size) || (addr == page_addr));
+
+ H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL)
+
+ /* case 7) metadata write of size greater than page size. */
+ if ( size > pb_ptr->page_size ) {
+
+ offset = 0;
+
+ /* The write must be for a multi-page metadata entry, and
+ * we must be running as a VFD SWMR writer.
+ *
+ * This requires the following actions:
+ *
+ * 1) If the multi-page metadata entry is not already in the
+ * page buffer, create an entry for it.
+ *
+ * 2) Overwrite the image of the entry with the write buffer.
+ *
+ * 3) If the entry is not already on the tick list, add it to
+ * the tick list.
+ *
+ * 4) If the entry is not already on the delayed write list,
+ * test to see if it should be, and move it from the
+ * LRU to the delayed write list and set the delay_write_until
+ * field appropriately.
+ *
+ * This is done via the call to H5PB__mark_entry_dirty()
+ */
+ HDassert(pb_ptr->vfd_swmr_writer);
+ HDassert(addr == page_addr);
- /* Remove entry from LRU list */
- H5PB__REMOVE_LRU(page_buf, page_entry)
- HDassert(H5SL_count(page_buf->slist_ptr) == page_buf->LRU_list_len);
+ /* If we're about to overwrite a single-page entry with multiple
+ * pages, lengthen the entry.
+ */
+ if (entry_ptr != NULL && entry_ptr->size < size) {
+ H5PB_entry_t *overlap;
+ void *new_image = H5MM_malloc(size);
+ uint64_t iter_page;
+ uint64_t last_page = page +
+ roundup(size, pb_ptr->page_size) / pb_ptr->page_size;
+
+ hlog_fast(lengthen_pbentry,
+ "lengthening page %" PRIu64 " from %zu bytes to %zu, "
+ "last page %" PRIu64 "\n", page, entry_ptr->size, size,
+ last_page);
+
+ for (iter_page = page + 1; iter_page < last_page; iter_page++) {
+ H5PB__SEARCH_INDEX(pb_ptr, iter_page, overlap, FAIL)
+ assert(overlap == NULL);
+ }
+ if (new_image == NULL) {
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL,
+ "couldn't extend entry");
+ }
+ H5PB__UPDATE_RP_FOR_REMOVE(pb_ptr, entry_ptr, FAIL)
+
+ /* To keep statistics for the index and the tick-list up-to-date,
+ * it's expedient to remove and re-insert entries there.
+ */
+ H5PB__DELETE_FROM_INDEX(pb_ptr, entry_ptr, FAIL)
+ if (entry_ptr->modified_this_tick)
+ H5PB__REMOVE_FROM_TL(pb_ptr, entry_ptr, FAIL)
+
+ entry_ptr->image_ptr = H5MM_xfree(entry_ptr->image_ptr);
+ entry_ptr->image_ptr = new_image;
+ entry_ptr->is_mpmde = true;
+ entry_ptr->size = size;
+
+ if (entry_ptr->modified_this_tick)
+ H5PB__INSERT_IN_TL(pb_ptr, entry_ptr, FAIL)
+ H5PB__INSERT_IN_INDEX(pb_ptr, entry_ptr, FAIL)
+ }
+
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \
+ TRUE, TRUE)
+
+ if ( NULL == entry_ptr ) {
+
+ /* the multi-page metadata entry is not currently in the page
+ * buffer. Create an entry for it, and insert it into the LRU.
+ *
+ * Don't bother to try to make space for it, as VFD SWMR
+ * ignores the limits on page buffer size.
+ */
+ if ( H5PB__create_new_page(pb_ptr, addr, size, type,
+ FALSE, &entry_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "can't create new page buffer page")
+
+ /* set entry_ptr->loaded to TRUE so as to trigger the
+ * the delayed write test in H5PB__mark_entry_dirty().
+ */
+ entry_ptr->loaded = TRUE;
+ }
+
+ /* at this point, one way or the other, the multi-page metadata
+ * entry must be in the page buffer.
+ */
+ HDassert(entry_ptr->is_mpmde);
+ HDassert(size == entry_ptr->size);
+ HDassert(type == entry_ptr->mem_type);
- /* Decrement appropriate page type counter */
- if(H5F_MEM_PAGE_DRAW == page_entry->type || H5F_MEM_PAGE_GHEAP == page_entry->type)
- page_buf->raw_count--;
- else
- page_buf->meta_count--;
+ } else {
+ /* case 8) metadata write of size no larger than page size */
- /* Flush page if dirty */
- if(page_entry->is_dirty)
- if(H5PB__write_entry(f_sh, page_entry) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, "file write failed")
+ offset = addr - page_addr;
- /* Update statistics */
- if(page_entry->type == H5F_MEM_PAGE_DRAW || H5F_MEM_PAGE_GHEAP == page_entry->type)
- page_buf->evictions[1]++;
- else
- page_buf->evictions[0]++;
+ /* write cannot cross page boundaries. */
+ HDassert((offset + size) <= pb_ptr->page_size);
- /* Release page */
- page_entry->page_buf_ptr = H5FL_FAC_FREE(page_buf->page_fac, page_entry->page_buf_ptr);
- page_entry = H5FL_FREE(H5PB_entry_t, page_entry);
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \
+ TRUE, FALSE)
+
+ if (NULL == entry_ptr &&
+ H5PB__load_page(shared, pb_ptr, page_addr, type, &entry_ptr) < 0) {
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "page buffer page load request failed (1)")
+ }
+
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->addr == page_addr);
+ HDassert(!(entry_ptr->is_mpmde));
+ HDassert(entry_ptr->size == pb_ptr->page_size);
+ HDassert(size <= entry_ptr->size);
+ }
+
+ HDassert(entry_ptr->is_metadata);
+
+ /* copy data from the write buffer into the page image */
+ HDmemcpy((uint8_t *)(entry_ptr->image_ptr) + offset, buf, size);
+
+ if (H5PB__mark_entry_dirty(shared, pb_ptr, entry_ptr) < 0)
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "mark entry dirty failed")
+
+ /* Force the page buffer to retain the page until the end of
+ * the tick: add the entry to the tick list if it is not
+ * already present.
+ */
+ if (pb_ptr->vfd_swmr_writer && !entry_ptr->modified_this_tick) {
+ entry_ptr->modified_this_tick = true;
+ H5PB__INSERT_IN_TL(pb_ptr, entry_ptr, FAIL)
+ }
done:
+
FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5PB__make_space() */
+
+} /* end H5PB__write_meta() */
/*-------------------------------------------------------------------------
- * Function: H5PB__write_entry()
*
- * Purpose: ???
+ * Function: H5PB__write_raw
*
- * This function was created without documentation.
- * What follows is my best understanding of Mohamad's intent.
+ * Purpose: Satisfy a raw data write in cases 3 and 4 from H5PB_write().
+ * Specifically:
*
- * Return: Non-negative on success/Negative on failure
+ * 3) If the write is raw data, and it of page size or
+ * larger, write directly to the HDF5 file.
+ *
+ * It is possible that the write intersects one or more
+ * pages in the page buffer -- test for this and update
+ * any partially written pages, and evict any pages
+ * that are completely overwritten.
+ *
+ * Note that no pages are inserted into the page buffer in
+ * this case.
+ *
+ * 4) If the write is of raw data, and it is of size less
+ * than the page size, write the page into the page
+ * buffer, loading and inserting pages into the
+ * page buffer as necessary
+ *
+ * Observe that this implies that:
+ *
+ * 1) The page buffer is defined.
+ *
+ * 2) The page buffer has been configured to accept at least
+ * one page of raw data.
+ *
+ * 3) This is a raw data write.
+ *
+ * Return: Non-negative on success/Negative on failure
*
- * Programmer: Mohamad Chaarawi
+ * Programmer: John Mainzer -- 10/11/18
+ *
+ * Changes: None.
*
*-------------------------------------------------------------------------
*/
static herr_t
-H5PB__write_entry(H5F_shared_t *f_sh, H5PB_entry_t *page_entry)
+H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
+ const void *buf/*in*/)
{
- haddr_t eoa; /* Current EOA for the file */
- herr_t ret_value = SUCCEED; /* Return value */
+ H5PB_t *pb_ptr; /* Page buffer for this file */
+ H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */
+ uint64_t first_page; /* page offset of first I/O */
+ uint64_t last_page; /* page offset of last I/O */
+ uint64_t search_page; /* page offset of current page */
+ haddr_t first_page_addr; /* address of first page of I/O */
+ haddr_t last_page_addr; /* address of last page of I/O */
+ haddr_t search_addr; /* Address of current page */
+ hsize_t num_touched_pages; /* Number of pages accessed */
+ hsize_t i; /* Local index variable */
+ size_t length; /* length of write in a page */
+ size_t offset; /* offset of write in a page */
+ herr_t ret_value = SUCCEED; /* Return value */
- FUNC_ENTER_STATIC
+ FUNC_ENTER_NOAPI(FAIL)
- /* Sanity check */
- HDassert(f_sh);
- HDassert(page_entry);
+ /* Sanity checks */
+ HDassert(shared);
+ HDassert(shared->pb_ptr);
- /* Retrieve the 'eoa' for the file */
- if(HADDR_UNDEF == (eoa = H5F_shared_get_eoa(f_sh, (H5FD_mem_t)page_entry->type)))
- HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "driver get_eoa request failed")
+ pb_ptr = shared->pb_ptr;
- /* If the starting address of the page is larger than
- * the EOA, then the entire page is discarded without writing.
- */
- if(page_entry->addr <= eoa) {
- H5FD_t *file; /* File driver I/O info */
- size_t page_size = f_sh->page_buf->page_size;
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ HDassert(pb_ptr->min_md_pages < pb_ptr->max_pages);
+ HDassert(shared->lf);
- /* Adjust the page length if it exceeds the EOA */
- if((page_entry->addr + page_size) > eoa)
- page_size = (size_t)(eoa - page_entry->addr);
+ HDassert(H5FD_MEM_DRAW == type);
- /* Translate to file driver I/O info object */
- file = f_sh->lf;
+ /* Calculate the aligned address of the first page */
+ first_page = (addr / pb_ptr->page_size);
+ first_page_addr = first_page * pb_ptr->page_size;
- if(H5FD_write(file, (H5FD_mem_t)page_entry->type, page_entry->addr, page_size, page_entry->page_buf_ptr) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, "file write failed")
- } /* end if */
+ /* Calculate the aligned address of the last page */
+ last_page = ((addr + size - 1) / pb_ptr->page_size);
+ last_page_addr = last_page * pb_ptr->page_size;
+
+ /* Calculate number of pages that this write spans. */
+ num_touched_pages = last_page - first_page + 1;
+
+ if ( first_page_addr == last_page_addr ) {
+
+ HDassert(1 == num_touched_pages);
+ last_page_addr = HADDR_UNDEF;
+
+ }
+
+ /* case 3) raw data write of page size or greater. */
+ if ( size >= pb_ptr->page_size ) {
+ if ( H5FD_write(shared->lf, type, addr, size, buf) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+ "write through metadata accumulator failed")
+
+
+ H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size);
+
+ /* For each page that intersects with the above write, check to see
+ * if it exists in the page buffer.
+ *
+ * If it does, and if the write fully overwrites the page,
+ * mark the page clean and evict it.
+ *
+ * If the write only partially intersects a page, update the
+ * page and mark it dirty.
+ */
+ search_page = first_page;
+ search_addr = first_page_addr;
+
+ for(i = 0; i < num_touched_pages; i++) {
+
+ H5PB__SEARCH_INDEX(pb_ptr, search_page, entry_ptr, FAIL)
+
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \
+ FALSE, FALSE)
+
+ if ( entry_ptr ) {
+
+ HDassert( entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC );
+ HDassert( ! ( entry_ptr->is_metadata ) );
+ HDassert( entry_ptr->page == search_page );
+ HDassert( entry_ptr->addr == search_addr );
+ HDassert( entry_ptr->size == pb_ptr->page_size );
+ HDassert( entry_ptr->delay_write_until == 0 );
+ HDassert( entry_ptr->addr <= addr + size );
+
+ if ( ( addr <= entry_ptr->addr ) &&
+ ( entry_ptr->addr + entry_ptr->size <= addr + size ) ) {
+
+ /* the page is completely overwritten -- mark it clean
+ * and evict it.
+ */
+ if ( ( entry_ptr->is_dirty ) &&
+ ( H5PB__mark_entry_clean(pb_ptr, entry_ptr) < 0 ) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "mark entry clean failed")
+
+ if (H5PB__evict_entry(shared, entry_ptr, TRUE, false) < 0)
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "forced eviction failed (1)")
+
+ } else if ( i == 0 ) {
+
+ /* handle partial overwrite of the first page. */
+
+ HDassert( search_addr == first_page_addr );
+ HDassert( search_page == first_page );
+ HDassert( search_addr < addr );
+ HDassert( entry_ptr->addr + entry_ptr->size <=
+ addr + size );
+
+ offset = addr - first_page_addr;
+
+ HDassert( offset > 0 );
+ HDassert( pb_ptr->page_size >= offset );
+ HDassert( size >= pb_ptr->page_size - (size_t)offset );
+
+ HDmemcpy((uint8_t *)entry_ptr->image_ptr + offset, buf,
+ pb_ptr->page_size - (size_t)offset);
+
+ if ( H5PB__mark_entry_dirty(shared, pb_ptr, entry_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "mark entry dirty failed (1)")
+
+ } else if ( i == num_touched_pages - 1 ) {
+
+ /* handle partial overwrite of the last page. */
+ HDassert( i > 0 );
+ HDassert( search_addr == last_page_addr );
+ HDassert( search_page == last_page );
+ HDassert( addr < last_page_addr );
+ HDassert( last_page_addr < addr + size );
- page_entry->is_dirty = FALSE;
+ offset = (num_touched_pages - 2) * pb_ptr->page_size +
+ (pb_ptr->page_size - (addr - first_page_addr));
+
+ HDmemcpy(entry_ptr->image_ptr,
+ (const uint8_t *)buf + offset,
+ (size_t)((addr + size) - last_page_addr));
+
+ if ( H5PB__mark_entry_dirty(shared, pb_ptr, entry_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "mark entry dirty failed (2)")
+ } else {
+
+ /* this should be un-reachable */
+ HDassert(FALSE);
+
+ }
+ } /* if ( entry_ptr ) */
+
+ search_page++;
+ search_addr += pb_ptr->page_size;
+
+ } /* end for */
+ } else {
+ /* case 4: Raw data write of size less than page size.
+ *
+ * In this case, write the data to the page buffer, loading
+ * pages if necessary.
+ */
+ HDassert(size < pb_ptr->page_size);
+
+ /* first page */
+ offset = addr - first_page_addr;
+
+ if ( (offset + size) <= pb_ptr->page_size ) {
+
+ HDassert(num_touched_pages == 1);
+ length = size;
+
+ } else {
+
+ HDassert(num_touched_pages == 2);
+ length = pb_ptr->page_size - offset;
+ HDassert( offset + length == pb_ptr->page_size );
+ }
+
+ /* get the first page */
+ H5PB__SEARCH_INDEX(pb_ptr, first_page, entry_ptr, FAIL)
+
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \
+ FALSE, FALSE)
+
+ if ( ( NULL == entry_ptr ) &&
+ ( H5PB__load_page(shared, pb_ptr, first_page_addr,
+ type, &entry_ptr) < 0 ) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "page buffer page load request failed (1)")
+
+ HDassert(entry_ptr);
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->addr == first_page_addr);
+
+
+ /* copy data from the write buffer into the first page */
+ HDmemcpy(((uint8_t *)(entry_ptr->image_ptr)) + offset,
+ (const uint8_t *)buf, length);
+
+ if ( H5PB__mark_entry_dirty(shared, pb_ptr, entry_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "mark entry dirty failed (3)")
+
+ /* second page, if it exists */
+ if ( num_touched_pages == 2 ) {
+
+ offset = length;
+ length = size - offset;
+
+ HDassert(offset + length == size);
+
+ /* get the second page */
+ H5PB__SEARCH_INDEX(pb_ptr, last_page, entry_ptr, FAIL)
+
+ /* update hit rate stats */
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, (entry_ptr != NULL), \
+ FALSE, FALSE)
+
+ if ( ( NULL == entry_ptr ) &&
+ ( H5PB__load_page(shared, pb_ptr, last_page_addr,
+ type, &entry_ptr) < 0 ) )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "page buffer page load request failed (2)")
+
+ HDassert(entry_ptr);
+ HDassert(entry_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC);
+ HDassert(entry_ptr->addr == last_page_addr);
+ HDassert(entry_ptr->page == last_page);
+
+ /* copy data from the write buffer into the second page */
+ HDmemcpy((uint8_t *)(entry_ptr->image_ptr),
+ ((const uint8_t *)(buf) + offset), length);
+
+ if ( H5PB__mark_entry_dirty(shared, pb_ptr, entry_ptr) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "mark entry dirty failed (3)")
+ }
+ }
done:
+
FUNC_LEAVE_NOAPI(ret_value)
-} /* end H5PB__write_entry() */
+
+} /* end H5PB__write_raw() */
diff --git a/src/H5PBpkg.h b/src/H5PBpkg.h
index 6b9168b..fb9f29f 100644
--- a/src/H5PBpkg.h
+++ b/src/H5PBpkg.h
@@ -21,38 +21,1923 @@
/* Get package's private header */
#include "H5PBprivate.h"
-/* Other private headers needed by this file */
+/*
+ * File: H5PBpkg.h
+ *
+ * Purpose: This file contains declarations which are normally visible
+ * only within the H5PB package.
+ *
+ * Source files outside the H5PB package should include
+ * H5PBprivate.h instead.
+ *
+ * Programmer: John Mainzer -- 10/07/18
+ */
/**************************/
/* Package Private Macros */
/**************************/
+/* page buffer configuration settings */
+#define H5PB__H5PB_ENTRY_T_MAGIC 0x02030405
+#define H5PB__DO_SANITY_CHECKS TRUE
+#define H5PB__COLLECT_PAGE_BUFFER_STATS TRUE
+
+
+/****************************************************************************
+ *
+ * We maintain doubly linked lists of instances of H5PB_entry_t for a
+ * variety of reasons -- LRU list, tick list, and the delayed write list
+ * at present. The following macros support linking and unlinking
+ * instances of H5PB_entry_t by both their regular and tick list next
+ * and previous pointers. Note that the tick list and the delayed write
+ * list are only used in the context of VFD SWMR
+ *
+ * The size and length fields are also maintained.
+ *
+ * Note that the relevant pair of prev and next pointers are presumed to be
+ * NULL on entry in the insertion macros.
+ *
+ * Finally, observe that the sanity checking macros evaluate to the empty
+ * string when H5PB__DO_SANITY_CHECKS is FALSE. They also contain calls
+ * to the HGOTO_ERROR macro, which may not be appropriate in all cases.
+ * If so, we will need versions of the insertion and deletion macros which
+ * do not reference the sanity checking macros.
+ * JRM - 10/07/18
+ *
+ ****************************************************************************/
+
+#if H5PB__DO_SANITY_CHECKS
+
+#define H5PB__DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \
+if ( ( (head_ptr) == NULL ) || \
+ ( (tail_ptr) == NULL ) || \
+ ( (entry_ptr) == NULL ) || \
+ ( (len) <= 0 ) || \
+ ( (Size) < (int64_t)((entry_ptr)->size ) ) || \
+ ( ( (Size) == (int64_t)((entry_ptr)->size) ) && ( ! ( (len) == 1 ) ) ) || \
+ ( ( (entry_ptr)->prev == NULL ) && ( (head_ptr) != (entry_ptr) ) ) || \
+ ( ( (entry_ptr)->next == NULL ) && ( (tail_ptr) != (entry_ptr) ) ) || \
+ ( ( (len) == 1 ) && \
+ ( ! ( ( (head_ptr) == (entry_ptr) ) && \
+ ( (tail_ptr) == (entry_ptr) ) && \
+ ( (entry_ptr)->next == NULL ) && \
+ ( (entry_ptr)->prev == NULL ) && \
+ ( (Size) == (int64_t)((entry_ptr)->size) ) \
+ ) \
+ ) \
+ ) \
+ ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "DLL pre remove SC failed") \
+}
+
+#define H5PB__DLL_SC(head_ptr, tail_ptr, len, Size, fv) \
+if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \
+ ( (head_ptr) != (tail_ptr) ) \
+ ) || \
+ ( (len) < 0 ) || \
+ ( (Size) < 0 ) || \
+ ( ( (len) == 1 ) && \
+ ( ( (head_ptr) != (tail_ptr) ) || \
+ ( (head_ptr) == NULL ) || ( (head_ptr)->size != (size_t)(Size) ) \
+ ) \
+ ) || \
+ ( ( (len) >= 1 ) && \
+ ( ( (head_ptr) == NULL ) || ( (head_ptr)->prev != NULL ) || \
+ ( (tail_ptr) == NULL ) || ( (tail_ptr)->next != NULL ) \
+ ) \
+ ) \
+ ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "DLL sanity check failed") \
+}
+
+#define H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \
+if ( ( (entry_ptr) == NULL ) || \
+ ( (entry_ptr)->next != NULL ) || \
+ ( (entry_ptr)->prev != NULL ) || \
+ ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \
+ ( (head_ptr) != (tail_ptr) ) \
+ ) || \
+ ( ( (len) == 1 ) && \
+ ( ( (head_ptr) != (tail_ptr) ) || \
+ ( (head_ptr) == NULL ) || ( (head_ptr)->size != (size_t)(Size) ) \
+ ) \
+ ) || \
+ ( ( (len) >= 1 ) && \
+ ( ( (head_ptr) == NULL ) || ( (head_ptr)->prev != NULL ) || \
+ ( (tail_ptr) == NULL ) || ( (tail_ptr)->next != NULL ) \
+ ) \
+ ) \
+ ) { \
+ HDassert(FALSE); \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "DLL pre insert SC failed") \
+}
+
+#else /* H5PB__DO_SANITY_CHECKS */
+
+#define H5PB__DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv)
+#define H5PB__DLL_SC(head_ptr, tail_ptr, len, Size, fv)
+#define H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv)
+
+#endif /* H5PB__DO_SANITY_CHECKS */
+
+
+#define H5PB__DLL_APPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val) \
+{ \
+ H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \
+ fail_val) \
+ if ( (head_ptr) == NULL ) \
+ { \
+ (head_ptr) = (entry_ptr); \
+ (tail_ptr) = (entry_ptr); \
+ } \
+ else \
+ { \
+ (tail_ptr)->next = (entry_ptr); \
+ (entry_ptr)->prev = (tail_ptr); \
+ (tail_ptr) = (entry_ptr); \
+ } \
+ (len)++; \
+ (Size) += (int64_t)((entry_ptr)->size); \
+} /* H5PB__DLL_APPEND() */
+
+#define H5PB__DLL_PREPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val) \
+{ \
+ H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \
+ fail_val) \
+ if ( (head_ptr) == NULL ) \
+ { \
+ (head_ptr) = (entry_ptr); \
+ (tail_ptr) = (entry_ptr); \
+ } \
+ else \
+ { \
+ (head_ptr)->prev = (entry_ptr); \
+ (entry_ptr)->next = (head_ptr); \
+ (head_ptr) = (entry_ptr); \
+ } \
+ (len)++; \
+ (Size) += (int64_t)((entry_ptr)->size); \
+} /* H5PB__DLL_PREPEND() */
+
+#define H5PB__DLL_INSERT_BEFORE(entry_ptr, suc_ptr, head_ptr, tail_ptr, len, \
+ Size, fail_val) \
+{ \
+ HDassert( ((suc_ptr) == NULL) || \
+ ((suc_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC) ); \
+ \
+ if ( suc_ptr == NULL ) \
+ /* list empty or no successor -- append */ \
+ H5PB__DLL_APPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val) \
+ \
+ else if ( suc_ptr->prev == NULL ) \
+ /* successor at head of list -- prepend */ \
+ H5PB__DLL_PREPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val) \
+ \
+ else /* sucessor in body of list -- insert before it */ \
+ { \
+ H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \
+ fail_val) \
+ HDassert(suc_ptr->prev->magic == H5PB__H5PB_ENTRY_T_MAGIC); \
+ HDassert(suc_ptr->prev->next == suc_ptr); \
+ entry_ptr->prev = suc_ptr->prev; \
+ entry_ptr->prev->next = entry_ptr; \
+ entry_ptr->next = suc_ptr; \
+ suc_ptr->prev = entry_ptr; \
+ (len)++; \
+ (Size) += (int64_t)((entry_ptr)->size); \
+ } \
+} /* H5PB__DLL_INSERT_BEFORE() */
+
+#define H5PB__DLL_REMOVE(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val) \
+{ \
+ H5PB__DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \
+ fail_val) \
+ { \
+ if ( (head_ptr) == (entry_ptr) ) \
+ { \
+ (head_ptr) = (entry_ptr)->next; \
+ if ( (head_ptr) != NULL ) \
+ (head_ptr)->prev = NULL; \
+ } \
+ else \
+ (entry_ptr)->prev->next = (entry_ptr)->next; \
+ if ( (tail_ptr) == (entry_ptr) ) \
+ { \
+ (tail_ptr) = (entry_ptr)->prev; \
+ if ( (tail_ptr) != NULL ) \
+ (tail_ptr)->next = NULL; \
+ } \
+ else \
+ (entry_ptr)->next->prev = (entry_ptr)->prev; \
+ entry_ptr->next = NULL; \
+ entry_ptr->prev = NULL; \
+ (len)--; \
+ (Size) -= (int64_t)((entry_ptr)->size); \
+ } \
+} /* H5PB__DLL_REMOVE() */
+
+
+#if H5PB__DO_SANITY_CHECKS
+
+#define H5PB__IL_DLL_PRE_REMOVE_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \
+if ( ( (hd_ptr) == NULL ) || \
+ ( (tail_ptr) == NULL ) || \
+ ( (entry_ptr) == NULL ) || \
+ ( (len) <= 0 ) || \
+ ( (Size) < (int64_t)((entry_ptr)->size) ) || \
+ ( ( (Size) == (int64_t)((entry_ptr)->size) ) && \
+ ( ! ( (len) == 1 ) ) ) || \
+ ( ( (entry_ptr)->il_prev == NULL ) && ( (hd_ptr) != (entry_ptr) ) ) || \
+ ( ( (entry_ptr)->il_next == NULL ) && ( (tail_ptr) != (entry_ptr) ) ) || \
+ ( ( (len) == 1 ) && \
+ ( ! ( ( (hd_ptr) == (entry_ptr) ) && ( (tail_ptr) == (entry_ptr) ) && \
+ ( (entry_ptr)->il_next == NULL ) && \
+ ( (entry_ptr)->il_prev == NULL ) && \
+ ( (Size) == (int64_t)((entry_ptr)->size) ) \
+ ) \
+ ) \
+ ) \
+ ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "il DLL pre remove SC failed") \
+}
+
+#define H5PB__IL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \
+if ( ( (entry_ptr) == NULL ) || \
+ ( (entry_ptr)->il_next != NULL ) || \
+ ( (entry_ptr)->il_prev != NULL ) || \
+ ( ( ( (hd_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \
+ ( (hd_ptr) != (tail_ptr) ) \
+ ) || \
+ ( ( (len) == 1 ) && \
+ ( ( (hd_ptr) != (tail_ptr) ) || ( (Size) <= 0 ) || \
+ ( (hd_ptr) == NULL ) || ( (int64_t)((hd_ptr)->size) != (Size) ) \
+ ) \
+ ) || \
+ ( ( (len) >= 1 ) && \
+ ( ( (hd_ptr) == NULL ) || ( (hd_ptr)->il_prev != NULL ) || \
+ ( (tail_ptr) == NULL ) || ( (tail_ptr)->il_next != NULL ) \
+ ) \
+ ) \
+ ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "IL DLL pre insert SC failed") \
+}
+
+#define H5PB__IL_DLL_SC(head_ptr, tail_ptr, len, Size, fv) \
+if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \
+ ( (head_ptr) != (tail_ptr) ) \
+ ) || \
+ ( ( (len) == 1 ) && \
+ ( ( (head_ptr) != (tail_ptr) ) || \
+ ( (head_ptr) == NULL ) || ( (int64_t)((head_ptr)->size) != (Size) ) \
+ ) \
+ ) || \
+ ( ( (len) >= 1 ) && \
+ ( ( (head_ptr) == NULL ) || ( (head_ptr)->il_prev != NULL ) || \
+ ( (tail_ptr) == NULL ) || ( (tail_ptr)->il_next != NULL ) \
+ ) \
+ ) \
+ ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "IL DLL sanity check failed") \
+}
+
+#else /* H5PB__DO_SANITY_CHECKS */
+
+#define H5PB__IL_DLL_PRE_REMOVE_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv)
+#define H5PB__IL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv)
+#define H5PB__IL_DLL_SC(head_ptr, tail_ptr, len, Size, fv)
+
+#endif /* H5PB__DO_SANITY_CHECKS */
+
+
+#define H5PB__IL_DLL_APPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val)\
+{ \
+ H5PB__IL_DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \
+ fail_val) \
+ if ( (head_ptr) == NULL ) \
+ { \
+ (head_ptr) = (entry_ptr); \
+ (tail_ptr) = (entry_ptr); \
+ } \
+ else \
+ { \
+ (tail_ptr)->il_next = (entry_ptr); \
+ (entry_ptr)->il_prev = (tail_ptr); \
+ (tail_ptr) = (entry_ptr); \
+ } \
+ (len)++; \
+ (Size) += (int64_t)((entry_ptr)->size); \
+ H5PB__IL_DLL_SC(head_ptr, tail_ptr, len, Size, fail_val) \
+} /* H5PB__IL_DLL_APPEND() */
+
+#define H5PB__IL_DLL_PREPEND(entry_ptr, head_ptr, tail_ptr, len, Size, \
+ fail_val) \
+{ \
+ H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \
+ fail_val) \
+ if ( (head_ptr) == NULL ) \
+ { \
+ (head_ptr) = (entry_ptr); \
+ (tail_ptr) = (entry_ptr); \
+ } \
+ else \
+ { \
+ (head_ptr)->il_prev = (entry_ptr); \
+ (entry_ptr)->il_next = (head_ptr); \
+ (head_ptr) = (entry_ptr); \
+ } \
+ (len)++; \
+ (Size) += (int64_t)((entry_ptr)->size); \
+} /* H5PB__DLL_PREPEND() */
+
+
+#define H5PB__IL_DLL_INSERT_BEFORE(entry_ptr, suc_ptr, head_ptr, tail_ptr, \
+ len, Size, fail_val) \
+{ \
+ HDassert( ((suc_ptr) == NULL) || \
+ ((suc_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC) ); \
+ \
+ if ( suc_ptr == NULL ) \
+ /* list empty or no successor -- append */ \
+ H5PB__IL_DLL_APPEND(entry_ptr, head_ptr, tail_ptr, len, Size, \
+ fail_val) \
+ \
+ else if ( suc_ptr->il_prev == NULL ) \
+ /* successor at head of list -- prepend */ \
+ H5PB__IL_DLL_PREPEND(entry_ptr, head_ptr, tail_ptr, len, Size, \
+ fail_val) \
+ \
+ else /* sucessor in body of list -- insert before it */ \
+ { \
+ H5PB__IL_DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \
+ fail_val) \
+ HDassert(suc_ptr->il_prev->magic == H5PB__H5PB_ENTRY_T_MAGIC); \
+ HDassert(suc_ptr->il_prev->il_next == suc_ptr); \
+ entry_ptr->il_prev = suc_ptr->il_prev; \
+ entry_ptr->il_prev->il_next = entry_ptr; \
+ entry_ptr->il_next = suc_ptr; \
+ suc_ptr->il_prev = entry_ptr; \
+ (len)++; \
+ (Size) += (int64_t)((entry_ptr)->size); \
+ } \
+} /* H5PB__DLL_INSERT_BEFORE() */
+
+#define H5PB__IL_DLL_REMOVE(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \
+{ \
+ H5PB__IL_DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \
+ { \
+ if ( (head_ptr) == (entry_ptr) ) \
+ { \
+ (head_ptr) = (entry_ptr)->il_next; \
+ if ( (head_ptr) != NULL ) \
+ (head_ptr)->il_prev = NULL; \
+ } \
+ else \
+ (entry_ptr)->il_prev->il_next = (entry_ptr)->il_next; \
+ if ( (tail_ptr) == (entry_ptr) ) \
+ { \
+ (tail_ptr) = (entry_ptr)->il_prev; \
+ if ( (tail_ptr) != NULL ) \
+ (tail_ptr)->il_next = NULL; \
+ } \
+ else \
+ (entry_ptr)->il_next->il_prev = (entry_ptr)->il_prev; \
+ entry_ptr->il_next = NULL; \
+ entry_ptr->il_prev = NULL; \
+ (len)--; \
+ (Size) -= (int64_t)((entry_ptr)->size); \
+ } \
+ H5PB__IL_DLL_SC(head_ptr, tail_ptr, len, Size, fv) \
+} /* H5PB__IL_DLL_REMOVE() */
+
+
+#if H5PB__DO_SANITY_CHECKS
+
+#define H5PB__TL_DLL_PRE_REMOVE_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \
+if ( ( (hd_ptr) == NULL ) || \
+ ( (tail_ptr) == NULL ) || \
+ ( (entry_ptr) == NULL ) || \
+ ( (len) <= 0 ) || \
+ ( (Size) < (int64_t)((entry_ptr)->size ) ) || \
+ ( ( (Size) == (int64_t)((entry_ptr)->size) ) && ( ! ( (len) == 1 ) ) ) || \
+ ( ( (entry_ptr)->tl_prev == NULL ) && ( (hd_ptr) != (entry_ptr) ) ) || \
+ ( ( (entry_ptr)->tl_next == NULL ) && ( (tail_ptr) != (entry_ptr) ) ) || \
+ ( ( (len) == 1 ) && \
+ ( ! ( ( (hd_ptr) == (entry_ptr) ) && ( (tail_ptr) == (entry_ptr) ) && \
+ ( (entry_ptr)->tl_next == NULL ) && \
+ ( (entry_ptr)->tl_prev == NULL ) && \
+ ( (Size) == (int64_t)((entry_ptr)->size) ) \
+ ) \
+ ) \
+ ) \
+ ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "TL DLL pre remove SC failed") \
+}
+
+#define H5PB__TL_DLL_SC(head_ptr, tail_ptr, len, Size, fv) \
+if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \
+ ( (head_ptr) != (tail_ptr) ) \
+ ) || \
+ ( (len) < 0 ) || \
+ ( (Size) < 0 ) || \
+ ( ( (len) == 1 ) && \
+ ( ( (head_ptr) != (tail_ptr) ) || ( (Size) <= 0 ) || \
+ ( (head_ptr) == NULL ) || ( (head_ptr)->size != (Size) ) \
+ ) \
+ ) || \
+ ( ( (len) >= 1 ) && \
+ ( ( (head_ptr) == NULL ) || ( (head_ptr)->tl_prev != NULL ) || \
+ ( (tail_ptr) == NULL ) || ( (tail_ptr)->tl_next != NULL ) \
+ ) \
+ ) \
+ ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "TL DLL sanity check failed") \
+}
+
+#define H5PB__TL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \
+if ( ( (entry_ptr) == NULL ) || \
+ ( (entry_ptr)->tl_next != NULL ) || \
+ ( (entry_ptr)->tl_prev != NULL ) || \
+ ( ( ( (hd_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \
+ ( (hd_ptr) != (tail_ptr) ) \
+ ) || \
+ ( ( (len) == 1 ) && \
+ ( ( (hd_ptr) != (tail_ptr) ) || ( (Size) <= 0 ) || \
+ ( (hd_ptr) == NULL ) || ( (int64_t)((hd_ptr)->size) != (Size) ) \
+ ) \
+ ) || \
+ ( ( (len) >= 1 ) && \
+ ( ( (hd_ptr) == NULL ) || ( (hd_ptr)->tl_prev != NULL ) || \
+ ( (tail_ptr) == NULL ) || ( (tail_ptr)->tl_next != NULL ) \
+ ) \
+ ) \
+ ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "TL DLL pre insert SC failed") \
+}
+
+#else /* H5PB__DO_SANITY_CHECKS */
+
+#define H5PB__TL_DLL_PRE_REMOVE_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv)
+#define H5PB__TL_DLL_SC(head_ptr, tail_ptr, len, Size, fv)
+#define H5PB__TL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv)
+
+#endif /* H5PB__DO_SANITY_CHECKS */
+
+
+#define H5PB__TL_DLL_APPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val)\
+{ \
+ H5PB__TL_DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \
+ fail_val) \
+ if ( (head_ptr) == NULL ) \
+ { \
+ (head_ptr) = (entry_ptr); \
+ (tail_ptr) = (entry_ptr); \
+ } \
+ else \
+ { \
+ (tail_ptr)->tl_next = (entry_ptr); \
+ (entry_ptr)->tl_prev = (tail_ptr); \
+ (tail_ptr) = (entry_ptr); \
+ } \
+ (len)++; \
+ (Size) += entry_ptr->size; \
+} /* H5PB__AUX_DLL_APPEND() */
+
+#define H5PB__TL_DLL_PREPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \
+{ \
+ H5PB__TL_DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \
+ if ( (head_ptr) == NULL ) \
+ { \
+ (head_ptr) = (entry_ptr); \
+ (tail_ptr) = (entry_ptr); \
+ } \
+ else \
+ { \
+ (head_ptr)->tl_prev = (entry_ptr); \
+ (entry_ptr)->tl_next = (head_ptr); \
+ (head_ptr) = (entry_ptr); \
+ } \
+ (len)++; \
+ (Size) += (int64_t)(entry_ptr->size); \
+} /* H5PB__TL_DLL_PREPEND() */
+
+#define H5PB__TL_DLL_REMOVE(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \
+{ \
+ H5PB__TL_DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \
+ { \
+ if ( (head_ptr) == (entry_ptr) ) \
+ { \
+ (head_ptr) = (entry_ptr)->tl_next; \
+ if ( (head_ptr) != NULL ) \
+ (head_ptr)->tl_prev = NULL; \
+ } \
+ else \
+ (entry_ptr)->tl_prev->tl_next = (entry_ptr)->tl_next; \
+ if ( (tail_ptr) == (entry_ptr) ) \
+ { \
+ (tail_ptr) = (entry_ptr)->tl_prev; \
+ if ( (tail_ptr) != NULL ) \
+ (tail_ptr)->tl_next = NULL; \
+ } \
+ else \
+ (entry_ptr)->tl_next->tl_prev = (entry_ptr)->tl_prev; \
+ entry_ptr->tl_next = NULL; \
+ entry_ptr->tl_prev = NULL; \
+ (len)--; \
+ (Size) -= (int64_t)(entry_ptr->size); \
+ } \
+} /* H5PB__TL_DLL_REMOVE() */
+
+
+/***********************************************************************
+ *
+ * Stats collection macros
+ *
+ * The following macros must handle stats collection when this collection
+ * is enabled, and evaluate to the empty string when it is not.
+ *
+ * The sole exception to this rule is
+ * H5PB__UPDATE_PB_HIT_RATE_STATS(), which is always active as
+ * the page buffer hit rate stats are always collected and available.
+ *
+ ***********************************************************************/
+
+#if H5PB__COLLECT_PAGE_BUFFER_STATS
+
+#define H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, hit, is_metadata, is_mpmde) \
+{ \
+ int ii; \
+ \
+ HDassert(pb_ptr); \
+ HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \
+ \
+ if ( is_metadata ) { \
+ if ( is_mpmde ) { \
+ ii = H5PB__STATS_MPMDE; \
+ } else { \
+ ii = H5PB__STATS_MD; \
+ } \
+ } else { \
+ ii = H5PB__STATS_RD; \
+ } \
+ if ( hit ) \
+ ((pb_ptr)->hits[ii])++; \
+ else \
+ ((pb_ptr)->misses[ii])++; \
+} /* H5PB__UPDATE_PB_HIT_RATE_STATS */
+
+#define H5PB__UPDATE_HT_SIZE_STATS(pb_ptr) \
+ if ( (pb_ptr)->index_len > (pb_ptr)->max_index_len ) \
+ (pb_ptr)->max_index_len = (pb_ptr)->index_len; \
+ if ( (pb_ptr)->clean_index_len > (pb_ptr)->max_clean_index_len ) \
+ (pb_ptr)->max_clean_index_len = (pb_ptr)->clean_index_len; \
+ if ( (pb_ptr)->dirty_index_len > (pb_ptr)->max_dirty_index_len ) \
+ (pb_ptr)->max_dirty_index_len = (pb_ptr)->dirty_index_len; \
+ if ( (pb_ptr)->index_size > (pb_ptr)->max_index_size ) \
+ (pb_ptr)->max_index_size = (pb_ptr)->index_size; \
+ if ( (pb_ptr)->clean_index_size > (pb_ptr)->max_clean_index_size ) \
+ (pb_ptr)->max_clean_index_size = (pb_ptr)->clean_index_size; \
+ if ( (pb_ptr)->dirty_index_size > (pb_ptr)->max_dirty_index_size ) \
+ (pb_ptr)->max_dirty_index_size = (pb_ptr)->dirty_index_size; \
+ if ( (pb_ptr)->curr_md_pages > (pb_ptr)->max_md_pages ) \
+ (pb_ptr)->max_md_pages = (pb_ptr)->curr_md_pages; \
+ if ( (pb_ptr)->curr_rd_pages > (pb_ptr)->max_rd_pages ) \
+ (pb_ptr)->max_rd_pages = (pb_ptr)->curr_rd_pages; \
+ if ( (pb_ptr)->mpmde_count > (pb_ptr)->max_mpmde_count ) \
+ (pb_ptr)->max_rd_pages = (pb_ptr)->curr_rd_pages;
+
+#define H5PB__UPDATE_STATS_FOR_HT_INSERTION(pb_ptr) \
+ ((pb_ptr)->total_ht_insertions)++;
+
+
+#define H5PB__UPDATE_STATS_FOR_HT_DELETION(pb_ptr) \
+ (pb_ptr)->total_ht_deletions++;
+
+#define H5PB__UPDATE_STATS_FOR_HT_SEARCH(pb_ptr, success, depth) \
+ HDassert(depth >= 0); \
+ if ( success ) { \
+ (pb_ptr)->successful_ht_searches++; \
+ (pb_ptr)->total_successful_ht_search_depth += (int64_t)depth; \
+ } else { \
+ (pb_ptr)->failed_ht_searches++; \
+ (pb_ptr)->total_failed_ht_search_depth += (int64_t)depth; \
+ }
+
+#define H5PB__UPDATE_LRU_SIZE_STATS(pb_ptr) \
+ if ( (pb_ptr)->LRU_len > (pb_ptr)->max_lru_len ) \
+ (pb_ptr)->max_lru_len = (pb_ptr)->LRU_len; \
+ if ( (pb_ptr)->LRU_size > (pb_ptr)->max_lru_size ) \
+ (pb_ptr)->max_lru_size = (pb_ptr)->LRU_size;
+
+#define H5PB__UPDATE_STATS_FOR_LRU_MD_SKIP(pb_ptr) \
+ ((pb_ptr)->lru_md_skips)++;
+
+#define H5PB__UPDATE_STATS_FOR_LRU_RD_SKIP(pb_ptr) \
+ ((pb_ptr)->lru_rd_skips)++;
+
+#define H5PB__UPDATE_STATS_FOR_LRU_TL_SKIP(pb_ptr) \
+{ \
+ HDassert(pb_ptr->vfd_swmr_writer); \
+ ((pb_ptr)->lru_tl_skips)++; \
+}
+
+#define H5PB__UPDATE_TL_SIZE_STATS(pb_ptr) \
+{ \
+ HDassert((pb_ptr)->vfd_swmr_writer); \
+ if ( (pb_ptr)->tl_len > (pb_ptr)->max_tl_len ) \
+ (pb_ptr)->max_tl_len = (pb_ptr)->tl_len; \
+ if ( (pb_ptr)->tl_size > (pb_ptr)->max_tl_size ) \
+ (pb_ptr)->max_tl_size = (pb_ptr)->tl_size; \
+}
+
+#define H5PB__UPDATE_DWL_SIZE_STATS(pb_ptr) \
+{ \
+ HDassert((pb_ptr)->vfd_swmr_writer); \
+ if ( (pb_ptr)->dwl_len > (pb_ptr)->max_dwl_len ) \
+ (pb_ptr)->max_dwl_len = (pb_ptr)->dwl_len; \
+ if ( (pb_ptr)->dwl_size > (pb_ptr)->max_dwl_size ) \
+ (pb_ptr)->max_dwl_size = (pb_ptr)->dwl_size; \
+}
+
+#define H5PB__UPDATE_DWL_DELAYED_WRITES(pb_ptr, insertion_depth, delay) \
+{ \
+ HDassert((pb_ptr)->vfd_swmr_writer); \
+ (pb_ptr)->delayed_writes++; \
+ (pb_ptr)->total_delay += (int64_t)(delay); \
+ (pb_ptr)->total_dwl_ins_depth += (insertion_depth); \
+}
+
+
+#define H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size) \
+{ \
+ int _i; \
+ \
+ HDassert(pb_ptr); \
+ HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \
+ \
+ if ( H5FD_MEM_DRAW == (type) ) { \
+ _i = H5PB__STATS_RD; \
+ } else if ( (size) > (pb_ptr)->page_size ) { \
+ _i = H5PB__STATS_MPMDE; \
+ } else { \
+ _i = H5PB__STATS_MD; \
+ } \
+ ((pb_ptr)->accesses[_i])++; \
+} /* H5PB__UPDATE_STATS_FOR_ACCESS */
+
+
+#define H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size) \
+{ \
+ int ii; \
+ \
+ HDassert(pb_ptr); \
+ HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \
+ \
+ if ( H5FD_MEM_DRAW == (type) ) { \
+ ii = H5PB__STATS_RD; \
+ } else if ( (size) > (pb_ptr)->page_size ) { \
+ ii = H5PB__STATS_MPMDE; \
+ } else { \
+ ii = H5PB__STATS_MD; \
+ } \
+ ((pb_ptr)->bypasses[ii])++; \
+} /* H5PB__UPDATE_STATS_FOR_BYPASS */
+
+
+#define H5PB__UPDATE_STATS_FOR_FLUSH(pb_ptr, entry_ptr) \
+{ \
+ int i; \
+ \
+ HDassert(pb_ptr); \
+ HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \
+ HDassert(entry_ptr); \
+ HDassert((entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC); \
+ \
+ if ( (entry_ptr)->is_metadata ) { \
+ if ( (entry_ptr)->is_mpmde ) { \
+ i = H5PB__STATS_MPMDE; \
+ } else { \
+ i = H5PB__STATS_MD; \
+ } \
+ } else { \
+ i = H5PB__STATS_RD; \
+ } \
+ ((pb_ptr)->flushes[i])++; \
+} /* H5PB__UPDATE_STATS_FOR_FLUSH */
+
+
+#define H5PB__UPDATE_STATS_FOR_EVICTION(pb_ptr, entry_ptr) \
+{ \
+ int i; \
+ \
+ HDassert(pb_ptr); \
+ HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \
+ HDassert(entry_ptr); \
+ HDassert((entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC); \
+ \
+ if ( (entry_ptr)->is_metadata ) { \
+ if ( (entry_ptr)->is_mpmde ) { \
+ i = H5PB__STATS_MPMDE; \
+ } else { \
+ i = H5PB__STATS_MD; \
+ } \
+ } else { \
+ i = H5PB__STATS_RD; \
+ } \
+ ((pb_ptr)->evictions[i])++; \
+} /* H5PB__UPDATE_STATS_FOR_EVICTION */
+
+
+#define H5PB__UPDATE_STATS_FOR_CLEAR(pb_ptr, entry_ptr) \
+{ \
+ int i; \
+ \
+ HDassert(pb_ptr); \
+ HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \
+ HDassert(entry_ptr); \
+ HDassert((entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC); \
+ \
+ if ( (entry_ptr)->is_metadata ) { \
+ if ( (entry_ptr)->is_mpmde ) { \
+ i = H5PB__STATS_MPMDE; \
+ } else { \
+ i = H5PB__STATS_MD; \
+ } \
+ } else { \
+ i = H5PB__STATS_RD; \
+ } \
+ ((pb_ptr)->clears[i])++; \
+} /* H5PB__UPDATE_STATS_FOR_CLEAR */
+
+
+#define H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr) \
+{ \
+ int i; \
+ \
+ HDassert(pb_ptr); \
+ HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \
+ HDassert(entry_ptr); \
+ HDassert((entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC); \
+ \
+ if ( (entry_ptr)->is_metadata ) { \
+ if ( (entry_ptr)->is_mpmde ) { \
+ i = H5PB__STATS_MPMDE; \
+ } else { \
+ i = H5PB__STATS_MD; \
+ } \
+ } else { \
+ i = H5PB__STATS_RD; \
+ } \
+ ((pb_ptr)->insertions[i])++; \
+} /* H5PB__UPDATE_STATS_FOR_INSERTION */
+
+#define H5PB__UPDATE_STATS_FOR_LOAD(pb_ptr, entry_ptr) \
+{ \
+ int i; \
+ \
+ HDassert(pb_ptr); \
+ HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \
+ HDassert(entry_ptr); \
+ HDassert((entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC); \
+ \
+ if ( (entry_ptr)->is_metadata ) { \
+ if ( (entry_ptr)->is_mpmde ) { \
+ i = H5PB__STATS_MPMDE; \
+ } else { \
+ i = H5PB__STATS_MD; \
+ } \
+ } else { \
+ i = H5PB__STATS_RD; \
+ } \
+ ((pb_ptr)->loads[i])++; \
+} /* H5PB__UPDATE_STATS_FOR_LOAD */
+
+#else /* H5PB__COLLECT_PAGE_BUFFER_STATS */
+
+#define H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, hit, is_metadata, is_mpmde)
+#define H5PB__UPDATE_HT_SIZE_STATS(pb_ptr)
+#define H5PB__UPDATE_STATS_FOR_HT_INSERTION(pb_ptr)
+#define H5PB__UPDATE_STATS_FOR_HT_DELETION(pb_ptr)
+#define H5PB__UPDATE_HT_SEARCH_STATS(pb_ptr, success, depth)
+#define H5PB__UPDATE_LRU_SIZE_STATS(pb_ptr)
+#define H5PB__UPDATE_STATS_FOR_LRU_MD_SKIP(pb_ptr)
+#define H5PB__UPDATE_STATS_FOR_LRU_RD_SKIP(pb_ptr)
+#define H5PB__UPDATE_STATS_FOR_LRU_TL_SKIP(pb_ptr)
+#define H5PB__UPDATE_STATS_FOR_LRU_DWL_SKIP(pb_ptr)
+#define H5PB__UPDATE_TL_SIZE_STATS(pb_ptr)
+#define H5PB__UPDATE_DWL_SIZE_STATS(pb_ptr)
+#define H5PB__UPDATE_DWL_DELAYED_WRITES(pb_ptr, insertion_depth, delay)
+#define H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size)
+#define H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size)
+#define H5PB__UPDATE_STATS_FOR_FLUSH(pb_ptr, entry_ptr)
+#define H5PB__UPDATE_STATS_FOR_EVICTION(pb_ptr, entry_ptr)
+#define H5PB__UPDATE_STATS_FOR_CLEAR(pb_ptr, entry_ptr)
+#define H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr)
+#define H5PB__UPDATE_STATS_FOR_LOAD(pb_ptr, entry_ptr)
+
+#endif /* H5PB__COLLECT_PAGE_BUFFER_STATS */
+
+
+/***********************************************************************
+ *
+ * Hash table access and manipulation macros:
+ *
+ * The following macros handle searches, insertions, and deletion in
+ * the hash table.
+ *
+ * Note that the input to the hash function is the page of the page
+ * buffer entry, not it address (recall that page * page_size) == addr).
+ *
+ * JRM -- 10/09/18
+ *
+ * Changes:
+ *
+ * - None
+ *
+ ***********************************************************************/
+
+#define H5PB__HASH_MASK ((uint64_t)(H5PB__HASH_TABLE_LEN - 1))
+
+#define H5PB__HASH_FCN(x) (int)(((uint64_t)(x)) & H5PB__HASH_MASK)
+
+#if H5PB__DO_SANITY_CHECKS
+
+#define H5PB__PRE_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \
+if ( ( (pb_ptr) == NULL ) || \
+ ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+ ( (entry_ptr) == NULL ) || \
+ ( (entry_ptr)->ht_next != NULL ) || \
+ ( (entry_ptr)->ht_prev != NULL ) || \
+ ( (entry_ptr)->size <= 0 ) || \
+ ( H5PB__HASH_FCN((entry_ptr)->page) < 0 ) || \
+ ( H5PB__HASH_FCN((entry_ptr)->page) >= H5PB__HASH_TABLE_LEN ) || \
+ ( (pb_ptr)->index_size != \
+ ((pb_ptr)->clean_index_size + \
+ (pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_len != (pb_ptr)->il_len ) || \
+ ( (pb_ptr)->index_size != (pb_ptr)->il_size ) || \
+ ( (pb_ptr)->curr_pages < 0 ) || \
+ ( (pb_ptr)->curr_rd_pages < 0 ) || \
+ ( (pb_ptr)->curr_md_pages < 0 ) || \
+ ( ((pb_ptr)->curr_pages != \
+ ((pb_ptr)->curr_md_pages + (pb_ptr)->curr_rd_pages)) ) || \
+ ( (pb_ptr)->mpmde_count < 0 ) || \
+ ( (pb_ptr)->index_len != \
+ ((pb_ptr)->curr_pages + (pb_ptr)->mpmde_count) ) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, fail_val, "pre HT insert SC failed") \
+}
+
+#define H5PB__POST_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \
+if ( ( (pb_ptr) == NULL ) || \
+ ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+ ( (pb_ptr)->index_size != \
+ ((pb_ptr)->clean_index_size + \
+ (pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_len != (pb_ptr)->il_len ) || \
+ ( (pb_ptr)->index_len != \
+ ((pb_ptr)->curr_pages + (pb_ptr)->mpmde_count) ) || \
+ ( (pb_ptr)->index_size != (pb_ptr)->il_size) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, fail_val, "post HT insert SC failed") \
+}
+
+#define H5PB__PRE_HT_REMOVE_SC(pb_ptr, entry_ptr) \
+if ( ( (pb_ptr) == NULL ) || \
+ ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+ ( (pb_ptr)->index_len < 1 ) || \
+ ( (entry_ptr) == NULL ) || \
+ ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size) ) || \
+ ( (entry_ptr)->size <= 0 ) || \
+ ( H5PB__HASH_FCN((entry_ptr)->page) < 0 ) || \
+ ( H5PB__HASH_FCN((entry_ptr)->page) >= H5PB__HASH_TABLE_LEN ) || \
+ ( ((pb_ptr)->ht)[(H5PB__HASH_FCN((entry_ptr)->page))] \
+ == NULL ) || \
+ ( ( ((pb_ptr)->ht)[(H5PB__HASH_FCN((entry_ptr)->page))] \
+ != (entry_ptr) ) && \
+ ( (entry_ptr)->ht_prev == NULL ) ) || \
+ ( ( ((pb_ptr)->ht)[(H5PB__HASH_FCN((entry_ptr)->page))] == \
+ (entry_ptr) ) && \
+ ( (entry_ptr)->ht_prev != NULL ) ) || \
+ ( (pb_ptr)->index_size != \
+ ((pb_ptr)->clean_index_size + \
+ (pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_len != (pb_ptr)->il_len ) || \
+ ( (pb_ptr)->index_size != (pb_ptr)->il_size ) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "pre HT remove SC failed") \
+}
+
+#define H5PB__POST_HT_REMOVE_SC(pb_ptr, entry_ptr) \
+if ( ( (pb_ptr) == NULL ) || \
+ ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+ ( (entry_ptr) == NULL ) || \
+ ( (entry_ptr)->size <= 0 ) || \
+ ( (entry_ptr)->ht_prev != NULL ) || \
+ ( (entry_ptr)->ht_prev != NULL ) || \
+ ( (pb_ptr)->index_size != \
+ ((pb_ptr)->clean_index_size + \
+ (pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_len != (pb_ptr)->il_len ) || \
+ ( (pb_ptr)->index_size != (pb_ptr)->il_size ) || \
+ ( (pb_ptr)->curr_pages < 0 ) || \
+ ( (pb_ptr)->curr_rd_pages < 0 ) || \
+ ( (pb_ptr)->curr_md_pages < 0 ) || \
+ ( ((pb_ptr)->curr_pages != \
+ ((pb_ptr)->curr_md_pages + (pb_ptr)->curr_rd_pages)) ) || \
+ ( (pb_ptr)->mpmde_count < 0 ) || \
+ ( (pb_ptr)->index_len != \
+ ((pb_ptr)->curr_pages + (pb_ptr)->mpmde_count) ) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "post HT remove SC failed") \
+}
+
+#define H5PB__PRE_HT_SEARCH_SC(pb_ptr, page, fail_val) \
+if ( ( (pb_ptr) == NULL ) || \
+ ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+ ( (pb_ptr)->index_size != \
+ ((pb_ptr)->clean_index_size + (pb_ptr)->dirty_index_size) ) || \
+ ( H5PB__HASH_FCN(page) < 0 ) || \
+ ( H5PB__HASH_FCN(page) >= H5PB__HASH_TABLE_LEN ) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, fail_val, "pre HT search SC failed") \
+}
+
+#define H5PB__POST_SUC_HT_SEARCH_SC(pb_ptr, entry_ptr, k, fail_val) \
+if ( ( (pb_ptr) == NULL ) || \
+ ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+ ( (pb_ptr)->index_len < 1 ) || \
+ ( (entry_ptr) == NULL ) || \
+ ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size )) || \
+ ( (pb_ptr)->index_size != \
+ ((pb_ptr)->clean_index_size + (pb_ptr)->dirty_index_size) ) || \
+ ( (entry_ptr)->size <= 0 ) || \
+ ( ((pb_ptr)->ht)[k] == NULL ) || \
+ ( ( ((pb_ptr)->ht)[k] != (entry_ptr) ) && \
+ ( (entry_ptr)->ht_prev == NULL ) ) || \
+ ( ( ((pb_ptr)->ht)[k] == (entry_ptr) ) && \
+ ( (entry_ptr)->ht_prev != NULL ) ) || \
+ ( ( (entry_ptr)->ht_prev != NULL ) && \
+ ( (entry_ptr)->ht_prev->ht_next != (entry_ptr) ) ) || \
+ ( ( (entry_ptr)->ht_next != NULL ) && \
+ ( (entry_ptr)->ht_next->ht_prev != (entry_ptr) ) ) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, fail_val, \
+ "post successful HT search SC failed") \
+}
+
+#define H5PB__POST_HT_SHIFT_TO_FRONT_SC(pb_ptr, entry_ptr, k, fail_val) \
+if ( ( (pb_ptr) == NULL ) || \
+ ( ((pb_ptr)->ht)[k] != (entry_ptr) ) || \
+ ( (entry_ptr)->ht_prev != NULL ) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, fail_val, \
+ "post HT shift to front SC failed") \
+}
+
+#define H5PB__PRE_HT_ENTRY_SIZE_CHANGE_SC(pb_ptr, old_size, new_size, \
+ entry_ptr, was_clean) \
+if ( ( (pb_ptr) == NULL ) || \
+ ( (pb_ptr)->index_len <= 0 ) || \
+ ( (pb_ptr)->index_size <= 0 ) || \
+ ( (new_size) <= 0 ) || \
+ ( (old_size) > (pb_ptr)->index_size ) || \
+ ( ( (pb_ptr)->index_len == 1 ) && \
+ ( (pb_ptr)->index_size != (old_size) ) ) || \
+ ( (pb_ptr)->index_size != \
+ ((pb_ptr)->clean_index_size + \
+ (pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) || \
+ ( ( !( was_clean ) || \
+ ( (pb_ptr)->clean_index_size < (old_size) ) ) && \
+ ( ( (was_clean) ) || \
+ ( (pb_ptr)->dirty_index_size < (old_size) ) ) ) || \
+ ( (entry_ptr) == NULL ) || \
+ ( (pb_ptr)->index_len != (pb_ptr)->il_len ) || \
+ ( (pb_ptr)->index_size != (pb_ptr)->il_size ) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "pre HT entry size change SC failed") \
+}
+
+#define H5PB__POST_HT_ENTRY_SIZE_CHANGE_SC(pb_ptr, old_size, new_size, \
+ entry_ptr) \
+if ( ( (pb_ptr) == NULL ) || \
+ ( (pb_ptr)->index_len <= 0 ) || \
+ ( (pb_ptr)->index_size <= 0 ) || \
+ ( (new_size) > (pb_ptr)->index_size ) || \
+ ( (pb_ptr)->index_size != \
+ ((pb_ptr)->clean_index_size + \
+ (pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) || \
+ ( ( !((entry_ptr)->is_dirty ) || \
+ ( (pb_ptr)->dirty_index_size < (new_size) ) ) && \
+ ( ( ((entry_ptr)->is_dirty) ) || \
+ ( (pb_ptr)->clean_index_size < (new_size) ) ) ) || \
+ ( ( (pb_ptr)->index_len == 1 ) && \
+ ( (pb_ptr)->index_size != (new_size) ) ) || \
+ ( (pb_ptr)->index_len != (pb_ptr)->il_len ) || \
+ ( (pb_ptr)->index_size != (pb_ptr)->il_size ) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "post HT entry size change SC failed") \
+}
+
+#define H5PB__PRE_HT_UPDATE_FOR_ENTRY_CLEAN_SC(pb_ptr, entry_ptr) \
+if ( ( (pb_ptr) == NULL ) || \
+ ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+ ( (pb_ptr)->index_len <= 0 ) || \
+ ( (entry_ptr) == NULL ) || \
+ ( (entry_ptr)->is_dirty != FALSE ) || \
+ ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size) ) || \
+ ( (pb_ptr)->dirty_index_size < (int64_t)((entry_ptr)->size) ) || \
+ ( (pb_ptr)->index_size != \
+ ((pb_ptr)->clean_index_size + (pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "pre HT update for entry clean SC failed") \
+}
+
+#define H5PB__PRE_HT_UPDATE_FOR_ENTRY_DIRTY_SC(pb_ptr, entry_ptr) \
+if ( ( (pb_ptr) == NULL ) || \
+ ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \
+ ( (pb_ptr)->index_len <= 0 ) || \
+ ( (entry_ptr) == NULL ) || \
+ ( (entry_ptr)->is_dirty != TRUE ) || \
+ ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size) ) || \
+ ( (pb_ptr)->clean_index_size < (int64_t)((entry_ptr)->size) ) || \
+ ( (pb_ptr)->index_size != \
+ ((pb_ptr)->clean_index_size + (pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "pre HT update for entry dirty SC failed") \
+}
+
+#define H5PB__POST_HT_UPDATE_FOR_ENTRY_CLEAN_SC(pb_ptr, entry_ptr) \
+if ( ( (pb_ptr)->index_size != \
+ ((pb_ptr)->clean_index_size + (pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "post HT update for entry clean SC failed") \
+}
+
+#define H5PB__POST_HT_UPDATE_FOR_ENTRY_DIRTY_SC(pb_ptr, entry_ptr) \
+if ( ( (pb_ptr)->index_size != \
+ ((pb_ptr)->clean_index_size + (pb_ptr)->dirty_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \
+ ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) ) { \
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "post HT update for entry dirty SC failed") \
+}
+
+#else /* H5PB__DO_SANITY_CHECKS */
+
+#define H5PB__PRE_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val)
+#define H5PB__POST_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val)
+#define H5PB__PRE_HT_REMOVE_SC(pb_ptr, entry_ptr)
+#define H5PB__POST_HT_REMOVE_SC(pb_ptr, entry_ptr)
+#define H5PB__PRE_HT_SEARCH_SC(pb_ptr, Addr, fail_val)
+#define H5PB__POST_SUC_HT_SEARCH_SC(pb_ptr, entry_ptr, k, fail_val)
+#define H5PB__POST_HT_SHIFT_TO_FRONT_SC(pb_ptr, entry_ptr, k, fail_val)
+#define H5PB__PRE_HT_UPDATE_FOR_ENTRY_CLEAN_SC(pb_ptr, entry_ptr)
+#define H5PB__PRE_HT_UPDATE_FOR_ENTRY_DIRTY_SC(pb_ptr, entry_ptr)
+#define H5PB__PRE_HT_ENTRY_SIZE_CHANGE_SC(pb_ptr, old_size, new_size, \
+ entry_ptr, was_clean)
+#define H5PB__POST_HT_ENTRY_SIZE_CHANGE_SC(pb_ptr, old_size, new_size, \
+ entry_ptr)
+#define H5PB__POST_HT_UPDATE_FOR_ENTRY_CLEAN_SC(pb_ptr, entry_ptr)
+#define H5PB__POST_HT_UPDATE_FOR_ENTRY_DIRTY_SC(pb_ptr, entry_ptr)
+
+#endif /* H5PB__DO_SANITY_CHECKS */
+
+
+#define H5PB__INSERT_IN_INDEX(pb_ptr, entry_ptr, fail_val) \
+{ \
+ int k; \
+ H5PB__PRE_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \
+ k = H5PB__HASH_FCN((entry_ptr)->page); \
+ if(((pb_ptr)->ht)[k] != NULL) { \
+ (entry_ptr)->ht_next = ((pb_ptr)->ht)[k]; \
+ (entry_ptr)->ht_next->ht_prev = (entry_ptr); \
+ } \
+ ((pb_ptr)->ht)[k] = (entry_ptr); \
+ (pb_ptr)->index_len++; \
+ (pb_ptr)->index_size += (int64_t)((entry_ptr)->size); \
+ if((entry_ptr)->is_dirty) { \
+ (pb_ptr)->dirty_index_size += (int64_t)((entry_ptr)->size); \
+ } else { \
+ (pb_ptr)->clean_index_size += (int64_t)((entry_ptr)->size); \
+ } \
+ if ( (entry_ptr)->is_metadata ) { \
+ if ( (entry_ptr)->is_mpmde ) { \
+ ((pb_ptr)->mpmde_count)++; \
+ } else { \
+ ((pb_ptr)->curr_md_pages)++; \
+ (pb_ptr)->curr_pages++; \
+ } \
+ } else { \
+ ((pb_ptr)->curr_rd_pages)++; \
+ (pb_ptr)->curr_pages++; \
+ } \
+ H5PB__IL_DLL_APPEND((entry_ptr), (pb_ptr)->il_head, \
+ (pb_ptr)->il_tail, (pb_ptr)->il_len, \
+ (pb_ptr)->il_size, fail_val) \
+ H5PB__UPDATE_STATS_FOR_HT_INSERTION(pb_ptr) \
+ H5PB__UPDATE_HT_SIZE_STATS(pb_ptr) \
+ H5PB__POST_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \
+}
+
+#define H5PB__DELETE_FROM_INDEX(pb_ptr, entry_ptr, fail_val) \
+{ \
+ int k; \
+ H5PB__PRE_HT_REMOVE_SC(pb_ptr, entry_ptr) \
+ k = H5PB__HASH_FCN((entry_ptr)->page); \
+ if((entry_ptr)->ht_next) \
+ (entry_ptr)->ht_next->ht_prev = (entry_ptr)->ht_prev; \
+ if((entry_ptr)->ht_prev) \
+ (entry_ptr)->ht_prev->ht_next = (entry_ptr)->ht_next; \
+ if(((pb_ptr)->ht)[k] == (entry_ptr)) \
+ ((pb_ptr)->ht)[k] = (entry_ptr)->ht_next; \
+ (entry_ptr)->ht_next = NULL; \
+ (entry_ptr)->ht_prev = NULL; \
+ (pb_ptr)->index_len--; \
+ (pb_ptr)->index_size -= (int64_t)((entry_ptr)->size); \
+ if((entry_ptr)->is_dirty) { \
+ (pb_ptr)->dirty_index_size -= (int64_t)((entry_ptr)->size); \
+ } else { \
+ (pb_ptr)->clean_index_size -= (int64_t)((entry_ptr)->size); \
+ } \
+ if ( (entry_ptr)->is_metadata ) { \
+ if ( (entry_ptr)->is_mpmde ) { \
+ ((pb_ptr)->mpmde_count)--; \
+ } else { \
+ ((pb_ptr)->curr_md_pages)--; \
+ (pb_ptr)->curr_pages--; \
+ } \
+ } else { \
+ ((pb_ptr)->curr_rd_pages)--; \
+ (pb_ptr)->curr_pages--; \
+ } \
+ H5PB__IL_DLL_REMOVE((entry_ptr), (pb_ptr)->il_head, \
+ (pb_ptr)->il_tail, (pb_ptr)->il_len, \
+ (pb_ptr)->il_size, fail_val) \
+ H5PB__UPDATE_STATS_FOR_HT_DELETION(pb_ptr) \
+ H5PB__POST_HT_REMOVE_SC(pb_ptr, entry_ptr) \
+}
+
+#define H5PB__SEARCH_INDEX(pb_ptr, Page, entry_ptr, fail_val) \
+{ \
+ int k; \
+ int depth = 0; \
+ H5PB__PRE_HT_SEARCH_SC(pb_ptr, Page, fail_val) \
+ k = H5PB__HASH_FCN(Page); \
+ entry_ptr = ((pb_ptr)->ht)[k]; \
+ while(entry_ptr) { \
+ if ( (Page) == (entry_ptr)->page ) { \
+ H5PB__POST_SUC_HT_SEARCH_SC(pb_ptr, entry_ptr, k, fail_val) \
+ if ( (entry_ptr) != ((pb_ptr)->ht)[k] ) { \
+ if ( (entry_ptr)->ht_next ) \
+ (entry_ptr)->ht_next->ht_prev = (entry_ptr)->ht_prev; \
+ HDassert((entry_ptr)->ht_prev != NULL); \
+ (entry_ptr)->ht_prev->ht_next = (entry_ptr)->ht_next; \
+ ((pb_ptr)->ht)[k]->ht_prev = (entry_ptr); \
+ (entry_ptr)->ht_next = ((pb_ptr)->ht)[k]; \
+ (entry_ptr)->ht_prev = NULL; \
+ ((pb_ptr)->ht)[k] = (entry_ptr); \
+ H5PB__POST_HT_SHIFT_TO_FRONT_SC(pb_ptr, entry_ptr, k, fail_val)\
+ } \
+ break; \
+ } \
+ (entry_ptr) = (entry_ptr)->ht_next; \
+ (depth)++; \
+ } \
+ H5PB__UPDATE_STATS_FOR_HT_SEARCH(pb_ptr, (entry_ptr != NULL), depth) \
+}
+
+#define H5PB__UPDATE_INDEX_FOR_ENTRY_CLEAN(pb_ptr, entry_ptr) \
+{ \
+ H5PB__PRE_HT_UPDATE_FOR_ENTRY_CLEAN_SC(pb_ptr, entry_ptr); \
+ (pb_ptr)->dirty_index_size -= (int64_t)((entry_ptr)->size); \
+ (pb_ptr)->clean_index_size += (int64_t)((entry_ptr)->size); \
+ H5PB__POST_HT_UPDATE_FOR_ENTRY_CLEAN_SC(pb_ptr, entry_ptr); \
+}
+
+#define H5PB__UPDATE_INDEX_FOR_ENTRY_DIRTY(pb_ptr, entry_ptr) \
+{ \
+ H5PB__PRE_HT_UPDATE_FOR_ENTRY_DIRTY_SC(pb_ptr, entry_ptr); \
+ (pb_ptr)->clean_index_size -= (int64_t)((entry_ptr)->size); \
+ (pb_ptr)->dirty_index_size += (int64_t)((entry_ptr)->size); \
+ H5PB__POST_HT_UPDATE_FOR_ENTRY_DIRTY_SC(pb_ptr, entry_ptr); \
+}
+
+#define H5PB__UPDATE_INDEX_FOR_SIZE_CHANGE(pb_ptr, old_size, new_size, \
+ entry_ptr, was_clean) \
+{ \
+ H5PB__PRE_HT_ENTRY_SIZE_CHANGE_SC(pb_ptr, old_size, new_size, \
+ entry_ptr, was_clean) \
+ (pb_ptr)->index_size -= (old_size); \
+ (pb_ptr)->index_size += (new_size); \
+ if(was_clean) { \
+ (pb_ptr)->clean_index_size -= (old_size); \
+ } else { \
+ (pb_ptr)->dirty_index_size -= (old_size); \
+ } \
+ if((entry_ptr)->is_dirty) { \
+ (pb_ptr)->dirty_index_size += (new_size); \
+ } else { \
+ (pb_ptr)->clean_index_size += (new_size); \
+ } \
+ H5PB__DLL_UPDATE_FOR_SIZE_CHANGE((pb_ptr)->il_len, \
+ (pb_ptr)->il_size, \
+ (old_size), (new_size)) \
+ H5PB__POST_HT_ENTRY_SIZE_CHANGE_SC(pb_ptr, old_size, new_size, \
+ entry_ptr) \
+}
+
+
+/***********************************************************************
+ *
+ * Replacement policy update macros
+ *
+ * The following macros handle updates to the replacement policy for
+ * insertions, flushes, and evictions.
+ *
+ * At present, the only replacement policy is a modified LRU policy.
+ *
+ * JRM -- 10/09/18
+ *
+ ***********************************************************************/
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro: H5PB__UPDATE_RP_FOR_EVICTION
+ *
+ * Purpose: Update the replacement policy data structures for an
+ * eviction of the specified page buffer entry.
+ *
+ * At present, we only support the modified LRU policy, so
+ * this function deals with that case unconditionally. If
+ * we ever support other replacement policies, the function
+ * should switch on the current policy and act accordingly.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: John Mainzer, 10/09/18
+ *
+ * Modifications:
+ *
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__UPDATE_RP_FOR_EVICTION(pb_ptr, entry_ptr, fail_val) \
+{ \
+ HDassert( (pb_ptr) ); \
+ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+ HDassert( (entry_ptr) ); \
+ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+ HDassert( !((entry_ptr)->is_dirty) ); \
+ HDassert( (entry_ptr)->size >= pb_ptr->page_size ); \
+ \
+ /* modified LRU specific code */ \
+ \
+ /* remove the entry from the LRU list. */ \
+ \
+ H5PB__DLL_REMOVE((entry_ptr), (pb_ptr)->LRU_head_ptr, \
+ (pb_ptr)->LRU_tail_ptr, (pb_ptr)->LRU_len, \
+ (pb_ptr)->LRU_size, (fail_val)) \
+ \
+ /* End modified LRU specific code. */ \
+ \
+} /* H5PB__UPDATE_RP_FOR_EVICTION */
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro: H5PB__UPDATE_RP_FOR_REMOVE
+ *
+ * Purpose: Update the replacement policy data structures for the
+ * removal of the specified page buffer entry from the
+ * replacement policy, but not from the page buffer.
+ *
+ * At present, this only happens when an entry is
+ * dirtied, and subject to a delayed write.
+ *
+ * At present, we only support the modified LRU policy, so
+ * this function deals with that case unconditionally. If
+ * we ever support other replacement policies, the function
+ * should switch on the current policy and act accordingly.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: John Mainzer, 10/09/18
+ *
+ * Modifications:
+ *
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__UPDATE_RP_FOR_REMOVE(pb_ptr, entry_ptr, fail_val) \
+{ \
+ HDassert( (pb_ptr) ); \
+ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+ HDassert( (entry_ptr) ); \
+ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+ HDassert( ! ((entry_ptr)->is_mpmde) ); \
+ HDassert( (entry_ptr)->size == pb_ptr->page_size ); \
+ \
+ /* modified LRU specific code */ \
+ \
+ /* remove the entry from the LRU list. */ \
+ \
+ H5PB__DLL_REMOVE((entry_ptr), (pb_ptr)->LRU_head_ptr, \
+ (pb_ptr)->LRU_tail_ptr, (pb_ptr)->LRU_len, \
+ (pb_ptr)->LRU_size, (fail_val)) \
+ \
+ /* End modified LRU specific code. */ \
+ \
+} /* H5PB__UPDATE_RP_FOR_REMOVE */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro: H5PB__UPDATE_RP_FOR_ACCESS
+ *
+ * Purpose: Update the replacement policy data structures for an
+ * access of the specified page buffer entry.
+ *
+ * At present, we only support the modified LRU policy, so
+ * this function deals with that case unconditionally. If
+ * we ever support other replacement policies, the function
+ * should switch on the current policy and act accordingly.
+ *
+ * Return: N/A
+ *
+ * Programmer: John Mainzer, 10/09/18
+ *
+ * Modifications:
+ *
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, fail_val) \
+{ \
+ HDassert( (pb_ptr) ); \
+ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+ HDassert( (entry_ptr) ); \
+ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+ HDassert( (entry_ptr)->size >= pb_ptr->page_size ); \
+ \
+ /* modified LRU specific code */ \
+ \
+ /* Move entry to the head of the LRU */ \
+ \
+ H5PB__DLL_REMOVE((entry_ptr), (pb_ptr)->LRU_head_ptr, \
+ (pb_ptr)->LRU_tail_ptr, (pb_ptr)->LRU_len, \
+ (pb_ptr)->LRU_size, (fail_val)) \
+ \
+ H5PB__DLL_PREPEND((entry_ptr), (pb_ptr)->LRU_head_ptr, \
+ (pb_ptr)->LRU_tail_ptr, (pb_ptr)->LRU_len, \
+ (pb_ptr)->LRU_size, (fail_val)) \
+ \
+ /* End modified LRU specific code. */ \
+ \
+} /* H5PB__UPDATE_RP_FOR_ACCESS */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro: H5PB__UPDATE_RP_FOR_FLUSH
+ *
+ * Purpose: Update the replacement policy data structures for a flush
+ * of the specified page buffer entry.
+ *
+ * At present, we only support the modified LRU policy, so
+ * this function deals with that case unconditionally. If
+ * we ever support other replacement policies, the function
+ * should switch on the current policy and act accordingly.
+ *
+ * Return: N/A
+ *
+ * Programmer: John Mainzer, 10/09/18
+ *
+ * Modifications:
+ *
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__UPDATE_RP_FOR_FLUSH(pb_ptr, entry_ptr, fail_val) \
+{ \
+ H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, fail_val) \
+ \
+} /* H5PB__UPDATE_RP_FOR_FLUSH */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro: H5PB__UPDATE_RP_FOR_INSERT_APPEND
+ *
+ * Purpose: Update the replacement policy data structures for an
+ * insertion of the specified cache entry.
+ *
+ * Unlike H5PB__UPDATE_RP_FOR_INSERTION below, mark the
+ * new entry as the LEAST recently used entry, not the
+ * most recently used.
+ *
+ * At present, we only support the modified LRU policy, so
+ * this function deals with that case unconditionally. If
+ * we ever support other replacement policies, the function
+ * should switch on the current policy and act accordingly.
+ *
+ * Return: N/A
+ *
+ * Programmer: John Mainzer, 10/10/18
+ *
+ * Modifications:
+ *
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__UPDATE_RP_FOR_INSERT_APPEND(pb_ptr, entry_ptr, fail_val) \
+{ \
+ HDassert( (pb_ptr) ); \
+ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+ HDassert( (entry_ptr) ); \
+ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+ HDassert( (entry_ptr)->size == pb_ptr->page_size ); \
+ \
+ /* modified LRU specific code */ \
+ \
+ /* insert the entry at the tail of the LRU list. */ \
+ \
+ H5PB__DLL_APPEND((entry_ptr), (pb_ptr)->LRU_head_ptr, \
+ (pb_ptr)->LRU_tail_ptr, (pb_ptr)->LRU_len, \
+ (pb_ptr)->LRU_size, (fail_val)) \
+ \
+ H5PB__UPDATE_LRU_SIZE_STATS(pb_ptr) \
+ \
+ /* End modified LRU specific code. */ \
+}
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro: H5PB__UPDATE_RP_FOR_INSERTION
+ *
+ * Purpose: Update the replacement policy data structures for an
+ * insertion of the specified cache entry.
+ *
+ * At present, we only support the modified LRU policy, so
+ * this function deals with that case unconditionally. If
+ * we ever support other replacement policies, the function
+ * should switch on the current policy and act accordingly.
+ *
+ * Return: N/A
+ *
+ * Programmer: John Mainzer, 10/10/18
+ *
+ * Modifications:
+ *
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__UPDATE_RP_FOR_INSERTION(pb_ptr, entry_ptr, fail_val) \
+{ \
+ HDassert( (pb_ptr) ); \
+ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+ HDassert( (entry_ptr) ); \
+ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+ HDassert( (entry_ptr)->size >= pb_ptr->page_size ); \
+ \
+ /* modified LRU specific code */ \
+ \
+ /* insert the entry at the head of the LRU list. */ \
+ \
+ H5PB__DLL_PREPEND((entry_ptr), (pb_ptr)->LRU_head_ptr, \
+ (pb_ptr)->LRU_tail_ptr, (pb_ptr)->LRU_len, \
+ (pb_ptr)->LRU_size, (fail_val)) \
+ \
+ H5PB__UPDATE_LRU_SIZE_STATS(pb_ptr) \
+ \
+ /* End modified LRU specific code. */ \
+}
+
+
+/***********************************************************************
+ *
+ * Tick list management macros
+ *
+ * When the target file is opened in VFD SWMR writer mode, the page
+ * buffer must retain copies of all metadata writes during each tick so
+ * that the metadata file can be updated correctly in end of tick
+ * processing.
+ *
+ * Once tick processing is complete, all entries are removed from the
+ * tick list, to leave it empty for the next tick. Metadata pages from
+ * the tick list are already in the replacement policy, and thus require
+ * no further action.
+ *
+ * Multi-page metadata entries are evicted from the page buffer if they
+ * are not subject to delayed write, or left in the delayed write list
+ * for later flush and eviction if they are.
+ *
+ * The macros required to support this are defined below.
+ *
+ * JRM -- 10/09/18
+ *
+ ***********************************************************************/
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro: H5PB__INSERT_IN_TL
+ *
+ * Purpose: Insert the specified page buffer entry at the head of the
+ * tick list.
+ *
+ * Return: N/A
+ *
+ * Programmer: John Mainzer, 10/10/18
+ *
+ * Modifications:
+ *
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__INSERT_IN_TL(pb_ptr, entry_ptr, fail_val) \
+{ \
+ HDassert( (pb_ptr) ); \
+ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+ HDassert( (pb_ptr)->vfd_swmr_writer ); \
+ HDassert( (entry_ptr) ); \
+ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+ HDassert( (entry_ptr)->modified_this_tick ); \
+ HDassert( (entry_ptr)->size >= pb_ptr->page_size ); \
+ \
+ /* insert the entry at the head of the tick list. */ \
+ \
+ H5PB__TL_DLL_PREPEND((entry_ptr), (pb_ptr)->tl_head_ptr, \
+ (pb_ptr)->tl_tail_ptr, (pb_ptr)->tl_len, \
+ (pb_ptr)->tl_size, (fail_val)) \
+ \
+ H5PB__UPDATE_TL_SIZE_STATS(pb_ptr) \
+ \
+} /* H5PB__INSERT_IN_TL */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro: H5PB__REMOVE_FROM_TL
+ *
+ * Purpose: Remove the specified page buffer entry from the tick list.
+ *
+ * Return: N/A
+ *
+ * Programmer: John Mainzer, 10/10/18
+ *
+ * Modifications:
+ *
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__REMOVE_FROM_TL(pb_ptr, entry_ptr, fail_val) \
+{ \
+ HDassert( (pb_ptr) ); \
+ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+ HDassert( (pb_ptr)->vfd_swmr_writer ); \
+ HDassert( (entry_ptr) ); \
+ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+ HDassert( (entry_ptr)->modified_this_tick ); \
+ HDassert( (entry_ptr)->size >= pb_ptr->page_size ); \
+ \
+ /* remove the entry from the tick list. */ \
+ \
+ H5PB__TL_DLL_REMOVE((entry_ptr), (pb_ptr)->tl_head_ptr, \
+ (pb_ptr)->tl_tail_ptr, (pb_ptr)->tl_len, \
+ (pb_ptr)->tl_size, (fail_val)) \
+ \
+ \
+} /* H5PB__REMOVE_FROM_TL */
+
+
+/***********************************************************************
+ *
+ * Delayed write list management macros
+ *
+ * When the target file is opened in VFD SWMR writer mode, the page
+ * buffer must delay flush of all metadata pages and multi-page metadata
+ * entries that:
+ *
+ * 1) have not appeared in the metadata file index for at least max_lag
+ * ticks, and
+ *
+ * 2) a previous version of the metadata page or multi-page metadata
+ * cache entry exists in the file.
+ *
+ * Failure to do so can result in VFD SWMR readers receiving messages
+ * from the future.
+ *
+ * To minimize overhead, the delayed write list is sorted in decreasing
+ * values of the constituent delay_write_until fields.
+ *
+ * Entries are removed from the delayed write list when their
+ * delay_write_until fields are satisfied. Metadata pages are inserted
+ * at the bottom of the replacement policy, and multi-page metadata
+ * entries are immediately flushed and evicted.
+ *
+ * The macros required to support this are defined below.
+ *
+ * JRM -- 10/09/18
+ *
+ ***********************************************************************/
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro: H5PB__INSERT_IN_DWL
+ *
+ * Insert the supplied page buffer entry in the delayed write list
+ * maintaining the invariant:
+ *
+ * entry_ptr->next == NULL ||
+ * entry_ptr->delay_write_until >= entry_ptr->next->delay_write_until
+ *
+ * In passing update pb_ptr->max_delay if appropriate.
+ *
+ * Return: N/A
+ *
+ * Programmer: John Mainzer, 10/10/18
+ *
+ * Modifications:
+ *
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__INSERT_IN_DWL(pb_ptr, entry_ptr, fail_val) \
+{ \
+ int insertion_depth = 0; \
+ uint64_t delay; \
+ H5PB_entry_t * suc_ptr; \
+ \
+ HDassert( (pb_ptr) ); \
+ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+ HDassert( (pb_ptr)->vfd_swmr_writer ); \
+ HDassert( (entry_ptr) ); \
+ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+ HDassert( (entry_ptr)->size >= pb_ptr->page_size ); \
+ HDassert( (entry_ptr)->delay_write_until > (pb_ptr)->cur_tick ); \
+ \
+ delay = (entry_ptr)->delay_write_until - (pb_ptr)->cur_tick; \
+ suc_ptr = pb_ptr->dwl_head_ptr; \
+ \
+ while ( (suc_ptr) && \
+ ((suc_ptr)->delay_write_until > (entry_ptr)->delay_write_until) ) \
+ { \
+ insertion_depth++; \
+ suc_ptr = suc_ptr->next; \
+ } \
+ \
+ H5PB__DLL_INSERT_BEFORE((entry_ptr), (suc_ptr), \
+ (pb_ptr)->dwl_head_ptr, \
+ (pb_ptr)->dwl_tail_ptr, (pb_ptr)->dwl_len, \
+ (pb_ptr)->dwl_size, (fail_val)) \
+ \
+ if ( entry_ptr->delay_write_until > pb_ptr->max_delay ) \
+ pb_ptr->max_delay = entry_ptr->delay_write_until; \
+ \
+ H5PB__UPDATE_DWL_SIZE_STATS(pb_ptr) \
+ H5PB__UPDATE_DWL_DELAYED_WRITES(pb_ptr, insertion_depth, delay) \
+ \
+} /* H5PB__INSERT_IN_DWL */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro: H5PB__REMOVE_FROM_DWL
+ *
+ * Purpose: Remove the specified page buffer entry from the delayed
+ * write list.
+ *
+ * Return: N/A
+ *
+ * Programmer: John Mainzer, 10/10/18
+ *
+ * Modifications:
+ *
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5PB__REMOVE_FROM_DWL(pb_ptr, entry_ptr, fail_val) \
+{ \
+ HDassert( (pb_ptr) ); \
+ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \
+ HDassert( (pb_ptr)->vfd_swmr_writer ); \
+ HDassert( (entry_ptr) ); \
+ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \
+ HDassert( (entry_ptr)->size >= pb_ptr->page_size ); \
+ HDassert( (entry_ptr)->delay_write_until == 0 ); \
+ \
+ /* remove the entry from the delayed write list. */ \
+ \
+ H5PB__DLL_REMOVE((entry_ptr), (pb_ptr)->dwl_head_ptr, \
+ (pb_ptr)->dwl_tail_ptr, (pb_ptr)->dwl_len, \
+ (pb_ptr)->dwl_size, (fail_val)) \
+ \
+ \
+} /* H5PB__REMOVE_FROM_DWL */
+
+
/****************************/
/* Package Private Typedefs */
/****************************/
-typedef struct H5PB_entry_t {
- void *page_buf_ptr; /* Pointer to the buffer containing the data */
- haddr_t addr; /* Address of the page in the file */
- H5F_mem_page_t type; /* Type of the page entry (H5F_MEM_PAGE_RAW/META) */
- hbool_t is_dirty; /* Flag indicating whether the page has dirty data or not */
+/****************************************************************************
+ *
+ * structure H5PB_entry_t
+ *
+ * Individual instances of the H5PB_entry_t structure are used to manage
+ * individual pages in the page buffer. In the case of a VFD SWMR writer,
+ * they are also used to manage multi-page metadata entries.
+ *
+ * The fields of this structure are discussed below:
+ *
+ * JRM - 9/27/18
+ *
+ * magic: Unsigned 32 bit integer that must always be set to
+ * H5PB__H5PB_ENTRY_T_MAGIC when the entry is valid.
+ *
+ * pb_ptr: Pointer to the page buffer that contains this entry.
+ *
+ * addr: Base address of the page in the file.
+ *
+ * page: Page offset of the page -- i.e. addr / pb_ptr->page_size.
+ * Note that addr must always equal page * pb_ptr->page_size.
+ *
+ * size: Size of the page buffer entry in bytes. Under normal
+ * circumstance, this will always be equal to pb_ptr->page_size.
+ * However, in the context of a VFD SWMR writer, the page
+ * buffer may be used to store multi-page metadata entries
+ * until the end of tick, or to delay writes of such entries
+ * for up to max_lag ticks.
+ *
+ * In such cases, size must be greater than pb_ptr->page_size.
+ *
+ * image_ptr: Pointer to void. When not NULL, this field points to a
+ * dynamically allocated block of size bytes in which the
+ * on disk image of the page is stored. In the context of VFD SWMR,
+ * it points to the image of the multi-page metadata entry.
+ *
+ * mem_type: Type (H5F_mem_t) of the page buffer entry. This value
+ * is needed when reading or writing the entry from/to file.
+ *
+ * is_metadata: Boolean flag that is set to TRUE iff the associated
+ * entry is a page of metadata (or, in the context of VFD
+ * SWMR, a multi-page metadata entry).
+ *
+ * is_dirty: Boolean flag indicating whether the contents of the page
+ * buffer entry has been modified since the last time it
+ * was written to disk.
+ *
+ *
+ * Fields supporting the hash table:
+ *
+ * Entries in the page buffer are indexed by a more or less conventional
+ * hash table with chaining (see header comment on H5PB_t for further details).
+ * If there are multiple entries in any hash bin, they are stored in a doubly
+ * linked list.
+ *
+ * To facilitate flushing the page buffer, we also maintain a doubly linked
+ * list of all entries in the page buffer.
+ *
+ * ht_next: Next pointer used by the hash table to store multiple
+ * entries in a single hash bin. This field points to the
+ * next entry in the doubly linked list of entries in the
+ * hash bin, or NULL if there is no next entry.
+ *
+ * ht_prev: Prev pointer used by the hash table to store multiple
+ * entries in a single hash bin. This field points to the
+ * previous entry in the doubly linked list of entries in
+ * the hash bin, or NULL if there is no previous entry.
+ *
+ * il_next: Next pointer used by the index to maintain a doubly linked
+ * list of all entries in the index (and thus in the page buffer).
+ * This field contains a pointer to the next entry in the
+ * index list, or NULL if there is no next entry.
+ *
+ * il_prev: Prev pointer used by the index to maintain a doubly linked
+ * list of all entries in the index (and thus in the page buffer).
+ * This field contains a pointer to the previous entry in the
+ * index list, or NULL if there is no previous entry.
+ *
+ *
+ * Fields supporting replacement policies:
+ *
+ * The page buffer must have a replacement policy, and it will usually be
+ * necessary for this structure to contain fields supporting that policy.
+ *
+ * At present, only a modified LRU replacement policy is contemplated,
+ * (see header comment for H5PB_t for details), for which the following
+ * fields are adequate.
+ *
+ * next: Next pointer in either the LRU, or (in the context of
+ * VFD SWMR) the delayed write list. If there is no next entry
+ * on the list, this field should be set to NULL.
+ *
+ * prev: Prev pointer in either the LRU, or (in the context of
+ * VFD SWMR) the delayed write list. If there is no previous
+ * entry on the list, this field should be set to NULL.
+ *
+ * Fields supporting VFD SWMR:
+ *
+ * is_mpmde: Boolean flag that is set to TRUE iff the entry
+ * is a multi-page metadata entry. In the absence of VFD
+ * SWMR, the field should always be set to FALSE.
+ *
+ * Observe that:
+ *
+ * is_mpmde <==> is_metadata && size > pb_ptr->page_size
+ *
+ * loaded: Boolean flag that is set to TRUE iff the entry was loaded
+ * from file. This is a necessary input in determining
+ * whether the write of the entry must be delayed.
+ *
+ * This field is only maintained in the VFD SWMR case
+ * and should be false otherwise.
+ *
+ * modified_this_tick: This field is set to TRUE iff pb_ptr->vfd_swmr_writer
+ * and the entry has been modified in the current tick. If
+ * modified_this_tick is TRUE, the entry must also be in the
+ * tick list.
+ *
+ * delay_write_until: Unsigned 64 bit integer containing the first tick
+ * in which the entry may be written to file, or 0 if there
+ * is no such constraint. It should be set to 0 when VFD SWMR
+ * is not enabled.
+ *
+ * tl_next: Next pointer on the list of entries modified in the current
+ * tick, If the enty is not on the tick list, or if there is
+ * no next entry on the list, this field should be set to NULL.
+ *
+ * tl_prev: Prev pointer on the list of entries modified in the current
+ * tick. If the entry is not on the tick list, or if there is
+ * no previous entry on the list, this field should be set to
+ * NULL.
+ *
+ ****************************************************************************/
- /* Fields supporting replacement policies */
- struct H5PB_entry_t *next; /* next pointer in the LRU list */
- struct H5PB_entry_t *prev; /* previous pointer in the LRU list */
-} H5PB_entry_t;
+#define H5PB__H5PB_ENTRY_T_MAGIC 0x02030405
+struct H5PB_entry_t {
-/*****************************/
-/* Package Private Variables */
-/*****************************/
+ uint32_t magic;
+ H5PB_t *pb_ptr;
+ haddr_t addr;
+ uint64_t page;
+ size_t size;
+ void *image_ptr;
+ H5FD_mem_t mem_type;
+ hbool_t is_metadata;
+ hbool_t is_dirty;
+ /* fields supporting the hash table: */
+ struct H5PB_entry_t *ht_next;
+ struct H5PB_entry_t *ht_prev;
+ struct H5PB_entry_t *il_next;
+ struct H5PB_entry_t *il_prev;
-/******************************/
-/* Package Private Prototypes */
-/******************************/
+ /* fields supporting replacement policies: */
+ struct H5PB_entry_t *next;
+ struct H5PB_entry_t *prev;
+ /* fields supporting VFD SWMR */
+ hbool_t is_mpmde;
+ hbool_t loaded;
+ hbool_t modified_this_tick;
+ uint64_t delay_write_until;
+ struct H5PB_entry_t *tl_next;
+ struct H5PB_entry_t *tl_prev;
+
+}; /* H5PB_entry_t */
#endif /* _H5PBpkg_H */
diff --git a/src/H5PBprivate.h b/src/H5PBprivate.h
index 82d010d..983d183 100644
--- a/src/H5PBprivate.h
+++ b/src/H5PBprivate.h
@@ -11,68 +11,642 @@
* help@hdfgroup.org. *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-/*-------------------------------------------------------------------------
+/*
+ * File: H5PBprivate.h
*
- * Created: H5PBprivate.h
- * June 2014
- * Mohamad Chaarawi
+ * Purpose: This file contains declarations which are normally visible
+ * within the HDF5 library, but are not visible at the user
+ * level
*
- *-------------------------------------------------------------------------
+ * Programmer: John Mainzer -- 10/07/18
*/
#ifndef _H5PBprivate_H
#define _H5PBprivate_H
/* Include package's public header */
-#ifdef NOT_YET
-#include "H5PBpublic.h"
-#endif /* NOT_YET */
+
+/* no H5PBpublic.h at present */
+
/* Private headers needed by this header */
#include "H5private.h" /* Generic Functions */
-#include "H5Fprivate.h" /* File access */
-#include "H5FLprivate.h" /* Free Lists */
-#include "H5SLprivate.h" /* Skip List */
/**************************/
/* Library Private Macros */
/**************************/
+#define H5PB__HASH_TABLE_LEN 4096 /* must be a power of 2 */
+
/****************************/
/* Library Private Typedefs */
/****************************/
-/* Forward declaration for a page buffer entry */
-struct H5PB_entry_t;
+/* Typedef for the page buffer entry structure (defined in H5PBpkg.h) */
+typedef struct H5PB_entry_t H5PB_entry_t;
+
+
+
+/******************************************************************************
+ *
+ * structure H5PB_t
+ *
+ * Catchall structure for all variables specific to an instance of the page
+ * buffer.
+ *
+ * At present, the page buffer serves two purposes in the HDF5 library.
+ *
+ * Under normal operating conditions, it serves as a normal page buffer whose
+ * purpose is to minimize and optimize file I/O by aggregating small metadata
+ * and raw data writes into pages, and by caching frequently used pages.
+ *
+ * In addition, when a file is opened for VFD SWMR writing, the page buffer is
+ * used to retain copies of all metadata pages and multi-page metadata entries
+ * that are written in a given tick, and under certain cases, to delay metadata
+ * page and/or multi-page metadata entry writes for some number of ticks.
+ * If the entry has not appeared in the VFD SWMR index for at least max_lag
+ * ticks, this is necessary to avoid message from the future bugs. See the
+ * VFD SWMR RFC for further details.
+ *
+ * To reflect this, the fields of this structure are divided into three
+ * sections. Specifically fields needed for general operations, fields needed
+ * for VFD SWMR, and statistics.
+ *
+ * FIELDS FOR GENERAL OPERATIONS:
+ *
+ * magic: Unsigned 32 bit integer that must always be set to
+ * H5PB__H5PB_T_MAGIC. This field is used to validate pointers to
+ * instances of H5PB_t.
+ *
+ * page_size: size_t containing the page buffer page size in bytes.
+ *
+ * max_pages: 64 bit integer containing the nominal maximum number
+ * of pages in the page buffer. Note that on creation, the page
+ * buffer is empty, and that under certain circumstances (mostly
+ * related to VFD SWMR) this limit can be exceeded by large
+ * amounts.
+ *
+ * curr_pages: 64 bit integer containing the current number of pages
+ * in the page buffer. curr_pages must always equal the sum of
+ * curr_md_pages + curr_rd_pages.
+ *
+ * Note that in the context of VFD SWMR, this count does NOT
+ * include multi-page metadata entries.
+ *
+ * curr_md_pages: 64 bit integer containing the current number of
+ * metadata pages in the page buffer.
+ *
+ * Note that in the context of VFD SWMR, this count does NOT
+ * include multi-page metadata entries.
+ *
+ * curr_rd_pages: 64 bit integer containing the current number of
+ * raw data pages in the page buffer.
+ *
+ * min_md_pages: 64 bit integer containing the number of pages in the
+ * page buffer reserved for metadata. No metadata page may be
+ * evicted from the page buffer if curr_md_pages is less than or
+ * equal to this value.
+ *
+ * min_rd_pages: 64 bit integer containing the number of pages in the
+ * page buffer reserved for raw data. No page of raw data may be
+ * evicted from the page buffer if curr_rd_pages is less than or
+ * equal to this value.
+ *
+ * The FAPL fields are used to store the page buffer configuration data
+ * provided to the page buffer in the H5PB_create() call.
+ *
+ * max_size: Maximum page buffer size supplied by the FAPL.
+ *
+ * min_meta_perc: Percent of the page buffer reserved for metadata as
+ * supplied in the FAPL.
+ *
+ * min_raw_perc: Percent of the page buffer reserved for raw data as
+ * supplied in the FAPL.
+ *
+ * The purpose of the index is to allow us to efficiently look up all pages
+ * (and multi-page metadata entries in the context of VFD SWMR) in the
+ * page buffer.
+ *
+ * This functionality is provided by a hash table with chaining, albeit with
+ * one unusual feature.
+ *
+ * Specifically hash table size must be a power of two, and the hash function
+ * simply clips the high order bits off the page offset of the entry.
+ *
+ * This should work, as space is typically allocated sequentially, and thus
+ * via a reverse principle of locality argument, hot pages are unlikely to
+ * hash to the same bucket. That said, we must collect statistics to alert
+ * us should this not be the case.
+ *
+ * We also maintain a linked list of all entries in the index to facilitate
+ * flush operations.
+ *
+ * index Array of pointer to H5PB_entry_t of size
+ * H5PB__HASH_TABLE_LEN. This size must be a power of 2,
+ * not the usual prime number.
+ *
+ * index_len: Number of entries currently in the hash table used to index
+ * the page buffer. index_len should always equal
+ * clean_index_len + dirty_index_len.
+ *
+ * clean_index_len: Number of clean entries currently in the hash table
+ * used to index the page buffer.
+ *
+ * dirty_index_len: Number of dirty entries currently in the hash table
+ * used to index the page buffer.
+ *
+ * index_size: Number of bytes currently stored in the hash table used to
+ * index the page buffer. Under normal circumstances, this
+ * value will be index_len * page size. However, if
+ * vfd_swmr_writer is TRUE, it may be larger.
+ *
+ * index_size should always equal clean_index_size +
+ * dirty_index_size.
+ *
+ * clean_index_size: Number of bytes of clean entries currently stored in
+ * the hash table used to index the page buffer.
+ *
+ * dirty_index_size: Number of bytes of dirty entries currently stored in
+ * the hash table used to index the page buffer.
+ *
+ * il_len: Number of entries on the index list.
+ *
+ * This must always be equal to index_len. As such, this
+ * field is redundant. However, the existing linked list
+ * management macros expect to maintain a length field, so
+ * this field exists primarily to avoid adding complexity to
+ * these macros.
+ *
+ * il_size: Number of bytes of cache entries currently stored in the
+ * index list.
+ *
+ * This must always be equal to index_size. As such, this
+ * field is redundant. However, the existing linked list
+ * management macros expect to maintain a size field, so
+ * this field exists primarily to avoid adding complexity to
+ * these macros.
+ *
+ * il_head: Pointer to the head of the doubly linked list of entries in
+ * the index list. Note that cache entries on this list are
+ * linked by their il_next and il_prev fields.
+ *
+ * This field is NULL if the index is empty.
+ *
+ * il_tail: Pointer to the tail of the doubly linked list of entries in
+ * the index list. Note that cache entries on this list are
+ * linked by their il_next and il_prev fields.
+ *
+ * This field is NULL if the index is empty.
+ *
+ *
+ * Fields supporting the modified LRU policy:
+ *
+ * See most any OS text for a discussion of the LRU replacement policy.
+ *
+ * Under normal operating circumstances (i.e. vfd_swmr_writer is FALSE)
+ * all entries will reside both in the index and in the LRU. Further,
+ * all entries will be of size page_size.
+ *
+ * The VFD SWMR writer case (i.e. vfd_swmr_writer is TRUE) is complicated
+ * by the requirements that we:
+ *
+ * 1) buffer all metadata writes (including multi-page metadata writes) that
+ * occur during a tick, and
+ *
+ * 2) when necessary, delay metadata writes for up to max_lag ticks to
+ * avoid message from the future bugs on the VFD SWMR readers.
+ *
+ * See discussion of fields supporting VFD SWMR below for details.
+ *
+ * Discussions of the individual fields used by the modified LRU replacement
+ * policy follow:
+ *
+ * LRU_len: Number of page buffer entries currently on the LRU.
+ *
+ * Observe that LRU_len + dwl_len must always equal
+ * index_len.
+ *
+ * LRU_size: Number of bytes of page buffer entries currently residing
+ * on the LRU list.
+ *
+ * Observe that LRU_size + dwl_size must always equal
+ * index_size.
+ *
+ * LRU_head_ptr: Pointer to the head of the doubly linked LRU list. Page
+ * buffer entries on this list are linked by their next and
+ * prev fields.
+ *
+ * This field is NULL if the list is empty.
+ *
+ * LRU_tail_ptr: Pointer to the tail of the doubly linked LRU list. Page
+ * buffer entries on this list are linked by their next and
+ * prev fields.
+ *
+ * This field is NULL if the list is empty.
+ *
+ *
+ * FIELDS SUPPORTING VFD SWMR:
+ *
+ * If the file is opened as a VFD SWMR writer (i.e. vfd_swmr_writer == TRUE),
+ * the page buffer must retain the data necessary to update the metadata
+ * file at the end of each tick, and also delay writes as necessary so as
+ * to avoid message from the future bugs on the VFD SWMR readers.
+ *
+ * The tick list exists to allow us to buffer copies of all metadata writes
+ * during a tick, and the delayed write list supports delayed writes.
+ *
+ * If a regular page is written to during a tick, it is placed on the tick
+ * list. If there is no reason to delay its write to file (i.e. either
+ * it was just allocated, or it has existed in the metadata file index for
+ * at least max_lag ticks), it is also placed on the LRU, where it may be
+ * flushed, but not evicted. If its write must be delayed, it is placed on
+ * the delayed write list, where it must remain until its write delay is
+ * satisfied -- at which point it is moved to the LRU.
+ *
+ * If a multi-page metadata entry is written during a tick, it is placed on
+ * the tick list. If, in addition, the write of the entry must be delayed,
+ * it is also place on the delayed write list. Note that multi-page metadata
+ * entries may never appear on the LRU.
+ *
+ * At the end of each tick, the tick list is emptied.
+ *
+ * Regular pages are simply removed from the tick list, as they must already
+ * appear on either the LRU or the delayed write list.
+ *
+ * Multi-page metadata entries that are not also on the delayed write list
+ * are simply flushed and evicted.
+ *
+ * The delayed write list is also scanned at the end of each tick. Regular
+ * entries that are now flushable are placed at the head of the LRU. Multi-
+ * page metadata entries that are flushable are flushed and evicted.
+ *
+ * The remainder of this sections contains discussions of the fields and
+ * data structures used to support the above operations.
+ *
+ * vfd_swmr_writer: Boolean flag that is set to TRUE iff the file is
+ * opened in VFD SWMR mode. The remaining
+ * VFD SWMR fields are defined iff vfd_swmr_writer is TRUE.
+ *
+ * mpmde_count: int64_t containing the number of multi-page metadata
+ * entries currently resident in the page buffer. Observe
+ * that index_len should always equal curr_pages + mpmde_count.
+ *
+ * cur_tick: uint64_t containing the current tick. This is a copy of
+ * the same field in the associated instance of H5F_file_t,
+ * and is maintained as a convenience.
+ *
+ * In the context of VFD SWMR the delayed write list allows us to delay
+ * metadata writes to the HDF5 file until it appears in all indexes in the
+ * last max_lag ticks. This is essential if a version of the page or
+ * multi-page metadata entry already exists in the HDF5 file -- failure to
+ * delay the write can result in a message from the future which will
+ * likely be perceived as file corruption by the reader.
+ *
+ * To facilitate identification of entries that must be removed from the
+ * DWL during the end of tick scan, the list always observes the following
+ * invariant for any entry on the list:
+ *
+ * entry_ptr->next == NULL ||
+ * entry_ptr->delay_write_until >= entry_ptr->next->delay_write_until
+ *
+ * Discussion of the fields used to implement the delayed write list follows:
+ *
+ * max_delay: Maximum of the delay_write_until fields of the entries on
+ * the delayed write list. This must never be more than max_lag
+ * ticks in advance of the current tick, and should be set to
+ * zero if the delayed write list is empty.
+ *
+ * dwl_len: Number of page buffer entries currently on the delayed
+ * write list.
+ *
+ * Observe that LRU_len + dwl_len must always equal
+ * index_len.
+ *
+ * dwl_size: Number of bytes of page buffer entries currently residing
+ * on the LRU list.
+ *
+ * Observe that LRU_size + dwl_size must always equal
+ * index_size.
+ *
+ * dwl_head_ptr: Pointer to the head of the doubly linked delayed write list.
+ * Page buffer entries on this list are linked by their next and
+ * prev fields.
+ *
+ * This field is NULL if the list is empty.
+ *
+ * dwl_tail_ptr: Pointer to the tail of the doubly linked delayed write list.
+ * Page buffer entries on this list are linked by their next and
+ * prev fields.
+ *
+ * This field is NULL if the list is empty.
+ *
+ * For VFD SWMR to function, copies of all pages modified during a tick must
+ * be retained in the page buffer to allow correct updates to the index and
+ * metadata file at the end of tick.
+ *
+ * To implement this, all entries modified during the current tick are placed
+ * on the tick list. Entries are removed from the tick list during end of
+ * tick processing, so each tick starts with an empty tick list.
+ *
+ * Unless the entry also resides on the delayed write list, entries on the
+ * tick list may be flushed, but they may not be evicted.
+ *
+ * Discussion of the fields used to implement the tick list follows:
+ *
+ * tl_len: Number of page buffer entries currently on the tick list
+ *
+ * tl_size: Number of bytes of page buffer entries currently residing
+ * on the tick list.
+ *
+ * tl_head_ptr: Pointer to the head of the doubly linked tick list.
+ * Page buffer entries on this list are linked by their tl_next
+ * and tl_prev fields.
+ *
+ * This field is NULL if the list is empty.
+ *
+ * tl_tail_ptr: Pointer to the tail of the doubly linked tick list.
+ * Page buffer entries on this list are linked by their tl_next
+ * and tl_prev fields.
+ *
+ * This field is NULL if the list is empty.
+ *
+ *
+ * STATISTICS:
+ *
+ * Multi-page metadata entries (which may only appear in VFD
+ * SWMR mode) are NOT counted in the following statistics.
+ *
+ * Note that all statistics fields contain only data since the last time
+ * that statistics were reset.
+ *
+ * bypasses: Array of int64_t of length H5PB__NUM_STAT_TYPES containing
+ * the number of times that the page buffer has been
+ * bypassed for raw data, metadata, and for multi-page
+ * metadata entries (VFD SWMR only) as indexed by H5PB__STATS_MD,
+ * H5PB__STATS_RD, and H5PB__STATS_MPMDE respectively.
+ *
+ * accesses: Array of int64_t of length H5PB__NUM_STAT_TYPES containing
+ * the number of page buffer accesses for raw data, metadata,
+ * and for multi-page metadata entries (VFD SWMR only) as
+ * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
+ * H5PB__STATS_MPMDE respectively.
+ *
+ * hits: Array of int64_t of length H5PB__NUM_STAT_TYPES containing
+ * the number of page buffer hits for raw data, metadata,
+ * and for multi-page metadata entries (VFD SWMR only) as
+ * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
+ * H5PB__STATS_MPMDE respectively.
+ *
+ * misses: Array of int64_t of length H5PB__NUM_STAT_TYPES containing
+ * the number of page buffer misses for raw data, metadata,
+ * and for multi-page metadata entries (VFD SWMR only) as
+ * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
+ * H5PB__STATS_MPMDE respectively.
+ *
+ * loads: Array of int64_t of length H5PB__NUM_STAT_TYPES containing
+ * the number of page buffer loads for raw data, metadata,
+ * and for multi-page metadata entries (VFD SWMR only) as
+ * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
+ * H5PB__STATS_MPMDE respectively.
+ *
+ * insertions: Array of int64_t of length H5PB__NUM_STAT_TYPES containing
+ * the number of page buffer insertions of raw data, metadata,
+ * and for multi-page metadata entries (VFD SWMR only) as
+ * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
+ * H5PB__STATS_MPMDE respectively.
+ *
+ * flushes: Array of int64_t of length H5PB__NUM_STAT_TYPES containing
+ * the number of page buffer flushes of raw data, metadata,
+ * and for multi-page metadata entries (VFD SWMR only) as
+ * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
+ * H5PB__STATS_MPMDE respectively.
+ *
+ * evictions: Array of int64_t of length H5PB__NUM_STAT_TYPES containing
+ * the number of page buffer evictions of raw data, metadata,
+ * and for multi-page metadata entries (VFD SWMR only) as
+ * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
+ * H5PB__STATS_MPMDE respectively.
+ *
+ * clears: Array of int64_t of length H5PB__NUM_STAT_TYPES containing
+ * the number of page buffer entry clears of raw data, metadata,
+ * and for multi-page metadata entries (VFD SWMR only) as
+ * indexed by H5PB__STATS_MD, H5PB__STATS_RD, and
+ * H5PB__STATS_MPMDE respectively.
+ *
+ * max_lru_len: int64_t containing the maximum number of entries that
+ * have appeared in the LRU.
+ *
+ * max_lru_size: int64_t containing the maximum size of the LRU.
+ *
+ * lru_md_skips: When searching for an entry to evict, metadata entries on
+ * the LRU must be skipped if the number of metadata pages
+ * in the page buffer fails to exceed min_md_pages.
+ *
+ * This int64_t is used to keep a count of these skips.
+ *
+ * If this number becomes excessive, it will be necessary to
+ * add a holding tank for such entries.
+ *
+ * lru_rd_skips: When searching for an entry to evict, raw data entries on
+ * the LRU must be skipped if the number of raw data pages
+ * in the page buffer fails to exceed min_rd_pages.
+ *
+ * This int64_t is used to keep a count of these skips.
+ *
+ * If this number becomes excessive, it will be necessary to
+ * add a holding tank for such entries.
+ *
+ * Multi-page metadata entries (which appear only in VFD SWMR mode) are
+ * listed in the hash table, and thus they are counted in the following
+ * statistics.
+ *
+ * total_ht_insertions: Number of times entries have been inserted into the
+ * hash table.
+ *
+ * total_ht_deletions: Number of times entries have been deleted from the
+ * hash table.
+ *
+ * successful_ht_searches: int64 containing the total number of successful
+ * searches of the hash table.
+ *
+ * total_successful_ht_search_depth: int64 containing the total number of
+ * entries other than the targets examined in successful
+ * searches of the hash table.
+ *
+ * failed_ht_searches: int64 containing the total number of unsuccessful
+ * searches of the hash table.
+ *
+ * total_failed_ht_search_depth: int64 containing the total number of
+ * entries examined in unsuccessful searches of the hash
+ * table.
+ *
+ * max_index_len: Largest value attained by the index_len field.
+ *
+ * max_clean_index_len: Largest value attained by the clean_index_len field.
+ *
+ * max_dirty_index_len: Largest value attained by the dirty_index_len field.
+ *
+ * max_index_size: Largest value attained by the index_size field.
+ *
+ * max_clean_index_size: Largest value attained by the clean_index_size field.
+ *
+ * max_dirty_index_size: Largest value attained by the dirty_index_size field.
+ *
+ * max_rd_pages: Maximum number of raw data pages in the page buffer.
+ *
+ * max_md_pages: Maximum number of metadata pages in the page buffer.
+ *
+ *
+ * Statistics pertaining to VFD SWMR.
+ *
+ * max_mpmde_count: Maximum number of multi-page metadata entries in the
+ * page buffer.
+ *
+ * lru_tl_skips: When searching for an entry to evict, metadata entries on
+ * the LRU must be skipped if they also reside on the tick list.
+ *
+ * This int64_t is used to keep a count of these skips.
+ *
+ * If this number becomes excessive, it will be necessary to
+ * add a holding tank for such entries.
+ *
+ * max_tl_len: int64_t containing the maximum value of tl_len.
+ *
+ * max_tl_size: int64_t containing the maximum value of tl_size.
+ *
+ * delayed_writes: int64_t containing the total number of delayed writes.
+ *
+ * total_delay: int64_t containing the total number of ticks by which
+ * entry writes have been delayed.
+ *
+ * max_dwl_len: int64_t containing the maximum value of dwl_len.
+ *
+ * max_dwl_size: int64_t containing the maximum value of dwl_size.
+ *
+ * total_dwl_ins_depth: int64_t containing the total insertion depth
+ * required to maintain the ordering invariant on the
+ * delayed write list.
+ *
+ ******************************************************************************/
+
+#define H5PB__H5PB_T_MAGIC 0x01020304
+
+#define H5PB__STATS_MD 0
+#define H5PB__STATS_RD 1
+#define H5PB__STATS_MPMDE 2
+#define H5PB__NUM_STAT_TYPES 3
-/* Typedef for the main structure for the page buffer */
typedef struct H5PB_t {
- size_t max_size; /* The total page buffer size */
- size_t page_size; /* Size of a single page */
- unsigned min_meta_perc; /* Minimum ratio of metadata entries required before evicting meta entries */
- unsigned min_raw_perc; /* Minimum ratio of raw data entries required before evicting raw entries */
- unsigned meta_count; /* Number of entries for metadata */
- unsigned raw_count; /* Number of entries for raw data */
- unsigned min_meta_count; /* Minimum # of entries for metadata */
- unsigned min_raw_count; /* Minimum # of entries for raw data */
-
- H5SL_t *slist_ptr; /* Skip list with all the active page entries */
- H5SL_t *mf_slist_ptr; /* Skip list containing newly allocated page entries inserted from the MF layer */
-
- size_t LRU_list_len; /* Number of entries in the LRU (identical to slist_ptr count) */
- struct H5PB_entry_t *LRU_head_ptr; /* Head pointer of the LRU */
- struct H5PB_entry_t *LRU_tail_ptr; /* Tail pointer of the LRU */
-
- H5FL_fac_head_t *page_fac; /* Factory for allocating pages */
-
- /* Statistics */
- unsigned accesses[2];
- unsigned hits[2];
- unsigned misses[2];
- unsigned evictions[2];
- unsigned bypasses[2];
+
+ /* Fields for general operations: */
+
+ uint32_t magic;
+ size_t page_size;
+ int64_t max_pages;
+ int64_t curr_pages;
+ int64_t curr_md_pages;
+ int64_t curr_rd_pages;
+ int64_t min_md_pages;
+ int64_t min_rd_pages;
+
+ /* FAPL fields */
+ size_t max_size;
+ unsigned min_meta_perc;
+ unsigned min_raw_perc;
+
+ /* index */
+ H5PB_entry_t *(ht[H5PB__HASH_TABLE_LEN]);
+ int64_t index_len;
+ int64_t clean_index_len;
+ int64_t dirty_index_len;
+ int64_t index_size;
+ int64_t clean_index_size;
+ int64_t dirty_index_size;
+ int64_t il_len;
+ int64_t il_size;
+ H5PB_entry_t * il_head;
+ H5PB_entry_t * il_tail;
+
+ /* LRU */
+ int64_t LRU_len;
+ int64_t LRU_size;
+ H5PB_entry_t * LRU_head_ptr;
+ H5PB_entry_t * LRU_tail_ptr;
+
+
+ /* Fields for VFD SWMR operations: */
+
+ hbool_t vfd_swmr_writer;
+ int64_t mpmde_count;
+ uint64_t cur_tick;
+
+ /* delayed write list */
+ uint64_t max_delay;
+ int64_t dwl_len;
+ int64_t dwl_size;
+ H5PB_entry_t * dwl_head_ptr;
+ H5PB_entry_t * dwl_tail_ptr;
+
+ /* tick list */
+ int64_t tl_len;
+ int64_t tl_size;
+ H5PB_entry_t * tl_head_ptr;
+ H5PB_entry_t * tl_tail_ptr;
+
+ /* Statistics: */
+
+ /* general operations statistics: */
+ /* these statistics count pages only, not multi-page metadata entries
+ * (that occur only in the VFD SWMR writer case).
+ */
+ int64_t bypasses[H5PB__NUM_STAT_TYPES];
+ int64_t accesses[H5PB__NUM_STAT_TYPES];
+ int64_t hits[H5PB__NUM_STAT_TYPES];
+ int64_t misses[H5PB__NUM_STAT_TYPES];
+ int64_t loads[H5PB__NUM_STAT_TYPES];
+ int64_t insertions[H5PB__NUM_STAT_TYPES];
+ int64_t flushes[H5PB__NUM_STAT_TYPES];
+ int64_t evictions[H5PB__NUM_STAT_TYPES];
+ int64_t clears[H5PB__NUM_STAT_TYPES];
+ uint64_t access_size_count[6];
+ int64_t max_lru_len;
+ int64_t max_lru_size;
+ int64_t lru_md_skips;
+ int64_t lru_rd_skips;
+
+ /* In the VFD SWMR case, both pages and multi-page metadata entries
+ * are stored in the index. Thus multi-page metadata entries are
+ * included in the index related statistics.
+ */
+ int64_t total_ht_insertions;
+ int64_t total_ht_deletions;
+ int64_t successful_ht_searches;
+ int64_t total_successful_ht_search_depth;
+ int64_t failed_ht_searches;
+ int64_t total_failed_ht_search_depth;
+ int64_t max_index_len;
+ int64_t max_clean_index_len;
+ int64_t max_dirty_index_len;
+ int64_t max_index_size;
+ int64_t max_clean_index_size;
+ int64_t max_dirty_index_size;
+ int64_t max_rd_pages;
+ int64_t max_md_pages;
+
+
+ /* vfd swmr statistics */
+ int64_t max_mpmde_count;
+ int64_t lru_tl_skips;
+ int64_t max_tl_len;
+ int64_t max_tl_size;
+ int64_t delayed_writes;
+ int64_t total_delay;
+ int64_t max_dwl_len;
+ int64_t max_dwl_size;
+ int64_t total_dwl_ins_depth;
+
} H5PB_t;
/*****************************/
@@ -85,20 +659,49 @@ typedef struct H5PB_t {
/***************************************/
/* General routines */
-H5_DLL herr_t H5PB_create(H5F_shared_t *f_sh, size_t page_buffer_size, unsigned page_buf_min_meta_perc, unsigned page_buf_min_raw_perc);
-H5_DLL herr_t H5PB_flush(H5F_shared_t *f_sh);
-H5_DLL herr_t H5PB_dest(H5F_shared_t *f_sh);
-H5_DLL herr_t H5PB_add_new_page(H5F_shared_t *f_sh, H5FD_mem_t type, haddr_t page_addr);
-H5_DLL herr_t H5PB_update_entry(H5PB_t *page_buf, haddr_t addr, size_t size, const void *buf);
-H5_DLL herr_t H5PB_remove_entry(const H5F_shared_t *f_sh, haddr_t addr);
-H5_DLL herr_t H5PB_read(H5F_shared_t *f_sh, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*out*/);
-H5_DLL herr_t H5PB_write(H5F_shared_t *f_sh, H5FD_mem_t type, haddr_t addr, size_t size, const void *buf);
+H5_DLL herr_t H5PB_create(H5F_shared_t *shared, size_t page_buffer_size,
+ unsigned page_buf_min_meta_perc, unsigned page_buf_min_raw_perc);
+
+H5_DLL herr_t H5PB_flush(H5F_shared_t *);
+
+H5_DLL herr_t H5PB_dest(H5F_shared_t *);
+
+H5_DLL herr_t H5PB_add_new_page(H5F_shared_t *, H5FD_mem_t, haddr_t);
+
+H5_DLL herr_t H5PB_update_entry(H5PB_t *, haddr_t, size_t, const void *);
+
+H5_DLL herr_t H5PB_remove_entry(H5F_shared_t *, haddr_t);
+H5_DLL herr_t H5PB_remove_entries(H5F_shared_t *, haddr_t, hsize_t);
+
+H5_DLL herr_t H5PB_read(H5F_shared_t *, H5FD_mem_t, haddr_t,
+ size_t, void * /*out*/);
+
+H5_DLL herr_t H5PB_write(H5F_shared_t *, H5FD_mem_t, haddr_t,
+ size_t, const void *);
+
+/* VFD SWMR specific routines */
+H5_DLL herr_t H5PB_vfd_swmr__release_delayed_writes(H5F_shared_t *);
+
+H5_DLL herr_t H5PB_vfd_swmr__release_tick_list(H5F_shared_t *);
+
+H5_DLL herr_t H5PB_vfd_swmr__set_tick(H5F_shared_t *);
+
+H5_DLL herr_t H5PB_vfd_swmr__update_index(H5F_t *f,
+ uint32_t * idx_ent_added_ptr, uint32_t * idx_ent_modified_ptr,
+ uint32_t * idx_ent_not_in_tl_ptr, uint32_t * idx_ent_not_in_tl_flushed_ptr);
/* Statistics routines */
-H5_DLL herr_t H5PB_reset_stats(H5PB_t *page_buf);
+H5_DLL herr_t H5PB_reset_stats(H5PB_t *);
+
H5_DLL herr_t H5PB_get_stats(const H5PB_t *page_buf, unsigned accesses[2],
- unsigned hits[2], unsigned misses[2], unsigned evictions[2], unsigned bypasses[2]);
-H5_DLL herr_t H5PB_print_stats(const H5PB_t *page_buf);
+ unsigned hits[2], unsigned misses[2], unsigned evictions[2],
+ unsigned bypasses[2]);
+
+H5_DLL herr_t H5PB_print_stats(const H5PB_t *);
+
+
+/* test & debug functions */
+H5_DLL herr_t H5PB_page_exists(H5F_shared_t *, haddr_t, hbool_t *);
#endif /* !_H5PBprivate_H */
diff --git a/src/H5Pfapl.c b/src/H5Pfapl.c
index 30b590f..ed6f06a 100644
--- a/src/H5Pfapl.c
+++ b/src/H5Pfapl.c
@@ -47,6 +47,7 @@
#ifdef H5_HAVE_WINDOWS
#include "H5FDwindows.h" /* Win32 I/O */
#endif
+#include "H5FDvfd_swmr.h" /* VFD SWMR file driver */
/* Includes needed to set default VOL connector */
#include "H5VLnative_private.h" /* Native VOL connector */
@@ -282,6 +283,11 @@
#define H5F_ACS_VOL_CONN_CMP H5P__facc_vol_cmp
#define H5F_ACS_VOL_CONN_CLOSE H5P__facc_vol_close
+/* Definitions for the VFD SWMR configuration */
+#define H5F_ACS_VFD_SWMR_CONFIG_SIZE sizeof(H5F_vfd_swmr_config_t)
+#define H5F_ACS_VFD_SWMR_CONFIG_DEF H5F__DEFAULT_VFD_SWMR_CONFIG
+#define H5F_ACS_VFD_SWMR_CONFIG_ENC H5P__facc_vfd_swmr_config_enc
+#define H5F_ACS_VFD_SWMR_CONFIG_DEC H5P__facc_vfd_swmr_config_dec
/******************/
/* Local Typedefs */
@@ -329,6 +335,8 @@ static herr_t H5P__facc_multi_type_enc(const void *value, void **_pp, size_t *si
static herr_t H5P__facc_multi_type_dec(const void **_pp, void *value);
static herr_t H5P__facc_libver_type_enc(const void *value, void **_pp, size_t *size);
static herr_t H5P__facc_libver_type_dec(const void **_pp, void *value);
+static herr_t H5P__facc_vfd_swmr_config_enc(const void *value, void **_pp, size_t *size);
+static herr_t H5P__facc_vfd_swmr_config_dec(const void **_pp, void *value);
/* Metadata cache log location property callbacks */
static herr_t H5P_facc_mdc_log_location_enc(const void *value, void **_pp, size_t *size);
@@ -448,7 +456,9 @@ static const size_t H5F_def_page_buf_size_g = H5F_ACS_PAGE_BUFFER_SIZE_DEF;
static const unsigned H5F_def_page_buf_min_meta_perc_g = H5F_ACS_PAGE_BUFFER_MIN_META_PERC_DEF; /* Default page buffer minimum metadata size */
static const unsigned H5F_def_page_buf_min_raw_perc_g = H5F_ACS_PAGE_BUFFER_MIN_RAW_PERC_DEF; /* Default page buffer mininum raw data size */
-
+static const H5F_vfd_swmr_config_t H5F_def_vfd_swmr_config_g = H5F_ACS_VFD_SWMR_CONFIG_DEF; /* Default vfd swmr configuration */
+
+
/*-------------------------------------------------------------------------
* Function: H5P__facc_reg_prop
*
@@ -698,6 +708,12 @@ H5P__facc_reg_prop(H5P_genclass_t *pclass)
NULL, NULL, NULL, NULL) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
+ /* Register the default VFD SWMR configuration */
+ if(H5P__register_real(pclass, H5F_ACS_VFD_SWMR_CONFIG_NAME, H5F_ACS_VFD_SWMR_CONFIG_SIZE, &H5F_def_vfd_swmr_config_g,
+ NULL, NULL, NULL, H5F_ACS_VFD_SWMR_CONFIG_ENC, H5F_ACS_VFD_SWMR_CONFIG_DEC,
+ NULL, NULL, NULL, NULL) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
+
/* Register the file VOL connector ID & info */
/* (Note: this property should not have an encode/decode callback -QAK) */
if(H5P__register_real(pclass, H5F_ACS_VOL_CONN_NAME, H5F_ACS_VOL_CONN_SIZE, &def_vol_prop,
@@ -705,6 +721,13 @@ H5P__facc_reg_prop(H5P_genclass_t *pclass)
H5F_ACS_VOL_CONN_DEL, H5F_ACS_VOL_CONN_COPY, H5F_ACS_VOL_CONN_CMP, H5F_ACS_VOL_CONN_CLOSE) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
+ if (H5P_LST_FILE_ACCESS_ANY_VFD_g == H5I_INVALID_HID) {
+ H5P_LST_FILE_ACCESS_ANY_VFD_g = H5P_create_id(pclass, false);
+ if (H5P_LST_FILE_ACCESS_ANY_VFD_g == H5I_INVALID_HID) {
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL,
+ "can't create any-vfd fapl");
+ }
+ }
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5P__facc_reg_prop() */
@@ -3976,6 +3999,109 @@ H5P__facc_libver_type_dec(const void **_pp, void *_value)
FUNC_LEAVE_NOAPI(SUCCEED)
} /* end H5P__facc_libver_type_dec() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5P__facc_vfd_swmr_config_enc
+ *
+ * Purpose: Callback routine which is called whenever the VFD SWMR config
+ * property in the file access property list is encoded.
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ * Programmer: Vailin Choi; July 2018
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5P__facc_vfd_swmr_config_enc(const void *value, void **_pp, size_t *size)
+{
+ const H5F_vfd_swmr_config_t *config = (const H5F_vfd_swmr_config_t *)value; /* Create local aliases for values */
+ uint8_t **pp = (uint8_t **)_pp;
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Sanity check */
+ HDassert(value);
+ HDcompile_assert(sizeof(size_t) <= sizeof(uint64_t));
+
+ if(NULL != *pp) {
+
+ /* int */
+ INT32ENCODE(*pp, (int32_t)config->version);
+ INT32ENCODE(*pp, (int32_t)config->tick_len);
+ INT32ENCODE(*pp, (int32_t)config->max_lag);
+ H5_ENCODE_UNSIGNED(*pp, config->writer);
+ H5_ENCODE_UNSIGNED(*pp, config->flush_raw_data);
+ INT32ENCODE(*pp, (int32_t)config->md_pages_reserved);
+ INT32ENCODE(*pp, (int32_t)config->pb_expansion_threshold);
+ HDmemcpy(*pp, (const uint8_t *)(config->md_file_path), (size_t)(H5F__MAX_VFD_SWMR_FILE_NAME_LEN + 1));
+ *pp += H5F__MAX_VFD_SWMR_FILE_NAME_LEN + 1;
+ HDmemcpy(*pp, (const uint8_t *)(config->log_file_path), (size_t)(H5F__MAX_VFD_SWMR_FILE_NAME_LEN + 1));
+ *pp += H5F__MAX_VFD_SWMR_FILE_NAME_LEN + 1;
+
+ } /* end if */
+
+ /* Compute encoded size */
+ *size += ( (5 * sizeof(int32_t)) +
+ (2 * sizeof(unsigned)) +
+ (2 * (H5F__MAX_VFD_SWMR_FILE_NAME_LEN + 1)) );
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+} /* end H5P__facc_vfd_swmr_config_enc() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5P__facc_vfd_swmr_config_dec
+ *
+ * Purpose: Callback routine which is called whenever the VFD SWMR
+ * config property in the file access property list is decoded.
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ * Programmer: Vailin Choi; July 2018
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5P__facc_vfd_swmr_config_dec(const void **_pp, void *_value)
+{
+ H5F_vfd_swmr_config_t *config = (H5F_vfd_swmr_config_t *)_value;
+ const uint8_t **pp = (const uint8_t **)_pp;
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Sanity checks */
+ HDassert(pp);
+ HDassert(*pp);
+ HDassert(config);
+ HDcompile_assert(sizeof(size_t) <= sizeof(uint64_t));
+
+ /* Set property to default value */
+ HDmemcpy(config, &H5F_def_vfd_swmr_config_g, sizeof(H5F_vfd_swmr_config_t));
+
+ /* int */
+ INT32DECODE(*pp, config->version);
+ INT32DECODE(*pp, config->tick_len);
+ INT32DECODE(*pp, config->max_lag);
+
+ H5_DECODE_UNSIGNED(*pp, config->writer);
+ H5_DECODE_UNSIGNED(*pp, config->flush_raw_data);
+
+ /* int */
+ INT32DECODE(*pp, config->md_pages_reserved);
+ INT32DECODE(*pp, config->pb_expansion_threshold);
+
+ HDstrcpy(config->md_file_path, (const char *)(*pp));
+ *pp += H5F__MAX_VFD_SWMR_FILE_NAME_LEN + 1;
+
+ HDstrcpy(config->log_file_path, (const char *)(*pp));
+ *pp += H5F__MAX_VFD_SWMR_FILE_NAME_LEN + 1;
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+} /* end H5P__facc_vfd_swmr_config_dec() */
+
/*-------------------------------------------------------------------------
* Function: H5Pset_metadata_read_attempts
@@ -5411,6 +5537,75 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5P_set_vol() */
+/*-------------------------------------------------------------------------
+ * Function: H5Pset_vfd_swmr_config
+ *
+ * Purpose: Set VFD SWMR configuration in the target FAPL.
+ * Note: Hard-wired to set the driver in the fapl
+ * to use the SWMR VFD driver; this will be changed
+ * later
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Vailin Choi; July 2018
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Pset_vfd_swmr_config(hid_t plist_id, H5F_vfd_swmr_config_t *config_ptr)
+{
+ H5P_genplist_t *plist; /* Property list pointer */
+ size_t name_len;
+ herr_t ret_value = SUCCEED; /* return value */
+
+ FUNC_ENTER_API(FAIL)
+ H5TRACE2("e", "i*x", plist_id, config_ptr);
+
+ /* Get the plist structure */
+ if(NULL == (plist = H5P_object_verify(plist_id,H5P_FILE_ACCESS)))
+ HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID")
+
+ /* Validate the input configuration */
+
+ /* Check args */
+ if(config_ptr == NULL)
+ HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "NULL config_ptr on entry")
+
+ /* This field must always be set to a known version */
+ if(config_ptr->version != H5F__CURR_VFD_SWMR_CONFIG_VERSION)
+ HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "Unknown config version")
+
+ /* This field must be at least 3 */
+ if(config_ptr->max_lag < 3 )
+ HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "max_lag must be at least 3")
+
+ /* This field must be >= 2 */
+ if(config_ptr->md_pages_reserved < 2 )
+ HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "md_pages_reserved must be at least 2")
+
+ /* This field must be in the range [0, 100] */
+ if(config_ptr->pb_expansion_threshold > H5F__MAX_PB_EXPANSION_THRESHOLD)
+ HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "pb_expansion_threshold out of range")
+
+ /* Must provide the path for the metadata file */
+ name_len = HDstrlen(config_ptr->md_file_path);
+ if(name_len == 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "md_file_path is empty")
+ else if(name_len > H5F__MAX_VFD_SWMR_FILE_NAME_LEN)
+ HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "md_file_path is too long")
+
+ /* Set the modified config */
+ if(H5P_set(plist, H5F_ACS_VFD_SWMR_CONFIG_NAME, config_ptr) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set metadata cache initial config")
+
+ if(H5P_set_driver(plist, H5FD_VFD_SWMR, NULL) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set VFD SWMR driver info");
+
+done:
+ FUNC_LEAVE_API(ret_value)
+} /* H5Pset_vfd_swmr_config() */
+
+
/*-------------------------------------------------------------------------
* Function: H5P_reset_vol_class
@@ -5530,6 +5725,42 @@ done:
FUNC_LEAVE_API(ret_value)
} /* end H5Pget_vol_id() */
+/*-------------------------------------------------------------------------
+ * Function: H5Pget_vfd_swmr_config
+ *
+ * Purpose: Retrieve the VFD SWMR configuration from the target FAPL.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Vailin Choi; July 2018
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Pget_vfd_swmr_config(hid_t plist_id, H5F_vfd_swmr_config_t *config_ptr)
+{
+ H5P_genplist_t *plist; /* Property list pointer */
+ herr_t ret_value = SUCCEED; /* return value */
+
+ FUNC_ENTER_API(FAIL)
+ H5TRACE2("e", "i*x", plist_id, config_ptr);
+
+ /* Get the plist structure */
+ if(NULL == (plist = H5P_object_verify(plist_id,H5P_FILE_ACCESS)))
+ HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID")
+
+ /* Validate the config_ptr */
+ if(config_ptr == NULL)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL config_ptr on entry.")
+
+ /* Get the current VFD SWMR configuration */
+ if(H5P_get(plist, H5F_ACS_VFD_SWMR_CONFIG_NAME, config_ptr) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET,FAIL, "can't get VFD SWMR config")
+
+done:
+ FUNC_LEAVE_API(ret_value)
+} /* H5Pget_vfd_swmr_config() */
+
/*-------------------------------------------------------------------------
* Function: H5Pget_vol_info
diff --git a/src/H5Pint.c b/src/H5Pint.c
index 2e463b2..19ed63e 100644
--- a/src/H5Pint.c
+++ b/src/H5Pint.c
@@ -178,6 +178,7 @@ H5P_genclass_t *H5P_CLS_REFERENCE_ACCESS_g = NULL;
*/
hid_t H5P_LST_FILE_CREATE_ID_g = H5I_INVALID_HID;
hid_t H5P_LST_FILE_ACCESS_ID_g = H5I_INVALID_HID;
+hid_t H5P_LST_FILE_ACCESS_ANY_VFD_g = H5I_INVALID_HID;
hid_t H5P_LST_DATASET_CREATE_ID_g = H5I_INVALID_HID;
hid_t H5P_LST_DATASET_ACCESS_ID_g = H5I_INVALID_HID;
hid_t H5P_LST_DATASET_XFER_ID_g = H5I_INVALID_HID;
diff --git a/src/H5Ppublic.h b/src/H5Ppublic.h
index bb33561..8d3c92b 100644
--- a/src/H5Ppublic.h
+++ b/src/H5Ppublic.h
@@ -78,6 +78,7 @@
*/
#define H5P_FILE_CREATE_DEFAULT (H5OPEN H5P_LST_FILE_CREATE_ID_g)
#define H5P_FILE_ACCESS_DEFAULT (H5OPEN H5P_LST_FILE_ACCESS_ID_g)
+#define H5P_FILE_ACCESS_ANY_VFD (H5OPEN H5P_LST_FILE_ACCESS_ANY_VFD_g)
#define H5P_DATASET_CREATE_DEFAULT (H5OPEN H5P_LST_DATASET_CREATE_ID_g)
#define H5P_DATASET_ACCESS_DEFAULT (H5OPEN H5P_LST_DATASET_ACCESS_ID_g)
#define H5P_DATASET_XFER_DEFAULT (H5OPEN H5P_LST_DATASET_XFER_ID_g)
@@ -212,6 +213,7 @@ H5_DLLVAR hid_t H5P_CLS_REFERENCE_ACCESS_ID_g;
/* (Internal to library, do not use! Use macros above) */
H5_DLLVAR hid_t H5P_LST_FILE_CREATE_ID_g;
H5_DLLVAR hid_t H5P_LST_FILE_ACCESS_ID_g;
+H5_DLLVAR hid_t H5P_LST_FILE_ACCESS_ANY_VFD_g;
H5_DLLVAR hid_t H5P_LST_DATASET_CREATE_ID_g;
H5_DLLVAR hid_t H5P_LST_DATASET_ACCESS_ID_g;
H5_DLLVAR hid_t H5P_LST_DATASET_XFER_ID_g;
@@ -392,6 +394,10 @@ H5_DLL herr_t H5Pset_mdc_image_config(hid_t plist_id, H5AC_cache_image_config_t
H5_DLL herr_t H5Pget_mdc_image_config(hid_t plist_id, H5AC_cache_image_config_t *config_ptr /*out*/);
H5_DLL herr_t H5Pset_page_buffer_size(hid_t plist_id, size_t buf_size, unsigned min_meta_per, unsigned min_raw_per);
H5_DLL herr_t H5Pget_page_buffer_size(hid_t plist_id, size_t *buf_size, unsigned *min_meta_per, unsigned *min_raw_per);
+/* VFD SWMR configuration */
+H5_DLL herr_t H5Pset_vfd_swmr_config(hid_t plist_id, H5F_vfd_swmr_config_t *config_ptr);
+H5_DLL herr_t H5Pget_vfd_swmr_config(hid_t plist_id, H5F_vfd_swmr_config_t *config_ptr);
+
/* Dataset creation property list (DCPL) routines */
H5_DLL herr_t H5Pset_layout(hid_t plist_id, H5D_layout_t layout);
diff --git a/src/H5SMcache.c b/src/H5SMcache.c
index 7f243a6..998fe9b 100644
--- a/src/H5SMcache.c
+++ b/src/H5SMcache.c
@@ -95,6 +95,7 @@ const H5AC_class_t H5AC_SOHM_TABLE[1] = {{
NULL, /* 'notify' callback */
H5SM__cache_table_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* 'refresh' callback */
}};
const H5AC_class_t H5AC_SOHM_LIST[1] = {{
@@ -112,6 +113,7 @@ const H5AC_class_t H5AC_SOHM_LIST[1] = {{
NULL, /* 'notify' callback */
H5SM__cache_list_free_icr, /* 'free_icr' callback */
NULL, /* 'fsf_size' callback */
+ NULL, /* 'refresh' callback */
}};
diff --git a/src/H5VLnative.h b/src/H5VLnative.h
index b607abc..e6be0bd 100644
--- a/src/H5VLnative.h
+++ b/src/H5VLnative.h
@@ -80,6 +80,9 @@
#define H5VL_NATIVE_FILE_GET_MPI_ATOMICITY 26 /* H5Fget_mpi_atomicity */
#define H5VL_NATIVE_FILE_SET_MPI_ATOMICITY 27 /* H5Fset_mpi_atomicity */
#define H5VL_NATIVE_FILE_POST_OPEN 28 /* Adjust file after open, with wrapping context */
+#define H5VL_NATIVE_FILE_VFD_SWMR_DISABLE_EOT 29
+#define H5VL_NATIVE_FILE_VFD_SWMR_ENABLE_EOT 30
+#define H5VL_NATIVE_FILE_VFD_SWMR_END_TICK 31
/* Values for native VOL connector group optional VOL operations */
#ifndef H5_NO_DEPRECATED_SYMBOLS
diff --git a/src/H5VLnative_file.c b/src/H5VLnative_file.c
index 5275898..563f4a6 100644
--- a/src/H5VLnative_file.c
+++ b/src/H5VLnative_file.c
@@ -679,11 +679,11 @@ H5VL__native_file_optional(void *obj, H5VL_file_optional_t optional_type,
case H5VL_NATIVE_FILE_RESET_PAGE_BUFFERING_STATS:
{
/* Sanity check */
- if(NULL == f->shared->page_buf)
+ if(NULL == f->shared->pb_ptr)
HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "page buffering not enabled on file")
/* Reset the statistics */
- if(H5PB_reset_stats(f->shared->page_buf) < 0)
+ if(H5PB_reset_stats(f->shared->pb_ptr) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't reset stats for page buffering")
break;
@@ -699,11 +699,11 @@ H5VL__native_file_optional(void *obj, H5VL_file_optional_t optional_type,
unsigned *bypasses = HDva_arg(arguments, unsigned *);
/* Sanity check */
- if(NULL == f->shared->page_buf)
+ if(NULL == f->shared->pb_ptr)
HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "page buffering not enabled on file")
/* Get the statistics */
- if(H5PB_get_stats(f->shared->page_buf, accesses, hits, misses, evictions, bypasses) < 0)
+ if(H5PB_get_stats(f->shared->pb_ptr, accesses, hits, misses, evictions, bypasses) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't retrieve stats for page buffering")
break;
@@ -828,6 +828,33 @@ H5VL__native_file_optional(void *obj, H5VL_file_optional_t optional_type,
break;
}
+ /* H5Fvfd_swmr_disable_end_of_tick() */
+ case H5VL_NATIVE_FILE_VFD_SWMR_DISABLE_EOT:
+ {
+ /* Call package routine */
+ if(H5F__vfd_swmr_disable_end_of_tick((H5F_t *)obj) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, FAIL, "can't disable EOT for VFD SWMR")
+ break;
+ }
+
+ /* H5Fvfd_swmr_enable_end_of_tick() */
+ case H5VL_NATIVE_FILE_VFD_SWMR_ENABLE_EOT:
+ {
+ /* Call package routine */
+ if(H5F__vfd_swmr_enable_end_of_tick((H5F_t *)obj) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, FAIL, "can't enable EOT for VFD SWMR")
+ break;
+ }
+
+ /* H5Fvfd_swmr_end_tick() */
+ case H5VL_NATIVE_FILE_VFD_SWMR_END_TICK:
+ {
+ /* Call package routine */
+ if(H5F__vfd_swmr_end_tick((H5F_t *)obj) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, FAIL, "can't trigger EOT processing for VFD SWMR")
+ break;
+ }
+
default:
HGOTO_ERROR(H5E_VOL, H5E_UNSUPPORTED, FAIL, "invalid optional operation")
} /* end switch */
diff --git a/src/H5private.h b/src/H5private.h
index 836d7d5..3e11b4f 100644
--- a/src/H5private.h
+++ b/src/H5private.h
@@ -1058,6 +1058,9 @@ typedef off_t h5_stat_size_t;
#ifndef HDgettimeofday
#define HDgettimeofday(S,P) gettimeofday(S,P)
#endif /* HDgettimeofday */
+#ifndef HDclock_gettime
+ #define HDclock_gettime(C,T) clock_gettime(C,T)
+#endif /* HDclock_gettime */
#ifndef HDgetuid
#define HDgetuid() getuid()
#endif /* HDgetuid */
@@ -2176,11 +2179,54 @@ H5_DLL herr_t H5CX_pop(void);
\
BEGIN_MPE_LOG
+#include "H5FDvfd_swmr_private.h"
+#include "H5time_private.h" /* for timespeccmp */
+
+#define VFD_SWMR_ENTER(err) \
+ do { \
+ /* TBD assert that the API lock is held. The API lock */ \
+ /* synchronizes access to `vfd_swmr_api_entries_g` */ \
+ if (vfd_swmr_api_entries_g++ > 0) \
+ ; /* Do nothing: we are *re-*entering the API. */ \
+ else if (TAILQ_EMPTY(&eot_queue_g)) \
+ ; /* Nothing to do. */ \
+ else if (H5F_vfd_swmr_process_eot_queue(true) < 0) { \
+ HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, err, \
+ "error processing EOT queue") \
+ } \
+ } while (0)
+
+#define VFD_SWMR_LEAVE(err) \
+ do { \
+ /* TBD assert that the API lock is held. The API lock */ \
+ /* synchronizes access to `vfd_swmr_api_entries_g` */ \
+ if (--vfd_swmr_api_entries_g > 0) \
+ ; /* Do nothing: we are still in an API call. */ \
+ else if (err_occurred) \
+ ; /* Do nothing: an error occurred. */ \
+ else if (TAILQ_EMPTY(&eot_queue_g)) \
+ ; /* Nothing to do. */ \
+ else if (H5F_vfd_swmr_process_eot_queue(false) < 0) { \
+ HDONE_ERROR(H5E_FUNC, H5E_CANTSET, err, \
+ "error processing EOT queue") \
+ } \
+ } while (0)
+
/* Use this macro for all "normal" API functions */
#define FUNC_ENTER_API(err) {{ \
FUNC_ENTER_API_COMMON \
FUNC_ENTER_API_INIT(err); \
FUNC_ENTER_API_PUSH(err); \
+ VFD_SWMR_ENTER(err); \
+ /* Clear thread error stack entering public functions */ \
+ H5E_clear_stack(NULL); \
+ {
+
+/* Use this macro when VFD SWMR EOT is not used on entering an API function */
+#define FUNC_ENTER_API_NO_EOT(err) {{ \
+ FUNC_ENTER_API_COMMON \
+ FUNC_ENTER_API_INIT(err); \
+ FUNC_ENTER_API_PUSH(err); \
/* Clear thread error stack entering public functions */ \
H5E_clear_stack(NULL); \
{
@@ -2193,6 +2239,7 @@ H5_DLL herr_t H5CX_pop(void);
FUNC_ENTER_API_COMMON \
FUNC_ENTER_API_INIT(err); \
FUNC_ENTER_API_PUSH(err); \
+ VFD_SWMR_ENTER(err); \
{
/*
@@ -2397,6 +2444,18 @@ H5_DLL herr_t H5CX_pop(void);
H5TRACE_RETURN(ret_value);
#define FUNC_LEAVE_API(ret_value) \
+ VFD_SWMR_LEAVE(ret_value); \
+ FUNC_LEAVE_API_COMMON(ret_value); \
+ (void)H5CX_pop(); \
+ H5_POP_FUNC \
+ if(err_occurred) \
+ (void)H5E_dump_api_stack(TRUE); \
+ FUNC_LEAVE_API_THREADSAFE \
+ return(ret_value); \
+}} /*end scope from beginning of FUNC_ENTER*/
+
+/* Use this macro when VFD SWMR EOT is not used on leaving an API function */
+#define FUNC_LEAVE_API_NO_EOT(ret_value) \
FUNC_LEAVE_API_COMMON(ret_value); \
(void)H5CX_pop(); \
H5_POP_FUNC \
diff --git a/src/H5public.h b/src/H5public.h
index d3edd23..3a3de88 100644
--- a/src/H5public.h
+++ b/src/H5public.h
@@ -177,42 +177,82 @@ typedef long long ssize_t;
#endif
#endif
+/* int64_t type is used for creation order field for links. It may be
+ * defined in Posix.1g, otherwise it is defined here.
+ */
+#if H5_SIZEOF_INT64_T>=8
+#elif H5_SIZEOF_INT>=8
+ typedef int int64_t;
+# undef H5_SIZEOF_INT64_T
+# define H5_SIZEOF_INT64_T H5_SIZEOF_INT
+#elif H5_SIZEOF_LONG>=8
+ typedef long int64_t;
+# undef H5_SIZEOF_INT64_T
+# define H5_SIZEOF_INT64_T H5_SIZEOF_LONG
+#elif H5_SIZEOF_LONG_LONG>=8
+ typedef long long int64_t;
+# undef H5_SIZEOF_INT64_T
+# define H5_SIZEOF_INT64_T H5_SIZEOF_LONG_LONG
+#else
+# error "nothing appropriate for int64_t"
+#endif
+
+/* uint64_t type is used for fields for H5O_info_t. It may be
+ * defined in Posix.1g, otherwise it is defined here.
+ */
+#if H5_SIZEOF_UINT64_T>=8
+#elif H5_SIZEOF_INT>=8
+ typedef unsigned uint64_t;
+# undef H5_SIZEOF_UINT64_T
+# define H5_SIZEOF_UINT64_T H5_SIZEOF_INT
+#elif H5_SIZEOF_LONG>=8
+ typedef unsigned long uint64_t;
+# undef H5_SIZEOF_UINT64_T
+# define H5_SIZEOF_UINT64_T H5_SIZEOF_LONG
+#elif H5_SIZEOF_LONG_LONG>=8
+ typedef unsigned long long uint64_t;
+# undef H5_SIZEOF_UINT64_T
+# define H5_SIZEOF_UINT64_T H5_SIZEOF_LONG_LONG
+#else
+# error "nothing appropriate for uint64_t"
+#endif
+
/*
* The sizes of file objects have their own types defined here, use a 64-bit
* type.
*/
-#if H5_SIZEOF_LONG_LONG >= 8
-H5_GCC_DIAG_OFF(long-long)
-typedef unsigned long long hsize_t;
-typedef signed long long hssize_t;
-H5_GCC_DIAG_ON(long-long)
-# define H5_SIZEOF_HSIZE_T H5_SIZEOF_LONG_LONG
-# define H5_SIZEOF_HSSIZE_T H5_SIZEOF_LONG_LONG
-#else
-# error "nothing appropriate for hsize_t"
-#endif
-#define HSIZE_UNDEF ((hsize_t)(hssize_t)(-1))
+typedef uint64_t hsize_t;
+typedef int64_t hssize_t;
+#define PRIXHSIZE PRIX64
+#define PRIdHSIZE PRId64
+#define PRIiHSIZE PRIi64
+#define PRIoHSIZE PRIo64
+#define PRIuHSIZE PRIu64
+#define PRIxHSIZE PRIx64
+#define H5_SIZEOF_HSIZE_T H5_SIZEOF_UINT64_T
+#define H5_SIZEOF_HSSIZE_T H5_SIZEOF_INT64_T
+#define HSIZE_UNDEF UINT64_MAX
/*
* File addresses have their own types.
*/
#if H5_SIZEOF_INT >= 8
typedef unsigned haddr_t;
-# define HADDR_UNDEF ((haddr_t)(-1))
+# define HADDR_UNDEF UINT_MAX
# define H5_SIZEOF_HADDR_T H5_SIZEOF_INT
# ifdef H5_HAVE_PARALLEL
# define HADDR_AS_MPI_TYPE MPI_UNSIGNED
# endif /* H5_HAVE_PARALLEL */
#elif H5_SIZEOF_LONG >= 8
typedef unsigned long haddr_t;
-# define HADDR_UNDEF ((haddr_t)(long)(-1))
+# define HADDR_UNDEF ULONG_MAX
# define H5_SIZEOF_HADDR_T H5_SIZEOF_LONG
# ifdef H5_HAVE_PARALLEL
# define HADDR_AS_MPI_TYPE MPI_UNSIGNED_LONG
# endif /* H5_HAVE_PARALLEL */
#elif H5_SIZEOF_LONG_LONG >= 8
typedef unsigned long long haddr_t;
-# define HADDR_UNDEF ((haddr_t)(long long)(-1))
+# define HADDR_UNDEF ULLONG_MAX
# define H5_SIZEOF_HADDR_T H5_SIZEOF_LONG_LONG
# ifdef H5_HAVE_PARALLEL
# define HADDR_AS_MPI_TYPE MPI_LONG_LONG_INT
@@ -221,15 +261,25 @@ H5_GCC_DIAG_ON(long-long)
# error "nothing appropriate for haddr_t"
#endif
#if H5_SIZEOF_HADDR_T == H5_SIZEOF_INT
-# define H5_PRINTF_HADDR_FMT "%u"
+# define PRIXHADDR "X"
+# define PRIoHADDR "o"
+# define PRIuHADDR "u"
+# define PRIxHADDR "x"
#elif H5_SIZEOF_HADDR_T == H5_SIZEOF_LONG
-# define H5_PRINTF_HADDR_FMT "%lu"
+# define PRIXHADDR "lX"
+# define PRIoHADDR "lo"
+# define PRIuHADDR "lu"
+# define PRIxHADDR "lx"
#elif H5_SIZEOF_HADDR_T == H5_SIZEOF_LONG_LONG
-# define H5_PRINTF_HADDR_FMT "%" H5_PRINTF_LL_WIDTH "u"
+# define PRIXHADDR H5_PRINTF_LL_WIDTH "X"
+# define PRIoHADDR H5_PRINTF_LL_WIDTH "o"
+# define PRIuHADDR H5_PRINTF_LL_WIDTH "u"
+# define PRIxHADDR H5_PRINTF_LL_WIDTH "x"
#else
# error "nothing appropriate for H5_PRINTF_HADDR_FMT"
#endif
-#define HADDR_MAX (HADDR_UNDEF-1)
+#define H5_PRINTF_HADDR_FMT "%" PRIuHADDR
+#define HADDR_MAX (HADDR_UNDEF-1)
/* uint32_t type is used for creation order field for messages. It may be
* defined in Posix.1g, otherwise it is defined here.
@@ -251,46 +301,6 @@ H5_GCC_DIAG_ON(long-long)
# error "nothing appropriate for uint32_t"
#endif
-/* int64_t type is used for creation order field for links. It may be
- * defined in Posix.1g, otherwise it is defined here.
- */
-#if H5_SIZEOF_INT64_T>=8
-#elif H5_SIZEOF_INT>=8
- typedef int int64_t;
-# undef H5_SIZEOF_INT64_T
-# define H5_SIZEOF_INT64_T H5_SIZEOF_INT
-#elif H5_SIZEOF_LONG>=8
- typedef long int64_t;
-# undef H5_SIZEOF_INT64_T
-# define H5_SIZEOF_INT64_T H5_SIZEOF_LONG
-#elif H5_SIZEOF_LONG_LONG>=8
- typedef long long int64_t;
-# undef H5_SIZEOF_INT64_T
-# define H5_SIZEOF_INT64_T H5_SIZEOF_LONG_LONG
-#else
-# error "nothing appropriate for int64_t"
-#endif
-
-/* uint64_t type is used for fields for H5O_info_t. It may be
- * defined in Posix.1g, otherwise it is defined here.
- */
-#if H5_SIZEOF_UINT64_T>=8
-#elif H5_SIZEOF_INT>=8
- typedef unsigned uint64_t;
-# undef H5_SIZEOF_UINT64_T
-# define H5_SIZEOF_UINT64_T H5_SIZEOF_INT
-#elif H5_SIZEOF_LONG>=8
- typedef unsigned long uint64_t;
-# undef H5_SIZEOF_UINT64_T
-# define H5_SIZEOF_UINT64_T H5_SIZEOF_LONG
-#elif H5_SIZEOF_LONG_LONG>=8
- typedef unsigned long long uint64_t;
-# undef H5_SIZEOF_UINT64_T
-# define H5_SIZEOF_UINT64_T H5_SIZEOF_LONG_LONG
-#else
-# error "nothing appropriate for uint64_t"
-#endif
-
/* Common iteration orders */
typedef enum {
H5_ITER_UNKNOWN = -1, /* Unknown order */
@@ -328,6 +338,17 @@ typedef struct H5_ih_info_t {
hsize_t heap_size;
} H5_ih_info_t;
+static inline const char *
+htri_to_string(htri_t v)
+{
+ if (v == 0)
+ return "false";
+ else if (v < 0)
+ return "error";
+ else
+ return "true";
+}
+
/* Tokens are unique and permanent identifiers that are
* used to reference HDF5 objects in a container. */
diff --git a/src/H5queue.h b/src/H5queue.h
new file mode 100644
index 0000000..816acca
--- /dev/null
+++ b/src/H5queue.h
@@ -0,0 +1,847 @@
+/* $NetBSD: queue.h,v 1.70.10.1 2017/10/02 13:21:41 martin Exp $ */
+
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)queue.h 8.5 (Berkeley) 8/20/94
+ */
+
+#ifndef _SYS_QUEUE_H_
+#define _SYS_QUEUE_H_
+
+/*
+ * This file defines five types of data structures: singly-linked lists,
+ * lists, simple queues, tail queues, and circular queues.
+ *
+ * A singly-linked list is headed by a single forward pointer. The
+ * elements are singly linked for minimum space and pointer manipulation
+ * overhead at the expense of O(n) removal for arbitrary elements. New
+ * elements can be added to the list after an existing element or at the
+ * head of the list. Elements being removed from the head of the list
+ * should use the explicit macro for this purpose for optimum
+ * efficiency. A singly-linked list may only be traversed in the forward
+ * direction. Singly-linked lists are ideal for applications with large
+ * datasets and few or no removals or for implementing a LIFO queue.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may only be traversed in the forward direction.
+ *
+ * A simple queue is headed by a pair of pointers, one the head of the
+ * list and the other to the tail of the list. The elements are singly
+ * linked to save space, so elements can only be removed from the
+ * head of the list. New elements can be added to the list after
+ * an existing element, at the head of the list, or at the end of the
+ * list. A simple queue may only be traversed in the forward direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * A circle queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or after
+ * an existing element, at the head of the list, or at the end of the list.
+ * A circle queue may be traversed in either direction, but has a more
+ * complex end of list detection.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ */
+
+/*
+ * Include the definition of NULL only on NetBSD because sys/null.h
+ * is not available elsewhere. This conditional makes the header
+ * portable and it can simply be dropped verbatim into any system.
+ * The caveat is that on other systems some other header
+ * must provide NULL before the macros can be used.
+ */
+#ifdef __NetBSD__
+#include <sys/null.h>
+#endif
+
+#if defined(QUEUEDEBUG)
+# if defined(_KERNEL)
+# define QUEUEDEBUG_ABORT(...) panic(__VA_ARGS__)
+# else
+# include <err.h>
+# define QUEUEDEBUG_ABORT(...) err(1, __VA_ARGS__)
+# endif
+#endif
+
+/*
+ * Singly-linked List definitions.
+ */
+#define SLIST_HEAD(name, type) \
+struct name { \
+ struct type *slh_first; /* first element */ \
+}
+
+#define SLIST_HEAD_INITIALIZER(head) \
+ { NULL }
+
+#define SLIST_ENTRY(type) \
+struct { \
+ struct type *sle_next; /* next element */ \
+}
+
+/*
+ * Singly-linked List access methods.
+ */
+#define SLIST_FIRST(head) ((head)->slh_first)
+#define SLIST_END(head) NULL
+#define SLIST_EMPTY(head) ((head)->slh_first == NULL)
+#define SLIST_NEXT(elm, field) ((elm)->field.sle_next)
+
+#define SLIST_FOREACH(var, head, field) \
+ for((var) = (head)->slh_first; \
+ (var) != SLIST_END(head); \
+ (var) = (var)->field.sle_next)
+
+#define SLIST_FOREACH_SAFE(var, head, field, tvar) \
+ for ((var) = SLIST_FIRST((head)); \
+ (var) != SLIST_END(head) && \
+ ((tvar) = SLIST_NEXT((var), field), 1); \
+ (var) = (tvar))
+
+/*
+ * Singly-linked List functions.
+ */
+#define SLIST_INIT(head) do { \
+ (head)->slh_first = SLIST_END(head); \
+} while (/*CONSTCOND*/0)
+
+#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \
+ (elm)->field.sle_next = (slistelm)->field.sle_next; \
+ (slistelm)->field.sle_next = (elm); \
+} while (/*CONSTCOND*/0)
+
+#define SLIST_INSERT_HEAD(head, elm, field) do { \
+ (elm)->field.sle_next = (head)->slh_first; \
+ (head)->slh_first = (elm); \
+} while (/*CONSTCOND*/0)
+
+#define SLIST_REMOVE_AFTER(slistelm, field) do { \
+ (slistelm)->field.sle_next = \
+ SLIST_NEXT(SLIST_NEXT((slistelm), field), field); \
+} while (/*CONSTCOND*/0)
+
+#define SLIST_REMOVE_HEAD(head, field) do { \
+ (head)->slh_first = (head)->slh_first->field.sle_next; \
+} while (/*CONSTCOND*/0)
+
+#define SLIST_REMOVE(head, elm, type, field) do { \
+ if ((head)->slh_first == (elm)) { \
+ SLIST_REMOVE_HEAD((head), field); \
+ } \
+ else { \
+ struct type *curelm = (head)->slh_first; \
+ while(curelm->field.sle_next != (elm)) \
+ curelm = curelm->field.sle_next; \
+ curelm->field.sle_next = \
+ curelm->field.sle_next->field.sle_next; \
+ } \
+} while (/*CONSTCOND*/0)
+
+
+/*
+ * List definitions.
+ */
+#define LIST_HEAD(name, type) \
+struct name { \
+ struct type *lh_first; /* first element */ \
+}
+
+#define LIST_HEAD_INITIALIZER(head) \
+ { NULL }
+
+#define LIST_ENTRY(type) \
+struct { \
+ struct type *le_next; /* next element */ \
+ struct type **le_prev; /* address of previous next element */ \
+}
+
+/*
+ * List access methods.
+ */
+#define LIST_FIRST(head) ((head)->lh_first)
+#define LIST_END(head) NULL
+#define LIST_EMPTY(head) ((head)->lh_first == LIST_END(head))
+#define LIST_NEXT(elm, field) ((elm)->field.le_next)
+
+#define LIST_FOREACH(var, head, field) \
+ for ((var) = ((head)->lh_first); \
+ (var) != LIST_END(head); \
+ (var) = ((var)->field.le_next))
+
+#define LIST_FOREACH_SAFE(var, head, field, tvar) \
+ for ((var) = LIST_FIRST((head)); \
+ (var) != LIST_END(head) && \
+ ((tvar) = LIST_NEXT((var), field), 1); \
+ (var) = (tvar))
+
+#define LIST_MOVE(head1, head2, field) do { \
+ LIST_INIT((head2)); \
+ if (!LIST_EMPTY((head1))) { \
+ (head2)->lh_first = (head1)->lh_first; \
+ (head2)->lh_first->field.le_prev = &(head2)->lh_first; \
+ LIST_INIT((head1)); \
+ } \
+} while (/*CONSTCOND*/0)
+
+/*
+ * List functions.
+ */
+#if defined(QUEUEDEBUG)
+#define QUEUEDEBUG_LIST_INSERT_HEAD(head, elm, field) \
+ if ((head)->lh_first && \
+ (head)->lh_first->field.le_prev != &(head)->lh_first) \
+ QUEUEDEBUG_ABORT("LIST_INSERT_HEAD %p %s:%d", (head), \
+ __FILE__, __LINE__);
+#define QUEUEDEBUG_LIST_OP(elm, field) \
+ if ((elm)->field.le_next && \
+ (elm)->field.le_next->field.le_prev != \
+ &(elm)->field.le_next) \
+ QUEUEDEBUG_ABORT("LIST_* forw %p %s:%d", (elm), \
+ __FILE__, __LINE__); \
+ if (*(elm)->field.le_prev != (elm)) \
+ QUEUEDEBUG_ABORT("LIST_* back %p %s:%d", (elm), \
+ __FILE__, __LINE__);
+#define QUEUEDEBUG_LIST_POSTREMOVE(elm, field) \
+ (elm)->field.le_next = (void *)1L; \
+ (elm)->field.le_prev = (void *)1L;
+#else
+#define QUEUEDEBUG_LIST_INSERT_HEAD(head, elm, field)
+#define QUEUEDEBUG_LIST_OP(elm, field)
+#define QUEUEDEBUG_LIST_POSTREMOVE(elm, field)
+#endif
+
+#define LIST_INIT(head) do { \
+ (head)->lh_first = LIST_END(head); \
+} while (/*CONSTCOND*/0)
+
+#define LIST_INSERT_AFTER(listelm, elm, field) do { \
+ QUEUEDEBUG_LIST_OP((listelm), field) \
+ if (((elm)->field.le_next = (listelm)->field.le_next) != \
+ LIST_END(head)) \
+ (listelm)->field.le_next->field.le_prev = \
+ &(elm)->field.le_next; \
+ (listelm)->field.le_next = (elm); \
+ (elm)->field.le_prev = &(listelm)->field.le_next; \
+} while (/*CONSTCOND*/0)
+
+#define LIST_INSERT_BEFORE(listelm, elm, field) do { \
+ QUEUEDEBUG_LIST_OP((listelm), field) \
+ (elm)->field.le_prev = (listelm)->field.le_prev; \
+ (elm)->field.le_next = (listelm); \
+ *(listelm)->field.le_prev = (elm); \
+ (listelm)->field.le_prev = &(elm)->field.le_next; \
+} while (/*CONSTCOND*/0)
+
+#define LIST_INSERT_HEAD(head, elm, field) do { \
+ QUEUEDEBUG_LIST_INSERT_HEAD((head), (elm), field) \
+ if (((elm)->field.le_next = (head)->lh_first) != LIST_END(head))\
+ (head)->lh_first->field.le_prev = &(elm)->field.le_next;\
+ (head)->lh_first = (elm); \
+ (elm)->field.le_prev = &(head)->lh_first; \
+} while (/*CONSTCOND*/0)
+
+#define LIST_REMOVE(elm, field) do { \
+ QUEUEDEBUG_LIST_OP((elm), field) \
+ if ((elm)->field.le_next != NULL) \
+ (elm)->field.le_next->field.le_prev = \
+ (elm)->field.le_prev; \
+ *(elm)->field.le_prev = (elm)->field.le_next; \
+ QUEUEDEBUG_LIST_POSTREMOVE((elm), field) \
+} while (/*CONSTCOND*/0)
+
+#define LIST_REPLACE(elm, elm2, field) do { \
+ if (((elm2)->field.le_next = (elm)->field.le_next) != NULL) \
+ (elm2)->field.le_next->field.le_prev = \
+ &(elm2)->field.le_next; \
+ (elm2)->field.le_prev = (elm)->field.le_prev; \
+ *(elm2)->field.le_prev = (elm2); \
+ QUEUEDEBUG_LIST_POSTREMOVE((elm), field) \
+} while (/*CONSTCOND*/0)
+
+/*
+ * Simple queue definitions.
+ */
+#define SIMPLEQ_HEAD(name, type) \
+struct name { \
+ struct type *sqh_first; /* first element */ \
+ struct type **sqh_last; /* addr of last next element */ \
+}
+
+#define SIMPLEQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).sqh_first }
+
+#define SIMPLEQ_ENTRY(type) \
+struct { \
+ struct type *sqe_next; /* next element */ \
+}
+
+/*
+ * Simple queue access methods.
+ */
+#define SIMPLEQ_FIRST(head) ((head)->sqh_first)
+#define SIMPLEQ_END(head) NULL
+#define SIMPLEQ_EMPTY(head) ((head)->sqh_first == SIMPLEQ_END(head))
+#define SIMPLEQ_NEXT(elm, field) ((elm)->field.sqe_next)
+
+#define SIMPLEQ_FOREACH(var, head, field) \
+ for ((var) = ((head)->sqh_first); \
+ (var) != SIMPLEQ_END(head); \
+ (var) = ((var)->field.sqe_next))
+
+#define SIMPLEQ_FOREACH_SAFE(var, head, field, next) \
+ for ((var) = ((head)->sqh_first); \
+ (var) != SIMPLEQ_END(head) && \
+ ((next = ((var)->field.sqe_next)), 1); \
+ (var) = (next))
+
+/*
+ * Simple queue functions.
+ */
+#define SIMPLEQ_INIT(head) do { \
+ (head)->sqh_first = NULL; \
+ (head)->sqh_last = &(head)->sqh_first; \
+} while (/*CONSTCOND*/0)
+
+#define SIMPLEQ_INSERT_HEAD(head, elm, field) do { \
+ if (((elm)->field.sqe_next = (head)->sqh_first) == NULL) \
+ (head)->sqh_last = &(elm)->field.sqe_next; \
+ (head)->sqh_first = (elm); \
+} while (/*CONSTCOND*/0)
+
+#define SIMPLEQ_INSERT_TAIL(head, elm, field) do { \
+ (elm)->field.sqe_next = NULL; \
+ *(head)->sqh_last = (elm); \
+ (head)->sqh_last = &(elm)->field.sqe_next; \
+} while (/*CONSTCOND*/0)
+
+#define SIMPLEQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ if (((elm)->field.sqe_next = (listelm)->field.sqe_next) == NULL)\
+ (head)->sqh_last = &(elm)->field.sqe_next; \
+ (listelm)->field.sqe_next = (elm); \
+} while (/*CONSTCOND*/0)
+
+#define SIMPLEQ_REMOVE_HEAD(head, field) do { \
+ if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL) \
+ (head)->sqh_last = &(head)->sqh_first; \
+} while (/*CONSTCOND*/0)
+
+#define SIMPLEQ_REMOVE_AFTER(head, elm, field) do { \
+ if (((elm)->field.sqe_next = (elm)->field.sqe_next->field.sqe_next) \
+ == NULL) \
+ (head)->sqh_last = &(elm)->field.sqe_next; \
+} while (/*CONSTCOND*/0)
+
+#define SIMPLEQ_REMOVE(head, elm, type, field) do { \
+ if ((head)->sqh_first == (elm)) { \
+ SIMPLEQ_REMOVE_HEAD((head), field); \
+ } else { \
+ struct type *curelm = (head)->sqh_first; \
+ while (curelm->field.sqe_next != (elm)) \
+ curelm = curelm->field.sqe_next; \
+ if ((curelm->field.sqe_next = \
+ curelm->field.sqe_next->field.sqe_next) == NULL) \
+ (head)->sqh_last = &(curelm)->field.sqe_next; \
+ } \
+} while (/*CONSTCOND*/0)
+
+#define SIMPLEQ_CONCAT(head1, head2) do { \
+ if (!SIMPLEQ_EMPTY((head2))) { \
+ *(head1)->sqh_last = (head2)->sqh_first; \
+ (head1)->sqh_last = (head2)->sqh_last; \
+ SIMPLEQ_INIT((head2)); \
+ } \
+} while (/*CONSTCOND*/0)
+
+#define SIMPLEQ_LAST(head, type, field) \
+ (SIMPLEQ_EMPTY((head)) ? \
+ NULL : \
+ ((struct type *)(void *) \
+ ((char *)((head)->sqh_last) - offsetof(struct type, field))))
+
+/*
+ * Tail queue definitions.
+ */
+#define _TAILQ_HEAD(name, type, qual) \
+struct name { \
+ qual type *tqh_first; /* first element */ \
+ qual type *qual *tqh_last; /* addr of last next element */ \
+}
+#define TAILQ_HEAD(name, type) _TAILQ_HEAD(name, struct type,)
+
+#define TAILQ_HEAD_INITIALIZER(head) \
+ { TAILQ_END(head), &(head).tqh_first }
+
+#define _TAILQ_ENTRY(type, qual) \
+struct { \
+ qual type *tqe_next; /* next element */ \
+ qual type *qual *tqe_prev; /* address of previous next element */\
+}
+#define TAILQ_ENTRY(type) _TAILQ_ENTRY(struct type,)
+
+/*
+ * Tail queue access methods.
+ */
+#define TAILQ_FIRST(head) ((head)->tqh_first)
+#define TAILQ_END(head) (NULL)
+#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
+#define TAILQ_LAST(head, headname) \
+ (*(((struct headname *)(void *)((head)->tqh_last))->tqh_last))
+#define TAILQ_PREV(elm, headname, field) \
+ (*(((struct headname *)(void *)((elm)->field.tqe_prev))->tqh_last))
+#define TAILQ_EMPTY(head) (TAILQ_FIRST(head) == TAILQ_END(head))
+
+
+#define TAILQ_FOREACH(var, head, field) \
+ for ((var) = ((head)->tqh_first); \
+ (var) != TAILQ_END(head); \
+ (var) = ((var)->field.tqe_next))
+
+#define TAILQ_FOREACH_SAFE(var, head, field, next) \
+ for ((var) = ((head)->tqh_first); \
+ (var) != TAILQ_END(head) && \
+ ((next) = TAILQ_NEXT(var, field), 1); (var) = (next))
+
+#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \
+ for ((var) = TAILQ_LAST((head), headname); \
+ (var) != TAILQ_END(head); \
+ (var) = TAILQ_PREV((var), headname, field))
+
+#define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, prev) \
+ for ((var) = TAILQ_LAST((head), headname); \
+ (var) != TAILQ_END(head) && \
+ ((prev) = TAILQ_PREV((var), headname, field), 1); (var) = (prev))
+
+/*
+ * Tail queue functions.
+ */
+#if defined(QUEUEDEBUG)
+#define QUEUEDEBUG_TAILQ_INSERT_HEAD(head, elm, field) \
+ if ((head)->tqh_first && \
+ (head)->tqh_first->field.tqe_prev != &(head)->tqh_first) \
+ QUEUEDEBUG_ABORT("TAILQ_INSERT_HEAD %p %s:%d", (head), \
+ __FILE__, __LINE__);
+#define QUEUEDEBUG_TAILQ_INSERT_TAIL(head, elm, field) \
+ if (*(head)->tqh_last != NULL) \
+ QUEUEDEBUG_ABORT("TAILQ_INSERT_TAIL %p %s:%d", (head), \
+ __FILE__, __LINE__);
+#define QUEUEDEBUG_TAILQ_OP(elm, field) \
+ if ((elm)->field.tqe_next && \
+ (elm)->field.tqe_next->field.tqe_prev != \
+ &(elm)->field.tqe_next) \
+ QUEUEDEBUG_ABORT("TAILQ_* forw %p %s:%d", (elm), \
+ __FILE__, __LINE__); \
+ if (*(elm)->field.tqe_prev != (elm)) \
+ QUEUEDEBUG_ABORT("TAILQ_* back %p %s:%d", (elm), \
+ __FILE__, __LINE__);
+#define QUEUEDEBUG_TAILQ_PREREMOVE(head, elm, field) \
+ if ((elm)->field.tqe_next == NULL && \
+ (head)->tqh_last != &(elm)->field.tqe_next) \
+ QUEUEDEBUG_ABORT("TAILQ_PREREMOVE head %p elm %p %s:%d",\
+ (head), (elm), __FILE__, __LINE__);
+#define QUEUEDEBUG_TAILQ_POSTREMOVE(elm, field) \
+ (elm)->field.tqe_next = (void *)1L; \
+ (elm)->field.tqe_prev = (void *)1L;
+#else
+#define QUEUEDEBUG_TAILQ_INSERT_HEAD(head, elm, field)
+#define QUEUEDEBUG_TAILQ_INSERT_TAIL(head, elm, field)
+#define QUEUEDEBUG_TAILQ_OP(elm, field)
+#define QUEUEDEBUG_TAILQ_PREREMOVE(head, elm, field)
+#define QUEUEDEBUG_TAILQ_POSTREMOVE(elm, field)
+#endif
+
+#define TAILQ_INIT(head) do { \
+ (head)->tqh_first = TAILQ_END(head); \
+ (head)->tqh_last = &(head)->tqh_first; \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_INSERT_HEAD(head, elm, field) do { \
+ QUEUEDEBUG_TAILQ_INSERT_HEAD((head), (elm), field) \
+ if (((elm)->field.tqe_next = (head)->tqh_first) != TAILQ_END(head))\
+ (head)->tqh_first->field.tqe_prev = \
+ &(elm)->field.tqe_next; \
+ else \
+ (head)->tqh_last = &(elm)->field.tqe_next; \
+ (head)->tqh_first = (elm); \
+ (elm)->field.tqe_prev = &(head)->tqh_first; \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_INSERT_TAIL(head, elm, field) do { \
+ QUEUEDEBUG_TAILQ_INSERT_TAIL((head), (elm), field) \
+ (elm)->field.tqe_next = TAILQ_END(head); \
+ (elm)->field.tqe_prev = (head)->tqh_last; \
+ *(head)->tqh_last = (elm); \
+ (head)->tqh_last = &(elm)->field.tqe_next; \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ QUEUEDEBUG_TAILQ_OP((listelm), field) \
+ if (((elm)->field.tqe_next = (listelm)->field.tqe_next) != \
+ TAILQ_END(head)) \
+ (elm)->field.tqe_next->field.tqe_prev = \
+ &(elm)->field.tqe_next; \
+ else \
+ (head)->tqh_last = &(elm)->field.tqe_next; \
+ (listelm)->field.tqe_next = (elm); \
+ (elm)->field.tqe_prev = &(listelm)->field.tqe_next; \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
+ QUEUEDEBUG_TAILQ_OP((listelm), field) \
+ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
+ (elm)->field.tqe_next = (listelm); \
+ *(listelm)->field.tqe_prev = (elm); \
+ (listelm)->field.tqe_prev = &(elm)->field.tqe_next; \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_REMOVE(head, elm, field) do { \
+ QUEUEDEBUG_TAILQ_PREREMOVE((head), (elm), field) \
+ QUEUEDEBUG_TAILQ_OP((elm), field) \
+ if (((elm)->field.tqe_next) != TAILQ_END(head)) \
+ (elm)->field.tqe_next->field.tqe_prev = \
+ (elm)->field.tqe_prev; \
+ else \
+ (head)->tqh_last = (elm)->field.tqe_prev; \
+ *(elm)->field.tqe_prev = (elm)->field.tqe_next; \
+ QUEUEDEBUG_TAILQ_POSTREMOVE((elm), field); \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_REPLACE(head, elm, elm2, field) do { \
+ if (((elm2)->field.tqe_next = (elm)->field.tqe_next) != \
+ TAILQ_END(head)) \
+ (elm2)->field.tqe_next->field.tqe_prev = \
+ &(elm2)->field.tqe_next; \
+ else \
+ (head)->tqh_last = &(elm2)->field.tqe_next; \
+ (elm2)->field.tqe_prev = (elm)->field.tqe_prev; \
+ *(elm2)->field.tqe_prev = (elm2); \
+ QUEUEDEBUG_TAILQ_POSTREMOVE((elm), field); \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_CONCAT(head1, head2, field) do { \
+ if (!TAILQ_EMPTY(head2)) { \
+ *(head1)->tqh_last = (head2)->tqh_first; \
+ (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \
+ (head1)->tqh_last = (head2)->tqh_last; \
+ TAILQ_INIT((head2)); \
+ } \
+} while (/*CONSTCOND*/0)
+
+/*
+ * Singly-linked Tail queue declarations.
+ */
+#define STAILQ_HEAD(name, type) \
+struct name { \
+ struct type *stqh_first; /* first element */ \
+ struct type **stqh_last; /* addr of last next element */ \
+}
+
+#define STAILQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).stqh_first }
+
+#define STAILQ_ENTRY(type) \
+struct { \
+ struct type *stqe_next; /* next element */ \
+}
+
+/*
+ * Singly-linked Tail queue access methods.
+ */
+#define STAILQ_FIRST(head) ((head)->stqh_first)
+#define STAILQ_END(head) NULL
+#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next)
+#define STAILQ_EMPTY(head) (STAILQ_FIRST(head) == STAILQ_END(head))
+
+/*
+ * Singly-linked Tail queue functions.
+ */
+#define STAILQ_INIT(head) do { \
+ (head)->stqh_first = NULL; \
+ (head)->stqh_last = &(head)->stqh_first; \
+} while (/*CONSTCOND*/0)
+
+#define STAILQ_INSERT_HEAD(head, elm, field) do { \
+ if (((elm)->field.stqe_next = (head)->stqh_first) == NULL) \
+ (head)->stqh_last = &(elm)->field.stqe_next; \
+ (head)->stqh_first = (elm); \
+} while (/*CONSTCOND*/0)
+
+#define STAILQ_INSERT_TAIL(head, elm, field) do { \
+ (elm)->field.stqe_next = NULL; \
+ *(head)->stqh_last = (elm); \
+ (head)->stqh_last = &(elm)->field.stqe_next; \
+} while (/*CONSTCOND*/0)
+
+#define STAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ if (((elm)->field.stqe_next = (listelm)->field.stqe_next) == NULL)\
+ (head)->stqh_last = &(elm)->field.stqe_next; \
+ (listelm)->field.stqe_next = (elm); \
+} while (/*CONSTCOND*/0)
+
+#define STAILQ_REMOVE_HEAD(head, field) do { \
+ if (((head)->stqh_first = (head)->stqh_first->field.stqe_next) == NULL) \
+ (head)->stqh_last = &(head)->stqh_first; \
+} while (/*CONSTCOND*/0)
+
+#define STAILQ_REMOVE(head, elm, type, field) do { \
+ if ((head)->stqh_first == (elm)) { \
+ STAILQ_REMOVE_HEAD((head), field); \
+ } else { \
+ struct type *curelm = (head)->stqh_first; \
+ while (curelm->field.stqe_next != (elm)) \
+ curelm = curelm->field.stqe_next; \
+ if ((curelm->field.stqe_next = \
+ curelm->field.stqe_next->field.stqe_next) == NULL) \
+ (head)->stqh_last = &(curelm)->field.stqe_next; \
+ } \
+} while (/*CONSTCOND*/0)
+
+#define STAILQ_FOREACH(var, head, field) \
+ for ((var) = ((head)->stqh_first); \
+ (var); \
+ (var) = ((var)->field.stqe_next))
+
+#define STAILQ_FOREACH_SAFE(var, head, field, tvar) \
+ for ((var) = STAILQ_FIRST((head)); \
+ (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \
+ (var) = (tvar))
+
+#define STAILQ_CONCAT(head1, head2) do { \
+ if (!STAILQ_EMPTY((head2))) { \
+ *(head1)->stqh_last = (head2)->stqh_first; \
+ (head1)->stqh_last = (head2)->stqh_last; \
+ STAILQ_INIT((head2)); \
+ } \
+} while (/*CONSTCOND*/0)
+
+#define STAILQ_LAST(head, type, field) \
+ (STAILQ_EMPTY((head)) ? \
+ NULL : \
+ ((struct type *)(void *) \
+ ((char *)((head)->stqh_last) - offsetof(struct type, field))))
+
+
+#ifndef _KERNEL
+/*
+ * Circular queue definitions. Do not use. We still keep the macros
+ * for compatibility but because of pointer aliasing issues their use
+ * is discouraged!
+ */
+
+/*
+ * __launder_type(): We use this ugly hack to work around the compiler
+ * noticing that two types may not alias each other and elide tests in code.
+ * We hit this in the CIRCLEQ macros when comparing 'struct name *' and
+ * 'struct type *' (see CIRCLEQ_HEAD()). Modern compilers (such as GCC
+ * 4.8) declare these comparisons as always false, causing the code to
+ * not run as designed.
+ *
+ * This hack is only to be used for comparisons and thus can be fully const.
+ * Do not use for assignment.
+ *
+ * If we ever choose to change the ABI of the CIRCLEQ macros, we could fix
+ * this by changing the head/tail sentinel values, but see the note above
+ * this one.
+ */
+static __inline const void * __launder_type(const void *);
+static __inline const void *
+__launder_type(const void *__x)
+{
+ __asm __volatile("" : "+r" (__x));
+ return __x;
+}
+
+#if defined(QUEUEDEBUG)
+#define QUEUEDEBUG_CIRCLEQ_HEAD(head, field) \
+ if ((head)->cqh_first != CIRCLEQ_ENDC(head) && \
+ (head)->cqh_first->field.cqe_prev != CIRCLEQ_ENDC(head)) \
+ QUEUEDEBUG_ABORT("CIRCLEQ head forw %p %s:%d", (head), \
+ __FILE__, __LINE__); \
+ if ((head)->cqh_last != CIRCLEQ_ENDC(head) && \
+ (head)->cqh_last->field.cqe_next != CIRCLEQ_ENDC(head)) \
+ QUEUEDEBUG_ABORT("CIRCLEQ head back %p %s:%d", (head), \
+ __FILE__, __LINE__);
+#define QUEUEDEBUG_CIRCLEQ_ELM(head, elm, field) \
+ if ((elm)->field.cqe_next == CIRCLEQ_ENDC(head)) { \
+ if ((head)->cqh_last != (elm)) \
+ QUEUEDEBUG_ABORT("CIRCLEQ elm last %p %s:%d", \
+ (elm), __FILE__, __LINE__); \
+ } else { \
+ if ((elm)->field.cqe_next->field.cqe_prev != (elm)) \
+ QUEUEDEBUG_ABORT("CIRCLEQ elm forw %p %s:%d", \
+ (elm), __FILE__, __LINE__); \
+ } \
+ if ((elm)->field.cqe_prev == CIRCLEQ_ENDC(head)) { \
+ if ((head)->cqh_first != (elm)) \
+ QUEUEDEBUG_ABORT("CIRCLEQ elm first %p %s:%d", \
+ (elm), __FILE__, __LINE__); \
+ } else { \
+ if ((elm)->field.cqe_prev->field.cqe_next != (elm)) \
+ QUEUEDEBUG_ABORT("CIRCLEQ elm prev %p %s:%d", \
+ (elm), __FILE__, __LINE__); \
+ }
+#define QUEUEDEBUG_CIRCLEQ_POSTREMOVE(elm, field) \
+ (elm)->field.cqe_next = (void *)1L; \
+ (elm)->field.cqe_prev = (void *)1L;
+#else
+#define QUEUEDEBUG_CIRCLEQ_HEAD(head, field)
+#define QUEUEDEBUG_CIRCLEQ_ELM(head, elm, field)
+#define QUEUEDEBUG_CIRCLEQ_POSTREMOVE(elm, field)
+#endif
+
+#define CIRCLEQ_HEAD(name, type) \
+struct name { \
+ struct type *cqh_first; /* first element */ \
+ struct type *cqh_last; /* last element */ \
+}
+
+#define CIRCLEQ_HEAD_INITIALIZER(head) \
+ { CIRCLEQ_END(&head), CIRCLEQ_END(&head) }
+
+#define CIRCLEQ_ENTRY(type) \
+struct { \
+ struct type *cqe_next; /* next element */ \
+ struct type *cqe_prev; /* previous element */ \
+}
+
+/*
+ * Circular queue functions.
+ */
+#define CIRCLEQ_INIT(head) do { \
+ (head)->cqh_first = CIRCLEQ_END(head); \
+ (head)->cqh_last = CIRCLEQ_END(head); \
+} while (/*CONSTCOND*/0)
+
+#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ QUEUEDEBUG_CIRCLEQ_HEAD((head), field) \
+ QUEUEDEBUG_CIRCLEQ_ELM((head), (listelm), field) \
+ (elm)->field.cqe_next = (listelm)->field.cqe_next; \
+ (elm)->field.cqe_prev = (listelm); \
+ if ((listelm)->field.cqe_next == CIRCLEQ_ENDC(head)) \
+ (head)->cqh_last = (elm); \
+ else \
+ (listelm)->field.cqe_next->field.cqe_prev = (elm); \
+ (listelm)->field.cqe_next = (elm); \
+} while (/*CONSTCOND*/0)
+
+#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \
+ QUEUEDEBUG_CIRCLEQ_HEAD((head), field) \
+ QUEUEDEBUG_CIRCLEQ_ELM((head), (listelm), field) \
+ (elm)->field.cqe_next = (listelm); \
+ (elm)->field.cqe_prev = (listelm)->field.cqe_prev; \
+ if ((listelm)->field.cqe_prev == CIRCLEQ_ENDC(head)) \
+ (head)->cqh_first = (elm); \
+ else \
+ (listelm)->field.cqe_prev->field.cqe_next = (elm); \
+ (listelm)->field.cqe_prev = (elm); \
+} while (/*CONSTCOND*/0)
+
+#define CIRCLEQ_INSERT_HEAD(head, elm, field) do { \
+ QUEUEDEBUG_CIRCLEQ_HEAD((head), field) \
+ (elm)->field.cqe_next = (head)->cqh_first; \
+ (elm)->field.cqe_prev = CIRCLEQ_END(head); \
+ if ((head)->cqh_last == CIRCLEQ_ENDC(head)) \
+ (head)->cqh_last = (elm); \
+ else \
+ (head)->cqh_first->field.cqe_prev = (elm); \
+ (head)->cqh_first = (elm); \
+} while (/*CONSTCOND*/0)
+
+#define CIRCLEQ_INSERT_TAIL(head, elm, field) do { \
+ QUEUEDEBUG_CIRCLEQ_HEAD((head), field) \
+ (elm)->field.cqe_next = CIRCLEQ_END(head); \
+ (elm)->field.cqe_prev = (head)->cqh_last; \
+ if ((head)->cqh_first == CIRCLEQ_ENDC(head)) \
+ (head)->cqh_first = (elm); \
+ else \
+ (head)->cqh_last->field.cqe_next = (elm); \
+ (head)->cqh_last = (elm); \
+} while (/*CONSTCOND*/0)
+
+#define CIRCLEQ_REMOVE(head, elm, field) do { \
+ QUEUEDEBUG_CIRCLEQ_HEAD((head), field) \
+ QUEUEDEBUG_CIRCLEQ_ELM((head), (elm), field) \
+ if ((elm)->field.cqe_next == CIRCLEQ_ENDC(head)) \
+ (head)->cqh_last = (elm)->field.cqe_prev; \
+ else \
+ (elm)->field.cqe_next->field.cqe_prev = \
+ (elm)->field.cqe_prev; \
+ if ((elm)->field.cqe_prev == CIRCLEQ_ENDC(head)) \
+ (head)->cqh_first = (elm)->field.cqe_next; \
+ else \
+ (elm)->field.cqe_prev->field.cqe_next = \
+ (elm)->field.cqe_next; \
+ QUEUEDEBUG_CIRCLEQ_POSTREMOVE((elm), field) \
+} while (/*CONSTCOND*/0)
+
+#define CIRCLEQ_FOREACH(var, head, field) \
+ for ((var) = ((head)->cqh_first); \
+ (var) != CIRCLEQ_ENDC(head); \
+ (var) = ((var)->field.cqe_next))
+
+#define CIRCLEQ_FOREACH_REVERSE(var, head, field) \
+ for ((var) = ((head)->cqh_last); \
+ (var) != CIRCLEQ_ENDC(head); \
+ (var) = ((var)->field.cqe_prev))
+
+/*
+ * Circular queue access methods.
+ */
+#define CIRCLEQ_FIRST(head) ((head)->cqh_first)
+#define CIRCLEQ_LAST(head) ((head)->cqh_last)
+/* For comparisons */
+#define CIRCLEQ_ENDC(head) (__launder_type(head))
+/* For assignments */
+#define CIRCLEQ_END(head) ((void *)(head))
+#define CIRCLEQ_NEXT(elm, field) ((elm)->field.cqe_next)
+#define CIRCLEQ_PREV(elm, field) ((elm)->field.cqe_prev)
+#define CIRCLEQ_EMPTY(head) \
+ (CIRCLEQ_FIRST(head) == CIRCLEQ_ENDC(head))
+
+#define CIRCLEQ_LOOP_NEXT(head, elm, field) \
+ (((elm)->field.cqe_next == CIRCLEQ_ENDC(head)) \
+ ? ((head)->cqh_first) \
+ : (elm->field.cqe_next))
+#define CIRCLEQ_LOOP_PREV(head, elm, field) \
+ (((elm)->field.cqe_prev == CIRCLEQ_ENDC(head)) \
+ ? ((head)->cqh_last) \
+ : (elm->field.cqe_prev))
+#endif /* !_KERNEL */
+
+#endif /* !_SYS_QUEUE_H_ */
diff --git a/src/H5retry_private.h b/src/H5retry_private.h
new file mode 100644
index 0000000..6621957
--- /dev/null
+++ b/src/H5retry_private.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019 The HDF Group. All rights reserved.
+ *
+ * This file is part of HDF5. The full HDF5 copyright notice, including
+ * terms governing use, modification, and redistribution, is contained in
+ * the COPYING file, which can be found at the root of the source code
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases.
+ * If you do not have access to either file, you may request a copy from
+ * help@hdfgroup.org.
+ */
+
+#ifndef _H5retry_private_H
+#define _H5retry_private_H
+
+/*
+ * Data types and functions for retry loops.
+ */
+
+/* State for a retry loop. No user-serviceable parts inside. */
+typedef struct h5_retry_t {
+ uint64_t maxival; /* maximum sleep interval (nanoseconds) */
+ unsigned maxtries; /* maximum permissible tries */
+ unsigned tries; /* remaining tries */
+ uint64_t ival; /* nanoseconds sleep interval before clamping to
+ * maxival
+ */
+} h5_retry_t;
+
+/* Default minimum/maximum retry intervals: 1/10s minimum, 1s maximum. */
+#define H5_RETRY_DEFAULT_MINIVAL ( 100ULL * 1000ULL * 1000ULL)
+#define H5_RETRY_DEFAULT_MAXIVAL ( 1000ULL * 1000ULL * 1000ULL)
+/* One second and one hour, in nanoseconds: */
+#define H5_RETRY_ONE_SECOND (1000ULL * 1000ULL * 1000ULL)
+#define H5_RETRY_ONE_HOUR (3600ULL * H5_RETRY_ONE_SECOND)
+
+/* If any tries remain, decrease the number of remaining tries and
+ * return true. Otherwise, return false.
+ *
+ * XXX This is not part of the API. XXX
+ */
+static inline bool
+h5_retry_decrement(struct h5_retry_t *r)
+{
+ if (r->tries == 0)
+ return false;
+ --r->tries;
+ return true;
+}
+
+/* Establish state for a retry loop in `r`. The loop will retry no
+ * more than `maxtries` times, sleeping for no fewer than `minival`
+ * nanoseconds between tries. After each try, the sleep time will
+ * increase to `maxival` nanoseconds or twice the previous sleep time,
+ * whichever is less.
+ *
+ * `h5_retry_init` always returns true. This is to help one use
+ * it in a loop like this:
+ *
+ * for (do_try = h5_retry_init(&r, 100, H5_RETRY_DEFAULT_MINIVAL,
+ * H5_RETRY_DEFAULT_MAXIVAL);
+ * do_try;
+ * do_try = h5_retry_next(&r)) {
+ * .
+ * .
+ * .
+ * }
+ *
+ * Note well: the program will enter the body of the loop, above, no more
+ * than 101 times: once for an initial try, and then 100 times for retries.
+ */
+static inline bool
+h5_retry_init(struct h5_retry_t *r, unsigned int maxtries, uint64_t minival,
+ uint64_t maxival)
+{
+ memset(r, '\0', sizeof(*r));
+ assert(0 < maxtries);
+ assert(0 < minival && minival <= maxival);
+ r->tries = r->maxtries = maxtries;
+ r->ival = minival;
+ r->maxival = maxival;
+ return h5_retry_decrement(r);
+}
+
+/* If any tries remain, sleep for the minimum interval, or twice the
+ * previous sleep time, and return true. If no tries remain, return false.
+ */
+static inline bool
+h5_retry_next(struct h5_retry_t *r)
+{
+ uint64_t ival;
+
+ if (!h5_retry_decrement(r))
+ return false;
+ ival = r->ival;
+ if (r->maxival < ival)
+ ival = r->maxival;
+ else if (UINT64_MAX - ival >= ival)
+ r->ival += ival;
+
+ H5_nanosleep(ival);
+
+ return true;
+}
+
+/* Return the number of tries performed since `h5_retry_init()`
+ * was called on `r`.
+ */
+static inline unsigned
+h5_retry_tries(struct h5_retry_t *r)
+{
+ return r->maxtries - r->tries;
+}
+
+#endif /* _H5retry_private_H */
diff --git a/src/H5system.c b/src/H5system.c
index 24935fd..fff4c8c 100644
--- a/src/H5system.c
+++ b/src/H5system.c
@@ -1448,15 +1448,32 @@ done:
void
H5_nanosleep(uint64_t nanosec)
{
+ const uint64_t nanosec_per_sec = 1000 * 1000 * 1000;
struct timespec sleeptime; /* Struct to hold time to sleep */
FUNC_ENTER_NOAPI_NOINIT_NOERR
- /* Set up time to sleep */
- sleeptime.tv_sec = 0;
- sleeptime.tv_nsec = (long)nanosec;
+ /* Set up time to sleep
+ *
+ * Assuming ILP32 or LP64 or wider architecture, (long)operand
+ * satisfies 0 <= operand < nanosec_per_sec < LONG_MAX.
+ *
+ * It's harder to be sure that we don't overflow time_t.
+ */
+ sleeptime.tv_sec = (time_t)(nanosec / nanosec_per_sec);
+ sleeptime.tv_nsec = (long)(nanosec % nanosec_per_sec);
- HDnanosleep(&sleeptime, NULL);
+ /* Sleep for up to `sleeptime` and, in the event of an interruption,
+ * save the unslept time back to `sleeptime`.
+ */
+ while (HDnanosleep(&sleeptime, &sleeptime) == -1) {
+ /* If we were just interrupted, sleep for the remaining time.
+ * Otherwise, the error was essentially impossible, so just stop
+ * sleeping.
+ */
+ if (errno != EINTR)
+ break;
+ }
FUNC_LEAVE_NOAPI_VOID
} /* end H5_nanosleep() */
diff --git a/src/H5time_private.h b/src/H5time_private.h
new file mode 100644
index 0000000..30cbaf1
--- /dev/null
+++ b/src/H5time_private.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2019 The HDF Group. All rights reserved.
+ *
+ * This file is part of HDF5. The full HDF5 copyright notice, including
+ * terms governing use, modification, and redistribution, is contained in
+ * the COPYING file, which can be found at the root of the source code
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases.
+ * If you do not have access to either file, you may request a copy from
+ * help@hdfgroup.org.
+ */
+
+/*
+ * Portions of this file derive from <sys/time.h> in NetBSD. Applicable
+ * copyright notices and licenses are reproduced here:
+ */
+
+/*-
+ * Copyright (c) 2017 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)time.h 8.5 (Berkeley) 5/4/95
+ */
+
+#ifndef _H5time_private_H
+#define _H5time_private_H
+
+#ifdef __NetBSD__
+#include <sys/time.h>
+#else
+#define timespecclear(tsp) (tsp)->tv_sec = (time_t)((tsp)->tv_nsec = 0L)
+#define timespecisset(tsp) ((tsp)->tv_sec || (tsp)->tv_nsec)
+#define timespeccmp(tsp, usp, cmp) \
+ (((tsp)->tv_sec == (usp)->tv_sec) ? \
+ ((tsp)->tv_nsec cmp (usp)->tv_nsec) : \
+ ((tsp)->tv_sec cmp (usp)->tv_sec))
+#define timespecadd(tsp, usp, vsp) \
+ do { \
+ (vsp)->tv_sec = (tsp)->tv_sec + (usp)->tv_sec; \
+ (vsp)->tv_nsec = (tsp)->tv_nsec + (usp)->tv_nsec; \
+ if ((vsp)->tv_nsec >= 1000000000L) { \
+ (vsp)->tv_sec++; \
+ (vsp)->tv_nsec -= 1000000000L; \
+ } \
+ } while (/* CONSTCOND */ 0)
+#define timespecsub(tsp, usp, vsp) \
+ do { \
+ (vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec; \
+ (vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec; \
+ if ((vsp)->tv_nsec < 0) { \
+ (vsp)->tv_sec--; \
+ (vsp)->tv_nsec += 1000000000L; \
+ } \
+ } while (/* CONSTCOND */ 0)
+#define timespec2ns(x) (((uint64_t)(x)->tv_sec) * 1000000000UL + (uint64_t)(x)->tv_nsec)
+#endif
+
+#endif /* _H5time_private_H */
diff --git a/src/Makefile.am b/src/Makefile.am
index 787b502..cb3fa3c 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -59,11 +59,13 @@ libhdf5_la_SOURCES= H5.c H5checksum.c H5dbg.c H5lib_settings.c H5system.c \
H5F.c H5Faccum.c H5Fcwfs.c H5Fdbg.c H5Fdeprec.c H5Fefc.c H5Ffake.c \
H5Fint.c H5Fio.c H5Fmount.c H5Fquery.c H5Fsfile.c H5Fspace.c \
H5Fsuper.c H5Fsuper_cache.c H5Ftest.c \
+ H5Fvfd_swmr.c \
H5FA.c H5FAcache.c H5FAdbg.c H5FAdblock.c H5FAdblkpage.c H5FAhdr.c \
H5FAint.c H5FAstat.c H5FAtest.c \
H5FD.c H5FDcore.c H5FDfamily.c H5FDhdfs.c H5FDint.c H5FDlog.c \
H5FDmulti.c H5FDsec2.c H5FDspace.c \
H5FDsplitter.c H5FDstdio.c H5FDtest.c \
+ H5FDvfd_swmr.c H5FDvfd_swmr_instr.c \
H5FL.c H5FO.c H5FS.c H5FScache.c H5FSdbg.c H5FSint.c H5FSsection.c \
H5FSstat.c H5FStest.c \
H5G.c H5Gbtree2.c H5Gcache.c H5Gcompact.c H5Gdense.c H5Gdeprec.c \
@@ -72,13 +74,12 @@ libhdf5_la_SOURCES= H5.c H5checksum.c H5dbg.c H5lib_settings.c H5system.c \
H5HF.c H5HFbtree2.c H5HFcache.c H5HFdbg.c H5HFdblock.c H5HFdtable.c \
H5HFhdr.c H5HFhuge.c H5HFiblock.c H5HFiter.c H5HFman.c H5HFsection.c \
H5HFspace.c H5HFstat.c H5HFtest.c H5HFtiny.c \
- H5HG.c H5HGcache.c H5HGdbg.c H5HGquery.c \
+ H5HG.c H5HGcache.c H5HGdbg.c H5HGquery.c H5HGtrap.c \
H5HL.c H5HLcache.c H5HLdbg.c H5HLint.c H5HLprfx.c H5HLdblk.c \
- H5HP.c \
- H5I.c H5Itest.c \
- H5L.c H5Ldeprec.c H5Lexternal.c \
+ H5HP.c H5I.c H5Itest.c H5L.c H5Ldeprec.c H5Lexternal.c \
H5M.c \
H5MF.c H5MFaggr.c H5MFdbg.c H5MFsection.c \
+ H5MV.c H5MVsection.c \
H5MM.c H5MP.c H5MPtest.c \
H5O.c H5Odeprec.c H5Oainfo.c H5Oalloc.c H5Oattr.c H5Oattribute.c \
H5Obogus.c H5Obtreek.c H5Ocache.c H5Ocache_image.c H5Ochunk.c \
@@ -113,8 +114,9 @@ libhdf5_la_SOURCES= H5.c H5checksum.c H5dbg.c H5lib_settings.c H5system.c \
H5VLnative_token.c \
H5VLpassthru.c \
H5VM.c H5WB.c H5Z.c \
- H5Zdeflate.c H5Zfletcher32.c H5Znbit.c H5Zshuffle.c H5Zscaleoffset.c \
- H5Zszip.c H5Ztrans.c
+ H5Zdeflate.c H5Zfletcher32.c H5Znbit.c H5Zshuffle.c \
+ H5Zscaleoffset.c H5Zszip.c H5Ztrans.c \
+ hlog.c
# Only compile parallel sources if necessary
if BUILD_PARALLEL_CONDITIONAL
@@ -137,13 +139,16 @@ if ROS3_VFD_CONDITIONAL
endif
# Public headers
-include_HEADERS = hdf5.h H5api_adpt.h H5overflow.h H5pubconf.h H5public.h H5version.h \
+include_HEADERS = hdf5.h H5api_adpt.h H5overflow.h H5pubconf.h H5public.h \
+ H5queue.h \
+ H5version.h \
H5Apublic.h H5ACpublic.h \
H5Cpublic.h H5Dpublic.h \
H5Epubgen.h H5Epublic.h H5ESpublic.h H5Fpublic.h \
H5FDpublic.h H5FDcore.h H5FDdirect.h H5FDfamily.h H5FDhdfs.h \
H5FDlog.h H5FDmirror.h H5FDmpi.h H5FDmpio.h H5FDmulti.h H5FDros3.h \
H5FDsec2.h H5FDsplitter.h H5FDstdio.h H5FDwindows.h \
+ H5FDvfd_swmr.h \
H5Gpublic.h H5Ipublic.h H5Lpublic.h \
H5Mpublic.h H5MMpublic.h H5Opublic.h H5Ppublic.h \
H5PLextern.h H5PLpublic.h \
diff --git a/src/hdf5.h b/src/hdf5.h
index eaaf8ae..7df67f7 100644
--- a/src/hdf5.h
+++ b/src/hdf5.h
@@ -53,6 +53,7 @@
#include "H5FDsec2.h" /* POSIX unbuffered file I/O */
#include "H5FDsplitter.h" /* Twin-channel (R/W & R/O) I/O passthrough */
#include "H5FDstdio.h" /* Standard C buffered I/O */
+#include "H5FDvfd_swmr.h" /* VFD SWMR reader VFD */
#ifdef H5_HAVE_WINDOWS
#include "H5FDwindows.h" /* Win32 I/O */
#endif
diff --git a/src/hlog.c b/src/hlog.c
new file mode 100644
index 0000000..d2d0e51
--- /dev/null
+++ b/src/hlog.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2004, 2005, 2006, 2007 David Young. All rights reserved.
+ *
+ * See COPYING at the top of the HDF5 distribution for license terms.
+ */
+/*
+ * Copyright (c) 2004 Urbana-Champaign Independent Media Center.
+ * All rights reserved.
+ *
+ * See COPYING at the top of the HDF5 distribution for license terms.
+ */
+#include <err.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h> /* for uintmax_t */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include <sys/param.h>
+#include <sys/cdefs.h>
+
+#include "hlog.h"
+#include "H5time_private.h"
+
+TAILQ_HEAD(, hlog_outlet) hlog_outlets = TAILQ_HEAD_INITIALIZER(hlog_outlets);
+
+HLOG_OUTLET_TOP_DEFN(all);
+
+static struct timespec timestamp_zero;
+
+void hlog_init(void) _constructor;
+static void hlog_init_timestamps(void);
+
+void
+hlog_init(void)
+{
+ const char *settings0;
+ char *item, *settings;
+
+ if ((settings0 = getenv("HLOG")) == NULL)
+ return;
+
+ if ((settings = strdup(settings0)) == NULL) {
+ warn("%s: cannot duplicate settings string", __func__);
+ return;
+ }
+
+ while ((item = strsep(&settings, " ,")) != NULL) {
+ hlog_outlet_state_t state;
+ char key[64 + 1], val[4 + 1]; // + 1 for the terminating NUL
+ int nconverted;
+
+ nconverted = sscanf(item, " %64[0-9a-z_] = %4s ", key, val);
+ if (nconverted != 2) {
+ warnx("%s: malformed HLOG item \"%s\"", __func__, item);
+ continue;
+ }
+
+ if (strcmp(val, "on") == 0 || strcmp(val, "yes") == 0)
+ state = HLOG_OUTLET_S_ON;
+ else if (strcmp(val, "off") == 0 || strcmp(val, "no") == 0)
+ state = HLOG_OUTLET_S_OFF;
+ else if (strcmp(val, "pass") == 0)
+ state = HLOG_OUTLET_S_PASS;
+ else {
+ warnx("%s: bad HLOG value \"%s\" in item \"%s\"", __func__,
+ val, item);
+ continue;
+ }
+
+ if (hlog_set_state(key, state, true) == -1) {
+ warn("%s: could not set state for HLOG item \"%s\"", __func__,
+ item);
+ }
+ }
+
+ free(settings);
+}
+
+
+static void
+hlog_init_timestamps(void)
+{
+ static bool initialized = false;
+
+ if (initialized)
+ return;
+
+ if (clock_gettime(CLOCK_MONOTONIC, &timestamp_zero) == -1)
+ err(EXIT_FAILURE, "%s: clock_gettime", __func__);
+
+ initialized = true;
+}
+
+static void
+hlog_print_time(void)
+{
+ struct timespec elapsed, now;
+
+ hlog_init_timestamps();
+
+ if (clock_gettime(CLOCK_MONOTONIC, &now) == -1)
+ err(EXIT_FAILURE, "%s: clock_gettime", __func__);
+
+ timespecsub(&now, &timestamp_zero, &elapsed);
+
+ fprintf(stderr, "%ju.%.9ld ", (uintmax_t)elapsed.tv_sec, elapsed.tv_nsec);
+}
+
void
vhlog(const char *fmt, va_list ap)
{
    /* Emit one timestamped, newline-terminated log line on stderr. */
    hlog_print_time();
    (void)vfprintf(stderr, fmt, ap);
    (void)fprintf(stderr, "\n");
}
+
/* Return a freshly malloc'd copy of `fmt0` with ": <strerror(errno)>"
 * appended, for errno-reporting log variants.  Exits on malloc failure
 * (err() does not return), so a non-NULL result is guaranteed.
 * Caller frees the result.
 */
static char *
message_extend_stderr(const char *fmt0)
{
    static const char sep[] = ": ";
    const char *m;
    char *fmt;
    size_t fmtlen;

    /* Capture the errno text before calling anything that may clobber
     * errno (e.g. malloc).
     */
    m = strerror(errno);

    /* sizeof(sep) counts sep's NUL, which pays for fmt's terminator. */
    fmtlen = strlen(fmt0) + strlen(m) + sizeof(sep);

    if ((fmt = malloc(fmtlen)) == NULL)
        err(EXIT_FAILURE, "%s: malloc failed", __func__);
        /* not reached: err() exits, so no dead `return NULL` here */

    (void)snprintf(fmt, fmtlen, "%s%s%s", fmt0, sep, m);

    return fmt;
}
+
/* Indirection point for message decoration; today the only strategy is
 * the stderr one (append ": <strerror(errno)>").
 */
static char *
message_extend(const char *fmt0)
{
    return message_extend_stderr(fmt0);
}
+
void
vhlog_err(int status, const char *fmt0, va_list ap)
{
    /* Log `fmt0` plus the errno text, then terminate with `status`. */
    char *xfmt = message_extend(fmt0);

    if (xfmt != NULL) {
        vhlog(xfmt, ap);
        free(xfmt);
    }

    exit(status);
}
+
void
vhlog_errx(int status, const char *fmt, va_list ap)
{
    /* Like vhlog_err() but without the errno suffix. */
    vhlog(fmt, ap);
    exit(status);
}
+
void
vhlog_warn(const char *fmt0, va_list ap)
{
    /* Non-fatal log of `fmt0` plus the errno text. */
    char *xfmt;

    if ((xfmt = message_extend(fmt0)) == NULL)
        return;

    vhlog(xfmt, ap);
    free(xfmt);
}
+
void
vhlog_warnx(const char *fmt, va_list ap)
{
    /* Non-fatal log without the errno suffix. */
    vhlog(fmt, ap);
}
+
void
hlog_err(int status, const char *fmt, ...)
{
    /* Variadic front end for vhlog_err(); does not return. */
    va_list args;

    va_start(args, fmt);
    vhlog_err(status, fmt, args);
    va_end(args);
}
+
void
hlog_errx(int status, const char *fmt, ...)
{
    /* Variadic front end for vhlog_errx(); does not return. */
    va_list args;

    va_start(args, fmt);
    vhlog_errx(status, fmt, args);
    va_end(args);
}
+
void
hlog_warn(const char *fmt, ...)
{
    /* Variadic front end for vhlog_warn(). */
    va_list args;

    va_start(args, fmt);
    vhlog_warn(fmt, args);
    va_end(args);
}
+
void
hlog_warnx(const char *fmt, ...)
{
    /* Variadic front end for vhlog_warnx(). */
    va_list args;

    va_start(args, fmt);
    vhlog_warnx(fmt, args);
    va_end(args);
}
+
+struct hlog_outlet *
+hlog_outlet_find_active(struct hlog_outlet *ls0)
+{
+ struct hlog_outlet *ls;
+
+ HLOG_OUTLET_FOREACH(ls, ls0) {
+ switch (ls->ls_state) {
+ case HLOG_OUTLET_S_PASS:
+ continue;
+ case HLOG_OUTLET_S_OFF:
+ return NULL;
+ case HLOG_OUTLET_S_ON:
+ default:
+ return ls;
+ }
+ }
+ return NULL;
+}
+
+void
+hlog_always(struct hlog_outlet *ls _unused, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vhlog(fmt, ap);
+ va_end(ap);
+}
+
+void
+hlog_impl(struct hlog_outlet *ls0, const char *fmt, ...)
+{
+ struct hlog_outlet *ls;
+ va_list ap;
+
+ if ((ls = hlog_outlet_find_active(ls0)) == NULL) {
+ ls0->ls_resolved = HLOG_OUTLET_S_OFF;
+ return;
+ }
+
+ ls0->ls_resolved = HLOG_OUTLET_S_ON;
+
+ va_start(ap, fmt);
+ vhlog(fmt, ap);
+ va_end(ap);
+}
+
+static void
+hlog_outlet_reset_all(void)
+{
+ struct hlog_outlet *ls;
+
+ TAILQ_FOREACH(ls, &hlog_outlets, ls_next)
+ ls->ls_resolved = HLOG_OUTLET_S_PASS;
+}
+
+struct hlog_outlet *
+hlog_outlet_lookup(const char *name)
+{
+ struct hlog_outlet *ls;
+
+ TAILQ_FOREACH(ls, &hlog_outlets, ls_next) {
+ if (strcmp(ls->ls_name, name) == 0)
+ return ls;
+ }
+ return NULL;
+}
+
+static struct hlog_outlet *
+hlog_outlet_create(const char *name)
+{
+ struct hlog_outlet *ls;
+
+ if ((ls = calloc(1, sizeof(*ls))) == NULL)
+ return NULL;
+ else if ((ls->ls_name0 = strdup(name)) == NULL) {
+ free(ls);
+ return NULL;
+ }
+ ls->ls_name = ls->ls_name0;
+ ls->ls_rendezvous = true;
+ return ls;
+}
+
+static void
+hlog_outlet_destroy(struct hlog_outlet *ls)
+{
+ /*LINTED*/
+ if (ls->ls_name0 != NULL)
+ free(ls->ls_name0);
+ free(ls);
+}
+
+int
+hlog_set_state(const char *name, hlog_outlet_state_t state, bool rendezvous)
+{
+ struct hlog_outlet *ls;
+ errno = 0;
+
+ switch (state) {
+ case HLOG_OUTLET_S_PASS:
+ case HLOG_OUTLET_S_OFF:
+ case HLOG_OUTLET_S_ON:
+ break;
+ default:
+ errno = EINVAL;
+ return -1;
+ }
+ if ((ls = hlog_outlet_lookup(name)) == NULL && !rendezvous) {
+ errno = ESRCH;
+ return -1;
+ } else if (ls == NULL) {
+ if ((ls = hlog_outlet_create(name)) == NULL)
+ return -1;
+ TAILQ_INSERT_TAIL(&hlog_outlets, ls, ls_next);
+ }
+ ls->ls_state = state;
+ hlog_outlet_reset_all();
+ return 0;
+}
+
+void
+hlog_outlet_register(struct hlog_outlet *ls_arg)
+{
+ struct hlog_outlet *ls;
+ if ((ls = hlog_outlet_lookup(ls_arg->ls_name)) == NULL ||
+ ls->ls_rendezvous) {
+ TAILQ_INSERT_TAIL(&hlog_outlets, ls_arg, ls_next);
+ if (ls == NULL)
+ return;
+ warnx("%s: rendezvous with log-outlet '%s'", __func__,
+ ls->ls_name);
+ ls_arg->ls_state = ls->ls_state;
+ TAILQ_REMOVE(&hlog_outlets, ls, ls_next);
+ hlog_outlet_destroy(ls);
+ } else
+ warnx("%s: duplicate log-outlet, '%s'", __func__, ls->ls_name);
+}
diff --git a/src/hlog.h b/src/hlog.h
new file mode 100644
index 0000000..2489a47
--- /dev/null
+++ b/src/hlog.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2004, 2005, 2006, 2007 David Young. All rights reserved.
+ *
+ * See COPYING at the top of the HDF5 distribution for license terms.
+ */
+/*
+ * Copyright (c) 2004 Urbana-Champaign Independent Media Center.
+ * All rights reserved.
+ *
+ * See COPYING at the top of the HDF5 distribution for license terms.
+ */
+#ifndef _HLOG_H
+#define _HLOG_H
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <syslog.h>
+#include <sys/cdefs.h>
+
+#include "H5queue.h"
+
+#ifndef _unused
+#define _unused __attribute__((unused))
+#endif
+
+#ifndef _constructor
+#define _constructor __attribute__((constructor))
+#endif
+
+#ifndef _noreturn
+#define _noreturn __attribute__((__noreturn__))
+#endif
+
+#ifndef _printflike
+#define _printflike(_fmt, _args) \
+ __attribute__((__format__(__printf__,_fmt,_args)))
+#endif
+
+enum hlog_outlet_state {
+ HLOG_OUTLET_S_PASS = 0
+ , HLOG_OUTLET_S_OFF = 1
+ , HLOG_OUTLET_S_ON = 2
+};
+
+typedef enum hlog_outlet_state hlog_outlet_state_t;
+
+struct hlog_outlet {
+ hlog_outlet_state_t ls_resolved;
+ struct hlog_outlet *ls_parent;
+ hlog_outlet_state_t ls_state;
+ const char *ls_name;
+ char *ls_name0;
+ bool ls_rendezvous;
+ TAILQ_ENTRY(hlog_outlet) ls_next;
+};
+
+typedef struct hlog_outlet hlog_outlet_t;
+
+#define HLOG_CONSTRUCTOR(__sym) \
+void hlog_constructor_##__sym(void) _constructor; \
+void \
+hlog_constructor_##__sym(void) \
+{ \
+ hlog_outlet_register(&__sym); \
+} \
+void hlog_undefined_##__sym(void) _constructor
+
+#define HLOG_OUTLET_FOREACH(__le, __le0) \
+ for ((__le) = (__le0); (__le) != NULL; (__le) = (__le)->ls_parent)
+
+#define HLOG_OUTLET_DECL1(__sym) extern struct hlog_outlet __sym
+
+#define HLOG_JOIN_SYMS(x, y) x ## y
+
+#define HLOG_PREFIX(_sfx) HLOG_JOIN_SYMS(hlog_gbl_, _sfx)
+
+#define HLOG_OUTLET_DECL(__name) HLOG_OUTLET_DECL1(HLOG_PREFIX(__name))
+
+#define HLOG_OUTLET_DEFN(__sym, __name, __parent, __state) \
+ struct hlog_outlet __sym = { \
+ .ls_name = __name \
+ , .ls_parent = (__parent) \
+ , .ls_state = (__state) \
+ }; \
+ HLOG_CONSTRUCTOR(__sym)
+
+#define HLOG_OUTLET_MEDIUM_DEFN(__name, __parent, __state) \
+ HLOG_OUTLET_DEFN(HLOG_PREFIX(__name), #__name, &HLOG_PREFIX(__parent), \
+ __state)
+
+#define HLOG_OUTLET_SHORT_DEFN(__name, __parent) \
+ HLOG_OUTLET_MEDIUM_DEFN(__name, __parent, HLOG_OUTLET_S_PASS)
+
+#define HLOG_OUTLET_TOP_DEFN(__name) \
+ HLOG_OUTLET_DEFN(HLOG_PREFIX(__name), #__name, NULL, HLOG_OUTLET_S_PASS)
+
+HLOG_OUTLET_DECL(all);
+
+#define hlog(_name, _fmt, ...) \
+ hlog_impl(&HLOG_PREFIX(_name), _fmt, __VA_ARGS__)
+
+#define hlog_fast(_name, ...) \
+ do { \
+ hlog_outlet_t *_ls0 = &HLOG_PREFIX(_name); \
+ \
+ if (_ls0->ls_resolved == HLOG_OUTLET_S_OFF) \
+ break; \
+ else if (_ls0->ls_resolved == HLOG_OUTLET_S_ON) \
+ hlog_always(_ls0, __VA_ARGS__); \
+ else \
+ hlog_impl(_ls0, __VA_ARGS__); \
+ } while (/*CONSTCOND*/0)
+
+struct hlog_outlet *hlog_outlet_find_active(struct hlog_outlet *);
+void hlog_outlet_register(struct hlog_outlet *);
+struct hlog_outlet *hlog_outlet_lookup(const char *);
+int hlog_set_state(const char *, hlog_outlet_state_t, bool);
+
+void vhlog(const char *, va_list) _printflike(1,0);
+
+void vhlog_warn(const char *, va_list) _printflike(1,0);
+void vhlog_warnx(const char *, va_list) _printflike(1,0);
+void vhlog_err(int, const char *, va_list) _printflike(2,0) _noreturn;
+void vhlog_errx(int, const char *, va_list) _printflike(2,0) _noreturn;
+
+void hlog_warnx(const char *, ...) _printflike(1,2);
+void hlog_warn(const char *, ...) _printflike(1,2);
+
+void hlog_err(int, const char *, ...) _printflike(2,3) _noreturn;
+void hlog_errx(int, const char *, ...) _printflike(2,3) _noreturn;
+
+void hlog_always(struct hlog_outlet *, const char *, ...)
+ _printflike(2,3);
+
+void hlog_impl(struct hlog_outlet *, const char *, ...)
+ _printflike(2,3);
+
+#endif /* _HLOG_H */