Diffstat (limited to 'src')
-rw-r--r--  src/H5C.c           75
-rw-r--r--  src/H5Cimage.c      29
-rw-r--r--  src/H5Cmpio.c       13
-rw-r--r--  src/H5Cpkg.h       165
-rw-r--r--  src/H5Cprivate.h     2
-rw-r--r--  src/H5Cquery.c     108
-rw-r--r--  src/H5Ctest.c       56
-rw-r--r--  src/H5PB.c        1219
-rw-r--r--  src/H5PBpkg.h       26
-rw-r--r--  src/H5PBprivate.h   23
10 files changed, 1443 insertions, 273 deletions
diff --git a/src/H5C.c b/src/H5C.c
index abea0d4..aa3428b 100644
--- a/src/H5C.c
+++ b/src/H5C.c
@@ -477,6 +477,10 @@ H5C_create(size_t max_cache_size,
cache_ptr->rdfsm_settled = FALSE;
cache_ptr->mdfsm_settled = FALSE;
+ /* fields supporting page buffer hints */
+ cache_ptr->curr_io_type = NULL;
+ cache_ptr->curr_read_speculative = FALSE;
+
if(H5C_reset_cache_hit_rate_stats(cache_ptr) < 0)
/* this should be impossible... */
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, NULL, "H5C_reset_cache_hit_rate_stats failed")
@@ -487,6 +491,7 @@ H5C_create(size_t max_cache_size,
#ifndef NDEBUG
cache_ptr->get_entry_ptr_from_addr_counter = 0;
+ cache_ptr->curr_io_type = NULL;
#endif /* NDEBUG */
/* Set return value */
@@ -974,10 +979,13 @@ done:
*
* Programmer: John Mainzer -- 12/16/18
*
- * Changes: None.
+ * Changes: Added macro calls to maintain the page buffer hints.
+ *
+ * JRM -- 3/20/20
*
*-------------------------------------------------------------------------
*/
+
herr_t
H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page,
uint32_t length, uint64_t tick)
@@ -994,7 +1002,7 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page,
H5C_cache_entry_t * entry_ptr;
H5C_cache_entry_t * follow_ptr = NULL;
herr_t ret_value = SUCCEED; /* Return value */
- bool found = false;
+ hbool_t found = FALSE;
FUNC_ENTER_NOAPI(FAIL)
@@ -1036,7 +1044,7 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page,
page * cache_ptr->page_size + length <=
entry_ptr->addr + entry_ptr->size);
- found = true;
+ found = TRUE;
/* since end of tick occurs only on API call entry in
* the VFD SWMR reader case, the entry must not be protected.
@@ -1134,12 +1142,17 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page,
H5C_IMAGE_EXTRA_SPACE);
#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
+ H5C__SET_PB_READ_HINTS(cache_ptr, entry_ptr->type, TRUE)
+
if ( H5F_block_read(f, entry_ptr->type->mem_type,
entry_ptr->addr,
- image_len, image_ptr) < 0 )
+ image_len, image_ptr) < 0 ) {
+ H5C__RESET_PB_READ_HINTS(cache_ptr)
HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, \
"Can't read image (1)")
+ }
+ H5C__RESET_PB_READ_HINTS(cache_ptr)
/* 3) Call the refresh callback. If it doesn't
* request a different image size, goto 6)
@@ -1171,12 +1184,18 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page,
H5C_IMAGE_EXTRA_SPACE);
#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
+ H5C__SET_PB_READ_HINTS(cache_ptr, entry_ptr->type, TRUE)
+
if ( H5F_block_read(f, entry_ptr->type->mem_type,
entry_ptr->addr,
- image_len, image_ptr) < 0 )
+ image_len, image_ptr) < 0 ) {
+
+ H5C__RESET_PB_READ_HINTS(cache_ptr)
HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, \
"Can't read image (2)")
+ }
+ H5C__RESET_PB_READ_HINTS(cache_ptr)
/* 5) Call the refresh callback again. Requesting
* a different buffer size again is an error.
@@ -6494,6 +6513,14 @@ done:
*
* Programmer: John Mainzer, 5/5/04
*
+ * Changes: Please maintain the changes list, and do not delete it
+ * unless you have merged it into the header comment
+ * proper.
+ *
+ * Added macro calls to maintain page buffer hints.
+ *
+ * JRM -- 3/20/20
+ *
*-------------------------------------------------------------------------
*/
herr_t
@@ -6679,8 +6706,18 @@ H5C__flush_single_entry(H5F_t *f, H5C_cache_entry_t *entry_ptr, unsigned flags)
else
mem_type = entry_ptr->type->mem_type;
- if(H5F_block_write(f, mem_type, entry_ptr->addr, entry_ptr->size, entry_ptr->image_ptr) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't write image to file")
+ H5C__SET_PB_WRITE_HINTS(cache_ptr, entry_ptr->type)
+
+ if ( H5F_block_write(f, mem_type, entry_ptr->addr,
+ entry_ptr->size,
+ entry_ptr->image_ptr) < 0 ) {
+
+ H5C__RESET_PB_WRITE_HINTS(cache_ptr)
+
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \
+ "Can't write image to file")
+ }
+ H5C__RESET_PB_WRITE_HINTS(cache_ptr)
#ifdef H5_HAVE_PARALLEL
}
#endif /* H5_HAVE_PARALLEL */
@@ -7082,6 +7119,10 @@ done:
* small.
* JRM -- 3/25/20
*
+ * Added macro calls to maintain the page buffer read hints.
+ *
+ * JRM -- 3/20/20
+ *
*-------------------------------------------------------------------------
*/
static void *
@@ -7233,10 +7274,18 @@ H5C_load_entry(H5F_t * f,
if ( !coll_access || 0 == mpi_rank ) {
#endif /* H5_HAVE_PARALLEL */
- if ( H5F_block_read(f, type->mem_type, addr, len, image) < 0 )
+ H5C__SET_PB_READ_HINTS(f->shared->cache, type, TRUE)
+
+ if ( H5F_block_read(f, type->mem_type, addr, len, image) < 0 ) {
+
+ H5C__RESET_PB_READ_HINTS(f->shared->cache)
HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, \
"Can't read image*")
+ }
+
+ H5C__RESET_PB_READ_HINTS(f->shared->cache)
+
#ifdef H5_HAVE_PARALLEL
} /* end if */
/* if the collective metadata read optimization is turned on,
@@ -7345,11 +7394,19 @@ H5C_load_entry(H5F_t * f,
*
* JRM -- 3/24/20
*/
+
+ H5C__SET_PB_READ_HINTS(f->shared->cache, type, \
+ FALSE);
+
if ( H5F_block_read(f, type->mem_type, addr,
- actual_len, image) < 0)
+ actual_len, image) < 0 ) {
+
+ H5C__RESET_PB_READ_HINTS(f->shared->cache)
HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, \
"can't read image")
+ }
+ H5C__RESET_PB_READ_HINTS(f->shared->cache)
#endif /* JRM */
#ifdef H5_HAVE_PARALLEL
}
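
The pattern applied throughout the H5C.c hunks above can be summarized as follows. This is a minimal sketch, not an additional hunk: the cache_ptr, entry_ptr, f, image_len, and image_ptr locals are assumed from the enclosing function. The key point is that the reset macro must run on both the success and the error path, since H5C__SET_PB_READ_HINTS() asserts that curr_io_type is NULL on entry, and HGOTO_ERROR() jumps straight to the function's done: label.

    H5C__SET_PB_READ_HINTS(cache_ptr, entry_ptr->type, TRUE)

    if ( H5F_block_read(f, entry_ptr->type->mem_type, entry_ptr->addr,
                        image_len, image_ptr) < 0 ) {

        /* reset before HGOTO_ERROR() bypasses the normal exit path */
        H5C__RESET_PB_READ_HINTS(cache_ptr)
        HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, "Can't read image")
    }
    H5C__RESET_PB_READ_HINTS(cache_ptr)
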
diff --git a/src/H5Cimage.c b/src/H5Cimage.c
index ee286d9..9a6d667 100644
--- a/src/H5Cimage.c
+++ b/src/H5Cimage.c
@@ -1058,6 +1058,22 @@ H5C__read_cache_image(H5F_t *f, H5C_t *cache_ptr)
#endif /* H5_HAVE_PARALLEL */
/* Read the buffer (if serial access, or rank 0 of parallel access) */
+
+ /* No need to set the page buffer hints here: if paged
+ * allocation is in use, we know that the cache image was allocated
+ * directly from the free space manager, and thus either doesn't
+ * cross page boundaries, or is page aligned. Between this,
+ * and the fact that the cache image is never read speculatively,
+ * the page buffer should never request hints in this context.
+ *
+ * If for some reason it does, the NULL curr_io_type will trigger
+ * an assertion failure.
+ *
+ * Note that we will have to revisit this if we ever use
+ * cache_ptr->curr_io_type for something other than sanity
+ * checking.
+ * JRM -- 3/30/20
+ */
if(H5F_block_read(f, H5FD_MEM_SUPER, cache_ptr->image_addr,
cache_ptr->image_len, cache_ptr->image_buffer) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, "Can't read metadata cache image block")
@@ -3554,6 +3570,19 @@ H5C__write_cache_image(H5F_t *f, const H5C_t *cache_ptr)
#endif /* H5_HAVE_PARALLEL */
/* Write the buffer (if serial access, or rank 0 for parallel access) */
+
+ /* No need to set the page buffer hints here.
+ *
+ * If paged allocation is in use, we know that the cache image
+ * was allocated directly from the free space manager, and thus
+ * either doesn't cross page boundaries, or is page aligned.
+ * Thus it should never trigger the sanity checks in the page buffer.
+ *
+ * If for some reason it does, the NULL curr_io_type will trigger
+ * an assertion failure.
+ *
+ * JRM -- 3/30/20
+ */
if(H5F_block_write(f, H5FD_MEM_SUPER, cache_ptr->image_addr, cache_ptr->image_len, cache_ptr->image_buffer) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't write metadata cache image block to file")
#ifdef H5_HAVE_PARALLEL
diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c
index 0ac4c4f..e3c60a6 100644
--- a/src/H5Cmpio.c
+++ b/src/H5Cmpio.c
@@ -1018,6 +1018,19 @@ H5C__collective_write(H5F_t *f)
HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O properties")
/* Write data */
+ /*
+ * At present the page buffer is disabled in the parallel case, and
+ * thus VFD SWMR can't be used either. Thus, for now, there is
+ * no point in setting the page buffer hints.
+ *
+ * More to the point, since we are actually writing a derived type
+ * containing multiple metadata cache entries, we couldn't set it
+ * to a meaningful value.
+ *
+ * When we enable the page buffer in parallel, we will have to
+ * revisit this.
+ * JRM -- 3/30/20
+ */
if(H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0, (size_t)1, base_buf) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to write entries collectively")
diff --git a/src/H5Cpkg.h b/src/H5Cpkg.h
index d9a1641..a5eafd6 100644
--- a/src/H5Cpkg.h
+++ b/src/H5Cpkg.h
@@ -3480,6 +3480,102 @@ if ( ( (entry_ptr) == NULL ) || \
} /* H5C__MOVE_TO_TOP_IN_COLL_LIST */
#endif /* H5_HAVE_PARALLEL */
+
+/***************************************/
+/* page buffer hint maintenance macros */
+/***************************************/
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro: H5C__SET/RESET_PB_READ_HINTS
+ *
+ * Purpose: Set or reset the fields needed to provide hints to the
+ * page buffer so that it can disambiguate between speculative
+ * reads that cross page boundaries and reads of metadata
+ * entries that cross page boundaries without starting on
+ * a page boundary. This latter behaviour shouldn't happen,
+ * and the hints allow the page buffer to detect such
+ * behaviour by an unexpected cache client.
+ *
+ * See the discussion of the PB hint fields in the header
+ * comment for H5C_t for further details.
+ *
+ * Return: N/A
+ *
+ * Programmer: John Mainzer, 3/30/20
+ *
+ * Modifications:
+ *
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5C__SET_PB_READ_HINTS(cache_ptr, type, may_be_speculative) \
+{ \
+ HDassert(cache_ptr); \
+ HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC); \
+ HDassert((cache_ptr)->curr_io_type == NULL); \
+ HDassert(type); \
+ (cache_ptr)->curr_io_type = (type); \
+ (cache_ptr)->curr_read_speculative = (may_be_speculative) && \
+ ((cache_ptr)->curr_io_type->flags & H5AC__CLASS_SPECULATIVE_LOAD_FLAG); \
+ \
+} /* H5C__SET_PB_READ_HINTS() */
+
+#define H5C__RESET_PB_READ_HINTS(cache_ptr) \
+{ \
+ HDassert(cache_ptr); \
+ HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC); \
+ HDassert((cache_ptr)->curr_io_type); \
+ (cache_ptr)->curr_io_type = NULL; \
+ (cache_ptr)->curr_read_speculative = FALSE; \
+ \
+} /* H5C__RESET_PB_READ_HINTS() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Macro: H5C__SET/RESET_PB_WRITE_HINTS
+ *
+ * Purpose: Set or reset the fields needed to provide hints to the
+ * page buffer so that it can detect unexpected writes of
+ * metadata entries that cross page boundaries and do not
+ * start on page boundaries.
+ *
+ * See the discussion of the PB hint fields in the header
+ * comment for H5C_t for further details.
+ *
+ * Return: N/A
+ *
+ * Programmer: John Mainzer, 3/30/20
+ *
+ * Modifications:
+ *
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define H5C__SET_PB_WRITE_HINTS(cache_ptr, type) \
+{ \
+ HDassert(cache_ptr); \
+ HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC); \
+ HDassert((cache_ptr)->curr_io_type == NULL); \
+ HDassert(type); \
+ (cache_ptr)->curr_io_type = (type); \
+ \
+} /* H5C__SET_PB_WRITE_HINTS() */
+
+#define H5C__RESET_PB_WRITE_HINTS(cache_ptr) \
+{ \
+ HDassert(cache_ptr); \
+ HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC); \
+ HDassert((cache_ptr)->curr_io_type); \
+ (cache_ptr)->curr_io_type = NULL; \
+ \
+} /* H5C__RESET_PB_WRITE_HINTS() */
+
/****************************/
/* Package Private Typedefs */
@@ -4413,6 +4509,47 @@ typedef struct H5C_tag_info_t {
* managers that are involved in allocating space for free
* space managers.
*
+ * Page Buffer Related Fields:
+ *
+ * Due to the irregular behavior of some of the cache clients, the
+ * page buffer occasionally needs hints to manage metadata I/O requests
+ * from the metadata cache -- particularly in the context of VFD SWMR.
+ * The following fields exist to support this.
+ *
+ *
+ * curr_io_type: Pointer to the instance of H5C_class_t associated with
+ * the current I/O operation. This pointer should be set
+ * just before any I/O operation by the metadata cache, and
+ * re-set to NULL immediately thereafter.
+ *
+ * This field exists because the fixed and variable length
+ * array cache clients allocate numerous entries in a single
+ * block, and sub-allocate metadata cache entries out of this
+ * block. The effect of this is to break the invariant,
+ * normally maintained by the free space managers in paged
+ * allocation mode, that no entry of less than a page in
+ * size crosses page boundaries, and that entries of page
+ * size or greater are page aligned. This in turn causes
+ * problems for the page buffer -- particularly in VFD SWMR
+ * mode.
+ *
+ * The correct solution is to modify the fixed and variable
+ * length array cache clients to repair this. However, in
+ * the interim, this field exists to detect similar
+ * behaviour elsewhere.
+ *
+ * To complicate matters, speculative reads for metadata
+ * cache entries which must determine their lengths via
+ * inspection of the on disk image of the entry, may mimic
+ * the behaviour of the fixed and extensible arrays. Thus
+ * the curr_io_type is also needed to disambiguate reads.
+ *
+ * curr_read_speculative: Boolean flag indicating whether the current
+ * read request is speculative, and thus not guaranteed to
+ * be of the correct length. The field is used to distinguish
+ * between the initial (speculative) and final read attempts.
+ *
+ *
*
* Statistics collection fields:
*
@@ -4744,6 +4881,28 @@ typedef struct H5C_tag_info_t {
* called successfully. This field is only defined when
* NDEBUG is not #defined.
*
+ * curr_io_type: Pointer to the instance of H5C_class_t associated with
+ * the current I/O operation. This pointer should be set
+ * just before any I/O operation by the metadata cache, and
+ * re-set to NULL immediately thereafter. This field is
+ * only defined when NDEBUG is not #defined.
+ *
+ * This field exists because the fixed and variable length
+ * array cache clients allocate numerous entries in a single
+ * block, and sub-allocate metadata cache entries out of this
+ * block. The effect of this is to break the invarient,
+ * normally maintained by the free space managers in paged
+ * allocation mode, that no entry of less than a page in
+ * size crosses page boundaries, and that entries of page
+ * size or greater are page aligned. This in turn causes
+ * problems for the page buffer -- particularly in VFD SWMR
+ * mode.
+ *
+ * The correct solution is to modify the fixed and variable
+ * length array cache clients to repair this. However, in
+ * the interim, this field exists to detect similar
+ * behaviour elsewhere.
+ *
****************************************************************************/
struct H5C_t {
uint32_t magic;
@@ -4892,6 +5051,10 @@ struct H5C_t {
hbool_t rdfsm_settled;
hbool_t mdfsm_settled;
+ /* Fields supporting page buffer hints */
+ const H5C_class_t * curr_io_type;
+ hbool_t curr_read_speculative;
+
#if H5C_COLLECT_CACHE_STATS
/* stats fields */
int64_t hits[H5C__MAX_NUM_TYPE_IDS + 1];
@@ -5025,6 +5188,8 @@ H5_DLL herr_t H5C__untag_entry(H5C_t *cache, H5C_cache_entry_t *entry);
/* Testing functions */
#ifdef H5C_TESTING
H5_DLL herr_t H5C__verify_cork_tag_test(hid_t fid, H5O_token_t tag_token, hbool_t status);
+H5_DLL void H5C_set_curr_io_type_splitable(H5C_t * cache_ptr,
+ hbool_t set_splitable);
#endif /* H5C_TESTING */
#endif /* _H5Cpkg_H */
diff --git a/src/H5Cprivate.h b/src/H5Cprivate.h
index 23091cb..7678911 100644
--- a/src/H5Cprivate.h
+++ b/src/H5Cprivate.h
@@ -2411,6 +2411,8 @@ H5_DLL herr_t H5C_get_cache_size(H5C_t *cache_ptr, size_t *max_size_ptr,
uint32_t *cur_num_entries_ptr);
H5_DLL herr_t H5C_get_cache_flush_in_progress(H5C_t *cache_ptr, hbool_t *flush_in_progress_ptr);
H5_DLL herr_t H5C_get_cache_hit_rate(H5C_t *cache_ptr, double *hit_rate_ptr);
+H5_DLL int H5C_get_curr_io_client_type(H5C_t * cache_ptr);
+H5_DLL hbool_t H5C_get_curr_read_speculative(H5C_t * cache_ptr);
H5_DLL herr_t H5C_get_entry_status(const H5F_t *f, haddr_t addr,
size_t *size_ptr, hbool_t *in_cache_ptr, hbool_t *is_dirty_ptr,
hbool_t *is_protected_ptr, hbool_t *is_pinned_ptr, hbool_t *is_corked_ptr,
diff --git a/src/H5Cquery.c b/src/H5Cquery.c
index 9f1ec31..477a8ba 100644
--- a/src/H5Cquery.c
+++ b/src/H5Cquery.c
@@ -452,3 +452,111 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* H5C_get_mdc_image_info() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5C_get_curr_io_client_type
+ *
+ * Purpose: Return the type id associated with the metadata cache
+ * client whose data is currently being read or written.
+ *
+ * This id is obtained via the curr_io_type field in
+ * H5C_t, which is set just before most I/O calls from the
+ * metadata cache, and reset to NULL immediately thereafter.
+ *
+ * If cache_ptr->curr_io_type is NULL, the function
+ * returns -1.
+ *
+ * Note: At present, cache_ptr->curr_io_type should always
+ * be defined in the serial case with the exception
+ * of cache image I/O. In general, it is not defined in
+ * the parallel case. This is not a problem for now, as
+ * this function is used in page buffer sanity checking,
+ * and for now at least, the page buffer is not enabled in
+ * the parallel case.
+ *
+ * Return: ID of cache client whose image is being read or written,
+ * or -1 if cache_ptr->curr_io_type is undefined.
+ *
+ * Programmer: John Mainzer
+ * 3/31/20
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+int
+H5C_get_curr_io_client_type(H5C_t * cache_ptr)
+{
+ int ret_value = -1; /* Return value */
+
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ HDassert(cache_ptr);
+ HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
+
+ if ( cache_ptr->curr_io_type ) {
+
+ ret_value = cache_ptr->curr_io_type->id;
+ }
+
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5C_get_curr_io_client_type() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5C_get_curr_read_speculative
+ *
+ * Purpose: Return a boolean flag indicating whether the current
+ * read is speculative.
+ *
+ * Note that this value is only defined during a read generated
+ * by the metadatat cache. At all other times, the return
+ * value undefined (although the current implementation
+ * returns FALSE in such cases).
+ *
+ * Note also that this function exists to provide hints to the
+ * page buffer, which for now at least, is only available in
+ * the serial case. It should not be depended upon in the
+ * parallel case -- at least until verified, and potential
+ * interactions with collective metadata reads are investigated
+ * and dismissed.
+ *
+ * Return: True if the current call to H5F_block_read() by the
+ * metadata cache is an initial read attempt for a cache
+ * client whose speculative read flag is set (in H5AC_class_t),
+ * and false otherwise.
+ *
+ * Return value is undefined if a call to H5F_block_read by
+ * the metadata cache is not in progress.
+ *
+ * Programmer: John Mainzer
+ * 3/31/20
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+hbool_t
+H5C_get_curr_read_speculative(H5C_t * cache_ptr)
+{
+ hbool_t ret_value = FALSE; /* Return value */
+
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ HDassert(cache_ptr);
+ HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
+
+ if ( cache_ptr->curr_io_type ) {
+
+ ret_value = cache_ptr->curr_read_speculative;
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5C_get_curr_read_speculative() */
+
+
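
Taken together, the two query functions above give the page buffer what it needs to classify a boundary-crossing metadata I/O request. A minimal sketch of the intended consumption, assuming the shared->cache pointer available in H5PB.c (the splitting branch mirrors the actual test added to H5PB_read() and H5PB_write() below; the speculative branch is illustrative only):

    int     mdc_client_id;
    hbool_t speculative;

    mdc_client_id = H5C_get_curr_io_client_type(shared->cache);
    speculative   = H5C_get_curr_read_speculative(shared->cache);

    if ( ( mdc_client_id == (int)H5AC_EARRAY_DBLK_PAGE_ID ) ||
         ( mdc_client_id == (int)H5AC_FARRAY_DBLK_PAGE_ID ) ) {

        /* known sub-allocating client -- split the request */

    } else if ( ! speculative ) {

        /* neither a sub-allocating client nor a speculative
         * read -- an unexpected boundary-crossing request.
         */
    }
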
diff --git a/src/H5Ctest.c b/src/H5Ctest.c
index 7f24302..b549da5 100644
--- a/src/H5Ctest.c
+++ b/src/H5Ctest.c
@@ -78,8 +78,6 @@ typedef struct {
/* Local Variables */
/*******************/
-
-
/*-------------------------------------------------------------------------
* Function: H5C__verify_cork_tag_test_cb
@@ -167,3 +165,57 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* H5C__verify_cork_tag_test() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5C_set_curr_io_type_splitable()
+ *
+ * Purpose: To test the metadata entry splitting capability in the page
+ * buffer (needed to deal with H5FA and H5EA's unfortunate
+ * design choice of sub-allocating multiple metadata entries
+ * out of a single file space allocation), we must be able
+ * to configure the metadata cache to report that the
+ * current I/O request is for such an entry.
+ *
+ * To do this, we must set cache_ptr->curr_io_type to
+ * point to the instance of H5C_class_t with one such
+ * client.
+ *
+ * This function does this by setting cache_ptr->curr_io_type
+ * to H5AC_EARRAY_DBLK_PAGE if set_splitable is TRUE, and to
+ * NULL otherwise.
+ *
+ * Needless to say, this is purely a testing function, and
+ * should not be called otherwise.
+ *
+ * Return: void
+ *
+ * Programmer: John Mainzer
+ * 4/10/20
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+void
+H5C_set_curr_io_type_splitable(H5C_t * cache_ptr, hbool_t set_splitable)
+{
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ HDassert(cache_ptr);
+ HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
+
+ if ( set_splitable ) {
+
+ cache_ptr->curr_io_type = H5AC_EARRAY_DBLK_PAGE;
+
+ } else {
+
+ cache_ptr->curr_io_type = NULL;
+ }
+
+
+ FUNC_LEAVE_NOAPI_VOID
+
+} /* H5C_set_curr_io_type_splitable() */
+
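
A hypothetical test sequence using the new function might look like the following sketch. The cache_ptr setup and the boundary-crossing I/O request are assumed to come from the surrounding test harness; only the two calls to H5C_set_curr_io_type_splitable() are taken from the patch:

    /* force the page buffer down the entry-splitting path */
    H5C_set_curr_io_type_splitable(cache_ptr, TRUE);

    /* ... issue a metadata I/O request that crosses a page
     * boundary without being page aligned, and verify that
     * the page buffer splits it ...
     */

    /* restore the normal (NULL) hint state */
    H5C_set_curr_io_type_splitable(cache_ptr, FALSE);
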
diff --git a/src/H5PB.c b/src/H5PB.c
index 14ced59..da65788 100644
--- a/src/H5PB.c
+++ b/src/H5PB.c
@@ -52,9 +52,12 @@
/****************/
/* Round _x down to nearest _size. */
+/* not used at present */
+/*
#ifndef rounddown
#define rounddown(_x, _size) (((_x) / (_size)) * (_size))
#endif
+*/
/* Round _x up to nearest _size. */
#ifndef roundup
@@ -113,14 +116,6 @@ static herr_t H5PB__write_meta(H5F_shared_t *, H5FD_mem_t, haddr_t,
static herr_t H5PB__write_raw(H5F_shared_t *, H5FD_mem_t, haddr_t,
size_t, const void *);
-static void metadata_section_split(size_t, haddr_t, size_t, const void *,
- metadata_section_t *);
-
-static herr_t metadata_multipart_read(H5F_shared_t *, H5FD_mem_t, haddr_t,
- size_t, void *);
-
-static herr_t metadata_multipart_write(H5F_shared_t *, H5FD_mem_t, haddr_t,
- size_t, const void *);
/*********************/
/* Package Variables */
@@ -222,6 +217,8 @@ H5PB_reset_stats(H5PB_t *pb_ptr)
pb_ptr->max_dwl_len = 0;
pb_ptr->max_dwl_size = 0;
pb_ptr->total_dwl_ins_depth = 0;
+ pb_ptr->md_read_splits = 0;
+ pb_ptr->md_write_splits = 0;
FUNC_LEAVE_NOAPI(SUCCEED)
@@ -252,7 +249,13 @@ H5PB_reset_stats(H5PB_t *pb_ptr)
* --bypasses: the number of metadata and raw data accesses
* that bypass the page buffer layer
*
- * Return: Non-negative on success/Negative on failure
+ * TODO: The available stats have changed considerably
+ * since Mohamad wrote this routine. Update
+ * the function once things settle down.
+ *
+ * JRM -- 4/13/20
+ *
+ * Return: Non-negative on success/Negative on failure
*
* Programmer: Mohamad Chaarawi
*
@@ -297,7 +300,9 @@ H5PB_get_stats(const H5PB_t *pb_ptr, unsigned accesses[2], unsigned hits[2],
*
* Programmer: John Mainzer -- 10/12/18
*
- * Changes: None.
+ * Changes: Added support for md_read_splits and md_write_splits.
+ *
+ * JRM -- 4/11/20
*
*-------------------------------------------------------------------------
*/
@@ -404,10 +409,14 @@ H5PB_print_stats(const H5PB_t *pb_ptr)
ave_delayed_write_ins_depth = (double)(pb_ptr->total_dwl_ins_depth) /
(double)(pb_ptr->delayed_writes);
}
+
HDfprintf(stdout,
"delayed writes / ave delay / ave ins depth = %lld / %llf / %llf\n",
pb_ptr->delayed_writes, ave_delayed_write, ave_delayed_write_ins_depth);
+ HDfprintf(stdout, "metadata read / write splits = %lld / %lld.\n",
+ pb_ptr->md_read_splits, pb_ptr->md_write_splits);
+
FUNC_LEAVE_NOAPI(SUCCEED)
} /* H5PB_print_stats */
@@ -444,7 +453,10 @@ H5PB_print_stats(const H5PB_t *pb_ptr)
*
* Programmer: John Mainzer -- 10/12/18
*
- * Changes: None.
+ * Changes: Modified function to prevent the insertion
+ * of raw data pages when operating in VFD SWMR mode.
+ *
+ * JRM -- 3/25/20
*
*-------------------------------------------------------------------------
*/
@@ -468,7 +480,8 @@ H5PB_add_new_page(H5F_shared_t *shared, H5FD_mem_t type, haddr_t page_addr)
if ( H5FD_MEM_DRAW == type ) { /* raw data page insertion */
- if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) {
+ if ( ( pb_ptr->min_md_pages == pb_ptr->max_pages ) ||
+ ( pb_ptr->vfd_swmr ) ) {
can_insert = FALSE;
@@ -514,7 +527,12 @@ done:
*
* Programmer: John Mainzer -- 10/11/18
*
- * Changes: None.
+ * Changes: Added initialization for the vfd_swmr field. Also
+ * added code to force min_rd_pages to 0 if vfd_swmr is
+ * TRUE. We do this since we now exclude raw data from the
+ * page buffer when operating in VFD SWMR mode.
+ *
+ * JRM -- 3/28/20
*
*-------------------------------------------------------------------------
*/
@@ -522,6 +540,7 @@ herr_t
H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc,
unsigned page_buf_min_raw_perc)
{
+ hbool_t vfd_swmr = FALSE;
hbool_t vfd_swmr_writer = FALSE;
int i;
int32_t min_md_pages;
@@ -572,11 +591,21 @@ H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc,
(int32_t)(size / shared->fs_page_size));
- /* compute vfd_swmr_writer */
- if ( ( H5F_SHARED_VFD_SWMR_CONFIG(shared) ) && ( H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR ) ) {
+ /* compute vfd_swmr and vfd_swmr_writer */
+ if ( H5F_SHARED_VFD_SWMR_CONFIG(shared) ) {
+
+ vfd_swmr = TRUE;
+
+ /* force min_rd_pages to zero since raw data is excluded from
+ * the page buffer in VFD SWMR mode.
+ */
+ min_rd_pages = 0;
+
+ if ( H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR ) {
- HDassert(shared->vfd_swmr_config.writer);
- vfd_swmr_writer = TRUE;
+ HDassert(shared->vfd_swmr_config.writer);
+ vfd_swmr_writer = TRUE;
+ }
}
@@ -626,6 +655,7 @@ H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc,
/* VFD SWMR specific fields.
* The following fields are defined iff vfd_swmr_writer is TRUE.
*/
+ pb_ptr->vfd_swmr = vfd_swmr;
pb_ptr->vfd_swmr_writer = vfd_swmr_writer;
pb_ptr->mpmde_count = 0;
pb_ptr->cur_tick = 0;
@@ -925,9 +955,11 @@ done:
*
* 2) If the read is for raw data, and the page buffer is
* configured for metadata only (i.e. min_md_pages ==
- * max_pages), simply read from the HDF5 file and return.
+ * max_pages), or if we are operating in VFD SWMR mode
+ * (i.e. vfd_swmr == TRUE), simply read from the HDF5
+ * file and return.
*
- * 3) If the read is for raw data, and it of page size or
+ * 3) If the read is for raw data, and is of page size or
* larger, read it directly from the HDF5 file.
*
* It is possible that the page buffer contains dirty pages
@@ -957,17 +989,41 @@ done:
* between small and multi-page metadata entries so that
* pages containing the former will be buffered and the
* latter be read directly from file.
- *
- * Unfortunately, the metadata cache does not always know the
+ *
+ * Unfortunately, there are several flies in the ointment.
+ *
+ * First, the fixed and extensible array on disk data
+ * structures allocate multiple metadata cache entries in
+ * a single block, and use this fact to make the addresses
+ * of all but the first entry in the block computable. While
+ * this simplifies the fixed and extensible array on disk data
+ * structures, it complicates the metadata cache and the page
+ * buffer. Needless to say, the correct solution to this
+ * problem is to remove the complexity at its source. However,
+ * for now, we must code around the problem.
+ *
+ * Thus, this function must examine each read request
+ * to determine if it crosses page boundaries and is not for
+ * two or more complete pages. If it does, and it is one of
+ * the fixed or extensible array entries that is sub-allocated
+ * from a larger space allocation, the read request must be
+ * split into the minimal set of read requests that either
+ * don't cross page boundaries, or are page aligned and
+ * consist of an integral number of pages.
+ *
+ *
+ * Second, the metadata cache does not always know the
* size of metadata entries when it tries to read them. In
* such cases, it issues speculative reads that may be either
* smaller or larger than the actual size of the piece of
* metadata that is finally read.
*
* Since we are guaranteed that all metadata allocations larger
- * that one page are page aligned, we can safely clip at the
- * page boundary any non page aligned metadata read that crosses
- * page boundaries.
+ * than one page are page aligned (with the exception of those
+ * sub-allocated from larger allocations -- which we deal with
+ * by splitting I/O requests as discussed above), we can safely
+ * clip at the page boundary any non page aligned metadata
+ * read that crosses page boundaries.
*
* However, page aligned reads could wind up being either
* small or multi-page. This results in two scenarios that
@@ -1008,15 +1064,13 @@ done:
*
* 8) If the read is for metadata, is page aligned, is larger
* than one page, and there is a regular entry at the target
- * page address, test to see if the last read was for the
- * same address.
+ * page address, test to see if the read is speculative.
*
- * If was, evict the page, and satisfy the read from file.
- * Flag an error if the page was dirty.
+ * If it is not, evict the page, and satisfy the read from
+ * file. Flag an error if the page was dirty.
*
- * If the last read was for a different page, clip the read
- * to one page, and satisfy the read from the existing
- * regular entry.
+ * If it is, clip the read to one page, and satisfy the
+ * read from the existing regular entry.
*
* 9) If the read is for metadata, is page aligned, is larger
* than one page, and there is a multi-page metadata entry
@@ -1051,60 +1105,334 @@ done:
*
* Programmer: John Mainzer -- 10/11/18
*
- * Changes: None.
+ * Changes: Updated to reflect the discovery that the fixed and
+ * extensible array data structures allocate multiple
+ * metadata cache entries in a single block, and thus
+ * violate the invariant that metadata entries either
+ * do not cross page boundaries, or are page aligned.
+ *
+ * JRM -- 3/28/20
*
*-------------------------------------------------------------------------
*/
-/* TBD Add optional raw-data bypass here and at H5PB_write when we
- * are operating in parallel mode.
- */
+
herr_t
H5PB_read(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
void *buf/*out*/)
{
- H5PB_t *pb_ptr; /* Page buffer for this file */
+ H5PB_t *pb_ptr; /* Page buffer for this file */
+ hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */
+ hbool_t split_read = FALSE; /* whether the read must be split */
herr_t ret_value = SUCCEED; /* Return value */
+ /* the following six fields are defined iff split_read is TRUE */
+ haddr_t prefix_addr = HADDR_UNDEF; /* addr of prefix -- if defined */
+ haddr_t body_addr = HADDR_UNDEF; /* addr of body -- if defined */
+ haddr_t suffix_addr = HADDR_UNDEF; /* addr of suffix -- if defined */
+ size_t prefix_size = 0; /* size of prefix */
+ size_t body_size = 0; /* size of body */
+ size_t suffix_size = 0; /* size of suffix */
+
+
FUNC_ENTER_NOAPI(FAIL)
+ /* Sanity checks */
+ HDassert(shared);
+
hlog_fast(pbrd, "%s %p type %d %" PRIuHADDR " size %zu",
__func__, (void *)shared, type, addr, size);
+
pb_ptr = shared->pb_ptr;
- HDassert(pb_ptr == NULL || pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+ if ( pb_ptr == NULL ) {
- /* Bypass the page buffer in case
- * 1) page buffer is disabled
- * _) MPI I/O is enabled
- * 2) page buffer configured for metadata only, and it's a raw-data access
- * 5) page buffer configured for raw data only, and it's a metadata access
- */
- if (pb_ptr == NULL || H5F_SHARED_HAS_FEATURE(shared, H5FD_FEAT_HAS_MPI) ||
- (H5FD_MEM_DRAW == type && pb_ptr->min_md_pages == pb_ptr->max_pages) ||
- (H5FD_MEM_DRAW != type && pb_ptr->min_rd_pages == pb_ptr->max_pages)) {
+ bypass_pb = TRUE; /* case 1) -- page buffer is disabled */
+
+ } else {
+
+ HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC);
+
+ if ( H5FD_MEM_DRAW == type ) { /* raw data read */
+
+ if ( ( pb_ptr->min_md_pages == pb_ptr->max_pages ) ||
+ ( pb_ptr->vfd_swmr ) ) {
+
+ /* case 2) -- page buffer configured for metadata only
+ * or vfd swmr.
+ */
+ bypass_pb = TRUE;
+
+ }
+ } else { /* metadata read */
+
+ if ( pb_ptr->min_rd_pages == pb_ptr->max_pages ) {
+
+ /* case 5) -- page buffer configured for raw data only */
+ bypass_pb = TRUE;
+
+ } else {
+ /* determine whether the read request must be split,
+ * and if so, compute the start points and sizes of
+ * the sections.
+ *
+ * Note: The following code is almost identical to the
+ * similar code in H5PB_write(). Thus, on the surface,
+ * it is an obvious candidate for refactoring into a
+ * function or macro.
+ *
+ * However, there are subtle differences between
+ * the two pieces of code which are driven by the
+ * possibility of speculative reads.
+ *
+ * More to the point, further changes may be necessary.
+ * Thus we should wait on refactoring until this code has
+ * been in daily use for some time, and it is clear
+ * that further changes are unlikely.
+ */
+ int mdc_client_id = -1; /* id of mdc client, or -1 if undef */
+ uint64_t start_page; /* page index of first page in read */
+ uint64_t second_page; /* page index of second page in read */
+ uint64_t end_page; /* page index of last page in read */
+ uint64_t body_page; /* page index of start of body */
+ haddr_t start_page_addr; /* addr of first page in read */
+ haddr_t second_page_addr;/* addr of second page in read */
+ haddr_t end_page_addr; /* addr of last page in read */
+ haddr_t end_addr; /* addr of last byte in read */
+
+ /* Calculate the aligned address of the first page */
+ start_page = (addr / pb_ptr->page_size);
+ start_page_addr = start_page * pb_ptr->page_size;
+
+ /* Calculate the aligned address of the last page */
+ end_addr = addr + (haddr_t)(size - 1);
+ end_page = end_addr / (haddr_t)(pb_ptr->page_size);
+ end_page_addr = end_page * pb_ptr->page_size;
+
+ HDassert(start_page_addr <= addr);
+ HDassert(addr < start_page_addr + (haddr_t)(pb_ptr->page_size));
+
+ HDassert(start_page <= end_page);
+ HDassert(end_page_addr <= ((addr + (haddr_t)size - 1)));
+ HDassert((addr + (haddr_t)size - 1) <
+ (end_page_addr + pb_ptr->page_size));
+
+ /* test to see if the read crosses a page boundary, and
+ * is not both page aligned and an integral number of
+ * pages in length.
+ */
+ if ( ( start_page < end_page ) &&
+ ( ! ( ( addr == start_page_addr ) &&
+ ( end_page_addr + (haddr_t)(pb_ptr->page_size) ==
+ end_addr + 1 ) ) ) ) {
+
+ /* the read crosses a page boundary and is not
+ * both page aligned and of length a multiple of page size.
+ *
+ * Test to see if the read is for a metadata entry that
+ * is sub-allocated from a larger space allocation.
+ *
+ * Note that the following test may have to be
+ * adjusted.
+ */
+ mdc_client_id = H5C_get_curr_io_client_type(shared->cache);
+
+ if ( ( mdc_client_id == (int)H5AC_EARRAY_DBLK_PAGE_ID ) || \
+ ( mdc_client_id == (int)H5AC_FARRAY_DBLK_PAGE_ID ) ) {
+
+ split_read = TRUE;
+ }
+ }
+
+ if ( split_read ) {
+
+ /* compute the base addresses and length of the prefix,
+ * body, and suffix of the read, where these terms are
+ * defined as follows:
+ *
+ * prefix: All bytes from addr to the first page address
+ * at or after addr. If addr == start_page_addr,
+ * the prefix is empty.
+ *
+ * body: All bytes from the first page address covered
+ * by the read up to but not including the last
+ * page address in the read. Note that the
+ * length of the body must be a multiple of the
+ * page size. If only one page address is
+ * included in the read, the body is empty.
+ *
+ * suffix: All bytes from the last page address in the
+ * read until the end of the read. If the
+ * read ends on a page boundary, the suffix is
+ * empty.
+ *
+ * Since we know that the read crosses at least one
+ * page boundary, and we have already filtered out the
+ * body only case, at least two of the above must be
+ * non-empty.
+ */
+
+ second_page = start_page + 1;
+ second_page_addr =
+ (haddr_t)(second_page * pb_ptr->page_size);
+
+ if ( addr > start_page_addr ) { /* prefix exists */
+
+ prefix_addr = addr;
+ prefix_size = (size_t)(second_page_addr - addr);
+
+ HDassert(prefix_addr > start_page_addr);
+ HDassert(prefix_size < pb_ptr->page_size);
+ HDassert(((size_t)(addr - start_page_addr) + \
+ prefix_size) == pb_ptr->page_size);
+ }
+
+ if ( size - prefix_size >= pb_ptr->page_size ) {
+
+ /* body exists */
+
+ if ( addr == start_page_addr ) {
+
+ body_page = start_page;
+ body_addr = start_page_addr;
+
+ } else {
+
+ body_page = second_page;
+ body_addr = second_page_addr;
+ }
+
+ if ( end_addr < end_page_addr +
+ (haddr_t)(pb_ptr->page_size - 1) ) {
+
+ /* suffix exists */
+ body_size = (size_t)(end_page - body_page) *
+ pb_ptr->page_size;
+
+ } else {
+
+ /* suffix is empty */
+ body_size = (size_t)(end_page - body_page + 1) *
+ pb_ptr->page_size;
+ }
+
+ HDassert((body_page == start_page) || \
+ (body_page == start_page + 1));
+
+ HDassert(body_addr == \
+ (haddr_t)(body_page * pb_ptr->page_size));
+
+ HDassert(body_size < size);
+ HDassert(body_size >= pb_ptr->page_size);
+
+
+ HDassert(body_addr == \
+ addr + (haddr_t)prefix_size);
+ HDassert((body_addr + (haddr_t)body_size) \
+ <= (end_addr + 1));
+ }
- if (H5FD_read(shared->lf, type, addr, size, buf) < 0) {
- HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL,
- "read through lower VFD failed");
+ if ( end_addr < end_page_addr +
+ (haddr_t)(pb_ptr->page_size - 1) ) {
+
+ suffix_addr = end_page_addr;
+ suffix_size = (end_addr + 1) - end_page_addr;
+
+ HDassert(suffix_addr == \
+ addr + (haddr_t)(prefix_size + body_size));
+ }
+
+ HDassert(size == prefix_size + body_size + suffix_size);
+ }
+ }
}
+ }
+
+#ifdef H5_HAVE_PARALLEL
+ /* at present, the page buffer must be disabled in the parallel case.
+ * However, just in case ...
+ */
+ if ( H5F_SHARED_HAS_FEATURE(shared, H5FD_FEAT_HAS_MPI) ) {
+
+ bypass_pb = TRUE;
+
+ } /* end if */
+#endif /* H5_HAVE_PARALLEL */
+
+
+ if ( bypass_pb ) { /* cases 1, 2, and 5 */
+
+ if ( H5FD_read(shared->lf, type, addr, size, buf) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "read through failed")
+
+ /* Update statistics */
+ if ( pb_ptr ) {
- if (pb_ptr != NULL)
H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size);
- HGOTO_DONE(SUCCEED);
- }
+ }
+ } else {
- if (H5FD_MEM_DRAW == type) { /* cases 3 and 4 */
- if (H5PB__read_raw(shared, type, addr, size, buf) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "raw read failed");
- } else if (metadata_multipart_read(shared, type, addr, size, buf) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "meta read failed");
+ if ( H5FD_MEM_DRAW == type ) { /* cases 3 and 4 */
- H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
+ if ( H5PB__read_raw(shared, type, addr, size, buf) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "H5PB_read_raw() failed")
+
+ } else if ( split_read ) {
+
+ /* handle the sub-allocated entry case */
+
+ /* read prefix if it exists */
+ if ( prefix_size > 0 ) {
+
+ if ( H5PB__read_meta(shared, type, prefix_addr,
+ prefix_size, buf) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "H5PB_read_meta() failed on prefix")
+ }
+
+ /* read body -- if it exists. */
+ if ( body_size > 0 ) {
+
+ if ( H5PB__read_meta(shared, type, body_addr, body_size,
+ (void *)((uint8_t *)buf +
+ prefix_size)) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "H5PB_read_meta() failed on body")
+ }
+
+ /* read suffix -- if it exists. */
+ if ( suffix_size > 0 ) {
+
+ if ( H5PB__read_meta(shared, type, suffix_addr, suffix_size,
+ (void *)((uint8_t *)buf + prefix_size +
+ body_size)) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "H5PB_read_meta() failed on suffix")
+ }
+
+ H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr)
+
+ } else { /* pass to H5PB_read_meta() -- cases 6, 7, 8, 9, & 10 */
+
+ if ( H5PB__read_meta(shared, type, addr, size, buf) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
+ "H5PB_read_meta() failed")
+ }
+ }
done:
+
FUNC_LEAVE_NOAPI(ret_value)
-}
+
+} /* H5PB_read() */
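
To make the prefix / body / suffix decomposition in H5PB_read() concrete, the following standalone sketch traces the arithmetic for a hypothetical request with page_size = 4096, addr = 6144, and size = 8192. The values are invented for illustration and chosen so that all three sections are non-empty; the simplified computation below assumes that case (the function itself also handles empty prefixes and suffixes):

    #include <assert.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint64_t page_size = 4096;
        const uint64_t addr      = 6144;  /* starts mid-page */
        const uint64_t size      = 8192;  /* crosses two page boundaries */

        uint64_t start_page       = addr / page_size;             /* 1 */
        uint64_t end_addr         = addr + size - 1;              /* 14335 */
        uint64_t end_page         = end_addr / page_size;         /* 3 */
        uint64_t second_page_addr = (start_page + 1) * page_size; /* 8192 */
        uint64_t end_page_addr    = end_page * page_size;         /* 12288 */

        /* prefix: from addr up to the next page boundary */
        uint64_t prefix_size = second_page_addr - addr;           /* 2048 */

        /* suffix: from the last page boundary to the end of the request */
        uint64_t suffix_size = end_addr + 1 - end_page_addr;      /* 2048 */

        /* body: whatever remains -- always a multiple of page_size */
        uint64_t body_size = size - prefix_size - suffix_size;    /* 4096 */

        assert(body_size % page_size == 0);
        assert(prefix_size + body_size + suffix_size == size);

        printf("prefix %" PRIu64 " @ %" PRIu64 ", body %" PRIu64
               " @ %" PRIu64 ", suffix %" PRIu64 " @ %" PRIu64 "\n",
               prefix_size, addr, body_size, second_page_addr,
               suffix_size, end_page_addr);

        return 0;
    }

The same decomposition is applied on the write side in H5PB_write() below.
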
/* Remove the entry corresponding to lower-file page number `page`.
* Return 0 if there was no such entry or if the entry was removed
@@ -1198,12 +1526,16 @@ herr_t
H5PB_remove_entry(H5F_shared_t *shared, haddr_t addr)
{
uint64_t page;
- H5PB_t *pb_ptr;
+ H5PB_t *pb_ptr = NULL;
H5PB_entry_t *entry_ptr = NULL;
- herr_t ret_value = SUCCEED;
+ herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(FAIL)
+ /* Sanity checks */
+ HDassert(shared);
+ HDassert(shared->pb_ptr);
+
pb_ptr = shared->pb_ptr;
/* Calculate the page offset */
@@ -1263,50 +1595,169 @@ done:
} /* H5PB_remove_entry */
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5PB_remove_entries
+ *
+ * Purpose: Remove entries in the page buffer associated with a
+ * newly freed multi-page block of file space.
+ *
+ * There are several possible situations here.
+ *
+ * In the context of metadata, there are two possible cases.
+ *
+ * 1) The block of file space is associated with a metadata
+ * entry.
+ *
+ * In regular operating mode, this entry will not be
+ * cached in the page buffer, so there should be nothing
+ * to do.
+ *
+ * In VFD SWMR mode, the entry may be cached in a single
+ * multi-page entry.
+ *
+ * 2) The block of file space has been sub-allocated
+ * into multiple metadata entries (i.e. fixed and extensible
+ * array). In this case, the individual entries may cross
+ * page boundaries without being page aligned -- however, for
+ * purposes of the page buffer, I/O requests on these
+ * entries will have been broken up into requests that
+ * either do not cross page boundaries or are page aligned.
+ *
+ * In the context of raw data, the page buffer may or may
+ * not contain regular entries scattered over the space
+ * touched by the newly freed file space.
+ *
+ * In all contexts, there is no guarantee that the page buffer
+ * will contain any of the possible entries.
+ *
+ * Space allocations larger than one page must be page aligned.
+ * Further, any space between the end of a multi-page allocation
+ * and the next page boundary will remain un-allocated until after
+ * the original allocation is freed. This implies that:
+ *
+ * 1) The address passed into this call must be page aligned.
+ *
+ * 2) The page buffer may safely discard any page that
+ * intersects with the newly freed file space allocation.
+ *
+ * The bottom line here is that we must scan the page buffer
+ * index, and discard all entries that intersect the supplied
+ * address and length. As a sanity check, we must verify that
+ * any such entries don't overlap.
+ *
+ * Also, in the context of the VFD SWMR write, it is possible
+ * that the discarded pages will reside in the tick list or
+ * the delayed write list -- if so, they must be removed
+ * prior to eviction.
+ *
+ * Note:
+ *
+ * This function scans the page buffer hash table to
+ * find entries to remove. While this is normally
+ * pretty inexpensive, a very large (i.e. GB) file
+ * space free may impose significant cost.
+ *
+ * As best I understand it, such frees are rare, so
+ * the current solution should be good enough for now.
+ * However, if we determine that the current solution
+ * is too expensive, two alternate solutions come to mind.
+ *
+ * a) Scan the index list instead of the hash table
+ * if the free is sufficiently large. Also, skip
+ * entirely if the page buffer doesn't contain any
+ * pages of the appropriate type.
+ *
+ * b) Whenever writing a large metadata entry, scan for
+ * intersecting entries and delete them. (potential
+ * issues with fixed and variable array entries are
+ * dealt with via the splitting mechanism.) In this
+ * case we would also have to simply ignore writes
+ * beyond EOA on flush or close.
+ *
+ * Note that we already scan for intersecting entries
+ * on large raw data writes -- with possible performance
+ * issues for large writes.
+ *
+ * JRM -- 4/25/20
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer 4/25/20
+ *
+ * Changes: None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
herr_t
H5PB_remove_entries(H5F_shared_t *shared, haddr_t addr, hsize_t size)
{
- H5PB_t *pb_ptr;
- H5PB_entry_t *entry_ptr;
- herr_t ret_value = SUCCEED;
- metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}};
- int i;
+ uint64_t i;
+ uint64_t start_page;
+ uint64_t end_page;
+ int64_t entry_pages = 0;
+ hsize_t entry_size;
+ H5PB_t *pb_ptr = NULL;
+ H5PB_entry_t *entry_ptr = NULL;
+ herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(FAIL)
+ /* Sanity checks */
+ HDassert(shared);
+ HDassert(shared->pb_ptr);
+
pb_ptr = shared->pb_ptr;
- HDassert(addr % pb_ptr->page_size == 0);
+ /* Calculate the start_page offset */
+ start_page = (addr / pb_ptr->page_size);
- if (size > pb_ptr->page_size) {
- hlog_fast(pbrm,
- "removing multipage region [%" PRIuHADDR ", %" PRIuHADDR ")",
- addr, addr + size);
- }
+ HDassert(addr == start_page * pb_ptr->page_size);
- metadata_section_split(pb_ptr->page_size, addr, size, NULL, section);
+ /* Calculate the end_page offset */
+ end_page = ((addr + (haddr_t)(size - 1)) / pb_ptr->page_size);
- for (i = 0; i < 3; i++) {
- metadata_section_t *iter = &section[i];
+ HDassert(start_page <= end_page);
+ HDassert(((end_page - start_page) * pb_ptr->page_size) <= size);
+ HDassert(size <= ((end_page - start_page + 1) * pb_ptr->page_size));
+
+ for ( i = start_page; i <= end_page; i++ )
+ {
+ /* test to see if page i exists */
+ H5PB__SEARCH_INDEX(pb_ptr, i, entry_ptr, FAIL)
- if (iter->len == 0)
- continue;
+ if ( entry_ptr ) {
- if (iter->len < size) {
- hlog_fast(pbrm, "removing entry [%" PRIuHADDR ", %" PRIuHADDR ") "
- "for split region [%" PRIuHADDR ", %" PRIuHADDR ")",
- iter->addr, iter->addr + iter->len, addr, addr + size);
- }
+ /* verify that this entry doesn't overlap with a previously
+ * visited entry.
+ */
+ HDassert(entry_pages <= 0);
- assert(iter->addr % pb_ptr->page_size == 0);
+ entry_size = entry_ptr->size;
+ entry_pages = (int64_t)(entry_size / pb_ptr->page_size);
- if (H5PB_remove_entry(shared, iter->addr) < 0)
- HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "forced eviction failed")
+ if ( (uint64_t)entry_pages * pb_ptr->page_size < entry_size ) {
+
+ entry_pages++;
+ }
+
+ /* remove the entry */
+ if ( H5PB_remove_entry(shared, entry_ptr->addr) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+ "H5PB_remove_entry() failed")
+
+ }
+ entry_pages--;
}
done:
+
FUNC_LEAVE_NOAPI(ret_value)
-}
+
+} /* H5PB_remove_entries() */
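
The entry_pages bookkeeping in the loop above merits a short illustration. The following hand trace uses invented numbers: pages i = 10 .. 14 are scanned, and the only cached entry is a 3-page multi-page entry whose base page is 10 (multi-page entries are indexed at their base page only):

    /*   i   entry at i?     entry_pages before    entry_pages after
     *  10   yes (3 pages)   0  (<= 0, assert ok)  3 - 1 = 2
     *  11   no              2                     1
     *  12   no              1                     0
     *  13   no              0                    -1
     *  14   no             -1                    -2
     *
     * Had a second entry been found at page 11 or 12, entry_pages
     * would still have been positive, and the
     * HDassert(entry_pages <= 0) overlap check would have fired.
     */
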
/*-------------------------------------------------------------------------
@@ -1706,9 +2157,9 @@ done:
*
*-------------------------------------------------------------------------
*/
-herr_t
-H5PB_vfd_swmr__update_index(H5F_t *f,
- uint32_t * idx_ent_added_ptr,
+herr_t
+H5PB_vfd_swmr__update_index(H5F_t *f,
+ uint32_t * idx_ent_added_ptr,
uint32_t * idx_ent_modified_ptr,
uint32_t * idx_ent_not_in_tl_ptr,
uint32_t * idx_ent_not_in_tl_flushed_ptr)
@@ -1734,7 +2185,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f,
idx = shared->mdf_idx;
HDassert(idx);
-
+
pb_ptr = shared->pb_ptr;
HDassert(pb_ptr);
@@ -1763,7 +2214,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f,
if ( ie_ptr == NULL ) { /* alloc new entry in the metadata file index*/
uint32_t new_index_entry_index;
- new_index_entry_index = shared->mdf_idx_entries_used +
+ new_index_entry_index = shared->mdf_idx_entries_used +
idx_ent_added++;
if (new_index_entry_index >= shared->mdf_idx_len &&
@@ -1816,7 +2267,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f,
ie_ptr->tick_of_last_flush = 0;
}
- /* scan the metadata file index for entries that don't appear in the
+ /* scan the metadata file index for entries that don't appear in the
* tick list. If the index entry is dirty, and either doesn't appear
* in the page buffer, or is clean in the page buffer, mark the index
* entry clean and as having been flushed in the current tick.
@@ -1848,7 +2299,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f,
}
}
- HDassert(idx_ent_modified + idx_ent_not_in_tl ==
+ HDassert(idx_ent_modified + idx_ent_not_in_tl ==
shared->mdf_idx_entries_used);
HDassert(idx_ent_modified + idx_ent_not_in_tl + idx_ent_added <=
@@ -1860,8 +2311,10 @@ H5PB_vfd_swmr__update_index(H5F_t *f,
*idx_ent_not_in_tl_flushed_ptr = idx_ent_not_in_tl_flushed;
done:
+
FUNC_LEAVE_NOAPI(ret_value)
-}
+
+} /* H5PB_vfd_swmr__update_index() */
/*-------------------------------------------------------------------------
@@ -1876,9 +2329,10 @@ done:
*
* 2) If the write is raw data, and the page buffer is
* configured for metadata only (i.e. min_md_pages ==
- * max_pages), simply write to the HDF5 file and return.
+ * max_pages), or if the page buffer is operating in
+ * vfd_swmr mode, simply write to the HDF5 file and return.
*
- * 3) If the write is raw data, and it of page size or
+ * 3) If the write is raw data, and is of page size or
* larger, write directly to the HDF5 file.
*
* It is possible that the write intersects one or more
@@ -1898,13 +2352,68 @@ done:
* configured for raw data only (i.e. min_rd_pages ==
* max_pages), simply write to the HDF5 file and return.
*
+ * The free space manager guarantees that allocations larger
+ * than one page will be page aligned, and that allocations
+ * of size less than or equal to page size will not cross page
+ * boundaries. Further, unlike raw data, metadata is always
+ * written and read atomically.
+ *
+ * In principle, this should make it easy to discriminate
+ * between small and multi-page metadata entries so that
+ * pages containing the former will be buffered and the
+ * latter be written directly to file.
+ *
+ * Unfortunately, there is a fly in the ointment.
+ *
+ * The fixed and extensible array on disk data
+ * structures allocate multiple metadata cache entries in
+ * a single block, and use this fact to make the addresses
+ * of all but the first entry in the block computable. While
+ * this simplifies the fixed and extensible array on disk data
+ * structures, it complicates the metadata cache and the page
+ * buffer.
+ *
+ * From the page buffer perspective, it breaks the invariant
+ * that metadata entries of less than page size don't cross
+ * page boundaries, and those of size greater than or equal
+ * to page size start on page boundaries -- which is important
+ * for VFD SWMR as it allows efficient management of multi-page
+ * metadata entries.
+ *
+ * While it is tempting to repair the fixed and extensible
+ * array data structures so as to remove this irregularity,
+ * and remove the resulting complexity from both the metadata
+ * cache and the page buffer, this is a ticklish task, as there
+ * are already files in the wild that use the existing versions
+ * of these data structures. Thus, due to resource constraints,
+ * we have to program around the issue for now.
+ *
+ * Fortunately, for purposes of the page buffer, this is
+ * relatively easy -- when we encounter a metadata write
+ * that crosses one or more page boundaries, and is not
+ * both page aligned and an integral number of pages, we
+ * query the metadata cache to determine the type of the
+ * client whose data is being written. If it is one of the
+ * mis-behaving types, we split it into two or three writes
+ * such that each write either doesn't cross page boundaries,
+ * or is page aligned and an integral number of pages.
+ *
+ * This is done in this function, and is not reflected in
+ * the case analysis in the rest of this comment.
+ *
* 6) If the write is of metadata, the write is larger than
- * one page, and vfd_swmr_writer is FALSE, simply read
- * from the HDF5 file. There is no need to check the
+ * one page, and vfd_swmr_writer is FALSE, simply write
+ * to the HDF5 file. There is no need to check the
* page buffer, as metadata is always read atomically,
* and entries of this size are not buffered in the page
* buffer.
*
+ * Observe that this write must be page aligned. This
+ * should be enforced by the free space manager, but
+ * for now it is enforced by the above mentioned practice
+ * of splitting writes from cache clients that don't
+ * allocate each entry separately.
+ *
* 7) If the write is of metadata, the write is larger than
* one page, and vfd_swmr_writer is TRUE, the write must
* be buffered in the page buffer until the end of the tick.
@@ -1937,7 +2446,17 @@ done:
*
* Programmer: John Mainzer -- 10/11/18
*
- * Changes: None.
+ * Changes: Updated to support splitting of metadata writes that
+ * are not page aligned and cross page boundaries into
+ * 2 or 3 writes that are either page aligned or do not
+ * cross page boundaries. Full details in the header
+ * comment above, which has been updated to document
+ * this change.
+ *
+ * Also updated case 2 to bypass the page buffer for raw
+ * data writes in vfd swmr mode.
+ *
+ * JRM -- 4/5/20
*
*-------------------------------------------------------------------------
*/
@@ -1945,10 +2464,19 @@ herr_t
H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
const void *buf)
{
- H5PB_t *pb_ptr; /* Page buffer for this file */
+ H5PB_t *pb_ptr; /* Page buffer for this file */
hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */
+ hbool_t split_write = FALSE; /* whether md write must be split */
herr_t ret_value = SUCCEED; /* Return value */
+ /* the following six fields are defined iff split_write is TRUE */
+ haddr_t prefix_addr = HADDR_UNDEF; /* addr of prefix -- if defined */
+ haddr_t body_addr = HADDR_UNDEF; /* addr of body -- if defined */
+ haddr_t suffix_addr = HADDR_UNDEF; /* addr of suffix -- if defined */
+ size_t prefix_size = 0; /* size of prefix */
+ size_t body_size = 0; /* size of body */
+ size_t suffix_size = 0; /* size of suffix */
+
FUNC_ENTER_NOAPI(FAIL)
hlog_fast(pbwr, "%s %p type %d %" PRIuHADDR " size %zu",
@@ -1966,7 +2494,8 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
if ( H5FD_MEM_DRAW == type ) { /* raw data write */
- if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) {
+ if ( ( pb_ptr->min_md_pages == pb_ptr->max_pages ) ||
+ ( pb_ptr->vfd_swmr ) ) {
/* case 2) -- page buffer configured for metadata only */
bypass_pb = TRUE;
@@ -1979,13 +2508,207 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
/* case 5) -- page buffer configured for raw data only */
bypass_pb = TRUE;
- } else if ( ( size >= pb_ptr->page_size ) &&
- ( ! ( pb_ptr->vfd_swmr_writer ) ) ) {
+ } else {
- /* case 6) -- md read larger than one page and
- * pb_ptr->vfd_swmr_writer is FALSE.
+ /* determine whether the write request must be split,
+ * and if so, compute the start points and sizes of
+ * the sections.
+ *
+ * Note: The following code is almost identical to the
+ * similar code in H5PB_read(). Thus, on the surface,
+ * it is an obvious candidate for refactoring into a
+ * function or macro.
+ *
+ * However, there are subtle differences between
+ * the two pieces of code which are driven by the
+ * possibility of speculative reads.
+ *
+ * More to the point, further changes may be necessary.
+ * Thus we should wait on refactoring until this code has
+ * been in daily use for some time, and it is clear
+ * that further changes are unlikely.
*/
- bypass_pb = TRUE;
+ int mdc_client_id = -1; /* id of mdc client, or -1 if undef */
+ uint64_t start_page; /* page index of first page in write */
+ uint64_t second_page; /* page index of second page in write */
+ uint64_t end_page; /* page index of last page in write */
+ uint64_t body_page; /* page index of start of body */
+ haddr_t start_page_addr; /* addr of first page in write */
+ haddr_t second_page_addr;/* addr of second page in write */
+ haddr_t end_page_addr; /* addr of last page in write */
+ haddr_t end_addr; /* addr of last byte in write */
+
+ /* Calculate the aligned address of the first page */
+ start_page = (addr / pb_ptr->page_size);
+ start_page_addr = start_page * pb_ptr->page_size;
+
+ /* Calculate the aligned address of the last page */
+ end_addr = addr + (haddr_t)(size - 1);
+ end_page = end_addr / (haddr_t)(pb_ptr->page_size);
+ end_page_addr = end_page * pb_ptr->page_size;
+
+ HDassert(start_page_addr <= addr);
+ HDassert(addr < start_page_addr + (haddr_t)(pb_ptr->page_size));
+
+ HDassert(start_page <= end_page);
+ HDassert(end_page_addr <= ((addr + (haddr_t)size - 1)));
+ HDassert((addr + (haddr_t)size - 1) <
+ (end_page_addr + pb_ptr->page_size));
+
+ /* test to see if the write crosses a page boundary, and
+ * is not both page aligned and an integral number of
+ * pages in length.
+ */
+ if ( ( start_page < end_page ) &&
+ ( ! ( ( addr == start_page_addr ) &&
+ ( end_page_addr + (haddr_t)(pb_ptr->page_size) ==
+ end_addr + 1 ) ) ) ) {
+
+ /* the write crosses a page boundary and is not both
+ * page aligned and an integral number of pages in length.
+ *
+ * Test to see if the write is for a metadata entry that
+ * is sub-allocated from a larger space allocation.
+ *
+ * Note that the following test may have to be
+ * adjusted.
+ */
+ mdc_client_id = H5C_get_curr_io_client_type(shared->cache);
+
+ if ( ( mdc_client_id == (int)H5AC_EARRAY_DBLK_PAGE_ID ) || \
+ ( mdc_client_id == (int)H5AC_FARRAY_DBLK_PAGE_ID ) ) {
+
+ split_write = TRUE;
+
+ } else {
+
+ HDassert(addr == start_page_addr);
+ HDassert(size > pb_ptr->page_size);
+
+ if ( ! pb_ptr->vfd_swmr_writer ) {
+
+ /* case 6) -- multi-page entry with fixed /
+ * extensible array entries filtered out, and
+ * no VFD SWMR.
+ */
+ bypass_pb = TRUE;
+ }
+ }
+ } else if ( ( size > pb_ptr->page_size ) &&
+ ( ! pb_ptr->vfd_swmr_writer ) ) {
+
+ /* write is larger than page size and we are not
+ * in VFD SWMR mode -- bypass the page buffer.
+ * This is also case 6. We catch it here as
+ * the code to determine whether to split only
+ * looks at I/O requests that cross page boundaries
+ * and are not both page aligned and an integral
+ * number of pages in length.
+ */
+ HDassert(start_page_addr == addr);
+ bypass_pb = TRUE;
+ }
+
+ if ( split_write ) {
+
+ /* compute the base addresses and length of the prefix,
+ * body, and suffix of the write, where these terms are
+ * defined as follows:
+ *
+ * prefix: All bytes from addr to the first page address
+ * at or after addr. If addr == start_page_addr,
+ * the prefix is empty.
+ *
+ * body: All bytes from the first page address covered
+ * by the write up to but not including the last
+ * page address in the write. Note that the
+ * length of the body must be a multiple of the
+ * page size. If only one page address is
+ * included in the write, the body is empty.
+ *
+ * suffix: All bytes from the last page address in the
+ * write until the end of the write. If the
+ * write ends on a page boundary, the suffix is
+ * empty.
+ *
+ * Since we know that the write crosses at least one
+ * page boundary, and we have already filtered out the
+ * body only case, at least two of the above must be
+ * non-empty.
+ */
+
+ second_page = start_page + 1;
+ second_page_addr =
+ (haddr_t)(second_page * pb_ptr->page_size);
+
+ if ( addr > start_page_addr ) { /* prefix exists */
+
+ prefix_addr = addr;
+ prefix_size = (size_t)(second_page_addr - addr);
+
+ HDassert(prefix_addr > start_page_addr);
+ HDassert(prefix_size < pb_ptr->page_size);
+ HDassert(((size_t)(addr - start_page_addr) + \
+ prefix_size) == pb_ptr->page_size);
+ }
+
+ if ( size - prefix_size >= pb_ptr->page_size ) {
+
+ /* body exists */
+
+ if ( addr == start_page_addr ) {
+
+ body_page = start_page;
+ body_addr = start_page_addr;
+
+ } else {
+
+ body_page = second_page;
+ body_addr = second_page_addr;
+ }
+
+ if ( end_addr < end_page_addr +
+ (haddr_t)(pb_ptr->page_size - 1) ) {
+
+ /* suffix exists */
+ body_size = (size_t)(end_page - body_page) *
+ pb_ptr->page_size;
+
+ } else {
+
+ /* suffix is empty */
+ body_size = (size_t)(end_page - body_page + 1) *
+ pb_ptr->page_size;
+ }
+
+ HDassert((body_page == start_page) || \
+ (body_page == start_page + 1));
+
+ HDassert(body_addr == \
+ (haddr_t)(body_page * pb_ptr->page_size));
+
+ HDassert(body_size < size);
+ HDassert(body_size >= pb_ptr->page_size);
+
+
+ HDassert(body_addr == \
+ addr + (haddr_t)prefix_size);
+ HDassert((body_addr + (haddr_t)body_size) \
+ <= (end_addr + 1));
+ }
+
+ if ( end_addr < end_page_addr +
+ (haddr_t)(pb_ptr->page_size - 1) ) {
+
+ suffix_addr = end_page_addr;
+ suffix_size = (end_addr + 1) - end_page_addr;
+
+ HDassert(suffix_addr == \
+ addr + (haddr_t)(prefix_size + body_size));
+ }
+
+ HDassert(size == prefix_size + body_size + suffix_size);
+ }
}
}
}
@@ -2001,6 +2724,7 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
} /* end if */
#endif /* H5_HAVE_PARALLEL */
+
if ( bypass_pb ) { /* cases 1, 2, 5, and 6 */
if ( H5FD_write(shared->lf, type, addr, size, buf) < 0 )
@@ -2022,15 +2746,84 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
"H5PB_read_raw() failed")
+ } else if ( split_write ) {
+
+ /* handle the sub-allocated entry case */
+
+ /* write prefix if it exists */
+ if ( prefix_size > 0 ) {
+
+ if ( H5PB__write_meta(shared, type, addr,
+ prefix_size, buf) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+ "H5PB__write_meta() failed on prefix")
+ }
+
+ /* write the body if it exists */
+ if ( body_size > 0 ) {
+
+ /* The "body_size == pb_ptr->page_size" clause in the
+ * following if is required since in normal operating
+ * mode, the page buffer buffers metadata I/O
+ * requests of page size or less.
+ *
+ * Thus this clause ensures that a single page body
+ * does not bypass the page buffer, which would create
+ * the potential for an older version of the page to
+ * shadow the most recent version.
+ *
+ * Note: The page buffer really shouldn't buffer page
+ * aligned single page metadata I/O requests, as it
+ * creates extra overhead to no purpose. However,
+ * fixing this is a bit tricky, and the case doesn't
+ * appear to be common. Thus, while it should be
+ * fixed, I don't think it is urgent.
+ *
+ * JRM 4/19/20
+ */
+ if ( ( pb_ptr->vfd_swmr ) ||
+ ( body_size == pb_ptr->page_size ) ) {
+
+ if ( H5PB__write_meta(shared, type, body_addr, body_size,
+ (const void *)((const uint8_t *)buf +
+ prefix_size)) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+ "H5PB__write_meta() failed on body")
+
+ } else {
+
+ if ( H5FD_write(shared->lf, type, body_addr, body_size,
+ (const void *)((const uint8_t *)buf +
+ prefix_size)) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+ "write through of body failed")
+
+ H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, body_size);
+ }
+ }
+
+ /* write the suffix if it exists */
+ if ( suffix_size > 0 ) {
+
+ if ( H5PB__write_meta(shared, type, suffix_addr, suffix_size,
+ (const void *)((const uint8_t *)buf +
+ prefix_size + body_size)) < 0 )
+
+ HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+ "H5PB_write_meta() failed on suffix")
+ }
+
+ H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr)
+
} else { /* cases 7, and 8 */
- if ( metadata_multipart_write(shared, type, addr, size, buf) < 0 )
+ if ( H5PB__write_meta(shared, type, addr, size, buf) < 0 )
HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
- "H5PB_read_meta() failed")
+ "H5PB_write_meta() failed")
}
-
- H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
}
done:
@@ -3024,118 +3817,6 @@ done:
} /* H5PB__mark_entry_dirty() */
-static void
-metadata_section_split(size_t pgsz, haddr_t addr, size_t len, const void *_buf,
- metadata_section_t *section)
-{
- int i;
- size_t totlen = 0;
- haddr_t whole_pgaddr, tail_pgaddr;
- const char *buf = _buf;
- metadata_section_t *head = &section[0], *middle = &section[1],
- *tail = &section[2];
-
- /* Try to find the address of the first whole page, and the address of
- * the page after the last whole page.
- */
- whole_pgaddr = roundup(addr, pgsz);
- tail_pgaddr = rounddown(addr + len, pgsz);
-
- /* In the degenerate case where the first whole page is "after" the last,
- * actually the entire access lands between page boundaries.
- */
- if (whole_pgaddr > tail_pgaddr) {
- assert(len < pgsz);
- head->addr = addr;
- head->len = len;
- head->buf = buf;
- return;
- }
-
- /* `head` spans any range beginning before the first page boundary. */
- if (addr < whole_pgaddr) {
- head->buf = buf;
- head->len = pgsz - addr % pgsz;
- head->addr = addr;
- }
-
- /* `middle` spans one or more whole pages in between the end of
- * `head` and before the beginning of `tail`.
- */
- if (whole_pgaddr < tail_pgaddr) {
- middle->buf = (buf == NULL) ? NULL : &buf[whole_pgaddr - addr];
- middle->len = tail_pgaddr - whole_pgaddr;
- middle->addr = whole_pgaddr;
- }
-
- /* `tail` spans residual bytes that follow the last page boundary. */
- if (tail_pgaddr < addr + len) {
- tail->len = (addr + len) - tail_pgaddr;
- tail->buf = (buf == NULL) ? NULL : &buf[tail_pgaddr - addr];
- tail->addr = tail_pgaddr;
- }
-
- for (i = 0; i < 3; i++) {
- metadata_section_t *iter = &section[i];
- if (iter->len == 0)
- continue;
- assert(iter->addr == addr + totlen);
- assert(iter->buf == ((buf == NULL) ? NULL : &buf[totlen]));
-// assert(i == 0 || iter[-1].buf + iter[-1].len == iter->buf);
- totlen += iter->len;
- }
-
- assert(totlen == len);
-}
-
-static herr_t
-metadata_multipart_read(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr,
- size_t len, void *_buf/*out*/)
-{
- herr_t rc;
- int i;
- const size_t pgsz = shared->pb_ptr->page_size;
- metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}};
-
- metadata_section_split(pgsz, addr, len, _buf, section);
-
- for (i = 0; i < 3; i++) {
- metadata_section_t *iter = &section[i];
- if (iter->buf == NULL)
- continue;
- rc = H5PB__read_meta(shared, type, iter->addr, iter->len,
- (void *)(uintptr_t)iter->buf);
- if (rc < 0)
- return rc;
- }
-
- return SUCCEED;
-}
-
-static herr_t
-metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
- haddr_t addr, size_t len, const void *_buf/*out*/)
-{
- herr_t rc;
- int i;
- const size_t pgsz = shared->pb_ptr->page_size;
- metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}};
-
- metadata_section_split(pgsz, addr, len, _buf, section);
-
- for (i = 0; i < 3; i++) {
- metadata_section_t *iter = &section[i];
-
- if (iter->buf == NULL)
- continue;
- rc = H5PB__write_meta(shared, type, iter->addr, iter->len, iter->buf);
- if (rc < 0)
- return rc;
- }
-
- return SUCCEED;
-}
-
/*-------------------------------------------------------------------------
*
@@ -3151,21 +3832,25 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
* existing page, it must not be a multi-page metadata
 * entry. If it is, flag an error.
*
+ * Recall that by the time we get to this function,
+ * un-aligned page reads from the fixed and extensible
+ * array structures that cross page boundaries
+ * have already been split into two or three reads
+ * that conform to the usual pattern of metadata reads.
+ *
* 7) If the read is for metadata, is page aligned, is larger
* than one page, and there is no entry in the page buffer,
* satisfy the read from the file
*
* 8) If the read is for metadata, is page aligned, is larger
* than one page, and there is a regular entry at the target
- * page address, test to see if the last read was for the
- * same address.
+ * page address, test to see if the read is speculative.
*
- * If was, evict the page, and satisfy the read from file.
- * Flag an error if the page was dirty.
+ * If it is not, evict the page, and satisfy the read from
+ * file. Flag an error if the page was dirty.
*
- * If the last read was for a different page, clip the read
- * to one page, and satisfy the read from the existing
- * regular entry.
+ * If it is, clip the read to one page, and satisfy the
+ * read from the existing regular entry.
*
* 9) If the read is for metadata, is page aligned, is larger
* than one page, and there is a multi-page metadata entry
@@ -3197,7 +3882,7 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
*
* P/A == page aligned
* size > PL == size > page length
- * PA == previous address
+ * Spec == speculative read
* A == current address
*
* In the entry exists column:
@@ -3207,7 +3892,7 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
* MPMDE == multi-page metadata entry
*
* | size | entry | VFD | |
- * P/A: | > PL | exists | SWMR | PA == A | Comments:
+ * P/A: | > PL | exists | SWMR | Spec | Comments:
* ------+------+--------+------+---------+-------------------------------------
* N | X | N || R | X | X | Clip read to page boundary if
* | | | | | necessary
@@ -3220,10 +3905,10 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
* ------+------+--------+------+---------+-------------------------------------
* Y | Y | N | X | X | Satisfy read from file (case 7)
* ------+------+--------+------+---------+-------------------------------------
- * Y | Y | R | X | N | Clip read to page boundary
+ * Y | Y | R | X | Y | Clip read to page boundary
* | | | | | Satisfy read from entry (case 8)
* ------+------+--------+------+---------+-------------------------------------
- * Y | Y | R | X | Y | Evict entry
+ * Y | Y | R | X | N | Evict entry
* | | | | | (must be clean -- flag error if not)
* | | | | | Satisfy read from file (case 8)
* ------+------+--------+------+---------+-------------------------------------
@@ -3261,20 +3946,25 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
*
* Programmer: John Mainzer -- 10/11/18
*
- * Changes: None.
+ * Changes: Updated to use the speculative read hint from the
+ * metadata cache, and remove the static variable
+ * containing the base address of the last read.
+ *
+ * JRM -- 4/5/20
*
*-------------------------------------------------------------------------
*/
static herr_t
-H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
- void *buf/*out*/)
+H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr,
+ size_t size, void *buf/*out*/)
{
+ hbool_t bypass = FALSE; /* flag indicating PB bypassed */
+ hbool_t speculative = FALSE; /* speculative read hint from mdc */
H5PB_t *pb_ptr; /* Page buffer for this file */
H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */
H5FD_t *file; /* File driver pointer */
uint64_t page; /* page offset of addr */
haddr_t page_addr; /* page containing addr */
- static haddr_t prev_addr = HADDR_UNDEF; /* addr of last call */
size_t offset; /* offset of read in page */
size_t clipped_size; /* possibly clipped size */
herr_t ret_value = SUCCEED; /* Return value */
@@ -3333,7 +4023,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
TRUE, FALSE)
if ( ( NULL == entry_ptr ) &&
- ( H5PB__load_page(shared, pb_ptr, page_addr, type, &entry_ptr) < 0 ) )
+ ( H5PB__load_page(shared, pb_ptr, page_addr,
+ type, &entry_ptr) < 0 ) )
HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
"page buffer page load request failed (1)")
@@ -3358,7 +4049,7 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
HDassert( page_addr == addr );
- if ( size >= pb_ptr->page_size ) {
+ if ( size > pb_ptr->page_size ) {
/* search the page buffer for an entry at page */
H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL)
@@ -3367,10 +4058,11 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
if ( entry_ptr == NULL ) { /* case 7 */
/* update hit rate stats */
- H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, FALSE, TRUE, size > pb_ptr->page_size)
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, FALSE, \
+ TRUE, size > pb_ptr->page_size)
- /* If the read is for metadata, is page aligned, is larger
- * than one page, and there is no entry in the page buffer,
+ /* If the read is for metadata, is page aligned, is larger
+ * than page size, and there is no entry in the page buffer,
* satisfy the read from the file
*/
if ( H5FD_read(file, type, addr, size, buf) < 0)
@@ -3378,7 +4070,10 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
"driver read request failed (1)")
+ bypass = TRUE;
+
H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size);
+
} else {
HDassert( entry_ptr );
@@ -3389,28 +4084,29 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
/* If the read is for metadata, is page aligned, is larger
* than one page, and there is a regular entry at the target
- * page address, test to see if the last read was for the
- * same address.
+ * page address, test to see if the read is speculative.
*
- * If it was, evict the page, and satisfy the read from
+ * If it is not, evict the page, and satisfy the read from
* file. Flag an error if the page was dirty.
*
- * If the last read was for a different page, clip the read
- * to one page, and satisfy the read from the existing
- * regular entry.
+ * If it is, clip the read to one page, and satisfy
+ * the read from the existing regular entry.
*/
HDassert( entry_ptr->size == pb_ptr->page_size );
- if ( addr == prev_addr ) {
+ speculative = H5C_get_curr_read_speculative(shared->cache);
+
+ if ( ! speculative ) {
- /* since this is a second try, don't update
+ /* since this is likely a second try, don't update
* hit rate stats.
*/
HDassert( ! ( entry_ptr->is_dirty ) );
- if (H5PB__evict_entry(shared, entry_ptr, TRUE, false) < 0)
+ if ( H5PB__evict_entry(shared, entry_ptr,
+ TRUE, false) < 0 )
HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
"forced eviction failed (1)")
@@ -3419,7 +4115,9 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
"driver read request failed (2)")
+ bypass = TRUE;
H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size);
+
} else {
HDassert( entry_ptr->image_ptr );
@@ -3439,7 +4137,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
}
/* update hit rate stats */
- H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, TRUE, TRUE, FALSE)
+ H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, TRUE, \
+ TRUE, FALSE)
}
} else { /* case 9 */
@@ -3509,7 +4208,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
TRUE, FALSE)
if ( ( NULL == entry_ptr ) &&
- ( H5PB__load_page(shared, pb_ptr, page_addr, type, &entry_ptr) < 0))
+ ( H5PB__load_page(shared, pb_ptr, page_addr,
+ type, &entry_ptr) < 0))
HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
"page buffer page load request failed (2)")
@@ -3532,7 +4232,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
}
}
- prev_addr = addr;
+ if ( ! bypass )
+ H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
done:
@@ -3830,6 +4531,8 @@ H5PB__read_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
}
} /* end else */
+ H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
+
done:
FUNC_LEAVE_NOAPI(ret_value)
@@ -4073,6 +4776,8 @@ H5PB__write_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr,
H5PB__INSERT_IN_TL(pb_ptr, entry_ptr, FAIL)
}
+ H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
+
done:
FUNC_LEAVE_NOAPI(ret_value)
@@ -4121,8 +4826,8 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
- const void *buf/*out*/)
+H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr,
+ size_t size, const void *buf)
{
H5PB_t *pb_ptr; /* Page buffer for this file */
H5PB_entry_t *entry_ptr; /* Pointer to page buffer entry */
@@ -4372,6 +5077,8 @@ H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
}
}
+ H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
+
done:
FUNC_LEAVE_NOAPI(ret_value)
diff --git a/src/H5PBpkg.h b/src/H5PBpkg.h
index 49911d6..1cfeb59 100644
--- a/src/H5PBpkg.h
+++ b/src/H5PBpkg.h
@@ -670,19 +670,19 @@ if ( ( (entry_ptr) == NULL ) || \
#define H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size) \
{ \
- int i; \
+ int ii; \
\
HDassert(pb_ptr); \
HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \
\
if ( H5FD_MEM_DRAW == (type) ) { \
- i = H5PB__STATS_RD; \
+ ii = H5PB__STATS_RD; \
} else if ( (size) > (pb_ptr)->page_size ) { \
- i = H5PB__STATS_MPMDE; \
+ ii = H5PB__STATS_MPMDE; \
} else { \
- i = H5PB__STATS_MD; \
+ ii = H5PB__STATS_MD; \
} \
- ((pb_ptr)->accesses[i])++; \
+ ((pb_ptr)->accesses[ii])++; \
} /* H5PB__UPDATE_STATS_FOR_ACCESS */
@@ -812,6 +812,20 @@ if ( ( (entry_ptr) == NULL ) || \
((pb_ptr)->loads[i])++; \
} /* H5PB__UPDATE_STATS_FOR_LOAD */
+#define H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr) \
+{ \
+ HDassert(pb_ptr); \
+ HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \
+ ((pb_ptr)->md_read_splits)++; \
+} /* H5PB__UPDATE_STATS_FOR_READ_SPLIT */
+
+#define H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr) \
+{ \
+ HDassert(pb_ptr); \
+ HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC); \
+ ((pb_ptr)->md_write_splits)++; \
+} /* H5PB__UPDATE_STATS_FOR_WRITE_SPLIT */
+
#else /* H5PB__COLLECT_PAGE_BUFFER_STATS */
#define H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, hit, is_metadata, is_mpmde)
@@ -834,6 +848,8 @@ if ( ( (entry_ptr) == NULL ) || \
#define H5PB__UPDATE_STATS_FOR_CLEAR(pb_ptr, entry_ptr)
#define H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr)
#define H5PB__UPDATE_STATS_FOR_LOAD(pb_ptr, entry_ptr)
+#define H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr)
+#define H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr)
#endif /* H5PB__COLLECT_PAGE_BUFFER_STATS */
diff --git a/src/H5PBprivate.h b/src/H5PBprivate.h
index 6b879c7..32e681e 100644
--- a/src/H5PBprivate.h
+++ b/src/H5PBprivate.h
@@ -249,6 +249,9 @@ typedef struct H5PB_entry_t H5PB_entry_t;
*
* FIELDS SUPPORTING VFD SWMR:
*
+ * If the file is opened in VFD SWMR mode (i.e. vfd_swmr == TRUE), all
+ * raw data I/O must be passed through to the HDF5 file
+ *
* If the file is opened as a VFD SWMR writer (i.e. vfd_swmr_writer == TRUE),
* the page buffer must retain the data necessary to update the metadata
* file at the end of each tick, and also delay writes as necessary so as
@@ -285,8 +288,12 @@ typedef struct H5PB_entry_t H5PB_entry_t;
 * The remainder of this section contains discussions of the fields and
* data structures used to support the above operations.
*
+ * vfd_swmr: Boolean flag that is set to TRUE iff the file is opened
+ * in VFD SWMR mode -- either reader or writer. This field
+ * is used to exclude raw data from the page buffer.
+ *
* vfd_swmr_writer: Boolean flag that is set to TRUE iff the file is
- * the file is opened in VFD SWMR mode. The remaining
+ * opened in VFD SWMR writer mode. The remaining
* VFD SWMR fields are defined iff vfd_swmr_writer is TRUE.
*
* mpmde_count: int64_t containing the number of multi-page metadata
@@ -528,6 +535,16 @@ typedef struct H5PB_entry_t H5PB_entry_t;
* total_dwl_ins_depth: int64_t containing the total insertion depth
 * required to maintain the ordering invariant on the
* delayed write list.
+ *
+ * md_read_splits: int64_t containing the number of metadata reads that
+ * are split into two or three sub-reads to manage the
+ * case in which a group of metadata cache clients
+ * sub-allocate entries from a single file space allocation.
+ *
+ * md_write_splits: int64_t containing the number of metadata writes that
+ * are split into two or three sub-writes to manage the
+ * case in which a group of metadata cache clients
+ * sub-allocate entries from a single file space allocation.
*
******************************************************************************/
@@ -578,6 +595,7 @@ typedef struct H5PB_t {
/* Fields for VFD SWMR operations: */
+ hbool_t vfd_swmr;
hbool_t vfd_swmr_writer;
int64_t mpmde_count;
uint64_t cur_tick;
@@ -645,6 +663,8 @@ typedef struct H5PB_t {
int64_t max_dwl_len;
int64_t max_dwl_size;
int64_t total_dwl_ins_depth;
+ int64_t md_read_splits;
+ int64_t md_write_splits;
} H5PB_t;
@@ -670,6 +690,7 @@ H5_DLL herr_t H5PB_add_new_page(H5F_shared_t *, H5FD_mem_t, haddr_t);
H5_DLL herr_t H5PB_update_entry(H5PB_t *, haddr_t, size_t, const void *);
H5_DLL herr_t H5PB_remove_entry(H5F_shared_t *, haddr_t);
+
H5_DLL herr_t H5PB_remove_entries(H5F_shared_t *, haddr_t, hsize_t);
H5_DLL herr_t H5PB_read(H5F_shared_t *, H5FD_mem_t, haddr_t,