Diffstat (limited to 'src')
-rw-r--r-- | src/H5C.c | 75
-rw-r--r-- | src/H5Cimage.c | 29
-rw-r--r-- | src/H5Cmpio.c | 13
-rw-r--r-- | src/H5Cpkg.h | 165
-rw-r--r-- | src/H5Cprivate.h | 2
-rw-r--r-- | src/H5Cquery.c | 108
-rw-r--r-- | src/H5Ctest.c | 56
-rw-r--r-- | src/H5PB.c | 1219
-rw-r--r-- | src/H5PBpkg.h | 16
-rw-r--r-- | src/H5PBprivate.h | 23
10 files changed, 1438 insertions, 268 deletions
@@ -477,6 +477,10 @@ H5C_create(size_t max_cache_size, cache_ptr->rdfsm_settled = FALSE; cache_ptr->mdfsm_settled = FALSE; + /* fields supporting page buffer hints */ + cache_ptr->curr_io_type = NULL; + cache_ptr->curr_read_speculative = FALSE; + if(H5C_reset_cache_hit_rate_stats(cache_ptr) < 0) /* this should be impossible... */ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, NULL, "H5C_reset_cache_hit_rate_stats failed") @@ -487,6 +491,7 @@ H5C_create(size_t max_cache_size, #ifndef NDEBUG cache_ptr->get_entry_ptr_from_addr_counter = 0; + cache_ptr->curr_io_type = NULL; #endif /* NDEBUG */ /* Set return value */ @@ -974,10 +979,13 @@ done: * * Programmer: John Mainzer -- 12/16/18 * - * Changes: None. + * Changes: Added macro calls to maintain the page buffer hints. + * + * JRM -- 3/20/20 * *------------------------------------------------------------------------- */ + herr_t H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, uint32_t length, uint64_t tick) @@ -994,7 +1002,7 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, H5C_cache_entry_t * entry_ptr; H5C_cache_entry_t * follow_ptr = NULL; herr_t ret_value = SUCCEED; /* Return value */ - bool found = false; + hbool_t found = FALSE; FUNC_ENTER_NOAPI(FAIL) @@ -1036,7 +1044,7 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, page * cache_ptr->page_size + length <= entry_ptr->addr + entry_ptr->size); - found = true; + found = TRUE; /* since end of tick occurs only on API call entry in * the VFD SWMR reader case, the entry must not be protected. @@ -1135,12 +1143,17 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, H5C_IMAGE_EXTRA_SPACE); #endif /* H5C_DO_MEMORY_SANITY_CHECKS */ + H5C__SET_PB_READ_HINTS(cache_ptr, entry_ptr->type, TRUE) + if ( H5F_block_read(f, entry_ptr->type->mem_type, entry_ptr->addr, - image_len, image_ptr) < 0 ) + image_len, image_ptr) < 0 ) { + H5C__RESET_PB_READ_HINTS(cache_ptr) HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, \ "Can't read image (1)") + } + H5C__RESET_PB_READ_HINTS(cache_ptr) /* 3) Call the refresh callback. If it doesn't * request a different image size, goto 6) @@ -1172,12 +1185,18 @@ H5C_evict_or_refresh_all_entries_in_page(H5F_t * f, uint64_t page, H5C_IMAGE_EXTRA_SPACE); #endif /* H5C_DO_MEMORY_SANITY_CHECKS */ + H5C__SET_PB_READ_HINTS(cache_ptr, entry_ptr->type, TRUE) + if ( H5F_block_read(f, entry_ptr->type->mem_type, entry_ptr->addr, - image_len, image_ptr) < 0 ) + image_len, image_ptr) < 0 ) { + + H5C__RESET_PB_READ_HINTS(cache_ptr) HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, \ "Can't read image (2)") + } + H5C__RESET_PB_READ_HINTS(cache_ptr) /* 5) Call the refresh callback again. Requesting * a different buffer size again is an error. @@ -6495,6 +6514,14 @@ done: * * Programmer: John Mainzer, 5/5/04 * + * Changes: Please maintain the changes list, and do not delete it + * unless you have merged it into the header comment + * proper. + * + * Added macro calls to maintain page buffer hints. 
+ * + * JRM -- 3/20/20 + * *------------------------------------------------------------------------- */ herr_t @@ -6680,8 +6707,18 @@ H5C__flush_single_entry(H5F_t *f, H5C_cache_entry_t *entry_ptr, unsigned flags) else mem_type = entry_ptr->type->mem_type; - if(H5F_block_write(f, mem_type, entry_ptr->addr, entry_ptr->size, entry_ptr->image_ptr) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't write image to file") + H5C__SET_PB_WRITE_HINTS(cache_ptr, entry_ptr->type) + + if ( H5F_block_write(f, mem_type, entry_ptr->addr, + entry_ptr->size, + entry_ptr->image_ptr) < 0 ) { + + H5C__RESET_PB_WRITE_HINTS(cache_ptr) + + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \ + "Can't write image to file") + } + H5C__RESET_PB_WRITE_HINTS(cache_ptr) #ifdef H5_HAVE_PARALLEL } #endif /* H5_HAVE_PARALLEL */ @@ -7083,6 +7120,10 @@ done: * small. * JRM -- 3/25/20 * + * Added macro calls to maintain the page buffer read hints. + * + * JRM -- 3/20/20 + * *------------------------------------------------------------------------- */ static void * @@ -7234,10 +7275,18 @@ H5C_load_entry(H5F_t * f, if ( !coll_access || 0 == mpi_rank ) { #endif /* H5_HAVE_PARALLEL */ - if ( H5F_block_read(f, type->mem_type, addr, len, image) < 0 ) + H5C__SET_PB_READ_HINTS(f->shared->cache, type, TRUE) + + if ( H5F_block_read(f, type->mem_type, addr, len, image) < 0 ) { + + H5C__RESET_PB_READ_HINTS(f->shared->cache) HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, \ "Can't read image") + } + + H5C__RESET_PB_READ_HINTS(f->shared->cache) + #ifdef H5_HAVE_PARALLEL } /* end if */ /* if the collective metadata read optimization is turned on, @@ -7346,11 +7395,19 @@ H5C_load_entry(H5F_t * f, * * JRM -- 3/24/20 */ + + H5C__SET_PB_READ_HINTS(f->shared->cache, type, \ + FALSE); + if ( H5F_block_read(f, type->mem_type, addr, - actual_len, image) < 0) + actual_len, image) < 0 ) { + + H5C__RESET_PB_READ_HINTS(f->shared->cache) HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, \ "can't read image") + } + H5C__RESET_PB_READ_HINTS(f->shared->cache) #endif /* JRM */ #ifdef H5_HAVE_PARALLEL } diff --git a/src/H5Cimage.c b/src/H5Cimage.c index ee286d9..9a6d667 100644 --- a/src/H5Cimage.c +++ b/src/H5Cimage.c @@ -1058,6 +1058,22 @@ H5C__read_cache_image(H5F_t *f, H5C_t *cache_ptr) #endif /* H5_HAVE_PARALLEL */ /* Read the buffer (if serial access, or rank 0 of parallel access) */ + + /* No need to set the page buffer hints here, since, if paged + * allocation is in use, we know that the cache image was allocated + * directly from the free space manager, and thus either doesn't + * cross page boundaries, or is page aligned. Between this, + * and the fact that the cache image is never read speculatively, + * the page buffer should never request hints in this context. + * + * If for some reason it does, the NULL curr_io_type will trigger + * an assertion failure. + * + * Note that we will have to revisit this if we ever use + * cache_ptr->curr_io_type for something other than sanity + * checking. + * JRM -- 3/30/20 + */ if(H5F_block_read(f, H5FD_MEM_SUPER, cache_ptr->image_addr, cache_ptr->image_len, cache_ptr->image_buffer) < 0) HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, "Can't read metadata cache image block") @@ -3554,6 +3570,19 @@ H5C__write_cache_image(H5F_t *f, const H5C_t *cache_ptr) #endif /* H5_HAVE_PARALLEL */ /* Write the buffer (if serial access, or rank 0 for parallel access) */ + + /* No need to set the page buffer hints here.
+ * + * If paged allocation is in use, we know that the cache image + * was allocated directly from the free space manager, and thus + * either doesn't cross page boundaries, or is page aligned. + * Thus it should never trigger the sanity checks in the page buffer. + * + * If for some reason it does, the NULL curr_io_type will trigger + * an assertion failure. + * + * JRM -- 3/30/20 + */ if(H5F_block_write(f, H5FD_MEM_SUPER, cache_ptr->image_addr, cache_ptr->image_len, cache_ptr->image_buffer) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't write metadata cache image block to file") #ifdef H5_HAVE_PARALLEL diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c index 199c494..16db2ad 100644 --- a/src/H5Cmpio.c +++ b/src/H5Cmpio.c @@ -1018,6 +1018,19 @@ H5C__collective_write(H5F_t *f) HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O properties") /* Write data */ + /* + * At present the page buffer is disabled in the parallel case, and + * thus VFD SWMR can't be used either. Thus, for now, there is + * no point in setting the page buffer hints. + * + * More to the point, since we are actually writing a derived type + * containing multiple metadata cache entries, we couldn't set it + * to a meaningful value. + * + * When we enable the page buffer in parallel, we will have to + * revisit this. + * JRM -- 3/30/20 + */ if(H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0, (size_t)1, base_buf) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to write entries collectively") diff --git a/src/H5Cpkg.h b/src/H5Cpkg.h index d9a1641..a5eafd6 100644 --- a/src/H5Cpkg.h +++ b/src/H5Cpkg.h @@ -3480,6 +3480,102 @@ if ( ( (entry_ptr) == NULL ) || \ } /* H5C__MOVE_TO_TOP_IN_COLL_LIST */ #endif /* H5_HAVE_PARALLEL */ + +/***************************************/ +/* page buffer hint maintenance macros */ +/***************************************/ + +/*------------------------------------------------------------------------- + * + * Macro: H5C__SET/RESET_PB_READ_HINTS + * + * Purpose: Set or reset the fields needed to provide hints to the + * page buffer so that it can disambiguate between speculative + * reads that cross page boundaries and reads of metadata + * entries that cross page boundaries without starting on + * a page boundary. This latter behavior shouldn't happen, + * and the hints allow the page buffer to detect such + * behavior by unexpected cache clients. + * + * See the discussion of the PB hint fields in the header + * comment for H5C_t for further details. + * + * Return: N/A + * + * Programmer: John Mainzer, 3/30/20 + * + * Modifications: + * + * None.
+ * + *------------------------------------------------------------------------- + */ + +#define H5C__SET_PB_READ_HINTS(cache_ptr, type, may_be_speculative) \ +{ \ + HDassert(cache_ptr); \ + HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC); \ + HDassert((cache_ptr)->curr_io_type == NULL); \ + HDassert(type); \ + (cache_ptr)->curr_io_type = (type); \ + (cache_ptr)->curr_read_speculative = (may_be_speculative) && \ + ((cache_ptr)->curr_io_type->flags & H5AC__CLASS_SPECULATIVE_LOAD_FLAG); \ + \ +} /* H5C__SET_PB_READ_HINTS() */ + +#define H5C__RESET_PB_READ_HINTS(cache_ptr) \ +{ \ + HDassert(cache_ptr); \ + HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC); \ + HDassert((cache_ptr)->curr_io_type); \ + (cache_ptr)->curr_io_type = NULL; \ + (cache_ptr)->curr_read_speculative = FALSE; \ + \ +} /* H5C__RESET_PB_READ_HINTS() */ + + +/*------------------------------------------------------------------------- + * + * Macro: H5C__SET/RESET_PB_WRITE_HINTS + * + * Purpose: Set or reset the fields needed to provide hints to the + * page buffer so that it can detect unexpected writes of + * metadata entries that cross page boundaries and do not + * start on page boundaries. + * + * See the discussion of the PB hint fields in the header + * comment for H5C_t for further details. + * + * Return: N/A + * + * Programmer: John Mainzer, 3/30/20 + * + * Modifications: + * + * None. + * + *------------------------------------------------------------------------- + */ + +#define H5C__SET_PB_WRITE_HINTS(cache_ptr, type) \ +{ \ + HDassert(cache_ptr); \ + HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC); \ + HDassert((cache_ptr)->curr_io_type == NULL); \ + HDassert(type); \ + (cache_ptr)->curr_io_type = (type); \ + \ +} /* H5C__SET_PB_WRITE_HINTS() */ + +#define H5C__RESET_PB_WRITE_HINTS(cache_ptr) \ +{ \ + HDassert(cache_ptr); \ + HDassert((cache_ptr)->magic == H5C__H5C_T_MAGIC); \ + HDassert((cache_ptr)->curr_io_type); \ + (cache_ptr)->curr_io_type = NULL; \ + \ +} /* H5C__RESET_PB_WRITE_HINTS() */ + /****************************/ /* Package Private Typedefs */ @@ -4413,6 +4509,47 @@ typedef struct H5C_tag_info_t { * managers that are involved in allocating space for free * space managers. * + * Page Buffer Related Fields: + * + * Due to the irregular behavior of some of the cache clients, the + * page buffer occasionally needs hints to manage metadata I/O requests + * from the metadata cache -- particularly in the context of VFD SWMR. + * The following fields exist to support this. + * + * + * curr_io_type: Pointer to the instance of H5C_class_t associated with + * the current I/O operation. This pointer should be set + * just before any I/O operation by the metadata cache, and + * re-set to NULL immediately thereafter. + * + * This field exists because the fixed and variable length + * array cache clients allocate numerous entries in a single + * block, and sub-allocate metadata cache entries out of this + * block. The effect of this is to break the invariant, + * normally maintained by the free space managers in paged + * allocation mode, that no entry of less than a page in + * size crosses page boundaries, and that entries of page + * size or greater are page aligned. This in turn causes + * problems for the page buffer -- particularly in VFD SWMR + * mode. + * + * The correct solution is to modify the fixed and variable + * length array cache clients to repair this. However, in + * the interim, this field exists to detect similar + * behavior elsewhere.
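+ * + * The intended usage pattern, mirroring the calls added in + * H5C.c above (sketch only, error handling elided): + * + * H5C__SET_PB_READ_HINTS(cache_ptr, entry_ptr->type, TRUE) + * if ( H5F_block_read(f, ...) < 0 ) { + * H5C__RESET_PB_READ_HINTS(cache_ptr) + * HGOTO_ERROR(...) + * } + * H5C__RESET_PB_READ_HINTS(cache_ptr)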
+ * + * To complicate matters, speculative reads for metadata + * cache entries which must determine their lengths via + * inspection of the on disk image of the entry, may mimic + * the behavior of the fixed and extensible arrays. Thus + * the curr_io_type is also needed to disambiguate reads. + * + * curr_read_speculative: Boolean flag indicating whether the current + * read request is speculative, and thus not guaranteed to be + * of the correct length. Field is used to distinguish between + * the initial and final read attempts. + * + * * * Statistics collection fields: * @@ -4744,6 +4881,28 @@ typedef struct H5C_tag_info_t { * called successfully. This field is only defined when * NDEBUG is not #defined. * + * curr_io_type: Pointer to the instance of H5C_class_t associated with + * the current I/O operation. This pointer should be set + * just before any I/O operation by the metadata cache, and + * re-set to NULL immediately thereafter. This field is + * only defined when NDEBUG is not #defined. + * + * This field exists because the fixed and variable length + * array cache clients allocate numerous entries in a single + * block, and sub-allocate metadata cache entries out of this + * block. The effect of this is to break the invariant, + * normally maintained by the free space managers in paged + * allocation mode, that no entry of less than a page in + * size crosses page boundaries, and that entries of page + * size or greater are page aligned. This in turn causes + * problems for the page buffer -- particularly in VFD SWMR + * mode. + * + * The correct solution is to modify the fixed and variable + * length array cache clients to repair this. However, in + * the interim, this field exists to detect similar + * behavior elsewhere. + * ****************************************************************************/ struct H5C_t { uint32_t magic; @@ -4892,6 +5051,10 @@ struct H5C_t { hbool_t rdfsm_settled; hbool_t mdfsm_settled; + /* Fields supporting page buffer hints */ + const H5C_class_t * curr_io_type; + hbool_t curr_read_speculative; + #if H5C_COLLECT_CACHE_STATS /* stats fields */ int64_t hits[H5C__MAX_NUM_TYPE_IDS + 1]; @@ -5025,6 +5188,8 @@ H5_DLL herr_t H5C__untag_entry(H5C_t *cache, H5C_cache_entry_t *entry); /* Testing functions */ #ifdef H5C_TESTING H5_DLL herr_t H5C__verify_cork_tag_test(hid_t fid, H5O_token_t tag_token, hbool_t status); +H5_DLL void H5C_set_curr_io_type_splitable(H5C_t * cache_ptr, + hbool_t set_splitable); #endif /* H5C_TESTING */ #endif /* _H5Cpkg_H */ diff --git a/src/H5Cprivate.h b/src/H5Cprivate.h index 23091cb..7678911 100644 --- a/src/H5Cprivate.h +++ b/src/H5Cprivate.h @@ -2411,6 +2411,8 @@ H5_DLL herr_t H5C_get_cache_size(H5C_t *cache_ptr, size_t *max_size_ptr, uint32_t *cur_num_entries_ptr); H5_DLL herr_t H5C_get_cache_flush_in_progress(H5C_t *cache_ptr, hbool_t *flush_in_progress_ptr); H5_DLL herr_t H5C_get_cache_hit_rate(H5C_t *cache_ptr, double *hit_rate_ptr); +H5_DLL int H5C_get_curr_io_client_type(H5C_t * cache_ptr); +H5_DLL hbool_t H5C_get_curr_read_speculative(H5C_t * cache_ptr); H5_DLL herr_t H5C_get_entry_status(const H5F_t *f, haddr_t addr, size_t *size_ptr, hbool_t *in_cache_ptr, hbool_t *is_dirty_ptr, hbool_t *is_protected_ptr, hbool_t *is_pinned_ptr, hbool_t *is_corked_ptr, diff --git a/src/H5Cquery.c b/src/H5Cquery.c index 9f1ec31..477a8ba 100644 --- a/src/H5Cquery.c +++ b/src/H5Cquery.c @@ -452,3 +452,111 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* H5C_get_mdc_image_info() */ + +/*------------------------------------------------------------------------- +
* Function: H5C_get_curr_io_client_type + * + * Purpose: Return the type id associated with the metadata cache + * client whose data is currently being read or written. + * + * This id is obtained via the curr_io_type field in + * H5C_t, which is set just before most I/O calls from the + * metadata cache, and reset to NULL immediately thereafter. + * + * If cache_ptr->curr_io_type is NULL, the function + * returns -1. + * + * Note: At present, cache_ptr->curr_io_type should always + * be defined in the serial case with the exception + * of cache image I/O. In general, it is not defined in + * the parallel case. This is not a problem for now, as + * this function is used in page buffer sanity checking, + * and for now at least, the page buffer is not enabled in + * the parallel case. + * + * Return: ID of cache client whose image is being read or written, + * or -1 if cache_ptr->curr_io_type is undefined. + * + * Programmer: John Mainzer + * 3/31/20 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ + +int +H5C_get_curr_io_client_type(H5C_t * cache_ptr) +{ + int ret_value = -1; /* Return value */ + + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(cache_ptr); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + + if ( cache_ptr->curr_io_type ) { + + ret_value = cache_ptr->curr_io_type->id; + } + + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5C_get_curr_io_client_type() */ + + +/*------------------------------------------------------------------------- + * Function: H5C_get_curr_read_speculative + * + * Purpose: Return a boolean flag indicating whether the current + * read is speculative. + * + * Note that this value is only defined during a read generated + * by the metadata cache. At all other times, the return + * value is undefined (although the current implementation + * returns FALSE in such cases). + * + * Note also that this function exists to provide hints to the + * page buffer, which for now at least, is only available in + * the serial case. It should not be depended upon in the + * parallel case -- at least until verified, and potential + * interactions with collective metadata reads are investigated + * and dismissed. + * + * Return: TRUE if the current call to H5F_block_read() by the + * metadata cache is an initial read attempt for a cache + * client whose speculative read flag is set (in H5AC_class_t), + * and FALSE otherwise. + * + * Return value is undefined if a call to H5F_block_read by + * the metadata cache is not in progress. + * + * Programmer: John Mainzer + * 3/31/20 + * + * Changes: None.
+ * + *------------------------------------------------------------------------- + */ + +hbool_t +H5C_get_curr_read_speculative(H5C_t * cache_ptr) +{ + hbool_t ret_value = FALSE; /* Return value */ + + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(cache_ptr); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + + if ( cache_ptr->curr_io_type ) { + + ret_value = cache_ptr->curr_read_speculative; + } + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5C_get_curr_read_speculative() */ + + diff --git a/src/H5Ctest.c b/src/H5Ctest.c index 7f24302..b549da5 100644 --- a/src/H5Ctest.c +++ b/src/H5Ctest.c @@ -78,8 +78,6 @@ typedef struct { /* Local Variables */ /*******************/ - - /*------------------------------------------------------------------------- * Function: H5C__verify_cork_tag_test_cb @@ -167,3 +165,57 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* H5C__verify_cork_tag_test() */ + +/*------------------------------------------------------------------------- + * Function: H5C_set_curr_io_type_splitable() + * + * Purpose: To test the metadata entry splitting capability in the page + * buffer (needed to deal with H5FA and H5EA's unfortunate + * design choice of sub-allocating multiple metadata entries + * out of a single file space allocation), we must be able + * to configure the metadata cache to report that the + * current I/O request is for such an entry. + * + * To do this, we must set cache_ptr->curr_io_type to + * point to the instance of H5C_class_t for one such + * client. + * + * This function does this by setting cache_ptr->curr_io_type + * to H5AC_EARRAY_DBLK_PAGE if set_splitable is TRUE, and to + * NULL otherwise. + * + * Needless to say, this is purely a testing function, and + * should not be called otherwise. + * + * Return: void + * + * Programmer: John Mainzer + * 4/10/20 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ + +void +H5C_set_curr_io_type_splitable(H5C_t * cache_ptr, hbool_t set_splitable) +{ + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(cache_ptr); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + + if ( set_splitable ) { + + cache_ptr->curr_io_type = H5AC_EARRAY_DBLK_PAGE; + + } else { + + cache_ptr->curr_io_type = NULL; + } + + + FUNC_LEAVE_NOAPI_VOID + +} /* H5C_set_curr_io_type_splitable() */ + diff --git a/src/H5PB.c b/src/H5PB.c --- a/src/H5PB.c +++ b/src/H5PB.c @@ -52,9 +52,12 @@ /****************/ /* Round _x down to nearest _size. */ +/* not used at present */ +/* #ifndef rounddown #define rounddown(_x, _size) (((_x) / (_size)) * (_size)) #endif +*/ /* Round _x up to nearest _size.
*/ #ifndef roundup @@ -113,14 +116,6 @@ static herr_t H5PB__write_meta(H5F_shared_t *, H5FD_mem_t, haddr_t, static herr_t H5PB__write_raw(H5F_shared_t *, H5FD_mem_t, haddr_t, size_t, const void *); -static void metadata_section_split(size_t, haddr_t, size_t, const void *, - metadata_section_t *); - -static herr_t metadata_multipart_read(H5F_shared_t *, H5FD_mem_t, haddr_t, - size_t, void *); - -static herr_t metadata_multipart_write(H5F_shared_t *, H5FD_mem_t, haddr_t, - size_t, const void *); static void H5PB_log_access_by_size_counts(const H5PB_t *); @@ -225,6 +220,8 @@ H5PB_reset_stats(H5PB_t *pb_ptr) pb_ptr->max_dwl_len = 0; pb_ptr->max_dwl_size = 0; pb_ptr->total_dwl_ins_depth = 0; + pb_ptr->md_read_splits = 0; + pb_ptr->md_write_splits = 0; FUNC_LEAVE_NOAPI(SUCCEED) @@ -255,7 +252,13 @@ H5PB_reset_stats(H5PB_t *pb_ptr) * --bypasses: the number of metadata and raw data accesses * that bypass the page buffer layer * - * Return: Non-negative on success/Negative on failure + * TODO: The available stats have changed considerably + * since Mohamad wrote this routine. Update + * the function once things settle down. + * + * JRM -- 4/13/20 + * + * Return: Non-negative on success/Negative on failure * * Programmer: Mohamad Chaarawi * @@ -300,7 +303,9 @@ H5PB_get_stats(const H5PB_t *pb_ptr, unsigned accesses[2], unsigned hits[2], * * Programmer: John Mainzer -- 10/12/18 * - * Changes: None. + * Changes: Added support for md_read_splits and md_write_splits. + * + * JRM -- 4/11/20 * *------------------------------------------------------------------------- */ @@ -407,10 +412,14 @@ H5PB_print_stats(const H5PB_t *pb_ptr) ave_delayed_write_ins_depth = (double)(pb_ptr->total_dwl_ins_depth) / (double)(pb_ptr->delayed_writes); } + HDfprintf(stdout, "delayed writes / ave delay / ave ins depth = %lld / %llf / %llf\n", pb_ptr->delayed_writes, ave_delayed_write, ave_delayed_write_ins_depth); + HDfprintf(stdout, "metadata read / write splits = %lld / %lld.\n", + pb_ptr->md_read_splits, pb_ptr->md_write_splits); + FUNC_LEAVE_NOAPI(SUCCEED) } /* H5PB_print_stats */ @@ -447,7 +456,10 @@ H5PB_print_stats(const H5PB_t *pb_ptr) * * Programmer: John Mainzer -- 10/12/18 * - * Changes: None. + * Changes: Modified function to prevent the insertion + * of raw data pages when operating in VFD SWMR mode. + * + * JRM -- 3/25/20 * *------------------------------------------------------------------------- */ @@ -471,7 +483,8 @@ H5PB_add_new_page(H5F_shared_t *shared, H5FD_mem_t type, haddr_t page_addr) if ( H5FD_MEM_DRAW == type ) { /* raw data page insertion */ - if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) { + if ( ( pb_ptr->min_md_pages == pb_ptr->max_pages ) || + ( pb_ptr->vfd_swmr ) ) { can_insert = FALSE; @@ -517,7 +530,12 @@ done: * * Programmer: John Mainzer -- 10/11/18 * - * Changes: None. + * Changes: Added initialization for the vfd_swmr field. Also + * added code to force min_rd_pages to 0 if vfd_swmr is + * TRUE. Do this since we now exclude raw data from the + * page buffer when operating in VFD SWMR mode.
+ * + * JRM -- 3/28/20 * *------------------------------------------------------------------------- */ herr_t H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc, unsigned page_buf_min_raw_perc) { + hbool_t vfd_swmr = FALSE; hbool_t vfd_swmr_writer = FALSE; int i; int32_t min_md_pages; @@ -575,11 +594,21 @@ H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc, (int32_t)(size / shared->fs_page_size)); - /* compute vfd_swmr_writer */ - if ( ( H5F_SHARED_VFD_SWMR_CONFIG(shared) ) && ( H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR ) ) { + /* compute vfd_swmr and vfd_swmr_writer */ + if ( H5F_SHARED_VFD_SWMR_CONFIG(shared) ) { + + vfd_swmr = TRUE; + + /* force min_rd_pages to zero since raw data is excluded from + * the page buffer in VFD SWMR mode. + */ + min_rd_pages = 0; + + if ( H5F_SHARED_INTENT(shared) & H5F_ACC_RDWR ) { - HDassert(shared->vfd_swmr_config.writer); - vfd_swmr_writer = TRUE; + HDassert(shared->vfd_swmr_config.writer); + vfd_swmr_writer = TRUE; + } } @@ -629,6 +658,7 @@ H5PB_create(H5F_shared_t *shared, size_t size, unsigned page_buf_min_meta_perc, /* VFD SWMR specific fields. * The following fields are defined iff vfd_swmr_writer is TRUE. */ + pb_ptr->vfd_swmr = vfd_swmr; pb_ptr->vfd_swmr_writer = vfd_swmr_writer; pb_ptr->mpmde_count = 0; pb_ptr->cur_tick = 0; @@ -965,9 +995,11 @@ H5PB_log_access_by_size_counts(const H5PB_t *pb) * * 2) If the read is for raw data, and the page buffer is * configured for metadata only (i.e. min_md_pages == - * max_pages), simply read from the HDF5 file and return. + * max_pages), or if we are operating in VFD SWMR mode + * (i.e. vfd_swmr == TRUE), simply read from the HDF5 + * file and return. * - * 3) If the read is for raw data, and it of page size or + * 3) If the read is for raw data, and is of page size or * larger, read it directly from the HDF5 file. * * It is possible that the page buffer contains dirty pages @@ -997,17 +1029,41 @@ H5PB_log_access_by_size_counts(const H5PB_t *pb) * between small and multi-page metadata entries so that * pages containing the former will be buffered and the * latter be read directly from file. - * - * Unfortunately, the metadata cache does not always know the + * + * Unfortunately, there are several flies in the ointment. + * + * First, the fixed and extensible array on disk data + * structures allocate multiple metadata cache entries in + * a single block, and use this fact to make the addresses + * of all but the first entry in the block computable. While + * this simplifies the fixed and extensible array on disk data + * structures, it complicates the metadata cache and the page + * buffer. Needless to say, the correct solution to this + * problem is to remove the complexity at its source. However, + * for now, we must code around the problem. + * + * Thus, this function must examine each read request + * to determine if it crosses page boundaries and is not + * both page aligned and an integral number of pages in + * length. If it does, and it is one of + * the fixed or extensible array entries that is sub-allocated + * from a larger space allocation, the read request must be + * split into the minimal set of read requests that either + * don't cross page boundaries, or are page aligned and + * consist of an integral number of pages. + * + * + * Second, the metadata cache does not always know the + * size of metadata entries when it tries to read them.
In * such cases, it issues speculative reads that may be either * smaller or larger than the actual size of the piece of * metadata that is finally read. * * Since we are guaranteed that all metadata allocations larger - * that one page are page aligned, we can safely clip at the - * page boundary any non page aligned metadata read that crosses - * page boundaries. + * than one page are page aligned (with the exception of those + * sub-allocated from larger allocations -- which we deal with + * by splitting I/O requests as discussed above), we can safely + * clip at the page boundary any non page aligned metadata + * read that crosses page boundaries. * * However, page aligned reads could wind up being either * small or multi-page. This results in two scenarios that @@ -1048,15 +1104,13 @@ H5PB_log_access_by_size_counts(const H5PB_t *pb) * * 8) If the read is for metadata, is page aligned, is larger * than one page, and there is a regular entry at the target - * page address, test to see if the last read was for the - * same address. + * page address, test to see if the read is speculative. * - * If was, evict the page, and satisfy the read from file. - * Flag an error if the page was dirty. + * If it is not, evict the page, and satisfy the read from + * file. Flag an error if the page was dirty. * - * If the last read was for a different page, clip the read - * to one page, and satisfy the read from the existing - * regular entry. + * If it is, clip the read to one page, and satisfy the + * read from the existing regular entry. * * 9) If the read is for metadata, is page aligned, is larger * than one page, and there is a multi-page metadata entry * at the target page address, satisfy the read from the * multi-page metadata entry. * @@ -1091,63 +1145,337 @@ H5PB_log_access_by_size_counts(const H5PB_t *pb) * * Programmer: John Mainzer -- 10/11/18 * - * Changes: None. + * Changes: Updated for discovery of the fact that the fixed and + * extensible array data structures allocate multiple + * metadata cache entries in a single block, and thus + * violate the invariant that metadata entries either + * do not cross page boundaries, or are page aligned. + * + * JRM -- 3/28/20 * *------------------------------------------------------------------------- */ -/* TBD Add optional raw-data bypass here and at H5PB_write when we - * are operating in parallel mode.
- */ + herr_t H5PB_read(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, void *buf/*out*/) { - H5PB_t *pb_ptr; /* Page buffer for this file */ + H5PB_t *pb_ptr; /* Page buffer for this file */ + hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */ + hbool_t split_read = FALSE; /* whether the read must be split */ herr_t ret_value = SUCCEED; /* Return value */ + /* the following six fields are defined iff split_read is TRUE */ + haddr_t prefix_addr = HADDR_UNDEF; /* addr of prefix -- if defined */ + haddr_t body_addr = HADDR_UNDEF; /* addr of body -- if defined */ + haddr_t suffix_addr = HADDR_UNDEF; /* addr of suffix -- if defined */ + size_t prefix_size = 0; /* size of prefix */ + size_t body_size = 0; /* size of body */ + size_t suffix_size = 0; /* size of suffix */ + + FUNC_ENTER_NOAPI(FAIL) + /* Sanity checks */ + HDassert(shared); + hlog_fast(pbrd, "%s %p type %d %" PRIuHADDR " size %zu", __func__, (void *)shared, type, addr, size); + pb_ptr = shared->pb_ptr; if (pb_ptr != NULL && type != H5FD_MEM_DRAW) H5PB_count_meta_access_by_size(pb_ptr, size); - HDassert(pb_ptr == NULL || pb_ptr->magic == H5PB__H5PB_T_MAGIC); + if ( pb_ptr == NULL ) { - /* Bypass the page buffer in case - * 1) page buffer is disabled - * _) MPI I/O is enabled - * 2) page buffer configured for metadata only, and it's a raw-data access - * 5) page buffer configured for raw data only, and it's a metadata access - */ - if (pb_ptr == NULL || H5F_SHARED_HAS_FEATURE(shared, H5FD_FEAT_HAS_MPI) || - (H5FD_MEM_DRAW == type && pb_ptr->min_md_pages == pb_ptr->max_pages) || - (H5FD_MEM_DRAW != type && pb_ptr->min_rd_pages == pb_ptr->max_pages)) { + bypass_pb = TRUE; /* case 1) -- page buffer is disabled */ + + } else { + + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + + if ( H5FD_MEM_DRAW == type ) { /* raw data read */ + + if ( ( pb_ptr->min_md_pages == pb_ptr->max_pages ) || + ( pb_ptr->vfd_swmr ) ) { + + /* case 2) -- page buffer configured for metadata only + * or vfd swmr. + */ + bypass_pb = TRUE; + + } + } else { /* metadata read */ + + if ( pb_ptr->min_rd_pages == pb_ptr->max_pages ) { + + /* case 5) -- page buffer configured for raw data only */ + bypass_pb = TRUE; + + } else { + /* determine whether the read request must be split, + * and if so, compute the start points and sizes + * of the sections. + * + * Note: The following code is almost identical to the + * similar code in H5PB_write(). Thus, on the surface, + * it is an obvious candidate for refactoring into a + * function or macro. + * + * However, there are subtle differences between + * the two pieces of code which are driven by the + * possibility of speculative reads. + * + * More to the point, further changes may be necessary. + * Thus we should wait on refactoring until this code has + * been in daily use for some time, and it is clear + * that further changes are unlikely.
+ */ + int mdc_client_id = -1; /* id of mdc client, or -1 if undef */ + uint64_t start_page; /* page index of first page in read */ + uint64_t second_page; /* page index of second page in read */ + uint64_t end_page; /* page index of last page in read */ + uint64_t body_page; /* page index of start of body */ + haddr_t start_page_addr; /* addr of first page in read */ + haddr_t second_page_addr;/* addr of second page in read */ + haddr_t end_page_addr; /* addr of last page in read */ + haddr_t end_addr; /* addr of last byte in read */ + + /* Calculate the aligned address of the first page */ + start_page = (addr / pb_ptr->page_size); + start_page_addr = start_page * pb_ptr->page_size; + + /* Calculate the aligned address of the last page */ + end_addr = addr + (haddr_t)(size - 1); + end_page = end_addr / (haddr_t)(pb_ptr->page_size); + end_page_addr = end_page * pb_ptr->page_size; + + HDassert(start_page_addr <= addr); + HDassert(addr < start_page_addr + (haddr_t)(pb_ptr->page_size)); + + HDassert(start_page <= end_page); + HDassert(end_page_addr <= ((addr + (haddr_t)size - 1))); + HDassert((addr + (haddr_t)size - 1) < + (end_page_addr + pb_ptr->page_size)); + + /* test to see if the read crosses a page boundary, and + * does not start on a page boundary, and is not of an + * integral number of pages. + */ + if ( ( start_page < end_page ) && + ( ! ( ( addr == start_page_addr ) && + ( end_page_addr + (haddr_t)(pb_ptr->page_size) == + end_addr + 1 ) ) ) ) { + + /* the read crosses a page boundary and is not + * both page aligned and of length some multiple of + * the page size. + * + * Test to see if the read is for a metadata entry that + * is sub-allocated from a larger space allocation. + * + * Note that the following test may have to be + * adjusted. + */ + mdc_client_id = H5C_get_curr_io_client_type(shared->cache); + + if ( ( mdc_client_id == (int)H5AC_EARRAY_DBLK_PAGE_ID ) || \ + ( mdc_client_id == (int)H5AC_FARRAY_DBLK_PAGE_ID ) ) { + + split_read = TRUE; + } + } + + if ( split_read ) { + + /* compute the base addresses and length of the prefix, + * body, and suffix of the read, where these terms are + * defined as follows: + * + * prefix: All bytes from addr to the first page address + * at or after addr. If addr == start_page_addr, + * the prefix is empty. + * + * body: All bytes from the first page address covered + * by the read up to but not including the last + * page address in the read. Note that the + * length of the body must be a multiple of the + * page size. If only one page address is + * included in the read, the body is empty. + * + * suffix: All bytes from the last page address in the + * read until the end of the read. If the + * read ends on a page boundary, the suffix is + * empty. + * + * Since we know that the read crosses at least one + * page boundary, and we have already filtered out the + * body only case, at least two of the above must be + * non-empty.
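+ * + * As a concrete illustration (hypothetical numbers): with + * a 4 KiB page size, a 9216 byte read at address 6144 + * splits into a 2048 byte prefix at 6144, a 4096 byte + * body at 8192, and a 3072 byte suffix at 12288.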
+ */ + + second_page = start_page + 1; + second_page_addr = + (haddr_t)(second_page * pb_ptr->page_size); + + if ( addr > start_page_addr ) { /* prefix exists */ + + prefix_addr = addr; + prefix_size = (size_t)(second_page_addr - addr); + + HDassert(prefix_addr > start_page_addr); + HDassert(prefix_size < pb_ptr->page_size); + HDassert(((size_t)(addr - start_page_addr) + \ + prefix_size) == pb_ptr->page_size); + } + + if ( size - prefix_size >= pb_ptr->page_size ) { + + /* body exists */ + + if ( addr == start_page_addr ) { + + body_page = start_page; + body_addr = start_page_addr; + + } else { + + body_page = second_page; + body_addr = second_page_addr; + } + + if ( end_addr < end_page_addr + + (haddr_t)(pb_ptr->page_size - 1) ) { + + /* suffix exists */ + body_size = (size_t)(end_page - body_page) * + pb_ptr->page_size; + + } else { + + /* suffix is empty */ + body_size = (size_t)(end_page - body_page + 1) * + pb_ptr->page_size; + } + + HDassert((body_page == start_page) || \ + (body_page == start_page + 1)); + + HDassert(body_addr == \ + (haddr_t)(body_page * pb_ptr->page_size)); + + HDassert(body_size < size); + HDassert(body_size >= pb_ptr->page_size); + + + HDassert(body_addr == \ + addr + (haddr_t)prefix_size); + HDassert((body_addr + (haddr_t)body_size) \ + <= (end_addr + 1)); + } - if (H5FD_read(shared->lf, type, addr, size, buf) < 0) { - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, - "read through lower VFD failed"); + if ( end_addr < end_page_addr + + (haddr_t)(pb_ptr->page_size - 1) ) { + + suffix_addr = end_page_addr; + suffix_size = (end_addr + 1) - end_page_addr; + + HDassert(suffix_addr == \ + addr + (haddr_t)(prefix_size + body_size)); + } + + HDassert(size == prefix_size + body_size + suffix_size); + } + } } + } + +#ifdef H5_HAVE_PARALLEL + /* at present, the page buffer must be disabled in the parallel case. + * However, just in case ... + */ + if ( H5F_SHARED_HAS_FEATURE(shared, H5FD_FEAT_HAS_MPI) ) { + + bypass_pb = TRUE; + + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + + + if ( bypass_pb ) { /* cases 1, 2, and 5 */ + + if ( H5FD_read(shared->lf, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "read through failed") + + /* Update statistics */ + if ( pb_ptr ) { - if (pb_ptr != NULL) H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size); - HGOTO_DONE(SUCCEED); - } + } + } else { - if (H5FD_MEM_DRAW == type) { /* cases 3 and 4 */ - if (H5PB__read_raw(shared, type, addr, size, buf) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "raw read failed"); - } else if (metadata_multipart_read(shared, type, addr, size, buf) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, "meta read failed"); + if ( H5FD_MEM_DRAW == type ) { /* cases 3 and 4 */ - H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size); + if ( H5PB__read_raw(shared, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_raw() failed") + + } else if ( split_read ) { + + /* handle the sub-allocated entry case */ + + /* read prefix if it exists */ + if ( prefix_size > 0 ) { + + if ( H5PB__read_meta(shared, type, prefix_addr, + prefix_size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed on prefix") + } + + /* read body -- if it exists. */ + if ( body_size > 0 ) { + + if ( H5PB__read_meta(shared, type, body_addr, body_size, + (void *)((uint8_t *)buf + + prefix_size)) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed on body") + } + + /* read suffix -- if it exists.
*/ + if ( suffix_size > 0 ) { + + if ( H5PB__read_meta(shared, type, suffix_addr, suffix_size, + (void *)((uint8_t *)buf + prefix_size + + body_size)) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed on suffix") + } + + H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr) + + } else { /* pass to H5PB_read_meta() -- cases 6, 7, 8, 9, & 10 */ + + if ( H5PB__read_meta(shared, type, addr, size, buf) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ + "H5PB_read_meta() failed") + } + } done: + FUNC_LEAVE_NOAPI(ret_value) -} + +} /* H5PB_read() */ /* Remove the entry corresponding to lower-file page number `page`. * Return 0 if there was no such entry or if the entry was removed * @@ -1241,12 +1569,16 @@ herr_t H5PB_remove_entry(H5F_shared_t *shared, haddr_t addr) { uint64_t page; - H5PB_t *pb_ptr; + H5PB_t *pb_ptr = NULL; H5PB_entry_t *entry_ptr = NULL; - herr_t ret_value = SUCCEED; + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) + /* Sanity checks */ + HDassert(shared); + HDassert(shared->pb_ptr); + pb_ptr = shared->pb_ptr; /* Calculate the page offset */ @@ -1306,50 +1638,169 @@ done: } /* H5PB_remove_entry */ + +/*------------------------------------------------------------------------- + * + * Function: H5PB_remove_entries + * + * Purpose: Remove entries in the page buffer associated with a + * newly freed multi-page block of file space. + * + * There are several possible situations here. + * + * In the context of metadata, there are two possible cases. + * + * 1) The block of file space is associated with a metadata + * entry. + * + * In regular operating mode, this entry will not be + * cached in the page buffer, so there should be nothing + * to do. + * + * In VFD SWMR mode, the entry may be cached in a single + * multi-page entry. + * + * 2) The block of file space has been sub-allocated + * into multiple metadata entries (i.e. fixed and extensible + * array). In this case, the individual entries may cross + * boundaries without being page aligned -- however, for + * purposes of the page buffer, I/O requests on these + * entries will have been broken up into requests that + * either do not cross page boundaries or are page aligned. + * + * In the context of raw data, the page buffer may or may + * not contain regular entries scattered over the space + * touched by the newly freed file space. + * + * In all contexts, there is no guarantee that the page buffer + * will contain any of the possible entries. + * + * Space allocations larger than one page must be page aligned. + * Further, any space between the end of a multi-page allocation + * and the next page boundary will remain unallocated until after + * the original allocation is freed. This implies that: + * + * 1) The address passed into this call must be page aligned. + * + * 2) The page buffer may safely discard any page that + * intersects with the newly freed file space allocation. + * + * The bottom line here is that we must scan the page buffer + * index, and discard all entries that intersect the supplied + * address and length. As a sanity check, we must verify that + * any such entries don't overlap. + * + * Also, in the context of the VFD SWMR write, it is possible + * that the discarded pages will reside in the tick list or + * the delayed write list -- if so, they must be removed + * prior to eviction. + * + * Note: + * + * This function scans the page buffer hash table to + * find entries to remove.
While this is normally + * pretty inexpensive, a very large (i.e. GB) file + * space free may impose significant cost. + * + * As best I understand it, such frees are rare, so + * the current solution should be good enough for now. + * However, if we determine that the current solution + * is too expensive, two alternate solutions come to mind. + * + * a) Scan the index list instead of the hash table + * if the free is sufficiently large. Also, skip + * entirely if the page buffer doesn't contain any + * pages of the appropriate type. + * + * b) Whenever writing a large metadata entry, scan for + * intersecting entries and delete them. (potential + * issues with fixed and variable array entries are + * dealt with via the splitting mechanism.) In this + * case we would also have to simply ignore writes + * beyond EOA on flush or close. + * + * Note that we already scan for intersecting entries + * on large raw data writes -- with possible performance + * issues for large writes. + * + * JRM -- 4/25/20 + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer 4/25/20 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ + herr_t H5PB_remove_entries(H5F_shared_t *shared, haddr_t addr, hsize_t size) { - H5PB_t *pb_ptr; - H5PB_entry_t *entry_ptr; - herr_t ret_value = SUCCEED; - metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}}; - int i; + uint64_t i; + uint64_t start_page; + uint64_t end_page; + int64_t entry_pages = 0; + hsize_t entry_size; + H5PB_t *pb_ptr = NULL; + H5PB_entry_t *entry_ptr = NULL; + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) + /* Sanity checks */ + HDassert(shared); + HDassert(shared->pb_ptr); + pb_ptr = shared->pb_ptr; - HDassert(addr % pb_ptr->page_size == 0); + /* Calculate the start_page offset */ + start_page = (addr / pb_ptr->page_size); - if (size > pb_ptr->page_size) { - hlog_fast(pbrm, - "removing multipage region [%" PRIuHADDR ", %" PRIuHADDR ")", - addr, addr + size); - } + HDassert(addr == start_page * pb_ptr->page_size); - metadata_section_split(pb_ptr->page_size, addr, size, NULL, section); + /* Calculate the end_page offset */ + end_page = ((addr + (haddr_t)(size - 1)) / pb_ptr->page_size); - for (i = 0; i < 3; i++) { - metadata_section_t *iter = &section[i]; + HDassert(start_page <= end_page); + HDassert(((end_page - start_page) * pb_ptr->page_size) <= size); + HDassert(size <= ((end_page - start_page + 1) * pb_ptr->page_size)); + + for ( i = start_page; i <= end_page; i++ ) + { + /* test to see if page i exists */ + H5PB__SEARCH_INDEX(pb_ptr, i, entry_ptr, FAIL) - if (iter->len == 0) - continue; + if ( entry_ptr ) { - if (iter->len < size) { - hlog_fast(pbrm, "removing entry [%" PRIuHADDR ", %" PRIuHADDR ") " - "for split region [%" PRIuHADDR ", %" PRIuHADDR ")", - iter->addr, iter->addr + iter->len, addr, addr + size); - } + /* verify that this entry doesn't overlap with a previously + * visited entry.
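+ * + * (For example, a three page entry found at page i sets + * entry_pages to 3; the per-pass decrement at the bottom of + * the loop returns it to zero by page i + 3, so the assertion + * below fails exactly when a second entry begins inside the + * pages spanned by the first.)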
+ */ + HDassert(entry_pages <= 0); - assert(iter->addr % pb_ptr->page_size == 0); + entry_size = entry_ptr->size; + entry_pages = (int64_t)(entry_size / pb_ptr->page_size); - if (H5PB_remove_entry(shared, iter->addr) < 0) - HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "forced eviction failed") + if ( (uint64_t)entry_pages * pb_ptr->page_size < entry_size ) { + + entry_pages++; + } + + /* remove the entry */ + if ( H5PB_remove_entry(shared, entry_ptr->addr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "H5PB_remove_entry() failed") + + } + entry_pages--; } done: + FUNC_LEAVE_NOAPI(ret_value) -} + +} /* H5PB_remove_entries() */ /*------------------------------------------------------------------------- @@ -1749,9 +2200,9 @@ done: * *------------------------------------------------------------------------- */ -herr_t -H5PB_vfd_swmr__update_index(H5F_t *f, - uint32_t * idx_ent_added_ptr, +herr_t +H5PB_vfd_swmr__update_index(H5F_t *f, + uint32_t * idx_ent_added_ptr, uint32_t * idx_ent_modified_ptr, uint32_t * idx_ent_not_in_tl_ptr, uint32_t * idx_ent_not_in_tl_flushed_ptr) @@ -1777,7 +2228,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f, idx = shared->mdf_idx; HDassert(idx); - + pb_ptr = shared->pb_ptr; HDassert(pb_ptr); @@ -1806,7 +2257,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f, if ( ie_ptr == NULL ) { /* alloc new entry in the metadata file index*/ uint32_t new_index_entry_index; - new_index_entry_index = shared->mdf_idx_entries_used + + new_index_entry_index = shared->mdf_idx_entries_used + idx_ent_added++; if (new_index_entry_index >= shared->mdf_idx_len && @@ -1859,7 +2310,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f, ie_ptr->tick_of_last_flush = 0; } - /* scan the metadata file index for entries that don't appear in the + /* scan the metadata file index for entries that don't appear in the * tick list. If the index entry is dirty, and either doesn't appear * in the page buffer, or is clean in the page buffer, mark the index * entry clean and as having been flushed in the current tick. @@ -1891,7 +2342,7 @@ H5PB_vfd_swmr__update_index(H5F_t *f, } } - HDassert(idx_ent_modified + idx_ent_not_in_tl == + HDassert(idx_ent_modified + idx_ent_not_in_tl == shared->mdf_idx_entries_used); HDassert(idx_ent_modified + idx_ent_not_in_tl + idx_ent_added <= @@ -1903,8 +2354,10 @@ H5PB_vfd_swmr__update_index(H5F_t *f, *idx_ent_not_in_tl_flushed_ptr = idx_ent_not_in_tl_flushed; done: + FUNC_LEAVE_NOAPI(ret_value) -} + +} /* H5PB_vfd_swmr__update_index() */ /*------------------------------------------------------------------------- @@ -1919,9 +2372,10 @@ done: * * 2) If the write is raw data, and the page buffer is * configured for metadata only (i.e. min_md_pages == - * max_pages), simply write to the HDF5 file and return. + * max_pages), or if the page buffer is operating in + * vfd_swmr mode, simply write to the HDF5 file and return. * - * 3) If the write is raw data, and it of page size or + * 3) If the write is raw data, and is of page size or * larger, write it directly to the HDF5 file. * * It is possible that the write intersects one or more * ... * 5) If the write is of metadata, and the page buffer is * configured for raw data only (i.e. min_rd_pages == * max_pages), simply write to the HDF5 file and return. * + * The free space manager guarantees that allocations larger + * than one page will be page aligned, and that allocations + * of size less than or equal to page size will not cross page + * boundaries. Further, unlike raw data, metadata is always + * written and read atomically.
+ * + * In principle, this should make it easy to discriminate + * between small and multi-page metadata entries so that + * pages containing the former will be buffered and the + * latter be written directly to file. + * + * Unfortunately, there is a fly in the ointment. + * + * The fixed and extensible array on disk data + * structures allocate multiple metadata cache entries in + * a single block, and use this fact to make the addresses + * of all but the first entry in the block computable. While + * this simplifies the fixed and extensible array on disk data + * structures, it complicates the metadata cache and the page + * buffer. + * + * From the page buffer perspective, it breaks the invariant + * that metadata entries of less than page size don't cross + * page boundaries, and those of size greater than or equal + * to page size start on page boundaries -- which is important + * for VFD SWMR as it allows efficient management of multi-page + * metadata entries. + * + * While it is tempting to repair the fixed and extensible + * array data structures so as to remove this irregularity, + * and remove the resulting complexity from both the metadata + * cache and the page buffer, this is a ticklish task, as there + * are already files in the wild that use the existing versions + * of these data structures. Thus, due to resource constraints, + * we have to program around the issue for now. + * + * Fortunately, for purposes of the page buffer, this is + * relatively easy -- when we encounter a metadata write + * that crosses one or more page boundaries, and is not + * both page aligned and an integral number of pages, we + * query the metadata cache to determine the type of the + * client whose data is being written. If it is one of the + * misbehaving types, we split it into two or three writes + * such that each write either doesn't cross page boundaries, + * or is page aligned and an integral number of pages. + * + * This is done in this function, and is not reflected in + * the case analysis in the rest of this comment. + * + * 6) If the write is of metadata, the write is larger than - * one page, and vfd_swmr_writer is FALSE, simply read - * from the HDF5 file. There is no need to check the + * one page, and vfd_swmr_writer is FALSE, simply write + * to the HDF5 file. There is no need to check the * page buffer, as metadata is always read atomically, * and entries of this size are not buffered in the page * buffer. * + * Observe that this write must be page aligned. This + * should be enforced by the free space manager, but + * for now it is enforced by the above mentioned practice + * of splitting writes from cache clients that don't + * allocate each entry separately. + * * 7) If the write is of metadata, the write is larger than * one page, and vfd_swmr_writer is TRUE, the write must * be buffered in the page buffer until the end of the tick. @@ -1980,7 +2489,17 @@ done: * * Programmer: John Mainzer -- 10/11/18 * - * Changes: None. + * Changes: Updated to support splitting of metadata writes that + * are not page aligned and cross page boundaries into + * 2 or 3 writes that are either page aligned or do not + * cross page boundaries. Full details in the header + * comment above, which has been updated to document + * this change. + * + * Also updated case 2 to bypass the page buffer for raw + * data writes in vfd swmr mode.
+ * + * JRM -- 4/5/20 * *------------------------------------------------------------------------- */ herr_t H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, const void *buf) { - H5PB_t *pb_ptr; /* Page buffer for this file */ + H5PB_t *pb_ptr; /* Page buffer for this file */ hbool_t bypass_pb = FALSE; /* Whether to bypass page buffering */ + hbool_t split_write = FALSE; /* whether md write must be split */ herr_t ret_value = SUCCEED; /* Return value */ + /* the following six fields are defined iff split_write is TRUE */ + haddr_t prefix_addr = HADDR_UNDEF; /* addr of prefix -- if defined */ + haddr_t body_addr = HADDR_UNDEF; /* addr of body -- if defined */ + haddr_t suffix_addr = HADDR_UNDEF; /* addr of suffix -- if defined */ + size_t prefix_size = 0; /* size of prefix */ + size_t body_size = 0; /* size of body */ + size_t suffix_size = 0; /* size of suffix */ + FUNC_ENTER_NOAPI(FAIL) hlog_fast(pbwr, "%s %p type %d addr %" PRIuHADDR " size %zu", @@ -2012,7 +2540,8 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, if ( H5FD_MEM_DRAW == type ) { /* raw data write */ - if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) { + if ( ( pb_ptr->min_md_pages == pb_ptr->max_pages ) || + ( pb_ptr->vfd_swmr ) ) { /* case 2) -- page buffer configured for metadata only, or vfd swmr */ bypass_pb = TRUE; @@ -2025,13 +2554,207 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size, /* case 5) -- page buffer configured for raw data only */ bypass_pb = TRUE; - } else if ( ( size >= pb_ptr->page_size ) && - ( ! ( pb_ptr->vfd_swmr_writer ) ) ) { + } else { - /* case 6) -- md read larger than one page and - * pb_ptr->vfd_swmr_writer is FALSE. + /* determine whether the write request must be split, + * and if so, compute the start points and sizes + * of the sections. + * + * Note: The following code is almost identical to the + * similar code in H5PB_read(). Thus, on the surface, + * it is an obvious candidate for refactoring into a + * function or macro. + * + * However, there are subtle differences between + * the two pieces of code which are driven by the + * possibility of speculative reads. + * + * More to the point, further changes may be necessary. + * Thus we should wait on refactoring until this code has + * been in daily use for some time, and it is clear + * that further changes are unlikely.
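+ * + * As a concrete illustration (hypothetical numbers): with + * a 4 KiB page size, a 7168 byte write at address 5120 + * splits into a 3072 byte prefix at 5120 and a 4096 byte + * page aligned body at 8192, with no suffix.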
*/ - bypass_pb = TRUE; + int mdc_client_id = -1; /* id of mdc client, or -1 if undef */ + uint64_t start_page; /* page index of first page in write */ + uint64_t second_page; /* page index of second page in write */ + uint64_t end_page; /* page index of last page in write */ + uint64_t body_page; /* page index of start of body */ + haddr_t start_page_addr; /* addr of first page in write */ + haddr_t second_page_addr;/* addr of second page in write */ + haddr_t end_page_addr; /* addr of last page in write */ + haddr_t end_addr; /* addr of last byte in write */ + + /* Calculate the aligned address of the first page */ + start_page = (addr / pb_ptr->page_size); + start_page_addr = start_page * pb_ptr->page_size; + + /* Calculate the aligned address of the last page */ + end_addr = addr + (haddr_t)(size - 1); + end_page = end_addr / (haddr_t)(pb_ptr->page_size); + end_page_addr = end_page * pb_ptr->page_size; + + HDassert(start_page_addr <= addr); + HDassert(addr < start_page_addr + (haddr_t)(pb_ptr->page_size)); + + HDassert(start_page <= end_page); + HDassert(end_page_addr <= ((addr + (haddr_t)size - 1))); + HDassert((addr + (haddr_t)size - 1) < + (end_page_addr + pb_ptr->page_size)); + + /* test to see if the write crosses a page boundary, and + * does not start on a page boundary, and is not of an + * integral number of pages. + */ + if ( ( start_page < end_page ) && + ( ! ( ( addr == start_page_addr ) && + ( end_page_addr + (haddr_t)(pb_ptr->page_size) == + end_addr + 1 ) ) ) ) { + + /* the write crosses a page boundary and is not + * both page aligned and of length some multiple of + * the page size. + * + * Test to see if the write is for a metadata entry that + * is sub-allocated from a larger space allocation. + * + * Note that the following test may have to be + * adjusted. + */ + mdc_client_id = H5C_get_curr_io_client_type(shared->cache); + + if ( ( mdc_client_id == (int)H5AC_EARRAY_DBLK_PAGE_ID ) || \ + ( mdc_client_id == (int)H5AC_FARRAY_DBLK_PAGE_ID ) ) { + + split_write = TRUE; + + } else { + + HDassert(addr == start_page_addr); + HDassert(size > pb_ptr->page_size); + + if ( ! pb_ptr->vfd_swmr_writer ) { + + /* case 6) -- multi-page entry with fixed / + * extensible array filtered out, and + * no VFD SWMR. + */ + bypass_pb = TRUE; + } + } + } else if ( ( size > pb_ptr->page_size ) && + ( ! pb_ptr->vfd_swmr_writer ) ) { + + /* write is larger than page size and we are not + * in VFD SWMR mode -- bypass the page buffer. + * This is also case 6. We catch it here as + * the code to determine whether to split only + * looks at I/O requests that cross page boundaries + * and are not both page aligned and an integral + * number of pages in length. + */ + HDassert(start_page_addr == addr); + bypass_pb = TRUE; + } + + if ( split_write ) { + + /* compute the base addresses and length of the prefix, + * body, and suffix of the write, where these terms are + * defined as follows: + * + * prefix: All bytes from addr to the first page address + * at or after addr. If addr == start_page_addr, + * the prefix is empty. + * + * body: All bytes from the first page address covered + * by the write up to but not including the last + * page address in the write. Note that the + * length of the body must be a multiple of the + * page size. If only one page address is + * included in the write, the body is empty. + * + * suffix: All bytes from the last page address in the + * write until the end of the write. If the + * write ends on a page boundary, the suffix is + * empty.
+            if ( split_write ) {
+
+                /* compute the base addresses and length of the prefix,
+                 * body, and suffix of the write, where these terms are
+                 * defined as follows:
+                 *
+                 * prefix: All bytes from addr to the first page address
+                 *         at or after addr.  If addr == start_page_addr,
+                 *         the prefix is empty.
+                 *
+                 * body:  All bytes from the first page address covered
+                 *        by the write up to but not including the last
+                 *        page address in the write.  Note that the
+                 *        length of the body must be a multiple of the
+                 *        page size.  If only one page address is
+                 *        included in the write, the body is empty.
+                 *
+                 * suffix: All bytes from the last page address in the
+                 *         write until the end of the write.  If the
+                 *         write ends on a page boundary, the suffix is
+                 *         empty.
+                 *
+                 * Since we know that the write crosses at least one
+                 * page boundary, and we have already filtered out the
+                 * body-only case, at least two of the above must be
+                 * non-empty.
+                 */
+
+                second_page = start_page + 1;
+                second_page_addr =
+                        (haddr_t)(second_page * pb_ptr->page_size);
+
+                if ( addr > start_page_addr ) { /* prefix exists */
+
+                    prefix_addr = addr;
+                    prefix_size = (size_t)(second_page_addr - addr);
+
+                    HDassert(prefix_addr > start_page_addr);
+                    HDassert(prefix_size < pb_ptr->page_size);
+                    HDassert(((size_t)(addr - start_page_addr) + \
+                              prefix_size) == pb_ptr->page_size);
+                }
+
+                if ( size - prefix_size >= pb_ptr->page_size ) {
+
+                    /* body exists */
+
+                    if ( addr == start_page_addr ) {
+
+                        body_page = start_page;
+                        body_addr = start_page_addr;
+
+                    } else {
+
+                        body_page = second_page;
+                        body_addr = second_page_addr;
+                    }
+
+                    if ( end_addr < end_page_addr +
+                                    (haddr_t)(pb_ptr->page_size - 1) ) {
+
+                        /* suffix exists */
+                        body_size = (size_t)(end_page - body_page) *
+                                    pb_ptr->page_size;
+
+                    } else {
+
+                        /* suffix is empty */
+                        body_size = (size_t)(end_page - body_page + 1) *
+                                    pb_ptr->page_size;
+                    }
+
+                    HDassert((body_page == start_page) || \
+                             (body_page == start_page + 1));
+
+                    HDassert(body_addr == \
+                             (haddr_t)(body_page * pb_ptr->page_size));
+
+                    HDassert(body_size < size);
+                    HDassert(body_size >= pb_ptr->page_size);
+
+                    HDassert(body_addr == \
+                             addr + (haddr_t)prefix_size);
+                    HDassert((body_addr + (haddr_t)body_size) \
+                             <= (end_addr + 1));
+                }
+
+                if ( end_addr < end_page_addr +
+                                (haddr_t)(pb_ptr->page_size - 1) ) {
+
+                    suffix_addr = end_page_addr;
+                    suffix_size = (end_addr + 1) - end_page_addr;
+
+                    HDassert(suffix_addr == \
+                             addr + (haddr_t)(prefix_size + body_size));
+                }
+
+                HDassert(size == prefix_size + body_size + suffix_size);
+            }
         }
     }
 }
 
@@ -2047,6 +2770,7 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
     } /* end if */
 #endif /* H5_HAVE_PARALLEL */
 
+
     if ( bypass_pb ) { /* cases 1, 2, 5, and 6 */
 
         if ( H5FD_write(shared->lf, type, addr, size, buf) < 0 )
 
@@ -2068,15 +2792,84 @@ H5PB_write(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
             HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
                         "H5PB__write_raw() failed")
 
+    } else if ( split_write ) {
+
+        /* handle the sub-allocated entry case */
+
+        /* write prefix if it exists */
+        if ( prefix_size > 0 ) {
+
+            if ( H5PB__write_meta(shared, type, addr,
+                                  prefix_size, buf) < 0 )
+
+                HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+                            "H5PB__write_meta() failed on prefix")
+        }
+
+        /* write the body if it exists */
+        if ( body_size > 0 ) {
+
+            /* The "body_size == pb_ptr->page_size" clause in the
+             * following if is required since in normal operating
+             * mode, the page buffer buffers metadata I/O
+             * requests of page size or less.
+             *
+             * Thus this clause ensures that a single page body
+             * does not bypass the page buffer, which would create
+             * the potential for an older version to shadow the
+             * most recent version.
+             *
+             * Note: The page buffer really shouldn't buffer page
+             * aligned single page metadata I/O requests, as it
+             * creates extra overhead to no purpose.  However,
+             * fixing this is a bit tricky, and the case doesn't
+             * appear to be common.  Thus, while it should be
+             * fixed, I don't think it is urgent.
+             *
+             *                                    JRM 4/19/20
+             */
+            if ( ( pb_ptr->vfd_swmr ) ||
+                 ( body_size == pb_ptr->page_size ) ) {
+
+                if ( H5PB__write_meta(shared, type, body_addr, body_size,
+                                      (const void *)((const uint8_t *)buf +
+                                                     prefix_size)) < 0 )
+
+                    HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+                                "H5PB__write_meta() failed on body")
+
+            } else {
+
+                if ( H5FD_write(shared->lf, type, body_addr, body_size,
+                                (const void *)((const uint8_t *)buf +
+                                               prefix_size)) < 0 )
+
+                    HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+                                "write through of body failed")
+
+                H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size);
+            }
+        }
+
+        /* write the suffix if it exists */
+        if ( suffix_size > 0 ) {
+
+            if ( H5PB__write_meta(shared, type, suffix_addr, suffix_size,
+                                  (const void *)((const uint8_t *)buf +
+                                                 prefix_size + body_size)) < 0 )
+
+                HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
+                            "H5PB__write_meta() failed on suffix")
+        }
+
+        H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr)
+
     } else { /* cases 7 and 8 */
 
-        if ( metadata_multipart_write(shared, type, addr, size, buf) < 0 )
+        if ( H5PB__write_meta(shared, type, addr, size, buf) < 0 )
 
             HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \
-                        "H5PB_read_meta() failed")
+                        "H5PB__write_meta() failed")
     }
-
-    H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
 }
 
done:
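The dispatch just shown routes the three sections differently, and the body's route is the only subtle one.  The following is an editorial restatement of that rule, not H5PB API; "buffered" stands for the H5PB__write_meta() path and "direct" for the H5FD_write() path.

#include <stdbool.h>
#include <stdint.h>

typedef enum { SECTION_BUFFERED, SECTION_DIRECT } section_route_t;

static section_route_t
route_split_body(bool vfd_swmr, uint64_t body_size, uint64_t page_size)
{
    /* A one-page body must stay in the page buffer: in normal mode
     * the buffer caches metadata I/O of page size or less, so a
     * bypassed single page could later be shadowed by a stale cached
     * copy.  In VFD SWMR mode all metadata goes through the buffer so
     * writes can be delayed and shadowed as the tick machinery
     * requires.  Multi-page bodies outside VFD SWMR go direct.
     */
    return (vfd_swmr || body_size == page_size) ? SECTION_BUFFERED
                                                : SECTION_DIRECT;
}

static section_route_t
route_split_prefix_or_suffix(void)
{
    /* Prefix and suffix are sub-page by construction and are
     * therefore always buffered writes.
     */
    return SECTION_BUFFERED;
}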
@@ -3071,118 +3864,6 @@ done:
 
 } /* H5PB__mark_entry_dirty() */
 
-static void
-metadata_section_split(size_t pgsz, haddr_t addr, size_t len, const void *_buf,
-    metadata_section_t *section)
-{
-    int i;
-    size_t totlen = 0;
-    haddr_t whole_pgaddr, tail_pgaddr;
-    const char *buf = _buf;
-    metadata_section_t *head = &section[0], *middle = &section[1],
-        *tail = &section[2];
-
-    /* Try to find the address of the first whole page, and the address of
-     * the page after the last whole page.
-     */
-    whole_pgaddr = roundup(addr, pgsz);
-    tail_pgaddr = rounddown(addr + len, pgsz);
-
-    /* In the degenerate case where the first whole page is "after" the last,
-     * actually the entire access lands between page boundaries.
-     */
-    if (whole_pgaddr > tail_pgaddr) {
-        assert(len < pgsz);
-        head->addr = addr;
-        head->len = len;
-        head->buf = buf;
-        return;
-    }
-
-    /* `head` spans any range beginning before the first page boundary. */
-    if (addr < whole_pgaddr) {
-        head->buf = buf;
-        head->len = pgsz - addr % pgsz;
-        head->addr = addr;
-    }
-
-    /* `middle` spans one or more whole pages in between the end of
-     * `head` and before the beginning of `tail`.
-     */
-    if (whole_pgaddr < tail_pgaddr) {
-        middle->buf = (buf == NULL) ? NULL : &buf[whole_pgaddr - addr];
-        middle->len = tail_pgaddr - whole_pgaddr;
-        middle->addr = whole_pgaddr;
-    }
-
-    /* `tail` spans residual bytes that follow the last page boundary. */
-    if (tail_pgaddr < addr + len) {
-        tail->len = (addr + len) - tail_pgaddr;
-        tail->buf = (buf == NULL) ? NULL : &buf[tail_pgaddr - addr];
-        tail->addr = tail_pgaddr;
-    }
-
-    for (i = 0; i < 3; i++) {
-        metadata_section_t *iter = &section[i];
-        if (iter->len == 0)
-            continue;
-        assert(iter->addr == addr + totlen);
-        assert(iter->buf == ((buf == NULL) ? NULL : &buf[totlen]));
-//        assert(i == 0 || iter[-1].buf + iter[-1].len == iter->buf);
-        totlen += iter->len;
-    }
-
-    assert(totlen == len);
-}
-
-static herr_t
-metadata_multipart_read(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr,
-    size_t len, void *_buf/*out*/)
-{
-    herr_t rc;
-    int i;
-    const size_t pgsz = shared->pb_ptr->page_size;
-    metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}};
-
-    metadata_section_split(pgsz, addr, len, _buf, section);
-
-    for (i = 0; i < 3; i++) {
-        metadata_section_t *iter = &section[i];
-        if (iter->buf == NULL)
-            continue;
-        rc = H5PB__read_meta(shared, type, iter->addr, iter->len,
-            (void *)(uintptr_t)iter->buf);
-        if (rc < 0)
-            return rc;
-    }
-
-    return SUCCEED;
-}
-
-static herr_t
-metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
-    haddr_t addr, size_t len, const void *_buf/*out*/)
-{
-    herr_t rc;
-    int i;
-    const size_t pgsz = shared->pb_ptr->page_size;
-    metadata_section_t section[3] = {{0, 0, NULL}, {0, 0, NULL}, {0, 0, NULL}};
-
-    metadata_section_split(pgsz, addr, len, _buf, section);
-
-    for (i = 0; i < 3; i++) {
-        metadata_section_t *iter = &section[i];
-
-        if (iter->buf == NULL)
-            continue;
-        rc = H5PB__write_meta(shared, type, iter->addr, iter->len, iter->buf);
-        if (rc < 0)
-            return rc;
-    }
-
-    return SUCCEED;
-}
-
 
 /*-------------------------------------------------------------------------
 *
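The removed metadata_section_split() above and the new inline code in H5PB_write() compute the same decomposition.  For concreteness, here is an editorial, self-contained re-derivation that checks one example by hand; split_request and its output are illustrative names only.  With a 4096-byte page, a 10000-byte write at address 4000 yields a 96-byte prefix at 4000, an 8192-byte body at 4096, and a 1712-byte suffix at 12288.

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

/* Split an I/O request [addr, addr + size) into the prefix / body /
 * suffix sections defined in the H5PB_write() comment.  A size of
 * zero means the section is empty.  Assumes the request crosses at
 * least one page boundary, as the split path does.
 */
static void
split_request(uint64_t page_size, uint64_t addr, uint64_t size,
              uint64_t *prefix_size, uint64_t *body_size,
              uint64_t *suffix_size)
{
    uint64_t start_page_addr = (addr / page_size) * page_size;
    uint64_t end_addr = addr + size - 1;
    uint64_t end_page_addr = (end_addr / page_size) * page_size;

    assert(start_page_addr < end_page_addr); /* crosses a boundary */

    *prefix_size = (addr == start_page_addr)
        ? 0 : (start_page_addr + page_size) - addr;
    *suffix_size = ((end_addr + 1) % page_size == 0)
        ? 0 : (end_addr + 1) - end_page_addr;
    *body_size = size - *prefix_size - *suffix_size;

    /* the body is always an integral (possibly zero) number of pages */
    assert(*body_size % page_size == 0);
}

int
main(void)
{
    uint64_t prefix, body, suffix;

    split_request(4096, 4000, 10000, &prefix, &body, &suffix);
    printf("prefix %" PRIu64 " body %" PRIu64 " suffix %" PRIu64 "\n",
           prefix, body, suffix);
    assert(prefix == 96 && body == 8192 && suffix == 1712);

    return 0;
}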
@@ -3198,21 +3879,25 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
 *          existing page, it must not be a multi-page metadata
 *          entry.  If it is, flag an error.
 *
+*          Recall that by the time we get to this function,
+*          un-aligned page reads from the fixed and extensible
+*          array structures that cross page boundaries
+*          have already been split into two or three reads
+*          that conform to the usual pattern of metadata reads.
+*
 *       7) If the read is for metadata, is page aligned, is larger
 *          than one page, and there is no entry in the page buffer,
 *          satisfy the read from the file
 *
 *       8) If the read is for metadata, is page aligned, is larger
 *          than one page, and there is a regular entry at the target
-*          page address, test to see if the last read was for the
-*          same address.
+*          page address, test to see if the read is speculative.
 *
-*          If it was, evict the page, and satisfy the read from file.
-*          Flag an error if the page was dirty.
+*          If it is not, evict the page, and satisfy the read from
+*          file.  Flag an error if the page was dirty.
 *
-*          If the last read was for a different page, clip the read
-*          to one page, and satisfy the read from the existing
-*          regular entry.
+*          If it is, clip the read to one page, and satisfy the
+*          read from the existing regular entry.
 *
 *       9) If the read is for metadata, is page aligned, is larger
 *          than one page, and there is a multi-page metadata entry
@@ -3244,7 +3929,7 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
 *
 *       P/A  == page aligned
 *       size > PL == size > page length
-*       PA   == previous address
+*       Spec == speculative read
 *       A    == current address
 *
 *       In the entry exists column:
@@ -3254,7 +3939,7 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
 *       MPMDE == multi-page metadata entry
 *
 *            | size | entry  | VFD  |         |
-*       P/A: | > PL | exists | SWMR | PA == A | Comments:
+*       P/A: | > PL | exists | SWMR |  Spec   | Comments:
 * ------+------+--------+------+---------+-------------------------------------
 *   N   |  X   | N || R |  X   |    X    | Clip read to page boundary if
 *       |      |        |      |         | necessary
@@ -3267,10 +3952,10 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
 * ------+------+--------+------+---------+-------------------------------------
 *   Y   |  Y   |   N    |  X   |    X    | Satisfy read from file (case 7)
 * ------+------+--------+------+---------+-------------------------------------
-*   Y   |  Y   |   R    |  X   |    N    | Clip read to page boundary
 *       |      |        |      |         | Satisfy read from entry (case 8)
 * ------+------+--------+------+---------+-------------------------------------
-*   Y   |  Y   |   R    |  X   |    Y    | Evict entry
+*   Y   |  Y   |   R    |  X   |    Y    | Clip read to page boundary
+*       |      |        |      |         | Satisfy read from entry (case 8)
+* ------+------+--------+------+---------+-------------------------------------
+*   Y   |  Y   |   R    |  X   |    N    | Evict entry
 *       |      |        |      |         | (must be clean -- flag error if not)
 *       |      |        |      |         | Satisfy read from file (case 8)
 * ------+------+--------+------+---------+-------------------------------------
@@ -3308,20 +3993,25 @@ metadata_multipart_write(H5F_shared_t *shared, H5FD_mem_t type,
 *
 * Programmer:  John Mainzer -- 10/11/18
 *
-* Changes:     None.
+* Changes:     Updated to use the speculative read hint from the
+*              metadata cache, and removed the static variable
+*              containing the base address of the last read.
+*
+*                                           JRM -- 4/5/20
 *
 *-------------------------------------------------------------------------
 */
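The table reduces to a small decision function for the large, page-aligned metadata read.  The sketch below is editorial and covers cases 7 and 8 only (case 9, the multi-page metadata entry, arises only for the VFD SWMR writer); the type and function names are invented for illustration and are not part of H5PB.

#include <stdbool.h>

typedef enum {
    READ_FROM_FILE,        /* bypass the page buffer              */
    EVICT_THEN_READ_FILE,  /* cached page must be clean           */
    CLIP_AND_READ_ENTRY    /* clip to one page, use cached page   */
} md_read_action_t;

static md_read_action_t
large_aligned_md_read(bool regular_entry_exists, bool speculative)
{
    if (!regular_entry_exists)
        return READ_FROM_FILE;                    /* case 7 */

    /* case 8: a non-speculative oversized read means the caller
     * knows the entry really is larger than a page, so the cached
     * single page cannot satisfy it and is evicted; a speculative
     * read is clipped to one page and served from the cached entry.
     */
    return speculative ? CLIP_AND_READ_ENTRY
                       : EVICT_THEN_READ_FILE;    /* case 8 */
}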
 static herr_t
-H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
-    void *buf/*out*/)
+H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr,
+    size_t size, void *buf/*out*/)
 {
+    hbool_t bypass = FALSE;            /* flag indicating PB bypassed */
+    hbool_t speculative = FALSE;       /* speculative read hint from mdc */
     H5PB_t *pb_ptr;                    /* Page buffer for this file */
     H5PB_entry_t *entry_ptr;           /* Pointer to page buffer entry */
     H5FD_t *file;                      /* File driver pointer */
     uint64_t page;                     /* page offset of addr */
     haddr_t page_addr;                 /* page containing addr */
-    static haddr_t prev_addr = HADDR_UNDEF; /* addr of last call */
     size_t offset;                     /* offset of read in page */
     size_t clipped_size;               /* possibly clipped size */
     herr_t ret_value = SUCCEED;        /* Return value */
@@ -3380,7 +4070,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
                                 TRUE, FALSE)
 
         if ( ( NULL == entry_ptr ) &&
-             ( H5PB__load_page(shared, pb_ptr, page_addr, type, &entry_ptr) < 0 ) )
+             ( H5PB__load_page(shared, pb_ptr, page_addr,
+                               type, &entry_ptr) < 0 ) )
 
             HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
                         "page buffer page load request failed (1)")
@@ -3405,7 +4096,7 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
 
         HDassert( page_addr == addr );
 
-        if ( size >= pb_ptr->page_size ) {
+        if ( size > pb_ptr->page_size ) {
 
             /* search the page buffer for an entry at page */
             H5PB__SEARCH_INDEX(pb_ptr, page, entry_ptr, FAIL)
@@ -3414,10 +4105,11 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
 
             if ( entry_ptr == NULL ) { /* case 7 */
 
                 /* update hit rate stats */
-                H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, FALSE, TRUE, size > pb_ptr->page_size)
+                H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, FALSE, \
+                                               TRUE, size > pb_ptr->page_size)
 
-                /* If the read is for metadata, is page aligned, is larger
-                 * than one page, and there is no entry in the page buffer,
+                /* If the read is for metadata, is page aligned, is larger
+                 * than page size, and there is no entry in the page buffer,
                  * satisfy the read from the file
                  */
                 if ( H5FD_read(file, type, addr, size, buf) < 0)
 
                     HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
                                 "driver read request failed (1)")
 
+                bypass = TRUE;
+
                 H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size);
+
             } else {
 
                 HDassert( entry_ptr );
@@ -3436,28 +4131,29 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
 
                 /* If the read is for metadata, is page aligned, is larger
                  * than one page, and there is a regular entry at the target
-                 * page address, test to see if the last read was for the
-                 * same address.
+                 * page address, test to see if the read is speculative.
                  *
-                 * If it was, evict the page, and satisfy the read from
+                 * If it is not, evict the page, and satisfy the read from
                  * file.  Flag an error if the page was dirty.
                  *
-                 * If the last read was for a different page, clip the read
-                 * to one page, and satisfy the read from the existing
-                 * regular entry.
+                 * If it is, clip the read to one page, and satisfy
+                 * the read from the existing regular entry.
                  */
                 HDassert( entry_ptr->size == pb_ptr->page_size );
 
-                if ( addr == prev_addr ) {
+                speculative = H5C_get_curr_read_speculative(shared->cache);
+
+                if ( ! speculative ) {
 
-                    /* since this is a second try, don't update
+                    /* since this is likely a second try, don't update
                      * hit rate stats.
                      */
                     HDassert( ! ( entry_ptr->is_dirty ) );
 
-                    if (H5PB__evict_entry(shared, entry_ptr, TRUE, false) < 0)
+                    if ( H5PB__evict_entry(shared, entry_ptr,
+                                           TRUE, false) < 0 )
 
                        HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
                                    "forced eviction failed (1)")
@@ -3466,7 +4162,9 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
                        HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
                                    "driver read request failed (2)")
 
+                    bypass = TRUE;
                     H5PB__UPDATE_STATS_FOR_BYPASS(pb_ptr, type, size);
+
                 } else {
 
                     HDassert( entry_ptr->image_ptr );
@@ -3486,7 +4184,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
                     }
 
                     /* update hit rate stats */
-                    H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, TRUE, TRUE, FALSE)
+                    H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, TRUE, \
+                                                   TRUE, FALSE)
                 }
 
             } else { /* case 9 */
@@ -3556,7 +4255,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
                             TRUE, FALSE)
 
         if ( ( NULL == entry_ptr ) &&
-             ( H5PB__load_page(shared, pb_ptr, page_addr, type, &entry_ptr) < 0))
+             ( H5PB__load_page(shared, pb_ptr, page_addr,
+                               type, &entry_ptr) < 0))
 
             HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \
                         "page buffer page load request failed (2)")
@@ -3579,7 +4279,8 @@ H5PB__read_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
         }
     }
 
-    prev_addr = addr;
+    if ( ! bypass )
+        H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
 
done:
@@ -3877,6 +4578,8 @@ H5PB__read_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
         }
     } /* end else */
 
+    H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
+
done:
 
     FUNC_LEAVE_NOAPI(ret_value)
@@ -4120,6 +4823,8 @@ H5PB__write_meta(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr,
         H5PB__INSERT_IN_TL(pb_ptr, entry_ptr, FAIL)
     }
 
+    H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
+
done:
 
     FUNC_LEAVE_NOAPI(ret_value)
@@ -4168,8 +4873,8 @@ done:
 *-------------------------------------------------------------------------
 */
 static herr_t
-H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size,
-    const void *buf/*out*/)
+H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr,
+    size_t size, const void *buf/*out*/)
 {
     H5PB_t *pb_ptr;                    /* Page buffer for this file */
     H5PB_entry_t *entry_ptr;           /* Pointer to page buffer entry */
@@ -4419,6 +5124,8 @@ H5PB__write_raw(H5F_shared_t *shared, H5FD_mem_t type, haddr_t addr, size_t size
         }
     }
 
+    H5PB__UPDATE_STATS_FOR_ACCESS(pb_ptr, type, size);
+
done:
 
     FUNC_LEAVE_NOAPI(ret_value)
 
diff --git a/src/H5PBpkg.h b/src/H5PBpkg.h
index fb9f29f..14804ac 100644
--- a/src/H5PBpkg.h
+++ b/src/H5PBpkg.h
@@ -812,6 +812,20 @@ if ( ( (entry_ptr) == NULL ) || \
     ((pb_ptr)->loads[i])++;                                  \
 } /* H5PB__UPDATE_STATS_FOR_LOAD */
 
+#define H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr)            \
+{                                                            \
+    HDassert(pb_ptr);                                        \
+    HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC);         \
+    ((pb_ptr)->md_read_splits)++;                            \
+} /* H5PB__UPDATE_STATS_FOR_READ_SPLIT */
+
+#define H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr)           \
+{                                                            \
+    HDassert(pb_ptr);                                        \
+    HDassert((pb_ptr)->magic == H5PB__H5PB_T_MAGIC);         \
+    ((pb_ptr)->md_write_splits)++;                           \
+} /* H5PB__UPDATE_STATS_FOR_WRITE_SPLIT */
+
 #else /* H5PB__COLLECT_PAGE_BUFFER_STATS */
 
 #define H5PB__UPDATE_PB_HIT_RATE_STATS(pb_ptr, hit, is_metadata, is_mpmde)
@@ -834,6 +848,8 @@ if ( ( (entry_ptr) == NULL ) || \
 #define H5PB__UPDATE_STATS_FOR_CLEAR(pb_ptr, entry_ptr)
 #define H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr)
 #define H5PB__UPDATE_STATS_FOR_LOAD(pb_ptr, entry_ptr)
+#define H5PB__UPDATE_STATS_FOR_READ_SPLIT(pb_ptr)
+#define H5PB__UPDATE_STATS_FOR_WRITE_SPLIT(pb_ptr)
 
 #endif /* H5PB__COLLECT_PAGE_BUFFER_STATS */
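The two new split counters follow the existing H5PB stats-macro pattern: with H5PB__COLLECT_PAGE_BUFFER_STATS defined, each macro sanity-checks its target and bumps a counter; otherwise it expands to nothing, so call sites can sit in hot paths at no cost.  A toy, self-contained restatement of the pattern follows; the MY_* names are invented, and the do/while(0) wrapper is a small deviation from the bare-brace style of the H5PB originals, used here so the macro behaves as a single statement.

#include <assert.h>
#include <stdint.h>

#define MY_COLLECT_STATS 1

typedef struct { int64_t md_write_splits; } my_stats_t;

#if MY_COLLECT_STATS
#define MY_UPDATE_STATS_FOR_WRITE_SPLIT(s) \
do {                                       \
    assert(s);                             \
    ((s)->md_write_splits)++;              \
} while (0)
#else
#define MY_UPDATE_STATS_FOR_WRITE_SPLIT(s) /* expands to nothing */
#endif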
 
diff --git a/src/H5PBprivate.h b/src/H5PBprivate.h
index 983d183..97de7ae 100644
--- a/src/H5PBprivate.h
+++ b/src/H5PBprivate.h
@@ -249,6 +249,9 @@ typedef struct H5PB_entry_t H5PB_entry_t;
 *
 * FIELDS SUPPORTING VFD SWMR:
 *
+* If the file is opened in VFD SWMR mode (i.e. vfd_swmr == TRUE), all
+* raw data I/O must be passed through to the HDF5 file.
+*
 * If the file is opened as a VFD SWMR writer (i.e. vfd_swmr_writer == TRUE),
 * the page buffer must retain the data necessary to update the metadata
 * file at the end of each tick, and also delay writes as necessary so as
@@ -285,8 +288,12 @@ typedef struct H5PB_entry_t H5PB_entry_t;
 * The remainder of this section contains discussions of the fields and
 * data structures used to support the above operations.
 *
+* vfd_swmr:    Boolean flag that is set to TRUE iff the file is opened
+*              in VFD SWMR mode -- either reader or writer.  This field
+*              is used to exclude raw data from the page buffer.
+*
 * vfd_swmr_writer: Boolean flag that is set to TRUE iff the file is
-*              the file is opened in VFD SWMR mode.  The remaining
+*              opened in VFD SWMR writer mode.  The remaining
 *              VFD SWMR fields are defined iff vfd_swmr_writer is TRUE.
 *
 * mpmde_count: int64_t containing the number of multi-page metadata
@@ -528,6 +535,16 @@ typedef struct H5PB_entry_t H5PB_entry_t;
 * total_dwl_ins_depth: int64_t containing the total insertion depth
 *              required to maintain the ordering invariant on the
 *              delayed write list.
+*
+* md_read_splits: int64_t containing the number of metadata reads that
+*              are split into two or three sub-reads to manage the
+*              case in which a group of metadata cache clients
+*              sub-allocate entries from a single file space allocation.
+*
+* md_write_splits: int64_t containing the number of metadata writes that
+*              are split into two or three sub-writes to manage the
+*              case in which a group of metadata cache clients
+*              sub-allocate entries from a single file space allocation.
 *
 ******************************************************************************/
@@ -578,6 +595,7 @@ typedef struct H5PB_t {
 
     /* Fields for VFD SWMR operations: */
 
+    hbool_t vfd_swmr;
     hbool_t vfd_swmr_writer;
     int64_t mpmde_count;
     uint64_t cur_tick;
@@ -646,6 +664,8 @@ typedef struct H5PB_t {
     int64_t max_dwl_len;
     int64_t max_dwl_size;
     int64_t total_dwl_ins_depth;
+    int64_t md_read_splits;
+    int64_t md_write_splits;
 
 } H5PB_t;
 
@@ -671,6 +691,7 @@ H5_DLL herr_t H5PB_add_new_page(H5F_shared_t *, H5FD_mem_t, haddr_t);
 H5_DLL herr_t H5PB_update_entry(H5PB_t *, haddr_t, size_t, const void *);
 H5_DLL herr_t H5PB_remove_entry(H5F_shared_t *, haddr_t);
+
 H5_DLL herr_t H5PB_remove_entries(H5F_shared_t *, haddr_t, hsize_t);
 
 H5_DLL herr_t H5PB_read(H5F_shared_t *, H5FD_mem_t, haddr_t,