summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Young <dyoung@hdfgroup.org>2020-08-31 23:56:25 (GMT)
committerDavid Young <dyoung@hdfgroup.org>2020-08-31 23:56:25 (GMT)
commit79560265af7dd3ae34f12b3eb4045ca2f1cb7520 (patch)
tree27033dffdd2a934975b897c039d4a82f7b7560f0
parentb0e99604609fa054b914ff0638f3d6345b8e7774 (diff)
parentf45bb2197958238c109c953315202d67af5238ef (diff)
downloadhdf5-79560265af7dd3ae34f12b3eb4045ca2f1cb7520.zip
hdf5-79560265af7dd3ae34f12b3eb4045ca2f1cb7520.tar.gz
hdf5-79560265af7dd3ae34f12b3eb4045ca2f1cb7520.tar.bz2
Merge branch 'feature/vfd_swmr' into multi
-rw-r--r--doc/vfd-swmr-user-guide.md4
-rw-r--r--src/H5FDprivate.h31
-rw-r--r--src/H5FDvfd_swmr.c45
-rw-r--r--src/H5FDvfd_swmr_instr.c31
-rw-r--r--src/H5FDvfd_swmr_private.h2
-rw-r--r--src/H5Fpkg.h52
-rw-r--r--src/H5Fpublic.h20
-rw-r--r--src/H5Fvfd_swmr.c1
-rw-r--r--src/H5HGtrap.c18
-rw-r--r--src/H5Pfapl.c8
10 files changed, 152 insertions, 60 deletions
diff --git a/doc/vfd-swmr-user-guide.md b/doc/vfd-swmr-user-guide.md
index 69b7f96..dadd783 100644
--- a/doc/vfd-swmr-user-guide.md
+++ b/doc/vfd-swmr-user-guide.md
@@ -446,8 +446,8 @@ Improvements to VFD SWMR may also alleviate the problem.
## Microsoft Windows
-VFD SWMR does not support Microsoft Windows at this time. We do plan to
-add support this year.
+VFD SWMR does not support Microsoft Windows at this time. We are
+investigating to see when we can add Windows support.
## Supported filesystems
diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h
index 0d05b15..b1d8708 100644
--- a/src/H5FDprivate.h
+++ b/src/H5FDprivate.h
@@ -157,6 +157,15 @@
* lower file and is therefore about to be removed from the
* metadata file
*
+ * garbage: `true` if the entry is marked for garbage collection and is
+ * thus invalid.
+ *
+ * For n the number of entries, deleting an entry is O(n).
+ * H5PB_dest() deletes all entries. Instead of deleting
+ * entries one-by-one at O(n^2) cost, H5PB_dest() marks
+ * each disused entry for garbage collection and sweeps all
+ * entries up before it is done.
+ *
*----------------------------------------------------------------------------
*/
typedef struct H5FD_vfd_swmr_idx_entry_t {
@@ -175,8 +184,8 @@ typedef struct H5FD_vfd_swmr_idx_entry_t {
/*
* tick_num: Sequence number of the current tick.
- * Initialized to zero on file creation/open, and incremented by the
- * VFD SWMR writer at the end of each tick.
+ * Initialized to zero on file creation/open, and incremented
+ * by the VFD SWMR writer at the end of each tick.
* num_entries: The number of entries in the index.
* entries: The array of index entries
*/
@@ -203,19 +212,31 @@ typedef struct H5FD_vfd_swmr_md_header {
size_t index_length;
} H5FD_vfd_swmr_md_header;
+/* Lookup the shadow-index entry corresponding to page number `target_page`
+ * in the HDF5 file and return it. If there is no match, return NULL.
+ *
+ * The lookup is performed by binary search on the `nentries` shadow index
+ * entries at `idx`. The entries must be sorted by their offset in the
+ * HDF5 file. Each entry must have a unique HDF5 file offset.
+ *
+ * If `reuse_garbage` is true, then entries marked for garbage collection
+ * are eligible search results. Return NULL if a matching entry is
+ * found, but the entry is marked for garbage collection and `reuse_garbage`
+ * is false.
+ */
static inline H5FD_vfd_swmr_idx_entry_t *
vfd_swmr_pageno_to_mdf_idx_entry(H5FD_vfd_swmr_idx_entry_t *idx,
- uint32_t nindices, uint64_t target_page, bool reuse_garbage)
+ uint32_t nentries, uint64_t target_page, bool reuse_garbage)
{
uint32_t top;
uint32_t bottom;
uint32_t probe;
- if (nindices < 1)
+ if (nentries < 1)
return NULL;
bottom = 0;
- top = nindices;
+ top = nentries;
do {
probe = (top + bottom) / 2;
diff --git a/src/H5FDvfd_swmr.c b/src/H5FDvfd_swmr.c
index e2f8513..898adaf 100644
--- a/src/H5FDvfd_swmr.c
+++ b/src/H5FDvfd_swmr.c
@@ -50,16 +50,25 @@ typedef struct H5FD_vfd_swmr_t {
H5FD_vfd_swmr_md_header md_header; /* Metadata file header */
H5FD_vfd_swmr_md_index md_index; /* Metadata file index */
- uint32_t api_elapsed_nslots;
uint64_t *api_elapsed_ticks; /* Histogram of ticks elapsed
* inside the API (reader only).
+ * api_elapsed_ticks[elapsed] is
+ * the number of times `elapsed`
+ * ticks passed in an API call
+ * during the program lifetime.
*/
+ uint32_t api_elapsed_nbuckets; /* Number of histogram buckets. */
+
hbool_t pb_configured; /* boolean flag set to TRUE */
/* when the page buffer is configured */
/* and to FALSE otherwise. */
/* Used for sanity checking. */
H5F_vfd_swmr_config_t config;
- bool writer; /* True iff configured to write. */
+ bool writer; /* True iff configured to write.
+ * All methods on a write-mode
+ * SWMR VFD instance are passed
+ * to the lower VFD instance.
+ */
} H5FD_vfd_swmr_t;
#define MAXADDR (((haddr_t)1<<(8*sizeof(HDoff_t)-1))-1)
@@ -252,18 +261,22 @@ done:
FUNC_LEAVE_API(ret_value)
} /* end H5Pset_fapl_vfd_swmr() */
+/* Perform the reader-only aspects of opening in VFD SWMR mode:
+ * initialize histogram of ticks spent in API calls, wait for the
+ * shadow file to appear, load the header and index.
+ */
static herr_t
H5FD__swmr_reader_open(H5FD_vfd_swmr_t *file)
{
- h5_retry_t retry; /* retry state */
+ h5_retry_t retry;
bool do_try; /* more tries remain */
herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
- file->api_elapsed_nslots = file->config.max_lag + 1;
+ file->api_elapsed_nbuckets = file->config.max_lag + 1;
file->api_elapsed_ticks =
- calloc(file->api_elapsed_nslots, sizeof(*file->api_elapsed_ticks));
+ calloc(file->api_elapsed_nbuckets, sizeof(*file->api_elapsed_ticks));
if (file->api_elapsed_ticks == NULL) {
HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL,
@@ -408,6 +421,10 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_vfd_swmr_open() */
+/* Perform the reader-only aspects of closing in VFD SWMR mode: optionally
+ * log and always release the histogram of ticks spent in API calls,
+ * close the shadow file, release the shadow index.
+ */
static void
swmr_reader_close(H5FD_vfd_swmr_t *file)
{
@@ -415,7 +432,7 @@ swmr_reader_close(H5FD_vfd_swmr_t *file)
if (file->api_elapsed_ticks != NULL) {
uint32_t i;
- for (i = 0; i < file->api_elapsed_nslots; i++) {
+ for (i = 0; i < file->api_elapsed_nbuckets; i++) {
hlog_fast(swmr_stats,
"%s: %" PRIu32 " ticks elapsed in API %" PRIu64 " times",
__func__, i, file->api_elapsed_ticks[i]);
@@ -958,6 +975,9 @@ H5FD_vfd_swmr_write(H5FD_t *_file, H5FD_mem_t type,
{
H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file;
+ /* This routine should only be called if the VFD instance is opened
+ * for writing.
+ */
HDassert(file->writer);
return H5FD_write(file->hdf5_file_lf, type, addr, size, buf);
@@ -980,12 +1000,8 @@ H5FD_vfd_swmr_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id,
{
H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; /* VFD SWMR file struct */
- /* The VFD SWMR vfd should only be used by the VFD SWMR reader,
- * and thus this file should only be opened R/O.
- *
- * Thus this function should never be called and should return error
- *
- * For now, just assert FALSE.
+ /* This routine should only be called if the VFD instance is opened
+ * for writing.
*/
HDassert(file->writer);
@@ -1628,12 +1644,15 @@ H5FD_vfd_swmr_set_pb_configured(H5FD_t *_file)
} /* H5FD_vfd_swmr_set_pb_configured() */
+/* In the histogram of ticks spent in API calls, increase the bucket
+ * for `elapsed` ticks by one, clamping to the last bucket on overflow.
+ */
void
H5FD_vfd_swmr_record_elapsed_ticks(H5FD_t *_file, uint64_t elapsed)
{
H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file;
- uint32_t elapsed_idx = MIN(elapsed, file->api_elapsed_nslots);
+ uint32_t elapsed_idx = MIN(elapsed, file->api_elapsed_nbuckets - 1);
file->api_elapsed_ticks[elapsed_idx]++;
}
diff --git a/src/H5FDvfd_swmr_instr.c b/src/H5FDvfd_swmr_instr.c
index 865483c..da62351 100644
--- a/src/H5FDvfd_swmr_instr.c
+++ b/src/H5FDvfd_swmr_instr.c
@@ -14,6 +14,37 @@
#include "H5Fpublic.h"
#include "H5FDvfd_swmr.h"
+/* vfd_swmr_writer_may_increase_tick_to() and
+ * vfd_swmr_reader_did_increase_tick_to() are instrumentation points for
+ * VFD SWMR tests to use to coordinate the tick-number increases
+ * on a single writer with the progress of a single reader.
+ *
+ * This file provides the default, do-nothing implementations for both
+ * instrumentation routines.
+ *
+ * A VFD SWMR writer calls vfd_swmr_writer_may_increase_tick_to() with the
+ * increased tick number that it proposes, `tick_num`. The argument
+ * `wait_for_reader` tells whether or not the writer can wait for the reader
+ * before increasing its tick number. If `true`, then
+ * vfd_swmr_writer_may_increase_tick_to() should
+ * block until the reader is finished using the shadow-file content
+ * from ticks `tick_num - max_lag` and before, returning `true`.
+ * If `false`, then
+ * vfd_swmr_writer_may_increase_tick_to() should immediately return `true` if
+ * the new tick number is permissible, otherwise `false`.
+ *
+ * After a VFD SWMR reader increases its tick number, it calls
+ * vfd_swmr_reader_did_increase_tick_to() with the new tick number.
+ *
+ * The test programs test/vfd_swmr_zoo_{reader,writer} provide
+ * their own vfd_swmr_writer_may_increase_tick_to() and
+ * vfd_swmr_reader_did_increase_tick_to() implementations that override the
+ * ones in the library. In the "zoo"
+ * test (test/vfd_swmr_zoo_{reader,writer}), the reader and the writer
+ * use a shared file to coordinate tick-number increases so that the writer
+ * can call H5Fvfd_swmr_end_tick() to increase its tick number at an arbitrary
+ * rate without outrunning the reader.
+ */
bool
vfd_swmr_writer_may_increase_tick_to(uint64_t H5_ATTR_UNUSED tick_num,
bool H5_ATTR_UNUSED wait_for_reader)
diff --git a/src/H5FDvfd_swmr_private.h b/src/H5FDvfd_swmr_private.h
index 67a72ec..74a937f 100644
--- a/src/H5FDvfd_swmr_private.h
+++ b/src/H5FDvfd_swmr_private.h
@@ -53,7 +53,7 @@ typedef struct eot_queue_entry {
hbool_t vfd_swmr_writer;
uint64_t tick_num;
struct timespec end_of_tick;
- struct H5F_t *vfd_swmr_file; /* NOTE: for the time being use H5F_t instead H5F_file_t */
+ struct H5F_t *vfd_swmr_file; /* NOTE: for the time being use H5F_t instead of H5F_shared_t */
TAILQ_ENTRY(eot_queue_entry) link;
} eot_queue_entry_t;
diff --git a/src/H5Fpkg.h b/src/H5Fpkg.h
index 1136cb4..71d0ce4 100644
--- a/src/H5Fpkg.h
+++ b/src/H5Fpkg.h
@@ -217,24 +217,14 @@ typedef struct H5F_mtab_t {
H5F_mount_t *child; /* An array of mount records */
} H5F_mtab_t;
-/*
- * VFD SWMR: Entry for the delayed free space release doubly linked list
- *
- * md_file_page_offset: Unsigned 64-bit value containing the base address
- * of the metadata page, or multi page metadata entry
- * in the metadata file IN PAGES.
- * To obtain byte offset, multiply this value by the
- * page size.
- * length: The length of the metadata page or multi page
- * metadata entry in BYTES.
- * tick_num: Sequence # of the current tick
- * link: tailqueue linkage
+/* Deferred-free record for the shadow file: records a region of bytes in
+ * the shadow file to release after max_lag ticks.
*/
typedef struct shadow_defree {
- uint64_t offset;
- uint32_t length;
- uint64_t tick_num;
- TAILQ_ENTRY(shadow_defree) link;
+ uint64_t offset; // offset of the region in *bytes*
+ uint32_t length; // length of the region in *bytes*
+ uint64_t tick_num; // tick number when the free was deferred
+ TAILQ_ENTRY(shadow_defree) link; // deferred-free queue linkage
} shadow_defree_t;
/* Structure specifically to store superblock. This was originally
@@ -256,17 +246,27 @@ typedef struct H5F_super_t {
H5G_entry_t *root_ent; /* Root group symbol table entry */
} H5F_super_t;
-/* VFD SWMR: deferred free on the lower VFD. */
+/* Deferred-free record for the lower file: records a region of bytes in
+ * the file below the SWMR VFD to release after a delay.
+ */
typedef struct lower_defree {
- SIMPLEQ_ENTRY(lower_defree) link;
- H5FD_mem_t alloc_type;
- haddr_t addr;
- hsize_t size;
- uint64_t free_after_tick;
+ SIMPLEQ_ENTRY(lower_defree) link; // deferred-free queue linkage
+ H5FD_mem_t alloc_type; // type with which the region was allocated
+ haddr_t addr; // start of the region *in bytes*
+ hsize_t size; // length of the region *in bytes*
+ uint64_t free_after_tick; /* the region may be reused on tick
+ * free_after_tick + 1 at the earliest
+ */
} lower_defree_t;
+/* Queue of deferred-free records (lower_defree_t) for the lower file, sorted
+ * head-to-tail in increasing `free_after_tick` order.
+ */
typedef SIMPLEQ_HEAD(lower_defree_queue, lower_defree) lower_defree_queue_t;
+/* Queue of deferred-free records (shadow_defree_t) for the shadow file, sorted
+ * head-to-tail in increasing `tick_num` order.
+ */
typedef TAILQ_HEAD(shadow_defree_queue, shadow_defree) shadow_defree_queue_t;
/*
@@ -403,7 +403,9 @@ struct H5F_shared_t {
* configuration from the
* FAPL used to open the file
*/
- haddr_t writer_index_offset;
+ haddr_t writer_index_offset; /* Current byte offset of the
+ * shadow index in the shadow file.
+ */
hbool_t vfd_swmr; /* The file is opened with VFD
* SWMR configured or not
*/
@@ -413,7 +415,9 @@ struct H5F_shared_t {
uint64_t tick_num; /* Number of the current tick */
struct timespec end_of_tick; /* End time of the current tick */
- lower_defree_queue_t lower_defrees; /* For use by VFD SWMR writers. */
+ lower_defree_queue_t lower_defrees; /* Records of lower-file space
+ * awaiting reclamation.
+ */
/* VFD SWMR metadata file index */
H5FD_vfd_swmr_idx_entry_t * mdf_idx; /* pointer to an array of instance
* of H5FD_vfd_swmr_idx_entry_t of
diff --git a/src/H5Fpublic.h b/src/H5Fpublic.h
index d4f6341..17d8ce1 100644
--- a/src/H5Fpublic.h
+++ b/src/H5Fpublic.h
@@ -260,17 +260,15 @@ typedef herr_t (*H5F_flush_cb_t)(hid_t object_id, void *udata);
* is selected.
*
* md_pages_reserved:
- * An integer field indicating the number of pages reserved
- * at the head of the metadata file. This value must be greater than
- * or equal to 1.
- * When the metadata file is created, the specified number of pages is
- * reserved at the head of the metadata file. In the current
- * implementation, the size of the metadata file header plus the
- * index is limited to this size.
- * Further, in the POSIX case, when readers check for an updated index,
- * this check will start with a read of md_pages_reserved pages from
- * the head of the metadata file.
- *
+ * The `md_pages_reserved` parameter tells how many pages to reserve
+ * at the beginning of the shadow file for the shadow-file header
+ * and the shadow index. The header has an entire page to itself.
+ * The remaining `md_pages_reserved - 1` pages are reserved for the
+ * shadow index. If the index grows larger than its initial
+ * allocation, then it will move to a new location in the shadow file,
+ * and the initial allocation will be reclaimed. `md_pages_reserved`
+ * must be at least 2.
+ *
* pb_expansion_threshold:
* An integer field indicating the threshold for the page buffer size.
* During a tick, the page buffer must expand as necessary to retain copies
diff --git a/src/H5Fvfd_swmr.c b/src/H5Fvfd_swmr.c
index 376fa38..dbe04ec 100644
--- a/src/H5Fvfd_swmr.c
+++ b/src/H5Fvfd_swmr.c
@@ -1956,6 +1956,7 @@ vfd_swmr_enlarge_shadow_index(H5F_t *f)
old_mdf_idx = shared->mdf_idx;
old_mdf_idx_len = shared->mdf_idx_len;
+ /* New length is double previous or UINT32_MAX, whichever is smaller. */
if (UINT32_MAX - old_mdf_idx_len >= old_mdf_idx_len)
new_mdf_idx_len = old_mdf_idx_len * 2;
else
diff --git a/src/H5HGtrap.c b/src/H5HGtrap.c
index 2f09d48..6b52007 100644
--- a/src/H5HGtrap.c
+++ b/src/H5HGtrap.c
@@ -23,6 +23,24 @@
#include "H5Eprivate.h" /* Error handling */
#include "H5HGpkg.h" /* Global heaps */
+/* H5HG_trap() is an instrumentation point for the global heap.
+ * The H5HG_trap() result modifies the global heap's treatment of
+ * an unexpected condition that ordinarily would cause an
+ * HDassert() statement to abort the program.
+ *
+ * Currently, just one function, H5HG_read(), calls H5HG_trap(), using
+ * the `reason` string "out of bounds".
+ *
+ * Test programs such as test/vfd_swmr_vlstr_{reader,writer}.c provide
+ * their own H5HG_trap() implementation that overrides the one in the library.
+ *
+ * H5HG_trap() returns `true` if the caller should generate an error-stack
+ * entry and return an error code to the caller's caller.
+ *
+ * H5HG_trap() returns `false` if the caller should blithely carry on;
+ * if NDEBUG is not #defined, then the caller will ordinarily abort the
+ * program in a subsequent HDassert() statement.
+ */
bool
H5HG_trap(const char *reason)
{
diff --git a/src/H5Pfapl.c b/src/H5Pfapl.c
index ed6f06a..878ab44 100644
--- a/src/H5Pfapl.c
+++ b/src/H5Pfapl.c
@@ -4083,15 +4083,15 @@ H5P__facc_vfd_swmr_config_dec(const void **_pp, void *_value)
/* int */
INT32DECODE(*pp, config->version);
- INT32DECODE(*pp, config->tick_len);
- INT32DECODE(*pp, config->max_lag);
+ UINT32DECODE(*pp, config->tick_len);
+ UINT32DECODE(*pp, config->max_lag);
H5_DECODE_UNSIGNED(*pp, config->writer);
H5_DECODE_UNSIGNED(*pp, config->flush_raw_data);
/* int */
- INT32DECODE(*pp, config->md_pages_reserved);
- INT32DECODE(*pp, config->pb_expansion_threshold);
+ UINT32DECODE(*pp, config->md_pages_reserved);
+ UINT32DECODE(*pp, config->pb_expansion_threshold);
HDstrcpy(config->md_file_path, (const char *)(*pp));
*pp += H5F__MAX_VFD_SWMR_FILE_NAME_LEN + 1;