diff options
author | David Young <dyoung@hdfgroup.org> | 2020-08-31 23:56:25 (GMT) |
---|---|---|
committer | David Young <dyoung@hdfgroup.org> | 2020-08-31 23:56:25 (GMT) |
commit | 79560265af7dd3ae34f12b3eb4045ca2f1cb7520 (patch) | |
tree | 27033dffdd2a934975b897c039d4a82f7b7560f0 | |
parent | b0e99604609fa054b914ff0638f3d6345b8e7774 (diff) | |
parent | f45bb2197958238c109c953315202d67af5238ef (diff) | |
download | hdf5-79560265af7dd3ae34f12b3eb4045ca2f1cb7520.zip hdf5-79560265af7dd3ae34f12b3eb4045ca2f1cb7520.tar.gz hdf5-79560265af7dd3ae34f12b3eb4045ca2f1cb7520.tar.bz2 |
Merge branch 'feature/vfd_swmr' into multi
-rw-r--r-- | doc/vfd-swmr-user-guide.md | 4 | ||||
-rw-r--r-- | src/H5FDprivate.h | 31 | ||||
-rw-r--r-- | src/H5FDvfd_swmr.c | 45 | ||||
-rw-r--r-- | src/H5FDvfd_swmr_instr.c | 31 | ||||
-rw-r--r-- | src/H5FDvfd_swmr_private.h | 2 | ||||
-rw-r--r-- | src/H5Fpkg.h | 52 | ||||
-rw-r--r-- | src/H5Fpublic.h | 20 | ||||
-rw-r--r-- | src/H5Fvfd_swmr.c | 1 | ||||
-rw-r--r-- | src/H5HGtrap.c | 18 | ||||
-rw-r--r-- | src/H5Pfapl.c | 8 |
10 files changed, 152 insertions, 60 deletions
diff --git a/doc/vfd-swmr-user-guide.md b/doc/vfd-swmr-user-guide.md index 69b7f96..dadd783 100644 --- a/doc/vfd-swmr-user-guide.md +++ b/doc/vfd-swmr-user-guide.md @@ -446,8 +446,8 @@ Improvements to VFD SWMR may also alleviate the problem. ## Microsoft Windows -VFD SWMR does not support Microsoft Windows at this time. We do plan to -add support this year. +VFD SWMR does not support Microsoft Windows at this time. We are +investigating to see when we can add Windows support. ## Supported filesystems diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h index 0d05b15..b1d8708 100644 --- a/src/H5FDprivate.h +++ b/src/H5FDprivate.h @@ -157,6 +157,15 @@ * lower file and is therefore about to be removed from the * metadata file * + * garbage: `true` if the entry is marked for garbage collection and is + * thus invalid. + * + * For n the number of entries, deleting an entry is O(n). + * H5PB_dest() deletes all entries. Instead of deleting + * entries one-by-one at O(n^2) cost, H5PB_dest() marks + * each disused entry for garbage collection and sweeps all + * entries up before it is done. + * *---------------------------------------------------------------------------- */ typedef struct H5FD_vfd_swmr_idx_entry_t { @@ -175,8 +184,8 @@ typedef struct H5FD_vfd_swmr_idx_entry_t { /* * tick_num: Sequence number of the current tick. - * Initialized to zero on file creation/open, and incremented by the - * VFD SWMR writer at the end of each tick. + * Initialized to zero on file creation/open, and incremented + * by the VFD SWMR writer at the end of each tick. * num_entries: The number of entires in the index. * entries: The array of index entries */ @@ -203,19 +212,31 @@ typedef struct H5FD_vfd_swmr_md_header { size_t index_length; } H5FD_vfd_swmr_md_header; +/* Lookup the shadow-index entry corresponding to page number `target_page` + * in the HDF5 file and return it. If there is no match, return NULL. 
+ * + * The lookup is performed by binary search on the `nentries` shadow index + * entries at `idx`. The entries must be sorted by their offset in the + * HDF5 file. Each entry must have a unique HDF5 file offset. + * + * If `reuse_garbage` is true, then entries marked for garbage collection + * are eligible search results. Return NULL if a matching entry is + * found, but the entry is marked for garbage collection and `reuse_garbage` + * is false. + */ static inline H5FD_vfd_swmr_idx_entry_t * vfd_swmr_pageno_to_mdf_idx_entry(H5FD_vfd_swmr_idx_entry_t *idx, - uint32_t nindices, uint64_t target_page, bool reuse_garbage) + uint32_t nentries, uint64_t target_page, bool reuse_garbage) { uint32_t top; uint32_t bottom; uint32_t probe; - if (nindices < 1) + if (nentries < 1) return NULL; bottom = 0; - top = nindices; + top = nentries; do { probe = (top + bottom) / 2; diff --git a/src/H5FDvfd_swmr.c b/src/H5FDvfd_swmr.c index e2f8513..898adaf 100644 --- a/src/H5FDvfd_swmr.c +++ b/src/H5FDvfd_swmr.c @@ -50,16 +50,25 @@ typedef struct H5FD_vfd_swmr_t { H5FD_vfd_swmr_md_header md_header; /* Metadata file header */ H5FD_vfd_swmr_md_index md_index; /* Metadata file index */ - uint32_t api_elapsed_nslots; uint64_t *api_elapsed_ticks; /* Histogram of ticks elapsed * inside the API (reader only). + * api_elapsed_ticks[elapsed] is + * the number of times `elapsed` + * ticks passed in an API call + * during the program lifetime. */ + uint32_t api_elapsed_nbuckets; /* Number of histogram buckets. */ + hbool_t pb_configured; /* boolean flag set to TRUE */ /* when the page buffer is */ /* and to FALSE otherwise. */ /* Used for sanity checking. */ H5F_vfd_swmr_config_t config; - bool writer; /* True iff configured to write. */ + bool writer; /* True iff configured to write. + * All methods on a write-mode + * SWMR VFD instance are passed + * to the lower VFD instance. 
+ */ } H5FD_vfd_swmr_t; #define MAXADDR (((haddr_t)1<<(8*sizeof(HDoff_t)-1))-1) @@ -252,18 +261,22 @@ done: FUNC_LEAVE_API(ret_value) } /* end H5Pset_fapl_vfd_swmr() */ +/* Perform the reader-only aspects of opening in VFD SWMR mode: + * initialize histogram of ticks spent in API calls, wait for the + * shadow file to appear, load the header and index. + */ static herr_t H5FD__swmr_reader_open(H5FD_vfd_swmr_t *file) { - h5_retry_t retry; /* retry state */ + h5_retry_t retry; bool do_try; /* more tries remain */ herr_t ret_value = SUCCEED; FUNC_ENTER_STATIC - file->api_elapsed_nslots = file->config.max_lag + 1; + file->api_elapsed_nbuckets = file->config.max_lag + 1; file->api_elapsed_ticks = - calloc(file->api_elapsed_nslots, sizeof(*file->api_elapsed_ticks)); + calloc(file->api_elapsed_nbuckets, sizeof(*file->api_elapsed_ticks)); if (file->api_elapsed_ticks == NULL) { HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, @@ -408,6 +421,10 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5FD_vfd_swmr_open() */ +/* Perform the reader-only aspects of closing in VFD SWMR mode: optionally + * log and always release the histogram of ticks spent in API calls, + * close the shadow file, release the shadow index. + */ static void swmr_reader_close(H5FD_vfd_swmr_t *file) { @@ -415,7 +432,7 @@ swmr_reader_close(H5FD_vfd_swmr_t *file) if (file->api_elapsed_ticks != NULL) { uint32_t i; - for (i = 0; i < file->api_elapsed_nslots; i++) { + for (i = 0; i < file->api_elapsed_nbuckets; i++) { hlog_fast(swmr_stats, "%s: %" PRIu32 " ticks elapsed in API %" PRIu64 " times", __func__, i, file->api_elapsed_ticks[i]); @@ -958,6 +975,9 @@ H5FD_vfd_swmr_write(H5FD_t *_file, H5FD_mem_t type, { H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; + /* This routine should only be called if the VFD instance is opened + * for writing. 
+ */ HDassert(file->writer); return H5FD_write(file->hdf5_file_lf, type, addr, size, buf); @@ -980,12 +1000,8 @@ H5FD_vfd_swmr_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, { H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; /* VFD SWMR file struct */ - /* The VFD SWMR vfd should only be used by the VFD SWMR reader, - * and thus this file should only be opened R/O. - * - * Thus this function should never be called and should return error - * - * For now, just assert FALSE. + /* This routine should only be called if the VFD instance is opened + * for writing. */ HDassert(file->writer); @@ -1628,12 +1644,15 @@ H5FD_vfd_swmr_set_pb_configured(H5FD_t *_file) } /* H5FD_vfd_swmr_set_pb_configured() */ +/* In the histogram of ticks spent in API calls, increase the bucket + * for `elapsed` ticks by one. + */ void H5FD_vfd_swmr_record_elapsed_ticks(H5FD_t *_file, uint64_t elapsed) { H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; - uint32_t elapsed_idx = MIN(elapsed, file->api_elapsed_nslots); + uint32_t elapsed_idx = MIN(elapsed, file->api_elapsed_nbuckets); file->api_elapsed_ticks[elapsed_idx]++; } diff --git a/src/H5FDvfd_swmr_instr.c b/src/H5FDvfd_swmr_instr.c index 865483c..da62351 100644 --- a/src/H5FDvfd_swmr_instr.c +++ b/src/H5FDvfd_swmr_instr.c @@ -14,6 +14,37 @@ #include "H5Fpublic.h" #include "H5FDvfd_swmr.h" +/* vfd_swmr_writer_may_increase_tick_to() and + * vfd_swmr_reader_did_increase_tick_to() are instrumentation points for + * VFD SWMR tests to use to coordinate the tick-number increases + * on a single writer with the progress of a single reader. + * + * This file provides the default, do-nothing implementations for both + * instrumentation routines. + * + * A VFD SWMR writer calls vfd_swmr_writer_may_increase_tick_to() with the + * increased tick number that it proposes, `tick_num`. The argument + * `wait_for_reader` tells whether or not the writer can wait for the reader + * before increasing its tick number. 
If `true`, then + * vfd_swmr_writer_may_increase_tick_to() should + * block until the reader is finished using the shadow-file content + * from ticks `tick_num - max_lag` and before, returning `true`. + * If `false`, then + * vfd_swmr_writer_may_increase_tick_to() immediately returns `true` if + * the new tick number is permissible, otherwise `false`. + * + * After a VFD SWMR reader increases its tick number, it calls + * vfd_swmr_reader_did_increase_tick_to() with the new tick number. + * + * The test programs test/vfd_swmr_zoo_{reader,writer} provide + * their own vfd_swmr_writer_may_increase_tick_to() and + * vfd_swmr_reader_did_increase_tick_to() implementations that override the + * ones in the library. In the "zoo" + * test (test/vfd_swmr_zoo_{reader,writer}), the reader and the writer + * use a shared file to coordinate tick-number increases so that the writer + * can call H5Fvfd_swmr_end_tick() to increase its tick number at an arbitrary + * rate without outrunning the reader. + */ bool vfd_swmr_writer_may_increase_tick_to(uint64_t H5_ATTR_UNUSED tick_num, bool H5_ATTR_UNUSED wait_for_reader) diff --git a/src/H5FDvfd_swmr_private.h b/src/H5FDvfd_swmr_private.h index 67a72ec..74a937f 100644 --- a/src/H5FDvfd_swmr_private.h +++ b/src/H5FDvfd_swmr_private.h @@ -53,7 +53,7 @@ typedef struct eot_queue_entry { hbool_t vfd_swmr_writer; uint64_t tick_num; struct timespec end_of_tick; - struct H5F_t *vfd_swmr_file; /* NOTE: for the time being use H5F_t instead H5F_file_t */ + struct H5F_t *vfd_swmr_file; /* NOTE: for the time being use H5F_t instead of H5F_shared_t */ TAILQ_ENTRY(eot_queue_entry) link; } eot_queue_entry_t; diff --git a/src/H5Fpkg.h b/src/H5Fpkg.h index 1136cb4..71d0ce4 100644 --- a/src/H5Fpkg.h +++ b/src/H5Fpkg.h @@ -217,24 +217,14 @@ typedef struct H5F_mtab_t { H5F_mount_t *child; /* An array of mount records */ } H5F_mtab_t; -/* - * VFD SWMR: Entry for the delayed free space release doubly linked list - * - * md_file_page_offset: Unsigned 64-bit 
value containing the base address - * of the metadata page, or multi page metadata entry - * in the metadata file IN PAGES. - * To obtain byte offset, multiply this value by the - * page size. - * length: The length of the metadata page or multi page - * metadata entry in BYTES. - * tick_num: Sequence # of the current tick - * link: tailqueue linkage +/* Deferred-free record for the shadow file: records a region of bytes in + * the shadow file to release after max_lag ticks. */ typedef struct shadow_defree { - uint64_t offset; - uint32_t length; - uint64_t tick_num; - TAILQ_ENTRY(shadow_defree) link; + uint64_t offset; // offset of the region in *bytes* + uint32_t length; // length of the region in *bytes* + uint64_t tick_num; // tick number when the free was deferred + TAILQ_ENTRY(shadow_defree) link; // deferred-free queue linkage } shadow_defree_t; /* Structure specifically to store superblock. This was originally @@ -256,17 +246,27 @@ typedef struct H5F_super_t { H5G_entry_t *root_ent; /* Root group symbol table entry */ } H5F_super_t; -/* VFD SWMR: deferred free on the lower VFD. */ +/* Deferred-free record for the lower file: records a region of bytes in + * the file below the SWMR VFD to release after a delay. + */ typedef struct lower_defree { - SIMPLEQ_ENTRY(lower_defree) link; - H5FD_mem_t alloc_type; - haddr_t addr; - hsize_t size; - uint64_t free_after_tick; + SIMPLEQ_ENTRY(lower_defree) link; // deferred-free queue linkage + H5FD_mem_t alloc_type; // type with which the region was allocated + haddr_t addr; // start of the region *in bytes* + hsize_t size; // length of the region *in bytes* + uint64_t free_after_tick; /* the region may be reused on tick + * free_after_tick + 1 at the earliest + */ } lower_defree_t; +/* Queue of deferred-free records (lower_defree_t) for the lower file, sorted + * head-to-tail in increasing `free_after_tick` order. 
+ */ typedef SIMPLEQ_HEAD(lower_defree_queue, lower_defree) lower_defree_queue_t; +/* Queue of deferred-free records (shadow_defree_t) for the shadow file, sorted + * head-to-tail in increasing `tick_num` order. + */ typedef TAILQ_HEAD(shadow_defree_queue, shadow_defree) shadow_defree_queue_t; /* @@ -403,7 +403,9 @@ struct H5F_shared_t { * configuration from the * FAPL used to open the file */ - haddr_t writer_index_offset; + haddr_t writer_index_offset; /* Current byte offset of the + * shadow index in the shadow file. + */ hbool_t vfd_swmr; /* The file is opened with VFD * SWMR configured or not */ @@ -413,7 +415,9 @@ struct H5F_shared_t { uint64_t tick_num; /* Number of the current tick */ struct timespec end_of_tick; /* End time of the current tick */ - lower_defree_queue_t lower_defrees; /* For use by VFD SWMR writers. */ + lower_defree_queue_t lower_defrees; /* Records of lower-file space + * awaiting reclamation. + */ /* VFD SWMR metadata file index */ H5FD_vfd_swmr_idx_entry_t * mdf_idx; /* pointer to an array of instance * of H5FD_vfd_swmr_idx_entry_t of diff --git a/src/H5Fpublic.h b/src/H5Fpublic.h index d4f6341..17d8ce1 100644 --- a/src/H5Fpublic.h +++ b/src/H5Fpublic.h @@ -260,17 +260,15 @@ typedef herr_t (*H5F_flush_cb_t)(hid_t object_id, void *udata); * is selected. * * md_pages_reserved: - * An integer field indicating the number of pages reserved - * at the head of the metadata file. This value must be greater than - * or equal to 1. - * When the metadata file is created, the specified number of pages is - * reserved at the head of the metadata file. In the current - * implementation, the size of the metadata file header plus the - * index is limited to this size. - * Further, in the POSIX case, when readers check for an updated index, - * this check will start with a read of md_pages_reserved pages from - * the head of the metadata file. 
- * + * The `md_pages_reserved` parameter tells how many pages to reserve + * at the beginning of the shadow file for the shadow-file header + * and the shadow index. The header has an entire page to itself. + * The remaining `md_pages_reserved - 1` pages are reserved for the + * shadow index. If the index grows larger than its initial + * allocation, then it will move to a new location in the shadow file, + * and the initial allocation will be reclaimed. `md_pages_reserved` + * must be at least 2. + * * pb_expansion_threshold: * An integer field indicating the threshold for the page buffer size. * During a tick, the page buffer must expand as necessary to retain copies diff --git a/src/H5Fvfd_swmr.c b/src/H5Fvfd_swmr.c index 376fa38..dbe04ec 100644 --- a/src/H5Fvfd_swmr.c +++ b/src/H5Fvfd_swmr.c @@ -1956,6 +1956,7 @@ vfd_swmr_enlarge_shadow_index(H5F_t *f) old_mdf_idx = shared->mdf_idx; old_mdf_idx_len = shared->mdf_idx_len; + /* New length is double previous or UINT32_MAX, whichever is smaller. */ if (UINT32_MAX - old_mdf_idx_len >= old_mdf_idx_len) new_mdf_idx_len = old_mdf_idx_len * 2; else diff --git a/src/H5HGtrap.c b/src/H5HGtrap.c index 2f09d48..6b52007 100644 --- a/src/H5HGtrap.c +++ b/src/H5HGtrap.c @@ -23,6 +23,24 @@ #include "H5Eprivate.h" /* Error handling */ #include "H5HGpkg.h" /* Global heaps */ +/* H5HG_trap() is an instrumentation point for the global heap. + * The H5HG_trap() result modifies the global heap's treatment of + * an unexpected condition that ordinarily would cause an + * HDassert() statement to abort the program. + * + * Currently, just one function, H5HG_read(), calls H5HG_trap(), using + * the `reason` string "out of bounds". + * + * Test programs such as test/vfd_swmr_vlstr_{reader,writer}.c provide + * their own H5HG_trap() implementation that overrides the one in the library. + * + * H5HG_trap() returns `true` if the caller should generate an error-stack + * entry and return an error code to the caller's caller. 
+ * + * H5HG_trap() returns `false` if the caller should blithely carry on; + * if NDEBUG is not #defined, then the caller will ordinarily abort the + * program in a subsequent HDassert() statement. + */ bool H5HG_trap(const char *reason) { diff --git a/src/H5Pfapl.c b/src/H5Pfapl.c index ed6f06a..878ab44 100644 --- a/src/H5Pfapl.c +++ b/src/H5Pfapl.c @@ -4083,15 +4083,15 @@ H5P__facc_vfd_swmr_config_dec(const void **_pp, void *_value) /* int */ INT32DECODE(*pp, config->version); - INT32DECODE(*pp, config->tick_len); - INT32DECODE(*pp, config->max_lag); + UINT32DECODE(*pp, config->tick_len); + UINT32DECODE(*pp, config->max_lag); H5_DECODE_UNSIGNED(*pp, config->writer); H5_DECODE_UNSIGNED(*pp, config->flush_raw_data); /* int */ - INT32DECODE(*pp, config->md_pages_reserved); - INT32DECODE(*pp, config->pb_expansion_threshold); + UINT32DECODE(*pp, config->md_pages_reserved); + UINT32DECODE(*pp, config->pb_expansion_threshold); HDstrcpy(config->md_file_path, (const char *)(*pp)); *pp += H5F__MAX_VFD_SWMR_FILE_NAME_LEN + 1; |