From 29ee63787b8b75bdbf16468c9a7d9955e55eec78 Mon Sep 17 00:00:00 2001 From: David Young Date: Mon, 3 Feb 2020 15:04:21 -0600 Subject: Numerous changes supporting a floating shadow index: Add to the H5F_shared_t (!) a new member that tells the offset in the shadow file where the index should be written. Allocate shadow filespace for the header and the index separately so that the index can float. Update tests to match the expected original location of the index. Introduce vfd_swmr_enlarge_shadow_index(), a routine that allocates space in the shadow file for a new index that has (up to) twice as many entries as the old index, allocates a new in-core index of the same size, and copies the old in-core index to the new. Call vfd_swmr_enlarge_shadow_index() in H5PB_vfd_swmr__update_index() when the in-core index has too few slots. In the comment at the top of H5FD__vfd_swmr_load_hdr_and_idx(), describe the protocol that it follows, now, when it reads the shadow header and index. Delete some dead code in the function and add a bit of diagnostic code. TBD quiet the diagnostic code. In H5F_vfd_swmr_init(), follow the protocol: write the index, first, then the header. Modify property-list checks and tests to reserve no fewer than two pages at the front of the shadow file for the header and index. 
--- src/H5FDprivate.h | 1 + src/H5FDvfd_swmr.c | 89 ++++++++++++++++++++---------------------- src/H5Fpkg.h | 1 + src/H5Ftest.c | 2 +- src/H5Fvfd_swmr.c | 111 ++++++++++++++++++++++++++++++++++++++++++++--------- src/H5PB.c | 4 +- src/H5Pfapl.c | 6 +-- test/vfd_swmr.c | 4 +- 8 files changed, 143 insertions(+), 75 deletions(-) diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h index b201809..e0405d3 100644 --- a/src/H5FDprivate.h +++ b/src/H5FDprivate.h @@ -351,6 +351,7 @@ H5_DLL int shadow_image_defer_free(struct H5F_shared_t *, H5_DLL herr_t H5FD_vfd_swmr_get_tick_and_idx(H5FD_t *_file, hbool_t read_index, uint64_t *tick_ptr, uint32_t *num_entries_ptr, H5FD_vfd_swmr_idx_entry_t index[]); +H5_DLL H5FD_vfd_swmr_idx_entry_t *vfd_swmr_enlarge_shadow_index(struct H5F_t *); H5_DLL hbool_t H5FD_is_vfd_swmr_driver(H5FD_t *_file); H5_DLL H5FD_t *H5FD_vfd_swmr_get_underlying_vfd(H5FD_t *_file); H5_DLL void H5FD_vfd_swmr_dump_status(H5FD_t *_file, int64_t page); diff --git a/src/H5FDvfd_swmr.c b/src/H5FDvfd_swmr.c index 7b2e52b..4ee686d 100644 --- a/src/H5FDvfd_swmr.c +++ b/src/H5FDvfd_swmr.c @@ -1082,52 +1082,50 @@ done: } /* end H5FD_vfd_swmr_unlock() */ -/*------------------------------------------------------------------------- +/* * Function: H5FD__vfd_swmr_load_hdr_and_idx() * * Purpose: Load and decode the header and index in the metadata file * - * Try to load and decode the header: - * - * --If fail, RETRY - * - * --If succeed: - * - * --If the size of header and index does not fit within - * md_pages_reserved, return error - * - * --If NOT an initial open call: - * - * --If tick_num just read is the same as the VFD's - * local copy, just return - * - * --If tick_num just read is less than the VFD's - * local copy, return error - * - * --If tick_num just read is greater than the VFD's - * local copy or an initial open call: - * - * --Try to load and decode the index: - * - * --If fail, RETRY - * - * --If succeed: - * - * --If tick_num in header matches that in - * 
index, replace the VFD's local copy with - * the header and index just read - * - * --If tick_num in header is 1 greater than - * that in index, RETRY - * - * --Otherwise, return error + * In H5FD__vfd_swmr_load_hdr_and_idx(), we follow this protocol for reading + * the shadow file: + * + * 0 If the maximum number of retries has been attempted, then exit + * with an error. + * + * 1 Try to read the shadow file *header*. If successful, continue to 2. + * + * If there is a hard failure, then return an error. If there is a failure + * that may be transient, then sleep and retry at 0. + * + * 2 If the tick number in the header is less than the tick last read by the VFD, + * then return an error. + * + * 3 If the tick number in the header is equal to the last tick read by the + * VFD, then exit without doing anything. + * + * 4 Try to read the shadow file *index*. If successful, continue to 5. + * + * If there is a hard failure, then return an error. If there is a failure + * that may be transient, then sleep and retry at 0. + * + * 5 If a different tick number was read from the index than from the header, + * then continue at 0. + * + * 6 Try to *re-read* the shadow file *header*. If successful, continue to 7. + * + * If there is a hard failure, then return an error. If there is a failure + * that may be transient, then sleep and retry at 0. + * + * 7 Compare the header that was read previously with the new header. If + * the new header is different than the old, then we may not have read + * the index at the right shadow-file offset, or the index may have been + * read in an inconsistent state, so sleep and retry at 0. Otherwise, + * return success. 
* * Return: Success: SUCCEED * Failure: FAIL * - * Programmer: Vailin Choi - * - *------------------------------------------------------------------------- */ static herr_t H5FD__vfd_swmr_load_hdr_and_idx(H5FD_t *_file, hbool_t open) @@ -1141,6 +1139,7 @@ H5FD__vfd_swmr_load_hdr_and_idx(H5FD_t *_file, hbool_t open) H5FD_vfd_swmr_md_index md_index; /* Metadata file index */ herr_t ret_value = SUCCEED; /* Return value */ htri_t rc; + static uint64_t last_index_offset = 0; FUNC_ENTER_STATIC @@ -1161,17 +1160,11 @@ H5FD__vfd_swmr_load_hdr_and_idx(H5FD_t *_file, hbool_t open) if (rc != TRUE) HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "could not read header"); -#if 0 - /* Error if header + index does not fit within md_pages_reserved - * - * This check doesn't make sense if the index floats, does it? --dyoung - */ - if (H5FD_MD_HEADER_SIZE + md_header.index_length > - (hsize_t)file->md_pages_reserved * md_header.fs_page_size) { - HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, - "header + index does not fit within md_pages_reserved"); + if (md_header.index_offset != last_index_offset) { + fprintf(stderr, "index offset %" PRIu64 "\n", + md_header.index_offset); + last_index_offset = md_header.index_offset; } -#endif if (open) ; // ignore tick number on open diff --git a/src/H5Fpkg.h b/src/H5Fpkg.h index de0f9f2..b0878f7 100644 --- a/src/H5Fpkg.h +++ b/src/H5Fpkg.h @@ -402,6 +402,7 @@ struct H5F_shared_t { * configuration from the * FAPL used to open the file */ + haddr_t writer_index_offset; hbool_t vfd_swmr; /* The file is opened with VFD * SWMR configured or not */ diff --git a/src/H5Ftest.c b/src/H5Ftest.c index ed3fec1..e1569a7 100644 --- a/src/H5Ftest.c +++ b/src/H5Ftest.c @@ -497,7 +497,7 @@ H5F__vfd_swmr_verify_md_hdr_and_idx(H5F_t *f, HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect index_length read from metadata file") /* Verify index_offset read from header in the metadata file is the size of md header */ - if(md_hdr->index_offset != H5FD_MD_HEADER_SIZE) + 
if(md_hdr->index_offset != f->shared->fs_page_size) HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect index_offset read from metadata file") /* Verify num_entries read from index in the metadata file is num_entries */ diff --git a/src/H5Fvfd_swmr.c b/src/H5Fvfd_swmr.c index 1eef792..6c9ba05 100644 --- a/src/H5Fvfd_swmr.c +++ b/src/H5Fvfd_swmr.c @@ -160,7 +160,7 @@ herr_t H5F_vfd_swmr_init(H5F_t *f, hbool_t file_create) { hsize_t md_size; /* Size of the metadata file */ - haddr_t md_addr; /* Address returned from H5MV_alloc() */ + haddr_t hdr_addr, idx_addr; /* Addresses returned from H5MV_alloc() */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) @@ -193,11 +193,24 @@ H5F_vfd_swmr_init(H5F_t *f, hbool_t file_create) md_size = (hsize_t)f->shared->vfd_swmr_config.md_pages_reserved * f->shared->fs_page_size; - /* Make sure that the free-space manager for the metadata file is initialized */ - if((md_addr = H5MV_alloc(f, md_size)) == HADDR_UNDEF) - HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \ - "error in allocating md_pages_reserved from the metadata file") - HDassert(H5F_addr_eq(md_addr, H5FD_MD_HEADER_OFF)); + assert(f->shared->fs_page_size >= H5FD_MD_HEADER_SIZE); + + /* Allocate an entire page from the shadow file for the header. 
*/ + if((hdr_addr = H5MV_alloc(f, f->shared->fs_page_size)) == HADDR_UNDEF) { + HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, + "error allocating shadow-file header"); + } + HDassert(H5F_addr_eq(hdr_addr, H5FD_MD_HEADER_OFF)); + + idx_addr = H5MV_alloc(f, md_size - f->shared->fs_page_size); + if (idx_addr == HADDR_UNDEF) { + HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, + "error allocating shadow-file index"); + } + + HDassert(H5F_addr_eq(idx_addr, f->shared->fs_page_size)); + + f->shared->writer_index_offset = idx_addr; /* Set the metadata file size to md_pages_reserved */ if ( -1 == HDftruncate(f->shared->vfd_swmr_md_fd, (HDoff_t)md_size) ) @@ -213,15 +226,15 @@ H5F_vfd_swmr_init(H5F_t *f, hbool_t file_create) */ if ( !file_create ) { - if ( H5F__vfd_swmr_construct_write_md_hdr(f, 0) < 0 ) + if ( H5F__vfd_swmr_construct_write_md_idx(f, 0, NULL) < 0 ) HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, \ - "fail to create header in md") + "fail to create index in md") - if ( H5F__vfd_swmr_construct_write_md_idx(f, 0, NULL) < 0 ) + if ( H5F__vfd_swmr_construct_write_md_hdr(f, 0) < 0 ) HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, \ - "fail to create index in md") + "fail to create header in md") } } else { /* VFD SWMR reader */ @@ -556,6 +569,12 @@ H5F_update_vfd_swmr_metadata_file(H5F_t *f, uint32_t num_entries, HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \ "unable to flush clean entry") +#if 0 + fprintf(stderr, "released %" PRIu32 " bytes at %" PRIu64 "\n", + old_image->length, + old_image->md_file_page_offset * f->shared->fs_page_size); +#endif + /* Remove the entry from the delayed list */ TAILQ_REMOVE(&f->shared->old_images, old_image, link); @@ -1601,7 +1620,7 @@ H5F__vfd_swmr_construct_write_md_hdr(H5F_t *f, uint32_t num_entries) /* Encode page size, tick number, index offset, index length */ UINT32ENCODE(p, f->shared->fs_page_size); UINT64ENCODE(p, f->shared->tick_num); - UINT64ENCODE(p, hdr_size); + UINT64ENCODE(p, f->shared->writer_index_offset); UINT64ENCODE(p, 
H5FD_MD_INDEX_SIZE(num_entries)); /* Calculate checksum for header */ @@ -1714,10 +1733,8 @@ H5F__vfd_swmr_construct_write_md_idx(H5F_t *f, uint32_t num_entries, /* Verify the md file descriptor exists */ HDassert(f->shared->vfd_swmr_md_fd >= 0); - /* Set to right after the header */ - if ( HDlseek(f->shared->vfd_swmr_md_fd, (HDoff_t)(H5FD_MD_HEADER_OFF + H5FD_MD_HEADER_SIZE), - SEEK_SET) < 0) - + if (HDlseek(f->shared->vfd_swmr_md_fd, + (HDoff_t)f->shared->writer_index_offset, SEEK_SET) < 0) HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, \ "unable to seek in metadata file") @@ -1815,13 +1832,14 @@ H5F__vfd_swmr_writer__create_index(H5F_t * f) HDassert(f->shared->mdf_idx_len == 0); HDassert(f->shared->mdf_idx_entries_used == 0); - bytes_available = (size_t)f->shared->fs_page_size * - (size_t)(f->shared->vfd_swmr_config.md_pages_reserved) - - H5FD_MD_HEADER_SIZE; + bytes_available = + (size_t)f->shared->fs_page_size * + (size_t)(f->shared->vfd_swmr_config.md_pages_reserved - 1); HDassert(bytes_available > 0); - entries_in_index = bytes_available / H5FD_MD_INDEX_ENTRY_SIZE; + entries_in_index = + (bytes_available - H5FD_MD_INDEX_SIZE(0)) / H5FD_MD_INDEX_ENTRY_SIZE; HDassert(entries_in_index > 0); @@ -1844,6 +1862,61 @@ done: } /* end H5F__vfd_swmr_writer__create_index() */ +H5FD_vfd_swmr_idx_entry_t * +vfd_swmr_enlarge_shadow_index(H5F_t *f) +{ + H5F_shared_t *shared = f->shared; + H5FD_vfd_swmr_idx_entry_t *ret_value = NULL; + haddr_t idx_addr; + hsize_t idx_size; + H5FD_vfd_swmr_idx_entry_t *new_mdf_idx = NULL, *old_mdf_idx; + uint32_t new_mdf_idx_len, old_mdf_idx_len; + + FUNC_ENTER_NOAPI(NULL) + + old_mdf_idx = shared->mdf_idx; + old_mdf_idx_len = shared->mdf_idx_len; + + if (UINT32_MAX - old_mdf_idx_len >= old_mdf_idx_len) + new_mdf_idx_len = old_mdf_idx_len * 2; + else + new_mdf_idx_len = UINT32_MAX; + + idx_size = H5FD_MD_INDEX_SIZE(new_mdf_idx_len); + + idx_addr = H5MV_alloc(f, idx_size); + + if (idx_addr == HADDR_UNDEF) { + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, 
NULL, + "shadow-file allocation failed for index") + } + + new_mdf_idx = HDmalloc(new_mdf_idx_len * sizeof(new_mdf_idx[0])); + + if (new_mdf_idx == NULL) { + (void)H5MV_free(f, idx_addr, idx_size); + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, + "memory allocation failed for md index") + } + + /* Copy the old index in its entirety to the new, instead of copying + * just the _entries_used, because the caller may have been in the + * process of adding entries, and some callers may not update + * _entries_used immediately. + */ + memcpy(new_mdf_idx, old_mdf_idx, sizeof(new_mdf_idx[0]) * old_mdf_idx_len); + + fprintf(stderr, "ding ding\n"); + /* TBD record previous index offset & size to be freed after the new one + * is in service. + */ + shared->writer_index_offset = idx_addr; + ret_value = shared->mdf_idx = new_mdf_idx; + shared->mdf_idx_len = new_mdf_idx_len; +done: + FUNC_LEAVE_NOAPI(ret_value) +} + /*------------------------------------------------------------------------- * diff --git a/src/H5PB.c b/src/H5PB.c index 86c5e21..3b140d7 100644 --- a/src/H5PB.c +++ b/src/H5PB.c @@ -1746,8 +1746,8 @@ H5PB_vfd_swmr__update_index(H5F_t *f, new_index_entry_index = shared->mdf_idx_entries_used + idx_ent_added++; - if ( new_index_entry_index >= shared->mdf_idx_len ) { - + if (new_index_entry_index >= shared->mdf_idx_len && + (idx = vfd_swmr_enlarge_shadow_index(f)) == NULL) { HDfprintf(stderr, "\n\nmax mdf index len (%" PRIu32 ") exceeded.\n\n", shared->mdf_idx_len); diff --git a/src/H5Pfapl.c b/src/H5Pfapl.c index a3873b4..f28a381 100644 --- a/src/H5Pfapl.c +++ b/src/H5Pfapl.c @@ -5582,9 +5582,9 @@ H5Pset_vfd_swmr_config(hid_t plist_id, H5F_vfd_swmr_config_t *config_ptr) if(config_ptr->max_lag < 3 ) HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "max_lag must be at least 3") - /* This field must be >= 1 */ - if(config_ptr->md_pages_reserved < 1 ) - HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "md_pages_reserved must be at least 1") + /* This field must be >= 2 */ + 
if(config_ptr->md_pages_reserved < 2 ) + HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "md_pages_reserved must be at least 2") /* This field must be in the range [0, 100] */ if(config_ptr->pb_expansion_threshold < 0 || config_ptr->pb_expansion_threshold > H5F__MAX_PB_EXPANSION_THRESHOLD) diff --git a/test/vfd_swmr.c b/test/vfd_swmr.c index 9cc481d..a3b52d5 100644 --- a/test/vfd_swmr.c +++ b/test/vfd_swmr.c @@ -63,7 +63,7 @@ static unsigned test_writer_md(void); * --version: should be a known version * --tick_len: should be >= 0 * --max_lag: should be >= 3 - * --md_pages_reserved: should be >= 1 + * --md_pages_reserved: should be >= 2 * --md_file_path: should contain the metadata file path (POSIX) * B) Verify that info set in the fapl is retrieved correctly. * @@ -709,7 +709,7 @@ test_writer_create_open_flush(void) my_config->tick_len = 1; my_config->max_lag = 3; my_config->writer = TRUE; - my_config->md_pages_reserved = 1; + my_config->md_pages_reserved = 2; HDstrcpy(my_config->md_file_path, MD_FILENAME); /* Set the VFD SWMR configuration in fapl */ -- cgit v0.12