From 0fbd8654f7d5af36621cb4588936c16606bbe298 Mon Sep 17 00:00:00 2001 From: David Young Date: Fri, 24 Jan 2020 14:49:09 -0600 Subject: Prepare the VFD SWMR reader for a "floating" shadow index by overhauling the way that the shadow header and shadow index are loaded. In H5FD__vfd_swmr_load_hdr_and_idx(), adopt a new protocol for reading the shadow file: 0 If the maximum number of retries have been attempted, then exit with an error. 1 Try to read the shadow file *header*. If successful, continue to 2. If there is a hard failure, then return an error. If there is a failure that may be transient, then sleep and retry at 0. 2 If the tick number in the header is less than the tick last read by the VFD, then return an error. 3 If the tick number in the header is equal to the last tick read by the VFD, then exit without doing anything. 4 Try to read the shadow file *index*. If successful, continue to 5. If there is a hard failure, then return an error. If there is a failure that may be transient, then sleep and retry at 0. 5 If a different tick number was read from the index than from the index, then continue at 0. 6 Try to *re-read* the shadow file *header*. If successful, continue to 7. If there is a hard failure, then return an error. If there is a failure that may be transient, then sleep and retry at 0. 7 Compare the header that was read previously with the new header. If the new header is different than the old, then we may not have read the index at the right shadow-file offset, or the index may have been read in an inconsistent state, so sleep and retry at 0. Otherwise, return success. Simplify H5FD__vfd_swmr_header_deserialize() and H5FD__vfd_swmr_index_deserialize(). Remove their retry loops. Make each return TRUE on success, FALSE on an error that may be transient, and FAIL on an irrecoverable error. In H5FD__vfd_swmr_header_deserialize(), do not check the size of the shadow file with fstat(2), since the read(2) will fail if the file is too small. This saves us a system call. Lightly consti-ify H5FD__vfd_swmr_index_deserialize() arguments. In H5FD__vfd_swmr_load_hdr_and_idx(): Consolidate all of the retry-looping. Increase the initial retry delay from 1ns to 1/10s. Delete the disused maximum-retry constants. Use #if 0 to disable some error-checking code that ought to be unnecessary under the new protocol. Don't memset() the header and index header, but make sure they're fully initialized with real content, instead. --- src/H5FDprivate.h | 3 +- src/H5FDvfd_swmr.c | 418 +++++++++++++++++++++++++---------------------------- 2 files changed, 201 insertions(+), 220 deletions(-) diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h index 2aba759..200ae44 100644 --- a/src/H5FDprivate.h +++ b/src/H5FDprivate.h @@ -81,8 +81,7 @@ /* Retries for metadata file */ #define H5FD_VFD_SWMR_MD_FILE_RETRY_MAX 50 /* Maximum retries when opening the MD file */ -#define H5FD_VFD_SWMR_MD_LOAD_RETRY_MAX 20 /* Maximum retries when trying to load the MD file header and index */ -#define H5FD_VFD_SWMR_MD_HEADER_RETRY_MAX 40 /* Maximum retries when deserializing the MD file header */ +#define H5FD_VFD_SWMR_MD_LOAD_RETRY_MAX 120 /* Maximum retries when trying to load the MD file header and index */ #define H5FD_VFD_SWMR_MD_INDEX_RETRY_MAX 5 /* Maximum retries when deserializing the MD file index */ diff --git a/src/H5FDvfd_swmr.c b/src/H5FDvfd_swmr.c index b372424..6df887a 100644 --- a/src/H5FDvfd_swmr.c +++ b/src/H5FDvfd_swmr.c @@ -81,10 +81,10 @@ static herr_t H5FD_vfd_swmr_lock(H5FD_t *_file, hbool_t rw); static herr_t H5FD_vfd_swmr_unlock(H5FD_t *_file); /* VFD SWMR */ -static herr_t H5FD__vfd_swmr_header_deserialize(H5FD_t *_file, +static htri_t H5FD__vfd_swmr_header_deserialize(H5FD_t *_file, H5FD_vfd_swmr_md_header *md_header); -static herr_t H5FD__vfd_swmr_index_deserialize(H5FD_t *_file, - H5FD_vfd_swmr_md_index *md_index, H5FD_vfd_swmr_md_header *md_header); +static htri_t H5FD__vfd_swmr_index_deserialize(const H5FD_t *_file, + H5FD_vfd_swmr_md_index *md_index, const H5FD_vfd_swmr_md_header *md_header); static herr_t H5FD__vfd_swmr_load_hdr_and_idx(H5FD_t *_file, hbool_t open); static const H5FD_class_t H5FD_vfd_swmr_g = { @@ -1136,71 +1136,81 @@ H5FD__vfd_swmr_load_hdr_and_idx(H5FD_t *_file, hbool_t open) (H5FD_vfd_swmr_t *)_file; bool do_try; h5_retry_t retry; - H5FD_vfd_swmr_md_header md_header; /* Metadata file header */ + H5FD_vfd_swmr_md_header md_header; /* Metadata file header, take 1 */ + H5FD_vfd_swmr_md_header md_header_two; /* Metadata file header, take 2 */ H5FD_vfd_swmr_md_index md_index; /* Metadata file index */ herr_t ret_value = SUCCEED; /* Return value */ + htri_t rc; FUNC_ENTER_STATIC for (do_try = h5_retry_init(&retry, H5FD_VFD_SWMR_MD_LOAD_RETRY_MAX, - 1, H5_RETRY_ONE_SECOND); + H5_RETRY_ONE_SECOND / 10, H5_RETRY_ONE_SECOND); do_try; do_try = h5_retry_next(&retry)) { - HDmemset(&md_header, 0, sizeof(H5FD_vfd_swmr_md_header)); - HDmemset(&md_index, 0, sizeof(H5FD_vfd_swmr_md_index)); - /* Load and decode the header */ + /* Load and decode the header. Go around again on a temporary + * failure (FALSE). Bail on an irrecoverable failure (FAIL). + */ + rc = H5FD__vfd_swmr_header_deserialize(_file, &md_header); - if(H5FD__vfd_swmr_header_deserialize(_file, &md_header) < 0) + /* Temporary failure, try again. */ + if (rc == FALSE) continue; - /* Error if header + index does not fit within md_pages_reserved */ - if((H5FD_MD_HEADER_SIZE + md_header.index_length) > - (uint64_t)((hsize_t)file->md_pages_reserved * - md_header.fs_page_size)) - - HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, \ - "header + index does not fit within md_pages_reserved") - - if(!open) { - - if(md_header.tick_num == file->md_header.tick_num) { - - break; - - } else if(md_header.tick_num < file->md_header.tick_num) + if (rc != TRUE) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "could not read header"); - HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, \ - "tick number read is less than local copy") - } +#if 0 + /* Error if header + index does not fit within md_pages_reserved + * + * This check doesn't make sense if the index floats, does it? --dyoung + */ + if (H5FD_MD_HEADER_SIZE + md_header.index_length > + (hsize_t)file->md_pages_reserved * md_header.fs_page_size) { + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "header + index does not fit within md_pages_reserved"); + } +#endif + + if (open) + ; // ignore tick number on open + else if (md_header.tick_num == file->md_header.tick_num) { + /* If the tick number in the header hasn't increased since last + * time, then there is not a complete new index to read, so + * get out. + */ + HGOTO_DONE(SUCCEED); + } else if (md_header.tick_num < file->md_header.tick_num) { + /* The tick number must not move backward. */ + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "tick number in header moved backwards"); + } HDassert(md_header.tick_num > file->md_header.tick_num || open); - /* Load and decode the index */ - if (H5FD__vfd_swmr_index_deserialize(_file, &md_index, &md_header) < 0) - continue; - - /* tick_num is the same in both header and index */ - if(md_header.tick_num == md_index.tick_num) { - - /* Copy header to VFD local copy */ - file->md_header = md_header; - - /* Free VFD local entries */ - if (file->md_index.entries != NULL) { + /* Load and decode the index. Go around again on a temporary + * failure (FALSE). Bail on an irrecoverable failure (FAIL). + */ + rc = H5FD__vfd_swmr_index_deserialize(_file, &md_index, &md_header); - HDassert(file->md_index.num_entries); + if (rc == FALSE) + continue; - file->md_index.entries = (H5FD_vfd_swmr_idx_entry_t *) - H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t, - file->md_index.entries); - } + if (rc != TRUE) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "could not read index"); - /* Copy index info to VFD local copy */ - file->md_index = md_index; - md_index.entries = NULL; + /* If the tick_num is the same in both header and index, + * and the header reads the same the second time as the first time, + * then we should have a consistent picture of the index. + */ + if (md_header.tick_num == md_index.tick_num && + (rc = H5FD__vfd_swmr_header_deserialize(_file, + &md_header_two)) == TRUE && + md_header.tick_num == md_header_two.tick_num && + md_header.index_length == md_header_two.index_length && + md_header.index_offset == md_header_two.index_offset) break; - } if (md_index.entries != NULL) { @@ -1210,13 +1220,23 @@ H5FD__vfd_swmr_load_hdr_and_idx(H5FD_t *_file, hbool_t open) md_index.entries); } + if (rc == FAIL) { + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "could not re-read header"); + } + +#if 0 /* Error when tick_num in header is more than one greater * than in the index + * + * It's ok if this happens, we'll catch it and retry + * until timeout. --dyoung */ if (md_header.tick_num > (md_index.tick_num + 1)) { HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "tick number mis-match in header and index"); } +#endif } /* Exhaust all retries for loading and decoding the md file header @@ -1227,6 +1247,21 @@ H5FD__vfd_swmr_load_hdr_and_idx(H5FD_t *_file, hbool_t open) "error in loading/decoding the metadata file header and index") } + /* Free VFD local entries */ + if (file->md_index.entries != NULL) { + + HDassert(file->md_index.num_entries); + + file->md_index.entries = (H5FD_vfd_swmr_idx_entry_t *) + H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t, + file->md_index.entries); + } + + /* Copy header and index to VFD */ + file->md_header = md_header; + file->md_index = md_index; + md_index.entries = NULL; + done: FUNC_LEAVE_NOAPI(ret_value) @@ -1249,91 +1284,59 @@ done: * *------------------------------------------------------------------------- */ -static herr_t +static htri_t H5FD__vfd_swmr_header_deserialize(H5FD_t *_file, H5FD_vfd_swmr_md_header *md_header) { H5FD_vfd_swmr_t *file = /* VFD SWMR file struct */ (H5FD_vfd_swmr_t *)_file; - struct stat stat_buf; /* Buffer for stat info */ uint8_t image[H5FD_MD_HEADER_SIZE]; /* Buffer for element data */ - uint32_t stored_chksum; /* Stored metadata checksum value */ - uint32_t computed_chksum; /* Computed metadata checksum */ - /* value */ - uint64_t nanosec = 1; /* # of nanoseconds to sleep */ - /* between retries */ - unsigned file_retries = /* Retries for 'stat' the file */ - H5FD_VFD_SWMR_MD_FILE_RETRY_MAX; - unsigned header_retries = /* Retries for loading header */ - H5FD_VFD_SWMR_MD_HEADER_RETRY_MAX; - uint8_t *p = NULL; /* Pointer to buffer */ - herr_t ret_value = SUCCEED; /* Return value */ + uint32_t stored_chksum; /* Stored metadata checksum */ + uint32_t computed_chksum; /* Computed metadata checksum */ + uint8_t *p; + htri_t ret_value = TRUE; uint64_t index_length; + ssize_t nread; FUNC_ENTER_STATIC - /* Try to stat the metadata file till md header size */ - do { - /* Retrieve the metadata file size */ - if(HDfstat(file->md_fd, &stat_buf)) - - HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, \ - "unable to fstat the md file") - - /* Verify file size is at least header size */ - if(stat_buf.st_size >= H5FD_MD_HEADER_SIZE) - break; - - /* Sleep and double the sleep time next time */ - H5_nanosleep(nanosec); - nanosec *= 2; - } while (--file_retries); - - /* Exhaust all retries for "stat" the md file */ - if(file_retries == 0) - - HGOTO_ERROR(H5E_VFL, H5E_OPENERROR, FAIL, \ - "unable to the metadata file after all retry attempts") + /* Set file pointer to the beginning the file */ + if (lseek(file->md_fd, H5FD_MD_HEADER_OFF, SEEK_SET) < 0) { + HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, \ + "unable to seek in metadata file"); + } - /* Try to get valid magic and checksum for header */ - p = image; - do { - /* Set file pointer to the beginning the file */ - if(HDlseek(file->md_fd, (HDoff_t)H5FD_MD_HEADER_OFF, SEEK_SET) < 0) + /* Read the header */ + nread = read(file->md_fd, image, H5FD_MD_HEADER_SIZE); - HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, \ - "unable to seek in metadata file") + /* Try again if a signal interrupted the read. */ + if (nread == -1 && errno == EINTR) + HGOTO_DONE(FALSE); - /* Read the header */ - if(HDread(file->md_fd, image, H5FD_MD_HEADER_SIZE) < - H5FD_MD_HEADER_SIZE) + /* We cannot recover from any other error by trying again, + * so bail out. + */ + if (nread == -1) { + HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, + "error in reading the shadow header"); + } - HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, \ - "error in reading the header in metadata file") + if ((uint64_t)nread < H5FD_MD_HEADER_SIZE) + HGOTO_DONE(FALSE); - /* Verify magic number */ - if(HDmemcmp(p, H5FD_MD_HEADER_MAGIC, (size_t)H5_SIZEOF_MAGIC) == 0) { + /* Verify magic number */ + if (memcmp(image, H5FD_MD_HEADER_MAGIC, H5_SIZEOF_MAGIC) != 0) + HGOTO_DONE(FALSE); - /* Verify stored and computed checksums are equal */ - H5F_get_checksums(image, H5FD_MD_HEADER_SIZE, &stored_chksum, - &computed_chksum); + /* Verify stored and computed checksums are equal */ + H5F_get_checksums(image, H5FD_MD_HEADER_SIZE, &stored_chksum, + &computed_chksum); - if(stored_chksum == computed_chksum) - break; - } - /* Sleep and double the sleep time next time */ - H5_nanosleep(nanosec); - nanosec *= 2; - } while(--header_retries); - - /* Exhaust all retries for loading the header */ - if(header_retries == 0) - - HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, \ - "incorrect checksum after after all read attempts") + if (stored_chksum != computed_chksum) + HGOTO_DONE(FALSE); /* Header magic is already valid */ - p += H5_SIZEOF_MAGIC; + p = image + H5_SIZEOF_MAGIC; /* Deserialize page size, tick number, index offset, index length */ UINT32DECODE(p, md_header->fs_page_size); @@ -1367,7 +1370,7 @@ done: -/*------------------------------------------------------------------------- +/* * Function: H5FD__vfd_swmr_index_deserialize() * * Purpose: Load and decode the index in the metadata file @@ -1378,26 +1381,23 @@ done: * --Decode the index entries if the tick number in the header and * the index match * - * Return: Success: SUCCEED + * Return: Success: TRUE * Failure: FAIL + * Retry: FALSE * - * Programmer: Vailin Choi - * - *------------------------------------------------------------------------- */ -static herr_t -H5FD__vfd_swmr_index_deserialize(H5FD_t *_file, - H5FD_vfd_swmr_md_index *md_index, H5FD_vfd_swmr_md_header *md_header) +static htri_t +H5FD__vfd_swmr_index_deserialize(const H5FD_t *_file, + H5FD_vfd_swmr_md_index *md_index, const H5FD_vfd_swmr_md_header *md_header) { - H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; /* VFD SWMR file struct */ + const H5FD_vfd_swmr_t *file = (const H5FD_vfd_swmr_t *)_file; uint8_t *image; /* Buffer */ uint8_t *p = NULL; /* Pointer to buffer */ uint32_t stored_chksum; /* Stored metadata checksum value */ uint32_t computed_chksum; /* Computed metadata checksum value */ unsigned i; /* Local index variable */ - herr_t ret_value = SUCCEED; /* Return value */ - h5_retry_t retry; /* retry state */ - bool do_try; /* more tries remain */ + htri_t ret_value = TRUE; + ssize_t nread; FUNC_ENTER_STATIC @@ -1407,96 +1407,82 @@ H5FD__vfd_swmr_index_deserialize(H5FD_t *_file, "memory allocation failed for index's on disk image buffer"); } - /* Verify magic and checksum for index */ - p = image; - for (do_try = h5_retry_init(&retry, H5FD_VFD_SWMR_MD_INDEX_RETRY_MAX, - H5_RETRY_DEFAULT_MINIVAL, - H5_RETRY_ONE_SECOND); - do_try; - do_try = h5_retry_next(&retry)) { - ssize_t nread; - - /* TBD On each try, seek to the header and read it. This - * entails merging H5FD__vfd_swmr_header_deserialize with this - * function (H5FD__vfd_swmr_index_deserialize). - */ - - /* We may seek past EOF. That's ok, the read(2) will catch that. */ - if (lseek(file->md_fd, (HDoff_t)md_header->index_offset, SEEK_SET) < 0){ - HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, - "unable to seek in metadata file"); - } + /* TBD On each try, seek to the header and read it. This + * entails merging H5FD__vfd_swmr_header_deserialize with this + * function (H5FD__vfd_swmr_index_deserialize). + */ - nread = read(file->md_fd, image, md_header->index_length); + /* We may seek past EOF. That's ok, the read(2) will catch that. */ + if (lseek(file->md_fd, (HDoff_t)md_header->index_offset, SEEK_SET) < 0){ + HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, + "unable to seek in metadata file"); + } - /* Try again if a signal interrupted the read. */ - if (nread == -1 && errno == EINTR) - continue; + nread = read(file->md_fd, image, md_header->index_length); - /* We cannot recover from any other error by trying again, - * so bail out. - */ - if (nread == -1) { - HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, - "error in reading the header in metadata file"); - } + /* Try again if a signal interrupted the read. */ + if (nread == -1 && errno == EINTR) + HGOTO_DONE(FALSE); - /* Try again if the read was not full. - * - * XXX XXX XXX - * A short read should not be possible under the protocol that - * I intend to adopt: the writer will write(2) the new index. - * In a second write(2), the header describing that index - * will be written. POSIX will guarantee that the former - * write is visible before the latter. Under the protocol, - * there should always be `index_length` bytes available to - * read at `index_offset`. If not, the reader should treat it - * like an unrecoverable error instead of retrying. - */ - if ((uint64_t)nread < md_header->index_length) - continue; + /* We cannot recover from any other error by trying again, + * so bail out. + */ + if (nread == -1) { + HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, + "error in reading the header in metadata file"); + } - /* If the index magic is incorrect, then assume that is a - * intermittent error such as a "torn write." Try again. - * - * XXX XXX XXX - * Under the new protocol, where the index is written in - * one write(2), and the header is written in a distinct - * second write(2), it is reasonable to expect that the - * index-write is complete when the index-read occurs. - * So we should not read bad magic because we read a - * "torn" write. - * - * (I am not sure I believe any recent version of UNIX or - * Linux suffers from torn writes! Linux manual pages - * indicate that there was an issue, but it was fixed.) - * - * It is possible under the new protocol that we read - * the header on tick `t`, then an arbitrary delay - * occurs (the user taps Control-Z, say), and then we - * read the index on tick `t + max_lag + 1` or later. - * In the mean time, the index may have moved, and its - * storage may have been reused. In that case, we could - * read bad magic. It's possible to recover, then by - * re-reading the header. - */ - if (memcmp(p, H5FD_MD_INDEX_MAGIC, H5_SIZEOF_MAGIC) != 0) - continue; + /* Try again if the read was not full. + * + * XXX XXX XXX + * A short read should not be possible under the protocol that + * I intend to adopt: the writer will write(2) the new index. + * In a second write(2), the header describing that index + * will be written. POSIX will guarantee that the former + * write is visible before the latter. Under the protocol, + * there should always be `index_length` bytes available to + * read at `index_offset`. If not, the reader should treat it + * like an unrecoverable error instead of retrying. + */ + if ((size_t)nread < md_header->index_length) + HGOTO_DONE(FALSE); - /* Verify stored and computed checksums are equal */ - H5F_get_checksums(image, md_header->index_length, &stored_chksum, - &computed_chksum); + /* If the index magic is incorrect, then assume that is a + * temporary error such as a "torn write." Try again. + * + * XXX XXX XXX + * Under the new protocol, where the index is written in + * one write(2), and the header is written in a distinct + * second write(2), it is reasonable to expect that the + * index-write is complete when the index-read occurs. + * So we should not read bad magic because we read a + * "torn" write. + * + * (I am not sure I believe any recent version of UNIX or + * Linux suffers from torn writes on local filesystems! + * Linux manual pages indicate that there was an issue, but + * it was fixed.) + * + * It is possible under the new protocol that we read + * the header on tick `t`, then an arbitrary delay + * occurs (the user taps Control-Z, say), and then we + * read the index on tick `t + max_lag + 1` or later. + * In the mean time, the index may have moved, and its + * storage may have been reused. In that case, we could + * read bad magic. It's possible to recover by + * re-reading the header. + */ + if (memcmp(image, H5FD_MD_INDEX_MAGIC, H5_SIZEOF_MAGIC) != 0) + HGOTO_DONE(FALSE); - if (stored_chksum == computed_chksum) - break; - } + /* Verify stored and computed checksums are equal */ + H5F_get_checksums(image, md_header->index_length, &stored_chksum, + &computed_chksum); - /* Exhaust all retries for loading the index */ - if (!do_try) - HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "read attempts exhausted") + if (stored_chksum != computed_chksum) + HGOTO_DONE(FALSE); - /* Magic is already valid */ - p += H5_SIZEOF_MAGIC; + p = image + H5_SIZEOF_MAGIC; /* Deserialize the index info: tick number, number of entries, entries, * checksum @@ -1521,7 +1507,8 @@ H5FD__vfd_swmr_index_deserialize(H5FD_t *_file, UINT32DECODE(p, md_index->entries[i].length); UINT32DECODE(p, md_index->entries[i].chksum); } - } + } else + md_index->entries = NULL; /* Checksum is already valid */ UINT32DECODE(p, stored_chksum); @@ -1537,20 +1524,15 @@ H5FD__vfd_swmr_index_deserialize(H5FD_t *_file, done: - if(image) { - + if (image != NULL) image = H5MM_xfree(image); - } - if(ret_value < 0) { + if (ret_value == FAIL && md_index->entries != NULL) { - if(md_index->entries) { + HDassert(md_index->num_entries != 0); - HDassert(md_index->num_entries); - - md_index->entries = (H5FD_vfd_swmr_idx_entry_t *) - H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t, md_index->entries); - } + md_index->entries = + H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t, md_index->entries); } FUNC_LEAVE_NOAPI(ret_value) -- cgit v0.12