diff options
-rw-r--r-- | src/H5FDprivate.h | 3 | ||||
-rw-r--r-- | src/H5FDvfd_swmr.c | 418 |
2 files changed, 201 insertions, 220 deletions
diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h index 2aba759..200ae44 100644 --- a/src/H5FDprivate.h +++ b/src/H5FDprivate.h @@ -81,8 +81,7 @@ /* Retries for metadata file */ #define H5FD_VFD_SWMR_MD_FILE_RETRY_MAX 50 /* Maximum retries when opening the MD file */ -#define H5FD_VFD_SWMR_MD_LOAD_RETRY_MAX 20 /* Maximum retries when trying to load the MD file header and index */ -#define H5FD_VFD_SWMR_MD_HEADER_RETRY_MAX 40 /* Maximum retries when deserializing the MD file header */ +#define H5FD_VFD_SWMR_MD_LOAD_RETRY_MAX 120 /* Maximum retries when trying to load the MD file header and index */ #define H5FD_VFD_SWMR_MD_INDEX_RETRY_MAX 5 /* Maximum retries when deserializing the MD file index */ diff --git a/src/H5FDvfd_swmr.c b/src/H5FDvfd_swmr.c index b372424..6df887a 100644 --- a/src/H5FDvfd_swmr.c +++ b/src/H5FDvfd_swmr.c @@ -81,10 +81,10 @@ static herr_t H5FD_vfd_swmr_lock(H5FD_t *_file, hbool_t rw); static herr_t H5FD_vfd_swmr_unlock(H5FD_t *_file); /* VFD SWMR */ -static herr_t H5FD__vfd_swmr_header_deserialize(H5FD_t *_file, +static htri_t H5FD__vfd_swmr_header_deserialize(H5FD_t *_file, H5FD_vfd_swmr_md_header *md_header); -static herr_t H5FD__vfd_swmr_index_deserialize(H5FD_t *_file, - H5FD_vfd_swmr_md_index *md_index, H5FD_vfd_swmr_md_header *md_header); +static htri_t H5FD__vfd_swmr_index_deserialize(const H5FD_t *_file, + H5FD_vfd_swmr_md_index *md_index, const H5FD_vfd_swmr_md_header *md_header); static herr_t H5FD__vfd_swmr_load_hdr_and_idx(H5FD_t *_file, hbool_t open); static const H5FD_class_t H5FD_vfd_swmr_g = { @@ -1136,71 +1136,81 @@ H5FD__vfd_swmr_load_hdr_and_idx(H5FD_t *_file, hbool_t open) (H5FD_vfd_swmr_t *)_file; bool do_try; h5_retry_t retry; - H5FD_vfd_swmr_md_header md_header; /* Metadata file header */ + H5FD_vfd_swmr_md_header md_header; /* Metadata file header, take 1 */ + H5FD_vfd_swmr_md_header md_header_two; /* Metadata file header, take 2 */ H5FD_vfd_swmr_md_index md_index; /* Metadata file index */ herr_t ret_value = SUCCEED; /* Return value */ + htri_t rc; FUNC_ENTER_STATIC for (do_try = h5_retry_init(&retry, H5FD_VFD_SWMR_MD_LOAD_RETRY_MAX, - 1, H5_RETRY_ONE_SECOND); + H5_RETRY_ONE_SECOND / 10, H5_RETRY_ONE_SECOND); do_try; do_try = h5_retry_next(&retry)) { - HDmemset(&md_header, 0, sizeof(H5FD_vfd_swmr_md_header)); - HDmemset(&md_index, 0, sizeof(H5FD_vfd_swmr_md_index)); - /* Load and decode the header */ + /* Load and decode the header. Go around again on a temporary + * failure (FALSE). Bail on an irrecoverable failure (FAIL). + */ + rc = H5FD__vfd_swmr_header_deserialize(_file, &md_header); - if(H5FD__vfd_swmr_header_deserialize(_file, &md_header) < 0) + /* Temporary failure, try again. */ + if (rc == FALSE) continue; - /* Error if header + index does not fit within md_pages_reserved */ - if((H5FD_MD_HEADER_SIZE + md_header.index_length) > - (uint64_t)((hsize_t)file->md_pages_reserved * - md_header.fs_page_size)) - - HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, \ - "header + index does not fit within md_pages_reserved") - - if(!open) { - - if(md_header.tick_num == file->md_header.tick_num) { - - break; - - } else if(md_header.tick_num < file->md_header.tick_num) + if (rc != TRUE) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "could not read header"); - HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, \ - "tick number read is less than local copy") - } +#if 0 + /* Error if header + index does not fit within md_pages_reserved + * + * This check doesn't make sense if the index floats, does it? --dyoung + */ + if (H5FD_MD_HEADER_SIZE + md_header.index_length > + (hsize_t)file->md_pages_reserved * md_header.fs_page_size) { + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "header + index does not fit within md_pages_reserved"); + } +#endif + + if (open) + ; // ignore tick number on open + else if (md_header.tick_num == file->md_header.tick_num) { + /* If the tick number in the header hasn't increased since last + * time, then there is not a complete new index to read, so + * get out. + */ + HGOTO_DONE(SUCCEED); + } else if (md_header.tick_num < file->md_header.tick_num) { + /* The tick number must not move backward. */ + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "tick number in header moved backwards"); + } HDassert(md_header.tick_num > file->md_header.tick_num || open); - /* Load and decode the index */ - if (H5FD__vfd_swmr_index_deserialize(_file, &md_index, &md_header) < 0) - continue; - - /* tick_num is the same in both header and index */ - if(md_header.tick_num == md_index.tick_num) { - - /* Copy header to VFD local copy */ - file->md_header = md_header; - - /* Free VFD local entries */ - if (file->md_index.entries != NULL) { + /* Load and decode the index. Go around again on a temporary + * failure (FALSE). Bail on an irrecoverable failure (FAIL). + */ + rc = H5FD__vfd_swmr_index_deserialize(_file, &md_index, &md_header); - HDassert(file->md_index.num_entries); + if (rc == FALSE) + continue; - file->md_index.entries = (H5FD_vfd_swmr_idx_entry_t *) - H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t, - file->md_index.entries); - } + if (rc != TRUE) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "could not read index"); - /* Copy index info to VFD local copy */ - file->md_index = md_index; - md_index.entries = NULL; + /* If the tick_num is the same in both header and index, + * and the header reads the same the second time as the first time, + * then we should have a consistent picture of the index. + */ + if (md_header.tick_num == md_index.tick_num && + (rc = H5FD__vfd_swmr_header_deserialize(_file, + &md_header_two)) == TRUE && + md_header.tick_num == md_header_two.tick_num && + md_header.index_length == md_header_two.index_length && + md_header.index_offset == md_header_two.index_offset) break; - } if (md_index.entries != NULL) { @@ -1210,13 +1220,23 @@ H5FD__vfd_swmr_load_hdr_and_idx(H5FD_t *_file, hbool_t open) md_index.entries); } + if (rc == FAIL) { + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "could not re-read header"); + } + +#if 0 /* Error when tick_num in header is more than one greater * than in the index + * + * It's ok if this happens, we'll catch it and retry + * until timeout. --dyoung */ if (md_header.tick_num > (md_index.tick_num + 1)) { HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "tick number mis-match in header and index"); } +#endif } /* Exhaust all retries for loading and decoding the md file header @@ -1227,6 +1247,21 @@ H5FD__vfd_swmr_load_hdr_and_idx(H5FD_t *_file, hbool_t open) "error in loading/decoding the metadata file header and index") } + /* Free VFD local entries */ + if (file->md_index.entries != NULL) { + + HDassert(file->md_index.num_entries); + + file->md_index.entries = (H5FD_vfd_swmr_idx_entry_t *) + H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t, + file->md_index.entries); + } + + /* Copy header and index to VFD */ + file->md_header = md_header; + file->md_index = md_index; + md_index.entries = NULL; + done: FUNC_LEAVE_NOAPI(ret_value) @@ -1249,91 +1284,59 @@ done: * *------------------------------------------------------------------------- */ -static herr_t +static htri_t H5FD__vfd_swmr_header_deserialize(H5FD_t *_file, H5FD_vfd_swmr_md_header *md_header) { H5FD_vfd_swmr_t *file = /* VFD SWMR file struct */ (H5FD_vfd_swmr_t *)_file; - struct stat stat_buf; /* Buffer for stat info */ uint8_t image[H5FD_MD_HEADER_SIZE]; /* Buffer for element data */ - uint32_t stored_chksum; /* Stored metadata checksum value */ - uint32_t computed_chksum; /* Computed metadata checksum */ - /* value */ - uint64_t nanosec = 1; /* # of nanoseconds to sleep */ - /* between retries */ - unsigned file_retries = /* Retries for 'stat' the file */ - H5FD_VFD_SWMR_MD_FILE_RETRY_MAX; - unsigned header_retries = /* Retries for loading header */ - H5FD_VFD_SWMR_MD_HEADER_RETRY_MAX; - uint8_t *p = NULL; /* Pointer to buffer */ - herr_t ret_value = SUCCEED; /* Return value */ + uint32_t stored_chksum; /* Stored metadata checksum */ + uint32_t computed_chksum; /* Computed metadata checksum */ + uint8_t *p; + htri_t ret_value = TRUE; uint64_t index_length; + ssize_t nread; FUNC_ENTER_STATIC - /* Try to stat the metadata file till md header size */ - do { - /* Retrieve the metadata file size */ - if(HDfstat(file->md_fd, &stat_buf)) - - HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, \ - "unable to fstat the md file") - - /* Verify file size is at least header size */ - if(stat_buf.st_size >= H5FD_MD_HEADER_SIZE) - break; - - /* Sleep and double the sleep time next time */ - H5_nanosleep(nanosec); - nanosec *= 2; - } while (--file_retries); - - /* Exhaust all retries for "stat" the md file */ - if(file_retries == 0) - - HGOTO_ERROR(H5E_VFL, H5E_OPENERROR, FAIL, \ - "unable to the metadata file after all retry attempts") + /* Set file pointer to the beginning the file */ + if (lseek(file->md_fd, H5FD_MD_HEADER_OFF, SEEK_SET) < 0) { + HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, \ + "unable to seek in metadata file"); + } - /* Try to get valid magic and checksum for header */ - p = image; - do { - /* Set file pointer to the beginning the file */ - if(HDlseek(file->md_fd, (HDoff_t)H5FD_MD_HEADER_OFF, SEEK_SET) < 0) + /* Read the header */ + nread = read(file->md_fd, image, H5FD_MD_HEADER_SIZE); - HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, \ - "unable to seek in metadata file") + /* Try again if a signal interrupted the read. */ + if (nread == -1 && errno == EINTR) + HGOTO_DONE(FALSE); - /* Read the header */ - if(HDread(file->md_fd, image, H5FD_MD_HEADER_SIZE) < - H5FD_MD_HEADER_SIZE) + /* We cannot recover from any other error by trying again, + * so bail out. + */ + if (nread == -1) { + HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, + "error in reading the shadow header"); + } - HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, \ - "error in reading the header in metadata file") + if ((uint64_t)nread < H5FD_MD_HEADER_SIZE) + HGOTO_DONE(FALSE); - /* Verify magic number */ - if(HDmemcmp(p, H5FD_MD_HEADER_MAGIC, (size_t)H5_SIZEOF_MAGIC) == 0) { + /* Verify magic number */ + if (memcmp(image, H5FD_MD_HEADER_MAGIC, H5_SIZEOF_MAGIC) != 0) + HGOTO_DONE(FALSE); - /* Verify stored and computed checksums are equal */ - H5F_get_checksums(image, H5FD_MD_HEADER_SIZE, &stored_chksum, - &computed_chksum); + /* Verify stored and computed checksums are equal */ + H5F_get_checksums(image, H5FD_MD_HEADER_SIZE, &stored_chksum, + &computed_chksum); - if(stored_chksum == computed_chksum) - break; - } - /* Sleep and double the sleep time next time */ - H5_nanosleep(nanosec); - nanosec *= 2; - } while(--header_retries); - - /* Exhaust all retries for loading the header */ - if(header_retries == 0) - - HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, \ - "incorrect checksum after after all read attempts") + if (stored_chksum != computed_chksum) + HGOTO_DONE(FALSE); /* Header magic is already valid */ - p += H5_SIZEOF_MAGIC; + p = image + H5_SIZEOF_MAGIC; /* Deserialize page size, tick number, index offset, index length */ UINT32DECODE(p, md_header->fs_page_size); @@ -1367,7 +1370,7 @@ done: -/*------------------------------------------------------------------------- +/* * Function: H5FD__vfd_swmr_index_deserialize() * * Purpose: Load and decode the index in the metadata file @@ -1378,26 +1381,23 @@ done: * --Decode the index entries if the tick number in the header and * the index match * - * Return: Success: SUCCEED + * Return: Success: TRUE * Failure: FAIL + * Retry: FALSE * - * Programmer: Vailin Choi - * - *------------------------------------------------------------------------- */ -static herr_t -H5FD__vfd_swmr_index_deserialize(H5FD_t *_file, - H5FD_vfd_swmr_md_index *md_index, H5FD_vfd_swmr_md_header *md_header) +static htri_t +H5FD__vfd_swmr_index_deserialize(const H5FD_t *_file, + H5FD_vfd_swmr_md_index *md_index, const H5FD_vfd_swmr_md_header *md_header) { - H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; /* VFD SWMR file struct */ + const H5FD_vfd_swmr_t *file = (const H5FD_vfd_swmr_t *)_file; uint8_t *image; /* Buffer */ uint8_t *p = NULL; /* Pointer to buffer */ uint32_t stored_chksum; /* Stored metadata checksum value */ uint32_t computed_chksum; /* Computed metadata checksum value */ unsigned i; /* Local index variable */ - herr_t ret_value = SUCCEED; /* Return value */ - h5_retry_t retry; /* retry state */ - bool do_try; /* more tries remain */ + htri_t ret_value = TRUE; + ssize_t nread; FUNC_ENTER_STATIC @@ -1407,96 +1407,82 @@ H5FD__vfd_swmr_index_deserialize(H5FD_t *_file, "memory allocation failed for index's on disk image buffer"); } - /* Verify magic and checksum for index */ - p = image; - for (do_try = h5_retry_init(&retry, H5FD_VFD_SWMR_MD_INDEX_RETRY_MAX, - H5_RETRY_DEFAULT_MINIVAL, - H5_RETRY_ONE_SECOND); - do_try; - do_try = h5_retry_next(&retry)) { - ssize_t nread; - - /* TBD On each try, seek to the header and read it. This - * entails merging H5FD__vfd_swmr_header_deserialize with this - * function (H5FD__vfd_swmr_index_deserialize). - */ - - /* We may seek past EOF. That's ok, the read(2) will catch that. */ - if (lseek(file->md_fd, (HDoff_t)md_header->index_offset, SEEK_SET) < 0){ - HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, - "unable to seek in metadata file"); - } + /* TBD On each try, seek to the header and read it. This + * entails merging H5FD__vfd_swmr_header_deserialize with this + * function (H5FD__vfd_swmr_index_deserialize). + */ - nread = read(file->md_fd, image, md_header->index_length); + /* We may seek past EOF. That's ok, the read(2) will catch that. */ + if (lseek(file->md_fd, (HDoff_t)md_header->index_offset, SEEK_SET) < 0){ + HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, + "unable to seek in metadata file"); + } - /* Try again if a signal interrupted the read. */ - if (nread == -1 && errno == EINTR) - continue; + nread = read(file->md_fd, image, md_header->index_length); - /* We cannot recover from any other error by trying again, - * so bail out. - */ - if (nread == -1) { - HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, - "error in reading the header in metadata file"); - } + /* Try again if a signal interrupted the read. */ + if (nread == -1 && errno == EINTR) + HGOTO_DONE(FALSE); - /* Try again if the read was not full. - * - * XXX XXX XXX - * A short read should not be possible under the protocol that - * I intend to adopt: the writer will write(2) the new index. - * In a second write(2), the header describing that index - * will be written. POSIX will guarantee that the former - * write is visible before the latter. Under the protocol, - * there should always be `index_length` bytes available to - * read at `index_offset`. If not, the reader should treat it - * like an unrecoverable error instead of retrying. - */ - if ((uint64_t)nread < md_header->index_length) - continue; + /* We cannot recover from any other error by trying again, + * so bail out. + */ + if (nread == -1) { + HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, + "error in reading the header in metadata file"); + } - /* If the index magic is incorrect, then assume that is a - * intermittent error such as a "torn write." Try again. - * - * XXX XXX XXX - * Under the new protocol, where the index is written in - * one write(2), and the header is written in a distinct - * second write(2), it is reasonable to expect that the - * index-write is complete when the index-read occurs. - * So we should not read bad magic because we read a - * "torn" write. - * - * (I am not sure I believe any recent version of UNIX or - * Linux suffers from torn writes! Linux manual pages - * indicate that there was an issue, but it was fixed.) - * - * It is possible under the new protocol that we read - * the header on tick `t`, then an arbitrary delay - * occurs (the user taps Control-Z, say), and then we - * read the index on tick `t + max_lag + 1` or later. - * In the mean time, the index may have moved, and its - * storage may have been reused. In that case, we could - * read bad magic. It's possible to recover, then by - * re-reading the header. - */ - if (memcmp(p, H5FD_MD_INDEX_MAGIC, H5_SIZEOF_MAGIC) != 0) - continue; + /* Try again if the read was not full. + * + * XXX XXX XXX + * A short read should not be possible under the protocol that + * I intend to adopt: the writer will write(2) the new index. + * In a second write(2), the header describing that index + * will be written. POSIX will guarantee that the former + * write is visible before the latter. Under the protocol, + * there should always be `index_length` bytes available to + * read at `index_offset`. If not, the reader should treat it + * like an unrecoverable error instead of retrying. + */ + if ((size_t)nread < md_header->index_length) + HGOTO_DONE(FALSE); - /* Verify stored and computed checksums are equal */ - H5F_get_checksums(image, md_header->index_length, &stored_chksum, - &computed_chksum); + /* If the index magic is incorrect, then assume that is a + * temporary error such as a "torn write." Try again. + * + * XXX XXX XXX + * Under the new protocol, where the index is written in + * one write(2), and the header is written in a distinct + * second write(2), it is reasonable to expect that the + * index-write is complete when the index-read occurs. + * So we should not read bad magic because we read a + * "torn" write. + * + * (I am not sure I believe any recent version of UNIX or + * Linux suffers from torn writes on local filesystems! + * Linux manual pages indicate that there was an issue, but + * it was fixed.) + * + * It is possible under the new protocol that we read + * the header on tick `t`, then an arbitrary delay + * occurs (the user taps Control-Z, say), and then we + * read the index on tick `t + max_lag + 1` or later. + * In the mean time, the index may have moved, and its + * storage may have been reused. In that case, we could + * read bad magic. It's possible to recover by + * re-reading the header. + */ + if (memcmp(image, H5FD_MD_INDEX_MAGIC, H5_SIZEOF_MAGIC) != 0) + HGOTO_DONE(FALSE); - if (stored_chksum == computed_chksum) - break; - } + /* Verify stored and computed checksums are equal */ + H5F_get_checksums(image, md_header->index_length, &stored_chksum, + &computed_chksum); - /* Exhaust all retries for loading the index */ - if (!do_try) - HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "read attempts exhausted") + if (stored_chksum != computed_chksum) + HGOTO_DONE(FALSE); - /* Magic is already valid */ - p += H5_SIZEOF_MAGIC; + p = image + H5_SIZEOF_MAGIC; /* Deserialize the index info: tick number, number of entries, entries, * checksum @@ -1521,7 +1507,8 @@ H5FD__vfd_swmr_index_deserialize(H5FD_t *_file, UINT32DECODE(p, md_index->entries[i].length); UINT32DECODE(p, md_index->entries[i].chksum); } - } + } else + md_index->entries = NULL; /* Checksum is already valid */ UINT32DECODE(p, stored_chksum); @@ -1537,20 +1524,15 @@ H5FD__vfd_swmr_index_deserialize(H5FD_t *_file, done: - if(image) { - + if (image != NULL) image = H5MM_xfree(image); - } - if(ret_value < 0) { + if (ret_value == FAIL && md_index->entries != NULL) { - if(md_index->entries) { + HDassert(md_index->num_entries != 0); - HDassert(md_index->num_entries); - - md_index->entries = (H5FD_vfd_swmr_idx_entry_t *) - H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t, md_index->entries); - } + md_index->entries = + H5FL_SEQ_FREE(H5FD_vfd_swmr_idx_entry_t, md_index->entries); } FUNC_LEAVE_NOAPI(ret_value) |