diff options
-rw-r--r-- | src/H5FDvfd_swmr.c | 136 |
1 files changed, 83 insertions, 53 deletions
diff --git a/src/H5FDvfd_swmr.c b/src/H5FDvfd_swmr.c index c37f674..c51c42a 100644 --- a/src/H5FDvfd_swmr.c +++ b/src/H5FDvfd_swmr.c @@ -1392,78 +1392,108 @@ H5FD__vfd_swmr_index_deserialize(H5FD_t *_file, H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; /* VFD SWMR file struct */ uint8_t *image; /* Buffer */ uint8_t *p = NULL; /* Pointer to buffer */ - struct stat stat_buf; /* Buffer for stat info */ uint32_t stored_chksum; /* Stored metadata checksum value */ uint32_t computed_chksum; /* Computed metadata checksum value */ - uint64_t nanosec = 1; /* # of nanoseconds to sleep between */ - /* retries */ unsigned i; /* Local index variable */ - unsigned file_retries = /* Retries for 'stat' the file */ - H5FD_VFD_SWMR_MD_FILE_RETRY_MAX; - unsigned index_retries = /* Retries for loading the index */ - H5FD_VFD_SWMR_MD_INDEX_RETRY_MAX; herr_t ret_value = SUCCEED; /* Return value */ + h5_retry_t retry; /* retry state */ + bool do_try; /* more tries remain */ FUNC_ENTER_STATIC - /* Try to stat the metadata file till at least md (header+index) size */ - do { - /* Retrieve the metadata file size */ - if(HDfstat(file->md_fd, &stat_buf)) - - HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, \ - "unable to fstat the md file") - - /* Verify file size is at least header size */ - if((uint64_t)stat_buf.st_size >= - (H5FD_MD_HEADER_SIZE + md_header->index_length)) - break; - - /* Sleep and double the sleep time next time */ - H5_nanosleep(nanosec); - nanosec *= 2; - } while (--file_retries); - /* Allocate buffer for reading index */ - if(NULL == (image = (uint8_t *)H5MM_malloc(md_header->index_length))) - - HGOTO_ERROR(H5E_VFL, H5E_CANTALLOC, FAIL, \ - "memory allocation failed for index's on disk image buffer") + if (NULL == (image = H5MM_malloc(md_header->index_length))) { + HGOTO_ERROR(H5E_VFL, H5E_CANTALLOC, FAIL, + "memory allocation failed for index's on disk image buffer"); + } /* Verify magic and checksum for index */ p = image; - do { - if(HDlseek(file->md_fd, (HDoff_t)md_header->index_offset, SEEK_SET) < 0) - - HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, \ - "unable to seek in metadata file") + for (do_try = h5_retry_init(&retry, H5FD_VFD_SWMR_MD_INDEX_RETRY_MAX, + H5_RETRY_DEFAULT_MINIVAL, + H5_RETRY_ONE_SECOND); + do_try; + do_try = h5_retry_next(&retry)) { + ssize_t nread; - if(HDread(file->md_fd, image, md_header->index_length) < - (int64_t)md_header->index_length) + /* TBD On each try, seek to the header and read it. This + * entails merging H5FD__vfd_swmr_header_deserialize with this + * function (H5FD__vfd_swmr_index_deserialize). + */ - HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, \ - "error in reading the header in metadata file") + /* We may seek past EOF. That's ok, the read(2) will catch that. */ + if (lseek(file->md_fd, (HDoff_t)md_header->index_offset, SEEK_SET) < 0){ + HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, + "unable to seek in metadata file"); + } - /* Verify valid magic for index */ - if(HDmemcmp(p, H5FD_MD_INDEX_MAGIC, (size_t)H5_SIZEOF_MAGIC) == 0) { + nread = read(file->md_fd, image, md_header->index_length); - /* Verify stored and computed checksums are equal */ - H5F_get_checksums(image, md_header->index_length, &stored_chksum, - &computed_chksum); + /* Try again if a signal interrupted the read. */ + if (nread == -1 && errno == EINTR) + continue; - if(stored_chksum == computed_chksum) - break; + /* We cannot recover from any other error by trying again, + * so bail out. + */ + if (nread == -1) { + HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, + "error in reading the header in metadata file"); } - /* Double the sleep time next time */ - H5_nanosleep(nanosec); - nanosec *= 2; - } while(--index_retries); - /* Exhaust all retries for loading the index */ - if(index_retries == 0) + /* Try again if the read was not full. + * + * XXX XXX XXX + * A short read should not be possible under the protocol that + * I intend to adopt: the writer will write(2) the new index. + * In a second write(2), the header describing that index + * will be written. POSIX will guarantee that the former + * write is visible before the latter. Under the protocol, + * there should always be `index_length` bytes available to + * read at `index_offset`. If not, the reader should treat it + * like an unrecoverable error instead of retrying. + */ + if ((uint64_t)nread < md_header->index_length) + continue; - HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, \ - "incorrect checksum after after all read attempts") + /* If the index magic is incorrect, then assume that is a + * intermittent error such as a "torn write." Try again. + * + * XXX XXX XXX + * Under the new protocol, where the index is written in + * one write(2), and the header is written in a distinct + * second write(2), it is reasonable to expect that the + * index-write is complete when the index-read occurs. + * So we should not read bad magic because we read a + * "torn" write. + * + * (I am not sure I believe any recent version of UNIX or + * Linux suffers from torn writes! Linux manual pages + * indicate that there was an issue, but it was fixed.) + * + * It is possible under the new protocol that we read + * the header on tick `t`, then an arbitrary delay + * occurs (the user taps Control-Z, say), and then we + * read the index on tick `t + max_lag + 1` or later. + * In the mean time, the index may have moved, and its + * storage may have been reused. In that case, we could + * read bad magic. It's possible to recover, then by + * re-reading the header. + */ + if (memcmp(p, H5FD_MD_INDEX_MAGIC, H5_SIZEOF_MAGIC) != 0) + continue; + + /* Verify stored and computed checksums are equal */ + H5F_get_checksums(image, md_header->index_length, &stored_chksum, + &computed_chksum); + + if (stored_chksum == computed_chksum) + break; + } + + /* Exhaust all retries for loading the index */ + if (!do_try) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "read attempts exhausted") /* Magic is already valid */ p += H5_SIZEOF_MAGIC; |