diff options
author | David Young <dyoung@hdfgroup.org> | 2020-05-13 15:30:00 (GMT) |
---|---|---|
committer | David Young <dyoung@hdfgroup.org> | 2020-05-13 15:30:00 (GMT) |
commit | b4813b0c54f017ffd59dd0e7dd68a5fdbbc799a2 (patch) | |
tree | 4e093587d8b99a1a6eeee21378bdc83524ebc4cc /src/H5FDvfd_swmr.c | |
parent | 92142cba386922cf7a087dbf26aef9c189cdef0b (diff) | |
download | hdf5-b4813b0c54f017ffd59dd0e7dd68a5fdbbc799a2.zip hdf5-b4813b0c54f017ffd59dd0e7dd68a5fdbbc799a2.tar.gz hdf5-b4813b0c54f017ffd59dd0e7dd68a5fdbbc799a2.tar.bz2 |
In H5FD_vfd_swmr_read(), do not verify checksums on shadow entries
that are longer than the buffer that the caller supplied: the checksum
usually will fail, but that's not actually a fatal condition, and
usually we will have another opportunity to verify the checksum.
In H5FD_vfd_swmr_read(), remove a bunch of disused code.
In H5FD_vfd_swmr_read(), do not re-read a shadow image that has a
bad checksum, because a bad checksum indicates a serious problem
(writer outran reader, OS defect, hardware failure) from which
H5FD_vfd_swmr_read() cannot recover.
Rationale: the writer write(2)s new shadow images before the new index,
and the new index before the new header. In H5FD_vfd_swmr_read(),
the reader has read(2) both the index and the header in full. POSIX
semantics indicate that in these circumstances, the last shadow image
write(2) MUST be completely visible when we read(2). That is, the index
write(2) & read(2) and the header write(2) & read(2) pair cannot
divide a preceding shadow-image write(2).
The reader may see a "torn" image at this juncture if, for example,
the writer got max_lag ticks ahead of it and reused the storage for
this shadow image. Even if the reader "recovers" by re-reading the
image until its checksum is correct, it cannot be sure that the
image thus read is the right one for the HDF5 address passed to
H5FD_vfd_swmr_read(), and it cannot be sure that the image thus read is
not stale, because it's operating with an out-of-date shadow index.
Add log outlets swmr_read, swmr_read_exception, and swmr_read_err.
Log to `swmr_read` on entry to H5FD_vfd_swmr_read(), log to
`swmr_read_exception` when checksums are skipped for exceptional
conditions (page buffer not configured, buffer shorter than shadow
image), and log to `swmr_read_err` when the checksum fails.
Diffstat (limited to 'src/H5FDvfd_swmr.c')
-rw-r--r-- | src/H5FDvfd_swmr.c | 220 |
1 files changed, 63 insertions, 157 deletions
diff --git a/src/H5FDvfd_swmr.c b/src/H5FDvfd_swmr.c index ef6b6b0..12fe518 100644 --- a/src/H5FDvfd_swmr.c +++ b/src/H5FDvfd_swmr.c @@ -88,6 +88,9 @@ static htri_t H5FD__vfd_swmr_index_deserialize(const H5FD_t *_file, static herr_t H5FD__vfd_swmr_load_hdr_and_idx(H5FD_t *_file, hbool_t open); HLOG_OUTLET_SHORT_DEFN(index_motion, swmr); +HLOG_OUTLET_SHORT_DEFN(swmr_read, swmr); +HLOG_OUTLET_SHORT_DEFN(swmr_read_exception, swmr_read); +HLOG_OUTLET_MEDIUM_DEFN(swmr_read_err, swmr_read_exception, HLOG_OUTLET_S_ON); static const H5FD_class_t H5FD_vfd_swmr_g = { "vfd_swmr", /* name */ @@ -658,28 +661,26 @@ done: static herr_t H5FD_vfd_swmr_read(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, - const haddr_t addr, size_t size, void *buf /*out*/) + const haddr_t addr, size_t size, void * const buf /*out*/) { const size_t init_size = size; haddr_t target_page; haddr_t page_offset; H5FD_vfd_swmr_t *file = (H5FD_vfd_swmr_t *)_file; - H5FD_vfd_swmr_idx_entry_t *index = NULL; /* Metadata file index */ - H5FD_vfd_swmr_idx_entry_t *entry; - unsigned entry_retries = /* # of retries */ - H5FD_VFD_SWMR_MD_INDEX_RETRY_MAX; - uint64_t nanosec = 1; /* # of nanoseconds to sleep */ - /* between retries */ - uint32_t num_entries = 0; /* Number of entries in index */ - uint32_t fs_page_size; /* Page size */ - uint32_t computed_chksum; /* Computed checksum */ - herr_t ret_value = SUCCEED; /* Return value */ + H5FD_vfd_swmr_idx_entry_t *index, *entry; + uint32_t num_entries = 0; + uint32_t fs_page_size; + herr_t ret_value = SUCCEED; + char *p = buf; FUNC_ENTER_NOAPI_NOINIT HDassert(file && file->pub.cls); HDassert(buf); + hlog_fast(swmr_read, "%s: enter type %d addr %" PRIuHADDR " size %zu", + __func__, type, addr, size); + index = file->md_index.entries; num_entries = file->md_index.num_entries; fs_page_size = file->md_header.fs_page_size; @@ -687,28 +688,9 @@ H5FD_vfd_swmr_read(H5FD_t *_file, H5FD_mem_t type, /* Try finding the addr from the index */ target_page = addr / fs_page_size; -#if 0 /* JRM */ - HDfprintf(stderr, - "vfd swmr read target page/size = %" PRIu64 "/%" PRIu32 "\n", - target_page, size); - HDfprintf(stderr, "vfd swmr read index num_entries = %d\n", num_entries); -#endif /* JRM */ - entry = vfd_swmr_pageno_to_mdf_idx_entry(index, num_entries, target_page, false); -#if 0 /* JRM */ - if (entry != NULL) { - HDfprintf(stderr, - "vfd swmr read found target page/idx %" PRIu64 "/%td.\n", - target_page, entry - index); - } else { - HDfprintf(stderr, - "vfd swmr read passing through page / size = %" PRIu64 "/%zu\n", - target_page, size); - } -#endif /* JRM */ - if (entry == NULL) { /* Cannot find addr in index, read from the underlying hdf5 file */ if(H5FD_read(file->hdf5_file_lf, type, addr, size, buf) < 0) { @@ -723,149 +705,73 @@ H5FD_vfd_swmr_read(H5FD_t *_file, H5FD_mem_t type, page_offset = addr - (target_page * fs_page_size); -#if 0 /* JRM */ - if ( ( page_offset != 0 ) && - ( ( file->pb_configured ) || - ( page_offset + size > fs_page_size ) ) ) { - - HDfprintf(stderr, - "page_offset = %" PRIuHADDR ", size = %zu, " - "page_size = %" PRIu32 "\n", page_offset, size, fs_page_size); - } -#endif /* JRM */ - HDassert( ( page_offset == 0 ) || ( ( ! file->pb_configured ) && ( page_offset + size <= fs_page_size ) ) ); -#if 0 /* JRM */ - HDfprintf(stderr, - "addr = %" PRIuHADDR ", page = %" PRIuHADDR ", len = %zu\n", - addr, addr / fs_page_size, size); - HDfprintf(stderr, - "reading index[%td] fo/mdfo/l/chksum/fc/lc = %" PRIu64 "/%" PRIu64 "/%" PRIu32 "/%" PRIu32 "\n", - entry - index, - entry->hdf5_page_offset, - entry->md_file_page_offset, - entry->length, - entry->chksum); - - -#endif /* JRM */ - HDassert(entry->hdf5_page_offset * fs_page_size <= addr); HDassert(addr < (entry->hdf5_page_offset + 1) * fs_page_size); -#if 0 /* JRM */ - if ( size != entry->length ) { - - HDfprintf(stderr, - "size = %" PRIu32 ", index[%td].length = %" PRIu32 ".\n", - size, entry - index, entry->length); - } - - if ( ( init_size != 8 ) && - ( init_size != entry->length ) ) { - - HDfprintf(stderr, - "ERROR: addr = %" PRIuHADDR ", page = %" PRIuHADDR - ", len = %zu\n", addr, addr / fs_page_size, init_size); - } + HDassert(page_offset + init_size <= entry->length); - HDassert((init_size == 8) || - (init_size == entry->length)); -#endif /* JRM */ - - do { - char *p = buf; - - if(HDlseek(file->md_fd, (HDoff_t) - ((entry->md_file_page_offset * fs_page_size) - + page_offset), SEEK_SET) < 0) - - HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, \ - "unable to seek in metadata file") + if(HDlseek(file->md_fd, (HDoff_t) + ((entry->md_file_page_offset * fs_page_size) + + page_offset), SEEK_SET) < 0) + HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, "unable to seek in metadata file") - /* Coding borrowed from sec2 read */ - while(size > 0) { - - h5_posix_io_t bytes_in = 0; /* # of bytes to read */ - h5_posix_io_ret_t bytes_read = -1; /* # of bytes actually */ - /* read */ - - /* Trying to read more bytes than the return type can handle is - * undefined behavior in POSIX. - */ - if(size > H5_POSIX_MAX_IO_BYTES) - bytes_in = H5_POSIX_MAX_IO_BYTES; - else - bytes_in = (h5_posix_io_t)size; + /* Coding borrowed from sec2 read */ + while(size > 0) { - do { - bytes_read = HDread(file->md_fd, p, bytes_in); - } while(-1 == bytes_read && EINTR == errno); + h5_posix_io_t bytes_in; /* # of bytes to read */ + h5_posix_io_ret_t bytes_read; /* # of bytes actually read */ - if(-1 == bytes_read) /* error */ - - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, \ - "error reading the page/multi-page entry from the md file") - - HDassert(bytes_read >= 0); - HDassert((size_t)bytes_read <= size); - - size -= (size_t)bytes_read; - p += bytes_read; - } - - /* Verify stored and computed checksums are equal */ -#if 0 /* JRM */ - computed_chksum = H5_checksum_metadata(buf, entry->length, 0); -#else /* JRM */ - /* this is a hack to allow the library to find the superblock - * signature -- clean this up. - * JRM -- 1/14/19 + /* Trying to read more bytes than the return type can handle is + * undefined behavior in POSIX. */ - if ( file->pb_configured ) { - computed_chksum = H5_checksum_metadata(buf, entry->length, 0); - } else { - computed_chksum = entry->chksum; - } -#endif /* JRM */ - -#if 0 /* JRM */ - HDfprintf(stderr, - "computed / actual chksum / fc / lc = 0x%" PRIx32 "/0x%" PRIx32 "/%x/%x\n", - computed_chksum, - entry->chksum, - ((char *)buf)[0], - ((char *)buf)[4095]); -#endif /* JRM */ - - if(entry->chksum == computed_chksum) - break; + if(size > H5_POSIX_MAX_IO_BYTES) + bytes_in = MIN(H5_POSIX_MAX_IO_BYTES, size); + else + bytes_in = (h5_posix_io_t)size; - /* Double the sleep time next time */ - H5_nanosleep(nanosec); - nanosec *= 2; + do { + bytes_read = HDread(file->md_fd, p, bytes_in); + } while (-1 == bytes_read && EINTR == errno); - } while(--entry_retries); + if(-1 == bytes_read) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "error reading the page/multi-page entry from the md file") - /* Exhaust all retries for reading the page/multi-page entry */ - if(entry_retries == 0) { + HDassert(0 <= bytes_read && (size_t)bytes_read <= size); - HDfprintf(stderr, "ERROR: addr = %" PRIuHADDR - ", page = %" PRIuHADDR ", len = %zu\n", - addr, addr / fs_page_size, init_size); - HDfprintf(stderr, " type = %d\n", type); - HDfprintf(stderr, - "reading index[%td] fo/mdfo/l/chksum/fc/lc = %" PRIu64 "/%" PRIu64 "/%" PRIu32 "/%" PRIx32 "\n", - entry - index, - entry->hdf5_page_offset, - entry->md_file_page_offset, - entry->length, - entry->chksum); + size -= (size_t)bytes_read; + p += bytes_read; + } - HGOTO_ERROR(H5E_VFL, H5E_CANTLOAD, FAIL, \ - "error in reading the page/multi-page entry") + /* Verify stored and computed checksums are equal. + * + * Ignore the checksum if the buffer (buf, size) is not large enough + * to hold the entire shadow image. Assume that the caller will + * read the entry fully, later. + * + * Ignore checksum if the page buffer is not configured---this + * is John's hack to allow the library to find the superblock + * signature. + */ + if (!file->pb_configured) { + hlog_fast(swmr_read_exception, + "%s: skipping checksum, page buffer not configured", __func__); + } else if (entry->length != init_size) { + hlog_fast(swmr_read_exception, + "%s: skipping checksum, buffer size != entry size", __func__); + } else if (H5_checksum_metadata(buf, entry->length, 0) != entry->chksum) { + hlog_fast(swmr_read_err, "%s: bad checksum", __func__); + hlog_fast(swmr_read_err, "addr %" PRIuHADDR " page %" PRIuHADDR + " len %zu type %d ...", addr, addr / fs_page_size, init_size, type); + hlog_fast(swmr_read_err, "... index[%" PRId64 "] lower pgno %" PRIu64 + " shadow pgno %" PRIu64 " len %" PRIu32 " sum %" PRIx32, + (int64_t)(entry - index), entry->hdf5_page_offset, + entry->md_file_page_offset, entry->length, entry->chksum); + + HGOTO_ERROR(H5E_VFL, H5E_CANTLOAD, FAIL, + "checksum error in shadow file entry"); } done: |