diff options
-rw-r--r-- | src/H5FDprivate.h | 2 | ||||
-rw-r--r-- | src/H5Fint.c | 4 | ||||
-rw-r--r-- | src/H5Fpkg.h | 23 | ||||
-rw-r--r-- | src/H5Ftest.c | 19 | ||||
-rw-r--r-- | src/H5Fvfd_swmr.c | 223 | ||||
-rw-r--r-- | src/H5PB.c | 67 |
6 files changed, 195 insertions, 143 deletions
diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h index c6a53d6..9df0c13 100644 --- a/src/H5FDprivate.h +++ b/src/H5FDprivate.h @@ -347,6 +347,8 @@ H5_DLL herr_t H5FD_set_paged_aggr(H5FD_t *file, hbool_t paged); H5_DLL herr_t H5FD_get_driver_name(const H5FD_t *file, char **driver_name); /* Function prototypes for VFD SWMR */ +H5_DLL int vfd_swmr_idx_entry_defer_free(struct H5F_shared_t *, + const H5FD_vfd_swmr_idx_entry_t *); H5_DLL herr_t H5FD_vfd_swmr_get_tick_and_idx(H5FD_t *_file, hbool_t read_index, uint64_t *tick_ptr, uint32_t *num_entries_ptr, H5FD_vfd_swmr_idx_entry_t index[]); diff --git a/src/H5Fint.c b/src/H5Fint.c index 282a9de..661ec40 100644 --- a/src/H5Fint.c +++ b/src/H5Fint.c @@ -1117,9 +1117,7 @@ H5F__new(H5F_shared_t *shared, unsigned flags, hid_t fcpl_id, hid_t fapl_id, H5F f->shared->vfd_swmr_md_fd = -1; f->shared->fs_man_md = NULL; - f->shared->dl_head_ptr = NULL; - f->shared->dl_tail_ptr = NULL; - f->shared->dl_len = 0; + TAILQ_INIT(&f->shared->old_images); /* Get the VOL connector info */ if(H5F__set_vol_conn(f) < 0) diff --git a/src/H5Fpkg.h b/src/H5Fpkg.h index 8acd919..4dfcdc2 100644 --- a/src/H5Fpkg.h +++ b/src/H5Fpkg.h @@ -225,17 +225,15 @@ typedef struct H5F_mtab_t { * length: The length of the metadata page or multi page * metadata entry in BYTES. * tick_num: Sequence # of the current tick - * next: Pointer to the next entry in the the list - * prev: Pointer to the previous entry in the list + * link: tailqueue linkage */ -typedef struct H5F_vfd_swmr_dl_entry_t { +typedef struct old_image { uint64_t hdf5_page_offset; uint64_t md_file_page_offset; uint32_t length; uint64_t tick_num; - struct H5F_vfd_swmr_dl_entry_t *next; - struct H5F_vfd_swmr_dl_entry_t *prev; -} H5F_vfd_swmr_dl_entry_t; + TAILQ_ENTRY(old_image) link; +} old_image_t; /* Structure specifically to store superblock. 
This was originally * maintained entirely within H5F_shared_t, but is now extracted @@ -266,7 +264,9 @@ typedef struct deferred_free { uint64_t free_after_tick; } deferred_free_t; -typedef SIMPLEQ_HEAD(deferred_free_head, deferred_free) deferred_free_head_t; +typedef SIMPLEQ_HEAD(deferred_free_queue, deferred_free) deferred_free_queue_t; + +typedef TAILQ_HEAD(old_image_queue, old_image) old_image_queue_t; /* * Define the structure to store the file information for HDF5 files. One of @@ -410,7 +410,7 @@ struct H5F_shared_t { */ uint64_t tick_num; /* Number of the current tick */ struct timespec end_of_tick; /* End time of the current tick */ - deferred_free_head_t deferred_frees; /* For use by VFD SWMR writers. */ + deferred_free_queue_t deferred_frees; /* For use by VFD SWMR writers. */ /* VFD SWMR metadata file index */ H5FD_vfd_swmr_idx_entry_t * mdf_idx; /* pointer to an array of instance * of H5FD_vfd_swmr_idx_entry_t of @@ -467,12 +467,7 @@ struct H5F_shared_t { */ /* Delayed free space release doubly linked list */ - uint32_t dl_len; /* # of entries in the list */ - H5F_vfd_swmr_dl_entry_t *dl_head_ptr; /* Points to the beginning of - * the list - */ - H5F_vfd_swmr_dl_entry_t *dl_tail_ptr; /* Points to the end of the list */ - + old_image_queue_t old_images; char *extpath; /* Path for searching target external link file */ #ifdef H5_HAVE_PARALLEL diff --git a/src/H5Ftest.c b/src/H5Ftest.c index cebbbe9..90657ef 100644 --- a/src/H5Ftest.c +++ b/src/H5Ftest.c @@ -526,6 +526,17 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* H5F__vfd_swmr_verify_md_hdr_and_idx() */ +/* Test helper: return the number of entries currently linked on the `old_images` tail queue (0 when the queue is empty). The queue itself is not modified. */ static unsigned +count_old_images(old_image_queue_t *old_images) +{ + old_image_t *old_image; + unsigned count = 0; + /* The queue keeps no length field, so walk it through each entry's `link` and count one per entry. */ + TAILQ_FOREACH(old_image, old_images, link) + count++; + + return count; +} /*------------------------------------------------------------------------- @@ -536,15 +547,15 @@ done: * --info read from the metadata file is as indicated by * the input: num_entries, index * --# of entries on
the delayed list is as indicated by - * the input: num_dl_entries + * the input: nold_images * * Return: SUCCEED/FAIL * *------------------------------------------------------------------------- */ herr_t -H5F__vfd_swmr_writer_md_test(hid_t file_id, unsigned num_entries, H5FD_vfd_swmr_idx_entry_t *index, - unsigned num_dl_entries) +H5F__vfd_swmr_writer_md_test(hid_t file_id, unsigned num_entries, + H5FD_vfd_swmr_idx_entry_t *index, unsigned nold_images) { H5F_t *f; /* File pointer */ int md_fd = -1; /* The metadata file descriptor */ @@ -566,7 +577,7 @@ H5F__vfd_swmr_writer_md_test(hid_t file_id, unsigned num_entries, H5FD_vfd_swmr_ HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, "error updating the md file with the index") /* Verify the number of entries in the delayed list is as expected */ - if(f->shared->dl_len < num_dl_entries) + if(count_old_images(&f->shared->old_images) < nold_images) HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "incorrect # of entries in the delayed list") /* Open the metadata file */ diff --git a/src/H5Fvfd_swmr.c b/src/H5Fvfd_swmr.c index 72c2aec..11187b1 100644 --- a/src/H5Fvfd_swmr.c +++ b/src/H5Fvfd_swmr.c @@ -57,38 +57,6 @@ #define nanosecs_per_second 1000000000 /* nanoseconds per second */ #define nanosecs_per_tenth_sec 100000000 /* nanoseconds per 0.1 second */ -/* Remove an entry from the doubly linked list */ -#define H5F__LL_REMOVE(entry_ptr, head_ptr, tail_ptr) \ -{ \ - if((head_ptr) == (entry_ptr)) { \ - (head_ptr) = (entry_ptr)->next; \ - if((head_ptr) != NULL ) \ - (head_ptr)->prev = NULL; \ - } else \ - (entry_ptr)->prev->next = (entry_ptr)->next; \ - if((tail_ptr) == (entry_ptr)) { \ - (tail_ptr) = (entry_ptr)->prev; \ - if((tail_ptr) != NULL) \ - (tail_ptr)->next = NULL; \ - } else \ - (entry_ptr)->next->prev = (entry_ptr)->prev; \ - entry_ptr->next = NULL; \ - entry_ptr->prev = NULL; \ -} /* H5F__LL_REMOVE() */ - -/* Prepend an entry to the doubly linked list */ -#define H5F__LL_PREPEND(entry_ptr, head_ptr, tail_ptr) \ -{ \ - 
if((head_ptr) == NULL) { \ - (head_ptr) = (entry_ptr); \ - (tail_ptr) = (entry_ptr); \ - } else { \ - (head_ptr)->prev = (entry_ptr); \ - (entry_ptr)->next = (head_ptr); \ - (head_ptr) = (entry_ptr); \ - } \ -} /* H5F__LL_PREPEND() */ - /********************/ /* Local Prototypes */ /********************/ @@ -118,6 +86,23 @@ unsigned int vfd_swmr_api_entries_g = 0;/* Times the library was entered * on the 0->1 and 1->0 * transitions. */ +static const bool ldbg_enabled = false; +/* ldbgf: printf-style debug trace. Formats `fmt` and its arguments to stdout via vprintf() when ldbg_enabled is true; does nothing otherwise. */ +static void +ldbgf(const char *fmt, ...) +{ + va_list ap; + /* Test the switch before va_start(): every va_start() must be matched by va_end() in the same function, so the early return may not sit between them. */ + if (!ldbg_enabled) + return; + + va_start(ap, fmt); + + (void)vprintf(fmt, ap); + + va_end(ap); +} + /* * The head of the end of tick queue (EOT queue) for files opened in either * VFD SWMR write or VFD SWMR read mode @@ -128,8 +113,8 @@ eot_queue_t eot_queue_g = TAILQ_HEAD_INITIALIZER(eot_queue_g); /* Local Variables */ /*******************/ -/* Declare a free list to manage the H5F_vfd_swmr_dl_entry_t struct */ -H5FL_DEFINE(H5F_vfd_swmr_dl_entry_t); +/* Declare a free list to manage the old_image_t struct */ +H5FL_DEFINE(old_image_t); /* Declare a free list to manage the eot_queue_entry_t struct */ H5FL_DEFINE(eot_queue_entry_t); @@ -316,7 +301,7 @@ done: herr_t H5F_vfd_swmr_close_or_flush(H5F_t *f, hbool_t closing) { - H5F_vfd_swmr_dl_entry_t *curr, *next; + old_image_t *curr; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) @@ -360,14 +345,12 @@ H5F_vfd_swmr_close_or_flush(H5F_t *f, hbool_t closing) "unable to close the free-space manager for the metadata file") /* Free the delayed list */ - for (curr = f->shared->dl_head_ptr; - curr != NULL && (next = curr->next, true); - curr = next) { - H5FL_FREE(H5F_vfd_swmr_dl_entry_t, curr); + while ((curr = TAILQ_FIRST(&f->shared->old_images)) != NULL) { + TAILQ_REMOVE(&f->shared->old_images, curr, link); + H5FL_FREE(old_image_t, curr); } - f->shared->dl_head_ptr = f->shared->dl_tail_ptr = NULL; - + assert(TAILQ_EMPTY(&f->shared->old_images)); } else { /* For file
flush */ /* Update end_of_tick */ @@ -383,6 +366,26 @@ done: } /* H5F_vfd_swmr_close_or_flush() */ +/* Queue a deferred-free record for the metadata-file image described by `entry`: its HDF5 page offset, metadata-file page offset and length are copied into a new old_image_t, stamped with shared->tick_num, and inserted at the head of shared->old_images. Returns 0 on success, -1 if the record cannot be allocated. */ int +vfd_swmr_idx_entry_defer_free(H5F_shared_t *shared, + const H5FD_vfd_swmr_idx_entry_t *entry) +{ + old_image_t *old_image; + /* Allocation failure is reported to the caller as -1; the queue is left untouched. */ + if (NULL == (old_image = H5FL_CALLOC(old_image_t))) + return -1; + /* Snapshot the entry's location and size, tagged with the current tick. */ + old_image->hdf5_page_offset = entry->hdf5_page_offset; + old_image->md_file_page_offset = entry->md_file_page_offset; + old_image->length = entry->length; + old_image->tick_num = shared->tick_num; + /* Debug trace only for the empty -> non-empty transition of the queue. */ + if (TAILQ_EMPTY(&shared->old_images)) + ldbgf("Adding to the old images list.\n"); + /* The newest record goes at the head of the queue. */ + TAILQ_INSERT_HEAD(&shared->old_images, old_image, link); + return 0; +} /*------------------------------------------------------------------------- @@ -421,14 +424,14 @@ done: */ herr_t H5F_update_vfd_swmr_metadata_file(H5F_t *f, uint32_t num_entries, - H5FD_vfd_swmr_idx_entry_t index[]) + H5FD_vfd_swmr_idx_entry_t *index) { - H5F_vfd_swmr_dl_entry_t *prev; /* Points to the previous entry - * in the delayed list - */ - H5F_vfd_swmr_dl_entry_t *dl_entry; /* Points to an entry in the - * delayed list - */ + old_image_t *prev; /* Points to the previous entry + * in the delayed list + */ + old_image_t *old_image; /* Points to an entry in the + * delayed list + */ haddr_t md_addr; /* Address in the metadata file */ unsigned i; /* Local index variable */ herr_t ret_value = SUCCEED; /* Return value */ @@ -436,10 +439,16 @@ H5F_update_vfd_swmr_metadata_file(H5F_t *f, uint32_t num_entries, FUNC_ENTER_NOAPI(FAIL) /* Sort index entries by increasing offset in the HDF5 file */ - if ( num_entries ) { - - HDqsort(index, num_entries, sizeof(H5FD_vfd_swmr_idx_entry_t), - H5F__idx_entry_cmp); + if (num_entries > 0) { + HDqsort(index, num_entries, sizeof(*index), H5F__idx_entry_cmp); +#if 0 + /* Assert that there are not any HDF5 page offsets duplicated in + * here.
+ */ + for (i = 1; i < num_entries; i++) { + assert(index[i].hdf5_page_offset != index[i - 1].hdf5_page_offset); + } +#endif } /* For each non-null entry_ptr in the index: @@ -456,69 +465,60 @@ H5F_update_vfd_swmr_metadata_file(H5F_t *f, uint32_t num_entries, */ for ( i = 0; i < num_entries; i++ ) { - if ( index[i].entry_ptr != NULL ) { - - /* Prepend previous image of the entry to the delayed list */ - if ( index[i].md_file_page_offset ) { - - if ( NULL == (dl_entry = H5FL_CALLOC(H5F_vfd_swmr_dl_entry_t))) - - HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, \ - "unable to allocate the delayed entry") - - dl_entry->hdf5_page_offset = index[i].hdf5_page_offset; - dl_entry->md_file_page_offset = index[i].md_file_page_offset; - dl_entry->length = index[i].length; - dl_entry->tick_num = f->shared->tick_num; - - H5F__LL_PREPEND(dl_entry, f->shared->dl_head_ptr, f->shared->dl_tail_ptr); - f->shared->dl_len++; + if (index[i].entry_ptr == NULL) + continue; + + /* Prepend previous image of the entry to the delayed list */ + if ( index[i].md_file_page_offset ) { + if (vfd_swmr_idx_entry_defer_free(f->shared, &index[i]) == -1) { + HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, \ + "unable to allocate the delayed entry") } + } - /* Allocate space for the entry in the metadata file */ - if((md_addr = H5MV_alloc(f, index[i].length)) == HADDR_UNDEF) + /* Allocate space for the entry in the metadata file */ + if((md_addr = H5MV_alloc(f, index[i].length)) == HADDR_UNDEF) - HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \ - "error in allocating space from the metadata file") + HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \ + "error in allocating space from the metadata file") - /* Compute checksum and update the index entry */ - index[i].md_file_page_offset = md_addr/f->shared->fs_page_size; - index[i].chksum = H5_checksum_metadata(index[i].entry_ptr, - (size_t)(index[i].length), 0); + /* Compute checksum and update the index entry */ + index[i].md_file_page_offset = 
md_addr/f->shared->fs_page_size; + index[i].chksum = H5_checksum_metadata(index[i].entry_ptr, + index[i].length, 0); #if 0 /* JRM */ - HDfprintf(stderr, - "writing index[%d] fo/mdfo/l/chksum/fc/lc = %lld/%lld/%ld/%lx/%lx/%lx\n", - i, - index[i].hdf5_page_offset, - index[i].md_file_page_offset, - index[i].length, - index[i].chksum, - (((char*)(index[i].entry_ptr))[0]), - (((char*)(index[i].entry_ptr))[4095])); - - HDassert(md_addr == index[i].md_file_page_offset * - f->shared->fs_page_size); - HDassert(f->shared->fs_page_size == 4096); + HDfprintf(stderr, + "writing index[%d] fo/mdfo/l/chksum/fc/lc = %lld/%lld/%ld/%lx/%lx/%lx\n", + i, + index[i].hdf5_page_offset, + index[i].md_file_page_offset, + index[i].length, + index[i].chksum, + (((char*)(index[i].entry_ptr))[0]), + (((char*)(index[i].entry_ptr))[4095])); + + HDassert(md_addr == index[i].md_file_page_offset * + f->shared->fs_page_size); + HDassert(f->shared->fs_page_size == 4096); #endif /* JRM */ - /* Seek and write the entry to the metadata file */ - if ( HDlseek(f->shared->vfd_swmr_md_fd, (HDoff_t)md_addr, - SEEK_SET) < 0) + /* Seek and write the entry to the metadata file */ + if ( HDlseek(f->shared->vfd_swmr_md_fd, (HDoff_t)md_addr, + SEEK_SET) < 0) - HGOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL, \ - "unable to seek in the metadata file") + HGOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL, \ + "unable to seek in the metadata file") - if ( HDwrite(f->shared->vfd_swmr_md_fd, index[i].entry_ptr, - index[i].length) != (ssize_t)index[i].length ) + if ( HDwrite(f->shared->vfd_swmr_md_fd, index[i].entry_ptr, + index[i].length) != (ssize_t)index[i].length ) - HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \ - "error in writing the page/multi-page entry to metadata file") + HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \ + "error in writing the page/multi-page entry to metadata file") - /* Set entry_ptr to NULL */ - index[i].entry_ptr = NULL; + /* Set entry_ptr to NULL */ + index[i].entry_ptr = NULL; - } /* end if */ } /* end for 
*/ /* Construct and write index to the metadata file */ @@ -544,27 +544,25 @@ H5F_update_vfd_swmr_metadata_file(H5F_t *f, uint32_t num_entries, * --remove the associated entries from the list */ - for (dl_entry = f->shared->dl_tail_ptr; - dl_entry != NULL && (prev = dl_entry->prev, true); - dl_entry = prev) { + TAILQ_FOREACH_REVERSE_SAFE(old_image, &f->shared->old_images, + old_image_queue, link, prev) { /* max_lag is at least 3 */ if ( ( f->shared->tick_num > f->shared->vfd_swmr_config.max_lag ) && - ( dl_entry->tick_num <= + ( old_image->tick_num <= f->shared->tick_num - f->shared->vfd_swmr_config.max_lag ) ) { - if ( H5MV_free(f, dl_entry->md_file_page_offset * - f->shared->fs_page_size, dl_entry->length) < 0 ) + if ( H5MV_free(f, old_image->md_file_page_offset * + f->shared->fs_page_size, old_image->length) < 0 ) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \ "unable to flush clean entry") /* Remove the entry from the delayed list */ - H5F__LL_REMOVE(dl_entry, f->shared->dl_head_ptr, f->shared->dl_tail_ptr) - f->shared->dl_len--; + TAILQ_REMOVE(&f->shared->old_images, old_image, link); /* Free the delayed entry struct */ - H5FL_FREE(H5F_vfd_swmr_dl_entry_t, dl_entry); + H5FL_FREE(old_image_t, old_image); } else { @@ -572,6 +570,9 @@ H5F_update_vfd_swmr_metadata_file(H5F_t *f, uint32_t num_entries, } } + if (TAILQ_EMPTY(&f->shared->old_images)) + ldbgf("Emptied the old images list.\n"); + done: FUNC_LEAVE_NOAPI(ret_value) @@ -1133,6 +1133,37 @@ done: } /* end H5PB_read() */ +/* Remove the metadata-file index entry for `page`, if the lookup finds one, by shifting the entries after it down one slot so the remaining entries keep their relative order. + * On a VFD SWMR writer, an entry with a nonzero md_file_page_offset is first queued for deferred free. Returns 0 on success or when no entry exists, -1 if queuing the deferred free fails (the index is then left unchanged). */ +static int +vfd_swmr_mdf_idx_entry_remove(H5F_shared_t *shared, uint64_t page) +{ + ptrdiff_t idx_idx; + H5FD_vfd_swmr_idx_entry_t *idx_entry; + + idx_entry = vfd_swmr_pageno_to_mdf_idx_entry(shared->mdf_idx, + shared->mdf_idx_entries_used, page); + + if (idx_entry == NULL) + return 0; + + if (shared->vfd_swmr_writer && idx_entry->md_file_page_offset != 0 && + vfd_swmr_idx_entry_defer_free(shared, idx_entry) != 0) + return -1; + + idx_idx = idx_entry - shared->mdf_idx;
+ /* Close the gap with memmove (source and destination overlap); when the removed entry is the last one there is nothing to move and the decrement below drops it. */ + if (shared->mdf_idx_entries_used > idx_idx + 1) { + const size_t ntocopy = + (size_t)(shared->mdf_idx_entries_used - (idx_idx + 1)); + memmove(&shared->mdf_idx[idx_idx], + &shared->mdf_idx[idx_idx + 1], + ntocopy * sizeof(shared->mdf_idx[idx_idx + 1])); + } + shared->mdf_idx_entries_used--; + return 0; +} + /*------------------------------------------------------------------------- * @@ -1239,22 +1270,34 @@ H5PB_remove_entry(H5F_shared_t *shared, haddr_t addr) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "forced eviction failed") - /* Do we need to remove the entry from the metadata file index in - * the VFD SWMR case? + /* We need to remove the entry from the shadow file index in + * the VFD SWMR case, so do that next. * - * Probably yes -- suppose a page is deallocated, and a multipage - * metadata entry is allocated at the same base address. This would - * change the metadata file entry size. + * If a multipage metadata entry is deallocated, and a new, single-page + * metadata entry is allocated at the same base address, then + * the old shadow index entry will still tell the size of the previous + * image, which is greater than a page, and a shadow-file flush will + * access bytes past the end of the entry's image. * - * However, this is sufficiently improbably that it doesn't cause - * problems (that I know of) at present. + * When we add code to allow entries + * to age out of the metadata file index, that may provide + * code that we can reuse to perform this invalidation. * - * Unless it does, hold off on this until we add code to allow entries - * to age out of the metadata file index, as that will give us the - * necessary infrastructure. + * It's also possible (I think) for the index-entry size to be set + * to one page, and then for a multipage entry to appear later at that + * same index entry.
The recorded size will still say the same, but + * the image will be bigger. So the shadow file will never see the + * entire image written, just the first page of the image. * - * JRM -- 12/6/18 + * XXX The H5PB__evict_entry() call immediately prior should have called + * XXX vfd_swmr_mdf_idx_entry_remove() for this page. Need to + * XXX move this detailed comment and delete the redundant call to + * XXX vfd_swmr_mdf_idx_entry_remove(), no? */ + if (vfd_swmr_mdf_idx_entry_remove(shared, page) == -1) { + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, + "failed to remove shadow index entry") + } } done: @@ -2369,6 +2412,8 @@ H5PB__evict_entry(H5F_shared_t *shared, H5PB_entry_t *entry_ptr, hbool_t force) /* remove the entry from the hash table */ H5PB__DELETE_FROM_INDEX(pb_ptr, entry_ptr, FAIL) + vfd_swmr_mdf_idx_entry_remove(shared, entry_ptr->page); + /* update stats for eviction */ H5PB__UPDATE_STATS_FOR_EVICTION(pb_ptr, entry_ptr) |