diff options
author | mainzer <mainzer#hdfgroup.org> | 2018-12-07 21:55:55 (GMT) |
---|---|---|
committer | mainzer <mainzer#hdfgroup.org> | 2018-12-07 21:55:55 (GMT) |
commit | 1520b17a3412544213e9a48b677034ff1d4d12da (patch) | |
tree | cace37fd9e9b667984cd41609f86a800d939d8c7 | |
parent | 8cb185cb081e3d63440b41c18555fbff0dc07732 (diff) | |
download | hdf5-1520b17a3412544213e9a48b677034ff1d4d12da.zip hdf5-1520b17a3412544213e9a48b677034ff1d4d12da.tar.gz hdf5-1520b17a3412544213e9a48b677034ff1d4d12da.tar.bz2 |
Interim check-in of VFD SWMR writer end-of-tick (EOT) code.
Added code supporting a first cut of the writer's end-of-tick operations.
Tested (to the extent possible) on charis and jelly.
-rw-r--r-- | src/H5Fint.c | 328 | ||||
-rw-r--r-- | src/H5Fpkg.h | 4 | ||||
-rw-r--r-- | src/H5PB.c | 163 | ||||
-rw-r--r-- | src/H5PBpkg.h | 126 | ||||
-rw-r--r-- | src/H5PBprivate.h | 2 | ||||
-rw-r--r-- | src/H5private.h | 77 | ||||
-rw-r--r-- | test/vfd_swmr_generator.c | 10 |
7 files changed, 609 insertions, 101 deletions
diff --git a/src/H5Fint.c b/src/H5Fint.c index d7ba880..658f1aa 100644 --- a/src/H5Fint.c +++ b/src/H5Fint.c @@ -126,6 +126,8 @@ static herr_t H5F__vfd_swmr_construct_write_md_hdr(H5F_t *f, uint32_t num_entrie static herr_t H5F__vfd_swmr_construct_write_md_idx(H5F_t *f, uint32_t num_entries, struct H5FD_vfd_swmr_idx_entry_t index[]); static herr_t H5F__idx_entry_cmp(const void *_entry1, const void *_entry2); static herr_t H5F__vfd_swmr_writer__create_index(H5F_t * f); +static herr_t H5F_vfd_swmr_writer__prep_for_flush_or_close(H5F_t *f); +static herr_t H5F__vfd_swmr_writer__wait_a_tick(H5F_t *f); @@ -138,7 +140,11 @@ H5F_t *vfd_swmr_file_g = NULL; /* Points to the file struct */ hbool_t vfd_swmr_g = FALSE; /* Is this a VFD SWMR configured file */ hbool_t vfd_swmr_writer_g = FALSE; /* Is this the VFD SWMR writer */ uint64_t tick_num_g = 0; /* The current tick_num */ +#if 1 /* clock_gettime() version */ /* JRM */ struct timespec end_of_tick_g; /* The current end_of_tick */ +#else /* gettimeofday() version */ /* JRM */ +struct timeval end_of_tick_g; /* The current end_of_tick */ +#endif /* gettimeofday() version */ /* JRM */ /*****************************/ @@ -1101,11 +1107,18 @@ H5F__new(H5F_file_t *shared, unsigned flags, hid_t fcpl_id, hid_t fapl_id, H5FD_ HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get VFD SWMR config info") /* Initialization for VFD SWMR */ - f->shared->vfd_swmr_md_fd = -1; - f->shared->fs_man_md = NULL; - f->shared->dl_head_ptr = NULL; - f->shared->dl_tail_ptr = NULL; - f->shared->dl_len = 0; + f->shared->vfd_swmr = FALSE; + f->shared->vfd_swmr_writer = FALSE; + f->shared->tick_num = 0; + f->shared->mdf_idx = NULL; + f->shared->mdf_idx_len = 0; + f->shared->mdf_idx_entries_used = 0; + + f->shared->vfd_swmr_md_fd = -1; + f->shared->fs_man_md = NULL; + f->shared->dl_head_ptr = NULL; + f->shared->dl_tail_ptr = NULL; + f->shared->dl_len = 0; /* Create a metadata cache with the specified number of elements. 
* The cache might be created with a different number of elements and @@ -1326,6 +1339,12 @@ H5F__dest(H5F_t *f, hbool_t flush) /* Push error, but keep going*/ HDONE_ERROR(H5E_FILE, H5E_CANTRELEASE, FAIL, "problems closing file") + /* If this is a VFD SWMR writer, prep for flush or close */ + if((f->shared->vfd_swmr) && (f->shared->vfd_swmr_writer) && + (H5F_vfd_swmr_writer__prep_for_flush_or_close(f) < 0)) + /* Push error, but keep going*/ + HDONE_ERROR(H5E_IO, H5E_CANTFLUSH, FAIL, "vfd swmr prep for flush or close failed") + /* Shutdown the page buffer cache */ if(H5PB_dest(f) < 0) /* Push error, but keep going*/ @@ -1955,6 +1974,12 @@ H5F__flush_phase2(H5F_t *f, hbool_t closing) /* Push error, but keep going*/ HDONE_ERROR(H5E_IO, H5E_CANTFLUSH, FAIL, "unable to flush metadata accumulator") + /* If this is a VFD SWMR writer, prep for flush or close */ + if((f->shared->vfd_swmr) && (f->shared->vfd_swmr_writer) && + (H5F_vfd_swmr_writer__prep_for_flush_or_close(f) < 0)) + /* Push error, but keep going*/ + HDONE_ERROR(H5E_IO, H5E_CANTFLUSH, FAIL, "vfd swmr prep for flush or close failed") + /* Flush the page buffer */ if(H5PB_flush(f) < 0) /* Push error, but keep going*/ @@ -2039,6 +2064,7 @@ H5F__close(hid_t file_id) if((nref = H5I_get_ref(file_id, FALSE)) < 0) HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get ID ref count") + if(nref == 1) if(H5F__flush(f) < 0) HGOTO_ERROR(H5E_FILE, H5E_CANTFLUSH, FAIL, "unable to flush cache") @@ -3606,7 +3632,7 @@ done: * For VFD SWMR writer: * * --set vfd_swmr_writer_g to TRUE - * --set tick_num_g to 0 + * --set tick_num_g to 1 * --create the metadata file * --when opening an existing HDF5 file, write header and * empty index in the metadata file @@ -3644,7 +3670,12 @@ H5F__vfd_swmr_init(H5F_t *f, hbool_t file_create) HDassert(f->shared->vfd_swmr_config.vfd_swmr_writer); vfd_swmr_writer_g = f->shared->vfd_swmr_writer = TRUE; - tick_num_g = f->shared->tick_num = 0; + tick_num_g = f->shared->tick_num = 1; + + if ( 
H5PB_vfd_swmr__set_tick(f) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, \ + "Can't update page buffer current tick") /* Create the metadata file */ if ( ((f->shared->vfd_swmr_md_fd = @@ -3907,6 +3938,7 @@ done: * *------------------------------------------------------------------------- */ +#if 1 /* clock_gettime() version */ /* JRM */ static herr_t H5F__vfd_swmr_update_end_of_tick_and_tick_num(H5F_t *f, hbool_t incr_tick_num) { @@ -3954,6 +3986,11 @@ H5F__vfd_swmr_update_end_of_tick_and_tick_num(H5F_t *f, hbool_t incr_tick_num) tick_num_g++; #endif /* JRM */ f->shared->tick_num = tick_num_g; + + if ( H5PB_vfd_swmr__set_tick(f) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, \ + "Can't update page buffer current tick") } /* @@ -3982,6 +4019,72 @@ done: } /* H5F__vfd_swmr_update_end_of_tick_and_tick_num() */ +#else /* gettimeofday() version */ /* JRM */ + +static herr_t +H5F__vfd_swmr_update_end_of_tick_and_tick_num(H5F_t *f, hbool_t incr_tick_num) +{ + struct timeval curr; /* Current time in struct timeval */ + struct timeval new_end_of_tick; /* new end_of_tick in struct timeval */ + uint64_t tlen_usecs; + uint64_t new_usecs; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Get current time in struct timespec */ + if ( HDgettimeofday(&curr, NULL) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, \ + "can't get time via gettimeofday()") + + /* Convert tick_len to u_secs */ + tlen_usecs = f->shared->vfd_swmr_config.tick_len * TENTH_SEC_TO_MICROSECS; + + /* compute new end of tick */ + new_end_of_tick.tv_sec = curr.tv_sec; + new_usecs = curr.tv_usec + tlen_usecs; + + while ( new_usecs > SECOND_TO_MICROSECS ) { + + (new_end_of_tick.tv_sec)++; + new_usecs -= SECOND_TO_MICROSECS; + } + + new_end_of_tick.tv_usec = (suseconds_t)new_usecs; + + /* Update end_of_tick_g, f->shared->end_of_tick */ + + HDmemcpy(&end_of_tick_g, &new_end_of_tick, sizeof(struct timeval)); + HDmemcpy(&f->shared->end_of_tick, &new_end_of_tick, sizeof(struct 
timeval)); + + /* + * Update tick_num_g, f->shared->tick_num + */ + if ( incr_tick_num ) { + + /* Regardless of elapsed time, only increment the tick num by 1 + * so as to avoid the possibility of using up all of max_lag in + * one or two ticks. + */ + tick_num_g++; + + f->shared->tick_num = tick_num_g; + + if ( H5PB_vfd_swmr__set_tick(f) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, \ + "Can't update page buffer current tick") + } + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5F__vfd_swmr_update_end_of_tick_and_tick_num() */ + +#endif /* gettimeofday() version */ /* JRM */ + /*------------------------------------------------------------------------- * @@ -4319,7 +4422,7 @@ done: * the metadata file index, failure to delay such writes can * result in message from the future bugs. * - * The easy case case is pages or multi-page metadata entries + * The easy case is pages or multi-page metadata entries * have just been allocated. Obviously, these can be written * immediately. This case is tracked and tested by the page * buffer proper. @@ -4363,7 +4466,7 @@ H5F_vfd_swmr_writer__delay_write(H5F_t *f, uint64_t page, idx = f->shared->mdf_idx; - HDassert((idx) ||( f->shared->tick_num <= 0)); + HDassert((idx) ||( f->shared->tick_num <= 1)); /* do a binary search on the metadata file index to see if * it already contains an entry for *pbe_ptr. @@ -4431,6 +4534,134 @@ done: /*------------------------------------------------------------------------- * + * Function: H5F_vfd_swmr_writer__prep_for_flush_or_close + * + * Purpose: In the context of the VFD SWMR writer, two issues must be + * addressed before the page buffer can be flushed -- as is + * necessary on both HDF5 file flush or close: + * + * 1) We must force an end of tick so as to clean the tick list + * in the page buffer. + * + * 2) If the page buffer delayed write list is not empty, we + * must repeatedly wait a tick and then run the writer end + * of tick function until the delayed write list drains. 
+ * + * This function manages these details. + * + * Return: SUCCEED/FAIL + * + * Programmer: John Mainzer 11/27/18 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +herr_t +H5F_vfd_swmr_writer__prep_for_flush_or_close(H5F_t *f) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->vfd_swmr); + HDassert(f->shared->vfd_swmr_writer); + HDassert(f->shared->pb_ptr); + + /* since we are about to flush the page buffer, force and end of + * tick so as to avoid attempts to flush entries on the page buffer + * tick list that were modified during the current tick. + */ + if ( H5F_vfd_swmr_writer_end_of_tick() < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, \ + "H5F_vfd_swmr_writer_end_of_tick() failed.") + + while(f->shared->pb_ptr->dwl_len > 0) { + + if(H5F__vfd_swmr_writer__wait_a_tick(f) < 0) + + HGOTO_ERROR(H5E_FILE, H5E_CANTFLUSH, FAIL, "wait a tick failed.") + } + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5F_vfd_swmr_writer__prep_for_flush_or_close() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5F__vfd_swmr_writer__wait_a_tick + * + * Purpose: Before a file that has been opened by a VFD SWMR writer, + * all pending delayed writes must be allowed drain. + * + * This function facilitates this by sleeping for a tick, and + * the running the writer end of tick function. + * + * It should only be called as part the flush or close operations. + * + * Return: SUCCEED/FAIL + * + * Programmer: John Mainzer 11/23/18 + * + * Changes: None. 
+ * + *------------------------------------------------------------------------- + */ +herr_t +H5F__vfd_swmr_writer__wait_a_tick(H5F_t *f) +{ + int result; + struct timespec req; + struct timespec rem; + uint64_t tick_in_nsec; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->vfd_swmr); + HDassert(f->shared->vfd_swmr_writer); + HDassert((f == vfd_swmr_file_g) || + ((vfd_swmr_file_g) && (f->shared == vfd_swmr_file_g->shared))); + + tick_in_nsec = f->shared->vfd_swmr_config.tick_len * TENTH_SEC_TO_NANOSECS; + req.tv_nsec = (long)(tick_in_nsec % SECOND_TO_NANOSECS); + req.tv_sec = (time_t)(tick_in_nsec / SECOND_TO_NANOSECS); + + result = HDnanosleep(&req, &rem); + + while ( result == -1 ) { + + req.tv_nsec = rem.tv_nsec; + req.tv_sec = rem.tv_sec; + result = HDnanosleep(&req, &rem); + } + + if ( result != 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "HDnanosleep() failed.") + + if ( H5F_vfd_swmr_writer_end_of_tick() < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, \ + "H5F_vfd_swmr_writer_end_of_tick() failed.") + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5F__vfd_swmr_writer__wait_a_tick() */ + + +/*------------------------------------------------------------------------- + * * Function: H5F_vfd_swmr_writer_end_of_tick * * Purpose: Main routine for managing the end of tick for the VFD @@ -4445,7 +4676,15 @@ done: * * 2) Flush the metadata cache to the page buffer. * - * 3) If this is the first tick (i.e. tick == 0), create the + * Note that we must run a tick after the destruction + * of the metadata cache, since this operation will usually + * dirty the first page in the HDF5 file. However, the + * metadata cache will no longer exist at this point. + * + * Thus, we must check for the existance of the metadata + * cache, and only attempt to flush it if it exists. + * + * 3) If this is the first tick (i.e. 
tick == 1), create the * in memory version of the metadata file index. * * 4) Scan the page buffer tick list, and use it to update @@ -4459,25 +4698,15 @@ done: * * (This is an optimization -- adress it later) * - * 6) Scan the page buffer delayed write list for entries that - * may now be written, and move any such entries to the - * page buffer LRU. - * - * (For the first cut, we will assume file was just created, - * that there have been no flushes, and that no entries - * have been removed from the metadata file index. Under - * these circumstances, the delayed write list must always - * be empty. Thus delay implementing this.) - * - * 7) Update the metadata file. Must do this before we + * 6) Update the metadata file. Must do this before we * release the tick list, as otherwise the page buffer * entry images may not be available. * - * 8) Release the page buffer tick list. + * 7) Release the page buffer tick list. * - * 9) Release any delayed writes whose delay has expired. + * 8) Release any delayed writes whose delay has expired. * - * 10) Increment the tick, and update the end of tick. + * 9) Increment the tick, and update the end of tick. * * In passing, generate log entries as appropriate. * @@ -4507,6 +4736,7 @@ H5F_vfd_swmr_writer_end_of_tick(void) HDassert(f->shared); HDassert(f->shared->pb_ptr); HDassert(f->shared->vfd_swmr_writer); + /* 1) If requested, flush all raw data to the HDF5 file. * @@ -4518,17 +4748,20 @@ H5F_vfd_swmr_writer_end_of_tick(void) } - /* 2) Flush the metadata cache to the page buffer. */ - if ( H5AC_flush(f) < 0 ) + /* 2) If it exists, flush the metadata cache to the page buffer. */ + if ( f->shared->cache ) { + + if ( H5AC_flush(f) < 0 ) + + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \ + "Can't flush metadata cache to the page buffer") + } - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \ - "Can't flush metadata cache to the page buffer") - - /* 3) If this is the first tick (i.e. 
tick == 0), create the + /* 3) If this is the first tick (i.e. tick == 1), create the * in memory version of the metadata file index. */ - if ( ( f->shared->tick_num == 0 ) && + if ( ( f->shared->tick_num == 1 ) && ( H5F__vfd_swmr_writer__create_index(f) < 0 ) ) @@ -4553,31 +4786,26 @@ H5F_vfd_swmr_writer_end_of_tick(void) */ - /* 6) Scan the page buffer delayed write list for entries that - * may now be written, and move any such entries to the - * page buffer LRU. - * - * (For the first cut, we will assume file was just created, - * that there have been no flushes, and that no entries - * have been removed from the metadata file index. Under - * these circumstances, the delayed write list must always - * be empty. Thus delay implementing this.) - */ - HDassert( f->shared->pb_ptr->dwl_len == 0 ); - - - /* 7) Update the metadata file. Must do this before we + /* 6) Update the metadata file. Must do this before we * release the tick list, as otherwise the page buffer * entry images may not be available. * * Note that this operation will restore the index to * sorted order. */ - if ( H5F_update_vfd_swmr_metadata_file(f, + if ( (uint32_t)(f->shared->mdf_idx_entries_used + idx_entries_added) > 0 ) { + + if ( H5F_update_vfd_swmr_metadata_file(f, (uint32_t)(f->shared->mdf_idx_entries_used + idx_entries_added), f->shared->mdf_idx) < 0 ) - HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't update MD file") + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't update MD file") + } else { + + if ( H5F_update_vfd_swmr_metadata_file(f, 0, NULL) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't update MD file") + } /* at this point the metadata file index should be sorted -- update * f->shared->mdf_idx_entries_used. @@ -4587,19 +4815,19 @@ H5F_vfd_swmr_writer_end_of_tick(void) HDassert(f->shared->mdf_idx_entries_used <= f->shared->mdf_idx_len); - /* 8) Release the page buffer tick list. */ + /* 7) Release the page buffer tick list. 
*/ if ( H5PB_vfd_swmr__release_tick_list(f) < 0 ) HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't release tick list") - /* 9) Release any delayed writes whose delay has expired */ + /* 8) Release any delayed writes whose delay has expired */ if ( H5PB_vfd_swmr__release_delayed_writes(f) < 0 ) HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't release delayed writes") - /* 10) Increment the tick, and update the end of tick. */ + /* 9) Increment the tick, and update the end of tick. */ if( vfd_swmr_file_g ) { /* Update end_of_tick */ diff --git a/src/H5Fpkg.h b/src/H5Fpkg.h index d725f77..55aae9e 100644 --- a/src/H5Fpkg.h +++ b/src/H5Fpkg.h @@ -393,7 +393,11 @@ struct H5F_file_t { * not */ uint64_t tick_num; /* Number of the current tick */ +#if 1 /* use clock_gettime() */ /* JRM */ struct timespec end_of_tick; /* End time of the current tick */ +#else /* use gettimeofday() */ /* JRM */ + struct timeval end_of_tick; /* End time of the current tick */ +#endif /* use gettimeofday() */ /* JRM */ /* VFD SWMR metadata file index */ H5FD_vfd_swmr_idx_entry_t * mdf_idx; /* pointer to an array of instance @@ -495,6 +495,7 @@ herr_t H5PB_create(H5F_t *f, size_t size, unsigned page_buf_min_meta_perc, unsigned page_buf_min_raw_perc) { + hbool_t vfd_swmr_writer = FALSE; int i; int32_t min_md_pages; int32_t min_rd_pages; @@ -545,6 +546,14 @@ H5PB_create(H5F_t *f, size_t size, unsigned page_buf_min_meta_perc, (int32_t)(size / f->shared->fs_page_size)); + /* compute vfd_swmr_writer */ + if ( ( H5F_VFD_SWMR_CONFIG(f) ) && ( H5F_INTENT(f) & H5F_ACC_RDWR ) ) { + + HDassert(f->shared->vfd_swmr_config.vfd_swmr_writer); + vfd_swmr_writer = TRUE; + } + + /* Allocate the new page buffering structure */ if(NULL == (pb_ptr = H5FL_MALLOC(H5PB_t))) @@ -591,7 +600,7 @@ H5PB_create(H5F_t *f, size_t size, unsigned page_buf_min_meta_perc, /* VFD SWMR specific fields. * The following fields are defined iff vfd_swmr_writer is TRUE. 
*/ - pb_ptr->vfd_swmr_writer = FALSE; + pb_ptr->vfd_swmr_writer = vfd_swmr_writer; pb_ptr->mpmde_count = 0; pb_ptr->cur_tick = 0; @@ -1097,9 +1106,12 @@ done: * Function: H5PB_remove_entry * * Purpose: Remove possible metadata entry with ADDR from the PB cache. + * * This is in response to the data corruption bug from fheap.c * with page buffering + page strategy. + * * Note: Large metadata page bypasses the PB cache. + * * Note: Update of raw data page (large or small sized) is * handled by the PB cache. * @@ -1109,6 +1121,13 @@ done: * * Changes: Reworked function for re-implementation of the page buffer. * + * In the context of VFD SWMR, it is possible that the + * discarded page or multi-page metadata entry has been + * modified during the current tick and/or is subject to a + * delayed write. We must detect this, and remove the entry + * from the tick list and/or delayed write list before it is + * evicted. + * * Vailin: I think we need to do this for raw data as well. * * JRM -- 10/23/18 @@ -1145,6 +1164,25 @@ H5PB_remove_entry(const H5F_t *f, haddr_t addr) HDassert(entry_ptr->addr == addr); HDassert(entry_ptr->size == pb_ptr->page_size); + if ( entry_ptr->modified_this_tick ) { + + H5PB__REMOVE_FROM_TL(pb_ptr, entry_ptr, FAIL); + + entry_ptr->modified_this_tick = FALSE; + } + + if ( entry_ptr->delay_write_until > 0 ) { + + entry_ptr->delay_write_until = 0; + + H5PB__REMOVE_FROM_DWL(pb_ptr, entry_ptr, FAIL) + + if ( ! ( entry_ptr->is_mpmde ) ) { + + H5PB__UPDATE_RP_FOR_INSERTION(pb_ptr, entry_ptr, FAIL); + } + } + /* if the entry is dirty, mark it clean before we evict */ if ( ( entry_ptr->is_dirty ) && ( H5PB__mark_entry_clean(pb_ptr, entry_ptr) < 0 ) ) @@ -1156,6 +1194,22 @@ H5PB_remove_entry(const H5F_t *f, haddr_t addr) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "forced eviction failed") + /* Do we need to remove the entry from the metadata file index in + * the VFD SWMR case? 
+ * + * Probably yes -- suppose a page is deallocated, and a multipage + * metadata entry is allocated at the same base address. This would + * change the metadata file entry size. + * + * However, this is sufficiently improbably that it doesn't cause + * problems (that I know of) at present. + * + * Unless it does, hold off on this until we add code to allow entries + * to age out of the metadata file index, as that will give us the + * necessary infrastructure. + * + * JRM -- 12/6/18 + */ } done: @@ -1312,12 +1366,12 @@ H5PB_vfd_swmr__release_delayed_writes(H5F_t * f) HDassert(entry_ptr->is_dirty); - H5PB__REMOVE_FROM_DWL(pb_ptr, entry_ptr, FAIL) - entry_ptr->delay_write_until = 0; - if ( entry_ptr->is_mpmde ) { /* flush and evict now */ + H5PB__REMOVE_FROM_DWL(pb_ptr, entry_ptr, FAIL) + if ( entry_ptr->is_mpmde ) { /* flush and evict now */ + if ( H5PB__flush_entry(f, pb_ptr, entry_ptr) < 0 ) HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ @@ -1429,6 +1483,59 @@ done: /*------------------------------------------------------------------------- * + * Function: H5PB_vfd_swmr__set_tick + * + * Purpose: At the beginning of each tick, the page buffer must be told + * to synchronize its copy of the current tick with that of + * the file to which the page buffer belongs. + * + * This function performs this function. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer -- 11/20/18 + * + * Changes: None. 
+ * + *------------------------------------------------------------------------- + */ +herr_t +H5PB_vfd_swmr__set_tick(H5F_t * f) +{ + H5PB_t * pb_ptr = NULL; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->vfd_swmr); + HDassert(f->shared->vfd_swmr_writer); + + pb_ptr = f->shared->pb_ptr; + + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(pb_ptr->vfd_swmr_writer); + + /* the tick must always increase by 1 -- verify this */ + if ( f->shared->tick_num != pb_ptr->cur_tick + 1 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "f->shared->tick_num != pb_ptr->cur_tick + 1 ?!?!") + + pb_ptr->cur_tick = f->shared->tick_num; + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB_vfd_swmr__release_tick_list */ + + +/*------------------------------------------------------------------------- + * * Function: H5PB_vfd_swmr__update_index * * Purpose: In the VFD SWMR writer, all metadata writes to the page @@ -1611,7 +1718,8 @@ H5PB_vfd_swmr__update_index(H5F_t * f, if ( new_index_entry_index >= f->shared->mdf_idx_len ) { - HDfprintf(stderr, "\n\nmax mdf index len exceeded.\n\n"); + HDfprintf(stderr, "\n\nmax mdf index len (%d)exceeded.\n\n", + f->shared->mdf_idx_len); exit(1); } @@ -1648,6 +1756,8 @@ H5PB_vfd_swmr__update_index(H5F_t * f, } HDassert(ie_ptr); + + pbe_ptr = pbe_ptr->tl_next; } /* scan the metadata file index for entries that don't appear in the @@ -2060,16 +2170,19 @@ H5PB__create_new_page(H5PB_t *pb_ptr, haddr_t addr, size_t size, entry_ptr->mem_type = type; entry_ptr->is_metadata = (type != H5FD_MEM_DRAW); entry_ptr->is_mpmde = ((entry_ptr->is_metadata) && - (size > pb_ptr->page_size)); + (size > pb_ptr->page_size)); entry_ptr->is_dirty = FALSE; /* insert in the hash table */ H5PB__INSERT_IN_INDEX(pb_ptr, entry_ptr, FAIL) inserted_in_index = TRUE; - /* insert at the head of the LRU */ - 
H5PB__UPDATE_RP_FOR_INSERTION(pb_ptr, entry_ptr, FAIL) - inserted_in_lru = TRUE; + /* insert at the head of the LRU if it isn't a multi-page metadata entry */ + if ( ! entry_ptr->is_mpmde ) { + + H5PB__UPDATE_RP_FOR_INSERTION(pb_ptr, entry_ptr, FAIL) + inserted_in_lru = TRUE; + } /* updates stats */ H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr); @@ -2165,6 +2278,13 @@ H5PB__deallocate_page(H5PB_entry_t *entry_ptr) * and error unless the force parameter is TRUE, in which * case, these constraints are igmored. * + * In the context of VFD SWMR, there is also the requirement + * that entries to be evicted not be on the tick list, and + * also not reside on the delayed write list. In the rare + * case in which such a page is discarded by the free space + * manager, it must be removed from the tick list and/or the + * delayed write list before being evicted by this function. + * * Return: Non-negative on success/Negative on failure * * Programmer: John Mainzer -- 10/14/18 @@ -2229,8 +2349,11 @@ H5PB__evict_entry(H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr, hbool_t force) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "mark entry clean failed") } - /* remove the entry from the LRU */ - H5PB__UPDATE_RP_FOR_EVICTION(pb_ptr, entry_ptr, FAIL) + /* if the entry is in the replacement policy, remove it */ + if ( ! 
(entry_ptr->is_mpmde) ) { + + H5PB__UPDATE_RP_FOR_EVICTION(pb_ptr, entry_ptr, FAIL) + } /* remove the entry from the hash table */ H5PB__DELETE_FROM_INDEX(pb_ptr, entry_ptr, FAIL) @@ -2296,7 +2419,7 @@ H5PB__flush_entry(H5F_t *f, H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr) HDassert(entry_ptr->image_ptr); HDassert(entry_ptr->is_dirty); HDassert((pb_ptr->vfd_swmr_writer) || (!(entry_ptr->is_mpmde))); - HDassert(0 == (entry_ptr->delay_write_until)); + HDassert((uint64_t)0 == (entry_ptr->delay_write_until)); /* Retrieve the 'eoa' for the file */ if ( HADDR_UNDEF == (eoa = H5F_get_eoa(f, entry_ptr->mem_type)) ) @@ -2766,7 +2889,6 @@ done: static herr_t H5PB__mark_entry_dirty(H5F_t * f, H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr) { - uint64_t delay_write_until = 0; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) @@ -2795,12 +2917,19 @@ H5PB__mark_entry_dirty(H5F_t * f, H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr) if ( ( pb_ptr->vfd_swmr_writer ) && ( entry_ptr->loaded ) && ( H5F_vfd_swmr_writer__delay_write(f, entry_ptr->page, - &delay_write_until) < 0 ) ) + &(entry_ptr->delay_write_until)) < 0 ) ) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ "get delayed write request failed") - if ( delay_write_until > 0 ) { + if ( entry_ptr->delay_write_until > 0 ) { + + if ( ! ( entry_ptr->is_mpmde ) ) { + + /* remove the entry from the replacement policy */ + + H5PB__UPDATE_RP_FOR_REMOVE(pb_ptr, entry_ptr, FAIL) + } H5PB__INSERT_IN_DWL(pb_ptr, entry_ptr, FAIL) @@ -3635,8 +3764,8 @@ H5PB__write_meta(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, HDassert((size <= pb_ptr->page_size) || (addr == page_addr)); - /* case 7) metadata read of size greater than page size. */ - if ( size >= pb_ptr->page_size ) { + /* case 7) metadata write of size greater than page size. */ + if ( size > pb_ptr->page_size ) { /* The write must be for a multi-page metadata entry, and * we must be running as a VFD SWMR writer. 
diff --git a/src/H5PBpkg.h b/src/H5PBpkg.h index c6d13db..4af81e6 100644 --- a/src/H5PBpkg.h +++ b/src/H5PBpkg.h @@ -108,7 +108,7 @@ if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ ) \ ) \ ) { \ - HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "DLL sanity check failed") \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "DLL sanity check failed") \ } #define H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ @@ -129,7 +129,8 @@ if ( ( (entry_ptr) == NULL ) || \ ) \ ) \ ) { \ - HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "DLL pre insert SC failed") \ + HDassert(FALSE); \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "DLL pre insert SC failed") \ } #else /* H5PB__DO_SANITY_CHECKS */ @@ -310,9 +311,30 @@ if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ #define H5PB__IL_DLL_APPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val)\ -{ \ +{ \ H5PB__IL_DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \ - fail_val) \ + fail_val) \ + if ( (head_ptr) == NULL ) \ + { \ + (head_ptr) = (entry_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + else \ + { \ + (tail_ptr)->il_next = (entry_ptr); \ + (entry_ptr)->il_prev = (tail_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + (len)++; \ + (Size) += (int64_t)((entry_ptr)->size); \ + H5PB__IL_DLL_SC(head_ptr, tail_ptr, len, Size, fail_val) \ +} /* H5PB__IL_DLL_APPEND() */ + +#define H5PB__IL_DLL_PREPEND(entry_ptr, head_ptr, tail_ptr, len, Size, \ + fail_val) \ +{ \ + H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \ + fail_val) \ if ( (head_ptr) == NULL ) \ { \ (head_ptr) = (entry_ptr); \ @@ -320,14 +342,45 @@ if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ } \ else \ { \ - (tail_ptr)->il_next = (entry_ptr); \ - (entry_ptr)->il_prev = (tail_ptr); \ - (tail_ptr) = (entry_ptr); \ + (head_ptr)->il_prev = (entry_ptr); \ + (entry_ptr)->il_next = (head_ptr); \ + (head_ptr) = (entry_ptr); \ } \ (len)++; \ (Size) += (int64_t)((entry_ptr)->size); \ - 
H5PB__IL_DLL_SC(head_ptr, tail_ptr, len, Size, fail_val) \ -} /* H5PB__IL_DLL_APPEND() */ +} /* H5PB__DLL_PREPEND() */ + + +#define H5PB__IL_DLL_INSERT_BEFORE(entry_ptr, suc_ptr, head_ptr, tail_ptr, \ + len, Size, fail_val) \ +{ \ + HDassert( ((suc_ptr) == NULL) || \ + ((suc_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC) ); \ + \ + if ( suc_ptr == NULL ) \ + /* list empty or no successor -- append */ \ + H5PB__IL_DLL_APPEND(entry_ptr, head_ptr, tail_ptr, len, Size, \ + fail_val) \ + \ + else if ( suc_ptr->il_prev == NULL ) \ + /* successor at head of list -- prepend */ \ + H5PB__IL_DLL_PREPEND(entry_ptr, head_ptr, tail_ptr, len, Size, \ + fail_val) \ + \ + else /* sucessor in body of list -- insert before it */ \ + { \ + H5PB__IL_DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \ + fail_val) \ + HDassert(suc_ptr->il_prev->magic == H5PB__H5PB_ENTRY_T_MAGIC); \ + HDassert(suc_ptr->il_prev->il_next == suc_ptr); \ + entry_ptr->il_prev = suc_ptr->il_prev; \ + entry_ptr->il_prev->il_next = entry_ptr; \ + entry_ptr->il_next = suc_ptr; \ + suc_ptr->il_prev = entry_ptr; \ + (len)++; \ + (Size) += (int64_t)((entry_ptr)->size); \ + } \ +} /* H5PB__DLL_INSERT_BEFORE() */ #define H5PB__IL_DLL_REMOVE(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ { \ @@ -1295,6 +1348,54 @@ if ( ( (pb_ptr)->index_size != \ \ } /* H5PB__UPDATE_RP_FOR_EVICTION */ +/*------------------------------------------------------------------------- + * + * Macro: H5PB__UPDATE_RP_FOR_REMOVE + * + * Purpose: Update the replacement policy data structures for the + * removal of the specified page buffer entry from the + * replacement policy, but not from the page buffer. + * + * At present, this this only happens when an entry is + * dirtied, and subject to a delayed write. + * + * At present, we only support the modified LRU policy, so + * this function deals with that case unconditionally. 
If + * we ever support other replacement policies, the function + * should switch on the current policy and act accordingly. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 10/09/18 + * + * Modifications: + * + * None. + * + *------------------------------------------------------------------------- + */ + +#define H5PB__UPDATE_RP_FOR_REMOVE(pb_ptr, entry_ptr, fail_val) \ +{ \ + HDassert( (pb_ptr) ); \ + HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \ + HDassert( (entry_ptr) ); \ + HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \ + HDassert( ! ((entry_ptr)->is_mpmde) ); \ + HDassert( (entry_ptr)->size == pb_ptr->page_size ); \ + \ + /* modified LRU specific code */ \ + \ + /* remove the entry from the LRU list. */ \ + \ + H5PB__DLL_REMOVE((entry_ptr), (pb_ptr)->LRU_head_ptr, \ + (pb_ptr)->LRU_tail_ptr, (pb_ptr)->LRU_len, \ + (pb_ptr)->LRU_size, (fail_val)) \ + \ + /* End modified LRU specific code. */ \ + \ +} /* H5PB__UPDATE_RP_FOR_EVICTION */ + /*------------------------------------------------------------------------- * @@ -1646,7 +1747,8 @@ if ( ( (pb_ptr)->index_size != \ suc_ptr = suc_ptr->next; \ } \ \ - H5PB__DLL_INSERT_BEFORE((entry_ptr), (suc_ptr), (pb_ptr)->dwl_head_ptr, \ + H5PB__DLL_INSERT_BEFORE((entry_ptr), (suc_ptr), \ + (pb_ptr)->dwl_head_ptr, \ (pb_ptr)->dwl_tail_ptr, (pb_ptr)->dwl_len, \ (pb_ptr)->dwl_size, (fail_val)) \ \ @@ -1685,11 +1787,11 @@ if ( ( (pb_ptr)->index_size != \ HDassert( (entry_ptr) ); \ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \ HDassert( (entry_ptr)->size >= pb_ptr->page_size ); \ - HDassert( (entry_ptr)->delay_write_until < (pb_ptr)->cur_tick ); \ + HDassert( (entry_ptr)->delay_write_until == 0 ); \ \ /* remove the entry from the delayed write list. 
*/ \ \ - H5PB__TL_DLL_REMOVE((entry_ptr), (pb_ptr)->dwl_head_ptr, \ + H5PB__DLL_REMOVE((entry_ptr), (pb_ptr)->dwl_head_ptr, \ (pb_ptr)->dwl_tail_ptr, (pb_ptr)->dwl_len, \ (pb_ptr)->dwl_size, (fail_val)) \ \ diff --git a/src/H5PBprivate.h b/src/H5PBprivate.h index 7aabcd5..201a298 100644 --- a/src/H5PBprivate.h +++ b/src/H5PBprivate.h @@ -694,6 +694,8 @@ H5_DLL herr_t H5PB_vfd_swmr__release_delayed_writes(H5F_t * f); H5_DLL herr_t H5PB_vfd_swmr__release_tick_list(H5F_t * f); +H5_DLL herr_t H5PB_vfd_swmr__set_tick(H5F_t * f); + H5_DLL herr_t H5PB_vfd_swmr__update_index(H5F_t * f, int * idx_ent_added_ptr, int * idx_ent_modified_ptr, int * idx_ent_not_in_tl_ptr, int * idx_ent_not_in_tl_flushed_ptr); diff --git a/src/H5private.h b/src/H5private.h index fb01d06..b64903b 100644 --- a/src/H5private.h +++ b/src/H5private.h @@ -375,8 +375,12 @@ # define H5_EXP2(n) (1 << (n)) /* VFD SWMR */ -#define SECOND_TO_NANOSECS 1000000000 /* Second to nanoseconds */ -#define TENTH_SEC_TO_NANOSECS 100000000 /* Tenth of a second to nanoseconds */ +#define SECOND_TO_NANOSECS 1000000000 /* Second to nanoseconds */ +#define TENTH_SEC_TO_NANOSECS 100000000 /* 0.1 second to nanoseconds */ +#if 0 /* use gettimeofday() */ /* JRM */ +#define SECOND_TO_MICROSECS 1000000 /* Second to microseconds */ +#define TENTH_SEC_TO_MICROSECS 100000 /* 0.1 second to microseconds */ +#endif /* use gettimeofday() */ /* JRM */ /* * HDF Boolean type. @@ -1953,7 +1957,11 @@ extern hbool_t H5_libterm_g; /* Is the library being shutdown? 
*/ extern hbool_t vfd_swmr_g; extern hbool_t vfd_swmr_writer_g; extern uint64_t tick_num_g; +#if 1 /* use clock_gettime() */ /* JRM */ extern struct timespec end_of_tick_g; +#else /* use gettimeofday() */ /* JRM */ +extern struct timeval end_of_tick_g; +#endif /* use gettimeofday() */ /* JRM */ #endif /* H5_HAVE_THREADSAFE */ @@ -2070,26 +2078,53 @@ H5_DLL herr_t H5CX_pop(void); \ BEGIN_MPE_LOG -#define VFD_SWMR_TEST_FOR_END_OF_TICK(swmr_reader_exit, err) \ - /* Initialize the library */ \ - if(vfd_swmr_g) { \ - struct timespec curr_time; \ - long curr_nsecs, end_nsecs; \ - if(HDclock_gettime(CLOCK_MONOTONIC, &curr_time) < 0) \ - HGOTO_ERROR(H5E_FUNC, H5E_CANTGET, err, "can't get time via clock_gettime") \ - curr_nsecs = curr_time.tv_sec * 1000000000 + curr_time.tv_nsec; \ - end_nsecs = end_of_tick_g.tv_sec * 1000000000 + end_of_tick_g.tv_nsec; \ - if(curr_nsecs > end_nsecs) { \ - if(vfd_swmr_writer_g) { \ - if(H5F_vfd_swmr_writer_end_of_tick() < 0) \ - HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, err, "end of tick error for VFD SWMR writer") \ - } \ - else if(!swmr_reader_exit) { \ - if(H5F_vfd_swmr_reader_end_of_tick() < 0) \ - HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, err, "end of tick error for VFD SWMR reader") \ - } \ - } \ +#if 1 /* clock_gettime() version */ /* JRM */ +#define VFD_SWMR_TEST_FOR_END_OF_TICK(swmr_reader_exit, err) \ + /* Initialize the library */ \ + if(vfd_swmr_g) { \ + struct timespec curr_time; \ + long curr_nsecs, end_nsecs; \ + if(HDclock_gettime(CLOCK_MONOTONIC, &curr_time) < 0) \ + HGOTO_ERROR(H5E_FUNC, H5E_CANTGET, err, \ + "can't get time via clock_gettime") \ + curr_nsecs = curr_time.tv_sec * 1000000000 + curr_time.tv_nsec; \ + end_nsecs = end_of_tick_g.tv_sec * 1000000000 + end_of_tick_g.tv_nsec;\ + if(curr_nsecs > end_nsecs) { \ + if(vfd_swmr_writer_g) { \ + if(H5F_vfd_swmr_writer_end_of_tick() < 0) \ + HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, err, \ + "end of tick error for VFD SWMR writer") \ + } \ + else if(!swmr_reader_exit) { \ + 
if(H5F_vfd_swmr_reader_end_of_tick() < 0) \ + HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, err, \ + "end of tick error for VFD SWMR reader") \ + } \ + } \ + } +#else /* gettimeofday() version */ /* JRM */ +#define VFD_SWMR_TEST_FOR_END_OF_TICK(swmr_reader_exit, err) \ + /* Initialize the library */ \ + if(vfd_swmr_g) { \ + struct timeval curr_time; \ + if(HDgettimeofday(&curr_time, NULL) < 0) \ + HGOTO_ERROR(H5E_FUNC, H5E_CANTGET, err, \ + "can't get time via gettimeofday()") \ + if((curr_time.tv_sec >= end_of_tick_g.tv_sec) && \ + (curr_time.tv_usec >= end_of_tick_g.tv_usec)) { \ + if(vfd_swmr_writer_g) { \ + if(H5F_vfd_swmr_writer_end_of_tick() < 0) \ + HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, err, \ + "end of tick error for VFD SWMR writer") \ + } \ + else if(!swmr_reader_exit) { \ + if(H5F_vfd_swmr_reader_end_of_tick() < 0) \ + HGOTO_ERROR(H5E_FUNC, H5E_CANTSET, err, \ + "end of tick error for VFD SWMR reader") \ + } \ + } \ } +#endif /* gettimeofday() version */ /* JRM */ /* Use this macro for all "normal" API functions */ #define FUNC_ENTER_API(err) {{ \ diff --git a/test/vfd_swmr_generator.c b/test/vfd_swmr_generator.c index d3d5373..72b2fd1 100644 --- a/test/vfd_swmr_generator.c +++ b/test/vfd_swmr_generator.c @@ -179,7 +179,7 @@ gen_skeleton(const char *filename, hbool_t verbose, hbool_t vfd_swmr_write, config->tick_len = 4; config->max_lag = 6; config->vfd_swmr_writer = TRUE; - config->md_pages_reserved = 2; + config->md_pages_reserved = 200; HDstrcpy(config->md_file_path, "my_md_file"); /* Enable VFD SWMR configuration in fapl */ @@ -240,6 +240,10 @@ gen_skeleton(const char *filename, hbool_t verbose, hbool_t vfd_swmr_write, if(verbose) HDfprintf(stderr, "Creating datasets\n"); +#if 1 /* delete this once the race condition bug is fixed */ /* JRM */ + sleep(1); +#endif /* JRM */ + /* Create the datasets */ for(u = 0; u < NLEVELS; u++) for(v = 0; v < symbol_count[u]; v++) { @@ -287,6 +291,10 @@ gen_skeleton(const char *filename, hbool_t verbose, hbool_t vfd_swmr_write,
return -1; if(H5Tclose(tid) < 0) return -1; + + if(verbose) + HDfprintf(stderr, "Closing file\n"); + if(H5Fclose(fid) < 0) return -1; |