From 2f2cf050e68c02c397e032e1b43d05ed8bafafbf Mon Sep 17 00:00:00 2001 From: mainzer Date: Mon, 19 Nov 2018 02:05:37 -0600 Subject: local commit of first cut at vfd swmr writer EOT code. This is necessary to allow access to Vailin's recent changes This version passes non-swmr tests in a serial / debug build on charis. --- src/H5FDprivate.h | 102 ++++--- src/H5Fint.c | 789 +++++++++++++++++++++++++++++++++++++++++++++------- src/H5Fpkg.h | 62 ++++- src/H5Fprivate.h | 2 + src/H5Fpublic.h | 4 +- src/H5PB.c | 721 +++++++++++++++++++++++++++++++++++++++++------- src/H5PBpkg.h | 805 ++++++++++++++++++++++++++++++++++++++---------------- src/H5PBprivate.h | 141 +++++++++- 8 files changed, 2143 insertions(+), 483 deletions(-) diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h index 45ad4c2..b9582b3 100644 --- a/src/H5FDprivate.h +++ b/src/H5FDprivate.h @@ -87,55 +87,77 @@ /* Internal representation of metadata file index entry */ -/* - * hdf5_page_offset: Unsigned 64-bit value containing the base address of the - * metadata page, or multi page metadata entry in the HDF5 - * file IN PAGES. - * To obtain byte offset, multiply this value by the page size. + +/*---------------------------------------------------------------------------- + * + * struct H5FD_vfd_swmr_idx_entry_t + * + * Indicies into the VFD SWMR metadata file are maintained in arrays of + * instances of H5FD_vfd_swmr_index_t. + * + * The fields of H5FD_vfd_swmr_idx_entry_t are discussed below. + * + * hdf5_page_offset: Unsigned 64-bit value containing the base address of the + * metadata page, or multi page metadata entry in the HDF5 + * file IN PAGES. * - * md_file_page_offset: Unsigned 64-bit value containing the base address of the - * metadata page, or multi page metadata entry in the metadata - * file IN PAGES. - * To obtain byte offset, multiply this value by the page size. + * To obtain byte offset, multiply this value by the page size. 
* - * length: The length of the metadata page or multi- page metadata entry - * in BYTES. + * md_file_page_offset: Unsigned 64-bit value containing the base address of + * the metadata page, or multi page metadata entry in the metadata + * file IN PAGES. * - * chksum: Checksum for the metadata page or multi-page metadata entry. - * For the VFD SWMR writer, this value is undefined until the - * referenced entry has been written to the metadata file. + * To obtain byte offset, multiply this value by the page size. + * + * length: The length of the metadata page or multi- page metadata entry + * in BYTES. + * + * chksum: Checksum for the metadata page or multi-page metadata entry. + * For the VFD SWMR writer, this value is undefined until the + * referenced entry has been written to the metadata file. * - * entry_ptr: Used by the VFD SWMR writer only. - * For the VFD SWMR reader, this field should always be NULL. - * If the referenced metadata page or multi-page metadata - * entry was modified in the current tick, this field points to - * a buffer in the page buffer containing its value. - * This field is used by the metadata file creation/update code - * to access the metadata pages or multi-page metadata entries - * so that their current values can be copied into the metadata - * file. After this copy, this field should be set to NULL. + * entry_ptr: Used by the VFD SWMR writer only. + * + * For the VFD SWMR reader, this field should always be NULL. + * If the referenced metadata page or multi-page metadata + * entry was modified in the current tick, this field points to + * a buffer in the page buffer containing its value. + * This field is used by the metadata file creation/update code + * to access the metadata pages or multi-page metadata entries + * so that their current values can be copied into the metadata + * file. After this copy, this field should be set to NULL. * - * tick_of_last_change: Number of the last tick in which this index entry was changed. 
- * Used by the VFD SWMR writer only. - * For the VFD SWMR reader, this field will always be set to 0. + * tick_of_last_change: Number of the last tick in which this index entry + * was changed. * - * clean: Used by the VFD SWMR writer only. - * Set to TRUE whenever the referenced metadata page or multi-page - * metadata entry is written to the HDF5 file. - * Set to FALSE whenever it is marked dirty in the page buffer. + * Used by the VFD SWMR writer only. * - * tick_of_last_flush: Number of the tick in which this entry was last written to the - * HDF5 file or zero if it has never been flusehd. - * Used by the VFD SWMR writer only. - * For the VFD SWMR reader, this field should always be 0. + * For the VFD SWMR reader, this field will always be set to 0. + * + * clean: Used by the VFD SWMR writer only. + * + * Set to TRUE whenever the referenced metadata page or + * multi-page metadata entry is written to the HDF5 file. + * Set to FALSE whenever it is marked dirty in the page buffer. + * + * tick_of_last_flush: Number of the tick in which this entry was last + * written to the HDF5 file or zero if it has never been flusehd. + * + * Used by the VFD SWMR writer only. + * + * For the VFD SWMR reader, this field should always be 0. * - * delayed_flush: If the flush of the referenced metadata page or multi-page - * metadata entry must be delayed, the earliest tick in which - * it may be flushed, or zero if there is no such constraint. - * Used by the VFD SWMR writer only. + * delayed_flush: If the flush of the referenced metadata page or multi-page + * metadata entry must be delayed, the earliest tick in which + * it may be flushed, or zero if there is no such constraint. + * + * Used by the VFD SWMR writer only. 
+ * + * is_moved_to_hdf5_file: Set to TRUE iff the entry referenced is in the + * HDF5 file and is therefore about to be removed from the + * metadata file * - * is_moved_to_hdf5_file: Set to TRUE iff the entry referenced is in the HDF5 file and - * is therefore about to be removed from the metadata file + *---------------------------------------------------------------------------- */ typedef struct H5FD_vfd_swmr_idx_entry_t { uint64_t hdf5_page_offset; diff --git a/src/H5Fint.c b/src/H5Fint.c index bca09b2..2df2d7e 100644 --- a/src/H5Fint.c +++ b/src/H5Fint.c @@ -125,6 +125,8 @@ static herr_t H5F__vfd_swmr_update_end_of_tick_and_tick_num(H5F_t *f, hbool_t in static herr_t H5F__vfd_swmr_construct_write_md_hdr(H5F_t *f, uint32_t num_entries); static herr_t H5F__vfd_swmr_construct_write_md_idx(H5F_t *f, uint32_t num_entries, struct H5FD_vfd_swmr_idx_entry_t index[]); static herr_t H5F__idx_entry_cmp(const void *_entry1, const void *_entry2); +static herr_t H5F__vfd_swmr_writer__create_index(H5F_t * f); + /*********************/ @@ -3588,25 +3590,39 @@ done: /*------------------------------------------------------------------------- + * * Function: H5F__vfd_swmr_init * - * Purpose: Initialize globals and the corresponding fields in file pointer. - * For both: - * --set vfd_swmr_g to TRUE - * --set vfd_swmr_file_g to f - * --set end_of_tick to the current time + tick length + * Purpose: Initialize globals and the corresponding fields in + * file pointer. 
+ * + * For both VFD SWMR writer and reader: + * + * --set vfd_swmr_g to TRUE + * --set vfd_swmr_file_g to f + * --set end_of_tick to the current time + tick length + * * For VFD SWMR writer: - * --set vfd_swmr_writer_g to TRUE - * --set tick_num_g to 0 - * --create the metadata file - * --when opening an existing HDF5 file, write header and empty index in the metadata file + * + * --set vfd_swmr_writer_g to TRUE + * --set tick_num_g to 0 + * --create the metadata file + * --when opening an existing HDF5 file, write header and + * empty index in the metadata file + * * For VFD SWMR reader: - * --set vfd_swmr_writer_g to FALSE - * --set tick_num_g to the current tick read from the metadata file + * + * --set vfd_swmr_writer_g to FALSE + * --set tick_num_g to the current tick read from the + * metadata file * * Return: Success: SUCCEED * Failure: FAIL * + * Programmer: Vailin Choi -- 11/??/18 + * + * Changes: None. + * *------------------------------------------------------------------------- */ static herr_t @@ -3623,63 +3639,96 @@ H5F__vfd_swmr_init(H5F_t *f, hbool_t file_create) vfd_swmr_file_g = f; if(H5F_INTENT(f) & H5F_ACC_RDWR) { + HDassert(f->shared->vfd_swmr_config.vfd_swmr_writer); vfd_swmr_writer_g = f->shared->vfd_swmr_writer = TRUE; tick_num_g = f->shared->tick_num = 0; /* Create the metadata file */ - if(((f->shared->vfd_swmr_md_fd = HDopen(f->shared->vfd_swmr_config.md_file_path, O_CREAT|O_RDWR, H5_POSIX_CREATE_MODE_RW))) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "unable to create the metadata file") + if ( ((f->shared->vfd_swmr_md_fd = + HDopen(f->shared->vfd_swmr_config.md_file_path, O_CREAT|O_RDWR, + H5_POSIX_CREATE_MODE_RW))) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, \ + "unable to create the metadata file") - md_size = (hsize_t)f->shared->vfd_swmr_config.md_pages_reserved * f->shared->fs_page_size; + md_size = (hsize_t)f->shared->vfd_swmr_config.md_pages_reserved * + f->shared->fs_page_size; /* Set the metadata file size 
to md_pages_reserved */ - if(-1 == HDftruncate(f->shared->vfd_swmr_md_fd, (HDoff_t)md_size)) - HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "truncate fail for the metadata file") + if ( -1 == HDftruncate(f->shared->vfd_swmr_md_fd, (HDoff_t)md_size) ) + + HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \ + "truncate fail for the metadata file") /* Set eof for metadata file to md_pages_reserved */ f->shared->vfd_swmr_md_eoa = (haddr_t)md_size; - /* When opening an existing HDF5 file, create header and empty index in the metadata file */ - if(!file_create) { - if(H5F__vfd_swmr_construct_write_md_hdr(f, 0) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "fail to create header in md") - if(H5F__vfd_swmr_construct_write_md_idx(f, 0, NULL) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "fail to create index in md") + /* When opening an existing HDF5 file, create header and empty + * index in the metadata file + */ + if ( !file_create ) { + + if ( H5F__vfd_swmr_construct_write_md_hdr(f, 0) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, \ + "fail to create header in md") + + if ( H5F__vfd_swmr_construct_write_md_idx(f, 0, NULL) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, \ + "fail to create index in md") } } else { /* VFD SWMR reader */ + HDassert(!f->shared->vfd_swmr_config.vfd_swmr_writer); + vfd_swmr_writer_g = f->shared->vfd_swmr_writer = FALSE; /* Set tick_num_g to the current tick read from the metadata file */ - if(H5FD_vfd_swmr_get_tick_and_idx(f->shared->lf, FALSE, &tick_num_g, NULL, NULL) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTLOAD, FAIL, "unable to load/decode metadata file") + if ( H5FD_vfd_swmr_get_tick_and_idx(f->shared->lf, FALSE, + &tick_num_g, NULL, NULL) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_CANTLOAD, FAIL, \ + "unable to load/decode metadata file") + f->shared->tick_num = tick_num_g; } /* Update end_of_tick */ - if(H5F__vfd_swmr_update_end_of_tick_and_tick_num(f, FALSE) < 0) + if ( H5F__vfd_swmr_update_end_of_tick_and_tick_num(f, FALSE) < 0 ) + 
HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "unable to update end of tick") done: + FUNC_LEAVE_NOAPI(ret_value) + } /* H5F__vfd_swmr_init() */ + /*------------------------------------------------------------------------- + * * Function: H5F__vfd_swmr_construct_write_md_hdr * * Purpose: Encode and write header to the metadata file. + * * This is used by the VFD SWMR writer: - * --when opening an existing HDF5 file - * --when closing the HDF5 file - * --after flushing an HDF5 file - * --when updating the metadata file + * + * --when opening an existing HDF5 file + * --when closing the HDF5 file + * --after flushing an HDF5 file + * --when updating the metadata file * * Return: Success: SUCCEED * Failure: FAIL * + * Programmer: Vailin Choi -- 11/??/18 + * + * Changes: None. + * *------------------------------------------------------------------------- */ static herr_t @@ -3718,34 +3767,49 @@ H5F__vfd_swmr_construct_write_md_hdr(H5F_t *f, uint32_t num_entries) HDassert((size_t)(p - image == hdr_size)); /* Set to beginning of the file */ - if(HDlseek(f->shared->vfd_swmr_md_fd, (HDoff_t)0, SEEK_SET) < 0) - HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, "unable to seek in metadata file") + if ( HDlseek(f->shared->vfd_swmr_md_fd, (HDoff_t)0, SEEK_SET) < 0 ) + + HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, \ + "unable to seek in metadata file") /* Write header to the metadata file */ - if(HDwrite(f->shared->vfd_swmr_md_fd, image, hdr_size) != hdr_size) - HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "error in writing header to metadata file") + if ( HDwrite(f->shared->vfd_swmr_md_fd, image, hdr_size) != hdr_size ) + + HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \ + "error in writing header to metadata file") done: + FUNC_LEAVE_NOAPI(ret_value) + } /* H5F__vfd_swmr_construct_write_md_hdr() */ + /*------------------------------------------------------------------------- + * Function: H5F__vfd_swmr_construct_write_md_idx * * Purpose: Encode and write index to the metadata file. 
+ * * This is used by the VFD SWMR writer: - * --when opening an existing HDF5 file - * --when closing the HDF5 file - * --after flushing an HDF5 file - * --when updating the metadata file + * + * --when opening an existing HDF5 file + * --when closing the HDF5 file + * --after flushing an HDF5 file + * --when updating the metadata file * * Return: Success: SUCCEED * Failure: FAIL * + * Programmer: Vailin Choi -- 11/??/18 + * + * Changes: None. + * *------------------------------------------------------------------------- */ static herr_t -H5F__vfd_swmr_construct_write_md_idx(H5F_t *f, uint32_t num_entries, struct H5FD_vfd_swmr_idx_entry_t index[]) +H5F__vfd_swmr_construct_write_md_idx(H5F_t *f, uint32_t num_entries, + struct H5FD_vfd_swmr_idx_entry_t index[]) { uint8_t *image = NULL; /* Pointer to buffer */ uint8_t *p = NULL; /* Pointer to buffer */ @@ -3756,11 +3820,14 @@ H5F__vfd_swmr_construct_write_md_idx(H5F_t *f, uint32_t num_entries, struct H5FD FUNC_ENTER_STATIC - HDassert((num_entries!= 0 && index != NULL) || (num_entries == 0 && index == NULL)); + HDassert((num_entries!= 0 && index != NULL) || + (num_entries == 0 && index == NULL)); /* Allocate space for the buffer to hold the index */ - if((image = (uint8_t *)HDmalloc(idx_size)) == NULL) - HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "memory allocation failed for md index") + if ( (image = (uint8_t *)HDmalloc(idx_size)) == NULL ) + + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, \ + "memory allocation failed for md index") /* * Encode metadata file index @@ -3798,22 +3865,33 @@ H5F__vfd_swmr_construct_write_md_idx(H5F_t *f, uint32_t num_entries, struct H5FD HDassert(f->shared->vfd_swmr_md_fd >= 0); /* Set to right after the header */ - if(HDlseek(f->shared->vfd_swmr_md_fd, (HDoff_t)H5FD_MD_HEADER_SIZE, SEEK_SET) < 0) - HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, "unable to seek in metadata file") + if ( HDlseek(f->shared->vfd_swmr_md_fd, (HDoff_t)H5FD_MD_HEADER_SIZE, + SEEK_SET) < 0) + + 
HGOTO_ERROR(H5E_VFL, H5E_SEEKERROR, FAIL, \ + "unable to seek in metadata file") /* Write index to the metadata file */ - if(HDwrite(f->shared->vfd_swmr_md_fd, image, idx_size) != idx_size) - HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "error in writing index to metadata file") + if ( HDwrite(f->shared->vfd_swmr_md_fd, image, idx_size) != idx_size ) + + HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \ + "error in writing index to metadata file") done: - if(image) + + if ( image ) { + HDfree(image); + } + FUNC_LEAVE_NOAPI(ret_value) + } /* H5F__vfd_swmr_construct_write_idx() */ /*------------------------------------------------------------------------- + * * Function: H5F__vfd_swmr_update_end_of_tick_and_tick_num * * Purpose: Update end_of_tick (end_of_tick_g, f->shared->end_of_tick) @@ -3822,6 +3900,10 @@ done: * Return: Success: SUCCEED * Failure: FAIL * + * Programmer: Vailin Choi -- 11/??/18 + * + * Changes: None. + * *------------------------------------------------------------------------- */ static herr_t @@ -3831,15 +3913,19 @@ H5F__vfd_swmr_update_end_of_tick_and_tick_num(H5F_t *f, hbool_t incr_tick_num) struct timespec new_end_of_tick; /* new end_of_tick in struct timespec */ long curr_nsecs; /* current time in nanoseconds */ long tlen_nsecs; /* tick_len in nanoseconds */ +#if 0 /* JRM */ long end_nsecs; /* end_of_tick in nanoseconds */ +#endif /* JRM */ long new_end_nsecs; /* new end_of_tick in nanoseconds */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_STATIC /* Get current time in struct timespec */ - if(HDclock_gettime(CLOCK_MONOTONIC, &curr) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get time via clock_gettime") + if ( HDclock_gettime(CLOCK_MONOTONIC, &curr) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, \ + "can't get time via clock_gettime") /* Convert curr to nsecs */ curr_nsecs = curr.tv_sec * SECOND_TO_NANOSECS + curr.tv_nsec; @@ -3850,12 +3936,22 @@ H5F__vfd_swmr_update_end_of_tick_and_tick_num(H5F_t *f, hbool_t 
incr_tick_num) /* * Update tick_num_g, f->shared->tick_num */ - if(incr_tick_num) { + if ( incr_tick_num ) { + +#if 0 /* JRM */ /* Convert end_of_tick_g to nanoseconds */ - end_nsecs = end_of_tick_g.tv_sec * SECOND_TO_NANOSECS + end_of_tick_g.tv_nsec; + end_nsecs = end_of_tick_g.tv_sec * SECOND_TO_NANOSECS + + end_of_tick_g.tv_nsec; /* Increment tick_num by # of elapsed ticks */ tick_num_g += (1+ (uint64_t)((curr_nsecs - end_nsecs) / tlen_nsecs)); +#else /* JRM */ + /* Regardless of elapsed time, only increment the tick num by 1 + * so as to avoid the possibility of using up all of max_lag in + * one or two ticks. + */ + tick_num_g++; +#endif /* JRM */ f->shared->tick_num = tick_num_g; } @@ -3863,29 +3959,43 @@ H5F__vfd_swmr_update_end_of_tick_and_tick_num(H5F_t *f, hbool_t incr_tick_num) * Update end_of_tick_g, f->shared->end_of_tick */ /* Calculate new end_of_tick */ + + /* TODO: The modulo operation is very expensive on most machines -- + * re-work this code so as to avoid it. + * + * JRM -- 11/12/18 + */ + new_end_nsecs = curr_nsecs + tlen_nsecs; new_end_of_tick.tv_nsec = new_end_nsecs % SECOND_TO_NANOSECS; new_end_of_tick.tv_sec = new_end_nsecs / SECOND_TO_NANOSECS; /* Update end_of_tick */ HDmemcpy(&end_of_tick_g, &new_end_of_tick, sizeof(struct timespec)); - HDmemcpy(&f->shared->end_of_tick, &new_end_of_tick, sizeof(struct timespec)); + HDmemcpy(&f->shared->end_of_tick, &new_end_of_tick, + sizeof(struct timespec)); done: + FUNC_LEAVE_NOAPI(ret_value) + } /* H5F__vfd_swmr_update_end_of_tick_and_tick_num() */ /*------------------------------------------------------------------------- + * * Function: H5F__vfd_swmr_close_or_flush * - * Purpose: Used by the VFD SWMR writer when the HDF5 file is closed or flushed: + * Purpose: Used by the VFD SWMR writer when the HDF5 file is closed + * or flushed: + * * 1) For file close: * --write header and an empty index to the metadata file * --increment tick_num * --close the metadata file * --unlink the metadata file * 
--close the free-space manager for the metadata file + * * 2) For file flush: * --write header and an empty index to the metadata file * --increment tick_num @@ -3895,6 +4005,10 @@ done: * Return: Success: SUCCEED * Failure: FAIL * + * Programmer: Vailin Choi -- 11/??/18 + * + * Changes: None. + * *------------------------------------------------------------------------- */ static herr_t @@ -3909,52 +4023,71 @@ H5F__vfd_swmr_close_or_flush(H5F_t *f, hbool_t closing) HDassert(f->shared->vfd_swmr_md_fd >= 0); /* Write empty index to the md file */ - if(H5F__vfd_swmr_construct_write_md_idx(f, 0, NULL) < 0) + if ( H5F__vfd_swmr_construct_write_md_idx(f, 0, NULL) < 0 ) + HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "fail to create index in md") + + /* Write header to the md file */ - if(H5F__vfd_swmr_construct_write_md_hdr(f, 0) < 0) + if ( H5F__vfd_swmr_construct_write_md_hdr(f, 0) < 0 ) + HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "fail to create header in md") /* Increment tick_num */ tick_num_g = ++f->shared->tick_num; - if(closing) { /* For file close */ + if ( closing ) { /* For file close */ + /* Close the md file */ if(HDclose(f->shared->vfd_swmr_md_fd) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "unable to close the metadata file") + + HGOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, \ + "unable to close the metadata file") f->shared->vfd_swmr_md_fd = -1; /* Unlink the md file */ - if(HDunlink(f->shared->vfd_swmr_config.md_file_path) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTREMOVE, FAIL, "unable to unlink the metadata file") + if ( HDunlink(f->shared->vfd_swmr_config.md_file_path) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_CANTREMOVE, FAIL, \ + "unable to unlink the metadata file") /* Close the free-space manager for the metadata file */ - if(H5MV_close(f) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTRELEASE, FAIL, "unable to close the free-space manager for the metadata file") + if ( H5MV_close(f) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_CANTRELEASE, FAIL, \ + "unable to close the 
free-space manager for the metadata file") /* Free the delayed list */ curr = f->shared->dl_head_ptr; - while(curr != NULL) { + + while ( curr != NULL ) { + next = curr->next; curr = H5FL_FREE(H5F_vfd_swmr_dl_entry_t, curr); curr = next; + } /* end while */ + f->shared->dl_head_ptr = f->shared->dl_tail_ptr = NULL; vfd_swmr_file_g = NULL; } else { /* For file flush */ + /* Update end_of_tick */ - if(H5F__vfd_swmr_update_end_of_tick_and_tick_num(f, TRUE) < 0) - HDONE_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "unable to update end of tick") + if ( H5F__vfd_swmr_update_end_of_tick_and_tick_num(f, TRUE) < 0 ) + + HDONE_ERROR(H5E_FILE, H5E_CANTSET, FAIL, \ + "unable to update end of tick") } done: - FUNC_LEAVE_NOAPI(ret_value) -} /* H5F__vfd_swmr_close_or_flush() */ + FUNC_LEAVE_NOAPI(ret_value) +} /* H5F__vfd_swmr_close_or_flush() */ + /*------------------------------------------------------------------------- * Function: H5F__idx_entry_cmp() * @@ -3989,10 +4122,13 @@ H5F__idx_entry_cmp(const void *_entry1, const void *_entry2) } /* H5F__idx_entry_cmp() */ /*------------------------------------------------------------------------- + * * Function: H5F_update_vfd_swmr_metadata_file() * * Purpose: Update the metadata file with the input index + * * --Sort index + * * --For each non-null entry_ptr in the index entries: * --Insert previous image of the entry onto the delayed list * --Allocate space for the entry in the metadata file @@ -4000,19 +4136,35 @@ H5F__idx_entry_cmp(const void *_entry1, const void *_entry2) * --Update index entry * --Write the entry to the metadata file * --Set entry_ptr to NULL - * --Construct on disk image of the index and write index to the metadata file - * --Construct on disk image of the header and write header to the metadata file - * --Release time out entries from the delayed list to the free-space manager + * + * --Construct on disk image of the index and write index to the + * metadata file + * + * --Construct on disk image of the header and 
write header to + * the metadata file + * + * --Release time out entries from the delayed list to the + * free-space manager * * Return: SUCCEED/FAIL * + * Programmer: Vailin Choi 11/??/18 + * + * Changes: None. + * + * *------------------------------------------------------------------------- */ herr_t -H5F_update_vfd_swmr_metadata_file(H5F_t *f, uint32_t num_entries, struct H5FD_vfd_swmr_idx_entry_t index[]) +H5F_update_vfd_swmr_metadata_file(H5F_t *f, uint32_t num_entries, + struct H5FD_vfd_swmr_idx_entry_t index[]) { - H5F_vfd_swmr_dl_entry_t *prev; /* Points to the previous entry in the delayed list */ - H5F_vfd_swmr_dl_entry_t *dl_entry; /* Points to an entry in the delayed list */ + H5F_vfd_swmr_dl_entry_t *prev; /* Points to the previous entry + * in the delayed list + */ + H5F_vfd_swmr_dl_entry_t *dl_entry; /* Points to an entry in the + * delayed list + */ haddr_t md_addr; /* Address in the metadata file */ unsigned i; /* Local index variable */ herr_t ret_value = SUCCEED; /* Return value */ @@ -4020,110 +4172,537 @@ H5F_update_vfd_swmr_metadata_file(H5F_t *f, uint32_t num_entries, struct H5FD_vf FUNC_ENTER_NOAPI(FAIL) /* Sort index entries by increasing offset in the HDF5 file */ - if(num_entries) - HDqsort(index, num_entries, sizeof(H5FD_vfd_swmr_idx_entry_t), H5F__idx_entry_cmp); + if ( num_entries ) { + + HDqsort(index, num_entries, sizeof(H5FD_vfd_swmr_idx_entry_t), + H5F__idx_entry_cmp); + } /* For each non-null entry_ptr in the index: - * --Insert previous image of the entry (if exists) to the beginning of the delayed list + * + * --Insert previous image of the entry (if exists) to the + * beginning of the delayed list + * * --Allocate space for the entry in the metadata file - * --Compute checksum, update the index entry, write entry to the metadata file + * + * --Compute checksum, update the index entry, write entry to + * the metadata file + * * --Set entry_ptr to NULL */ - for(i = 0; i < num_entries; i++) { - if(index[i].entry_ptr != NULL) { + 
for ( i = 0; i < num_entries; i++ ) { + + if ( index[i].entry_ptr != NULL ) { + /* Prepend previous image of the entry to the delayed list */ - if(index[i].md_file_page_offset) { - if(NULL == (dl_entry = H5FL_CALLOC(H5F_vfd_swmr_dl_entry_t))) - HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, "unable to allocate the delayed entry") + if ( index[i].md_file_page_offset ) { + + if ( NULL == (dl_entry = H5FL_CALLOC(H5F_vfd_swmr_dl_entry_t))) + + HGOTO_ERROR(H5E_FILE, H5E_CANTALLOC, FAIL, \ + "unable to allocate the delayed entry") + dl_entry->hdf5_page_offset = index[i].hdf5_page_offset; dl_entry->md_file_page_offset = index[i].md_file_page_offset; dl_entry->length = index[i].length; dl_entry->tick_num = f->shared->tick_num; - H5F_DC_PREPEND(dl_entry, f->shared->dl_head_ptr, f->shared->dl_tail_ptr, f->shared->dl_len); + + H5F_DC_PREPEND(dl_entry, f->shared->dl_head_ptr, \ + f->shared->dl_tail_ptr, f->shared->dl_len); } /* Allocate space for the entry in the metadata file */ if((md_addr = H5MV_alloc(f, index[i].length)) == HADDR_UNDEF) - HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "error in allocating space from the metadata file") + + HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \ + "error in allocating space from the metadata file") + /* Compute checksum and update the index entry */ index[i].md_file_page_offset = md_addr/f->shared->fs_page_size; - index[i].chksum = H5_checksum_metadata(index[i].entry_ptr, (size_t)(index[i].length), 0); + index[i].chksum = H5_checksum_metadata(index[i].entry_ptr, + (size_t)(index[i].length), 0); /* Seek and write the entry to the metadata file */ - if(HDlseek(f->shared->vfd_swmr_md_fd, (HDoff_t)md_addr, SEEK_SET) < 0) - HGOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL, "unable to seek in the metadata file") - if(HDwrite(f->shared->vfd_swmr_md_fd, index[i].entry_ptr, index[i].length) != index[i].length) - HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "error in writing the page/multi-page entry to metadata file") + if ( 
HDlseek(f->shared->vfd_swmr_md_fd, (HDoff_t)md_addr, + SEEK_SET) < 0) + + HGOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL, \ + "unable to seek in the metadata file") + + if ( HDwrite(f->shared->vfd_swmr_md_fd, index[i].entry_ptr, + index[i].length) != index[i].length ) + + HGOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, \ + "error in writing the page/multi-page entry to metadata file") /* Set entry_ptr to NULL */ index[i].entry_ptr = NULL; - } /* end if */ + } /* end if */ } /* end for */ /* Construct and write index to the metadata file */ - if(H5F__vfd_swmr_construct_write_md_idx(f, num_entries, index) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "fail to construct & write index to md") + if ( H5F__vfd_swmr_construct_write_md_idx(f, num_entries, index) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, \ + "fail to construct & write index to md") /* Construct and write header to the md file */ - if(H5F__vfd_swmr_construct_write_md_hdr(f, num_entries) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "fail to construct & write header to md") + if ( H5F__vfd_swmr_construct_write_md_hdr(f, num_entries) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, \ + "fail to construct & write header to md") /* - * Release time out entries from the delayed list by scanning the list from the bottom up: - * --release to the metadata file free space manager all index entries that have - * resided on the list for more than max_lag ticks + * Release time out entries from the delayed list by scanning the + * list from the bottom up: + * + * --release to the metadata file free space manager all index + * entries that have resided on the list for more than + * max_lag ticks + * * --remove the associated entries from the list */ dl_entry = f->shared->dl_tail_ptr; - while(dl_entry != NULL) { + + while ( dl_entry != NULL ) { prev = dl_entry->prev; + /* max_lag is at least 3 */ - if((int)dl_entry->tick_num <= ((int)f->shared->tick_num - f->shared->vfd_swmr_config.max_lag)) { - if(H5MV_free(f, 
dl_entry->md_file_page_offset * f->shared->fs_page_size, dl_entry->length) < 0)
-                HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush clean entry")
+        if ( ( f->shared->tick_num > f->shared->vfd_swmr_config.max_lag ) &&
+             ( dl_entry->tick_num <=
+               f->shared->tick_num - f->shared->vfd_swmr_config.max_lag ) ) {
+
+            if ( H5MV_free(f, dl_entry->md_file_page_offset *
+                           f->shared->fs_page_size, dl_entry->length) < 0 )
+
+                HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \
+                            "unable to flush clean entry")
             /* Remove the entry from the delayed list */
-            H5F_DC_REMOVE(dl_entry, f->shared->dl_head_ptr, f->shared->dl_tail_ptr, f->shared->dl_len)
+            H5F_DC_REMOVE(dl_entry, f->shared->dl_head_ptr, \
+                          f->shared->dl_tail_ptr, f->shared->dl_len)
             /* Free the delayed entry struct */
             H5FL_FREE(H5F_vfd_swmr_dl_entry_t, dl_entry);
-        } else
+
+        } else {
+
             break;
+        }
+
         dl_entry = prev;
+
     } /* end while */
 done:
+
     FUNC_LEAVE_NOAPI(ret_value)
+
 } /* end H5F_update_vfd_swmr_metadata_file() */
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5F_vfd_swmr_writer__delay_write
+ *
+ * Purpose:  Given the base address of a page of metadata, or of a multi-
+ *           page metadata entry, determine whether the write must be
+ *           delayed.
+ *
+ *           At the conceptual level, the VFD SWMR writer must delay the
+ *           write of any metadata page or multi-page metadata that
+ *           overwrites an existing metadata page or multi-page metadata
+ *           entry until it has appeared in the metadata file index for
+ *           at least max_lag ticks.  Since the VFD SWMR reader goes
+ *           to the HDF5 file for any piece of metadata not listed in
+ *           the metadata file index, failure to delay such writes can
+ *           result in message from the future bugs.
+ *
+ *           The easy case is pages or multi-page metadata entries that
+ *           have just been allocated.  Obviously, these can be written
+ *           immediately.  This case is tracked and tested by the page
+ *           buffer proper.
+ *
+ *           This routine looks up the supplied page in the metadata file
+ *           index.
+ *
+ *           If the entry doesn't exist, the function sets
+ *           *delay_write_until_ptr to the current tick plus max_lag.
+ *
+ *           If the entry exists, the function sets *delay_write_until_ptr
+ *           equal to the entries delayed flush field if it is greater than
+ *           or equal to the current tick, or zero otherwise.
+ *
+ * Return:   SUCCEED/FAIL
+ *
+ * Programmer: John Mainzer 11/4/18
+ *
+ * Changes:  None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5F_vfd_swmr_writer__delay_write(H5F_t *f, uint64_t page,
+    uint64_t * delay_write_until_ptr)
+{
+    int32_t top = -1;
+    int32_t bottom = 0;
+    int32_t probe;
+    uint64_t delay_write_until = 0;
+    H5FD_vfd_swmr_idx_entry_t * ie_ptr = NULL;
+    H5FD_vfd_swmr_idx_entry_t * idx = NULL;
+    herr_t ret_value = SUCCEED;              /* Return value */
+
+    FUNC_ENTER_NOAPI(FAIL)
+
+    HDassert(f);
+    HDassert(f->shared);
+    HDassert(f->shared->vfd_swmr);
+    HDassert(f->shared->vfd_swmr_writer);
+
+    idx = f->shared->mdf_idx;
+
+    HDassert((idx) ||( f->shared->tick_num <= 0));
+
+    /* do a binary search on the metadata file index to see if
+     * it already contains an entry for the supplied page.
+     */
+
+    ie_ptr = NULL;
+
+    if ( idx ) {
+
+        top = f->shared->mdf_idx_entries_used - 1;
+        bottom = 0;
+    }
+
+    while ( top >= bottom ) {
+
+        HDassert(idx);
+
+        probe = bottom + (top - bottom) / 2; /* midpoint of [bottom, top] */
+
+        if ( idx[probe].hdf5_page_offset < page ) {
+
+            bottom = probe + 1;
+
+        } else if ( idx[probe].hdf5_page_offset > page ) {
+
+            top = probe - 1;
+
+        } else { /* found it */
+
+            ie_ptr = idx + probe;
+            bottom = top + 1; /* to exit loop */
+        }
+    }
+
+    if ( ie_ptr ) {
+
+        if ( ie_ptr->delayed_flush >= f->shared->tick_num ) {
+
+            delay_write_until = ie_ptr->delayed_flush;
+        }
+    } else {
+
+        delay_write_until = f->shared->tick_num +
+                            f->shared->vfd_swmr_config.max_lag;
+    }
+
+    if ( ( delay_write_until != 0 ) &&
+         ( ! ( ( delay_write_until >= f->shared->tick_num ) &&
+               ( delay_write_until <=
+                 (f->shared->tick_num + f->shared->vfd_swmr_config.max_lag) )
+             )
+         )
+       )
+        HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \
+                    "VFD SWMR write delay out of range")
+
+    *delay_write_until_ptr = delay_write_until;
+
+done:
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5F_vfd_swmr_writer__delay_write() */
+
+
 /*-------------------------------------------------------------------------
+ *
  * Function: H5F_vfd_swmr_writer_end_of_tick
  *
- * Purpose: Dummy right now
+ * Purpose:  Main routine for managing the end of tick for the VFD
+ *           SWMR writer.
+ *
+ *           This function performs all end of tick operations for the
+ *           writer -- specifically:
+ *
+ *            1) If requested, flush all raw data to the HDF5 file.
+ *
+ *               (Not for first cut.)
+ *
+ *            2) Flush the metadata cache to the page buffer.
+ *
+ *            3) If this is the first tick (i.e. tick == 0), create the
+ *               in memory version of the metadata file index.
+ *
+ *            4) Scan the page buffer tick list, and use it to update
+ *               the metadata file index, adding or modifying entries as
+ *               appropriate.
+ *
+ *            5) Scan the metadata file index for entries that can be
+ *               removed -- specifically entries that have been written
+ *               to the HDF5 file more than max_lag ticks ago, and haven't
+ *               been modified since.
+ *
+ *               (This is an optimization -- adress it later)
+ *
+ *            6) Scan the page buffer delayed write list for entries that
+ *               may now be written, and move any such entries to the
+ *               page buffer LRU.
+ *
+ *               (For the first cut, we will assume file was just created,
+ *               that there have been no flushes, and that no entries
+ *               have been removed from the metadata file index.  Under
+ *               these circumstances, the delayed write list must always
+ *               be empty.  Thus delay implementing this.)
+ *
+ *            7) Update the metadata file.  Must do this before we
+ *               release the tick list, as otherwise the page buffer
+ *               entry images may not be available.
+ * + * 8) Release the page buffer tick list. + * + * 9) Release any delayed writes whose delay has expired. + * + * 10) Increment the tick, and update the end of tick. + * + * In passing, generate log entries as appropriate. * * Return: SUCCEED/FAIL * + * Programmer: John Mainzer 11/4/18 + * + * Changes: None. + * *------------------------------------------------------------------------- */ herr_t H5F_vfd_swmr_writer_end_of_tick(void) { + int32_t idx_entries_added = 0; + int32_t idx_entries_modified = 0; + int32_t idx_ent_not_in_tl = 0; + int32_t idx_ent_not_in_tl_flushed = 0; + H5F_t * f; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) - if(vfd_swmr_file_g) { + f = vfd_swmr_file_g; + + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->pb_ptr); + HDassert(f->shared->vfd_swmr_writer); + + /* 1) If requested, flush all raw data to the HDF5 file. + * + * (Not for first cut.) + */ + if ( f->shared->vfd_swmr_config.flush_raw_data ) { + + HDassert(FALSE); + } + + + /* 2) Flush the metadata cache to the page buffer. */ + if ( H5AC_flush(f) < 0 ) + + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \ + "Can't flush metadata cache to the page buffer") + + + /* 3) If this is the first tick (i.e. tick == 0), create the + * in memory version of the metadata file index. + */ + if ( ( f->shared->tick_num == 0 ) && + ( H5F__vfd_swmr_writer__create_index(f) < 0 ) ) + + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, \ + "can't create MD file index") + + + /* 4) Scan the page buffer tick list, and use it to update + * the metadata file index, adding or modifying entries as + * appropriate. + */ + if ( H5PB_vfd_swmr__update_index(f, &idx_entries_added, + &idx_entries_modified, + &idx_ent_not_in_tl, + &idx_ent_not_in_tl_flushed) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't update MD file index") + + + /* 5) Scan the metadata file index for entries that can be + * removed -- specifically entries that have been written + * to the HDF5 file more than max_lag ticks ago, and haven't + * been modified since. 
+ * + * (This is an optimization -- address it later) + */ + + + /* 6) Scan the page buffer delayed write list for entries that + * may now be written, and move any such entries to the + * page buffer LRU. + * + * (For the first cut, we will assume file was just created, + * that there have been no flushes, and that no entries + * have been removed from the metadata file index. Under + * these circumstances, the delayed write list must always + * be empty. Thus delay implementing this.) + */ + HDassert( f->shared->pb_ptr->dwl_len == 0 ); + + + /* 7) Update the metadata file. Must do this before we + * release the tick list, as otherwise the page buffer + * entry images may not be available. + * + * Note that this operation will restore the index to + * sorted order. + */ + if ( H5F_update_vfd_swmr_metadata_file(f, + (uint32_t)(f->shared->mdf_idx_entries_used + idx_entries_added), + f->shared->mdf_idx) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't update MD file") + + /* at this point the metadata file index should be sorted -- update + * f->shared->mdf_idx_entries_used. + */ + f->shared->mdf_idx_entries_used += idx_entries_added; + + HDassert(f->shared->mdf_idx_entries_used <= f->shared->mdf_idx_len); + + + /* 8) Release the page buffer tick list. */ + if ( H5PB_vfd_swmr__release_tick_list(f) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't release tick list") + + + /* 9) Release any delayed writes whose delay has expired */ + if ( H5PB_vfd_swmr__release_delayed_writes(f) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_SYSTEM, FAIL, "can't release delayed writes") + + + /* 10) Increment the tick, and update the end of tick. 
*/ + if( vfd_swmr_file_g ) { + /* Update end_of_tick */ - if(H5F__vfd_swmr_update_end_of_tick_and_tick_num(vfd_swmr_file_g, TRUE) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "unable to update end of tick") + if ( H5F__vfd_swmr_update_end_of_tick_and_tick_num(vfd_swmr_file_g, + TRUE) < 0 ) + + HGOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, \ + "unable to update end of tick") } done: + FUNC_LEAVE_NOAPI(ret_value) + } /* end H5F_vfd_swmr_writer_end_of_tick() */ + +/*------------------------------------------------------------------------- + * + * Function: H5F__vfd_swmr_writer__create_index + * + * Purpose: Allocate and initialize the index for the VFD SWMR metadata + * file. + * + * In the first cut at VFD SWMR, the index is of fixed size, + * as specified by the md_pages_reserved field of the VFD + * SWMR configuration. If we exceed this size we will simply + * abort. Needless to say, this will have to change in the + * production version, but it is good enough for the working + * prototype. + * + * Return: SUCCEED/FAIL + * + * Programmer: John Mainzer 11/5/18 + * + * Changes: None. 
+ * + *------------------------------------------------------------------------- + */ +herr_t +H5F__vfd_swmr_writer__create_index(H5F_t * f) +{ + int i; + size_t bytes_available; + int32_t entries_in_index; + size_t index_size; + H5FD_vfd_swmr_idx_entry_t * index = NULL; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* operate on the file supplied by the caller */ + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->vfd_swmr_writer); + HDassert(f->shared->mdf_idx == NULL); + HDassert(f->shared->mdf_idx_len == 0); + HDassert(f->shared->mdf_idx_entries_used == 0); + + bytes_available = (size_t)f->shared->fs_page_size * + (size_t)(f->shared->vfd_swmr_config.md_pages_reserved) - + H5FD_MD_HEADER_SIZE; + + HDassert(bytes_available > 0); + + entries_in_index = (int32_t)(bytes_available / H5FD_MD_INDEX_ENTRY_SIZE); + + HDassert(entries_in_index > 0); + + index_size = sizeof(H5FD_vfd_swmr_idx_entry_t) * (size_t)entries_in_index; + index = (H5FD_vfd_swmr_idx_entry_t *)HDmalloc(index_size); + + if ( index == NULL ) + + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, \ + "memory allocation failed for md index") + + for ( i = 0; i < entries_in_index; i++ ) { + + index[i].hdf5_page_offset = 0; + index[i].md_file_page_offset = 0; + index[i].length = 0; + index[i].chksum = 0; + index[i].entry_ptr = NULL; + index[i].tick_of_last_change = 0; + index[i].clean = FALSE; + index[i].tick_of_last_flush = 0; + index[i].delayed_flush = 0; + index[i].moved_to_hdf5_file = FALSE; + } + + f->shared->mdf_idx = index; + f->shared->mdf_idx_len = entries_in_index; + f->shared->mdf_idx_entries_used = 0; + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5F__vfd_swmr_writer__create_index() */ + /*------------------------------------------------------------------------- * Function: H5F_vfd_swmr_reader_end_of_tick * diff --git a/src/H5Fpkg.h b/src/H5Fpkg.h index 9e523de..d725f77 100644 --- a/src/H5Fpkg.h +++ b/src/H5Fpkg.h @@ -35,6 +35,7 @@ /* Other private headers needed by this file 
*/ #include "H5private.h" /* Generic Functions */ #include "H5ACprivate.h" /* Metadata cache */ +#include "H5FDprivate.h" /* VFD -- for VFD SWMR */ #include "H5FLprivate.h" /* Free Lists */ #include "H5FOprivate.h" /* File objects */ #include "H5FSprivate.h" /* File free space */ @@ -381,24 +382,69 @@ struct H5F_file_t { /* VFD SWMR */ /* Configuration info */ - H5F_vfd_swmr_config_t vfd_swmr_config; /* Copy of the VFD SWMR configuration from the - FAPL used to open the file */ - hbool_t vfd_swmr; /* The file is opened with VFD SWMR configured or not*/ - hbool_t vfd_swmr_writer; /* This is the VFD SWMR writer or not */ + H5F_vfd_swmr_config_t vfd_swmr_config; /* Copy of the VFD SWMR + * configuration from the + * FAPL used to open the file + */ + hbool_t vfd_swmr; /* The file is opened with VFD + * SWMR configured or not + */ + hbool_t vfd_swmr_writer; /* This is the VFD SWMR writer or + * not + */ uint64_t tick_num; /* Number of the current tick */ struct timespec end_of_tick; /* End time of the current tick */ + /* VFD SWMR metadata file index */ + H5FD_vfd_swmr_idx_entry_t * mdf_idx; /* pointer to an array of instance + * of H5FD_vfd_swmr_idx_entry_t of + * length mdf_idx_len. This array + * is used by the vfd swmr writer + * to assemble the metadata file + * index at the end of each tick, + * and by the vfd swmr readers to + * track changes in the index. + * With one brief exception during + * writer end of tick processing, + * this index will alwasy be sorted + * in increasing HDF5 file page + * offset order. + * + * This field should be NULL unless + * the index is defined. + */ + int32_t mdf_idx_len; /* number of entries in the array + * of instances of + * H5FD_vfd_swmr_idx_entry_t pointed + * to by mdf_idx above. Note that + * not all entries in the index + * need be used. + */ + int32_t mdf_idx_entries_used; /* Number of entries in *mdf_idx + * that are in use -- these will + * be contiguous at indicies 0 + * through mdf_idx_entries_used - 1. 
+ */ + /* Metadata file for VFD SWMR writer */ - int vfd_swmr_md_fd; /* POSIX: file descriptor for the metadata file */ - haddr_t vfd_swmr_md_eoa; /* POSIX: eoa for the metadata file */ + int vfd_swmr_md_fd; /* POSIX: file descriptor for the + * metadata file + */ + haddr_t vfd_swmr_md_eoa; /* POSIX: eoa for the metadata + * file + */ /* Free space manager for the metadata file */ H5FS_t *fs_man_md; /* Free-space manager */ - H5F_fs_state_t fs_state_md; /* State of the free space manager */ + H5F_fs_state_t fs_state_md; /* State of the free space + * manager + */ /* Delayed free space release doubly linked list */ uint32_t dl_len; /* # of entries in the list */ - H5F_vfd_swmr_dl_entry_t *dl_head_ptr; /* Points to the beginning of the list */ + H5F_vfd_swmr_dl_entry_t *dl_head_ptr; /* Points to the beginning of + * the list + */ H5F_vfd_swmr_dl_entry_t *dl_tail_ptr; /* Points to the end of the list */ }; diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h index dc407c8..cad92fa 100644 --- a/src/H5Fprivate.h +++ b/src/H5Fprivate.h @@ -879,6 +879,8 @@ H5_DLL herr_t H5F_cwfs_remove_heap(H5F_file_t *shared, struct H5HG_heap_t *heap) H5_DLL herr_t H5F_debug(H5F_t *f, FILE * stream, int indent, int fwidth); /* VFD SWMR */ +H5_DLL herr_t H5F_vfd_swmr_writer__delay_write(H5F_t *f, uint64_t page, + uint64_t * delay_write_until_ptr); H5_DLL herr_t H5F_vfd_swmr_writer_end_of_tick(void); H5_DLL herr_t H5F_vfd_swmr_reader_end_of_tick(void); H5_DLL herr_t H5F_update_vfd_swmr_metadata_file(H5F_t *f, uint32_t index_len, struct H5FD_vfd_swmr_idx_entry_t *index); diff --git a/src/H5Fpublic.h b/src/H5Fpublic.h index 9c47098..c2bfb21 100644 --- a/src/H5Fpublic.h +++ b/src/H5Fpublic.h @@ -222,8 +222,8 @@ typedef herr_t (*H5F_flush_cb_t)(hid_t object_id, void *udata); #define H5F__MAX_VFD_SWMR_FILE_NAME_LEN 1024 typedef struct H5F_vfd_swmr_config_t { int32_t version; - int32_t tick_len; - int32_t max_lag; + uint32_t tick_len; + uint32_t max_lag; hbool_t vfd_swmr_writer;/****/ hbool_t 
flush_raw_data; int32_t md_pages_reserved; diff --git a/src/H5PB.c b/src/H5PB.c index c89c381..a3cac89 100644 --- a/src/H5PB.c +++ b/src/H5PB.c @@ -97,7 +97,7 @@ static herr_t H5PB__make_space(H5F_t *f, H5PB_t *pb_ptr, static herr_t H5PB__mark_entry_clean(H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr); -static herr_t H5PB__mark_entry_dirty(H5PB_t *pb_ptr, +static herr_t H5PB__mark_entry_dirty(H5F_t * f, H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr); static herr_t H5PB__read_meta(H5F_t *f, H5FD_mem_t type, haddr_t addr, @@ -188,6 +188,10 @@ H5PB_reset_stats(H5PB_t *pb_ptr) pb_ptr->failed_ht_searches = 0; pb_ptr->total_failed_ht_search_depth = 0; pb_ptr->max_index_len = 0; + pb_ptr->max_clean_index_len = 0; + pb_ptr->max_dirty_index_len = 0; + pb_ptr->max_clean_index_size = 0; + pb_ptr->max_dirty_index_size = 0; pb_ptr->max_index_size = 0; pb_ptr->max_rd_pages = 0; pb_ptr->max_md_pages = 0; @@ -389,10 +393,17 @@ H5PB_print_stats(const H5PB_t *pb_ptr) * buffer is configured to allow pages of the specified * type. * - * This function is called by the - * from the MF layer when a new page is allocated to - * indicate to the page buffer layer that a read of the page - * from the file is not necessary since it's an empty page. + * This function is called by the MF layer when a new page + * is allocated to indicate to the page buffer layer that + * a read of the page from the file is not necessary since + * it's an empty page. + * + * For purposes of the VFD SWMR writer, we also track pages + * that are inserted via this call, as the fact that the + * page was allocated implies that an earlier version does + * not exist in the HDF5 file, and thus we need not concern + * ourselves with delaying the write of this pages to avoid + * messages from the future on the reader. * * Note that this function inserts the new page without * attempting to make space. 
This can result in the page @@ -452,6 +463,9 @@ H5PB_add_new_page(H5F_t *f, H5FD_mem_t type, haddr_t page_addr) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ "new page buffer page creation failed.") + /* make note that this page was allocated, not loaded from file */ + entry_ptr->loaded = FALSE; + /* updates stats */ H5PB__UPDATE_STATS_FOR_INSERTION(pb_ptr, entry_ptr); } @@ -538,53 +552,61 @@ H5PB_create(H5F_t *f, size_t size, unsigned page_buf_min_meta_perc, /* initialize the new instance of H5PB_t */ - pb_ptr->magic = H5PB__H5PB_T_MAGIC; - pb_ptr->page_size = f->shared->fs_page_size; + pb_ptr->magic = H5PB__H5PB_T_MAGIC; + pb_ptr->page_size = f->shared->fs_page_size; H5_CHECKED_ASSIGN(pb_ptr->page_size, size_t, \ f->shared->fs_page_size, hsize_t); - pb_ptr->max_pages = (int32_t)(size / f->shared->fs_page_size); - pb_ptr->curr_pages = 0; - pb_ptr->curr_md_pages = 0; - pb_ptr->curr_rd_pages = 0; - pb_ptr->min_md_pages = min_md_pages; - pb_ptr->min_rd_pages = min_rd_pages; + pb_ptr->max_pages = (int32_t)(size / f->shared->fs_page_size); + pb_ptr->curr_pages = 0; + pb_ptr->curr_md_pages = 0; + pb_ptr->curr_rd_pages = 0; + pb_ptr->min_md_pages = min_md_pages; + pb_ptr->min_rd_pages = min_rd_pages; - pb_ptr->max_size = size; - pb_ptr->min_meta_perc = page_buf_min_meta_perc; - pb_ptr->min_raw_perc = page_buf_min_raw_perc; + pb_ptr->max_size = size; + pb_ptr->min_meta_perc = page_buf_min_meta_perc; + pb_ptr->min_raw_perc = page_buf_min_raw_perc; /* index */ for ( i = 0; i < H5PB__HASH_TABLE_LEN; i++ ) - pb_ptr->ht[i] = NULL; - pb_ptr->index_len = 0; - pb_ptr->index_size = 0; + pb_ptr->ht[i] = NULL; + pb_ptr->index_len = 0; + pb_ptr->clean_index_len = 0; + pb_ptr->dirty_index_len = 0; + pb_ptr->index_size = 0; + pb_ptr->clean_index_size = 0; + pb_ptr->dirty_index_size = 0; + pb_ptr->il_len = 0; + pb_ptr->il_size = 0; + pb_ptr->il_head = NULL; + pb_ptr->il_tail = NULL; /* LRU */ - pb_ptr->LRU_len = 0; - pb_ptr->LRU_size = 0; - pb_ptr->LRU_head_ptr = NULL; - 
pb_ptr->LRU_tail_ptr = NULL; + pb_ptr->LRU_len = 0; + pb_ptr->LRU_size = 0; + pb_ptr->LRU_head_ptr = NULL; + pb_ptr->LRU_tail_ptr = NULL; /* VFD SWMR specific fields. * The following fields are defined iff vfd_swmr_writer is TRUE. */ - pb_ptr->vfd_swmr_writer = FALSE; - pb_ptr->mpmde_count = 0; - pb_ptr->cur_tick = 0; + pb_ptr->vfd_swmr_writer = FALSE; + pb_ptr->mpmde_count = 0; + pb_ptr->cur_tick = 0; /* delayed write list */ - pb_ptr->max_delay = 0; - pb_ptr->dwl_len = 0; - pb_ptr->dwl_size = 0; - pb_ptr->dwl_head_ptr = NULL; - pb_ptr->dwl_tail_ptr = NULL; + pb_ptr->max_delay = 0; + pb_ptr->dwl_len = 0; + pb_ptr->dwl_size = 0; + pb_ptr->dwl_head_ptr = NULL; + pb_ptr->dwl_tail_ptr = NULL; /* tick list */ - pb_ptr->tl_len = 0; - pb_ptr->tl_size = 0; - pb_ptr->tl_head_ptr = NULL; - pb_ptr->tl_tail_ptr = NULL; + pb_ptr->tl_len = 0; + pb_ptr->tl_size = 0; + pb_ptr->tl_head_ptr = NULL; + pb_ptr->tl_tail_ptr = NULL; H5PB_reset_stats(pb_ptr); @@ -1232,6 +1254,453 @@ done: /*------------------------------------------------------------------------- * + * Function: H5PB_vfd_swmr__release_delayed_writes + * + * Purpose: After the tick list has been released, and before the + * beginning of the next tick, we must scan the delayed + * write list, and release those entries whose delays have + * expired. + * + * Note that pages of metadata, and multi-page metadata entries + * are handled differently. + * + * Regular pages are removed from the delayed write list and + * inserted in the replacement policy + * + * In contrast, multi-page metadata entries are simply + * flushed and evicted. + * + * Since the delayed write list is sorted in decreasing + * delay_write_until order, we start our scan at the bottom + * of the delayed write list and continue upwards until no + * expired entries remain. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer -- 11/15/18 + * + * Changes: None. 
+ * + *------------------------------------------------------------------------- + */ +herr_t +H5PB_vfd_swmr__release_delayed_writes(H5F_t * f) +{ + H5PB_t * pb_ptr = NULL; + H5PB_entry_t *entry_ptr = NULL; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->vfd_swmr); + HDassert(f->shared->vfd_swmr_writer); + + pb_ptr = f->shared->pb_ptr; + + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(pb_ptr->vfd_swmr_writer); + + while ( ( pb_ptr->dwl_tail_ptr ) && + ( pb_ptr->dwl_tail_ptr->delay_write_until < + f->shared->tick_num ) ) { + + entry_ptr = pb_ptr->dwl_tail_ptr; + + HDassert(entry_ptr->is_dirty); + + H5PB__REMOVE_FROM_DWL(pb_ptr, entry_ptr, FAIL) + + entry_ptr->delay_write_until = 0; /* delay has expired -- entry is no longer subject to a delayed write */ + + if ( entry_ptr->is_mpmde ) { /* multi-page metadata entry: flush and evict now */ + + if ( H5PB__flush_entry(f, pb_ptr, entry_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "flush of mpmde failed") + + if ( H5PB__evict_entry(pb_ptr, entry_ptr, TRUE) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "eviction of mpmde failed") + + } else { /* regular page: insert it in the replacement policy */ + + H5PB__UPDATE_RP_FOR_INSERT_APPEND(pb_ptr, entry_ptr, FAIL) + } + } + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB_vfd_swmr__release_delayed_writes() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5PB_vfd_swmr__release_tick_list + * + * Purpose: After the metadata file has been updated, and before the + * beginning of the next tick, we must release the tick list. + * + * This function performs that release. + * + * In passing, flush and evict any multi-page metadata entries + * that are not subject to a delayed write. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer -- 11/12/18 + * + * Changes: None. 
+ * + *------------------------------------------------------------------------- + */ +herr_t +H5PB_vfd_swmr__release_tick_list(H5F_t * f) +{ + H5PB_t * pb_ptr = NULL; + H5PB_entry_t *entry_ptr = NULL; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->vfd_swmr); + HDassert(f->shared->vfd_swmr_writer); + + pb_ptr = f->shared->pb_ptr; + + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(pb_ptr->vfd_swmr_writer); + + /* remove all entries from the tick list */ + while ( pb_ptr->tl_head_ptr ) { + + entry_ptr = pb_ptr->tl_head_ptr; + + H5PB__REMOVE_FROM_TL(pb_ptr, entry_ptr, FAIL) + + entry_ptr->modified_this_tick = FALSE; + + if ( entry_ptr->is_mpmde ) { + + HDassert(entry_ptr->is_dirty); + + if ( entry_ptr->delay_write_until == 0 ) { /* not subject to a delayed write */ + + /* flush and evict the multi-page metadata entry immediately */ + if ( H5PB__flush_entry(f, pb_ptr, entry_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_WRITEERROR, FAIL, \ + "flush of mpmde failed") + + if ( H5PB__evict_entry(pb_ptr, entry_ptr, TRUE) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "eviction of mpmde failed") + } + } + /* if the entry is not a multi-page metadata entry, it must already + * be on either the replacement policy or the delayed write list. + * In either case, it will be flushed when possible and necessary. 
+ */ + } + + HDassert(pb_ptr->tl_head_ptr == NULL); + HDassert(pb_ptr->tl_tail_ptr == NULL); + HDassert(pb_ptr->tl_len == 0); + HDassert(pb_ptr->tl_size == 0); + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB_vfd_swmr__release_tick_list */ + + +/*------------------------------------------------------------------------- + * + * Function: H5PB_vfd_swmr__update_index + * + * Purpose: In the VFD SWMR writer, all metadata writes to the page + * buffer during a tick are buffered in the page buffer in + * the tick list. 
Further, the metadata cache is flushed + * to the page buffer at the end of the tick so that all + * metadata changes during the tick are reflected in the + * tick list. + * + * Once this is done, the internal representation of the + * metadata file index must be updated from the tick list + * so that the metadata file can be updated, and the tick + * list can be emptied and prepared to buffer metadata changes + * in the next tick. + * + * This function is called to accomplish this. Its cycle of + * operation is as follows: + * + * 1) Scan the tick list. For each entry (*pbe_ptr), test + * to see if it appears in the index. + * + * If it does the entry must have been modified in the + * past tick. Update the index entry (*ie_ptr) as follows: + * + * a) Set ie_ptr->entry_ptr = pbe_ptr->image_ptr. This + * is needed to give the metadata file update code + * access to the image of the target page or multi-page + * metadata entry. Note that ie_ptr->entry_ptr will + * be set to NULL as soon as the metadata file is updated, + * so the buffer pointed to by pbe_ptr->image_ptr can + * be safely discarded at any time after the metadata + * file update. + * + * b) Set ie_ptr->tick_of_last_change to the current tick. + * + * c) If pbe_ptr->is_dirty, set ie_ptr->clean to FALSE. + * If pbe_ptr->is_dirty is FALSE, set ie_ptr->clean + * to TRUE and set ie_ptr->tick_of_last_flush to the + * current tick. + * + * If the tick list entry (*pbe_ptr) doesn't appear in + * the index, allocate a metadata file index entry (*ie_ptr), + * and initialize it as follows: + * + * ie_ptr->hdf5_page_offset = pbe_ptr->page + * ie_ptr->length = pbe_ptr->size + * ie_ptr->delayed_flush = pbe_ptr->delay_write_until + * + * and then update the new entry as per the existing entry + * case described above. + * + * 2) Scan the internal representation of the metadata file + * index for entries that do not appear in the tick list. 
+ * For each such entry (*ie_ptr), proceed as follows: + * + * 1) If ie_ptr->clean, we are done -- proceed to the + * next index entry that doesn't appear in the tick list. + * + * 2) Test to see if the cognate entry appears in the page + * buffer. If it doesn't, it must have been flushed and + * evicted in the past tick. Set + * + * ie_ptr->clean = TRUE, and + * + * ie_ptr->tick_of_last_flush = current tick + * + * and proceed to the next index entry that doesn't + * appear in the tick list. + * + * 3) If the cognate entry does appear in the page buffer + * and is clean, proceed as per 2) above. + * + * 4) In all other cases, do nothing, and proceed to the + * next index entry that does not appear in the tick list. + * + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer -- 11/9/18 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +herr_t +H5PB_vfd_swmr__update_index(H5F_t * f, + int * idx_ent_added_ptr, + int * idx_ent_modified_ptr, + int * idx_ent_not_in_tl_ptr, + int * idx_ent_not_in_tl_flushed_ptr) +{ + int32_t i; + int32_t idx_ent_added = 0; + int32_t idx_ent_modified = 0; + int32_t idx_ent_not_in_tl = 0; + int32_t idx_ent_not_in_tl_flushed = 0; + H5PB_t * pb_ptr = NULL; + H5PB_entry_t *pbe_ptr = NULL; + H5FD_vfd_swmr_idx_entry_t * ie_ptr = NULL; + H5FD_vfd_swmr_idx_entry_t * idx = NULL; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + HDassert(f); + HDassert(f->shared); + HDassert(f->shared->vfd_swmr); + HDassert(f->shared->vfd_swmr_writer); + + idx = f->shared->mdf_idx; + + HDassert(idx); + + pb_ptr = f->shared->pb_ptr; + + HDassert(pb_ptr); + HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); + HDassert(pb_ptr->vfd_swmr_writer); + + HDassert(idx_ent_added_ptr); + HDassert(idx_ent_modified_ptr); + HDassert(idx_ent_not_in_tl_ptr); + HDassert(idx_ent_not_in_tl_flushed_ptr); + + /* scan the tick list and insert or 
update metadata file index entries + * as appropriate. + */ + + pbe_ptr = pb_ptr->tl_head_ptr; + + while ( pbe_ptr ) { + + uint64_t target_page; + int32_t top; + int32_t bottom; + int32_t probe; + + HDassert(pbe_ptr->magic == H5PB__H5PB_ENTRY_T_MAGIC); + + /* do a binary search on the metadata file index to see if + * it already contains an entry for *pbe_ptr. + */ + + ie_ptr = NULL; + top = f->shared->mdf_idx_entries_used - 1; + bottom = 0; + target_page = pbe_ptr->page; + + while ( top >= bottom ) { + + probe = (top + bottom) / 2; /* midpoint -- must stay within [bottom, top] */ + + if ( idx[probe].hdf5_page_offset < target_page ) { + + bottom = probe + 1; + + } else if ( idx[probe].hdf5_page_offset > target_page ) { + + top = probe - 1; + + } else { /* found it */ + + ie_ptr = idx + probe; + bottom = top + 1; /* to exit loop */ + } + } + + if ( ie_ptr == NULL ) { /* alloc new entry in the metadata file index*/ + + /* for now the metadata file index is of fixed size -- if we + * exceed the maximum size, just abort. + * + * Obviously, this must be fixed for the production version. 
+ */ + int32_t new_index_entry_index; + + new_index_entry_index = f->shared->mdf_idx_entries_used + + idx_ent_added++; + + if ( new_index_entry_index >= f->shared->mdf_idx_len ) { + + HDfprintf(stderr, "\n\nmax mdf index len exceeded.\n\n"); + exit(1); + } + + ie_ptr = idx + new_index_entry_index; + + /* partial initialization of new entry -- rest done later */ + ie_ptr->hdf5_page_offset = target_page; + ie_ptr->md_file_page_offset = 0; /* undefined at this point */ + ie_ptr->length = (uint32_t)(pbe_ptr->size); + ie_ptr->chksum = 0; /* undefined at this point */ + /* ie_ptr->entry_ptr initialized below */ + /* ie_ptr->tick_of_last_change initialized below */ + /* ie_ptr->clean initialized below */ + /* ie_ptr->tick_of_last_flush initialized below */ + ie_ptr->delayed_flush = pbe_ptr->delay_write_until; + ie_ptr->moved_to_hdf5_file = FALSE; + + } else { + + idx_ent_modified++; + } + + ie_ptr->entry_ptr = pbe_ptr->image_ptr; + ie_ptr->tick_of_last_change = f->shared->tick_num; + ie_ptr->clean = !(pbe_ptr->is_dirty); + + if ( ie_ptr->clean ) { + + ie_ptr->tick_of_last_flush = f->shared->tick_num; + + } else { + + ie_ptr->tick_of_last_flush = 0; + } + + HDassert(ie_ptr); + + /* advance to the next entry in the tick list -- without this the + * loop never terminates on a non-empty tick list. + */ + pbe_ptr = pbe_ptr->tl_next; + } + + /* scan the metadata file index for entries that don't appear in the + * tick list. If the index entry is dirty, and either doesn't appear + * in the page buffer, or is clean in the page buffer, mark the index + * entry clean and as having been flushed in the current tick. + */ + for ( i = 0; i < f->shared->mdf_idx_entries_used; i++ ) { + + HDassert( ( i == 0 ) || + ( idx[i - 1].hdf5_page_offset < idx[i].hdf5_page_offset ) ); + + if ( idx[i].tick_of_last_change < f->shared->tick_num ) { + + idx_ent_not_in_tl++; + + ie_ptr = idx + i; + + if ( ! ( ie_ptr->clean ) ) { + + H5PB__SEARCH_INDEX(pb_ptr, ie_ptr->hdf5_page_offset, \ + pbe_ptr, FAIL); + + if ( ( ! pbe_ptr ) || ( ! 
( pbe_ptr->is_dirty ) ) ) { + + idx_ent_not_in_tl_flushed++; + ie_ptr->clean = TRUE; + ie_ptr->tick_of_last_flush = f->shared->tick_num; + } + } + } + } + + HDassert(idx_ent_modified + idx_ent_not_in_tl == + f->shared->mdf_idx_entries_used); + + HDassert(idx_ent_modified + idx_ent_not_in_tl + idx_ent_added <= + f->shared->mdf_idx_len); + + *idx_ent_added_ptr = idx_ent_added; + *idx_ent_modified_ptr = idx_ent_modified; + *idx_ent_not_in_tl_ptr = idx_ent_not_in_tl; + *idx_ent_not_in_tl_flushed_ptr = idx_ent_not_in_tl_flushed; + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5PB_vfd_swmr__update_index */ + + +/*------------------------------------------------------------------------- + * * Function: H5PB_write * * Purpose: Write data into the Page Buffer if practical, and to file @@ -1275,14 +1744,17 @@ done: * one page, and vfd_swmr_writer is TRUE, the write must * buffered in the page buffer until the end of the tick. * - * Create a multi-page metadata entry in the page buffer - * and copy the write into it. Insert the new entry in - * the tick list. + * If it doesn't exist already, create a multi-page metadata + * entry in the page buffer and copy the write into it. + * Insert the new entry in the tick list if necessary. * * Test to see if the write of the multi-page metadata * entry must be delayed. If so, place the entry in - * the delayed write list. Otherwise, write the multi-page - * metadata entry to the HDF5 file. + * the delayed write list. Otherwise, the multi-page + * metadata entry will be written to the HDF5 file and + * evicted when the tick list is released at the of the + * tick. 
+ * * * 8) If the write is of metadata, and the write is of size * less than or equal to the page size, write the data @@ -1329,7 +1801,7 @@ H5PB_write(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, HDassert(pb_ptr->magic == H5PB__H5PB_T_MAGIC); - if ( H5FD_MEM_DRAW == type ) { /* raw data read */ + if ( H5FD_MEM_DRAW == type ) { /* raw data write */ if ( pb_ptr->min_md_pages == pb_ptr->max_pages ) { @@ -1337,7 +1809,7 @@ H5PB_write(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, bypass_pb = TRUE; } - } else { /* metadata read */ + } else { /* metadata write */ if ( pb_ptr->min_rd_pages == pb_ptr->max_pages ) { @@ -1483,6 +1955,8 @@ H5PB__allocate_page(H5PB_t *pb_ptr, size_t size, hbool_t clean_image) /* fields supporting the hash table */ entry_ptr->ht_prev = NULL; entry_ptr->ht_next = NULL; + entry_ptr->il_prev = NULL; + entry_ptr->il_next = NULL; /* fields supporting replacement policise */ entry_ptr->next = NULL; @@ -1675,6 +2149,8 @@ H5PB__deallocate_page(H5PB_entry_t *entry_ptr) HDassert(!(entry_ptr->is_dirty)); HDassert(entry_ptr->ht_next == NULL); HDassert(entry_ptr->ht_prev == NULL); + HDassert(entry_ptr->il_next == NULL); + HDassert(entry_ptr->il_prev == NULL); HDassert(entry_ptr->next == NULL); HDassert(entry_ptr->prev == NULL); HDassert(entry_ptr->tl_next == NULL); @@ -1695,7 +2171,7 @@ H5PB__deallocate_page(H5PB_entry_t *entry_ptr) * * Purpose: Evict the target entry from the from the page buffer, and * de-allocate its associated image and instance of - * H5PB_entry_t.. + * H5PB_entry_t. * * In general, entries must be clean before they can be * evicted, and the minimum metadata and raw data limits @@ -1835,8 +2311,7 @@ H5PB__flush_entry(H5F_t *f, H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr) HDassert(entry_ptr->image_ptr); HDassert(entry_ptr->is_dirty); HDassert((pb_ptr->vfd_swmr_writer) || (!(entry_ptr->is_mpmde))); - HDassert( ( ! 
(pb_ptr->vfd_swmr_writer) ) || - ( (pb_ptr->cur_tick) >= (entry_ptr->delay_write_until) ) ); + HDassert(0 == (entry_ptr->delay_write_until)); /* Retrieve the 'eoa' for the file */ if ( HADDR_UNDEF == (eoa = H5F_get_eoa(f, entry_ptr->mem_type)) ) @@ -1898,11 +2373,14 @@ H5PB__flush_entry(H5F_t *f, H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr) } /* mark the entry clean */ - entry_ptr->is_dirty = FALSE; + if ( H5PB__mark_entry_clean(pb_ptr, entry_ptr) < 0 ) + + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "mark entry clean failed") /* if the entry is on the LRU, update the replacement policy */ - if ( ! (entry_ptr->is_mpmde) ) { + if ( ( ! (entry_ptr->is_mpmde) ) && + ( entry_ptr->delay_write_until == 0 ) ) { H5PB__UPDATE_RP_FOR_FLUSH(pb_ptr, entry_ptr, FAIL) } @@ -1929,6 +2407,14 @@ done: * even in the VFD SWMR case, as in this context, multi-page * metadata entries are always written in full, and they * may only enter the page buffer as the result of a write. + * + * In the context of VFD SWMR, when an page is loaded from + * file, it is possible that the VFD SWMR writer must delay + * writes to the page to avoid the possibility of message from + * the future bugs on the VFD SWMR reader. For this reason, + * make note of the fact that the entry has be loaded from + * from file, so that the necessary checks can be made when + * writing to the page. * * Return: SUCCEED if no errors are encountered, and * FAIL otherwise. 
@@ -1986,7 +2472,6 @@ H5PB__load_page(H5F_t *f, H5PB_t *pb_ptr, haddr_t addr, H5FD_mem_t type, */ skip_read = (addr >= eof); - /* make space in the page buffer if necessary */ if ( ( pb_ptr->curr_pages >= pb_ptr->max_pages ) && ( H5PB__make_space(f, pb_ptr, type) < 0 ) ) @@ -2024,6 +2509,11 @@ H5PB__load_page(H5F_t *f, H5PB_t *pb_ptr, haddr_t addr, H5FD_mem_t type, HGOTO_ERROR(H5E_PAGEBUF, H5E_READERROR, FAIL, \ "driver read request failed") + /* If in fact the page was read from file, make note of this fact + * for purposes of VFD SWMR delayed writes in the VFD SWMR writer. + */ + entry_ptr->loaded = ! skip_read; + H5PB__UPDATE_STATS_FOR_LOAD(pb_ptr, entry_ptr) if ( entry_ptr_ptr ) { @@ -2233,6 +2723,10 @@ done: * this case, the entry must be marked clean to avoid * sanity check failures on evictions. * + * While this function does update the index for the + * entry clean, it does not update the replacement policy. + * If this is desired, it must be done by the caller. + * * Return: Non-negative on success/Negative on failure * * Programmer: John Mainzer -- 10/14/18 @@ -2262,13 +2756,12 @@ H5PB__mark_entry_clean(H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr) /* mark the entry clean */ entry_ptr->is_dirty = FALSE; - /* delete this once we start tracking clean and dirty entry is the hash - * table. - */ - if ( ! (entry_ptr->is_mpmde) ) { + /* update the index for the entry clean */ + H5PB__UPDATE_INDEX_FOR_ENTRY_CLEAN(pb_ptr, entry_ptr) - H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) - } + /* don't update the replacement policy -- this will be done by + * the caller if desired. + */ done: @@ -2283,14 +2776,17 @@ done: * * Purpose: Mark the target entry as dirty. * - * Under normal circumstances, the entry will be in the - * replacement policy. In this, also update the replacement - * policy for and access. - * - * If pb_ptr->vfd_swmr_writer, it is possible that the target - * is a multi-page metadata entry. 
In this case, the entry - * is not in the replacement policy, and thus the policy - * should not be updated. + * If pb_ptr->vfd_swmr_writer is FALSE, the entry will be + * in the replacement policy. In this case, we simply mark the + * entry as dirty, and update the replacement policy for an + * access. + * + * If pb_ptr->vfd_swmr_writer, it is possible that we must + * delay writes to the target page or multi-page metadata + * entry to avoid message from the future bugs on the VFD + * SWMR readers. In such cases we must set the + * delay_write_until field and insert the entry on the + * delayed write list instead of the replacement policy. * * Return: Non-negative on success/Negative on failure * @@ -2301,8 +2797,9 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5PB__mark_entry_dirty(H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr) +H5PB__mark_entry_dirty(H5F_t * f, H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr) { + uint64_t delay_write_until = 0; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) @@ -2318,14 +2815,52 @@ H5PB__mark_entry_dirty(H5PB_t *pb_ptr, H5PB_entry_t *entry_ptr) HDassert(entry_ptr->image_ptr); HDassert((pb_ptr->vfd_swmr_writer) || (!(entry_ptr->is_mpmde))); - /* mark the entry dirty */ - entry_ptr->is_dirty = TRUE; + /* mark the entry dirty if necessary */ + if ( ! ( entry_ptr->is_dirty ) ) { - /* if the entry is on the LRU, update the replacement policy */ - if ( ( ! 
(entry_ptr->is_mpmde) ) && - ( entry_ptr->delay_write_until == 0 ) ) { + entry_ptr->is_dirty = TRUE; + + H5PB__UPDATE_INDEX_FOR_ENTRY_DIRTY(pb_ptr, entry_ptr) + + /* since the entry was clean, there can be no pending delayed write */ + HDassert(entry_ptr->delay_write_until == 0); + + if ( ( pb_ptr->vfd_swmr_writer ) && + ( entry_ptr->loaded ) && + ( H5F_vfd_swmr_writer__delay_write(f, entry_ptr->page, + &delay_write_until) < 0 ) ) - H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "get delayed write request failed") + + /* record the computed delay in the entry (0 if no delay is needed) */ + entry_ptr->delay_write_until = delay_write_until; + + if ( delay_write_until > 0 ) { + + H5PB__INSERT_IN_DWL(pb_ptr, entry_ptr, FAIL) + + } else if ( ! (entry_ptr->is_mpmde) ) { + + H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) + + } else { + + /* the entry should be a multi-page metadata entry that + * has been modified this tick. Thus it is only on the + * tick list, and no action is required. + */ + HDassert(entry_ptr->modified_this_tick); + HDassert(entry_ptr->is_mpmde); + HDassert(pb_ptr->vfd_swmr_writer); + } + } else if ( ( ! (entry_ptr->is_mpmde) ) && + ( entry_ptr->delay_write_until == 0 ) ) { + + /* the entry is dirty and on the replacement policy -- just update + * the replacement policy for an access + */ + H5PB__UPDATE_RP_FOR_ACCESS(pb_ptr, entry_ptr, FAIL) } done: @@ -3054,14 +3586,16 @@ done: * one page, and vfd_swmr_writer is TRUE, the write must * buffered in the page buffer until the end of the tick. * - * Create a multi-page metadata entry in the page buffer - * and copy the write into it. Insert the new entry in - * the tick list. + * If it doesn't exist already, create a multi-page metadata + * entry in the page buffer and copy the write into it. + * Insert the new entry in the tick list if necessary. * * Test to see if the write of the multi-page metadata * entry must be delayed. If so, place the entry in - * the delayed write list. Otherwise, write the multi-page - * metadata entry to the HDF5 file.
+ * the delayed write list. Otherwise, the multi-page + * metadata entry will be written to the HDF5 file and + * evicted when the tick list is released at the end of the + * tick. * * 8) If the write is of metadata, and the write is of size * less than or equal to the page size, write the data @@ -3154,6 +3688,8 @@ H5PB__write_meta(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, * test to see if it should be, and move it from the * LRU to the delayed write list and set the delay_write_until * field appropriately. + * + * This is done via the call to H5PB__mark_entry_dirty() */ HDassert(pb_ptr->vfd_swmr_writer); HDassert(addr == page_addr); @@ -3177,6 +3713,11 @@ H5PB__write_meta(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ "can't create new page buffer page") + + /* set entry_ptr->loaded to TRUE so as to trigger the + * delayed write test in H5PB__mark_entry_dirty(). + */ + entry_ptr->loaded = TRUE; } /* at this point, one way or the other, the multi-page metadata @@ -3191,25 +3732,19 @@ H5PB__write_meta(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, HDmemcpy((uint8_t *)(entry_ptr->image_ptr), buf, size); /* mark the entry dirty */ - if ( H5PB__mark_entry_dirty(pb_ptr, entry_ptr) < 0 ) + if ( H5PB__mark_entry_dirty(f, pb_ptr, entry_ptr) < 0 ) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ "mark entry dirty failed (1)") - /* insert in tick list if not there already */ if ( ! ( entry_ptr->modified_this_tick ) ) { + entry_ptr->modified_this_tick = TRUE; + H5PB__INSERT_IN_TL(pb_ptr, entry_ptr, FAIL) } - /* Test to see if we must delay the write of the multi-page - * metadata entry, and move it from the LRU to the delayed write - * list if so.
- */ - - /* Write function for this -- assert false for now */ - HDassert(FALSE); } else { /* case 8) metadata write of size no larger than page size */ @@ -3267,7 +3802,7 @@ H5PB__write_meta(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, HDmemcpy(((uint8_t *)(entry_ptr->image_ptr) + offset), (const uint8_t *)buf, size); - if ( H5PB__mark_entry_dirty(pb_ptr, entry_ptr) < 0 ) + if ( H5PB__mark_entry_dirty(f, pb_ptr, entry_ptr) < 0 ) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ "mark entry dirty failed (2)") @@ -3280,16 +3815,10 @@ H5PB__write_meta(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, */ if ( ! ( entry_ptr->modified_this_tick ) ) { + entry_ptr->modified_this_tick = TRUE; + H5PB__INSERT_IN_TL(pb_ptr, entry_ptr, FAIL) } - - /* Test to see if we must delay the write of the multi-page - * metadata entry, and move it from the LRU to the delayed write - * list if so. - */ - - /* Write function for this -- assert false for now */ - HDassert(FALSE); } } @@ -3491,7 +4020,7 @@ H5PB__write_raw(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, HDmemcpy((uint8_t *)entry_ptr->image_ptr + offset, buf, pb_ptr->page_size - (size_t)offset); - if ( H5PB__mark_entry_dirty(pb_ptr, entry_ptr) < 0 ) + if ( H5PB__mark_entry_dirty(f, pb_ptr, entry_ptr) < 0 ) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ "mark entry dirty failed (1)") @@ -3512,7 +4041,7 @@ H5PB__write_raw(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, (const uint8_t *)buf + offset, (size_t)((addr + size) - last_page_addr)); - if ( H5PB__mark_entry_dirty(pb_ptr, entry_ptr) < 0 ) + if ( H5PB__mark_entry_dirty(f, pb_ptr, entry_ptr) < 0 ) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ "mark entry dirty failed (2)") @@ -3574,7 +4103,7 @@ H5PB__write_raw(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, HDmemcpy(((uint8_t *)(entry_ptr->image_ptr)) + offset, (const uint8_t *)buf, length); - if ( H5PB__mark_entry_dirty(pb_ptr, entry_ptr) < 0 ) + if ( H5PB__mark_entry_dirty(f, pb_ptr, entry_ptr) < 0 
) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ "mark entry dirty failed (3)") @@ -3610,7 +4139,7 @@ H5PB__write_raw(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, HDmemcpy((uint8_t *)(entry_ptr->image_ptr), ((const uint8_t *)(buf) + offset), length); - if ( H5PB__mark_entry_dirty(pb_ptr, entry_ptr) < 0 ) + if ( H5PB__mark_entry_dirty(f, pb_ptr, entry_ptr) < 0 ) HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ "mark entry dirty failed (3)") diff --git a/src/H5PBpkg.h b/src/H5PBpkg.h index e71396a..c6d13db 100644 --- a/src/H5PBpkg.h +++ b/src/H5PBpkg.h @@ -69,25 +69,26 @@ #if H5PB__DO_SANITY_CHECKS -#define H5PB__DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ -if ( ( (head_ptr) == NULL ) || \ - ( (tail_ptr) == NULL ) || \ - ( (entry_ptr) == NULL ) || \ - ( (len) <= 0 ) || \ - ( (size_t)(Size) < (entry_ptr)->size ) || \ - ( ( (entry_ptr)->prev == NULL ) && ( (head_ptr) != (entry_ptr) ) ) || \ - ( ( (entry_ptr)->next == NULL ) && ( (tail_ptr) != (entry_ptr) ) ) || \ - ( ( (len) == 1 ) && \ - ( ! ( ( (head_ptr) == (entry_ptr) ) && \ - ( (tail_ptr) == (entry_ptr) ) && \ - ( (entry_ptr)->next == NULL ) && \ - ( (entry_ptr)->prev == NULL ) && \ - ( (Size) == (int64_t)((entry_ptr)->size) ) \ - ) \ - ) \ - ) \ - ) { \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "DLL pre remove SC failed") \ +#define H5PB__DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ +if ( ( (head_ptr) == NULL ) || \ + ( (tail_ptr) == NULL ) || \ + ( (entry_ptr) == NULL ) || \ + ( (len) <= 0 ) || \ + ( (Size) < (int64_t)((entry_ptr)->size ) ) || \ + ( ( (Size) == (int64_t)((entry_ptr)->size) ) && ( ! ( (len) == 1 ) ) ) || \ + ( ( (entry_ptr)->prev == NULL ) && ( (head_ptr) != (entry_ptr) ) ) || \ + ( ( (entry_ptr)->next == NULL ) && ( (tail_ptr) != (entry_ptr) ) ) || \ + ( ( (len) == 1 ) && \ + ( ! 
( ( (head_ptr) == (entry_ptr) ) && \ + ( (tail_ptr) == (entry_ptr) ) && \ + ( (entry_ptr)->next == NULL ) && \ + ( (entry_ptr)->prev == NULL ) && \ + ( (Size) == (int64_t)((entry_ptr)->size) ) \ + ) \ + ) \ + ) \ + ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "DLL pre remove SC failed") \ } #define H5PB__DLL_SC(head_ptr, tail_ptr, len, Size, fv) \ @@ -107,7 +108,7 @@ if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ ) \ ) \ ) { \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "DLL sanity check failed") \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "DLL sanity check failed") \ } #define H5PB__DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ @@ -128,7 +129,7 @@ if ( ( (entry_ptr) == NULL ) || \ ) \ ) \ ) { \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "DLL pre insert SC failed") \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "DLL pre insert SC failed") \ } #else /* H5PB__DO_SANITY_CHECKS */ @@ -238,25 +239,146 @@ if ( ( (entry_ptr) == NULL ) || \ #if H5PB__DO_SANITY_CHECKS +#define H5PB__IL_DLL_PRE_REMOVE_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \ +if ( ( (hd_ptr) == NULL ) || \ + ( (tail_ptr) == NULL ) || \ + ( (entry_ptr) == NULL ) || \ + ( (len) <= 0 ) || \ + ( (Size) < (int64_t)((entry_ptr)->size) ) || \ + ( ( (Size) == (int64_t)((entry_ptr)->size) ) && \ + ( ! ( (len) == 1 ) ) ) || \ + ( ( (entry_ptr)->il_prev == NULL ) && ( (hd_ptr) != (entry_ptr) ) ) || \ + ( ( (entry_ptr)->il_next == NULL ) && ( (tail_ptr) != (entry_ptr) ) ) || \ + ( ( (len) == 1 ) && \ + ( ! 
( ( (hd_ptr) == (entry_ptr) ) && ( (tail_ptr) == (entry_ptr) ) && \ + ( (entry_ptr)->il_next == NULL ) && \ + ( (entry_ptr)->il_prev == NULL ) && \ + ( (Size) == (int64_t)((entry_ptr)->size) ) \ + ) \ + ) \ + ) \ + ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "il DLL pre remove SC failed") \ +} + +#define H5PB__IL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \ +if ( ( (entry_ptr) == NULL ) || \ + ( (entry_ptr)->il_next != NULL ) || \ + ( (entry_ptr)->il_prev != NULL ) || \ + ( ( ( (hd_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ + ( (hd_ptr) != (tail_ptr) ) \ + ) || \ + ( ( (len) == 1 ) && \ + ( ( (hd_ptr) != (tail_ptr) ) || ( (Size) <= 0 ) || \ + ( (hd_ptr) == NULL ) || ( (int64_t)((hd_ptr)->size) != (Size) ) \ + ) \ + ) || \ + ( ( (len) >= 1 ) && \ + ( ( (hd_ptr) == NULL ) || ( (hd_ptr)->il_prev != NULL ) || \ + ( (tail_ptr) == NULL ) || ( (tail_ptr)->il_next != NULL ) \ + ) \ + ) \ + ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "IL DLL pre insert SC failed") \ +} + +#define H5PB__IL_DLL_SC(head_ptr, tail_ptr, len, Size, fv) \ +if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ + ( (head_ptr) != (tail_ptr) ) \ + ) || \ + ( ( (len) == 1 ) && \ + ( ( (head_ptr) != (tail_ptr) ) || \ + ( (head_ptr) == NULL ) || ( (int64_t)((head_ptr)->size) != (Size) ) \ + ) \ + ) || \ + ( ( (len) >= 1 ) && \ + ( ( (head_ptr) == NULL ) || ( (head_ptr)->il_prev != NULL ) || \ + ( (tail_ptr) == NULL ) || ( (tail_ptr)->il_next != NULL ) \ + ) \ + ) \ + ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "IL DLL sanity check failed") \ +} + +#else /* H5PB__DO_SANITY_CHECKS */ + +#define H5PB__IL_DLL_PRE_REMOVE_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) +#define H5PB__IL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) +#define H5PB__IL_DLL_SC(head_ptr, tail_ptr, len, Size, fv) + +#endif /* H5PB__DO_SANITY_CHECKS */ + + +#define H5PB__IL_DLL_APPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val)\ +{ \ + 
H5PB__IL_DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \ + fail_val) \ + if ( (head_ptr) == NULL ) \ + { \ + (head_ptr) = (entry_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + else \ + { \ + (tail_ptr)->il_next = (entry_ptr); \ + (entry_ptr)->il_prev = (tail_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + (len)++; \ + (Size) += (int64_t)((entry_ptr)->size); \ + H5PB__IL_DLL_SC(head_ptr, tail_ptr, len, Size, fail_val) \ +} /* H5PB__IL_DLL_APPEND() */ + +#define H5PB__IL_DLL_REMOVE(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ +{ \ + H5PB__IL_DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ + { \ + if ( (head_ptr) == (entry_ptr) ) \ + { \ + (head_ptr) = (entry_ptr)->il_next; \ + if ( (head_ptr) != NULL ) \ + (head_ptr)->il_prev = NULL; \ + } \ + else \ + (entry_ptr)->il_prev->il_next = (entry_ptr)->il_next; \ + if ( (tail_ptr) == (entry_ptr) ) \ + { \ + (tail_ptr) = (entry_ptr)->il_prev; \ + if ( (tail_ptr) != NULL ) \ + (tail_ptr)->il_next = NULL; \ + } \ + else \ + (entry_ptr)->il_next->il_prev = (entry_ptr)->il_prev; \ + entry_ptr->il_next = NULL; \ + entry_ptr->il_prev = NULL; \ + (len)--; \ + (Size) -= (int64_t)((entry_ptr)->size); \ + } \ + H5PB__IL_DLL_SC(head_ptr, tail_ptr, len, Size, fv) \ +} /* H5PB__IL_DLL_REMOVE() */ + + +#if H5PB__DO_SANITY_CHECKS + #define H5PB__TL_DLL_PRE_REMOVE_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \ if ( ( (hd_ptr) == NULL ) || \ ( (tail_ptr) == NULL ) || \ ( (entry_ptr) == NULL ) || \ ( (len) <= 0 ) || \ - ( (Size) < (entry_ptr)->size ) || \ - ( ( (Size) == (entry_ptr)->size ) && ( ! ( (len) == 1 ) ) ) || \ + ( (Size) < (int64_t)((entry_ptr)->size ) ) || \ + ( ( (Size) == (int64_t)((entry_ptr)->size) ) && ( ! ( (len) == 1 ) ) ) || \ ( ( (entry_ptr)->tl_prev == NULL ) && ( (hd_ptr) != (entry_ptr) ) ) || \ ( ( (entry_ptr)->tl_next == NULL ) && ( (tail_ptr) != (entry_ptr) ) ) || \ ( ( (len) == 1 ) && \ ( ! 
( ( (hd_ptr) == (entry_ptr) ) && ( (tail_ptr) == (entry_ptr) ) && \ ( (entry_ptr)->tl_next == NULL ) && \ - ( (entry_ptr)->tlx_prev == NULL ) && \ - ( (Size) == (entry_ptr)->size ) \ + ( (entry_ptr)->tl_prev == NULL ) && \ + ( (Size) == (int64_t)((entry_ptr)->size) ) \ ) \ ) \ ) \ ) { \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "TL DLL pre remove SC failed") \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "TL DLL pre remove SC failed") \ } #define H5PB__TL_DLL_SC(head_ptr, tail_ptr, len, Size, fv) \ @@ -276,7 +398,7 @@ if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ ) \ ) \ ) { \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "TL DLL sanity check failed") \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "TL DLL sanity check failed") \ } #define H5PB__TL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \ @@ -297,7 +419,7 @@ if ( ( (entry_ptr) == NULL ) || \ ) \ ) \ ) { \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "TL DLL pre insert SC failed") \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, (fv), "TL DLL pre insert SC failed") \ } #else /* H5PB__DO_SANITY_CHECKS */ @@ -369,7 +491,7 @@ if ( ( (entry_ptr) == NULL ) || \ entry_ptr->tl_next = NULL; \ entry_ptr->tl_prev = NULL; \ (len)--; \ - (Size) -= entry_ptr->size; \ + (Size) -= (int64_t)(entry_ptr->size); \ } \ } /* H5PB__TL_DLL_REMOVE() */ @@ -411,16 +533,24 @@ if ( ( (entry_ptr) == NULL ) || \ ((pb_ptr)->misses[ii])++; \ } /* H5PB__UPDATE_PB_HIT_RATE_STATS */ -#define H5PB__UPDATE_HT_SIZE_STATS(pb_ptr) \ - if ( (pb_ptr)->index_len > (pb_ptr)->max_index_len ) \ - (pb_ptr)->max_index_len = (pb_ptr)->index_len; \ - if ( (pb_ptr)->index_size > (pb_ptr)->max_index_size ) \ - (pb_ptr)->max_index_size = (pb_ptr)->index_size; \ - if ( (pb_ptr)->curr_md_pages > (pb_ptr)->max_md_pages ) \ - (pb_ptr)->max_md_pages = (pb_ptr)->curr_md_pages; \ - if ( (pb_ptr)->curr_rd_pages > (pb_ptr)->max_rd_pages ) \ - (pb_ptr)->max_rd_pages = (pb_ptr)->curr_rd_pages; \ - if ( (pb_ptr)->mpmde_count > 
(pb_ptr)->max_mpmde_count ) \ +#define H5PB__UPDATE_HT_SIZE_STATS(pb_ptr) \ + if ( (pb_ptr)->index_len > (pb_ptr)->max_index_len ) \ + (pb_ptr)->max_index_len = (pb_ptr)->index_len; \ + if ( (pb_ptr)->clean_index_len > (pb_ptr)->max_clean_index_len ) \ + (pb_ptr)->max_clean_index_len = (pb_ptr)->clean_index_len; \ + if ( (pb_ptr)->dirty_index_len > (pb_ptr)->max_dirty_index_len ) \ + (pb_ptr)->max_dirty_index_len = (pb_ptr)->dirty_index_len; \ + if ( (pb_ptr)->index_size > (pb_ptr)->max_index_size ) \ + (pb_ptr)->max_index_size = (pb_ptr)->index_size; \ + if ( (pb_ptr)->clean_index_size > (pb_ptr)->max_clean_index_size ) \ + (pb_ptr)->max_clean_index_size = (pb_ptr)->clean_index_size; \ + if ( (pb_ptr)->dirty_index_size > (pb_ptr)->max_dirty_index_size ) \ + (pb_ptr)->max_dirty_index_size = (pb_ptr)->dirty_index_size; \ + if ( (pb_ptr)->curr_md_pages > (pb_ptr)->max_md_pages ) \ + (pb_ptr)->max_md_pages = (pb_ptr)->curr_md_pages; \ + if ( (pb_ptr)->curr_rd_pages > (pb_ptr)->max_rd_pages ) \ + (pb_ptr)->max_rd_pages = (pb_ptr)->curr_rd_pages; \ + if ( (pb_ptr)->mpmde_count > (pb_ptr)->max_mpmde_count ) \ - (pb_ptr)->max_rd_pages = (pb_ptr)->curr_rd_pages; + (pb_ptr)->max_mpmde_count = (pb_ptr)->mpmde_count; #define H5PB__UPDATE_STATS_FOR_HT_INSERTION(pb_ptr) \ @@ -485,9 +615,9 @@ if ( ( (entry_ptr) == NULL ) || \ #define H5PB__UPDATE_DWL_DELAYED_WRITES(pb_ptr, insertion_depth, delay) \ { \ HDassert((pb_ptr)->vfd_swmr_writer); \ - (pb_ptr)delayed_writes++; \ - (pb_ptr)total_delay += delay; \ - (pb_ptr)total_dwl_ins_depth += (insertion_depth) \ + (pb_ptr)->delayed_writes++; \ + (pb_ptr)->total_delay += (int64_t)(delay); \ + (pb_ptr)->total_dwl_ins_depth += (insertion_depth); \ } @@ -685,221 +815,426 @@ if ( ( (entry_ptr) == NULL ) || \ #if H5PB__DO_SANITY_CHECKS -#define H5PB__PRE_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \ -if ( ( (pb_ptr) == NULL ) || \ - ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ - ( (entry_ptr) == NULL ) || \ - ( (entry_ptr)->magic != H5PB__H5PB_ENTRY_T_MAGIC ) || \ - ( (entry_ptr)->ht_next !=
NULL ) || \ - ( (entry_ptr)->ht_prev != NULL ) || \ - ( (entry_ptr)->size < pb_ptr->page_size ) || \ - ( H5PB__HASH_FCN((entry_ptr)->page) < 0 ) || \ - ( H5PB__HASH_FCN((entry_ptr)->page) >= H5PB__HASH_TABLE_LEN ) || \ - ( (pb_ptr)->index_len < 0 ) || \ - ( (pb_ptr)->index_size < 0 ) || \ - ( (pb_ptr)->curr_pages < 0 ) || \ - ( (pb_ptr)->curr_rd_pages < 0 ) || \ - ( (pb_ptr)->curr_md_pages < 0 ) || \ - ( ((pb_ptr)->curr_pages != \ - ((pb_ptr)->curr_md_pages + (pb_ptr)->curr_rd_pages)) ) || \ - ( (pb_ptr)->mpmde_count < 0 ) || \ - ( (pb_ptr)->index_len != \ - ((pb_ptr)->curr_pages + (pb_ptr)->mpmde_count) ) ) { \ - HDassert(FALSE); \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, "pre HT insert SC failed") \ +#define H5PB__PRE_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \ +if ( ( (pb_ptr) == NULL ) || \ + ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ + ( (entry_ptr) == NULL ) || \ + ( (entry_ptr)->ht_next != NULL ) || \ + ( (entry_ptr)->ht_prev != NULL ) || \ + ( (entry_ptr)->size <= 0 ) || \ + ( H5PB__HASH_FCN((entry_ptr)->page) < 0 ) || \ + ( H5PB__HASH_FCN((entry_ptr)->page) >= H5PB__HASH_TABLE_LEN ) || \ + ( (pb_ptr)->index_size != \ + ((pb_ptr)->clean_index_size + \ + (pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_len != (pb_ptr)->il_len ) || \ + ( (pb_ptr)->index_size != (pb_ptr)->il_size ) || \ + ( (pb_ptr)->curr_pages < 0 ) || \ + ( (pb_ptr)->curr_rd_pages < 0 ) || \ + ( (pb_ptr)->curr_md_pages < 0 ) || \ + ( ((pb_ptr)->curr_pages != \ + ((pb_ptr)->curr_md_pages + (pb_ptr)->curr_rd_pages)) ) || \ + ( (pb_ptr)->mpmde_count < 0 ) || \ + ( (pb_ptr)->index_len != \ + ((pb_ptr)->curr_pages + (pb_ptr)->mpmde_count) ) ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, fail_val, "pre HT insert SC failed") \ } -#define H5PB__POST_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \ -if ( ( (pb_ptr) == NULL ) || \ - ( (pb_ptr)->magic != 
H5PB__H5PB_T_MAGIC ) || \ - ( (entry_ptr)->magic != H5PB__H5PB_ENTRY_T_MAGIC ) || \ - ( (pb_ptr)->index_len < 1 ) || \ - ( (pb_ptr)->index_len != \ - ((pb_ptr)->curr_pages + (pb_ptr)->mpmde_count) ) || \ - ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size) ) ) { \ - HDassert(FALSE); \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, "post HT insert SC failed") \ +#define H5PB__POST_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \ +if ( ( (pb_ptr) == NULL ) || \ + ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ + ( (pb_ptr)->index_size != \ + ((pb_ptr)->clean_index_size + \ + (pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_len != (pb_ptr)->il_len ) || \ + ( (pb_ptr)->index_len != \ + ((pb_ptr)->curr_pages + (pb_ptr)->mpmde_count) ) || \ + ( (pb_ptr)->index_size != (pb_ptr)->il_size) ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, fail_val, "post HT insert SC failed") \ } -#define H5PB__PRE_HT_REMOVE_SC(pb_ptr, entry_ptr) \ -if ( ( (pb_ptr) == NULL ) || \ - ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ - ( (pb_ptr)->index_len < 1 ) || \ - ( (entry_ptr) == NULL ) || \ - ( (entry_ptr)->magic != H5PB__H5PB_ENTRY_T_MAGIC ) || \ - ( (entry_ptr)->size < pb_ptr->page_size ) || \ - ( (pb_ptr)->index_len < 1 ) || \ - ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size) ) || \ - ( ((pb_ptr)->ht)[(H5PB__HASH_FCN((entry_ptr)->page))] \ - == NULL ) || \ - ( ( ((pb_ptr)->ht)[(H5PB__HASH_FCN((entry_ptr)->page))] \ - != (entry_ptr) ) && \ - ( (entry_ptr)->ht_prev == NULL ) ) || \ - ( ( ((pb_ptr)->ht)[(H5PB__HASH_FCN((entry_ptr)->page))] == \ - (entry_ptr) ) && \ - ( (entry_ptr)->ht_prev != NULL ) ) ) { \ - HDassert(FALSE); \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "pre HT remove SC failed") \ +#define H5PB__PRE_HT_REMOVE_SC(pb_ptr, entry_ptr) \ +if ( ( (pb_ptr) == NULL ) || \ + ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ + ( (pb_ptr)->index_len < 1 ) 
|| \ + ( (entry_ptr) == NULL ) || \ + ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size) ) || \ + ( (entry_ptr)->size <= 0 ) || \ + ( H5PB__HASH_FCN((entry_ptr)->page) < 0 ) || \ + ( H5PB__HASH_FCN((entry_ptr)->page) >= H5PB__HASH_TABLE_LEN ) || \ + ( ((pb_ptr)->ht)[(H5PB__HASH_FCN((entry_ptr)->page))] \ + == NULL ) || \ + ( ( ((pb_ptr)->ht)[(H5PB__HASH_FCN((entry_ptr)->page))] \ + != (entry_ptr) ) && \ + ( (entry_ptr)->ht_prev == NULL ) ) || \ + ( ( ((pb_ptr)->ht)[(H5PB__HASH_FCN((entry_ptr)->page))] == \ + (entry_ptr) ) && \ + ( (entry_ptr)->ht_prev != NULL ) ) || \ + ( (pb_ptr)->index_size != \ + ((pb_ptr)->clean_index_size + \ + (pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_len != (pb_ptr)->il_len ) || \ + ( (pb_ptr)->index_size != (pb_ptr)->il_size ) ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "pre HT remove SC failed") \ } -#define H5PB__POST_HT_REMOVE_SC(pb_ptr, entry_ptr) \ -if ( ( (pb_ptr) == NULL ) || \ - ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ - ( (entry_ptr) == NULL ) || \ - ( (entry_ptr)->magic != H5PB__H5PB_ENTRY_T_MAGIC ) || \ - ( (entry_ptr)->size < (pb_ptr)->page_size ) || \ - ( (entry_ptr)->ht_prev != NULL ) || \ - ( (entry_ptr)->ht_prev != NULL ) || \ - ( (pb_ptr)->index_len < 0 ) || \ - ( (pb_ptr)->index_size < 0 ) || \ - ( (pb_ptr)->curr_pages < 0 ) || \ - ( (pb_ptr)->curr_rd_pages < 0 ) || \ - ( (pb_ptr)->curr_md_pages < 0 ) || \ - ( ((pb_ptr)->curr_pages != \ - ((pb_ptr)->curr_md_pages + (pb_ptr)->curr_rd_pages)) ) || \ - ( (pb_ptr)->mpmde_count < 0 ) || \ - ( (pb_ptr)->index_len != \ - ((pb_ptr)->curr_pages + (pb_ptr)->mpmde_count) ) ) { \ - HDassert(FALSE); \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "post HT remove SC failed") \ +#define H5PB__POST_HT_REMOVE_SC(pb_ptr, entry_ptr) \ +if ( ( (pb_ptr) == NULL ) || \ + ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ + ( (entry_ptr) == NULL ) || 
\ + ( (entry_ptr)->size <= 0 ) || \ + ( (entry_ptr)->ht_next != NULL ) || \ + ( (entry_ptr)->ht_prev != NULL ) || \ + ( (pb_ptr)->index_size != \ + ((pb_ptr)->clean_index_size + \ + (pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_len != (pb_ptr)->il_len ) || \ + ( (pb_ptr)->index_size != (pb_ptr)->il_size ) || \ + ( (pb_ptr)->curr_pages < 0 ) || \ + ( (pb_ptr)->curr_rd_pages < 0 ) || \ + ( (pb_ptr)->curr_md_pages < 0 ) || \ + ( ((pb_ptr)->curr_pages != \ + ((pb_ptr)->curr_md_pages + (pb_ptr)->curr_rd_pages)) ) || \ + ( (pb_ptr)->mpmde_count < 0 ) || \ + ( (pb_ptr)->index_len != \ + ((pb_ptr)->curr_pages + (pb_ptr)->mpmde_count) ) ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, "post HT remove SC failed") \ } -#define H5PB__PRE_HT_SEARCH_SC(pb_ptr, page, fail_val) \ -if ( ( (pb_ptr) == NULL ) || \ - ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ - ( H5PB__HASH_FCN(page) < 0 ) || \ - ( H5PB__HASH_FCN(page) >= H5PB__HASH_TABLE_LEN ) ) { \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, "pre HT search SC failed") \ +#define H5PB__PRE_HT_SEARCH_SC(pb_ptr, page, fail_val) \ +if ( ( (pb_ptr) == NULL ) || \ + ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ + ( (pb_ptr)->index_size != \ + ((pb_ptr)->clean_index_size + (pb_ptr)->dirty_index_size) ) || \ + ( H5PB__HASH_FCN(page) < 0 ) || \ + ( H5PB__HASH_FCN(page) >= H5PB__HASH_TABLE_LEN ) ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, fail_val, "pre HT search SC failed") \ } -#define H5PB__POST_SUC_HT_SEARCH_SC(pb_ptr, entry_ptr, k, fail_val) \ -if ( ( (pb_ptr) == NULL ) || \ - ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ - ( (pb_ptr)->index_len < 1 ) || \ - ( (entry_ptr) == NULL ) || \ - ( (entry_ptr)->magic != H5PB__H5PB_ENTRY_T_MAGIC ) || \ - ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size) ) || \ - ( (pb_ptr)->index_len < 1 ) || \ - ( (entry_ptr)->size < (pb_ptr)->page_size ) || \ - (
( k < 0 ) || ( k >= H5PB__HASH_TABLE_LEN ) ) || \ - ( ((pb_ptr)->ht)[k] == NULL ) || \ - ( ( ((pb_ptr)->ht)[k] != (entry_ptr) ) && \ - ( (entry_ptr)->ht_prev == NULL ) ) || \ - ( ( ((pb_ptr)->ht)[k] == (entry_ptr) ) && \ - ( (entry_ptr)->ht_prev != NULL ) ) || \ - ( ( (entry_ptr)->ht_prev != NULL ) && \ - ( (entry_ptr)->ht_prev->ht_next != (entry_ptr) ) ) || \ - ( ( (entry_ptr)->ht_next != NULL ) && \ - ( (entry_ptr)->ht_next->ht_prev != (entry_ptr) ) ) ) { \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, \ - "post successful HT search SC failed") \ +#define H5PB__POST_SUC_HT_SEARCH_SC(pb_ptr, entry_ptr, k, fail_val) \ +if ( ( (pb_ptr) == NULL ) || \ + ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ + ( (pb_ptr)->index_len < 1 ) || \ + ( (entry_ptr) == NULL ) || \ + ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size )) || \ + ( (pb_ptr)->index_size != \ + ((pb_ptr)->clean_index_size + (pb_ptr)->dirty_index_size) ) || \ + ( (entry_ptr)->size <= 0 ) || \ + ( ((pb_ptr)->ht)[k] == NULL ) || \ + ( ( ((pb_ptr)->ht)[k] != (entry_ptr) ) && \ + ( (entry_ptr)->ht_prev == NULL ) ) || \ + ( ( ((pb_ptr)->ht)[k] == (entry_ptr) ) && \ + ( (entry_ptr)->ht_prev != NULL ) ) || \ + ( ( (entry_ptr)->ht_prev != NULL ) && \ + ( (entry_ptr)->ht_prev->ht_next != (entry_ptr) ) ) || \ + ( ( (entry_ptr)->ht_next != NULL ) && \ + ( (entry_ptr)->ht_next->ht_prev != (entry_ptr) ) ) ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, fail_val, \ + "post successful HT search SC failed") \ } #define H5PB__POST_HT_SHIFT_TO_FRONT_SC(pb_ptr, entry_ptr, k, fail_val) \ if ( ( (pb_ptr) == NULL ) || \ ( ((pb_ptr)->ht)[k] != (entry_ptr) ) || \ ( (entry_ptr)->ht_prev != NULL ) ) { \ - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, fail_val, \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, fail_val, \ "post HT shift to front SC failed") \ } +#define H5PB__PRE_HT_ENTRY_SIZE_CHANGE_SC(pb_ptr, old_size, new_size, \ + entry_ptr, was_clean) \ +if ( ( (pb_ptr) == NULL ) || \ + ( (pb_ptr)->index_len <= 0 ) || \ + ( (pb_ptr)->index_size <= 
0 ) || \ + ( (new_size) <= 0 ) || \ + ( (old_size) > (pb_ptr)->index_size ) || \ + ( ( (pb_ptr)->index_len == 1 ) && \ + ( (pb_ptr)->index_size != (old_size) ) ) || \ + ( (pb_ptr)->index_size != \ + ((pb_ptr)->clean_index_size + \ + (pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) || \ + ( ( !( was_clean ) || \ + ( (pb_ptr)->clean_index_size < (old_size) ) ) && \ + ( ( (was_clean) ) || \ + ( (pb_ptr)->dirty_index_size < (old_size) ) ) ) || \ + ( (entry_ptr) == NULL ) || \ + ( (pb_ptr)->index_len != (pb_ptr)->il_len ) || \ + ( (pb_ptr)->index_size != (pb_ptr)->il_size ) ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "pre HT entry size change SC failed") \ +} + +#define H5PB__POST_HT_ENTRY_SIZE_CHANGE_SC(pb_ptr, old_size, new_size, \ + entry_ptr) \ +if ( ( (pb_ptr) == NULL ) || \ + ( (pb_ptr)->index_len <= 0 ) || \ + ( (pb_ptr)->index_size <= 0 ) || \ + ( (new_size) > (pb_ptr)->index_size ) || \ + ( (pb_ptr)->index_size != \ + ((pb_ptr)->clean_index_size + \ + (pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) || \ + ( ( !((entry_ptr)->is_dirty ) || \ + ( (pb_ptr)->dirty_index_size < (new_size) ) ) && \ + ( ( ((entry_ptr)->is_dirty) ) || \ + ( (pb_ptr)->clean_index_size < (new_size) ) ) ) || \ + ( ( (pb_ptr)->index_len == 1 ) && \ + ( (pb_ptr)->index_size != (new_size) ) ) || \ + ( (pb_ptr)->index_len != (pb_ptr)->il_len ) || \ + ( (pb_ptr)->index_size != (pb_ptr)->il_size ) ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "post HT entry size change SC failed") \ +} + +#define H5PB__PRE_HT_UPDATE_FOR_ENTRY_CLEAN_SC(pb_ptr, entry_ptr) \ +if ( ( (pb_ptr) == NULL ) || \ + ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ + ( (pb_ptr)->index_len <= 0 ) || \ + ( (entry_ptr) == NULL ) || \ + ( (entry_ptr)->is_dirty != FALSE ) || \ + ( 
(pb_ptr)->index_size < (int64_t)((entry_ptr)->size) ) || \ + ( (pb_ptr)->dirty_index_size < (int64_t)((entry_ptr)->size) ) || \ + ( (pb_ptr)->index_size != \ + ((pb_ptr)->clean_index_size + (pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "pre HT update for entry clean SC failed") \ +} + +#define H5PB__PRE_HT_UPDATE_FOR_ENTRY_DIRTY_SC(pb_ptr, entry_ptr) \ +if ( ( (pb_ptr) == NULL ) || \ + ( (pb_ptr)->magic != H5PB__H5PB_T_MAGIC ) || \ + ( (pb_ptr)->index_len <= 0 ) || \ + ( (entry_ptr) == NULL ) || \ + ( (entry_ptr)->is_dirty != TRUE ) || \ + ( (pb_ptr)->index_size < (int64_t)((entry_ptr)->size) ) || \ + ( (pb_ptr)->clean_index_size < (int64_t)((entry_ptr)->size) ) || \ + ( (pb_ptr)->index_size != \ + ((pb_ptr)->clean_index_size + (pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "pre HT update for entry dirty SC failed") \ +} + +#define H5PB__POST_HT_UPDATE_FOR_ENTRY_CLEAN_SC(pb_ptr, entry_ptr) \ +if ( ( (pb_ptr)->index_size != \ + ((pb_ptr)->clean_index_size + (pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "post HT update for entry clean SC failed") \ +} + +#define H5PB__POST_HT_UPDATE_FOR_ENTRY_DIRTY_SC(pb_ptr, entry_ptr) \ +if ( ( (pb_ptr)->index_size != \ + ((pb_ptr)->clean_index_size + (pb_ptr)->dirty_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->clean_index_size) ) || \ + ( (pb_ptr)->index_size < ((pb_ptr)->dirty_index_size) ) ) { \ + HGOTO_ERROR(H5E_PAGEBUF, H5E_SYSTEM, FAIL, \ + "post HT update for entry dirty SC failed") \ +} + #else /* H5PB__DO_SANITY_CHECKS */ 
#define H5PB__PRE_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) #define H5PB__POST_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) #define H5PB__PRE_HT_REMOVE_SC(pb_ptr, entry_ptr) #define H5PB__POST_HT_REMOVE_SC(pb_ptr, entry_ptr) -#define H5PB__PRE_HT_SEARCH_SC(pb_ptr, page, fail_val) +#define H5PB__PRE_HT_SEARCH_SC(pb_ptr, Addr, fail_val) #define H5PB__POST_SUC_HT_SEARCH_SC(pb_ptr, entry_ptr, k, fail_val) #define H5PB__POST_HT_SHIFT_TO_FRONT_SC(pb_ptr, entry_ptr, k, fail_val) +#define H5PB__PRE_HT_UPDATE_FOR_ENTRY_CLEAN_SC(pb_ptr, entry_ptr) +#define H5PB__PRE_HT_UPDATE_FOR_ENTRY_DIRTY_SC(pb_ptr, entry_ptr) +#define H5PB__PRE_HT_ENTRY_SIZE_CHANGE_SC(pb_ptr, old_size, new_size, \ + entry_ptr, was_clean) +#define H5PB__POST_HT_ENTRY_SIZE_CHANGE_SC(pb_ptr, old_size, new_size, \ + entry_ptr) +#define H5PB__POST_HT_UPDATE_FOR_ENTRY_CLEAN_SC(pb_ptr, entry_ptr) +#define H5PB__POST_HT_UPDATE_FOR_ENTRY_DIRTY_SC(pb_ptr, entry_ptr) #endif /* H5PB__DO_SANITY_CHECKS */ -#define H5PB__INSERT_IN_INDEX(pb_ptr, entry_ptr, fail_val) \ + +#define H5PB__INSERT_IN_INDEX(pb_ptr, entry_ptr, fail_val) \ +{ \ + int k; \ + H5PB__PRE_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \ + k = H5PB__HASH_FCN((entry_ptr)->page); \ + if(((pb_ptr)->ht)[k] != NULL) { \ + (entry_ptr)->ht_next = ((pb_ptr)->ht)[k]; \ + (entry_ptr)->ht_next->ht_prev = (entry_ptr); \ + } \ + ((pb_ptr)->ht)[k] = (entry_ptr); \ + (pb_ptr)->index_len++; \ + (pb_ptr)->index_size += (int64_t)((entry_ptr)->size); \ + if((entry_ptr)->is_dirty) { \ + (pb_ptr)->dirty_index_size += (int64_t)((entry_ptr)->size); \ + } else { \ + (pb_ptr)->clean_index_size += (int64_t)((entry_ptr)->size); \ + } \ + if ( (entry_ptr)->is_metadata ) { \ + if ( (entry_ptr)->is_mpmde ) { \ + ((pb_ptr)->mpmde_count)++; \ + } else { \ + ((pb_ptr)->curr_md_pages)++; \ + (pb_ptr)->curr_pages++; \ + } \ + } else { \ + ((pb_ptr)->curr_rd_pages)++; \ + (pb_ptr)->curr_pages++; \ + } \ + H5PB__IL_DLL_APPEND((entry_ptr), (pb_ptr)->il_head, \ + (pb_ptr)->il_tail, 
(pb_ptr)->il_len, \ + (pb_ptr)->il_size, fail_val) \ + H5PB__UPDATE_STATS_FOR_HT_INSERTION(pb_ptr) \ + H5PB__UPDATE_HT_SIZE_STATS(pb_ptr) \ + H5PB__POST_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \ +} + +#define H5PB__DELETE_FROM_INDEX(pb_ptr, entry_ptr, fail_val) \ +{ \ + int k; \ + H5PB__PRE_HT_REMOVE_SC(pb_ptr, entry_ptr) \ + k = H5PB__HASH_FCN((entry_ptr)->page); \ + if((entry_ptr)->ht_next) \ + (entry_ptr)->ht_next->ht_prev = (entry_ptr)->ht_prev; \ + if((entry_ptr)->ht_prev) \ + (entry_ptr)->ht_prev->ht_next = (entry_ptr)->ht_next; \ + if(((pb_ptr)->ht)[k] == (entry_ptr)) \ + ((pb_ptr)->ht)[k] = (entry_ptr)->ht_next; \ + (entry_ptr)->ht_next = NULL; \ + (entry_ptr)->ht_prev = NULL; \ + (pb_ptr)->index_len--; \ + (pb_ptr)->index_size -= (int64_t)((entry_ptr)->size); \ + if((entry_ptr)->is_dirty) { \ + (pb_ptr)->dirty_index_size -= (int64_t)((entry_ptr)->size); \ + } else { \ + (pb_ptr)->clean_index_size -= (int64_t)((entry_ptr)->size); \ + } \ + if ( (entry_ptr)->is_metadata ) { \ + if ( (entry_ptr)->is_mpmde ) { \ + ((pb_ptr)->mpmde_count)--; \ + } else { \ + ((pb_ptr)->curr_md_pages)--; \ + (pb_ptr)->curr_pages--; \ + } \ + } else { \ + ((pb_ptr)->curr_rd_pages)--; \ + (pb_ptr)->curr_pages--; \ + } \ + H5PB__IL_DLL_REMOVE((entry_ptr), (pb_ptr)->il_head, \ + (pb_ptr)->il_tail, (pb_ptr)->il_len, \ + (pb_ptr)->il_size, fail_val) \ + H5PB__UPDATE_STATS_FOR_HT_DELETION(pb_ptr) \ + H5PB__POST_HT_REMOVE_SC(pb_ptr, entry_ptr) \ +} + +#define H5PB__SEARCH_INDEX(pb_ptr, Page, entry_ptr, fail_val) \ +{ \ + int k; \ + int depth = 0; \ + H5PB__PRE_HT_SEARCH_SC(pb_ptr, Page, fail_val) \ + k = H5PB__HASH_FCN(Page); \ + entry_ptr = ((pb_ptr)->ht)[k]; \ + while(entry_ptr) { \ + if ( (Page) == (entry_ptr)->page ) { \ + H5PB__POST_SUC_HT_SEARCH_SC(pb_ptr, entry_ptr, k, fail_val) \ + if ( (entry_ptr) != ((pb_ptr)->ht)[k] ) { \ + if ( (entry_ptr)->ht_next ) \ + (entry_ptr)->ht_next->ht_prev = (entry_ptr)->ht_prev; \ + HDassert((entry_ptr)->ht_prev != NULL); \ + 
(entry_ptr)->ht_prev->ht_next = (entry_ptr)->ht_next; \ + ((pb_ptr)->ht)[k]->ht_prev = (entry_ptr); \ + (entry_ptr)->ht_next = ((pb_ptr)->ht)[k]; \ + (entry_ptr)->ht_prev = NULL; \ + ((pb_ptr)->ht)[k] = (entry_ptr); \ + H5PB__POST_HT_SHIFT_TO_FRONT_SC(pb_ptr, entry_ptr, k, fail_val)\ + } \ + break; \ + } \ + (entry_ptr) = (entry_ptr)->ht_next; \ + (depth)++; \ + } \ + H5PB__UPDATE_STATS_FOR_HT_SEARCH(pb_ptr, (entry_ptr != NULL), depth) \ +} + +#define H5PB__SEARCH_INDEX_NO_STATS(pb_ptr, Page, entry_ptr, fail_val) \ +{ \ + int k; \ + H5PB__PRE_HT_SEARCH_SC(pb_ptr, Page, fail_val) \ + k = H5PB__HASH_FCN(Page); \ + entry_ptr = ((pb_ptr)->ht)[k]; \ + while(entry_ptr) { \ + if ( (Page) == (entry_ptr)->page ) { \ + H5PB__POST_SUC_HT_SEARCH_SC(pb_ptr, entry_ptr, k, fail_val) \ + if ( entry_ptr != ((pb_ptr)->ht)[k] ) { \ + if( (entry_ptr)->ht_next ) \ + (entry_ptr)->ht_next->ht_prev = (entry_ptr)->ht_prev; \ + HDassert((entry_ptr)->ht_prev != NULL); \ + (entry_ptr)->ht_prev->ht_next = (entry_ptr)->ht_next; \ + ((pb_ptr)->ht)[k]->ht_prev = (entry_ptr); \ + (entry_ptr)->ht_next = ((pb_ptr)->ht)[k]; \ + (entry_ptr)->ht_prev = NULL; \ + ((pb_ptr)->ht)[k] = (entry_ptr); \ + H5PB__POST_HT_SHIFT_TO_FRONT_SC(pb_ptr, entry_ptr, k, fail_val)\ + } \ + break; \ + } \ + (entry_ptr) = (entry_ptr)->ht_next; \ + } \ +} + +#define H5PB__UPDATE_INDEX_FOR_ENTRY_CLEAN(pb_ptr, entry_ptr) \ { \ - int k; \ - H5PB__PRE_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \ - k = H5PB__HASH_FCN((entry_ptr)->page); \ - if(((pb_ptr)->ht)[k] != NULL) { \ - (entry_ptr)->ht_next = ((pb_ptr)->ht)[k]; \ - (entry_ptr)->ht_next->ht_prev = (entry_ptr); \ - } \ - ((pb_ptr)->ht)[k] = (entry_ptr); \ - (pb_ptr)->index_len++; \ - (pb_ptr)->index_size += (int64_t)((entry_ptr)->size); \ - if ( (entry_ptr)->is_metadata ) { \ - if ( (entry_ptr)->is_mpmde ) { \ - ((pb_ptr)->mpmde_count)++; \ - } else { \ - ((pb_ptr)->curr_md_pages)++; \ - (pb_ptr)->curr_pages++; \ - } \ - } else { \ - ((pb_ptr)->curr_rd_pages)++; \ -
(pb_ptr)->curr_pages++; \ - } \ - H5PB__UPDATE_STATS_FOR_HT_INSERTION(pb_ptr) \ - H5PB__UPDATE_HT_SIZE_STATS(pb_ptr) \ - H5PB__POST_HT_INSERT_SC(pb_ptr, entry_ptr, fail_val) \ + H5PB__PRE_HT_UPDATE_FOR_ENTRY_CLEAN_SC(pb_ptr, entry_ptr); \ + (pb_ptr)->dirty_index_size -= (int64_t)((entry_ptr)->size); \ + (pb_ptr)->clean_index_size += (int64_t)((entry_ptr)->size); \ + H5PB__POST_HT_UPDATE_FOR_ENTRY_CLEAN_SC(pb_ptr, entry_ptr); \ } -#define H5PB__DELETE_FROM_INDEX(pb_ptr, entry_ptr, fail_val) \ +#define H5PB__UPDATE_INDEX_FOR_ENTRY_DIRTY(pb_ptr, entry_ptr) \ { \ - int k; \ - H5PB__PRE_HT_REMOVE_SC(pb_ptr, entry_ptr) \ - k = H5PB__HASH_FCN((entry_ptr)->page); \ - if((entry_ptr)->ht_next) \ - (entry_ptr)->ht_next->ht_prev = (entry_ptr)->ht_prev; \ - if((entry_ptr)->ht_prev) \ - (entry_ptr)->ht_prev->ht_next = (entry_ptr)->ht_next; \ - if(((pb_ptr)->ht)[k] == (entry_ptr)) \ - ((pb_ptr)->ht)[k] = (entry_ptr)->ht_next; \ - (entry_ptr)->ht_next = NULL; \ - (entry_ptr)->ht_prev = NULL; \ - (pb_ptr)->index_len--; \ - (pb_ptr)->index_size -= (int64_t)((entry_ptr)->size); \ - if ( (entry_ptr)->is_metadata ) { \ - if ( (entry_ptr)->is_mpmde ) { \ - ((pb_ptr)->mpmde_count)--; \ - } else { \ - ((pb_ptr)->curr_md_pages)--; \ - (pb_ptr)->curr_pages--; \ - } \ - } else { \ - ((pb_ptr)->curr_rd_pages)--; \ - (pb_ptr)->curr_pages--; \ - } \ - H5PB__UPDATE_STATS_FOR_HT_DELETION(pb_ptr) \ - H5PB__POST_HT_REMOVE_SC(pb_ptr, entry_ptr) \ + H5PB__PRE_HT_UPDATE_FOR_ENTRY_DIRTY_SC(pb_ptr, entry_ptr); \ + (pb_ptr)->clean_index_size -= (int64_t)((entry_ptr)->size); \ + (pb_ptr)->dirty_index_size += (int64_t)((entry_ptr)->size); \ + H5PB__POST_HT_UPDATE_FOR_ENTRY_DIRTY_SC(pb_ptr, entry_ptr); \ } -#define H5PB__SEARCH_INDEX(pb_ptr, pg, entry_ptr, f_val) \ -{ \ - int k; \ - int depth = 0; \ - H5PB__PRE_HT_SEARCH_SC((pb_ptr), (pg), (f_val)) \ - k = H5PB__HASH_FCN((pg)); \ - entry_ptr = ((pb_ptr)->ht)[k]; \ - while ( entry_ptr ) { \ - if ( (pg) == (entry_ptr)->page ) { \ - 
H5PB__POST_SUC_HT_SEARCH_SC(pb_ptr, entry_ptr, k, f_val) \ - if ( entry_ptr != ((pb_ptr)->ht)[k] ) { \ - if ( (entry_ptr)->ht_next ) \ - (entry_ptr)->ht_next->ht_prev = (entry_ptr)->ht_prev; \ - HDassert((entry_ptr)->ht_prev != NULL); \ - (entry_ptr)->ht_prev->ht_next = (entry_ptr)->ht_next; \ - ((pb_ptr)->ht)[k]->ht_prev = (entry_ptr); \ - (entry_ptr)->ht_next = ((pb_ptr)->ht)[k]; \ - (entry_ptr)->ht_prev = NULL; \ - ((pb_ptr)->ht)[k] = (entry_ptr); \ - H5PB__POST_HT_SHIFT_TO_FRONT_SC(pb_ptr, entry_ptr, k, f_val) \ - } \ - break; \ - } \ - (entry_ptr) = (entry_ptr)->ht_next; \ - (depth)++; \ - } \ - H5PB__UPDATE_STATS_FOR_HT_SEARCH(pb_ptr, (entry_ptr != NULL), depth) \ +#define H5PB__UPDATE_INDEX_FOR_SIZE_CHANGE(pb_ptr, old_size, new_size, \ + entry_ptr, was_clean) \ +{ \ + H5PB__PRE_HT_ENTRY_SIZE_CHANGE_SC(pb_ptr, old_size, new_size, \ + entry_ptr, was_clean) \ + (pb_ptr)->index_size -= (old_size); \ + (pb_ptr)->index_size += (new_size); \ + if(was_clean) { \ + (pb_ptr)->clean_index_size -= (old_size); \ + } else { \ + (pb_ptr)->dirty_index_size -= (old_size); \ + } \ + if((entry_ptr)->is_dirty) { \ + (pb_ptr)->dirty_index_size += (new_size); \ + } else { \ + (pb_ptr)->clean_index_size += (new_size); \ + } \ + H5PB__DLL_UPDATE_FOR_SIZE_CHANGE((pb_ptr)->il_len, \ + (pb_ptr)->il_size, \ + (old_size), (new_size)) \ + H5PB__POST_HT_ENTRY_SIZE_CHANGE_SC(pb_ptr, old_size, new_size, \ + entry_ptr) \ } @@ -1072,7 +1407,7 @@ if ( ( (pb_ptr) == NULL ) || \ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \ HDassert( (entry_ptr) ); \ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \ - HDassert( (entry_ptr)->size >= pb_ptr->page_size ); \ + HDassert( (entry_ptr)->size == pb_ptr->page_size ); \ \ /* modified LRU specific code */ \ \ @@ -1217,7 +1552,7 @@ if ( ( (pb_ptr) == NULL ) || \ { \ HDassert( (pb_ptr) ); \ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \ - HDassert( (pb_ptr)->vfd_swmr_writer ) \ + HDassert( (pb_ptr)->vfd_swmr_writer ); \ HDassert( 
(entry_ptr) ); \ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \ HDassert( (entry_ptr)->modified_this_tick ); \ @@ -1295,7 +1630,7 @@ if ( ( (pb_ptr) == NULL ) || \ \ HDassert( (pb_ptr) ); \ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \ - HDassert( (pb_ptr)->vfd_swmr_writer ) \ + HDassert( (pb_ptr)->vfd_swmr_writer ); \ HDassert( (entry_ptr) ); \ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \ HDassert( (entry_ptr)->size >= pb_ptr->page_size ); \ @@ -1313,7 +1648,7 @@ if ( ( (pb_ptr) == NULL ) || \ \ H5PB__DLL_INSERT_BEFORE((entry_ptr), (suc_ptr), (pb_ptr)->dwl_head_ptr, \ (pb_ptr)->dwl_tail_ptr, (pb_ptr)->dwl_len, \ - (pb_ptr)->dwl_size), (fail_val)) \ + (pb_ptr)->dwl_size, (fail_val)) \ \ if ( entry_ptr->delay_write_until > pb_ptr->max_delay ) \ pb_ptr->max_delay = entry_ptr->delay_write_until; \ @@ -1346,7 +1681,7 @@ if ( ( (pb_ptr) == NULL ) || \ { \ HDassert( (pb_ptr) ); \ HDassert( (pb_ptr)->magic == H5PB__H5PB_T_MAGIC ); \ - HDassert( (pb_ptr)->vfd_swmr_writer ) \ + HDassert( (pb_ptr)->vfd_swmr_writer ); \ HDassert( (entry_ptr) ); \ HDassert( (entry_ptr)->magic == H5PB__H5PB_ENTRY_T_MAGIC ); \ HDassert( (entry_ptr)->size >= pb_ptr->page_size ); \ @@ -1421,6 +1756,9 @@ if ( ( (pb_ptr) == NULL ) || \ * If there are multiple entries in any hash bin, they are stored in a doubly * linked list. * + * To facilitate flushing the page buffer, we also maintain a doubly linked + * list of all entries in the page buffer. + * * ht_next: Next pointer used by the hash table to store multiple * entries in a single hash bin. This field points to the * next entry in the doubly linked list of entries in the @@ -1431,6 +1769,16 @@ if ( ( (pb_ptr) == NULL ) || \ * previous entry in the doubly linked list of entries in * the hash bin, or NULL if there is no previuos entry. * + * il_next: Next pointer used by the index to maintain a doubly linked + * list of all entries in the index (and thus in the page buffer). 
+ * This field contains a pointer to the next entry in the + * index list, or NULL if there is no next entry. + * + * il_prev: Prev pointer used by the index to maintain a doubly linked + * list of all entries in the index (and thus in the page buffer). + * This field contains a pointer to the previous entry in the + * index list, or NULL if there is no previous entry. + * * * Fields supporting replacement policies: * @@ -1487,13 +1835,12 @@ if ( ( (pb_ptr) == NULL ) || \ * ****************************************************************************/ - #define H5PB__H5PB_ENTRY_T_MAGIC 0x02030405 struct H5PB_entry_t { uint32_t magic; - H5PB_t *pb_ptr; + H5PB_t *pb_ptr; haddr_t addr; uint64_t page; size_t size; @@ -1505,6 +1852,8 @@ struct H5PB_entry_t { /* fields supporting the hash table: */ struct H5PB_entry_t *ht_next; struct H5PB_entry_t *ht_prev; + struct H5PB_entry_t *il_next; + struct H5PB_entry_t *il_prev; /* fields supporting replacement policies: */ struct H5PB_entry_t *next; diff --git a/src/H5PBprivate.h b/src/H5PBprivate.h index 2c1f3cb..7aabcd5 100644 --- a/src/H5PBprivate.h +++ b/src/H5PBprivate.h @@ -140,22 +140,87 @@ typedef struct H5PB_entry_t H5PB_entry_t; * hash to the same bucket. That said, we must collect statistics to alert * us should this not be the case. * + * We also maintain a linked list of all entries in the index to facilitate + * flush operations. + * * index Array of pointer to H5PB_entry_t of size * H5PB__HASH_TABLE_LEN. This size must ba a power of 2, * not the usual prime number. * * index_len: Number of entries currently in the hash table used to index - * the page buffer. + * the page buffer. index_len should always equal + * clean_index_len + dirty_index_len. + * + * clean_index_len: Number of clean entries currently in the hash table + * used to index the page buffer. + * + * dirty_index_len: Number of dirty entries currently in the hash table + * used to index the page buffer. 
* * index_size: Number of bytes currently stored in the hash table used to * index the page buffer. Under normal circumstances, this * value will be index_len * page size. However, if * vfd_swmr_writer is TRUE, it may be larger. * + * index_size should always equal clean_index_size + + * dirty_index_size. + * + * clean_index_size: Number of bytes of clean entries currently stored in + * the hash table used to index the page buffer. + * + * dirty_index_size: Number of bytes of dirty entries currently stored in + * the hash table used to index the page buffer. + * + * il_len: Number of entries on the index list. + * + * This must always be equal to index_len. As such, this + * field is redundant. However, the existing linked list + * management macros expect to maintain a length field, so + * this field exists primarily to avoid adding complexity to + * these macros. + * + * il_size: Number of bytes of cache entries currently stored in the + * index list. + * + * This must always be equal to index_size. As such, this + * field is redundant. However, the existing linked list + * management macros expect to maintain a size field, so + * this field exists primarily to avoid adding complexity to + * these macros. + * + * il_head: Pointer to the head of the doubly linked list of entries in + * the index list. Note that cache entries on this list are + * linked by their il_next and il_prev fields. + * + * This field is NULL if the index is empty. + * + * il_tail: Pointer to the tail of the doubly linked list of entries in + * the index list. Note that cache entries on this list are + * linked by their il_next and il_prev fields. + * + * This field is NULL if the index is empty. + + * + * * Fields supporting the modified LRU policy: * * See most any OS text for a discussion of the LRU replacement policy. * + * Under normal operating circumstances (i.e. vfd_swmr_writer is FALSE) + * all entries will reside both in the index and in the LRU. 
Further, + * all entries will be of size page_size. + * + * The VFD SWMR writer case (i.e. vfd_swmr_writer is TRUE) is complicated + * by the requirements that we: + * + * 1) buffer all metadata writes (including multi-page metadata writes) that + * occur during a tick, and + * + * 2) when necessary, delay metadata writes for up to max_lag ticks to + * avoid "message from the future" bugs on the VFD SWMR readers. + * + * See discussion of fields supporting VFD SWMR below for details. + * * Discussions of the individual fields used by the modified LRU replacement * policy follow: * @@ -183,7 +248,43 @@ typedef struct H5PB_entry_t H5PB_entry_t; * This field is NULL if the list is empty. * * - * FIELDS FOR VFD SWMR: + * FIELDS SUPPORTING VFD SWMR: + * + * If the file is opened as a VFD SWMR writer (i.e. vfd_swmr_writer == TRUE), + * the page buffer must retain the data necessary to update the metadata + * file at the end of each tick, and also delay writes as necessary so as + * to avoid "message from the future" bugs on the VFD SWMR readers. + * + * The tick list exists to allow us to buffer copies of all metadata writes + * during a tick, and the delayed write list supports delayed writes. + * + * If a regular page is written to during a tick, it is placed on the tick + * list. If there is no reason to delay its write to file (i.e. either + * it was just allocated, or it has existed in the metadata file index for + * at least max_lag ticks), it is also placed on the LRU, where it may be + * flushed, but not evicted. If its write must be delayed, it is placed on + * the delayed write list, where it must remain until its write delay is + * satisfied -- at which point it is moved to the LRU. + * + * If a multi-page metadata entry is written during a tick, it is placed on + * the tick list. If, in addition, the write of the entry must be delayed, + * it is also placed on the delayed write list. Note that multi-page metadata + * entries may never appear on the LRU.
+ * + * At the end of each tick, the tick list is emptied. + * + * Regular pages are simply removed from the tick list, as they must already + * appear on either the LRU or the delayed write list. + * + * Multi-page metadata entries that are not also on the delayed write list + * are simply flushed and evicted. + * + * The delayed write list is also scanned at the end of each tick. Regular + * entries that are now flushable are placed at the head of the LRU. Multi- + * page metadata entries that are flushable are flushed and evicted. + * + * The remainder of this section contains discussions of the fields and + * data structures used to support the above operations. * * vfd_swmr_writer: Boolean flag that is set to TRUE iff the file is * the file is opened in VFD SWMR mode. The remaining @@ -205,8 +306,8 @@ typedef struct H5PB_entry_t H5PB_entry_t; * likely be perciived as file corruption by the reader. * * To facilitate identification of entries that must be removed from the - * DWL, the list always observes the following invarient for any entry - * on the list: + * DWL during the end-of-tick scan, the list always observes the following + * invariant for any entry on the list: * * entry_ptr->next == NULL || * entry_ptr->delay_write_until >= entry_ptr->next->delay_write_until @@ -384,8 +485,16 @@ typedef struct H5PB_entry_t H5PB_entry_t; * * max_index_len: Largest value attained by the index_len field. * + * max_clean_index_len: Largest value attained by the clean_index_len field. + * + * max_dirty_index_len: Largest value attained by the dirty_index_len field. + * * max_index_size: Largest value attained by the index_size field. * + * max_clean_index_size: Largest value attained by the clean_index_size field. + * + * max_dirty_index_size: Largest value attained by the dirty_index_size field. + * * max_rd_pages: Maximum number of raw data pages in the page buffer. * * max_md_pages: Maximum number of metadata pages in the page buffer.
@@ -459,7 +568,15 @@ typedef struct H5PB_t { /* index */ H5PB_entry_t *(ht[H5PB__HASH_TABLE_LEN]); int64_t index_len; + int64_t clean_index_len; + int64_t dirty_index_len; int64_t index_size; + int64_t clean_index_size; + int64_t dirty_index_size; + int64_t il_len; + int64_t il_size; + H5PB_entry_t * il_head; + H5PB_entry_t * il_tail; /* LRU */ int64_t LRU_len; @@ -518,7 +635,11 @@ typedef struct H5PB_t { int64_t failed_ht_searches; int64_t total_failed_ht_search_depth; int64_t max_index_len; + int64_t max_clean_index_len; + int64_t max_dirty_index_len; int64_t max_index_size; + int64_t max_clean_index_size; + int64_t max_dirty_index_size; int64_t max_rd_pages; int64_t max_md_pages; @@ -567,6 +688,17 @@ H5_DLL herr_t H5PB_read(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, H5_DLL herr_t H5PB_write(H5F_t *f, H5FD_mem_t type, haddr_t addr, size_t size, const void *buf); + +/* VFD SWMR specific routines */ +H5_DLL herr_t H5PB_vfd_swmr__release_delayed_writes(H5F_t * f); + +H5_DLL herr_t H5PB_vfd_swmr__release_tick_list(H5F_t * f); + +H5_DLL herr_t H5PB_vfd_swmr__update_index(H5F_t * f, int * idx_ent_added_ptr, + int * idx_ent_modified_ptr, int * idx_ent_not_in_tl_ptr, + int * idx_ent_not_in_tl_flushed_ptr); + + /* Statistics routines */ H5_DLL herr_t H5PB_reset_stats(H5PB_t *page_buf); @@ -576,6 +708,7 @@ H5_DLL herr_t H5PB_get_stats(const H5PB_t *page_buf, unsigned accesses[2], H5_DLL herr_t H5PB_print_stats(const H5PB_t *page_buf); + /* test & debug functions */ H5_DLL herr_t H5PB_page_exists(H5F_t *f, haddr_t addr, hbool_t *page_exists_ptr); -- cgit v0.12