diff options
Diffstat (limited to 'src/H5ACmpio.c')
-rw-r--r-- | src/H5ACmpio.c | 2295 |
1 files changed, 2295 insertions, 0 deletions
diff --git a/src/H5ACmpio.c b/src/H5ACmpio.c new file mode 100644 index 0000000..64c27ab --- /dev/null +++ b/src/H5ACmpio.c @@ -0,0 +1,2295 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * Copyright by the Board of Trustees of the University of Illinois. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the files COPYING and Copyright.html. COPYING can be found at the root * + * of the source code distribution tree; Copyright.html can be found at the * + * root level of an installed copy of the electronic HDF5 document set and * + * is linked from the top-level documents page. It can also be found at * + * http://hdfgroup.org/HDF5/doc/Copyright.html. If you do not have * + * access to either file, you may request a copy from help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/*------------------------------------------------------------------------- + * + * Created: H5ACmpio.c + * Jun 20 2015 + * Quincey Koziol <koziol@hdfgroup.org> + * + * Purpose: Functions in this file implement support for parallel + * I/O cache functionality + * + *------------------------------------------------------------------------- + */ + +/****************/ +/* Module Setup */ +/****************/ + +#define H5AC_PACKAGE /*suppress error about including H5ACpkg */ +#define H5C_PACKAGE /*suppress error about including H5Cpkg */ +#define H5F_PACKAGE /*suppress error about including H5Fpkg */ + +/* Interface initialization */ +#define H5_INTERFACE_INIT_FUNC H5AC__init_mpio_interface + +/***********/ +/* Headers */ +/***********/ +#include "H5private.h" /* Generic Functions */ +#include "H5ACpkg.h" /* Metadata cache */ +#include "H5Cpkg.h" /* Cache */ +#include "H5Eprivate.h" /* Error handling */ +#include "H5Fpkg.h" /* Files */ +#include "H5MMprivate.h" /* Memory management */ + +#ifdef H5_HAVE_PARALLEL + +/****************/ +/* Local Macros */ +/****************/ + + +/******************/ +/* Local Typedefs */ +/******************/ + +/**************************************************************************** + * + * structure H5AC_slist_entry_t + * + * The dirty entry list maintained via the d_slist_ptr field of H5AC_aux_t + * and the cleaned entry list maintained via the c_slist_ptr field of + * H5AC_aux_t are just lists of the file offsets of the dirty/cleaned + * entries. Unfortunately, the slist code makes us define a dynamically + * allocated structure to store these offsets in. This structure serves + * that purpose. Its fields are as follows: + * + * addr: file offset of a metadata entry. Entries are added to this + * list (if they aren't there already) when they are marked + * dirty in an unprotect, inserted, or moved. They are + * removed when they appear in a clean entries broadcast. + * + ****************************************************************************/ +typedef struct H5AC_slist_entry_t +{ + haddr_t addr; +} H5AC_slist_entry_t; + +/* User data for address list building callbacks */ +typedef struct H5AC_addr_list_ud_t +{ + H5AC_aux_t * aux_ptr; /* 'Auxiliary' parallel cache info */ + haddr_t * addr_buf_ptr; /* Array to store addresses */ + int i; /* Counter for position in array */ +} H5AC_addr_list_ud_t; + + +/********************/ +/* Local Prototypes */ +/********************/ + +static herr_t H5AC__broadcast_candidate_list(H5AC_t *cache_ptr, + int *num_entries_ptr, haddr_t **haddr_buf_ptr_ptr); +static herr_t H5AC__broadcast_clean_list(H5AC_t *cache_ptr); +static herr_t H5AC__construct_candidate_list(H5AC_t *cache_ptr, + H5AC_aux_t *aux_ptr, int sync_point_op); +static herr_t H5AC__copy_candidate_list_to_buffer(const H5AC_t *cache_ptr, + int *num_entries_ptr, haddr_t **haddr_buf_ptr_ptr); +static herr_t H5AC__propagate_and_apply_candidate_list(H5F_t *f, hid_t dxpl_id); +static herr_t H5AC__propagate_flushed_and_still_clean_entries_list(H5F_t *f, + hid_t dxpl_id); +static herr_t H5AC__receive_haddr_list(MPI_Comm mpi_comm, int *num_entries_ptr, + haddr_t **haddr_buf_ptr_ptr); +static herr_t H5AC__receive_candidate_list(const H5AC_t *cache_ptr, + int *num_entries_ptr, haddr_t **haddr_buf_ptr_ptr); +static herr_t H5AC__receive_and_apply_clean_list(H5F_t *f, hid_t dxpl_id); +static herr_t H5AC__tidy_cache_0_lists(H5AC_t *cache_ptr, int num_candidates, + haddr_t *candidates_list_ptr); +static herr_t H5AC__rsp__dist_md_write__flush(H5F_t *f, hid_t dxpl_id); +static herr_t H5AC__rsp__dist_md_write__flush_to_min_clean(H5F_t *f, hid_t dxpl_id); +static herr_t H5AC__rsp__p0_only__flush(H5F_t *f, hid_t dxpl_id); +static herr_t H5AC__rsp__p0_only__flush_to_min_clean(H5F_t *f, hid_t dxpl_id); + + +/*********************/ +/* Package Variables */ +/*********************/ + +/* Declare a free list to manage the H5AC_aux_t struct */ +H5FL_DEFINE(H5AC_aux_t); + + +/*****************************/ +/* Library Private Variables */ +/*****************************/ + + +/*******************/ +/* Local Variables */ +/*******************/ + +/* Declare a free list to manage the H5AC_slist_entry_t struct */ +H5FL_DEFINE_STATIC(H5AC_slist_entry_t); + + +/*------------------------------------------------------------------------- + * Function: H5AC__init_mpio_interface + * + * Purpose: Initialize interface-specific information + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Quincey Koziol + * 6/20/15 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__init_mpio_interface(void) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Funnel all work to H5AC_init() */ + if(H5AC_init() < 0) + HGOTO_ERROR(H5E_FUNC, H5E_CANTINIT, FAIL, "interface initialization failed") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__init_mpio_interface() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC__set_sync_point_done_callback + * + * Purpose: Set the value of the sync_point_done callback. This + * callback is used by the parallel test code to verify + * that the expected writes and only the expected writes + * take place during a sync point. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer + * 5/9/10 + * + *------------------------------------------------------------------------- + */ +herr_t +H5AC__set_sync_point_done_callback(H5C_t * cache_ptr, + void (* sync_point_done)(int num_writes, haddr_t * written_entries_tbl)) +{ + H5AC_aux_t * aux_ptr; + + FUNC_ENTER_PACKAGE_NOERR + + /* Sanity checks */ + HDassert(cache_ptr && (cache_ptr->magic == H5C__H5C_T_MAGIC)); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + + aux_ptr->sync_point_done = sync_point_done; + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* H5AC__set_sync_point_done_callback() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC__set_write_done_callback + * + * Purpose: Set the value of the write_done callback. This callback + * is used to improve performance of the parallel test bed + * for the cache. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer + * 5/11/06 + * + *------------------------------------------------------------------------- + */ +herr_t +H5AC__set_write_done_callback(H5C_t * cache_ptr, void (* write_done)(void)) +{ + H5AC_aux_t * aux_ptr; + + FUNC_ENTER_PACKAGE_NOERR + + /* Sanity checks */ + HDassert(cache_ptr && (cache_ptr->magic == H5C__H5C_T_MAGIC)); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert( aux_ptr != NULL ); + HDassert( aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC ); + + aux_ptr->write_done = write_done; + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* H5AC__set_write_done_callback() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC_add_candidate() + * + * Purpose: Add the supplied metadata entry address to the candidate + * list. Verify that each entry added does not appear in + * the list prior to its insertion. + * + * This function is intended for used in constructing list + * of entried to be flushed during sync points. It shouldn't + * be called anywhere else. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer + * 3/17/10 + * + *------------------------------------------------------------------------- + */ +herr_t +H5AC_add_candidate(H5AC_t * cache_ptr, haddr_t addr) +{ + H5AC_aux_t * aux_ptr; + H5AC_slist_entry_t * slist_entry_ptr = NULL; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED); + HDassert(aux_ptr->candidate_slist_ptr != NULL); + + /* Construct an entry for the supplied address, and insert + * it into the candidate slist. + */ + if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate candidate slist entry") + slist_entry_ptr->addr = addr; + + if(H5SL_insert(aux_ptr->candidate_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into dirty entry slist") + +done: + /* Clean up on error */ + if(ret_value < 0) + if(slist_entry_ptr) + slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC_add_candidate() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC__broadcast_candidate_list() + * + * Purpose: Broadcast the contents of the process 0 candidate entry + * slist. In passing, also remove all entries from said + * list. As the application of this will be handled by + * the same functions on all processes, construct and + * return a copy of the list in the same format as that + * received by the other processes. Note that if this + * copy is returned in *haddr_buf_ptr_ptr, the caller + * must free it. + * + * This function must only be called by the process with + * MPI_rank 0. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 7/1/05 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__broadcast_candidate_list(H5AC_t *cache_ptr, int *num_entries_ptr, + haddr_t **haddr_buf_ptr_ptr) +{ + H5AC_aux_t * aux_ptr = NULL; + haddr_t * haddr_buf_ptr = NULL; + int mpi_result; + int num_entries; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->mpi_rank == 0); + HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED); + HDassert(aux_ptr->candidate_slist_ptr != NULL); + HDassert(num_entries_ptr != NULL); + HDassert(*num_entries_ptr == 0); + HDassert(haddr_buf_ptr_ptr != NULL); + HDassert(*haddr_buf_ptr_ptr == NULL); + + /* First broadcast the number of entries in the list so that the + * receivers can set up buffers to receive them. If there aren't + * any, we are done. + */ + num_entries = (int)H5SL_count(aux_ptr->candidate_slist_ptr); + if(MPI_SUCCESS != (mpi_result = MPI_Bcast(&num_entries, 1, MPI_INT, 0, aux_ptr->mpi_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result) + + if(num_entries > 0) { + size_t buf_size = 0; + int chk_num_entries = 0; + + /* convert the candidate list into the format we + * are used to receiving from process 0, and also load it + * into a buffer for transmission. + */ + if(H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.") + HDassert(chk_num_entries == num_entries); + HDassert(haddr_buf_ptr != NULL); + + /* Now broadcast the list of candidate entries */ + buf_size = sizeof(haddr_t) * (size_t)num_entries; + if(MPI_SUCCESS != (mpi_result = MPI_Bcast((void *)haddr_buf_ptr, (int)buf_size, MPI_BYTE, 0, aux_ptr->mpi_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result) + } /* end if */ + + /* Pass the number of entries and the buffer pointer + * back to the caller. Do this so that we can use the same code + * to apply the candidate list to all the processes. + */ + *num_entries_ptr = num_entries; + *haddr_buf_ptr_ptr = haddr_buf_ptr; + +done: + if(ret_value < 0) + if(haddr_buf_ptr) + haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr); + + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__broadcast_candidate_list() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC__broadcast_clean_list_cb() + * + * Purpose: Skip list callback for building array of addresses for + * broadcasting the clean list. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: Quincey Koziol, 6/12/15 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__broadcast_clean_list_cb(void *_item, void H5_ATTR_UNUSED *_key, + void *_udata) +{ + H5AC_slist_entry_t * slist_entry_ptr = (H5AC_slist_entry_t *)_item; /* Address of item */ + H5AC_addr_list_ud_t * udata = (H5AC_addr_list_ud_t *)_udata; /* Context for callback */ + haddr_t addr; + + FUNC_ENTER_STATIC_NOERR + + /* Sanity checks */ + HDassert(slist_entry_ptr); + HDassert(udata); + + /* Store the entry's address in the buffer */ + addr = slist_entry_ptr->addr; + udata->addr_buf_ptr[udata->i] = addr; + udata->i++; + + /* now release the entry */ + slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + + /* and also remove the matching entry from the dirtied list + * if it exists. + */ + if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(udata->aux_ptr->d_slist_ptr, (void *)(&addr)))) + slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* H5AC__broadcast_clean_list_cb() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC__broadcast_clean_list() + * + * Purpose: Broadcast the contents of the process 0 cleaned entry + * slist. In passing, also remove all entries from said + * list, and also remove any matching entries from the dirtied + * slist. + * + * This function must only be called by the process with + * MPI_rank 0. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 7/1/05 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__broadcast_clean_list(H5AC_t * cache_ptr) +{ + haddr_t * addr_buf_ptr = NULL; + H5AC_aux_t * aux_ptr; + int mpi_result; + int num_entries = 0; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)cache_ptr->aux_ptr; + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->mpi_rank == 0); + HDassert(aux_ptr->c_slist_ptr != NULL); + + /* First broadcast the number of entries in the list so that the + * receives can set up a buffer to receive them. If there aren't + * any, we are done. + */ + num_entries = (int)H5SL_count(aux_ptr->c_slist_ptr); + if(MPI_SUCCESS != (mpi_result = MPI_Bcast(&num_entries, 1, MPI_INT, 0, aux_ptr->mpi_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result) + + if(num_entries > 0) { + H5AC_addr_list_ud_t udata; + size_t buf_size; + + /* allocate a buffer to store the list of entry base addresses in */ + buf_size = sizeof(haddr_t) * (size_t)num_entries; + if(NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer") + + /* Set up user data for callback */ + udata.aux_ptr = aux_ptr; + udata.addr_buf_ptr = addr_buf_ptr; + udata.i = 0; + + /* Free all the clean list entries, building the address list in the callback */ + /* (Callback also removes the matching entries from the dirtied list) */ + if(H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries") + + /* Now broadcast the list of cleaned entries */ + if(MPI_SUCCESS != (mpi_result = MPI_Bcast((void *)addr_buf_ptr, (int)buf_size, MPI_BYTE, 0, aux_ptr->mpi_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result) + } /* end if */ + + /* if it is defined, call the sync point done callback. Note + * that this callback is defined purely for testing purposes, + * and should be undefined under normal operating circumstances. + */ + if(aux_ptr->sync_point_done) + (aux_ptr->sync_point_done)(num_entries, addr_buf_ptr); + +done: + if(addr_buf_ptr) + addr_buf_ptr = (haddr_t *)H5MM_xfree((void *)addr_buf_ptr); + + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__broadcast_clean_list() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC__construct_candidate_list() + * + * Purpose: In the parallel case when the metadata_write_strategy is + * H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED, process 0 uses + * this function to construct the list of cache entries to + * be flushed. This list is then propagated to the other + * caches, and then flushed in a distributed fashion. + * + * The sync_point_op parameter is used to determine the extent + * of the flush. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer + * 3/17/10 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__construct_candidate_list(H5AC_t *cache_ptr, H5AC_aux_t *aux_ptr, + int sync_point_op) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED); + HDassert((sync_point_op == H5AC_SYNC_POINT_OP__FLUSH_CACHE) || (aux_ptr->mpi_rank == 0)); + HDassert(aux_ptr->d_slist_ptr != NULL); + HDassert(aux_ptr->c_slist_ptr != NULL); + HDassert(H5SL_count(aux_ptr->c_slist_ptr) == 0); + HDassert(aux_ptr->candidate_slist_ptr != NULL); + HDassert(H5SL_count(aux_ptr->candidate_slist_ptr) == 0); + HDassert((sync_point_op == H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) || (sync_point_op == H5AC_SYNC_POINT_OP__FLUSH_CACHE)); + + switch(sync_point_op) { + case H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN: + if(H5C_construct_candidate_list__min_clean((H5C_t *)cache_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_construct_candidate_list__min_clean() failed.") + break; + + case H5AC_SYNC_POINT_OP__FLUSH_CACHE: + if(H5C_construct_candidate_list__clean_cache((H5C_t *)cache_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_construct_candidate_list__clean_cache() failed.") + break; + + default: + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "unknown sync point operation.") + break; + } /* end switch */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__construct_candidate_list() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC__copy_candidate_list_to_buffer_cb + * + * Purpose: Skip list callback for building array of addresses for + * broadcasting the candidate list. + * + * Return: Return SUCCEED on success, and FAIL on failure. + * + * Programmer: Quincey Koziol, 6/12/15 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__copy_candidate_list_to_buffer_cb(void *_item, void H5_ATTR_UNUSED *_key, + void *_udata) +{ + H5AC_slist_entry_t * slist_entry_ptr = (H5AC_slist_entry_t *)_item; /* Address of item */ + H5AC_addr_list_ud_t * udata = (H5AC_addr_list_ud_t *)_udata; /* Context for callback */ + + FUNC_ENTER_STATIC_NOERR + + /* Sanity checks */ + HDassert(slist_entry_ptr); + HDassert(udata); + + /* Store the entry's address in the buffer */ + udata->addr_buf_ptr[udata->i] = slist_entry_ptr->addr; + udata->i++; + + /* now release the entry */ + slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* H5AC__copy_candidate_list_to_buffer_cb() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC__copy_candidate_list_to_buffer + * + * Purpose: Allocate buffer(s) and copy the contents of the candidate + * entry slist into it (them). In passing, remove all + * entries from the candidate slist. Note that the + * candidate slist must not be empty. + * + * If MPI_Offset_buf_ptr_ptr is not NULL, allocate a buffer + * of MPI_Offset, copy the contents of the candidate + * entry list into it with the appropriate conversions, + * and return the base address of the buffer in + * *MPI_Offset_buf_ptr. Note that this is the buffer + * used by process 0 to transmit the list of entries to + * be flushed to all other processes (in this file group). + * + * Similarly, allocate a buffer of haddr_t, load the contents + * of the candidate list into this buffer, and return its + * base address in *haddr_buf_ptr_ptr. Note that this + * latter buffer is constructed unconditionally. + * + * In passing, also remove all entries from the candidate + * entry slist. + * + * Return: Return SUCCEED on success, and FAIL on failure. + * + * Programmer: John Mainzer, 4/19/10 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__copy_candidate_list_to_buffer(const H5AC_t *cache_ptr, int *num_entries_ptr, + haddr_t **haddr_buf_ptr_ptr) +{ + H5AC_aux_t * aux_ptr = NULL; + H5AC_addr_list_ud_t udata; + haddr_t * haddr_buf_ptr = NULL; + size_t buf_size; + int num_entries = 0; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED); + HDassert(aux_ptr->candidate_slist_ptr != NULL); + HDassert(H5SL_count(aux_ptr->candidate_slist_ptr) > 0); + HDassert(num_entries_ptr != NULL); + HDassert(*num_entries_ptr == 0); + HDassert(haddr_buf_ptr_ptr != NULL); + HDassert(*haddr_buf_ptr_ptr == NULL); + + num_entries = (int)H5SL_count(aux_ptr->candidate_slist_ptr); + + /* allocate a buffer(s) to store the list of candidate entry + * base addresses in + */ + buf_size = sizeof(haddr_t) * (size_t)num_entries; + if(NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer") + + /* Set up user data for callback */ + udata.aux_ptr = aux_ptr; + udata.addr_buf_ptr = haddr_buf_ptr; + udata.i = 0; + + /* Free all the candidate list entries, building the address list in the callback */ + if(H5SL_free(aux_ptr->candidate_slist_ptr, H5AC__copy_candidate_list_to_buffer_cb, &udata) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for candidate entries") + + /* Pass the number of entries and the buffer pointer + * back to the caller. + */ + *num_entries_ptr = num_entries; + *haddr_buf_ptr_ptr = haddr_buf_ptr; + +done: + if(ret_value < 0) + if(haddr_buf_ptr) + haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr); + + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__copy_candidate_list_to_buffer() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC__log_deleted_entry() + * + * Purpose: Log an entry which has been deleted. + * + * Only called for mpi_rank 0. We must make sure that the entry + * doesn't appear in the cleaned or dirty entry lists. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 6/29/05 + * + *------------------------------------------------------------------------- + */ +herr_t +H5AC__log_deleted_entry(const H5AC_info_t *entry_ptr) +{ + H5AC_t * cache_ptr; + H5AC_aux_t * aux_ptr; + H5AC_slist_entry_t * slist_entry_ptr = NULL; + haddr_t addr; + + FUNC_ENTER_PACKAGE_NOERR + + /* Sanity checks */ + HDassert(entry_ptr); + addr = entry_ptr->addr; + cache_ptr = entry_ptr->cache_ptr; + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->mpi_rank == 0); + HDassert(aux_ptr->d_slist_ptr != NULL); + HDassert(aux_ptr->c_slist_ptr != NULL); + + /* if the entry appears in the dirtied entry slist, remove it. */ + if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&addr)))) + slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + + /* if the entry appears in the cleaned entry slist, remove it. */ + if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr)))) + slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* H5AC__log_deleted_entry() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC__log_dirtied_entry() + * + * Purpose: Update the dirty_bytes count for a newly dirtied entry. + * + * If mpi_rank isn't 0, this simply means adding the size + * of the entries to the dirty_bytes count. + * + * If mpi_rank is 0, we must first check to see if the entry + * appears in the dirty entries slist. If it is, do nothing. + * If it isn't, add the size to th dirty_bytes count, add the + * entry to the dirty entries slist, and remove it from the + * cleaned list (if it is present there). + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 6/29/05 + * + *------------------------------------------------------------------------- + */ +herr_t +H5AC__log_dirtied_entry(const H5AC_info_t *entry_ptr) +{ + H5AC_t * cache_ptr; + H5AC_aux_t * aux_ptr; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_PACKAGE + + /* Sanity checks */ + HDassert(entry_ptr); + HDassert(entry_ptr->is_dirty == FALSE); + cache_ptr = entry_ptr->cache_ptr; + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + + if(aux_ptr->mpi_rank == 0) { + H5AC_slist_entry_t *slist_entry_ptr; + haddr_t addr = entry_ptr->addr; + + /* Sanity checks */ + HDassert(aux_ptr->d_slist_ptr != NULL); + HDassert(aux_ptr->c_slist_ptr != NULL); + + if(NULL == H5SL_search(aux_ptr->d_slist_ptr, (void *)(&addr))) { + /* insert the address of the entry in the dirty entry list, and + * add its size to the dirty_bytes count. + */ + if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate dirty slist entry .") + slist_entry_ptr->addr = addr; + + if(H5SL_insert(aux_ptr->d_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into dirty entry slist.") + + aux_ptr->dirty_bytes += entry_ptr->size; +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->unprotect_dirty_bytes += entry_ptr->size; + aux_ptr->unprotect_dirty_bytes_updates += 1; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + } /* end if */ + + /* the entry is dirty. If it exists on the cleaned entries list, + * remove it. + */ + if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr)))) + slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + } /* end if */ + else { + aux_ptr->dirty_bytes += entry_ptr->size; +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->unprotect_dirty_bytes += entry_size; + aux_ptr->unprotect_dirty_bytes_updates += 1; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + } /* end else */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__log_dirtied_entry() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC__log_flushed_entry() + * + * Purpose: Update the clean entry slist for the flush of an entry -- + * specifically, if the entry has been cleared, remove it + * from both the cleaned and dirtied lists if it is present. + * Otherwise, if the entry was dirty, insert the indicated + * entry address in the clean slist if it isn't there already. + * + * This function is only used in PHDF5, and should only + * be called for the process with mpi rank 0. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 6/29/05 + * + *------------------------------------------------------------------------- + */ +herr_t +H5AC__log_flushed_entry(H5C_t *cache_ptr, haddr_t addr, hbool_t was_dirty, + unsigned flags) +{ + hbool_t cleared; + H5AC_aux_t * aux_ptr; + H5AC_slist_entry_t * slist_entry_ptr = NULL; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_PACKAGE + + /* Sanity check */ + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->mpi_rank == 0); + HDassert(aux_ptr->c_slist_ptr != NULL); + + /* Set local flags */ + cleared = ((flags & H5C__FLUSH_CLEAR_ONLY_FLAG) != 0); + + if(cleared) { + /* If the entry has been cleared, must remove it from both the + * cleaned list and the dirtied list. + */ + if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr)))) + slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&addr)))) + slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + } /* end if */ + else if(was_dirty) { + if(NULL == H5SL_search(aux_ptr->c_slist_ptr, (void *)(&addr))) { + /* insert the address of the entry in the clean entry list. */ + if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate clean slist entry .") + slist_entry_ptr->addr = addr; + + if(H5SL_insert(aux_ptr->c_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into clean entry slist.") + } /* end if */ + } /* end else-if */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__log_flushed_entry() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC__log_inserted_entry() + * + * Purpose: Update the dirty_bytes count for a newly inserted entry. + * + * If mpi_rank isnt 0, this simply means adding the size + * of the entry to the dirty_bytes count. + * + * If mpi_rank is 0, we must also add the entry to the + * dirty entries slist. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 6/30/05 + * + *------------------------------------------------------------------------- + */ +herr_t +H5AC__log_inserted_entry(const H5AC_info_t *entry_ptr) +{ + H5AC_t * cache_ptr; + H5AC_aux_t * aux_ptr; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_PACKAGE + + /* Sanity checks */ + HDassert(entry_ptr); + cache_ptr = entry_ptr->cache_ptr; + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + + if(aux_ptr->mpi_rank == 0) { + H5AC_slist_entry_t *slist_entry_ptr; + + HDassert(aux_ptr->d_slist_ptr != NULL); + HDassert(aux_ptr->c_slist_ptr != NULL); + + /* Entry to insert should not be in dirty list currently */ + if(NULL != H5SL_search(aux_ptr->d_slist_ptr, (const void *)(&entry_ptr->addr))) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Inserted entry already in dirty slist.") + + /* insert the address of the entry in the dirty entry list, and + * add its size to the dirty_bytes count. + */ + if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate dirty slist entry .") + slist_entry_ptr->addr = entry_ptr->addr; + if(H5SL_insert(aux_ptr->d_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into dirty entry slist.") + + /* Entry to insert should not be in clean list either */ + if(NULL != H5SL_search(aux_ptr->c_slist_ptr, (const void *)(&entry_ptr->addr))) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Inserted entry in clean slist.") + } /* end if */ + + aux_ptr->dirty_bytes += entry_ptr->size; + +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->insert_dirty_bytes += size; + aux_ptr->insert_dirty_bytes_updates += 1; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__log_inserted_entry() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC__log_moved_entry() + * + * Purpose: Update the dirty_bytes count for a moved entry. + * + * WARNING + * + * At present, the way that the move call is used ensures + * that the moved entry is present in all caches by + * moving in a collective operation and immediately after + * unprotecting the target entry. + * + * This function uses this invariant, and will cause arcane + * failures if it is not met. If maintaining this invariant + * becomes impossible, we will have to rework this function + * extensively, and likely include a bit of IPC for + * synchronization. A better option might be to subsume + * move in the unprotect operation. + * + * Given that the target entry is in all caches, the function + * proceeds as follows: + * + * For processes with mpi rank other 0, it simply checks to + * see if the entry was dirty prior to the move, and adds + * the entries size to the dirty bytes count. + * + * In the process with mpi rank 0, the function first checks + * to see if the entry was dirty prior to the move. If it + * was, and if the entry doesn't appear in the dirtied list + * under its old address, it adds the entry's size to the + * dirty bytes count. + * + * The rank 0 process then removes any references to the + * entry under its old address from the cleands and dirtied + * lists, and inserts an entry in the dirtied list under the + * new address. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 6/30/05 + * + *------------------------------------------------------------------------- + */ +herr_t +H5AC__log_moved_entry(const H5F_t *f, haddr_t old_addr, haddr_t new_addr) +{ + H5AC_t * cache_ptr; + H5AC_aux_t * aux_ptr; + hbool_t entry_in_cache; + hbool_t entry_dirty; + size_t entry_size; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_PACKAGE + + /* Sanity checks */ + HDassert(f); + HDassert(f->shared); + cache_ptr = (H5AC_t *)f->shared->cache; + HDassert(cache_ptr); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + + /* get entry status, size, etc here */ + if(H5C_get_entry_status(f, old_addr, &entry_size, &entry_in_cache, + &entry_dirty, NULL, NULL, NULL, NULL) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't get entry status.") + if(!entry_in_cache) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "entry not in cache.") + + if(aux_ptr->mpi_rank == 0) { + H5AC_slist_entry_t * slist_entry_ptr; + + HDassert(aux_ptr->d_slist_ptr != NULL); + HDassert(aux_ptr->c_slist_ptr != NULL); + + /* if the entry appears in the cleaned entry slist, under its old + * address, remove it. + */ + if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&old_addr)))) + slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + + /* if the entry appears in the dirtied entry slist under its old + * address, remove it, but don't free it. Set addr to new_addr. + */ + if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&old_addr)))) + slist_entry_ptr->addr = new_addr; + else { + /* otherwise, allocate a new entry that is ready + * for insertion, and increment dirty_bytes. + * + * Note that the fact that the entry wasn't in the dirtied + * list under its old address implies that it must have + * been clean to start with. + */ + HDassert(!entry_dirty); + if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate dirty slist entry .") + slist_entry_ptr->addr = new_addr; + + aux_ptr->dirty_bytes += entry_size; + +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->move_dirty_bytes += entry_size; + aux_ptr->move_dirty_bytes_updates += 1; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + } /* end else */ + + /* insert / reinsert the entry in the dirty slist */ + if(H5SL_insert(aux_ptr->d_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into dirty entry slist.") + } /* end if */ + else if(!entry_dirty) { + aux_ptr->dirty_bytes += entry_size; + +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->move_dirty_bytes += entry_size; + aux_ptr->move_dirty_bytes_updates += 1; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + } /* end else-if */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__log_moved_entry() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC__propagate_and_apply_candidate_list + * + * Purpose: Prior to the addition of support for multiple metadata + * write strategies, in PHDF5, only the metadata cache with + * mpi rank 0 was allowed to write to file. All other + * metadata caches on processes with rank greater than 0 + * were required to retain dirty entries until they were + * notified that the entry was clean. + * + * This constraint is relaxed with the distributed + * metadata write strategy, in which a list of candidate + * metadata cache entries is constructed by the process 0 + * cache and then distributed to the caches of all the other + * processes. Once the listed is distributed, many (if not + * all) processes writing writing a unique subset of the + * entries, and marking the remainder clean. The subsets + * are chosen so that each entry in the list of candidates + * is written by exactly one cache, and all entries are + * marked as being clean in all caches. + * + * While the list of candidate cache entries is prepared + * elsewhere, this function is the main routine for distributing + * and applying the list. It must be run simultaniously on + * all processes that have the relevant file open. To ensure + * proper synchronization, there is a barrier at the beginning + * of this function. + * + * At present, this function is called under one of two + * circumstances: + * + * 1) Dirty byte creation exceeds some user specified value. + * + * While metadata reads may occur independently, all + * operations writing metadata must be collective. Thus + * all metadata caches see the same sequence of operations, + * and therefore the same dirty data creation. + * + * This fact is used to synchronize the caches for purposes + * of propagating the list of candidate entries, by simply + * calling this function from all caches whenever some user + * specified threshold on dirty data is exceeded. (the + * process 0 cache creates the candidate list just before + * calling this function). + * + * 2) Under direct user control -- this operation must be + * collective. + * + * The operations to be managed by this function are as + * follows: + * + * All processes: + * + * 1) Participate in an opening barrier. + * + * For the process with mpi rank 0: + * + * 1) Load the contents of the candidate list + * (candidate_slist_ptr) into a buffer, and broadcast that + * buffer to all the other caches. Clear the candidate + * list in passing. + * + * If there is a positive number of candidates, proceed with + * the following: + * + * 2) Apply the candidate entry list. + * + * 3) Particpate in a closing barrier. + * + * 4) Remove from the dirty list (d_slist_ptr) and from the + * flushed and still clean entries list (c_slist_ptr), + * all addresses that appeared in the candidate list, as + * these entries are now clean. + * + * + * For all processes with mpi rank greater than 0: + * + * 1) Receive the candidate entry list broadcast + * + * If there is a positive number of candidates, proceed with + * the following: + * + * 2) Apply the candidate entry list. + * + * 3) Particpate in a closing barrier. + * + * Return: Success: non-negative + * + * Failure: negative + * + * Programmer: John Mainzer + * 3/17/10 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__propagate_and_apply_candidate_list(H5F_t *f, hid_t dxpl_id) +{ + H5AC_t * cache_ptr; + H5AC_aux_t * aux_ptr; + haddr_t * candidates_list_ptr = NULL; + int mpi_result; + int num_candidates = 0; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(f != NULL); + cache_ptr = f->shared->cache; + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED); + + /* to prevent "messages from the future" we must synchronize all + * processes before we write any entries. + */ + if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result) + + if(aux_ptr->mpi_rank == 0) { + if(H5AC__broadcast_candidate_list(cache_ptr, &num_candidates, &candidates_list_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't broadcast candidate slist.") + + HDassert(H5SL_count(aux_ptr->candidate_slist_ptr) == 0); + } /* end if */ + else { + if(H5AC__receive_candidate_list(cache_ptr, &num_candidates, &candidates_list_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't receive candidate broadcast.") + } /* end else */ + + if(num_candidates > 0) { + herr_t result; + + /* all processes apply the candidate list. + * H5C_apply_candidate_list() handles the details of + * distributing the writes across the processes. + */ + + /* Enable writes during this operation */ + aux_ptr->write_permitted = TRUE; + + /* Apply the candidate list */ + result = H5C_apply_candidate_list(f, dxpl_id, cache_ptr, num_candidates, + candidates_list_ptr, aux_ptr->mpi_rank, aux_ptr->mpi_size); + + /* Disable writes again */ + aux_ptr->write_permitted = FALSE; + + /* Check for error on the write operation */ + if(result < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't apply candidate list.") + + /* this code exists primarily for the test bed -- it allows us to + * enforce posix semantics on the server that pretends to be a + * file system in our parallel tests. + */ + if(aux_ptr->write_done) + (aux_ptr->write_done)(); + + /* to prevent "messages from the past" we must synchronize all + * processes again before we go on. + */ + if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result) + + /* if this is process zero, tidy up the dirtied, + * and flushed and still clean lists. + */ + if(aux_ptr->mpi_rank == 0) + if(H5AC__tidy_cache_0_lists(cache_ptr, num_candidates, candidates_list_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't tidy up process 0 lists.") + } /* end if */ + + /* if it is defined, call the sync point done callback. Note + * that this callback is defined purely for testing purposes, + * and should be undefined under normal operating circumstances. + */ + if(aux_ptr->sync_point_done) + (aux_ptr->sync_point_done)(num_candidates, candidates_list_ptr); + +done: + if(candidates_list_ptr) + candidates_list_ptr = (haddr_t *)H5MM_xfree((void *)candidates_list_ptr); + + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__propagate_and_apply_candidate_list() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC__propagate_flushed_and_still_clean_entries_list + * + * Purpose: In PHDF5, if the process 0 only metadata write strategy + * is selected, only the metadata cache with mpi rank 0 is + * allowed to write to file. All other metadata caches on + * processes with rank greater than 0 must retain dirty + * entries until they are notified that the entry is now + * clean. + * + * This function is the main routine for handling this + * notification proceedure. It must be called + * simultaniously on all processes that have the relevant + * file open. To this end, it is called only during a + * sync point, with a barrier prior to the call. + * + * Note that any metadata entry writes by process 0 will + * occur after the barrier and just before this call. + * + * Typicaly, calls to this function will be triggered in + * one of two ways: + * + * 1) Dirty byte creation exceeds some user specified value. + * + * While metadata reads may occur independently, all + * operations writing metadata must be collective. Thus + * all metadata caches see the same sequence of operations, + * and therefore the same dirty data creation. + * + * This fact is used to synchronize the caches for purposes + * of propagating the list of flushed and still clean + * entries, by simply calling this function from all + * caches whenever some user specified threshold on dirty + * data is exceeded. + * + * 2) Under direct user control -- this operation must be + * collective. + * + * The operations to be managed by this function are as + * follows: + * + * For the process with mpi rank 0: + * + * 1) Load the contents of the flushed and still clean entries + * list (c_slist_ptr) into a buffer, and broadcast that + * buffer to all the other caches. + * + * 2) Clear the flushed and still clean entries list + * (c_slist_ptr). + * + * + * For all processes with mpi rank greater than 0: + * + * 1) Receive the flushed and still clean entries list broadcast + * + * 2) Mark the specified entries as clean. + * + * + * For all processes: + * + * 1) Reset the dirtied bytes count to 0. + * + * Return: Success: non-negative + * + * Failure: negative + * + * Programmer: John Mainzer + * July 5, 2005 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__propagate_flushed_and_still_clean_entries_list(H5F_t *f, hid_t dxpl_id) +{ + H5AC_t * cache_ptr; + H5AC_aux_t * aux_ptr; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(f != NULL); + cache_ptr = f->shared->cache; + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY); + + if(aux_ptr->mpi_rank == 0) { + if(H5AC__broadcast_clean_list(cache_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't broadcast clean slist.") + HDassert(H5SL_count(aux_ptr->c_slist_ptr) == 0); + } /* end if */ + else { + if(H5AC__receive_and_apply_clean_list(f, dxpl_id) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't receive and/or process clean slist broadcast.") + } /* end else */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__propagate_flushed_and_still_clean_entries_list() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC_receive_haddr_list() + * + * Purpose: Receive the list of entry addresses from process 0, + * and return it in a buffer pointed to by *haddr_buf_ptr_ptr. + * Note that the caller must free this buffer if it is + * returned. + * + * This function must only be called by the process with + * MPI_rank greater than 0. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: Quincey Koziol, 6/11/2015 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__receive_haddr_list(MPI_Comm mpi_comm, int *num_entries_ptr, + haddr_t **haddr_buf_ptr_ptr) +{ + haddr_t * haddr_buf_ptr = NULL; + int mpi_result; + int num_entries; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(num_entries_ptr != NULL); + HDassert(*num_entries_ptr == 0); + HDassert(haddr_buf_ptr_ptr != NULL); + HDassert(*haddr_buf_ptr_ptr == NULL); + + /* First receive the number of entries in the list so that we + * can set up a buffer to receive them. If there aren't + * any, we are done. + */ + if(MPI_SUCCESS != (mpi_result = MPI_Bcast(&num_entries, 1, MPI_INT, 0, mpi_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result) + + if(num_entries > 0) { + size_t buf_size; + + /* allocate buffers to store the list of entry base addresses in */ + buf_size = sizeof(haddr_t) * (size_t)num_entries; + if(NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer") + + /* Now receive the list of candidate entries */ + if(MPI_SUCCESS != (mpi_result = MPI_Bcast((void *)haddr_buf_ptr, (int)buf_size, MPI_BYTE, 0, mpi_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result) + } /* end if */ + + /* finally, pass the number of entries and the buffer pointer + * back to the caller. + */ + *num_entries_ptr = num_entries; + *haddr_buf_ptr_ptr = haddr_buf_ptr; + +done: + if(ret_value < 0) + if(haddr_buf_ptr) + haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr); + + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC_receive_haddr_list() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC__receive_and_apply_clean_list() + * + * Purpose: Receive the list of cleaned entries from process 0, + * and mark the specified entries as clean. + * + * This function must only be called by the process with + * MPI_rank greater than 0. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 7/4/05 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__receive_and_apply_clean_list(H5F_t *f, hid_t dxpl_id) +{ + H5AC_t * cache_ptr; + H5AC_aux_t * aux_ptr; + haddr_t * haddr_buf_ptr = NULL; + int num_entries = 0; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity check */ + HDassert(f != NULL); + cache_ptr = f->shared->cache; + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->mpi_rank != 0); + + /* Retrieve the clean list from process 0 */ + if(H5AC__receive_haddr_list(aux_ptr->mpi_comm, &num_entries, &haddr_buf_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "can't receive clean list") + + if(num_entries > 0) + /* mark the indicated entries as clean */ + if(H5C_mark_entries_as_clean(f, dxpl_id, (int32_t)num_entries, haddr_buf_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't mark entries clean.") + + /* if it is defined, call the sync point done callback. Note + * that this callback is defined purely for testing purposes, + * and should be undefined under normal operating circumstances. + */ + if(aux_ptr->sync_point_done) + (aux_ptr->sync_point_done)(num_entries, haddr_buf_ptr); + +done: + if(haddr_buf_ptr) + haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr); + + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__receive_and_apply_clean_list() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC__receive_candidate_list() + * + * Purpose: Receive the list of candidate entries from process 0, + * and return it in a buffer pointed to by *haddr_buf_ptr_ptr. + * Note that the caller must free this buffer if it is + * returned. + * + * This function must only be called by the process with + * MPI_rank greater than 0. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 3/17/10 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__receive_candidate_list(const H5AC_t *cache_ptr, int *num_entries_ptr, + haddr_t **haddr_buf_ptr_ptr) +{ + H5AC_aux_t * aux_ptr; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->mpi_rank != 0); + HDassert(aux_ptr-> metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED); + HDassert(num_entries_ptr != NULL); + HDassert(*num_entries_ptr == 0); + HDassert(haddr_buf_ptr_ptr != NULL); + HDassert(*haddr_buf_ptr_ptr == NULL); + + /* Retrieve the candidate list from process 0 */ + if(H5AC__receive_haddr_list(aux_ptr->mpi_comm, num_entries_ptr, haddr_buf_ptr_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "can't receive clean list") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__receive_candidate_list() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC__rsp__dist_md_write__flush + * + * Purpose: Routine for handling the details of running a sync point + * that is triggered by a flush -- which in turn must have been + * triggered by either a flush API call or a file close -- + * when the distributed metadata write strategy is selected. + * + * Upon entry, each process generates it own candidate list, + * being a sorted list of all dirty metadata entries currently + * in the metadata cache. Note that this list must be idendical + * across all processes, as all processes see the same stream + * of dirty metadata coming in, and use the same lists of + * candidate entries at each sync point. (At first glance, this + * argument sounds circular, but think of it in the sense of + * a recursive proof). + * + * If this this list is empty, we are done, and the function + * returns + * + * Otherwise, after the sorted list dirty metadata entries is + * constructed, each process uses the same algorithm to assign + * each entry on the candidate list to exactly one process for + * flushing. + * + * At this point, all processes participate in a barrier to + * avoid messages from the past/future bugs. + * + * Each process then flushes the entries assigned to it, and + * marks all other entries on the candidate list as clean. + * + * Finally, all processes participate in a second barrier to + * avoid messages from the past/future bugs. + * + * At the end of this process, process 0 and only process 0 + * must tidy up its lists of dirtied and cleaned entries. + * These lists are not used in the distributed metadata write + * strategy, but they must be maintained should we shift + * to a strategy that uses them. + * + * Return: Success: non-negative + * + * Failure: negative + * + * Programmer: John Mainzer + * April 28, 2010 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__rsp__dist_md_write__flush(H5F_t *f, hid_t dxpl_id) +{ + H5AC_t * cache_ptr; + H5AC_aux_t * aux_ptr; + haddr_t * haddr_buf_ptr = NULL; + int mpi_result; + int num_entries = 0; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(f != NULL); + cache_ptr = f->shared->cache; + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED); + + /* first construct the candidate list -- initially, this will be in the + * form of a skip list. We will convert it later. + */ + if(H5C_construct_candidate_list__clean_cache(cache_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.") + + if(H5SL_count(aux_ptr->candidate_slist_ptr) > 0) { + herr_t result; + + /* convert the candidate list into the format we + * are used to receiving from process 0. + */ + if(H5AC__copy_candidate_list_to_buffer(cache_ptr, &num_entries, &haddr_buf_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.") + + /* initial sync point barrier */ + if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result) + + /* Enable writes during this operation */ + aux_ptr->write_permitted = TRUE; + + /* Apply the candidate list */ + result = H5C_apply_candidate_list(f, dxpl_id, cache_ptr, num_entries, + haddr_buf_ptr, aux_ptr->mpi_rank, aux_ptr->mpi_size); + + /* Disable writes again */ + aux_ptr->write_permitted = FALSE; + + /* Check for error on the write operation */ + if(result < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't apply candidate list.") + + /* this code exists primarily for the test bed -- it allows us to + * enforce posix semantics on the server that pretends to be a + * file system in our parallel tests. + */ + if(aux_ptr->write_done) + (aux_ptr->write_done)(); + + /* final sync point barrier */ + if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result) + + /* if this is process zero, tidy up the dirtied, + * and flushed and still clean lists. + */ + if(aux_ptr->mpi_rank == 0) + if(H5AC__tidy_cache_0_lists(cache_ptr, num_entries, haddr_buf_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't tidy up process 0 lists.") + } /* end if */ + + /* if it is defined, call the sync point done callback. Note + * that this callback is defined purely for testing purposes, + * and should be undefined under normal operating circumstances. + */ + if(aux_ptr->sync_point_done) + (aux_ptr->sync_point_done)(num_entries, haddr_buf_ptr); + +done: + if(haddr_buf_ptr) + haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr); + + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__rsp__dist_md_write__flush() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC__rsp__dist_md_write__flush_to_min_clean + * + * Purpose: Routine for handling the details of running a sync point + * triggered by the accumulation of dirty metadata (as + * opposed to a flush call to the API) when the distributed + * metadata write strategy is selected. + * + * After invocation and initial sanity checking this function + * first checks to see if evictions are enabled -- if they + * are not, the function does nothing and returns. + * + * Otherwise, process zero constructs a list of entries to + * be flushed in order to bring the process zero cache back + * within its min clean requirement. Note that this list + * (the candidate list) may be empty. + * + * Then, all processes participate in a barrier. + * + * After the barrier, process 0 broadcasts the number of + * entries in the candidate list prepared above, and all + * other processes receive this number. + * + * If this number is zero, we are done, and the function + * returns without further action. + * + * Otherwise, process 0 broadcasts the sorted list of + * candidate entries, and all other processes receive it. + * + * Then, each process uses the same algorithm to assign + * each entry on the candidate list to exactly one process + * for flushing. + * + * Each process then flushes the entries assigned to it, and + * marks all other entries on the candidate list as clean. + * + * Finally, all processes participate in a second barrier to + * avoid messages from the past/future bugs. + * + * At the end of this process, process 0 and only process 0 + * must tidy up its lists of dirtied and cleaned entries. + * These lists are not used in the distributed metadata write + * strategy, but they must be maintained should we shift + * to a strategy that uses them. + * + * Return: Success: non-negative + * + * Failure: negative + * + * Programmer: John Mainzer + * April 28, 2010 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__rsp__dist_md_write__flush_to_min_clean(H5F_t *f, hid_t dxpl_id) +{ + H5AC_t * cache_ptr; + H5AC_aux_t * aux_ptr; + hbool_t evictions_enabled; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(f != NULL); + cache_ptr = f->shared->cache; + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED); + + /* Query if evictions are allowed */ + if(H5C_get_evictions_enabled((const H5C_t *)cache_ptr, &evictions_enabled) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5C_get_evictions_enabled() failed.") + + if(evictions_enabled) { + /* construct candidate list -- process 0 only */ + if(aux_ptr->mpi_rank == 0) + if(H5AC__construct_candidate_list(cache_ptr, aux_ptr, H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.") + + /* propagate and apply candidate list -- all processes */ + if(H5AC__propagate_and_apply_candidate_list(f, dxpl_id) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't propagate and apply candidate list.") + } /* evictions enabled */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__rsp__dist_md_write__flush_to_min_clean() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC__rsp__p0_only__flush + * + * Purpose: Routine for handling the details of running a sync point + * that is triggered a flush -- which in turn must have been + * triggered by either a flush API call or a file close -- + * when the process 0 only metadata write strategy is selected. + * + * First, all processes participate in a barrier. + * + * Then process zero flushes all dirty entries, and broadcasts + * they number of clean entries (if any) to all the other + * caches. + * + * If this number is zero, we are done. + * + * Otherwise, process 0 broadcasts the list of cleaned + * entries, and all other processes which are part of this + * file group receive it, and mark the listed entries as + * clean in their caches. + * + * Since all processes have the same set of dirty + * entries at the beginning of the sync point, and all + * entries that will be written are written before + * process zero broadcasts the number of cleaned entries, + * there is no need for a closing barrier. + * + * Return: Success: non-negative + * + * Failure: negative + * + * Programmer: John Mainzer + * April 28, 2010 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__rsp__p0_only__flush(H5F_t *f, hid_t dxpl_id) +{ + H5AC_t * cache_ptr; + H5AC_aux_t * aux_ptr; + int mpi_result; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(f != NULL); + cache_ptr = f->shared->cache; + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY); + + /* to prevent "messages from the future" we must + * synchronize all processes before we start the flush. + * Hence the following barrier. + */ + if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result) + + /* Flush data to disk, from rank 0 process */ + if(aux_ptr->mpi_rank == 0) { + herr_t result; + + /* Enable writes during this operation */ + aux_ptr->write_permitted = TRUE; + + /* Flush the cache */ + result = H5C_flush_cache(f, dxpl_id, H5AC__NO_FLAGS_SET); + + /* Disable writes again */ + aux_ptr->write_permitted = FALSE; + + /* Check for error on the write operation */ + if(result < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.") + + /* this code exists primarily for the test bed -- it allows us to + * enforce posix semantics on the server that pretends to be a + * file system in our parallel tests. + */ + if(aux_ptr->write_done) + (aux_ptr->write_done)(); + } /* end if */ + + /* Propagate cleaned entries to other ranks. */ + if(H5AC__propagate_flushed_and_still_clean_entries_list(f, H5AC_dxpl_id) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't propagate clean entries list.") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__rsp__p0_only__flush() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC__rsp__p0_only__flush_to_min_clean + * + * Purpose: Routine for handling the details of running a sync point + * triggered by the accumulation of dirty metadata (as + * opposed to a flush call to the API) when the process 0 + * only metadata write strategy is selected. + * + * After invocation and initial sanity checking this function + * first checks to see if evictions are enabled -- if they + * are not, the function does nothing and returns. + * + * Otherwise, all processes participate in a barrier. + * + * After the barrier, if this is process 0, the function + * causes the cache to flush sufficient entries to get the + * cache back within its minimum clean fraction, and broadcast + * the number of entries which have been flushed since + * the last sync point, and are still clean. + * + * If this number is zero, we are done. + * + * Otherwise, process 0 broadcasts the list of cleaned + * entries, and all other processes which are part of this + * file group receive it, and mark the listed entries as + * clean in their caches. + * + * Since all processes have the same set of dirty + * entries at the beginning of the sync point, and all + * entries that will be written are written before + * process zero broadcasts the number of cleaned entries, + * there is no need for a closing barrier. + * + * Return: Success: non-negative + * + * Failure: negative + * + * Programmer: John Mainzer + * April 28, 2010 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__rsp__p0_only__flush_to_min_clean(H5F_t *f, hid_t dxpl_id) +{ + H5AC_t * cache_ptr; + H5AC_aux_t * aux_ptr; + hbool_t evictions_enabled; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + /* Sanity checks */ + HDassert(f != NULL); + cache_ptr = f->shared->cache; + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY); + + /* Query if evictions are allowed */ + if(H5C_get_evictions_enabled((const H5C_t *)cache_ptr, &evictions_enabled) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5C_get_evictions_enabled() failed.") + + /* Flush if evictions are allowed -- following call + * will cause process 0 to flush to min clean size, + * and then propagate the newly clean entries to the + * other processes. + * + * Otherwise, do nothing. + */ + if(evictions_enabled) { + int mpi_result; + + /* to prevent "messages from the future" we must synchronize all + * processes before we start the flush. This synchronization may + * already be done -- hence the do_barrier parameter. + */ + if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result) + + if(0 == aux_ptr->mpi_rank) { + herr_t result; + + /* here, process 0 flushes as many entries as necessary to + * comply with the currently specified min clean size. + * Note that it is quite possible that no entries will be + * flushed. + */ + + /* Enable writes during this operation */ + aux_ptr->write_permitted = TRUE; + + /* Flush the cache */ + result = H5C_flush_to_min_clean(f, dxpl_id); + + /* Disable writes again */ + aux_ptr->write_permitted = FALSE; + + /* Check for error on the write operation */ + if(result < 0) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.") + + /* this call exists primarily for the test code -- it is used + * to enforce POSIX semantics on the process used to simulate + * reads and writes in t_cache.c. + */ + if(aux_ptr->write_done) + (aux_ptr->write_done)(); + } /* end if */ + + if(H5AC__propagate_flushed_and_still_clean_entries_list(f, dxpl_id) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't propagate clean entries list.") + } /* end if */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__rsp__p0_only__flush_to_min_clean() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC__run_sync_point + * + * Purpose: Top level routine for managing a sync point between all + * meta data caches in the parallel case. Since all caches + * see the same sequence of dirty metadata, we simply count + * bytes of dirty metadata, and run a sync point whenever the + * number of dirty bytes of metadata seen since the last + * sync point exceeds a threshold that is common across all + * processes. We also run sync points in response to + * HDF5 API calls triggering either a flush or a file close. + * + * In earlier versions of PHDF5, only the metadata cache with + * mpi rank 0 was allowed to write to file. All other + * metadata caches on processes with rank greater than 0 were + * required to retain dirty entries until they were notified + * that the entry is was clean. + * + * This function was created to make it easier for us to + * experiment with other options, as it is a single point + * for the execution of sync points. + * + * Return: Success: non-negative + * + * Failure: negative + * + * Programmer: John Mainzer + * March 11, 2010 + * + *------------------------------------------------------------------------- + */ +herr_t +H5AC__run_sync_point(H5F_t *f, hid_t dxpl_id, int sync_point_op) +{ + H5AC_t * cache_ptr; + H5AC_aux_t * aux_ptr; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_PACKAGE + + /* Sanity checks */ + HDassert(f != NULL); + cache_ptr = f->shared->cache; + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert((sync_point_op == H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) || + (sync_point_op == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED)); + +#if H5AC_DEBUG_DIRTY_BYTES_CREATION +HDfprintf(stdout, "%d:H5AC_propagate...:%u: (u/uu/i/iu/r/ru) = %zu/%u/%zu/%u/%zu/%u\n", + aux_ptr->mpi_rank, + aux_ptr->dirty_bytes_propagations, + aux_ptr->unprotect_dirty_bytes, + aux_ptr->unprotect_dirty_bytes_updates, + aux_ptr->insert_dirty_bytes, + aux_ptr->insert_dirty_bytes_updates, + aux_ptr->rename_dirty_bytes, + aux_ptr->rename_dirty_bytes_updates); +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + + switch(aux_ptr->metadata_write_strategy) { + case H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY: + switch(sync_point_op) { + case H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN: + if(H5AC__rsp__p0_only__flush_to_min_clean(f, dxpl_id) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5AC__rsp__p0_only__flush_to_min_clean() failed.") + break; + + case H5AC_SYNC_POINT_OP__FLUSH_CACHE: + if(H5AC__rsp__p0_only__flush(f, dxpl_id) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5AC__rsp__p0_only__flush() failed.") + break; + + default: + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "unknown flush op"); + break; + } /* end switch */ + break; + + case H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED: + switch(sync_point_op) { + case H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN: + if(H5AC__rsp__dist_md_write__flush_to_min_clean(f, dxpl_id) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5AC__rsp__dist_md_write__flush_to_min_clean() failed.") + break; + + case H5AC_SYNC_POINT_OP__FLUSH_CACHE: + if(H5AC__rsp__dist_md_write__flush(f, dxpl_id) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5AC__rsp__dist_md_write__flush() failed.") + break; + + default: + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "unknown flush op"); + break; + } /* end switch */ + break; + + default: + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Unknown metadata write strategy.") + break; + } /* end switch */ + + /* reset the dirty bytes count */ + aux_ptr->dirty_bytes = 0; + +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->dirty_bytes_propagations += 1; + aux_ptr->unprotect_dirty_bytes = 0; + aux_ptr->unprotect_dirty_bytes_updates = 0; + aux_ptr->insert_dirty_bytes = 0; + aux_ptr->insert_dirty_bytes_updates = 0; + aux_ptr->rename_dirty_bytes = 0; + aux_ptr->rename_dirty_bytes_updates = 0; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__run_sync_point() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC__tidy_cache_0_lists() + * + * Purpose: In the distributed metadata write strategy, not all dirty + * entries are written by process 0 -- thus we must tidy + * up the dirtied, and flushed and still clean lists + * maintained by process zero after each sync point. + * + * This procedure exists to tend to this issue. + * + * At this point, all entries that process 0 cleared should + * have been removed from both the dirty and flushed and + * still clean lists, and entries that process 0 has flushed + * should have been removed from the dirtied list and added + * to the flushed and still clean list. + * + * However, since the distributed metadata write strategy + * doesn't make use of these lists, the objective is simply + * to maintain these lists in consistent state that allows + * them to be used should the metadata write strategy change + * to one that uses these lists. + * + * Thus for our purposes, all we need to do is remove from + * the dirtied and flushed and still clean lists all + * references to entries that appear in the candidate list. + * + * Return: Success: non-negative + * + * Failure: negative + * + * Programmer: John Mainzer + * 4/20/10 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5AC__tidy_cache_0_lists(H5AC_t *cache_ptr, int num_candidates, + haddr_t *candidates_list_ptr) +{ + H5AC_aux_t * aux_ptr; + int i; + + FUNC_ENTER_STATIC_NOERR + + /* Sanity checks */ + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + aux_ptr = (H5AC_aux_t *)(cache_ptr->aux_ptr); + HDassert(aux_ptr != NULL); + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED); + HDassert(aux_ptr->mpi_rank == 0); + HDassert(num_candidates > 0); + HDassert(candidates_list_ptr != NULL); + + /* clean up dirtied and flushed and still clean lists by removing + * all entries on the candidate list. Cleared entries should + * have been removed from both the dirty and cleaned lists at + * this point, flushed entries should have been added to the + * cleaned list. However, for this metadata write strategy, + * we just want to remove all references to the candidate entries. + */ + for(i = 0; i < num_candidates; i++) { + H5AC_slist_entry_t * d_slist_entry_ptr; + H5AC_slist_entry_t * c_slist_entry_ptr; + haddr_t addr; + + addr = candidates_list_ptr[i]; + + /* addr may be either on the dirtied list, or on the flushed + * and still clean list. Remove it. + */ + if(NULL != (d_slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)&addr))) + d_slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, d_slist_entry_ptr); + if(NULL != (c_slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)&addr))) + c_slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, c_slist_entry_ptr); + } /* end for */ + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* H5AC__tidy_cache_0_lists() */ + + +/*------------------------------------------------------------------------- + * Function: H5AC__flush_entries + * + * Purpose: Flush the metadata cache associated with the specified file, + * only writing from rank 0, but propagating the cleaned entries + * to all ranks. + * + * Return: Non-negative on success/Negative on failure if there was a + * request to flush all items and something was protected. + * + * Programmer: Quincey Koziol + * koziol@hdfgroup.org + * Aug 22 2009 + * + *------------------------------------------------------------------------- + */ +herr_t +H5AC__flush_entries(H5F_t *f, hid_t dxpl_id) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_PACKAGE + + /* Sanity checks */ + HDassert(f); + HDassert(f->shared->cache); + + /* Check if we have >1 ranks */ + if(f->shared->cache->aux_ptr) + if(H5AC__run_sync_point(f, dxpl_id, H5AC_SYNC_POINT_OP__FLUSH_CACHE) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't run sync point.") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5AC__flush_entries() */ +#endif /* H5_HAVE_PARALLEL */ + |