summaryrefslogtreecommitdiffstats
path: root/src/H5ACmpio.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/H5ACmpio.c')
-rw-r--r--src/H5ACmpio.c2274
1 files changed, 2274 insertions, 0 deletions
diff --git a/src/H5ACmpio.c b/src/H5ACmpio.c
new file mode 100644
index 0000000..6c2b924
--- /dev/null
+++ b/src/H5ACmpio.c
@@ -0,0 +1,2274 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the files COPYING and Copyright.html. COPYING can be found at the root *
+ * of the source code distribution tree; Copyright.html can be found at the *
+ * root level of an installed copy of the electronic HDF5 document set and *
+ * is linked from the top-level documents page. It can also be found at *
+ * http://hdfgroup.org/HDF5/doc/Copyright.html. If you do not have *
+ * access to either file, you may request a copy from help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*-------------------------------------------------------------------------
+ *
+ * Created: H5ACmpio.c
+ * Jun 20 2015
+ * Quincey Koziol <koziol@hdfgroup.org>
+ *
+ * Purpose: Functions in this file implement support for parallel
+ * I/O cache functionality
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/****************/
+/* Module Setup */
+/****************/
+
+#define H5AC_PACKAGE /*suppress error about including H5ACpkg */
+#define H5F_PACKAGE /*suppress error about including H5Fpkg */
+
+/* Interface initialization */
+#define H5_INTERFACE_INIT_FUNC H5AC__init_mpio_interface
+
+/***********/
+/* Headers */
+/***********/
+#include "H5private.h" /* Generic Functions */
+#include "H5ACpkg.h" /* Metadata cache */
+#include "H5Cprivate.h" /* Cache */
+#include "H5Eprivate.h" /* Error handling */
+#include "H5Fpkg.h" /* Files */
+#include "H5MMprivate.h" /* Memory management */
+
+#ifdef H5_HAVE_PARALLEL
+
+/****************/
+/* Local Macros */
+/****************/
+
+
+/******************/
+/* Local Typedefs */
+/******************/
+
+/****************************************************************************
+ *
+ * structure H5AC_slist_entry_t
+ *
+ * The dirty entry list maintained via the d_slist_ptr field of H5AC_aux_t
+ * and the cleaned entry list maintained via the c_slist_ptr field of
+ * H5AC_aux_t are just lists of the file offsets of the dirty/cleaned
+ * entries. Unfortunately, the slist code makes us define a dynamically
+ * allocated structure to store these offsets in. This structure serves
+ * that purpose. Its fields are as follows:
+ *
+ * addr: file offset of a metadata entry. Entries are added to this
+ * list (if they aren't there already) when they are marked
+ * dirty in an unprotect, inserted, or moved. They are
+ * removed when they appear in a clean entries broadcast.
+ *
+ ****************************************************************************/
+typedef struct H5AC_slist_entry_t
+{
+ haddr_t addr;
+} H5AC_slist_entry_t;
+
+/* User data for address list building callbacks */
+typedef struct H5AC_addr_list_ud_t
+{
+ H5AC_aux_t * aux_ptr; /* 'Auxiliary' parallel cache info */
+ haddr_t * addr_buf_ptr; /* Array to store addresses */
+ int i; /* Counter for position in array */
+} H5AC_addr_list_ud_t;
+
+
+/********************/
+/* Local Prototypes */
+/********************/
+
+static herr_t H5AC__broadcast_candidate_list(H5AC_t *cache_ptr,
+ int *num_entries_ptr, haddr_t **haddr_buf_ptr_ptr);
+static herr_t H5AC__broadcast_clean_list(H5AC_t *cache_ptr);
+static herr_t H5AC__construct_candidate_list(H5AC_t *cache_ptr,
+ H5AC_aux_t *aux_ptr, int sync_point_op);
+static herr_t H5AC__copy_candidate_list_to_buffer(const H5AC_t *cache_ptr,
+ int *num_entries_ptr, haddr_t **haddr_buf_ptr_ptr);
+static herr_t H5AC__propagate_and_apply_candidate_list(H5F_t *f, hid_t dxpl_id);
+static herr_t H5AC__propagate_flushed_and_still_clean_entries_list(H5F_t *f,
+ hid_t dxpl_id);
+static herr_t H5AC__receive_haddr_list(MPI_Comm mpi_comm, int *num_entries_ptr,
+ haddr_t **haddr_buf_ptr_ptr);
+static herr_t H5AC__receive_candidate_list(const H5AC_t *cache_ptr,
+ int *num_entries_ptr, haddr_t **haddr_buf_ptr_ptr);
+static herr_t H5AC__receive_and_apply_clean_list(H5F_t *f, hid_t dxpl_id);
+static herr_t H5AC__tidy_cache_0_lists(H5AC_t *cache_ptr, int num_candidates,
+ haddr_t *candidates_list_ptr);
+static herr_t H5AC__rsp__dist_md_write__flush(H5F_t *f, hid_t dxpl_id);
+static herr_t H5AC__rsp__dist_md_write__flush_to_min_clean(H5F_t *f, hid_t dxpl_id);
+static herr_t H5AC__rsp__p0_only__flush(H5F_t *f, hid_t dxpl_id);
+static herr_t H5AC__rsp__p0_only__flush_to_min_clean(H5F_t *f, hid_t dxpl_id);
+
+
+/*********************/
+/* Package Variables */
+/*********************/
+
+/* Declare a free list to manage the H5AC_aux_t struct */
+H5FL_DEFINE(H5AC_aux_t);
+
+
+/*****************************/
+/* Library Private Variables */
+/*****************************/
+
+
+/*******************/
+/* Local Variables */
+/*******************/
+
+/* Declare a free list to manage the H5AC_slist_entry_t struct */
+H5FL_DEFINE_STATIC(H5AC_slist_entry_t);
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__init_mpio_interface
+ *
+ * Purpose: Initialize interface-specific information
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Quincey Koziol
+ * 6/20/15
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__init_mpio_interface(void)
+{
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Funnel all work to H5AC_init() */
+ if(H5AC_init() < 0)
+ HGOTO_ERROR(H5E_FUNC, H5E_CANTINIT, FAIL, "interface initialization failed")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__init_mpio_interface() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__set_sync_point_done_callback
+ *
+ * Purpose: Set the value of the sync_point_done callback. This
+ * callback is used by the parallel test code to verify
+ * that the expected writes and only the expected writes
+ * take place during a sync point.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer
+ * 5/9/10
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5AC__set_sync_point_done_callback(H5C_t * cache_ptr,
+ void (* sync_point_done)(int num_writes, haddr_t * written_entries_tbl))
+{
+ H5AC_aux_t * aux_ptr;
+
+ FUNC_ENTER_PACKAGE_NOERR
+
+ /* Sanity checks */
+ HDassert(cache_ptr);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+
+ aux_ptr->sync_point_done = sync_point_done;
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+} /* H5AC__set_sync_point_done_callback() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__set_write_done_callback
+ *
+ * Purpose: Set the value of the write_done callback. This callback
+ * is used to improve performance of the parallel test bed
+ * for the cache.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer
+ * 5/11/06
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5AC__set_write_done_callback(H5C_t * cache_ptr, void (* write_done)(void))
+{
+ H5AC_aux_t * aux_ptr;
+
+ FUNC_ENTER_PACKAGE_NOERR
+
+ /* Sanity checks */
+ HDassert(cache_ptr);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+
+ aux_ptr->write_done = write_done;
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+} /* H5AC__set_write_done_callback() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC_add_candidate()
+ *
+ * Purpose: Add the supplied metadata entry address to the candidate
+ * list. Verify that each entry added does not appear in
+ * the list prior to its insertion.
+ *
+ * This function is intended for used in constructing list
+ * of entried to be flushed during sync points. It shouldn't
+ * be called anywhere else.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer
+ * 3/17/10
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5AC_add_candidate(H5AC_t * cache_ptr, haddr_t addr)
+{
+ H5AC_aux_t * aux_ptr;
+ H5AC_slist_entry_t * slist_entry_ptr = NULL;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(FAIL)
+
+ /* Sanity checks */
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
+ HDassert(aux_ptr->candidate_slist_ptr != NULL);
+
+ /* Construct an entry for the supplied address, and insert
+ * it into the candidate slist.
+ */
+ if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t)))
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate candidate slist entry")
+ slist_entry_ptr->addr = addr;
+
+ if(H5SL_insert(aux_ptr->candidate_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into dirty entry slist")
+
+done:
+ /* Clean up on error */
+ if(ret_value < 0)
+ if(slist_entry_ptr)
+ slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC_add_candidate() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC__broadcast_candidate_list()
+ *
+ * Purpose: Broadcast the contents of the process 0 candidate entry
+ * slist. In passing, also remove all entries from said
+ * list. As the application of this will be handled by
+ * the same functions on all processes, construct and
+ * return a copy of the list in the same format as that
+ * received by the other processes. Note that if this
+ * copy is returned in *haddr_buf_ptr_ptr, the caller
+ * must free it.
+ *
+ * This function must only be called by the process with
+ * MPI_rank 0.
+ *
+ * Return SUCCEED on success, and FAIL on failure.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: John Mainzer, 7/1/05
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__broadcast_candidate_list(H5AC_t *cache_ptr, int *num_entries_ptr,
+ haddr_t **haddr_buf_ptr_ptr)
+{
+ H5AC_aux_t * aux_ptr = NULL;
+ haddr_t * haddr_buf_ptr = NULL;
+ int mpi_result;
+ int num_entries;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->mpi_rank == 0);
+ HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
+ HDassert(aux_ptr->candidate_slist_ptr != NULL);
+ HDassert(num_entries_ptr != NULL);
+ HDassert(*num_entries_ptr == 0);
+ HDassert(haddr_buf_ptr_ptr != NULL);
+ HDassert(*haddr_buf_ptr_ptr == NULL);
+
+ /* First broadcast the number of entries in the list so that the
+ * receivers can set up buffers to receive them. If there aren't
+ * any, we are done.
+ */
+ num_entries = (int)H5SL_count(aux_ptr->candidate_slist_ptr);
+ if(MPI_SUCCESS != (mpi_result = MPI_Bcast(&num_entries, 1, MPI_INT, 0, aux_ptr->mpi_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
+
+ if(num_entries > 0) {
+ size_t buf_size = 0;
+ int chk_num_entries = 0;
+
+ /* convert the candidate list into the format we
+ * are used to receiving from process 0, and also load it
+ * into a buffer for transmission.
+ */
+ if(H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
+ HDassert(chk_num_entries == num_entries);
+ HDassert(haddr_buf_ptr != NULL);
+
+ /* Now broadcast the list of candidate entries */
+ buf_size = sizeof(haddr_t) * (size_t)num_entries;
+ if(MPI_SUCCESS != (mpi_result = MPI_Bcast((void *)haddr_buf_ptr, (int)buf_size, MPI_BYTE, 0, aux_ptr->mpi_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
+ } /* end if */
+
+ /* Pass the number of entries and the buffer pointer
+ * back to the caller. Do this so that we can use the same code
+ * to apply the candidate list to all the processes.
+ */
+ *num_entries_ptr = num_entries;
+ *haddr_buf_ptr_ptr = haddr_buf_ptr;
+
+done:
+ if(ret_value < 0)
+ if(haddr_buf_ptr)
+ haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__broadcast_candidate_list() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC__broadcast_clean_list_cb()
+ *
+ * Purpose: Skip list callback for building array of addresses for
+ * broadcasting the clean list.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: Quincey Koziol, 6/12/15
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__broadcast_clean_list_cb(void *_item, void H5_ATTR_UNUSED *_key,
+ void *_udata)
+{
+ H5AC_slist_entry_t * slist_entry_ptr = (H5AC_slist_entry_t *)_item; /* Address of item */
+ H5AC_addr_list_ud_t * udata = (H5AC_addr_list_ud_t *)_udata; /* Context for callback */
+ haddr_t addr;
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Sanity checks */
+ HDassert(slist_entry_ptr);
+ HDassert(udata);
+
+ /* Store the entry's address in the buffer */
+ addr = slist_entry_ptr->addr;
+ udata->addr_buf_ptr[udata->i] = addr;
+ udata->i++;
+
+ /* now release the entry */
+ slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
+
+ /* and also remove the matching entry from the dirtied list
+ * if it exists.
+ */
+ if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(udata->aux_ptr->d_slist_ptr, (void *)(&addr))))
+ slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+} /* H5AC__broadcast_clean_list_cb() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC__broadcast_clean_list()
+ *
+ * Purpose: Broadcast the contents of the process 0 cleaned entry
+ * slist. In passing, also remove all entries from said
+ * list, and also remove any matching entries from the dirtied
+ * slist.
+ *
+ * This function must only be called by the process with
+ * MPI_rank 0.
+ *
+ * Return SUCCEED on success, and FAIL on failure.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: John Mainzer, 7/1/05
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__broadcast_clean_list(H5AC_t * cache_ptr)
+{
+ haddr_t * addr_buf_ptr = NULL;
+ H5AC_aux_t * aux_ptr;
+ int mpi_result;
+ int num_entries = 0;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->mpi_rank == 0);
+ HDassert(aux_ptr->c_slist_ptr != NULL);
+
+ /* First broadcast the number of entries in the list so that the
+ * receives can set up a buffer to receive them. If there aren't
+ * any, we are done.
+ */
+ num_entries = (int)H5SL_count(aux_ptr->c_slist_ptr);
+ if(MPI_SUCCESS != (mpi_result = MPI_Bcast(&num_entries, 1, MPI_INT, 0, aux_ptr->mpi_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
+
+ if(num_entries > 0) {
+ H5AC_addr_list_ud_t udata;
+ size_t buf_size;
+
+ /* allocate a buffer to store the list of entry base addresses in */
+ buf_size = sizeof(haddr_t) * (size_t)num_entries;
+ if(NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer")
+
+ /* Set up user data for callback */
+ udata.aux_ptr = aux_ptr;
+ udata.addr_buf_ptr = addr_buf_ptr;
+ udata.i = 0;
+
+ /* Free all the clean list entries, building the address list in the callback */
+ /* (Callback also removes the matching entries from the dirtied list) */
+ if(H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries")
+
+ /* Now broadcast the list of cleaned entries */
+ if(MPI_SUCCESS != (mpi_result = MPI_Bcast((void *)addr_buf_ptr, (int)buf_size, MPI_BYTE, 0, aux_ptr->mpi_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
+ } /* end if */
+
+ /* if it is defined, call the sync point done callback. Note
+ * that this callback is defined purely for testing purposes,
+ * and should be undefined under normal operating circumstances.
+ */
+ if(aux_ptr->sync_point_done)
+ (aux_ptr->sync_point_done)(num_entries, addr_buf_ptr);
+
+done:
+ if(addr_buf_ptr)
+ addr_buf_ptr = (haddr_t *)H5MM_xfree((void *)addr_buf_ptr);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__broadcast_clean_list() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__construct_candidate_list()
+ *
+ * Purpose: In the parallel case when the metadata_write_strategy is
+ * H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED, process 0 uses
+ * this function to construct the list of cache entries to
+ * be flushed. This list is then propagated to the other
+ * caches, and then flushed in a distributed fashion.
+ *
+ * The sync_point_op parameter is used to determine the extent
+ * of the flush.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: John Mainzer
+ * 3/17/10
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__construct_candidate_list(H5AC_t *cache_ptr, H5AC_aux_t *aux_ptr,
+ int sync_point_op)
+{
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(cache_ptr != NULL);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
+ HDassert((sync_point_op == H5AC_SYNC_POINT_OP__FLUSH_CACHE) || (aux_ptr->mpi_rank == 0));
+ HDassert(aux_ptr->d_slist_ptr != NULL);
+ HDassert(aux_ptr->c_slist_ptr != NULL);
+ HDassert(H5SL_count(aux_ptr->c_slist_ptr) == 0);
+ HDassert(aux_ptr->candidate_slist_ptr != NULL);
+ HDassert(H5SL_count(aux_ptr->candidate_slist_ptr) == 0);
+ HDassert((sync_point_op == H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) || (sync_point_op == H5AC_SYNC_POINT_OP__FLUSH_CACHE));
+
+ switch(sync_point_op) {
+ case H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN:
+ if(H5C_construct_candidate_list__min_clean((H5C_t *)cache_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_construct_candidate_list__min_clean() failed.")
+ break;
+
+ case H5AC_SYNC_POINT_OP__FLUSH_CACHE:
+ if(H5C_construct_candidate_list__clean_cache((H5C_t *)cache_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_construct_candidate_list__clean_cache() failed.")
+ break;
+
+ default:
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "unknown sync point operation.")
+ break;
+ } /* end switch */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__construct_candidate_list() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC__copy_candidate_list_to_buffer_cb
+ *
+ * Purpose: Skip list callback for building array of addresses for
+ * broadcasting the candidate list.
+ *
+ * Return: Return SUCCEED on success, and FAIL on failure.
+ *
+ * Programmer: Quincey Koziol, 6/12/15
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__copy_candidate_list_to_buffer_cb(void *_item, void H5_ATTR_UNUSED *_key,
+ void *_udata)
+{
+ H5AC_slist_entry_t * slist_entry_ptr = (H5AC_slist_entry_t *)_item; /* Address of item */
+ H5AC_addr_list_ud_t * udata = (H5AC_addr_list_ud_t *)_udata; /* Context for callback */
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Sanity checks */
+ HDassert(slist_entry_ptr);
+ HDassert(udata);
+
+ /* Store the entry's address in the buffer */
+ udata->addr_buf_ptr[udata->i] = slist_entry_ptr->addr;
+ udata->i++;
+
+ /* now release the entry */
+ slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+} /* H5AC__copy_candidate_list_to_buffer_cb() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC__copy_candidate_list_to_buffer
+ *
+ * Purpose: Allocate buffer(s) and copy the contents of the candidate
+ * entry slist into it (them). In passing, remove all
+ * entries from the candidate slist. Note that the
+ * candidate slist must not be empty.
+ *
+ * If MPI_Offset_buf_ptr_ptr is not NULL, allocate a buffer
+ * of MPI_Offset, copy the contents of the candidate
+ * entry list into it with the appropriate conversions,
+ * and return the base address of the buffer in
+ * *MPI_Offset_buf_ptr. Note that this is the buffer
+ * used by process 0 to transmit the list of entries to
+ * be flushed to all other processes (in this file group).
+ *
+ * Similarly, allocate a buffer of haddr_t, load the contents
+ * of the candidate list into this buffer, and return its
+ * base address in *haddr_buf_ptr_ptr. Note that this
+ * latter buffer is constructed unconditionally.
+ *
+ * In passing, also remove all entries from the candidate
+ * entry slist.
+ *
+ * Return: Return SUCCEED on success, and FAIL on failure.
+ *
+ * Programmer: John Mainzer, 4/19/10
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__copy_candidate_list_to_buffer(const H5AC_t *cache_ptr, int *num_entries_ptr,
+ haddr_t **haddr_buf_ptr_ptr)
+{
+ H5AC_aux_t * aux_ptr = NULL;
+ H5AC_addr_list_ud_t udata;
+ haddr_t * haddr_buf_ptr = NULL;
+ size_t buf_size;
+ int num_entries = 0;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
+ HDassert(aux_ptr->candidate_slist_ptr != NULL);
+ HDassert(H5SL_count(aux_ptr->candidate_slist_ptr) > 0);
+ HDassert(num_entries_ptr != NULL);
+ HDassert(*num_entries_ptr == 0);
+ HDassert(haddr_buf_ptr_ptr != NULL);
+ HDassert(*haddr_buf_ptr_ptr == NULL);
+
+ num_entries = (int)H5SL_count(aux_ptr->candidate_slist_ptr);
+
+ /* allocate a buffer(s) to store the list of candidate entry
+ * base addresses in
+ */
+ buf_size = sizeof(haddr_t) * (size_t)num_entries;
+ if(NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
+
+ /* Set up user data for callback */
+ udata.aux_ptr = aux_ptr;
+ udata.addr_buf_ptr = haddr_buf_ptr;
+ udata.i = 0;
+
+ /* Free all the candidate list entries, building the address list in the callback */
+ if(H5SL_free(aux_ptr->candidate_slist_ptr, H5AC__copy_candidate_list_to_buffer_cb, &udata) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for candidate entries")
+
+ /* Pass the number of entries and the buffer pointer
+ * back to the caller.
+ */
+ *num_entries_ptr = num_entries;
+ *haddr_buf_ptr_ptr = haddr_buf_ptr;
+
+done:
+ if(ret_value < 0)
+ if(haddr_buf_ptr)
+ haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__copy_candidate_list_to_buffer() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC__log_deleted_entry()
+ *
+ * Purpose: Log an entry which has been deleted.
+ *
+ * Only called for mpi_rank 0. We must make sure that the entry
+ * doesn't appear in the cleaned or dirty entry lists.
+ *
+ * Return SUCCEED on success, and FAIL on failure.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: John Mainzer, 6/29/05
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5AC__log_deleted_entry(const H5AC_info_t *entry_ptr)
+{
+ H5AC_t * cache_ptr;
+ H5AC_aux_t * aux_ptr;
+ H5AC_slist_entry_t * slist_entry_ptr = NULL;
+ haddr_t addr;
+
+ FUNC_ENTER_PACKAGE_NOERR
+
+ /* Sanity checks */
+ HDassert(entry_ptr);
+ addr = entry_ptr->addr;
+ cache_ptr = entry_ptr->cache_ptr;
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->mpi_rank == 0);
+ HDassert(aux_ptr->d_slist_ptr != NULL);
+ HDassert(aux_ptr->c_slist_ptr != NULL);
+
+ /* if the entry appears in the dirtied entry slist, remove it. */
+ if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&addr))))
+ slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
+
+ /* if the entry appears in the cleaned entry slist, remove it. */
+ if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr))))
+ slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+} /* H5AC__log_deleted_entry() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC__log_dirtied_entry()
+ *
+ * Purpose: Update the dirty_bytes count for a newly dirtied entry.
+ *
+ * If mpi_rank isn't 0, this simply means adding the size
+ * of the entries to the dirty_bytes count.
+ *
+ * If mpi_rank is 0, we must first check to see if the entry
+ * appears in the dirty entries slist. If it is, do nothing.
+ * If it isn't, add the size to th dirty_bytes count, add the
+ * entry to the dirty entries slist, and remove it from the
+ * cleaned list (if it is present there).
+ *
+ * Return SUCCEED on success, and FAIL on failure.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: John Mainzer, 6/29/05
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5AC__log_dirtied_entry(const H5AC_info_t *entry_ptr)
+{
+ H5AC_t * cache_ptr;
+ H5AC_aux_t * aux_ptr;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Sanity checks */
+ HDassert(entry_ptr);
+ HDassert(entry_ptr->is_dirty == FALSE);
+ cache_ptr = entry_ptr->cache_ptr;
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+
+ if(aux_ptr->mpi_rank == 0) {
+ H5AC_slist_entry_t *slist_entry_ptr;
+ haddr_t addr = entry_ptr->addr;
+
+ /* Sanity checks */
+ HDassert(aux_ptr->d_slist_ptr != NULL);
+ HDassert(aux_ptr->c_slist_ptr != NULL);
+
+ if(NULL == H5SL_search(aux_ptr->d_slist_ptr, (void *)(&addr))) {
+ /* insert the address of the entry in the dirty entry list, and
+ * add its size to the dirty_bytes count.
+ */
+ if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t)))
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate dirty slist entry .")
+ slist_entry_ptr->addr = addr;
+
+ if(H5SL_insert(aux_ptr->d_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into dirty entry slist.")
+
+ aux_ptr->dirty_bytes += entry_ptr->size;
+#if H5AC_DEBUG_DIRTY_BYTES_CREATION
+ aux_ptr->unprotect_dirty_bytes += entry_ptr->size;
+ aux_ptr->unprotect_dirty_bytes_updates += 1;
+#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
+ } /* end if */
+
+ /* the entry is dirty. If it exists on the cleaned entries list,
+ * remove it.
+ */
+ if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr))))
+ slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
+ } /* end if */
+ else {
+ aux_ptr->dirty_bytes += entry_ptr->size;
+#if H5AC_DEBUG_DIRTY_BYTES_CREATION
+ aux_ptr->unprotect_dirty_bytes += entry_size;
+ aux_ptr->unprotect_dirty_bytes_updates += 1;
+#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
+ } /* end else */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__log_dirtied_entry() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC__log_flushed_entry()
+ *
+ * Purpose: Update the clean entry slist for the flush of an entry --
+ * specifically, if the entry has been cleared, remove it
+ * from both the cleaned and dirtied lists if it is present.
+ * Otherwise, if the entry was dirty, insert the indicated
+ * entry address in the clean slist if it isn't there already.
+ *
+ * This function is only used in PHDF5, and should only
+ * be called for the process with mpi rank 0.
+ *
+ * Return SUCCEED on success, and FAIL on failure.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: John Mainzer, 6/29/05
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5AC__log_flushed_entry(H5C_t *cache_ptr, haddr_t addr, hbool_t was_dirty,
+ unsigned flags)
+{
+ hbool_t cleared;
+ H5AC_aux_t * aux_ptr;
+ H5AC_slist_entry_t * slist_entry_ptr = NULL;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Sanity check */
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->mpi_rank == 0);
+ HDassert(aux_ptr->c_slist_ptr != NULL);
+
+ /* Set local flags */
+ cleared = ((flags & H5C__FLUSH_CLEAR_ONLY_FLAG) != 0);
+
+ if(cleared) {
+ /* If the entry has been cleared, must remove it from both the
+ * cleaned list and the dirtied list.
+ */
+ if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr))))
+ slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
+ if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&addr))))
+ slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
+ } /* end if */
+ else if(was_dirty) {
+ if(NULL == H5SL_search(aux_ptr->c_slist_ptr, (void *)(&addr))) {
+ /* insert the address of the entry in the clean entry list. */
+ if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t)))
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate clean slist entry .")
+ slist_entry_ptr->addr = addr;
+
+ if(H5SL_insert(aux_ptr->c_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into clean entry slist.")
+ } /* end if */
+ } /* end else-if */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__log_flushed_entry() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC__log_inserted_entry()
+ *
+ * Purpose: Update the dirty_bytes count for a newly inserted entry.
+ *
+ * If mpi_rank isnt 0, this simply means adding the size
+ * of the entry to the dirty_bytes count.
+ *
+ * If mpi_rank is 0, we must also add the entry to the
+ * dirty entries slist.
+ *
+ * Return SUCCEED on success, and FAIL on failure.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: John Mainzer, 6/30/05
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5AC__log_inserted_entry(const H5AC_info_t *entry_ptr)
+{
+ H5AC_t * cache_ptr;
+ H5AC_aux_t * aux_ptr;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Sanity checks */
+ HDassert(entry_ptr);
+ cache_ptr = entry_ptr->cache_ptr;
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+
+ if(aux_ptr->mpi_rank == 0) {
+ H5AC_slist_entry_t *slist_entry_ptr;
+
+ HDassert(aux_ptr->d_slist_ptr != NULL);
+ HDassert(aux_ptr->c_slist_ptr != NULL);
+
+ /* Entry to insert should not be in dirty list currently */
+ if(NULL != H5SL_search(aux_ptr->d_slist_ptr, (const void *)(&entry_ptr->addr)))
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Inserted entry already in dirty slist.")
+
+ /* insert the address of the entry in the dirty entry list, and
+ * add its size to the dirty_bytes count.
+ */
+ if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t)))
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate dirty slist entry .")
+ slist_entry_ptr->addr = entry_ptr->addr;
+ if(H5SL_insert(aux_ptr->d_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into dirty entry slist.")
+
+ /* Entry to insert should not be in clean list either */
+ if(NULL != H5SL_search(aux_ptr->c_slist_ptr, (const void *)(&entry_ptr->addr)))
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Inserted entry in clean slist.")
+ } /* end if */
+
+ aux_ptr->dirty_bytes += entry_ptr->size;
+
+#if H5AC_DEBUG_DIRTY_BYTES_CREATION
+ aux_ptr->insert_dirty_bytes += size;
+ aux_ptr->insert_dirty_bytes_updates += 1;
+#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__log_inserted_entry() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC__log_moved_entry()
+ *
+ * Purpose: Update the dirty_bytes count for a moved entry.
+ *
+ * WARNING
+ *
+ * At present, the way that the move call is used ensures
+ * that the moved entry is present in all caches by
+ * moving in a collective operation and immediately after
+ * unprotecting the target entry.
+ *
+ * This function uses this invariant, and will cause arcane
+ * failures if it is not met. If maintaining this invariant
+ * becomes impossible, we will have to rework this function
+ * extensively, and likely include a bit of IPC for
+ * synchronization. A better option might be to subsume
+ * move in the unprotect operation.
+ *
+ * Given that the target entry is in all caches, the function
+ * proceeds as follows:
+ *
+ * For processes with mpi rank other 0, it simply checks to
+ * see if the entry was dirty prior to the move, and adds
+ * the entries size to the dirty bytes count.
+ *
+ * In the process with mpi rank 0, the function first checks
+ * to see if the entry was dirty prior to the move. If it
+ * was, and if the entry doesn't appear in the dirtied list
+ * under its old address, it adds the entry's size to the
+ * dirty bytes count.
+ *
+ * The rank 0 process then removes any references to the
+ * entry under its old address from the cleands and dirtied
+ * lists, and inserts an entry in the dirtied list under the
+ * new address.
+ *
+ * Return SUCCEED on success, and FAIL on failure.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: John Mainzer, 6/30/05
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5AC__log_moved_entry(const H5F_t *f, haddr_t old_addr, haddr_t new_addr)
+{
+ H5AC_t * cache_ptr;
+ H5AC_aux_t * aux_ptr;
+ hbool_t entry_in_cache;
+ hbool_t entry_dirty;
+ size_t entry_size;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Sanity checks */
+ HDassert(f);
+ HDassert(f->shared);
+ cache_ptr = (H5AC_t *)f->shared->cache;
+ HDassert(cache_ptr);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+
+ /* get entry status, size, etc here */
+ if(H5C_get_entry_status(f, old_addr, &entry_size, &entry_in_cache,
+ &entry_dirty, NULL, NULL, NULL, NULL) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't get entry status.")
+ if(!entry_in_cache)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "entry not in cache.")
+
+ if(aux_ptr->mpi_rank == 0) {
+ H5AC_slist_entry_t * slist_entry_ptr;
+
+ HDassert(aux_ptr->d_slist_ptr != NULL);
+ HDassert(aux_ptr->c_slist_ptr != NULL);
+
+ /* if the entry appears in the cleaned entry slist, under its old
+ * address, remove it.
+ */
+ if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&old_addr))))
+ slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
+
+ /* if the entry appears in the dirtied entry slist under its old
+ * address, remove it, but don't free it. Set addr to new_addr.
+ */
+ if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&old_addr))))
+ slist_entry_ptr->addr = new_addr;
+ else {
+ /* otherwise, allocate a new entry that is ready
+ * for insertion, and increment dirty_bytes.
+ *
+ * Note that the fact that the entry wasn't in the dirtied
+ * list under its old address implies that it must have
+ * been clean to start with.
+ */
+ HDassert(!entry_dirty);
+ if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t)))
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate dirty slist entry .")
+ slist_entry_ptr->addr = new_addr;
+
+ aux_ptr->dirty_bytes += entry_size;
+
+#if H5AC_DEBUG_DIRTY_BYTES_CREATION
+ aux_ptr->move_dirty_bytes += entry_size;
+ aux_ptr->move_dirty_bytes_updates += 1;
+#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
+ } /* end else */
+
+ /* insert / reinsert the entry in the dirty slist */
+ if(H5SL_insert(aux_ptr->d_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into dirty entry slist.")
+ } /* end if */
+ else if(!entry_dirty) {
+ aux_ptr->dirty_bytes += entry_size;
+
+#if H5AC_DEBUG_DIRTY_BYTES_CREATION
+ aux_ptr->move_dirty_bytes += entry_size;
+ aux_ptr->move_dirty_bytes_updates += 1;
+#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
+ } /* end else-if */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__log_moved_entry() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__propagate_and_apply_candidate_list
+ *
+ * Purpose: Prior to the addition of support for multiple metadata
+ * write strategies, in PHDF5, only the metadata cache with
+ * mpi rank 0 was allowed to write to file. All other
+ * metadata caches on processes with rank greater than 0
+ * were required to retain dirty entries until they were
+ * notified that the entry was clean.
+ *
+ * This constraint is relaxed with the distributed
+ * metadata write strategy, in which a list of candidate
+ * metadata cache entries is constructed by the process 0
+ * cache and then distributed to the caches of all the other
+ * processes. Once the listed is distributed, many (if not
+ * all) processes writing writing a unique subset of the
+ * entries, and marking the remainder clean. The subsets
+ * are chosen so that each entry in the list of candidates
+ * is written by exactly one cache, and all entries are
+ * marked as being clean in all caches.
+ *
+ * While the list of candidate cache entries is prepared
+ * elsewhere, this function is the main routine for distributing
+ * and applying the list. It must be run simultaniously on
+ * all processes that have the relevant file open. To ensure
+ * proper synchronization, there is a barrier at the beginning
+ * of this function.
+ *
+ * At present, this function is called under one of two
+ * circumstances:
+ *
+ * 1) Dirty byte creation exceeds some user specified value.
+ *
+ * While metadata reads may occur independently, all
+ * operations writing metadata must be collective. Thus
+ * all metadata caches see the same sequence of operations,
+ * and therefore the same dirty data creation.
+ *
+ * This fact is used to synchronize the caches for purposes
+ * of propagating the list of candidate entries, by simply
+ * calling this function from all caches whenever some user
+ * specified threshold on dirty data is exceeded. (the
+ * process 0 cache creates the candidate list just before
+ * calling this function).
+ *
+ * 2) Under direct user control -- this operation must be
+ * collective.
+ *
+ * The operations to be managed by this function are as
+ * follows:
+ *
+ * All processes:
+ *
+ * 1) Participate in an opening barrier.
+ *
+ * For the process with mpi rank 0:
+ *
+ * 1) Load the contents of the candidate list
+ * (candidate_slist_ptr) into a buffer, and broadcast that
+ * buffer to all the other caches. Clear the candidate
+ * list in passing.
+ *
+ * If there is a positive number of candidates, proceed with
+ * the following:
+ *
+ * 2) Apply the candidate entry list.
+ *
+ * 3) Particpate in a closing barrier.
+ *
+ * 4) Remove from the dirty list (d_slist_ptr) and from the
+ * flushed and still clean entries list (c_slist_ptr),
+ * all addresses that appeared in the candidate list, as
+ * these entries are now clean.
+ *
+ *
+ * For all processes with mpi rank greater than 0:
+ *
+ * 1) Receive the candidate entry list broadcast
+ *
+ * If there is a positive number of candidates, proceed with
+ * the following:
+ *
+ * 2) Apply the candidate entry list.
+ *
+ * 3) Particpate in a closing barrier.
+ *
+ * Return: Success: non-negative
+ *
+ * Failure: negative
+ *
+ * Programmer: John Mainzer
+ * 3/17/10
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__propagate_and_apply_candidate_list(H5F_t *f, hid_t dxpl_id)
+{
+ H5AC_t * cache_ptr;
+ H5AC_aux_t * aux_ptr;
+ haddr_t * candidates_list_ptr = NULL;
+ int mpi_result;
+ int num_candidates = 0;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(f != NULL);
+ cache_ptr = f->shared->cache;
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
+
+ /* to prevent "messages from the future" we must synchronize all
+ * processes before we write any entries.
+ */
+ if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result)
+
+ if(aux_ptr->mpi_rank == 0) {
+ if(H5AC__broadcast_candidate_list(cache_ptr, &num_candidates, &candidates_list_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't broadcast candidate slist.")
+
+ HDassert(H5SL_count(aux_ptr->candidate_slist_ptr) == 0);
+ } /* end if */
+ else {
+ if(H5AC__receive_candidate_list(cache_ptr, &num_candidates, &candidates_list_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't receive candidate broadcast.")
+ } /* end else */
+
+ if(num_candidates > 0) {
+ herr_t result;
+
+ /* all processes apply the candidate list.
+ * H5C_apply_candidate_list() handles the details of
+ * distributing the writes across the processes.
+ */
+
+ /* Enable writes during this operation */
+ aux_ptr->write_permitted = TRUE;
+
+ /* Apply the candidate list */
+ result = H5C_apply_candidate_list(f, dxpl_id, cache_ptr, num_candidates,
+ candidates_list_ptr, aux_ptr->mpi_rank, aux_ptr->mpi_size);
+
+ /* Disable writes again */
+ aux_ptr->write_permitted = FALSE;
+
+ /* Check for error on the write operation */
+ if(result < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't apply candidate list.")
+
+ /* this code exists primarily for the test bed -- it allows us to
+ * enforce posix semantics on the server that pretends to be a
+ * file system in our parallel tests.
+ */
+ if(aux_ptr->write_done)
+ (aux_ptr->write_done)();
+
+ /* to prevent "messages from the past" we must synchronize all
+ * processes again before we go on.
+ */
+ if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result)
+
+ /* if this is process zero, tidy up the dirtied,
+ * and flushed and still clean lists.
+ */
+ if(aux_ptr->mpi_rank == 0)
+ if(H5AC__tidy_cache_0_lists(cache_ptr, num_candidates, candidates_list_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't tidy up process 0 lists.")
+ } /* end if */
+
+ /* if it is defined, call the sync point done callback. Note
+ * that this callback is defined purely for testing purposes,
+ * and should be undefined under normal operating circumstances.
+ */
+ if(aux_ptr->sync_point_done)
+ (aux_ptr->sync_point_done)(num_candidates, candidates_list_ptr);
+
+done:
+ if(candidates_list_ptr)
+ candidates_list_ptr = (haddr_t *)H5MM_xfree((void *)candidates_list_ptr);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__propagate_and_apply_candidate_list() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__propagate_flushed_and_still_clean_entries_list
+ *
+ * Purpose: In PHDF5, if the process 0 only metadata write strategy
+ * is selected, only the metadata cache with mpi rank 0 is
+ * allowed to write to file. All other metadata caches on
+ * processes with rank greater than 0 must retain dirty
+ * entries until they are notified that the entry is now
+ * clean.
+ *
+ * This function is the main routine for handling this
+ * notification proceedure. It must be called
+ * simultaniously on all processes that have the relevant
+ * file open. To this end, it is called only during a
+ * sync point, with a barrier prior to the call.
+ *
+ * Note that any metadata entry writes by process 0 will
+ * occur after the barrier and just before this call.
+ *
+ * Typicaly, calls to this function will be triggered in
+ * one of two ways:
+ *
+ * 1) Dirty byte creation exceeds some user specified value.
+ *
+ * While metadata reads may occur independently, all
+ * operations writing metadata must be collective. Thus
+ * all metadata caches see the same sequence of operations,
+ * and therefore the same dirty data creation.
+ *
+ * This fact is used to synchronize the caches for purposes
+ * of propagating the list of flushed and still clean
+ * entries, by simply calling this function from all
+ * caches whenever some user specified threshold on dirty
+ * data is exceeded.
+ *
+ * 2) Under direct user control -- this operation must be
+ * collective.
+ *
+ * The operations to be managed by this function are as
+ * follows:
+ *
+ * For the process with mpi rank 0:
+ *
+ * 1) Load the contents of the flushed and still clean entries
+ * list (c_slist_ptr) into a buffer, and broadcast that
+ * buffer to all the other caches.
+ *
+ * 2) Clear the flushed and still clean entries list
+ * (c_slist_ptr).
+ *
+ *
+ * For all processes with mpi rank greater than 0:
+ *
+ * 1) Receive the flushed and still clean entries list broadcast
+ *
+ * 2) Mark the specified entries as clean.
+ *
+ *
+ * For all processes:
+ *
+ * 1) Reset the dirtied bytes count to 0.
+ *
+ * Return: Success: non-negative
+ *
+ * Failure: negative
+ *
+ * Programmer: John Mainzer
+ * July 5, 2005
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__propagate_flushed_and_still_clean_entries_list(H5F_t *f, hid_t dxpl_id)
+{
+ H5AC_t * cache_ptr;
+ H5AC_aux_t * aux_ptr;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(f != NULL);
+ cache_ptr = f->shared->cache;
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY);
+
+ if(aux_ptr->mpi_rank == 0) {
+ if(H5AC__broadcast_clean_list(cache_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't broadcast clean slist.")
+ HDassert(H5SL_count(aux_ptr->c_slist_ptr) == 0);
+ } /* end if */
+ else {
+ if(H5AC__receive_and_apply_clean_list(f, dxpl_id) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't receive and/or process clean slist broadcast.")
+ } /* end else */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__propagate_flushed_and_still_clean_entries_list() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC_receive_haddr_list()
+ *
+ * Purpose: Receive the list of entry addresses from process 0,
+ * and return it in a buffer pointed to by *haddr_buf_ptr_ptr.
+ * Note that the caller must free this buffer if it is
+ * returned.
+ *
+ * This function must only be called by the process with
+ * MPI_rank greater than 0.
+ *
+ * Return SUCCEED on success, and FAIL on failure.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: Quincey Koziol, 6/11/2015
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__receive_haddr_list(MPI_Comm mpi_comm, int *num_entries_ptr,
+ haddr_t **haddr_buf_ptr_ptr)
+{
+ haddr_t * haddr_buf_ptr = NULL;
+ int mpi_result;
+ int num_entries;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(num_entries_ptr != NULL);
+ HDassert(*num_entries_ptr == 0);
+ HDassert(haddr_buf_ptr_ptr != NULL);
+ HDassert(*haddr_buf_ptr_ptr == NULL);
+
+ /* First receive the number of entries in the list so that we
+ * can set up a buffer to receive them. If there aren't
+ * any, we are done.
+ */
+ if(MPI_SUCCESS != (mpi_result = MPI_Bcast(&num_entries, 1, MPI_INT, 0, mpi_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
+
+ if(num_entries > 0) {
+ size_t buf_size;
+
+ /* allocate buffers to store the list of entry base addresses in */
+ buf_size = sizeof(haddr_t) * (size_t)num_entries;
+ if(NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
+
+ /* Now receive the list of candidate entries */
+ if(MPI_SUCCESS != (mpi_result = MPI_Bcast((void *)haddr_buf_ptr, (int)buf_size, MPI_BYTE, 0, mpi_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
+ } /* end if */
+
+ /* finally, pass the number of entries and the buffer pointer
+ * back to the caller.
+ */
+ *num_entries_ptr = num_entries;
+ *haddr_buf_ptr_ptr = haddr_buf_ptr;
+
+done:
+ if(ret_value < 0)
+ if(haddr_buf_ptr)
+ haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC_receive_haddr_list() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC__receive_and_apply_clean_list()
+ *
+ * Purpose: Receive the list of cleaned entries from process 0,
+ * and mark the specified entries as clean.
+ *
+ * This function must only be called by the process with
+ * MPI_rank greater than 0.
+ *
+ * Return SUCCEED on success, and FAIL on failure.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: John Mainzer, 7/4/05
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__receive_and_apply_clean_list(H5F_t *f, hid_t dxpl_id)
+{
+ H5AC_t * cache_ptr;
+ H5AC_aux_t * aux_ptr;
+ haddr_t * haddr_buf_ptr = NULL;
+ int num_entries = 0;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity check */
+ HDassert(f != NULL);
+ cache_ptr = f->shared->cache;
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->mpi_rank != 0);
+
+ /* Retrieve the clean list from process 0 */
+ if(H5AC__receive_haddr_list(aux_ptr->mpi_comm, &num_entries, &haddr_buf_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "can't receive clean list")
+
+ if(num_entries > 0)
+ /* mark the indicated entries as clean */
+ if(H5C_mark_entries_as_clean(f, dxpl_id, (int32_t)num_entries, haddr_buf_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't mark entries clean.")
+
+ /* if it is defined, call the sync point done callback. Note
+ * that this callback is defined purely for testing purposes,
+ * and should be undefined under normal operating circumstances.
+ */
+ if(aux_ptr->sync_point_done)
+ (aux_ptr->sync_point_done)(num_entries, haddr_buf_ptr);
+
+done:
+ if(haddr_buf_ptr)
+ haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__receive_and_apply_clean_list() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5AC__receive_candidate_list()
+ *
+ * Purpose: Receive the list of candidate entries from process 0,
+ * and return it in a buffer pointed to by *haddr_buf_ptr_ptr.
+ * Note that the caller must free this buffer if it is
+ * returned.
+ *
+ * This function must only be called by the process with
+ * MPI_rank greater than 0.
+ *
+ * Return SUCCEED on success, and FAIL on failure.
+ *
+ * Return: Non-negative on success/Negative on failure.
+ *
+ * Programmer: John Mainzer, 3/17/10
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__receive_candidate_list(const H5AC_t *cache_ptr, int *num_entries_ptr,
+ haddr_t **haddr_buf_ptr_ptr)
+{
+ H5AC_aux_t * aux_ptr;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->mpi_rank != 0);
+ HDassert(aux_ptr-> metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
+ HDassert(num_entries_ptr != NULL);
+ HDassert(*num_entries_ptr == 0);
+ HDassert(haddr_buf_ptr_ptr != NULL);
+ HDassert(*haddr_buf_ptr_ptr == NULL);
+
+ /* Retrieve the candidate list from process 0 */
+ if(H5AC__receive_haddr_list(aux_ptr->mpi_comm, num_entries_ptr, haddr_buf_ptr_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "can't receive clean list")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__receive_candidate_list() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__rsp__dist_md_write__flush
+ *
+ * Purpose: Routine for handling the details of running a sync point
+ * that is triggered by a flush -- which in turn must have been
+ * triggered by either a flush API call or a file close --
+ * when the distributed metadata write strategy is selected.
+ *
+ * Upon entry, each process generates it own candidate list,
+ * being a sorted list of all dirty metadata entries currently
+ * in the metadata cache. Note that this list must be idendical
+ * across all processes, as all processes see the same stream
+ * of dirty metadata coming in, and use the same lists of
+ * candidate entries at each sync point. (At first glance, this
+ * argument sounds circular, but think of it in the sense of
+ * a recursive proof).
+ *
+ * If this this list is empty, we are done, and the function
+ * returns
+ *
+ * Otherwise, after the sorted list dirty metadata entries is
+ * constructed, each process uses the same algorithm to assign
+ * each entry on the candidate list to exactly one process for
+ * flushing.
+ *
+ * At this point, all processes participate in a barrier to
+ * avoid messages from the past/future bugs.
+ *
+ * Each process then flushes the entries assigned to it, and
+ * marks all other entries on the candidate list as clean.
+ *
+ * Finally, all processes participate in a second barrier to
+ * avoid messages from the past/future bugs.
+ *
+ * At the end of this process, process 0 and only process 0
+ * must tidy up its lists of dirtied and cleaned entries.
+ * These lists are not used in the distributed metadata write
+ * strategy, but they must be maintained should we shift
+ * to a strategy that uses them.
+ *
+ * Return: Success: non-negative
+ *
+ * Failure: negative
+ *
+ * Programmer: John Mainzer
+ * April 28, 2010
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__rsp__dist_md_write__flush(H5F_t *f, hid_t dxpl_id)
+{
+ H5AC_t * cache_ptr;
+ H5AC_aux_t * aux_ptr;
+ haddr_t * haddr_buf_ptr = NULL;
+ int mpi_result;
+ int num_entries = 0;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(f != NULL);
+ cache_ptr = f->shared->cache;
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
+
+ /* first construct the candidate list -- initially, this will be in the
+ * form of a skip list. We will convert it later.
+ */
+ if(H5C_construct_candidate_list__clean_cache(cache_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
+
+ if(H5SL_count(aux_ptr->candidate_slist_ptr) > 0) {
+ herr_t result;
+
+ /* convert the candidate list into the format we
+ * are used to receiving from process 0.
+ */
+ if(H5AC__copy_candidate_list_to_buffer(cache_ptr, &num_entries, &haddr_buf_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
+
+ /* initial sync point barrier */
+ if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result)
+
+ /* Enable writes during this operation */
+ aux_ptr->write_permitted = TRUE;
+
+ /* Apply the candidate list */
+ result = H5C_apply_candidate_list(f, dxpl_id, cache_ptr, num_entries,
+ haddr_buf_ptr, aux_ptr->mpi_rank, aux_ptr->mpi_size);
+
+ /* Disable writes again */
+ aux_ptr->write_permitted = FALSE;
+
+ /* Check for error on the write operation */
+ if(result < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't apply candidate list.")
+
+ /* this code exists primarily for the test bed -- it allows us to
+ * enforce posix semantics on the server that pretends to be a
+ * file system in our parallel tests.
+ */
+ if(aux_ptr->write_done)
+ (aux_ptr->write_done)();
+
+ /* final sync point barrier */
+ if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result)
+
+ /* if this is process zero, tidy up the dirtied,
+ * and flushed and still clean lists.
+ */
+ if(aux_ptr->mpi_rank == 0)
+ if(H5AC__tidy_cache_0_lists(cache_ptr, num_entries, haddr_buf_ptr) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't tidy up process 0 lists.")
+ } /* end if */
+
+ /* if it is defined, call the sync point done callback. Note
+ * that this callback is defined purely for testing purposes,
+ * and should be undefined under normal operating circumstances.
+ */
+ if(aux_ptr->sync_point_done)
+ (aux_ptr->sync_point_done)(num_entries, haddr_buf_ptr);
+
+done:
+ if(haddr_buf_ptr)
+ haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr);
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__rsp__dist_md_write__flush() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__rsp__dist_md_write__flush_to_min_clean
+ *
+ * Purpose: Routine for handling the details of running a sync point
+ * triggered by the accumulation of dirty metadata (as
+ * opposed to a flush call to the API) when the distributed
+ * metadata write strategy is selected.
+ *
+ * After invocation and initial sanity checking this function
+ * first checks to see if evictions are enabled -- if they
+ * are not, the function does nothing and returns.
+ *
+ * Otherwise, process zero constructs a list of entries to
+ * be flushed in order to bring the process zero cache back
+ * within its min clean requirement. Note that this list
+ * (the candidate list) may be empty.
+ *
+ * Then, all processes participate in a barrier.
+ *
+ * After the barrier, process 0 broadcasts the number of
+ * entries in the candidate list prepared above, and all
+ * other processes receive this number.
+ *
+ * If this number is zero, we are done, and the function
+ * returns without further action.
+ *
+ * Otherwise, process 0 broadcasts the sorted list of
+ * candidate entries, and all other processes receive it.
+ *
+ * Then, each process uses the same algorithm to assign
+ * each entry on the candidate list to exactly one process
+ * for flushing.
+ *
+ * Each process then flushes the entries assigned to it, and
+ * marks all other entries on the candidate list as clean.
+ *
+ * Finally, all processes participate in a second barrier to
+ * avoid messages from the past/future bugs.
+ *
+ * At the end of this process, process 0 and only process 0
+ * must tidy up its lists of dirtied and cleaned entries.
+ * These lists are not used in the distributed metadata write
+ * strategy, but they must be maintained should we shift
+ * to a strategy that uses them.
+ *
+ * Return: Success: non-negative
+ *
+ * Failure: negative
+ *
+ * Programmer: John Mainzer
+ * April 28, 2010
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__rsp__dist_md_write__flush_to_min_clean(H5F_t *f, hid_t dxpl_id)
+{
+ H5AC_t * cache_ptr;
+ H5AC_aux_t * aux_ptr;
+ hbool_t evictions_enabled;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(f != NULL);
+ cache_ptr = f->shared->cache;
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
+
+ /* Query if evictions are allowed */
+ if(H5C_get_evictions_enabled((const H5C_t *)cache_ptr, &evictions_enabled) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5C_get_evictions_enabled() failed.")
+
+ if(evictions_enabled) {
+ /* construct candidate list -- process 0 only */
+ if(aux_ptr->mpi_rank == 0)
+ if(H5AC__construct_candidate_list(cache_ptr, aux_ptr, H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
+
+ /* propagate and apply candidate list -- all processes */
+ if(H5AC__propagate_and_apply_candidate_list(f, dxpl_id) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't propagate and apply candidate list.")
+ } /* evictions enabled */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__rsp__dist_md_write__flush_to_min_clean() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__rsp__p0_only__flush
+ *
+ * Purpose: Routine for handling the details of running a sync point
+ * that is triggered a flush -- which in turn must have been
+ * triggered by either a flush API call or a file close --
+ * when the process 0 only metadata write strategy is selected.
+ *
+ * First, all processes participate in a barrier.
+ *
+ * Then process zero flushes all dirty entries, and broadcasts
+ * they number of clean entries (if any) to all the other
+ * caches.
+ *
+ * If this number is zero, we are done.
+ *
+ * Otherwise, process 0 broadcasts the list of cleaned
+ * entries, and all other processes which are part of this
+ * file group receive it, and mark the listed entries as
+ * clean in their caches.
+ *
+ * Since all processes have the same set of dirty
+ * entries at the beginning of the sync point, and all
+ * entries that will be written are written before
+ * process zero broadcasts the number of cleaned entries,
+ * there is no need for a closing barrier.
+ *
+ * Return: Success: non-negative
+ *
+ * Failure: negative
+ *
+ * Programmer: John Mainzer
+ * April 28, 2010
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__rsp__p0_only__flush(H5F_t *f, hid_t dxpl_id)
+{
+ H5AC_t * cache_ptr;
+ H5AC_aux_t * aux_ptr;
+ int mpi_result;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(f != NULL);
+ cache_ptr = f->shared->cache;
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY);
+
+ /* to prevent "messages from the future" we must
+ * synchronize all processes before we start the flush.
+ * Hence the following barrier.
+ */
+ if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result)
+
+ /* Flush data to disk, from rank 0 process */
+ if(aux_ptr->mpi_rank == 0) {
+ herr_t result;
+
+ /* Enable writes during this operation */
+ aux_ptr->write_permitted = TRUE;
+
+ /* Flush the cache */
+ result = H5C_flush_cache(f, dxpl_id, H5AC__NO_FLAGS_SET);
+
+ /* Disable writes again */
+ aux_ptr->write_permitted = FALSE;
+
+ /* Check for error on the write operation */
+ if(result < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.")
+
+ /* this code exists primarily for the test bed -- it allows us to
+ * enforce posix semantics on the server that pretends to be a
+ * file system in our parallel tests.
+ */
+ if(aux_ptr->write_done)
+ (aux_ptr->write_done)();
+ } /* end if */
+
+ /* Propagate cleaned entries to other ranks. */
+ if(H5AC__propagate_flushed_and_still_clean_entries_list(f, H5AC_dxpl_id) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't propagate clean entries list.")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__rsp__p0_only__flush() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__rsp__p0_only__flush_to_min_clean
+ *
+ * Purpose: Routine for handling the details of running a sync point
+ * triggered by the accumulation of dirty metadata (as
+ * opposed to a flush call to the API) when the process 0
+ * only metadata write strategy is selected.
+ *
+ * After invocation and initial sanity checking this function
+ * first checks to see if evictions are enabled -- if they
+ * are not, the function does nothing and returns.
+ *
+ * Otherwise, all processes participate in a barrier.
+ *
+ * After the barrier, if this is process 0, the function
+ * causes the cache to flush sufficient entries to get the
+ * cache back within its minimum clean fraction, and broadcast
+ * the number of entries which have been flushed since
+ * the last sync point, and are still clean.
+ *
+ * If this number is zero, we are done.
+ *
+ * Otherwise, process 0 broadcasts the list of cleaned
+ * entries, and all other processes which are part of this
+ * file group receive it, and mark the listed entries as
+ * clean in their caches.
+ *
+ * Since all processes have the same set of dirty
+ * entries at the beginning of the sync point, and all
+ * entries that will be written are written before
+ * process zero broadcasts the number of cleaned entries,
+ * there is no need for a closing barrier.
+ *
+ * Return: Success: non-negative
+ *
+ * Failure: negative
+ *
+ * Programmer: John Mainzer
+ * April 28, 2010
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__rsp__p0_only__flush_to_min_clean(H5F_t *f, hid_t dxpl_id)
+{
+ H5AC_t * cache_ptr;
+ H5AC_aux_t * aux_ptr;
+ hbool_t evictions_enabled;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_STATIC
+
+ /* Sanity checks */
+ HDassert(f != NULL);
+ cache_ptr = f->shared->cache;
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY);
+
+ /* Query if evictions are allowed */
+ if(H5C_get_evictions_enabled((const H5C_t *)cache_ptr, &evictions_enabled) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5C_get_evictions_enabled() failed.")
+
+ /* Flush if evictions are allowed -- following call
+ * will cause process 0 to flush to min clean size,
+ * and then propagate the newly clean entries to the
+ * other processes.
+ *
+ * Otherwise, do nothing.
+ */
+ if(evictions_enabled) {
+ int mpi_result;
+
+ /* to prevent "messages from the future" we must synchronize all
+ * processes before we start the flush. This synchronization may
+ * already be done -- hence the do_barrier parameter.
+ */
+ if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result)
+
+ if(0 == aux_ptr->mpi_rank) {
+ herr_t result;
+
+ /* here, process 0 flushes as many entries as necessary to
+ * comply with the currently specified min clean size.
+ * Note that it is quite possible that no entries will be
+ * flushed.
+ */
+
+ /* Enable writes during this operation */
+ aux_ptr->write_permitted = TRUE;
+
+ /* Flush the cache */
+ result = H5C_flush_to_min_clean(f, dxpl_id);
+
+ /* Disable writes again */
+ aux_ptr->write_permitted = FALSE;
+
+ /* Check for error on the write operation */
+ if(result < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.")
+
+ /* this call exists primarily for the test code -- it is used
+ * to enforce POSIX semantics on the process used to simulate
+ * reads and writes in t_cache.c.
+ */
+ if(aux_ptr->write_done)
+ (aux_ptr->write_done)();
+ } /* end if */
+
+ if(H5AC__propagate_flushed_and_still_clean_entries_list(f, dxpl_id) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't propagate clean entries list.")
+ } /* end if */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__rsp__p0_only__flush_to_min_clean() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__run_sync_point
+ *
+ * Purpose: Top level routine for managing a sync point between all
+ * meta data caches in the parallel case. Since all caches
+ * see the same sequence of dirty metadata, we simply count
+ * bytes of dirty metadata, and run a sync point whenever the
+ * number of dirty bytes of metadata seen since the last
+ * sync point exceeds a threshold that is common across all
+ * processes. We also run sync points in response to
+ * HDF5 API calls triggering either a flush or a file close.
+ *
+ * In earlier versions of PHDF5, only the metadata cache with
+ * mpi rank 0 was allowed to write to file. All other
+ * metadata caches on processes with rank greater than 0 were
+ * required to retain dirty entries until they were notified
+ * that the entry is was clean.
+ *
+ * This function was created to make it easier for us to
+ * experiment with other options, as it is a single point
+ * for the execution of sync points.
+ *
+ * Return: Success: non-negative
+ *
+ * Failure: negative
+ *
+ * Programmer: John Mainzer
+ * March 11, 2010
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5AC__run_sync_point(H5F_t *f, hid_t dxpl_id, int sync_point_op)
+{
+ H5AC_t * cache_ptr;
+ H5AC_aux_t * aux_ptr;
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Sanity checks */
+ HDassert(f != NULL);
+ cache_ptr = f->shared->cache;
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert((sync_point_op == H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) ||
+ (sync_point_op == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED));
+
+#if H5AC_DEBUG_DIRTY_BYTES_CREATION
+HDfprintf(stdout, "%d:H5AC_propagate...:%u: (u/uu/i/iu/r/ru) = %zu/%u/%zu/%u/%zu/%u\n",
+ aux_ptr->mpi_rank,
+ aux_ptr->dirty_bytes_propagations,
+ aux_ptr->unprotect_dirty_bytes,
+ aux_ptr->unprotect_dirty_bytes_updates,
+ aux_ptr->insert_dirty_bytes,
+ aux_ptr->insert_dirty_bytes_updates,
+ aux_ptr->rename_dirty_bytes,
+ aux_ptr->rename_dirty_bytes_updates);
+#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
+
+ switch(aux_ptr->metadata_write_strategy) {
+ case H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY:
+ switch(sync_point_op) {
+ case H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN:
+ if(H5AC__rsp__p0_only__flush_to_min_clean(f, dxpl_id) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5AC__rsp__p0_only__flush_to_min_clean() failed.")
+ break;
+
+ case H5AC_SYNC_POINT_OP__FLUSH_CACHE:
+ if(H5AC__rsp__p0_only__flush(f, dxpl_id) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5AC__rsp__p0_only__flush() failed.")
+ break;
+
+ default:
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "unknown flush op");
+ break;
+ } /* end switch */
+ break;
+
+ case H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED:
+ switch(sync_point_op) {
+ case H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN:
+ if(H5AC__rsp__dist_md_write__flush_to_min_clean(f, dxpl_id) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5AC__rsp__dist_md_write__flush_to_min_clean() failed.")
+ break;
+
+ case H5AC_SYNC_POINT_OP__FLUSH_CACHE:
+ if(H5AC__rsp__dist_md_write__flush(f, dxpl_id) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5AC__rsp__dist_md_write__flush() failed.")
+ break;
+
+ default:
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "unknown flush op");
+ break;
+ } /* end switch */
+ break;
+
+ default:
+ HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Unknown metadata write strategy.")
+ break;
+ } /* end switch */
+
+ /* reset the dirty bytes count */
+ aux_ptr->dirty_bytes = 0;
+
+#if H5AC_DEBUG_DIRTY_BYTES_CREATION
+ aux_ptr->dirty_bytes_propagations += 1;
+ aux_ptr->unprotect_dirty_bytes = 0;
+ aux_ptr->unprotect_dirty_bytes_updates = 0;
+ aux_ptr->insert_dirty_bytes = 0;
+ aux_ptr->insert_dirty_bytes_updates = 0;
+ aux_ptr->rename_dirty_bytes = 0;
+ aux_ptr->rename_dirty_bytes_updates = 0;
+#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__run_sync_point() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__tidy_cache_0_lists()
+ *
+ * Purpose: In the distributed metadata write strategy, not all dirty
+ * entries are written by process 0 -- thus we must tidy
+ * up the dirtied, and flushed and still clean lists
+ * maintained by process zero after each sync point.
+ *
+ * This procedure exists to tend to this issue.
+ *
+ * At this point, all entries that process 0 cleared should
+ * have been removed from both the dirty and flushed and
+ * still clean lists, and entries that process 0 has flushed
+ * should have been removed from the dirtied list and added
+ * to the flushed and still clean list.
+ *
+ * However, since the distributed metadata write strategy
+ * doesn't make use of these lists, the objective is simply
+ * to maintain these lists in consistent state that allows
+ * them to be used should the metadata write strategy change
+ * to one that uses these lists.
+ *
+ * Thus for our purposes, all we need to do is remove from
+ * the dirtied and flushed and still clean lists all
+ * references to entries that appear in the candidate list.
+ *
+ * Return: Success: non-negative
+ *
+ * Failure: negative
+ *
+ * Programmer: John Mainzer
+ * 4/20/10
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5AC__tidy_cache_0_lists(H5AC_t *cache_ptr, int num_candidates,
+ haddr_t *candidates_list_ptr)
+{
+ H5AC_aux_t * aux_ptr;
+ int i;
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Sanity checks */
+ HDassert(cache_ptr != NULL);
+ aux_ptr = H5C_get_aux_ptr(cache_ptr);
+ HDassert(aux_ptr != NULL);
+ HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
+ HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
+ HDassert(aux_ptr->mpi_rank == 0);
+ HDassert(num_candidates > 0);
+ HDassert(candidates_list_ptr != NULL);
+
+ /* clean up dirtied and flushed and still clean lists by removing
+ * all entries on the candidate list. Cleared entries should
+ * have been removed from both the dirty and cleaned lists at
+ * this point, flushed entries should have been added to the
+ * cleaned list. However, for this metadata write strategy,
+ * we just want to remove all references to the candidate entries.
+ */
+ for(i = 0; i < num_candidates; i++) {
+ H5AC_slist_entry_t * d_slist_entry_ptr;
+ H5AC_slist_entry_t * c_slist_entry_ptr;
+ haddr_t addr;
+
+ addr = candidates_list_ptr[i];
+
+ /* addr may be either on the dirtied list, or on the flushed
+ * and still clean list. Remove it.
+ */
+ if(NULL != (d_slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)&addr)))
+ d_slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, d_slist_entry_ptr);
+ if(NULL != (c_slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)&addr)))
+ c_slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, c_slist_entry_ptr);
+ } /* end for */
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+} /* H5AC__tidy_cache_0_lists() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5AC__flush_entries
+ *
+ * Purpose: Flush the metadata cache associated with the specified file,
+ * only writing from rank 0, but propagating the cleaned entries
+ * to all ranks.
+ *
+ * Return: Non-negative on success/Negative on failure if there was a
+ * request to flush all items and something was protected.
+ *
+ * Programmer: Quincey Koziol
+ * koziol@hdfgroup.org
+ * Aug 22 2009
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5AC__flush_entries(H5F_t *f, hid_t dxpl_id)
+{
+ herr_t ret_value = SUCCEED; /* Return value */
+
+ FUNC_ENTER_PACKAGE
+
+ /* Sanity checks */
+ HDassert(f);
+ HDassert(f->shared->cache);
+
+ /* Check if we have >1 ranks */
+ if(H5C_get_aux_ptr(f->shared->cache))
+ if(H5AC__run_sync_point(f, dxpl_id, H5AC_SYNC_POINT_OP__FLUSH_CACHE) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't run sync point.")
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5AC__flush_entries() */
+#endif /* H5_HAVE_PARALLEL */
+