diff options
Diffstat (limited to 'src/H5AC.c')
-rw-r--r-- | src/H5AC.c | 2106 |
1 files changed, 2067 insertions, 39 deletions
@@ -42,18 +42,24 @@ *------------------------------------------------------------------------- */ +#define H5C_PACKAGE /*suppress error about including H5Cpkg */ #define H5F_PACKAGE /*suppress error about including H5Fpkg */ /* Interface initialization */ #define H5_INTERFACE_INIT_FUNC H5AC_init_interface +#ifdef H5_HAVE_PARALLEL +#include <mpi.h> +#endif /* H5_HAVE_PARALLEL */ #include "H5private.h" /* Generic Functions */ #include "H5ACprivate.h" /* Metadata cache */ +#include "H5Cpkg.h" /* Cache */ #include "H5Dprivate.h" /* Dataset functions */ #include "H5Eprivate.h" /* Error handling */ #include "H5Fpkg.h" /* Files */ #include "H5FDprivate.h" /* File drivers */ +#include "H5FLprivate.h" /* Free Lists */ #include "H5Iprivate.h" /* IDs */ #include "H5Pprivate.h" /* Property lists */ @@ -62,6 +68,299 @@ #include "H5FPprivate.h" /* Flexible PHDF5 */ #endif /* H5_HAVE_FPHDF5 */ +#define H5AC_DEBUG_DIRTY_BYTES_CREATION 0 + +/**************************************************************************** + * + * structure H5AC_aux_t + * + * While H5AC has become a wrapper for the cache implemented in H5C.c, there + * are some features of the metadata cache that are specific to it, and which + * therefore do not belong in the more generic H5C cache code. + * + * In particular, there is the matter of synchronizing writes from the + * metadata cache to disk in the PHDF5 case. + * + * Prior to this update, the presumption was that all metadata caches would + * write the same data at the same time since all operations modifying + * metadata must be performed collectively. Given this assumption, it was + * safe to allow only the writes from process 0 to actually make it to disk, + * while metadata writes from all other processes were discarded. + * + * Unfortunately, this presumption is in error as operations that read + * metadata need not be collective, but can change the location of dirty + * entries in the metadata cache LRU lists. This can result in the same + * metadata write operation triggering writes from the metadata caches on + * some processes, but not all (causing a hang), or in different sets of + * entries being written from different caches (potentially resulting in + * metadata corruption in the file). + * + * To deal with this issue, I decided to apply a paradigm shift to the way + * metadata is written to disk. + * + * With this set of changes, only the metadata cache on process 0 is able + * to write metadata to disk, although metadata caches on all other + * processes can read metadata from disk as before. + * + * To keep all the other caches from getting plugged up with dirty metadata, + * process 0 periodically broadcasts a list of entries that it has flushed + * since that last notice, and which are currently clean. The other caches + * mark these entries as clean as well, which allows them to evict the + * entries as needed. + * + * One obvious problem in this approach is synchronizing the broadcasts + * and receptions, as different caches may see different amounts of + * activity. + * + * The current solution is for the caches to track the number of bytes + * of newly generated dirty metadata, and to broadcast and receive + * whenever this value exceeds some user specified threshold. + * + * Maintaining this count is easy for all processes not on process 0 -- + * all that is necessary is to add the size of the entry to the total + * whenever there is an insertion, a rename of a previously clean entry, + * or whever a previously clean entry is marked dirty in an unprotect. + * + * On process 0, we have to be careful not to count dirty bytes twice. + * If an entry is marked dirty, flushed, and marked dirty again, all + * within a single reporting period, it only th first marking should + * be added to the dirty bytes generated tally, as that is all that + * the other processes will see. + * + * At present, this structure exists to maintain the fields needed to + * implement the above scheme, and thus is only used in the parallel + * case. However, other uses may arise in the future. + * + * Instance of this structure are associated with metadata caches via + * the aux_ptr field of H5C_t (see H5Cpkg.h). The H5AC code is + * responsible for allocating, maintaining, and discarding instances + * of H5AC_aux_t. + * + * The remainder of this header comments documents the individual fields + * of the structure. + * + * JRM - 6/27/05 + * + * magic: Unsigned 32 bit integer always set to + * H5AC__H5AC_AUX_T_MAGIC. This field is used to validate + * pointers to instances of H5AC_aux_t. + * + * mpi_comm: MPI communicator associated with the file for which the + * cache has been created. + * + * mpi_rank: MPI rank of this process within mpi_comm. + * + * mpi_size: Number of processes in mpi_comm. + * + * write_permitted: Boolean flag used to control whether the cache + * is permitted to write to file. + * + * dirty_bytes_threshold: Integer field containing the dirty bytes + * generation threashold. Whenever dirty byte creation + * exceeds this value, the metadata cache on process 0 + * broadcasts a list of the entries it has flushed since + * the last broadcast (or since the beginning of execution) + * and which are currently clean (if they are still in the + * cache) + * + * Similarly, metadata caches on processes other than process + * 0 will attempt to receive a list of clean entries whenever + * the threshold is exceeded. + * + * dirty_bytes: Integer field containing the number of bytes of dirty + * metadata generated since the beginning of the computation, + * or (more typically) since the last clean entries list + * broadcast. This field is reset to zero after each such + * broadcast. + * + * dirty_bytes_propagations: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of times the cleaned list + * has been propagated from process 0 to the other + * processes. + * + * unprotect_dirty_bytes: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of dirty bytes created + * via unprotect operations since the last time the cleaned + * list was propagated. + * + * unprotect_dirty_bytes_updates: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of times dirty bytes have + * been created via unprotect operations since the last time + * the cleaned list was propagated. + * + * insert_dirty_bytes: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of dirty bytes created + * via insert operations since the last time the cleaned + * list was propagated. + * + * insert_dirty_bytes_updates: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of times dirty bytes have + * been created via insert operations since the last time + * the cleaned list was propagated. + * + * rename_dirty_bytes: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of dirty bytes created + * via rename operations since the last time the cleaned + * list was propagated. + * + * rename_dirty_bytes_updates: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of times dirty bytes have + * been created via rename operations since the last time + * the cleaned list was propagated. + * + * d_slist_ptr: Pointer to an instance of H5SL_t used to maintain a list + * of entries that have been dirtied since the last time they + * were listed in a clean entries broadcast. This list is + * only maintained by the metadata cache on process 0 -- it + * it used to maintain a view of the dirty entries as seen + * by the other caches, so as to keep the dirty bytes count + * in synchronization with them. + * + * Thus on process 0, the dirty_bytes count is incremented + * only if either + * + * 1) an entry is inserted in the metadata cache, or + * + * 2) a previously clean entry is renamed, and it does not + * already appear in the dirty entry list, or + * + * 3) a previously clean entry is unprotected with the + * dirtied flag set and the entry does not already appear + * in the dirty entry list. + * + * Entries are added to the dirty entry list whever they cause + * the dirty bytes count to be increased. They are removed + * when they appear in a clean entries broadcast. Note that + * renames must be reflected in the dirty entry list. + * + * To reitterate, this field is only used on process 0 -- it + * should be NULL on all other processes. + * + * d_slist_len: Integer field containing the number of entries in the + * dirty entry list. This field should always contain the + * value 0 on all processes other than process 0. It exists + * primarily for sanity checking. + * + * c_slist_ptr: Pointer to an instance of H5SL_t used to maintain a list + * of entries that were dirty, have been flushed + * to disk since the last clean entries broadcast, and are + * still clean. Since only process 0 can write to disk, this + * list only exists on process 0. + * + * In essence, this slist is used to assemble the contents of + * the next clean entries broadcast. The list emptied after + * each broadcast. + * + * c_slist_len: Integer field containing the number of entries in the clean + * entries list (*c_slist_ptr). This field should always + * contain the value 0 on all processes other than process 0. + * It exists primarily for sanity checking. + * + ****************************************************************************/ + +#ifdef H5_HAVE_PARALLEL + +#define H5AC__H5AC_AUX_T_MAGIC (unsigned)0x00D0A01 + +typedef struct H5AC_aux_t +{ + uint32_t magic; + + MPI_Comm mpi_comm; + + int mpi_rank; + + int mpi_size; + + hbool_t write_permitted; + + int32_t dirty_bytes_threshold; + + int32_t dirty_bytes; + +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + + int32_t dirty_bytes_propagations; + + int32_t unprotect_dirty_bytes; + int32_t unprotect_dirty_bytes_updates; + + int32_t insert_dirty_bytes; + int32_t insert_dirty_bytes_updates; + + int32_t rename_dirty_bytes; + int32_t rename_dirty_bytes_updates; + +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + + H5SL_t * d_slist_ptr; + + int32_t d_slist_len; + + H5SL_t * c_slist_ptr; + + int32_t c_slist_len; + +} H5AC_aux_t; /* struct H5AC_aux_t */ + +/* Declare a free list to manage the H5AC_aux_t struct */ +H5FL_DEFINE_STATIC(H5AC_aux_t); + +#endif /* H5_HAVE_PARALLEL */ + +/**************************************************************************** + * + * structure H5AC_slist_entry_t + * + * The dirty entry list maintained via the d_slist_ptr field of H5AC_aux_t + * and the cleaned entry list maintained via the c_slist_ptr field of + * H5AC_aux_t are just lists of the file offsets of the dirty/cleaned + * entries. Unfortunately, the slist code makes us define a dynamically + * allocated structure to store these offsets in. This structure serves + * that purpose. Its fields are as follows: + * + * magic: Unsigned 32 bit integer always set to + * H5AC__H5AC_SLIST_ENTRY_T_MAGIC. This field is used to + * validate pointers to instances of H5AC_slist_entry_t. + * + * addr: file offset of a metadata entry. Entries are added to this + * list (if they aren't there already) when they are marked + * dirty in an unprotect, inserted, or renamed. They are + * removed when they appear in a clean entries broadcast. + * + ****************************************************************************/ + +#ifdef H5_HAVE_PARALLEL + +#define H5AC__H5AC_SLIST_ENTRY_T_MAGIC 0x00D0A02 + +typedef struct H5AC_slist_entry_t +{ + uint32_t magic; + + haddr_t addr; +} H5AC_slist_entry_t; + +/* Declare a free list to manage the H5AC_slist_entry_t struct */ +H5FL_DEFINE_STATIC(H5AC_slist_entry_t); + +#endif /* H5_HAVE_PARALLEL */ + + /* * Private file-scope variables. */ @@ -90,6 +389,55 @@ static herr_t H5AC_check_if_write_permitted(const H5F_t *f, hid_t dxpl_id, hbool_t * write_permitted_ptr); +#ifdef H5_HAVE_PARALLEL +static herr_t H5AC_broadcast_clean_list(H5AC_t * cache_ptr); + +static herr_t H5AC_log_deleted_entry(H5AC_t * cache_ptr, + H5AC_info_t * entry_ptr, + haddr_t addr, + unsigned int flags); + +static herr_t H5AC_log_dirtied_entry(H5AC_t * cache_ptr, + H5C_cache_entry_t * entry_ptr, + haddr_t addr, + hbool_t size_changed, + size_t new_size); + +static herr_t H5AC_log_flushed_entry(H5C_t * cache_ptr, + haddr_t addr, + hbool_t was_dirty, + unsigned flags, + int type_id); + +#if 0 /* this is useful debugging code -- JRM */ +static herr_t H5AC_log_flushed_entry_dummy(H5C_t * cache_ptr, + haddr_t addr, + hbool_t was_dirty, + unsigned flags, + int type_id); +#endif /* JRM */ + +static herr_t H5AC_log_inserted_entry(H5F_t * f, + H5AC_t * cache_ptr, + H5AC_info_t * entry_ptr, + const H5AC_class_t * type, + haddr_t addr); + +static herr_t H5AC_propagate_flushed_and_still_clean_entries_list(H5F_t * f, + hid_t dxpl_id, + H5AC_t * cache_ptr, + hbool_t do_barrier); + +static herr_t H5AC_receive_and_apply_clean_list(H5F_t * f, + hid_t primary_dxpl_id, + hid_t secondary_dxpl_id, + H5AC_t * cache_ptr); + +static herr_t H5AC_log_renamed_entry(H5AC_t * cache_ptr, + haddr_t old_addr, + haddr_t new_addr); +#endif /* H5_HAVE_PARALLEL */ + /*------------------------------------------------------------------------- * Function: H5AC_init @@ -351,6 +699,13 @@ H5AC_term_interface(void) * through the function. * JRM - 4/7/05 * + * Added code allocating and initializing the auxilary + * structure (an instance of H5AC_aux_t), and linking it + * to the instance of H5C_t created by H5C_create(). At + * present, the auxilary structure is only used in PHDF5. + * + * JRM - 6/28/05 + * *------------------------------------------------------------------------- */ @@ -376,6 +731,12 @@ H5AC_create(const H5F_t *f, { herr_t ret_value = SUCCEED; /* Return value */ herr_t result; +#ifdef H5_HAVE_PARALLEL + MPI_Comm mpi_comm = MPI_COMM_NULL; + int mpi_rank = -1; + int mpi_size = -1; + H5AC_aux_t * aux_ptr = NULL; +#endif /* H5_HAVE_PARALLEL */ FUNC_ENTER_NOAPI(H5AC_create, FAIL) @@ -390,15 +751,138 @@ H5AC_create(const H5F_t *f, HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "Bad cache configuration"); } - /* The default max cache size and min clean size will frequently be - * overwritten shortly by the subsequent set resize config call. - * -- JRM - */ - f->shared->cache = H5C_create(H5AC__DEFAULT_MAX_CACHE_SIZE, - H5AC__DEFAULT_MIN_CLEAN_SIZE, - (H5AC_NTYPES - 1), - (const char **)H5AC_entry_type_names, - H5AC_check_if_write_permitted); +#ifdef H5_HAVE_PARALLEL + if ( IS_H5FD_MPI(f) ) { + + if ( (mpi_comm = H5F_mpi_get_comm(f)) == MPI_COMM_NULL ) { + + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, \ + "can't get MPI communicator") + } + + if ( (mpi_rank = H5F_mpi_get_rank(f)) < 0 ) { + + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get mpi rank") + } + + if ( (mpi_size = H5F_mpi_get_size(f)) < 0 ) { + + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get mpi size") + } + + /* There is no point in setting up the auxilary structure if size + * is less than or equal to 1, as there will never be any processes + * to broadcast the clean lists to. + */ + if ( mpi_size > 1 ) { + + if ( NULL == (aux_ptr = H5FL_CALLOC(H5AC_aux_t)) ) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, \ + "Can't allocate H5AC auxilary structure.") + + } else { + + aux_ptr->magic = H5AC__H5AC_AUX_T_MAGIC; + aux_ptr->mpi_comm = mpi_comm; + aux_ptr->mpi_rank = mpi_rank; + aux_ptr->mpi_size = mpi_size; + aux_ptr->write_permitted = FALSE; + aux_ptr->dirty_bytes_threshold = 256 * 1024; + aux_ptr->dirty_bytes = 0; +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->dirty_bytes_propagations = 0; + aux_ptr->unprotect_dirty_bytes = 0; + aux_ptr->unprotect_dirty_bytes_updates = 0; + aux_ptr->insert_dirty_bytes = 0; + aux_ptr->insert_dirty_bytes_updates = 0; + aux_ptr->rename_dirty_bytes = 0; + aux_ptr->rename_dirty_bytes_updates = 0; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + aux_ptr->d_slist_ptr = NULL; + aux_ptr->d_slist_len = 0; + aux_ptr->c_slist_ptr = NULL; + aux_ptr->c_slist_len = 0; + } + + if ( mpi_rank == 0 ) { + + aux_ptr->d_slist_ptr = + H5SL_create(H5SL_TYPE_HADDR,0.5,(size_t)16); + + if ( aux_ptr->d_slist_ptr == NULL ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTCREATE, FAIL, + "can't create dirtied entry list.") + } + + aux_ptr->c_slist_ptr = + H5SL_create(H5SL_TYPE_HADDR,0.5,(size_t)16); + + if ( aux_ptr->c_slist_ptr == NULL ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTCREATE, FAIL, + "can't create cleaned entry list.") + } + } + } + + if ( aux_ptr != NULL ) { + + if ( aux_ptr->mpi_rank == 0 ) { + + f->shared->cache = H5C_create(H5AC__DEFAULT_MAX_CACHE_SIZE, + H5AC__DEFAULT_MIN_CLEAN_SIZE, + (H5AC_NTYPES - 1), + (const char **)H5AC_entry_type_names, + H5AC_check_if_write_permitted, + TRUE, + H5AC_log_flushed_entry, + (void *)aux_ptr); + + } else { + + f->shared->cache = H5C_create(H5AC__DEFAULT_MAX_CACHE_SIZE, + H5AC__DEFAULT_MIN_CLEAN_SIZE, + (H5AC_NTYPES - 1), + (const char **)H5AC_entry_type_names, + NULL, + FALSE, +#if 0 /* this is useful debugging code -- keep it for a while */ /* JRM */ + H5AC_log_flushed_entry_dummy, +#else /* JRM */ + NULL, +#endif /* JRM */ + (void *)aux_ptr); + } + } else { + + f->shared->cache = H5C_create(H5AC__DEFAULT_MAX_CACHE_SIZE, + H5AC__DEFAULT_MIN_CLEAN_SIZE, + (H5AC_NTYPES - 1), + (const char **)H5AC_entry_type_names, + H5AC_check_if_write_permitted, + TRUE, + NULL, + NULL); + } + } else { +#endif /* H5_HAVE_PARALLEL */ + /* The default max cache size and min clean size will frequently be + * overwritten shortly by the subsequent set resize config call. + * -- JRM + */ + f->shared->cache = H5C_create(H5AC__DEFAULT_MAX_CACHE_SIZE, + H5AC__DEFAULT_MIN_CLEAN_SIZE, + (H5AC_NTYPES - 1), + (const char **)H5AC_entry_type_names, + H5AC_check_if_write_permitted, + TRUE, + NULL, + NULL); +#ifdef H5_HAVE_PARALLEL + } +#endif /* H5_HAVE_PARALLEL */ if ( NULL == f->shared->cache ) { @@ -416,6 +900,31 @@ H5AC_create(const H5F_t *f, done: +#ifdef H5_HAVE_PARALLEL + + /* if there is a failure, try to tidy up the auxilary structure */ + + if ( ret_value != SUCCEED ) { + + if ( aux_ptr != NULL ) { + + if ( aux_ptr->d_slist_ptr != NULL ) { + + H5SL_close(aux_ptr->d_slist_ptr); + } + + if ( aux_ptr->c_slist_ptr != NULL ) { + + H5SL_close(aux_ptr->c_slist_ptr); + } + + aux_ptr->magic = 0; + H5FL_FREE(H5AC_aux_t, aux_ptr); + aux_ptr = NULL; + } + } +#endif /* H5_HAVE_PARALLEL */ + FUNC_LEAVE_NOAPI(ret_value) } /* H5AC_create() */ @@ -445,6 +954,10 @@ done: * * JRM - 6/7/04 * + * Added code to free the auxiliary structure and its + * associated slist if present. + * JRM - 6/28/05 + * *------------------------------------------------------------------------- */ herr_t @@ -452,12 +965,23 @@ H5AC_dest(H5F_t *f, hid_t dxpl_id) { H5AC_t *cache = NULL; herr_t ret_value=SUCCEED; /* Return value */ +#ifdef H5_HAVE_PARALLEL + H5AC_aux_t * aux_ptr = NULL; +#endif /* H5_HAVE_PARALLEL */ FUNC_ENTER_NOAPI(H5AC_dest, FAIL) assert(f); assert(f->shared->cache); cache = f->shared->cache; +#ifdef H5_HAVE_PARALLEL + aux_ptr = cache->aux_ptr; + + if ( aux_ptr != NULL ) { + + HDassert ( aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC ); + } +#endif /* H5_HAVE_PARALLEL */ f->shared->cache = NULL; @@ -466,6 +990,25 @@ H5AC_dest(H5F_t *f, hid_t dxpl_id) HGOTO_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "can't destroy cache") } +#ifdef H5_HAVE_PARALLEL + if ( aux_ptr != NULL ) { + + if ( aux_ptr->d_slist_ptr != NULL ) { + + H5SL_close(aux_ptr->d_slist_ptr); + } + + if ( aux_ptr->c_slist_ptr != NULL ) { + + H5SL_close(aux_ptr->c_slist_ptr); + } + + aux_ptr->magic = 0; + H5FL_FREE(H5AC_aux_t, aux_ptr); + aux_ptr = NULL; + } +#endif /* H5_HAVE_PARALLEL */ + done: FUNC_LEAVE_NOAPI(ret_value) @@ -529,28 +1072,103 @@ done: * * Complete re-write. See above for details. -- JRM 5/11/04 * - * Abstracted the guts of the function to H5C_dest() in H5C.c, - * and then re-wrote the function as a wrapper for H5C_dest(). + * Abstracted the guts of the function to H5C_flush_cache() + * in H5C.c, and then re-wrote the function as a wrapper for + * H5C_flush_cache(). * * JRM - 6/7/04 * + * JRM - 7/5/05 + * Modified function as part of a fix for a cache coherency + * bug in PHDF5. See the header comments on the H5AC_aux_t + * structure for details. + * *------------------------------------------------------------------------- */ herr_t H5AC_flush(H5F_t *f, hid_t dxpl_id, unsigned flags) { - herr_t status; - herr_t ret_value=SUCCEED; /* Return value */ + herr_t status; + herr_t ret_value = SUCCEED; /* Return value */ +#ifdef H5_HAVE_PARALLEL + H5AC_aux_t * aux_ptr = NULL; + int mpi_code; +#endif /* H5_HAVE_PARALLEL */ + FUNC_ENTER_NOAPI(H5AC_flush, FAIL) HDassert(f); HDassert(f->shared->cache); - status = H5C_flush_cache(f, - dxpl_id, - H5AC_noblock_dxpl_id, - f->shared->cache, +#ifdef H5_HAVE_PARALLEL + aux_ptr = f->shared->cache->aux_ptr; + + if ( aux_ptr != NULL ) { + +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + HDfprintf(stdout, + "%d::H5AC_flush: (u/uu/i/iu/r/ru) = %d/%d/%d/%d/%d/%d\n", + (int)(aux_ptr->mpi_rank), + (int)(aux_ptr->unprotect_dirty_bytes), + (int)(aux_ptr->unprotect_dirty_bytes_updates), + (int)(aux_ptr->insert_dirty_bytes), + (int)(aux_ptr->insert_dirty_bytes_updates), + (int)(aux_ptr->rename_dirty_bytes), + (int)(aux_ptr->rename_dirty_bytes_updates)); +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + + /* to prevent "messages from the future" we must synchronize all + * processes before we start the flush. Hence the following + * barrier. + */ + if ( MPI_SUCCESS != (mpi_code = MPI_Barrier(aux_ptr->mpi_comm)) ) { + + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) + } + + /* if the clear only flag is set, this flush will not involve any + * disk I/O. In such cases, it is not necessary to let process 0 + * flush first. + */ + if ( ( aux_ptr->mpi_rank == 0 ) && + ( (flags & H5AC__FLUSH_CLEAR_ONLY_FLAG) != 0 ) ) { + + unsigned init_flush_flags = H5AC__NO_FLAGS_SET; + + if ( ( (flags & H5AC__FLUSH_MARKED_ENTRIES_FLAG) != 0 ) && + ( (flags & H5AC__FLUSH_INVALIDATE_FLAG) == 0 ) ) { + + init_flush_flags |= H5AC__FLUSH_MARKED_ENTRIES_FLAG; + } + + aux_ptr->write_permitted = TRUE; + + status = H5C_flush_cache(f, + H5AC_noblock_dxpl_id, + H5AC_noblock_dxpl_id, + f->shared->cache, + init_flush_flags); + + aux_ptr->write_permitted = FALSE; + + if ( status < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.") + } + } /* end if ( aux_ptr->mpi_rank == 0 ) */ + + status = H5AC_propagate_flushed_and_still_clean_entries_list(f, + H5AC_noblock_dxpl_id, + f->shared->cache, + FALSE); + } /* end if ( aux_ptr != NULL ) */ +#endif /* H5_HAVE_PARALLEL */ + + status = H5C_flush_cache(f, + dxpl_id, + H5AC_noblock_dxpl_id, + f->shared->cache, flags); if ( status < 0 ) { @@ -616,8 +1234,16 @@ done: * moving management of the dirty flag on cache entries into * the cache code. * + * JRM - 7/5/05 + * Added code to track dirty byte generation, and to trigger + * clean entry list propagation when it exceeds a user + * specified threshold. Note that this code only applies in + * the PHDF5 case. It should have no effect on either the + * serial or FPHSD5 cases. + * *------------------------------------------------------------------------- */ + herr_t H5AC_set(H5F_t *f, hid_t dxpl_id, const H5AC_class_t *type, haddr_t addr, void *thing, unsigned int flags) { @@ -625,6 +1251,9 @@ H5AC_set(H5F_t *f, hid_t dxpl_id, const H5AC_class_t *type, haddr_t addr, void * H5AC_info_t *info; H5AC_t *cache; herr_t ret_value=SUCCEED; /* Return value */ +#ifdef H5_HAVE_PARALLEL + H5AC_aux_t * aux_ptr = NULL; +#endif /* H5_HAVE_PARALLEL */ FUNC_ENTER_NOAPI(H5AC_set, FAIL) @@ -717,6 +1346,23 @@ H5AC_set(H5F_t *f, hid_t dxpl_id, const H5AC_class_t *type, haddr_t addr, void * #endif /* H5_HAVE_FPHDF5 */ #endif /* H5_HAVE_PARALLEL */ +#ifdef H5_HAVE_PARALLEL + if ( NULL != (aux_ptr = f->shared->cache->aux_ptr) ) { + + result = H5AC_log_inserted_entry(f, + f->shared->cache, + (H5AC_info_t *)thing, + type, + addr); + + if ( result < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTINS, FAIL, \ + "H5AC_log_inserted_entry() failed.") + } + } +#endif /* H5_HAVE_PARALLEL */ + result = H5C_insert_entry(f, dxpl_id, H5AC_noblock_dxpl_id, @@ -731,6 +1377,22 @@ H5AC_set(H5F_t *f, hid_t dxpl_id, const H5AC_class_t *type, haddr_t addr, void * HGOTO_ERROR(H5E_CACHE, H5E_CANTINS, FAIL, "H5C_insert_entry() failed") } +#ifdef H5_HAVE_PARALLEL + if ( ( aux_ptr != NULL ) && + ( aux_ptr->dirty_bytes >= aux_ptr->dirty_bytes_threshold ) ) { + + result = H5AC_propagate_flushed_and_still_clean_entries_list(f, + H5AC_noblock_dxpl_id, + f->shared->cache, + TRUE); + if ( result < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, \ + "Can't propagate clean entries list.") + } + } +#endif /* H5_HAVE_PARALLEL */ + done: FUNC_LEAVE_NOAPI(ret_value) @@ -766,6 +1428,18 @@ done: * in H5C.c, and then re-wrote the function as a wrapper for * H5C_rename_entry(). * + * JRM - 7/5/05 + * Added code to track dirty byte generation, and to trigger + * clean entry list propagation when it exceeds a user + * specified threshold. Note that this code only applies in + * the PHDF5 case. It should have no effect on either the + * serial or FPHSD5 cases. + * + * Note that this code presumes that the renamed entry will + * be present in all caches -- which it must be at present. + * To maintain this invarient, only rename entries immediately + * after you unprotect them. + * *------------------------------------------------------------------------- */ herr_t @@ -773,6 +1447,9 @@ H5AC_rename(H5F_t *f, const H5AC_class_t *type, haddr_t old_addr, haddr_t new_ad { herr_t result; herr_t ret_value=SUCCEED; /* Return value */ +#ifdef H5_HAVE_PARALLEL + H5AC_aux_t * aux_ptr = NULL; +#endif /* H5_HAVE_PARALLEL */ FUNC_ENTER_NOAPI(H5AC_rename, FAIL) @@ -808,6 +1485,13 @@ H5AC_rename(H5F_t *f, const H5AC_class_t *type, haddr_t old_addr, haddr_t new_ad * In any case, don't check this code in without revisiting this * issue. * JRM -- 6/6/05 + * + * On reflection, the code was already broken, as there was no + * way to advise the SAP that a renamed entry had changed its + * address, or was dirty. I will not worry about it for now, + * but the matter must be addressed if we ever get serious + * about FPHDF5. + * JRM -- 7/5/05 */ HGOTO_DONE(SUCCEED); @@ -816,6 +1500,21 @@ H5AC_rename(H5F_t *f, const H5AC_class_t *type, haddr_t old_addr, haddr_t new_ad #endif /* H5_HAVE_FPHDF5 */ #endif /* H5_HAVE_PARALLEL */ +#ifdef H5_HAVE_PARALLEL + if ( NULL != (aux_ptr = f->shared->cache->aux_ptr) ) { + + result = H5AC_log_renamed_entry(f->shared->cache, + old_addr, + new_addr); + + if ( result < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, \ + "H5AC_log_renamed_entry() failed.") + } + } +#endif /* H5_HAVE_PARALLEL */ + result = H5C_rename_entry(f->shared->cache, type, old_addr, @@ -827,6 +1526,22 @@ H5AC_rename(H5F_t *f, const H5AC_class_t *type, haddr_t old_addr, haddr_t new_ad "H5C_rename_entry() failed.") } +#ifdef H5_HAVE_PARALLEL + if ( ( aux_ptr != NULL ) && + ( aux_ptr->dirty_bytes >= aux_ptr->dirty_bytes_threshold ) ) { + + result = H5AC_propagate_flushed_and_still_clean_entries_list(f, + H5AC_noblock_dxpl_id, + f->shared->cache, + TRUE); + if ( result < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, \ + "Can't propagate clean entries list.") + } + } +#endif /* H5_HAVE_PARALLEL */ + done: FUNC_LEAVE_NOAPI(ret_value) @@ -837,7 +1552,7 @@ done: /*------------------------------------------------------------------------- * Function: H5AC_protect * - * Purpose: If the target entry is not in the cache, load it. If + * Purpose: If the target entry is not in the cache, load it. If * necessary, attempt to evict one or more entries to keep * the cache within its maximum size. * @@ -845,8 +1560,8 @@ done: * to the caller. The caller must call H5AC_unprotect() when * finished with the entry. * - * While it is protected, the entry may not be either evicted - * or flushed -- nor may it be accessed by another call to + * While it is protected, the entry may not be either evicted + * or flushed -- nor may it be accessed by another call to * H5AC_protect. Any attempt to do so will result in a failure. * * This comment is a re-write of the original Purpose: section. @@ -887,8 +1602,8 @@ done: * Purpose section above. * * JRM - 6/7/04 - * Abstracted the guts of the function to H5C_protect() - * in H5C.c, and then re-wrote the function as a wrapper for + * Abstracted the guts of the function to H5C_protect() + * in H5C.c, and then re-wrote the function as a wrapper for * H5C_protect(). * *------------------------------------------------------------------------- @@ -1108,6 +1823,18 @@ done: * part of a collection of changes directed at moving * management of cache entry dirty flags into the H5C code. * + * JRM - 7/5/05 + * Added code to track dirty byte generation, and to trigger + * clean entry list propagation when it exceeds a user + * specified threshold. Note that this code only applies in + * the PHDF5 case. It should have no effect on either the + * serial or FPHSD5 cases. + * + * JRM - 9/8/05 + * Added code to track entry size changes. This is necessary + * as it can effect dirty byte creation counts, thereby + * throwing the caches out of sync in the PHDF5 case. + * *------------------------------------------------------------------------- */ herr_t @@ -1116,6 +1843,12 @@ H5AC_unprotect(H5F_t *f, hid_t dxpl_id, const H5AC_class_t *type, haddr_t addr, { herr_t result; herr_t ret_value=SUCCEED; /* Return value */ + hbool_t size_changed = FALSE; + hbool_t dirtied; + size_t new_size = 0; +#ifdef H5_HAVE_PARALLEL + H5AC_aux_t * aux_ptr = NULL; +#endif /* H5_HAVE_PARALLEL */ FUNC_ENTER_NOAPI(H5AC_unprotect, FAIL) @@ -1129,6 +1862,23 @@ H5AC_unprotect(H5F_t *f, hid_t dxpl_id, const H5AC_class_t *type, haddr_t addr, HDassert( ((H5AC_info_t *)thing)->addr == addr ); HDassert( ((H5AC_info_t *)thing)->type == type ); + dirtied = ((flags & H5AC__DIRTIED_FLAG) == H5AC__DIRTIED_FLAG ); + + if ( dirtied ) { + + if ( (type->size)(f, thing, &new_size) < 0 ) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTGETSIZE, FAIL, \ + "Can't get size of thing") + } + + if ( ((H5AC_info_t *)thing)->size != new_size ) { + + size_changed = TRUE; + flags = flags | H5AC__SIZE_CHANGED_FLAG; + } + } + #ifdef H5_HAVE_PARALLEL #ifdef H5_HAVE_FPHDF5 /* The following code to support flexible parallel is a direct copy @@ -1202,6 +1952,40 @@ H5AC_unprotect(H5F_t *f, hid_t dxpl_id, const H5AC_class_t *type, haddr_t addr, #endif /* H5_HAVE_FPHDF5 */ #endif /* H5_HAVE_PARALLEL */ +#ifdef H5_HAVE_PARALLEL + if ( ( dirtied ) && ( ((H5AC_info_t *)thing)->is_dirty == FALSE ) && + ( NULL != (aux_ptr = f->shared->cache->aux_ptr) ) ) { + + result = H5AC_log_dirtied_entry(f->shared->cache, + (H5AC_info_t *)thing, + addr, + size_changed, + new_size); + + if ( result < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, \ + "H5AC_log_dirtied_entry() failed.") + } + } + + if ( ( (flags & H5C__DELETED_FLAG) != 0 ) && + ( NULL != (aux_ptr = f->shared->cache->aux_ptr) ) && + ( aux_ptr->mpi_rank == 0 ) ) { + + result = H5AC_log_deleted_entry(f->shared->cache, + (H5AC_info_t *)thing, + addr, + flags); + + if ( result < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, \ + "H5AC_log_deleted_entry() failed.") + } + } +#endif /* H5_HAVE_PARALLEL */ + result = H5C_unprotect(f, dxpl_id, H5AC_noblock_dxpl_id, @@ -1209,7 +1993,8 @@ H5AC_unprotect(H5F_t *f, hid_t dxpl_id, const H5AC_class_t *type, haddr_t addr, type, addr, thing, - flags); + flags, + new_size); if ( result < 0 ) { @@ -1217,6 +2002,23 @@ H5AC_unprotect(H5F_t *f, hid_t dxpl_id, const H5AC_class_t *type, haddr_t addr, "H5C_unprotect() failed.") } +#ifdef H5_HAVE_PARALLEL + if ( ( aux_ptr != NULL ) && + ( aux_ptr->dirty_bytes >= aux_ptr->dirty_bytes_threshold ) ) { + + result = H5AC_propagate_flushed_and_still_clean_entries_list(f, + H5AC_noblock_dxpl_id, + f->shared->cache, + TRUE); + + if ( result < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, \ + "Can't propagate clean entries list.") + } + } +#endif /* H5_HAVE_PARALLEL */ + done: FUNC_LEAVE_NOAPI(ret_value) @@ -1686,6 +2488,184 @@ done: /*------------------------------------------------------------------------- * + * Function: H5AC_broadcast_clean_list() + * + * Purpose: Broadcast the contents of the process 0 cleaned entry + * slist. In passing, also remove all entries from said + * list, and also remove any matching entries from the dirtied + * slist. + * + * This function must only be called by the process with + * MPI_rank 0. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 7/1/05 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +#ifdef H5_HAVE_PARALLEL +static herr_t +H5AC_broadcast_clean_list(H5AC_t * cache_ptr) +{ + herr_t ret_value = SUCCEED; /* Return value */ + haddr_t addr; + H5AC_aux_t * aux_ptr = NULL; + H5SL_node_t * slist_node_ptr = NULL; + H5AC_slist_entry_t * slist_entry_ptr = NULL; + MPI_Offset * buf_ptr = NULL; + size_t buf_size; + int i = 0; + int mpi_result; + int num_entries; + + FUNC_ENTER_NOAPI(H5AC_broadcast_clean_list, FAIL) + + HDassert( cache_ptr != NULL ); + HDassert( cache_ptr->magic == H5C__H5C_T_MAGIC ); + + aux_ptr = cache_ptr->aux_ptr; + + HDassert( aux_ptr != NULL ); + HDassert( aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC ); + HDassert( aux_ptr->mpi_rank == 0 ); + HDassert( aux_ptr->c_slist_ptr != NULL ); + HDassert( H5SL_count(aux_ptr->c_slist_ptr) == + (size_t)(aux_ptr->c_slist_len) ); + + + /* First broadcast the number of entries in the list so that the + * receives can set up a buffer to receive them. If there aren't + * any, we are done. + */ + num_entries = aux_ptr->c_slist_len; + + mpi_result = MPI_Bcast(&num_entries, 1, MPI_INT, 0, aux_ptr->mpi_comm); + + if ( mpi_result != MPI_SUCCESS ) { + + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed 1", mpi_result) + + } + + if ( num_entries > 0 ) + { + /* allocate a buffer to store the list of entry base addresses in */ + + buf_size = sizeof(MPI_Offset) * (size_t)num_entries; + + buf_ptr = (MPI_Offset *)H5MM_malloc(buf_size); + + if ( buf_ptr == NULL ) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, \ + "memory allocation failed for clean entry buffer") + } + + /* now load the entry base addresses into the buffer, emptying the + * cleaned entry list in passing + */ + + while ( NULL != (slist_node_ptr = H5SL_first(aux_ptr->c_slist_ptr) ) ) + { + slist_entry_ptr = H5SL_item(slist_node_ptr); + + HDassert(slist_entry_ptr->magic == H5AC__H5AC_SLIST_ENTRY_T_MAGIC); + + HDassert( i < num_entries ); + + addr = slist_entry_ptr->addr; + + if ( H5FD_mpi_haddr_to_MPIOff(addr, &(buf_ptr[i])) < 0 ) { + + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, \ + "can't convert from haddr to MPI off") + } + + i++; + + /* now remove the entry from the cleaned entry list */ + if ( H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr)) + != slist_entry_ptr ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTDELETE, FAIL, \ + "Can't delete entry from cleaned entry slist.") + } + + slist_entry_ptr->magic = 0; + H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + slist_entry_ptr = NULL; + + aux_ptr->c_slist_len -= 1; + + HDassert( aux_ptr->c_slist_len >= 0 ); + + /* and also remove the matching entry from the dirtied list + * if it exists. + */ + if ( (slist_entry_ptr = H5SL_search(aux_ptr->d_slist_ptr, + (void *)(&addr))) != NULL ) { + + HDassert( slist_entry_ptr->magic == + H5AC__H5AC_SLIST_ENTRY_T_MAGIC ); + HDassert( slist_entry_ptr->addr == addr ); + + if ( H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&addr)) + != slist_entry_ptr ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTDELETE, FAIL, \ + "Can't delete entry from dirty entry slist.") + } + + slist_entry_ptr->magic = 0; + H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + slist_entry_ptr = NULL; + + aux_ptr->d_slist_len -= 1; + + HDassert( aux_ptr->d_slist_len >= 0 ); + } + + } /* while */ + + + /* Now broadcast the list of cleaned entries -- if there is one. + * + * The peculiar structure of the following call to MPI_Bcast is + * due to MPI's (?) failure to believe in the MPI_Offset type. + * Thus the element type is MPI_BYTE, with size equal to the + * buf_size computed above. + */ + + mpi_result = MPI_Bcast((void *)buf_ptr, (int)buf_size, MPI_BYTE, 0, + aux_ptr->mpi_comm); + + if ( mpi_result != MPI_SUCCESS ) { + + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed 2", mpi_result) + } + } + +done: + + if ( buf_ptr != NULL ) { + + buf_ptr = (MPI_Offset *)H5MM_xfree((void *)buf_ptr); + } + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5AC_broadcast_clean_list() */ +#endif /* H5_HAVE_PARALLEL */ + + +/*------------------------------------------------------------------------- + * * Function: H5AC_check_if_write_permitted * * Purpose: Determine if a write is permitted under the current @@ -1703,6 +2683,16 @@ done: * * Modifications: * + * John Mainzer, 9/23/05 + * Rewrote function to return the value of the + * write_permitted field in aux structure if the structure + * exists and mpi_rank is 0. + * + * If the aux structure exists, but mpi_rank isn't 0, the + * function now returns FALSE. + * + * In all other cases, the function returns TRUE. + * *------------------------------------------------------------------------- */ @@ -1720,49 +2710,1087 @@ H5AC_check_if_write_permitted(const H5F_t UNUSED * f, { hbool_t write_permitted = TRUE; herr_t ret_value = SUCCEED; /* Return value */ +#ifdef H5_HAVE_PARALLEL + H5AC_aux_t * aux_ptr = NULL; +#endif /* H5_HAVE_PARALLEL */ + FUNC_ENTER_NOAPI(H5AC_check_if_write_permitted, FAIL) #ifdef H5_HAVE_PARALLEL + HDassert( f != NULL ); + HDassert( f->shared != NULL ); + HDassert( f->shared->cache != NULL ); - if ( IS_H5FD_MPI(f) ) { + aux_ptr = (H5AC_aux_t *)(f->shared->cache->aux_ptr); + + if ( aux_ptr != NULL ) { + + HDassert( aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC ); - H5P_genplist_t *dxpl; /* Dataset transfer property list */ - H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode property value */ + if ( aux_ptr->mpi_rank == 0 ) { - /* Get the dataset transfer property list */ - if ( NULL == (dxpl = H5I_object(dxpl_id)) ) { + write_permitted = aux_ptr->write_permitted; - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, \ - "not a dataset creation property list") + } else { + + write_permitted = FALSE; + } + } +#endif /* H5_HAVE_PARALLEL */ + + *write_permitted_ptr = write_permitted; + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5AC_check_if_write_permitted() */ + +/*------------------------------------------------------------------------- + * + * Function: H5AC_log_deleted_entry() + * + * Purpose: Log an entry for which H5C__DELETED_FLAG has been set. + * + * If mpi_rank is 0, we must make sure that the entry doesn't + * appear in the cleaned or dirty entry lists. Otherwise, + * we have nothing to do. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 6/29/05 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +#ifdef H5_HAVE_PARALLEL +static herr_t +H5AC_log_deleted_entry(H5AC_t * cache_ptr, + H5AC_info_t * entry_ptr, + haddr_t addr, + unsigned int flags) +{ + herr_t ret_value = SUCCEED; /* Return value */ + H5AC_aux_t * aux_ptr = NULL; + H5AC_slist_entry_t * slist_entry_ptr = NULL; + + FUNC_ENTER_NOAPI(H5AC_log_deleted_entry, FAIL) + + HDassert( cache_ptr != NULL ); + HDassert( cache_ptr->magic == H5C__H5C_T_MAGIC ); + + aux_ptr = cache_ptr->aux_ptr; + + HDassert( aux_ptr != NULL ); + HDassert( aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC ); + + HDassert( entry_ptr != NULL ); + HDassert( entry_ptr->addr == addr ); + + HDassert( (flags & H5C__DELETED_FLAG) != 0 ); + + if ( aux_ptr->mpi_rank == 0 ) { + + HDassert( aux_ptr->d_slist_ptr != NULL ); + HDassert( aux_ptr->c_slist_ptr != NULL ); + + /* if the entry appears in the dirtied entry slist, remove it. */ + if ( (slist_entry_ptr = H5SL_search(aux_ptr->d_slist_ptr, + (void *)(&addr))) != NULL ) { + + HDassert( slist_entry_ptr->magic == + H5AC__H5AC_SLIST_ENTRY_T_MAGIC ); + HDassert( slist_entry_ptr->addr == addr ); + + if ( H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&addr)) + != slist_entry_ptr ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTDELETE, FAIL, \ + "Can't delete entry from dirty entry slist.") + } + + slist_entry_ptr->magic = 0; + H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + slist_entry_ptr = NULL; + + aux_ptr->d_slist_len -= 1; + + HDassert( aux_ptr->d_slist_len >= 0 ); } - /* Get the transfer mode property */ - if( H5P_get(dxpl, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0 ) { + /* if the entry appears in the cleaned entry slist, remove it. */ + if ( (slist_entry_ptr = H5SL_search(aux_ptr->c_slist_ptr, + (void *)(&addr))) != NULL ) { - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, \ - "can't retrieve xfer mode") + HDassert( slist_entry_ptr->magic == + H5AC__H5AC_SLIST_ENTRY_T_MAGIC ); + HDassert( slist_entry_ptr->addr == addr ); + + if ( H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr)) + != slist_entry_ptr ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTDELETE, FAIL, \ + "Can't delete entry from cleaned entry slist.") + } + slist_entry_ptr->magic = 0; + H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + slist_entry_ptr = NULL; + + aux_ptr->c_slist_len -= 1; + + HDassert( aux_ptr->c_slist_len >= 0 ); } + } - if ( xfer_mode == H5FD_MPIO_INDEPENDENT ) { +done: + + FUNC_LEAVE_NOAPI(ret_value) - write_permitted = FALSE; +} /* H5AC_log_deleted_entry() */ +#endif /* H5_HAVE_PARALLEL */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC_log_dirtied_entry() + * + * Purpose: Update the dirty_bytes count for a newly dirtied entry. + * + * If mpi_rank isnt 0, this simply means adding the size + * of the entries to the dirty_bytes count. + * + * If mpi_rank is 0, we must first check to see if the entry + * appears in the dirty entries slist. If it is, do nothing. + * If it isn't, add the size to th dirty_bytes count, add the + * entry to the dirty entries slist, and remove it from the + * cleaned list (if it is present there). + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 6/29/05 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +#ifdef H5_HAVE_PARALLEL +static herr_t +H5AC_log_dirtied_entry(H5AC_t * cache_ptr, + H5AC_info_t * entry_ptr, + haddr_t addr, + hbool_t size_changed, + size_t new_size) +{ + herr_t ret_value = SUCCEED; /* Return value */ + size_t entry_size; + H5AC_aux_t * aux_ptr = NULL; + H5AC_slist_entry_t * slist_entry_ptr = NULL; + + FUNC_ENTER_NOAPI(H5AC_log_dirtied_entry, FAIL) + + HDassert( cache_ptr != NULL ); + HDassert( cache_ptr->magic == H5C__H5C_T_MAGIC ); + + aux_ptr = cache_ptr->aux_ptr; + + HDassert( aux_ptr != NULL ); + HDassert( aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC ); + + HDassert( entry_ptr != NULL ); + HDassert( entry_ptr->addr == addr ); + HDassert( entry_ptr->is_dirty == FALSE ); + + if ( size_changed ) { + + entry_size = new_size; + + } else { + + entry_size = entry_ptr->size; + } + + if ( aux_ptr->mpi_rank == 0 ) { + + HDassert( aux_ptr->d_slist_ptr != NULL ); + HDassert( aux_ptr->c_slist_ptr != NULL ); + + if ( H5SL_search(aux_ptr->d_slist_ptr, (void *)(&addr)) == NULL ) { + + /* insert the address of the entry in the dirty entry list, and + * add its size to the dirty_bytes count. + */ + if ( NULL == (slist_entry_ptr = H5FL_CALLOC(H5AC_slist_entry_t)) ) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, \ + "Can't allocate dirty slist entry .") + } + + slist_entry_ptr->magic = H5AC__H5AC_SLIST_ENTRY_T_MAGIC; + slist_entry_ptr->addr = addr; + + if ( H5SL_insert(aux_ptr->d_slist_ptr, slist_entry_ptr, + &(slist_entry_ptr->addr)) < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, \ + "can't insert entry into dirty entry slist.") + } + + aux_ptr->d_slist_len += 1; + aux_ptr->dirty_bytes += entry_size; +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->unprotect_dirty_bytes += entry_size; + aux_ptr->unprotect_dirty_bytes_updates += 1; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + } + + if ( H5SL_search(aux_ptr->c_slist_ptr, (void *)(&addr)) != NULL ) { + + /* the entry is dirty. If it exists on the cleaned entries list, + * remove it. + */ + if ( (slist_entry_ptr = H5SL_search(aux_ptr->c_slist_ptr, + (void *)(&addr))) != NULL ) { + + HDassert( slist_entry_ptr->magic == + H5AC__H5AC_SLIST_ENTRY_T_MAGIC ); + HDassert( slist_entry_ptr->addr == addr ); + + if ( H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr)) + != slist_entry_ptr ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTDELETE, FAIL, \ + "Can't delete entry from clean entry slist.") + } + + slist_entry_ptr->magic = 0; + H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + slist_entry_ptr = NULL; + + aux_ptr->c_slist_len -= 1; + + HDassert( aux_ptr->c_slist_len >= 0 ); + } + } + } else { + + aux_ptr->dirty_bytes += entry_size; +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->unprotect_dirty_bytes += entry_size; + aux_ptr->unprotect_dirty_bytes_updates += 1; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + } + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5AC_log_dirtied_entry() */ +#endif /* H5_HAVE_PARALLEL */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC_log_flushed_entry() + * + * Purpose: Update the clean entry slist for the flush of an entry -- + * specifically, if the entry has been cleared, remove it + * from both the cleaned and dirtied lists if it is present. + * Otherwise, if the entry was dirty, insert the indicated + * entry address in the clean slist if it isn't there already. + * + * This function is only used in PHDF5, and should only + * be called for the process with mpi rank 0. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 6/29/05 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +#ifdef H5_HAVE_PARALLEL +#if 0 /* This is useful debugging code. -- JRM */ +static herr_t +H5AC_log_flushed_entry_dummy(H5C_t * cache_ptr, + haddr_t addr, + hbool_t was_dirty, + unsigned flags, + int type_id) +{ + herr_t ret_value = SUCCEED; /* Return value */ + H5AC_aux_t * aux_ptr = NULL; + + FUNC_ENTER_NOAPI(H5AC_log_flushed_entry_dummy, FAIL) + + aux_ptr = cache_ptr->aux_ptr; + + if ( ( was_dirty ) && ( (flags & H5C__FLUSH_CLEAR_ONLY_FLAG) == 0 ) ) { + + HDfprintf(stdout, + "%d:H5AC_log_flushed_entry(): addr = %d, flags = %x, was_dirty = %d, type_id = %d\n", + (int)(aux_ptr->mpi_rank), (int)addr, flags, (int)was_dirty, type_id); + } +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5AC_log_flushed_entry_dummy() */ +#endif /* JRM */ + +static herr_t +H5AC_log_flushed_entry(H5C_t * cache_ptr, + haddr_t addr, + hbool_t was_dirty, + unsigned flags, + UNUSED int type_id) +{ + herr_t ret_value = SUCCEED; /* Return value */ + hbool_t cleared; + H5AC_aux_t * aux_ptr; + H5AC_slist_entry_t * slist_entry_ptr = NULL; + + + FUNC_ENTER_NOAPI(H5AC_log_flushed_entry, FAIL) + + HDassert( cache_ptr != NULL ); + HDassert( cache_ptr->magic == H5C__H5C_T_MAGIC ); + + aux_ptr = cache_ptr->aux_ptr; + + HDassert( aux_ptr != NULL ); + HDassert( aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC ); + HDassert( aux_ptr->mpi_rank == 0 ); + HDassert( aux_ptr->c_slist_ptr != NULL ); + + cleared = ( (flags & H5C__FLUSH_CLEAR_ONLY_FLAG) != 0 ); + + if ( cleared ) { + + /* If the entry has been cleared, must remove it from both the + * cleaned list and the dirtied list. + */ + + if ( (slist_entry_ptr = H5SL_search(aux_ptr->c_slist_ptr, + (void *)(&addr))) != NULL ) { + + HDassert( slist_entry_ptr->magic == H5AC__H5AC_SLIST_ENTRY_T_MAGIC); + HDassert( slist_entry_ptr->addr == addr ); + + if ( H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr)) + != slist_entry_ptr ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTDELETE, FAIL, \ + "Can't delete entry from clean entry slist.") + } + + slist_entry_ptr->magic = 0; + H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + slist_entry_ptr = NULL; + + aux_ptr->c_slist_len -= 1; + + HDassert( aux_ptr->c_slist_len >= 0 ); + } + + if ( (slist_entry_ptr = H5SL_search(aux_ptr->d_slist_ptr, + (void *)(&addr))) != NULL ) { + + HDassert( slist_entry_ptr->magic == H5AC__H5AC_SLIST_ENTRY_T_MAGIC); + HDassert( slist_entry_ptr->addr == addr ); + + if ( H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&addr)) + != slist_entry_ptr ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTDELETE, FAIL, \ + "Can't delete entry from dirty entry slist.") + } + + slist_entry_ptr->magic = 0; + H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + slist_entry_ptr = NULL; + + aux_ptr->d_slist_len -= 1; + + HDassert( aux_ptr->d_slist_len >= 0 ); + } + } else if ( was_dirty ) { + + if ( H5SL_search(aux_ptr->c_slist_ptr, (void *)(&addr)) == NULL ) { + + /* insert the address of the entry in the clean entry list. */ + + if ( NULL == (slist_entry_ptr = H5FL_CALLOC(H5AC_slist_entry_t)) ) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, \ + "Can't allocate clean slist entry .") + } + + slist_entry_ptr->magic = H5AC__H5AC_SLIST_ENTRY_T_MAGIC; + slist_entry_ptr->addr = addr; + + if ( H5SL_insert(aux_ptr->c_slist_ptr, slist_entry_ptr, + &(slist_entry_ptr->addr)) < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, \ + "can't insert entry into clean entry slist.") + } + + aux_ptr->c_slist_len += 1; + } + } + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5AC_log_flushed_entry() */ +#endif /* H5_HAVE_PARALLEL */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC_log_inserted_entry() + * + * Purpose: Update the dirty_bytes count for a newly inserted entry. + * + * If mpi_rank isnt 0, this simply means adding the size + * of the entry to the dirty_bytes count. + * + * If mpi_rank is 0, we must also add the entry to the + * dirty entries slist. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 6/30/05 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +#ifdef H5_HAVE_PARALLEL +static herr_t +H5AC_log_inserted_entry(H5F_t * f, + H5AC_t * cache_ptr, + H5AC_info_t * entry_ptr, + const H5AC_class_t * type, + haddr_t addr) +{ + herr_t ret_value = SUCCEED; /* Return value */ + size_t size; + H5AC_aux_t * aux_ptr = NULL; + H5AC_slist_entry_t * slist_entry_ptr = NULL; + + FUNC_ENTER_NOAPI(H5AC_log_inserted_entry, FAIL) + + HDassert( cache_ptr != NULL ); + HDassert( cache_ptr->magic == H5C__H5C_T_MAGIC ); + + aux_ptr = cache_ptr->aux_ptr; + + HDassert( aux_ptr != NULL ); + HDassert( aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC ); + + HDassert( entry_ptr != NULL ); + HDassert( entry_ptr->addr == addr ); + HDassert( entry_ptr->type == type ); + + /* the size field of the entry will not have been set yet, so we + * have to obtain it directly. + */ + if ( (type->size)(f, (void *)entry_ptr, &size) < 0 ) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTGETSIZE, FAIL, \ + "Can't get size of entry to be inserted.") + } + + if ( aux_ptr->mpi_rank == 0 ) { + + HDassert( aux_ptr->d_slist_ptr != NULL ); + HDassert( aux_ptr->c_slist_ptr != NULL ); + + if ( H5SL_search(aux_ptr->d_slist_ptr, (void *)(&addr)) == NULL ) { + + /* insert the address of the entry in the dirty entry list, and + * add its size to the dirty_bytes count. + */ + if ( NULL == (slist_entry_ptr = H5FL_CALLOC(H5AC_slist_entry_t)) ) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, \ + "Can't allocate dirty slist entry .") + } + + slist_entry_ptr->magic = H5AC__H5AC_SLIST_ENTRY_T_MAGIC; + slist_entry_ptr->addr = addr; + + if ( H5SL_insert(aux_ptr->d_slist_ptr, slist_entry_ptr, + &(slist_entry_ptr->addr)) < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, \ + "can't insert entry into dirty entry slist.") + } + + aux_ptr->d_slist_len += 1; + + } else { + + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \ + "Inserted entry already in dirty slist.") + } + + if ( H5SL_search(aux_ptr->c_slist_ptr, (void *)(&addr)) != NULL ) { + + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \ + "Inserted entry in clean slist.") + } + } + + aux_ptr->dirty_bytes += size; + +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->insert_dirty_bytes += size; + aux_ptr->insert_dirty_bytes_updates += 1; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5AC_log_inserted_entry() */ +#endif /* H5_HAVE_PARALLEL */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC_log_renamed_entry() + * + * Purpose: Update the dirty_bytes count for a renamed entry. + * + * WARNING + * + * At present, the way that the rename call is used ensures + * that the renamed entry is present in all caches by + * renaming in a collective operation and immediately after + * unprotecting the target entry. + * + * This function uses this invarient, and will cause arcane + * failures if it is not met. If maintaining this invarient + * becomes impossible, we will have to rework this function + * extensively, and likely include a bit of IPC for + * synchronization. A better option might be to subsume + * rename in the unprotect operation. + * + * Given that the target entry is in all caches, the function + * proceeds as follows: + * + * For processes with mpi rank other 0, it simply checks to + * see if the entry was dirty prior to the rename, and adds + * the entries size to the dirty bytes count. + * + * In the process with mpi rank 0, the function first checks + * to see if the entry was dirty prior to the rename. If it + * was, and if the entry doesn't appear in the dirtied list + * under its old address, it adds the entry's size to the + * dirty bytes count. + * + * The rank 0 process then removes any references to the + * entry under its old address from the cleands and dirtied + * lists, and inserts an entry in the dirtied list under the + * new address. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 6/30/05 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +#ifdef H5_HAVE_PARALLEL +static herr_t +H5AC_log_renamed_entry(H5AC_t * cache_ptr, + haddr_t old_addr, + haddr_t new_addr) +{ + herr_t ret_value = SUCCEED; /* Return value */ + hbool_t entry_in_cache; + hbool_t entry_dirty; + size_t entry_size; + H5AC_aux_t * aux_ptr = NULL; + H5AC_slist_entry_t * slist_entry_ptr = NULL; + + FUNC_ENTER_NOAPI(H5AC_log_renamed_entry, FAIL) + + HDassert( cache_ptr != NULL ); + HDassert( cache_ptr->magic == H5C__H5C_T_MAGIC ); + + aux_ptr = cache_ptr->aux_ptr; + + HDassert( aux_ptr != NULL ); + HDassert( aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC ); + + /* get entry status, size, etc here */ + if ( H5C_get_entry_status(cache_ptr, old_addr, &entry_size, &entry_in_cache, + &entry_dirty, NULL) < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't get entry status.") + + } else if ( ! entry_in_cache ) { + + HDassert( entry_in_cache ); + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "entry not in cache.") + } + + if ( aux_ptr->mpi_rank == 0 ) { + + HDassert( aux_ptr->d_slist_ptr != NULL ); + HDassert( aux_ptr->c_slist_ptr != NULL ); + + /* if the entry appears in the cleaned entry slist, under its old + * address, remove it. + */ + if ( (slist_entry_ptr = H5SL_search(aux_ptr->c_slist_ptr, + (void *)(&old_addr))) != NULL ) { + + HDassert( slist_entry_ptr->magic == + H5AC__H5AC_SLIST_ENTRY_T_MAGIC ); + HDassert( slist_entry_ptr->addr == old_addr ); + + if ( H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&old_addr)) + != slist_entry_ptr ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTDELETE, FAIL, \ + "Can't delete entry from cleaned entry slist.") + } + + slist_entry_ptr->magic = 0; + H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr); + slist_entry_ptr = NULL; + + aux_ptr->c_slist_len -= 1; + + HDassert( aux_ptr->c_slist_len >= 0 ); + } + + /* if the entry appears in the dirtied entry slist under its old + * address, remove it, but don't free it. Set addr to new_addr. + */ + if ( (slist_entry_ptr = H5SL_search(aux_ptr->d_slist_ptr, + (void *)(&old_addr))) != NULL ) { + + HDassert( slist_entry_ptr->magic == + H5AC__H5AC_SLIST_ENTRY_T_MAGIC ); + HDassert( slist_entry_ptr->addr == old_addr ); + + if ( H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&old_addr)) + != slist_entry_ptr ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTDELETE, FAIL, \ + "Can't delete entry from dirty entry slist.") + } + + slist_entry_ptr->addr = new_addr; + + aux_ptr->d_slist_len -= 1; + + HDassert( aux_ptr->d_slist_len >= 0 ); } else { + + /* otherwise, allocate a new entry that is ready + * for insertion, and increment dirty_bytes. + * + * Note that the fact that the entry wasn't in the dirtied + * list under its old address implies that it must have + * been clean to start with. + */ + + HDassert( !entry_dirty ); + + if ( NULL == (slist_entry_ptr = H5FL_CALLOC(H5AC_slist_entry_t)) ) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, \ + "Can't allocate dirty slist entry .") + } + + slist_entry_ptr->magic = H5AC__H5AC_SLIST_ENTRY_T_MAGIC; + slist_entry_ptr->addr = new_addr; - HDassert(xfer_mode == H5FD_MPIO_COLLECTIVE ); + aux_ptr->dirty_bytes += entry_size; +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->rename_dirty_bytes += entry_size; + aux_ptr->rename_dirty_bytes_updates += 1; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ } + + /* verify that there is no entry at new_addr in the dirty slist */ + if ( H5SL_search(aux_ptr->d_slist_ptr, (void *)(&new_addr)) != NULL ) { + + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \ + "dirty slist already contains entry at new_addr.") + } + + /* insert / reinsert the entry in the dirty slist */ + if ( H5SL_insert(aux_ptr->d_slist_ptr, slist_entry_ptr, + &(slist_entry_ptr->addr)) < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, \ + "can't insert entry into dirty entry slist.") + } + + aux_ptr->d_slist_len += 1; + + } else if ( ! entry_dirty ) { + + aux_ptr->dirty_bytes += entry_size; + +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->rename_dirty_bytes += entry_size; + aux_ptr->rename_dirty_bytes_updates += 1; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ } +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5AC_log_renamed_entry() */ #endif /* H5_HAVE_PARALLEL */ - *write_permitted_ptr = write_permitted; + +/*------------------------------------------------------------------------- + * Function: H5AC_propagate_flushed_and_still_clean_entries_list + * + * Purpose: In PHDF5, only the metadata cache with mpi rank 0 is allowed + * to write to file. All other metadata caches on processes + * with rank greater than 0 must retain dirty entries until + * they are notified that the entry is now clean. + * + * This function is the main routine for that proceedure. + * It must be called simultaniously on all processes that + * have the relevant file open. To this end, there must + * be a barrier immediately prior to this call. + * + * Typicaly, this will be done one of two ways: + * + * 1) Dirty byte creation exceeds some user specified value. + * + * While metadata reads may occur independently, all + * operations writing metadata must be collective. Thus + * all metadata caches see the same sequence of operations, + * and therefore the same dirty data creation. + * + * This fact is used to synchronize the caches for purposes + * of propagating the list of flushed and still clean + * entries, by simply calling this function from all + * caches whenever some user specified threshold on dirty + * data is exceeded. + * + * 2) Under direct user control -- this operation must be + * collective. + * + * The operations to be managed by this function are as + * follows: + * + * For the process with mpi rank 0: + * + * 1) Enable writes, flush the cache to its min clean size, + * and then disable writes again. + * + * 2) Load the contents of the flushed and still clean entries + * list (c_slist_ptr) into a buffer, and broadcast that + * buffer to all the other caches. + * + * 3) Clear the flushed and still clean entries list + * (c_slist_ptr). + * + * + * For all processes with mpi rank greater than 0: + * + * 1) Receive the flushed and still clean entries list broadcast + * + * 2) Mark the specified entries as clean. + * + * + * For all processes: + * + * 1) Reset the dirtied bytes count to 0. + * + * Return: Success: non-negative + * + * Failure: negative + * + * Programmer: John Mainzer + * July 5, 2005 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +#ifdef H5_HAVE_PARALLEL +herr_t +H5AC_propagate_flushed_and_still_clean_entries_list(H5F_t * f, + hid_t dxpl_id, + H5AC_t * cache_ptr, + hbool_t do_barrier) +{ + herr_t ret_value = SUCCEED; /* Return value */ + herr_t result; + int mpi_code; + H5AC_aux_t * aux_ptr = NULL; + + FUNC_ENTER_NOAPI(H5AC_propagate_flushed_and_still_clean_entries_list, FAIL) + + HDassert( cache_ptr != NULL ); + HDassert( cache_ptr->magic == H5C__H5C_T_MAGIC ); + + aux_ptr = cache_ptr->aux_ptr; + + HDassert( aux_ptr != NULL ); + HDassert( aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC ); + +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + HDfprintf(stdout, + "%d:H5AC_propagate...:%d: (u/uu/i/iu/r/ru) = %d/%d/%d/%d/%d/%d\n", + (int)(aux_ptr->mpi_rank), + (int)(aux_ptr->dirty_bytes_propagations), + (int)(aux_ptr->unprotect_dirty_bytes), + (int)(aux_ptr->unprotect_dirty_bytes_updates), + (int)(aux_ptr->insert_dirty_bytes), + (int)(aux_ptr->insert_dirty_bytes_updates), + (int)(aux_ptr->rename_dirty_bytes), + (int)(aux_ptr->rename_dirty_bytes_updates)); +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + + if ( do_barrier ) { + + /* to prevent "messages from the future" we must synchronize all + * processes before we start the flush. This synchronization may + * already be done -- hence the do_barrier parameter. + */ + + if ( MPI_SUCCESS != (mpi_code = MPI_Barrier(aux_ptr->mpi_comm)) ) { + + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) + } + } + + if ( aux_ptr->mpi_rank == 0 ) { + + aux_ptr->write_permitted = TRUE; + + result = H5C_flush_to_min_clean(f, dxpl_id, H5AC_noblock_dxpl_id, + cache_ptr); + + aux_ptr->write_permitted = FALSE; + + if ( result < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \ + "H5C_flush_to_min_clean() failed.") + } + + if ( H5AC_broadcast_clean_list(cache_ptr) < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \ + "Can't broadcast clean slist.") + } + + HDassert( aux_ptr->c_slist_len == 0 ); + + } else { + + if ( H5AC_receive_and_apply_clean_list(f, dxpl_id, + H5AC_noblock_dxpl_id, + cache_ptr) < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \ + "Can't receive and/or process clean slist broadcast.") + } + } + + aux_ptr->dirty_bytes = 0; +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + aux_ptr->dirty_bytes_propagations += 1; + aux_ptr->unprotect_dirty_bytes = 0; + aux_ptr->unprotect_dirty_bytes_updates = 0; + aux_ptr->insert_dirty_bytes = 0; + aux_ptr->insert_dirty_bytes_updates = 0; + aux_ptr->rename_dirty_bytes = 0; + aux_ptr->rename_dirty_bytes_updates = 0; +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ done: FUNC_LEAVE_NOAPI(ret_value) -} /* H5AC_check_if_write_permitted() */ +} /* H5AC_propagate_flushed_and_still_clean_entries_list() */ +#endif /* H5_HAVE_PARALLEL */ + + +/*------------------------------------------------------------------------- + * + * Function: H5AC_receive_and_apply_clean_list() + * + * Purpose: Receive the list of cleaned entries from process 0, + * and mark the specified entries as clean. + * + * This function must only be called by the process with + * MPI_rank greater than 0. + * + * Return SUCCEED on success, and FAIL on failure. + * + * Return: Non-negative on success/Negative on failure. + * + * Programmer: John Mainzer, 7/4/05 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +#ifdef H5_HAVE_PARALLEL +static herr_t +H5AC_receive_and_apply_clean_list(H5F_t * f, + hid_t primary_dxpl_id, + hid_t secondary_dxpl_id, + H5AC_t * cache_ptr) +{ + herr_t ret_value = SUCCEED; /* Return value */ + H5AC_aux_t * aux_ptr = NULL; + haddr_t * haddr_buf_ptr = NULL; + MPI_Offset * MPI_Offset_buf_ptr = NULL; + size_t buf_size; + int i = 0; + int mpi_result; + int num_entries; + + FUNC_ENTER_NOAPI(H5AC_receive_and_apply_clean_list, FAIL) + + HDassert( cache_ptr != NULL ); + HDassert( cache_ptr->magic == H5C__H5C_T_MAGIC ); + + aux_ptr = cache_ptr->aux_ptr; + + HDassert( aux_ptr != NULL ); + HDassert( aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC ); + HDassert( aux_ptr->mpi_rank != 0 ); + + /* First receive the number of entries in the list so that we + * can set up a buffer to receive them. If there aren't + * any, we are done. + */ + mpi_result = MPI_Bcast(&num_entries, 1, MPI_INT, 0, aux_ptr->mpi_comm); + + if ( mpi_result != MPI_SUCCESS ) { + + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed 1", mpi_result) + } + + if ( num_entries > 0 ) + { + /* allocate a buffers to store the list of entry base addresses in */ + + buf_size = sizeof(MPI_Offset) * (size_t)num_entries; + + MPI_Offset_buf_ptr = (MPI_Offset *)H5MM_malloc(buf_size); + + if ( MPI_Offset_buf_ptr == NULL ) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, \ + "memory allocation failed for receive buffer") + } + + haddr_buf_ptr = (haddr_t *)H5MM_malloc(sizeof(haddr_t) * + (size_t)num_entries); + + if ( haddr_buf_ptr == NULL ) { + + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, \ + "memory allocation failed for haddr buffer") + } + + + /* Now receive the list of cleaned entries + * + * The peculiar structure of the following call to MPI_Bcast is + * due to MPI's (?) failure to believe in the MPI_Offset type. + * Thus the element type is MPI_BYTE, with size equal to the + * buf_size computed above. + */ + + mpi_result = MPI_Bcast((void *)MPI_Offset_buf_ptr, (int)buf_size, + MPI_BYTE, 0, aux_ptr->mpi_comm); + + if ( mpi_result != MPI_SUCCESS ) { + + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed 2", mpi_result) + } + + + /* translate the MPI_Offsets to haddr_t */ + i = 0; + while ( i < num_entries ) + { + haddr_buf_ptr[i] = H5FD_mpi_MPIOff_to_haddr(MPI_Offset_buf_ptr[i]); + + if ( haddr_buf_ptr[i] == HADDR_UNDEF ) { + + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, \ + "can't convert MPI off to haddr") + } + + i++; + } + + + /* mark the indicated entries as clean */ + if ( H5C_mark_entries_as_clean(f, primary_dxpl_id, secondary_dxpl_id, + cache_ptr, (int32_t)num_entries, + &(haddr_buf_ptr[0])) < 0 ) { + + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, \ + "Can't mark entries clean.") + + } + } + +done: + + if ( MPI_Offset_buf_ptr != NULL ) { + + MPI_Offset_buf_ptr = + (MPI_Offset *)H5MM_xfree((void *)MPI_Offset_buf_ptr); + } + + if ( haddr_buf_ptr != NULL ) { + + haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr); + } + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5AC_receive_and_apply_clean_list() */ +#endif /* H5_HAVE_PARALLEL */ + |