diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/H5AC.c | 95 | ||||
-rw-r--r-- | src/H5ACmpio.c | 8 | ||||
-rw-r--r-- | src/H5ACprivate.h | 11 | ||||
-rw-r--r-- | src/H5C.c | 797 | ||||
-rw-r--r-- | src/H5Cmpio.c | 305 | ||||
-rw-r--r-- | src/H5Cpkg.h | 271 | ||||
-rw-r--r-- | src/H5Cprivate.h | 6 | ||||
-rw-r--r-- | src/H5Dchunk.c | 20 | ||||
-rw-r--r-- | src/H5FDmpio.c | 78 | ||||
-rw-r--r-- | src/H5Fint.c | 40 | ||||
-rw-r--r-- | src/H5Fmpi.c | 29 | ||||
-rw-r--r-- | src/H5Fpkg.h | 4 | ||||
-rw-r--r-- | src/H5Fprivate.h | 15 | ||||
-rw-r--r-- | src/H5Fquery.c | 28 | ||||
-rw-r--r-- | src/H5Pdxpl.c | 17 | ||||
-rw-r--r-- | src/H5Pfapl.c | 292 | ||||
-rw-r--r-- | src/H5Pint.c | 23 | ||||
-rw-r--r-- | src/H5Plapl.c | 14 | ||||
-rw-r--r-- | src/H5Ppkg.h | 2 | ||||
-rw-r--r-- | src/H5Pprivate.h | 8 | ||||
-rw-r--r-- | src/H5Ppublic.h | 6 |
21 files changed, 1735 insertions, 334 deletions
@@ -80,8 +80,13 @@ hbool_t H5_PKG_INIT_VAR = FALSE; /* Library Private Variables */ /*****************************/ -/* Default dataset transfer property list for metadata I/O calls */ -hid_t H5AC_dxpl_id = (-1); +/* Default dataset transfer property list for metadata I/O calls (coll write, ind read) */ +hid_t H5AC_ind_read_dxpl_id = (-1); +hid_t H5AC_dxpl_id; +#ifdef H5_HAVE_PARALLEL +/* collective metadata read property */ +hid_t H5AC_coll_read_dxpl_id = (-1); +#endif /* H5_HAVE_PARALLEL */ /* DXPL to be used in operations that will not result in I/O calls */ hid_t H5AC_noio_dxpl_id = (-1); @@ -174,8 +179,11 @@ done: herr_t H5AC__init_package(void) { -#ifdef H5_DEBUG_BUILD H5P_genplist_t *xfer_plist; /* Dataset transfer property list object */ +#ifdef H5_HAVE_PARALLEL + H5P_coll_md_read_flag_t coll_meta_read; +#endif /* H5_HAVE_PARALLEL */ +#ifdef H5_DEBUG_BUILD H5FD_dxpl_type_t dxpl_type; /* Property indicating the type of the internal dxpl */ #endif /* H5_DEBUG_BUILD */ herr_t ret_value = SUCCEED; /* Return value */ @@ -195,12 +203,16 @@ H5AC__init_package(void) } #endif /* H5_HAVE_PARALLEL */ -#ifdef H5_DEBUG_BUILD - /* Get an ID for the metadata (H5AC) dxpl */ - if((H5AC_dxpl_id = H5P_create_id(H5P_CLS_DATASET_XFER_g, FALSE)) < 0) +#if defined(H5_HAVE_PARALLEL) || defined(H5_DEBUG_BUILD) + /* Get an ID for the internal independent metadata dxpl */ + if((H5AC_ind_read_dxpl_id = H5P_create_id(H5P_CLS_DATASET_XFER_g, FALSE)) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTCREATE, FAIL, "unable to register property list") + + /* if this is a debug build, set the dxpl type flag on the + independent metadata dxpl and create the noio and raw data internal dxpls */ +#ifdef H5_DEBUG_BUILD /* Get the property list object */ - if (NULL == (xfer_plist = (H5P_genplist_t *)H5I_object(H5AC_dxpl_id))) + if (NULL == (xfer_plist = (H5P_genplist_t *)H5I_object(H5AC_ind_read_dxpl_id))) HGOTO_ERROR(H5E_CACHE, H5E_BADATOM, FAIL, "can't get new property list object") /* Insert the dxpl type property */ dxpl_type = H5FD_METADATA_DXPL; @@ -228,11 +240,40 @@ H5AC__init_package(void) dxpl_type = H5FD_RAWDATA_DXPL; if(H5P_set(xfer_plist, H5FD_DXPL_TYPE_NAME, &dxpl_type) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set dxpl type property") -#else /* H5_DEBUG_BUILD */ - H5AC_dxpl_id = H5P_DATASET_XFER_DEFAULT; +#endif /* H5_DEBUG_BUILD */ + + /* if this is a parallel build, create an internal dxpl for + collective metadata reads */ +#ifdef H5_HAVE_PARALLEL + /* Get an ID for H5AC_coll_read_dxpl_id */ + if((H5AC_coll_read_dxpl_id = H5P_create_id(H5P_CLS_DATASET_XFER_g, FALSE)) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTCREATE, FAIL, "unable to register property list") + /* Get the property list object */ + if (NULL == (xfer_plist = (H5P_genplist_t *)H5I_object(H5AC_coll_read_dxpl_id))) + HGOTO_ERROR(H5E_CACHE, H5E_BADATOM, FAIL, "can't get new property list object") + /* set 'collective metadata read' property */ + coll_meta_read = H5P_USER_TRUE; + if(H5P_set(xfer_plist, H5_COLL_MD_READ_FLAG_NAME, &coll_meta_read) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set collective metadata read flag") + + /* if we have a debug build, set the dxpl type to metadata on the + collective metadata dxpl */ +#ifdef H5_DEBUG_BUILD + /* set metadata dxpl type */ + dxpl_type = H5FD_METADATA_DXPL; + if(H5P_set(xfer_plist, H5FD_DXPL_TYPE_NAME, &dxpl_type) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set dxpl type property") +#endif /* H5_DEBUG_BUILD */ +#endif /* H5_HAVE_PARALLEL */ + +#else /* defined(H5_HAVE_PARALLEL) || defined(H5_DEBUG_BUILD) */ + H5AC_ind_read_dxpl_id = H5P_DATASET_XFER_DEFAULT; H5AC_noio_dxpl_id = H5P_DATASET_XFER_DEFAULT; H5AC_rawdata_dxpl_id = H5P_DATASET_XFER_DEFAULT; -#endif /* H5_DEBUG_BUILD */ +#endif /* defined(H5_HAVE_PARALLEL) || defined(H5_DEBUG_BUILD) */ + + /* MSC - Temp. Remove later */ + H5AC_dxpl_id = H5AC_ind_read_dxpl_id; done: FUNC_LEAVE_NOAPI(ret_value) @@ -261,21 +302,33 @@ H5AC_term_package(void) FUNC_ENTER_NOAPI_NOINIT_NOERR if(H5_PKG_INIT_VAR) { - if(H5AC_dxpl_id > 0 || H5AC_noio_dxpl_id > 0 || H5AC_rawdata_dxpl_id > 0) { -#ifdef H5_DEBUG_BUILD + if(H5AC_ind_read_dxpl_id > 0 || H5AC_noio_dxpl_id > 0 || H5AC_rawdata_dxpl_id > 0 +#ifdef H5_HAVE_PARALLEL + || H5AC_coll_read_dxpl_id > 0 +#endif /* H5_HAVE_PARALLEL */ + ) { +#if defined(H5_HAVE_PARALLEL) || defined(H5_DEBUG_BUILD) /* Indicate more work to do */ n = 1; /* H5I */ - /* Close H5AC dxpl */ - if(H5I_dec_ref(H5AC_dxpl_id) < 0 || + /* Close H5AC dxpls */ + if(H5I_dec_ref(H5AC_ind_read_dxpl_id) < 0 || H5I_dec_ref(H5AC_noio_dxpl_id) < 0 || - H5I_dec_ref(H5AC_rawdata_dxpl_id) < 0) + H5I_dec_ref(H5AC_rawdata_dxpl_id) < 0 +#ifdef H5_HAVE_PARALLEL + || H5I_dec_ref(H5AC_coll_read_dxpl_id) < 0 +#endif /* H5_HAVE_PARALLEL */ + ) H5E_clear_stack(NULL); /*ignore error*/ -#endif /* H5_DEBUG_BUILD */ +#endif /* defined(H5_HAVE_PARALLEL) || defined(H5_DEBUG_BUILD) */ + /* Reset static IDs */ - H5AC_dxpl_id = (-1); + H5AC_ind_read_dxpl_id = (-1); H5AC_noio_dxpl_id = (-1); H5AC_rawdata_dxpl_id = (-1); +#ifdef H5_HAVE_PARALLEL + H5AC_coll_read_dxpl_id = (-1); +#endif /* H5_HAVE_PARALLEL */ } /* end if */ /* Reset interface initialization flag */ @@ -491,6 +544,10 @@ H5AC_dest(H5F_t *f, hid_t dxpl_id) #endif /* H5AC__TRACE_FILE_ENABLED */ #ifdef H5_HAVE_PARALLEL + /* destroying the cache, so clear all collective entries */ + if(H5C_clear_coll_entries(f->shared->cache, 0) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5C_clear_coll_entries() failed.") + aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(f->shared->cache); if(aux_ptr) /* Sanity check */ @@ -629,6 +686,10 @@ H5AC_flush(H5F_t *f, hid_t dxpl_id) #endif /* H5AC__TRACE_FILE_ENABLED */ #ifdef H5_HAVE_PARALLEL + /* flushing the cache, so clear all collective entries */ + if(H5C_clear_coll_entries(f->shared->cache, 0) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5C_clear_coll_entries() failed.") + /* Attempt to flush all entries from rank 0 & Bcast clean list to other ranks */ if(H5AC__flush_entries(f, dxpl_id) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.") diff --git a/src/H5ACmpio.c b/src/H5ACmpio.c index 79b0b54..db291cf 100644 --- a/src/H5ACmpio.c +++ b/src/H5ACmpio.c @@ -2065,6 +2065,14 @@ HDfprintf(stdout, "%d:H5AC_propagate...:%u: (u/uu/i/iu/r/ru) = %zu/%u/%zu/%u/%zu aux_ptr->rename_dirty_bytes_updates); #endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + /* clear collective access flag on half of the entries in the + cache and mark them as independent in case they need to be + evicted later. All ranks are guranteed to mark the same entries + since we don't modify the order of the collectively accessed + entries except through collective access. */ + if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5C_clear_coll_entries() failed.") + switch(aux_ptr->metadata_write_strategy) { case H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY: switch(sync_point_op) { diff --git a/src/H5ACprivate.h b/src/H5ACprivate.h index 0231aa0..71ddcf3 100644 --- a/src/H5ACprivate.h +++ b/src/H5ACprivate.h @@ -181,13 +181,6 @@ typedef H5C_cache_entry_t H5AC_info_t; /* Typedef for metadata cache (defined in H5Cpkg.h) */ typedef H5C_t H5AC_t; -#ifdef H5_HAVE_PARALLEL -/* Definitions for "collective metadata write" property */ -#define H5AC_COLLECTIVE_META_WRITE_NAME "H5AC_collective_metadata_write" -#define H5AC_COLLECTIVE_META_WRITE_SIZE sizeof(unsigned) -#define H5AC_COLLECTIVE_META_WRITE_DEF 0 -#endif /* H5_HAVE_PARALLEL */ - #define H5AC_METADATA_TAG_NAME "H5AC_metadata_tag" #define H5AC_METADATA_TAG_SIZE sizeof(haddr_t) #define H5AC_METADATA_TAG_DEF H5AC__INVALID_TAG @@ -196,6 +189,10 @@ typedef H5C_t H5AC_t; /* Dataset transfer property list for metadata calls */ H5_DLLVAR hid_t H5AC_dxpl_id; +H5_DLLVAR hid_t H5AC_ind_read_dxpl_id; +#ifdef H5_HAVE_PARALLEL +H5_DLLVAR hid_t H5AC_coll_read_dxpl_id; +#endif /* H5_HAVE_PARALLEL */ /* DXPL to be used in operations that will not result in I/O calls */ H5_DLLVAR hid_t H5AC_noio_dxpl_id; @@ -88,13 +88,11 @@ #include "H5Cpkg.h" /* Cache */ #include "H5Eprivate.h" /* Error handling */ #include "H5Fpkg.h" /* Files */ -#include "H5FDprivate.h" /* File drivers */ #include "H5FLprivate.h" /* Free Lists */ #include "H5Iprivate.h" /* IDs */ #include "H5MFprivate.h" /* File memory management */ #include "H5MMprivate.h" /* Memory management */ #include "H5Pprivate.h" /* Property lists */ -#include "H5SLprivate.h" /* Skip lists */ /****************/ @@ -156,6 +154,9 @@ static herr_t H5C_flush_ring(H5F_t *f, hid_t dxpl_id, H5C_ring_t ring, static void * H5C_load_entry(H5F_t * f, hid_t dxpl_id, +#ifdef H5_HAVE_PARALLEL + hbool_t coll_access, +#endif /* H5_HAVE_PARALLEL */ const H5C_class_t * type, haddr_t addr, void * udata); @@ -180,6 +181,9 @@ static herr_t H5C_mark_tagged_entries(H5C_t * cache_ptr, static herr_t H5C_flush_marked_entries(H5F_t * f, hid_t dxpl_id); +static herr_t H5C__generate_image(const H5F_t *f, H5C_t * cache_ptr, H5C_cache_entry_t *entry_ptr, + hid_t dxpl_id, int64_t *entry_size_change_ptr); + #if H5C_DO_TAGGING_SANITY_CHECKS static herr_t H5C_verify_tag(int id, haddr_t tag); #endif @@ -220,6 +224,9 @@ hbool_t H5_PKG_INIT_VAR = FALSE; /* Declare a free list to manage the H5C_t struct */ H5FL_DEFINE_STATIC(H5C_t); +/* Declare extern free list to manage the H5C_collective_write_t struct */ +H5FL_EXTERN(H5C_collective_write_t); + /**************************************************************************** @@ -549,6 +556,13 @@ H5C_create(size_t max_cache_size, cache_ptr->LRU_head_ptr = NULL; cache_ptr->LRU_tail_ptr = NULL; +#ifdef H5_HAVE_PARALLEL + cache_ptr->coll_list_len = 0; + cache_ptr->coll_list_size = (size_t)0; + cache_ptr->coll_head_ptr = NULL; + cache_ptr->coll_tail_ptr = NULL; +#endif /* H5_HAVE_PARALLEL */ + cache_ptr->cLRU_list_len = 0; cache_ptr->cLRU_list_size = (size_t)0; cache_ptr->cLRU_head_ptr = NULL; @@ -970,6 +984,12 @@ H5C_expunge_entry(H5F_t *f, hid_t dxpl_id, const H5C_class_t *type, HGOTO_ERROR(H5E_CACHE, H5E_CANTEXPUNGE, FAIL, "Target entry is protected.") if(entry_ptr->is_pinned) HGOTO_ERROR(H5E_CACHE, H5E_CANTEXPUNGE, FAIL, "Target entry is pinned.") +#ifdef H5_HAVE_PARALLEL + if(entry_ptr->coll_access) { + entry_ptr->coll_access = FALSE; + H5C__REMOVE_FROM_COLL_LIST(cache_ptr, entry_ptr, FAIL) + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ /* If we get this far, call H5C__flush_single_entry() with the * H5C__FLUSH_INVALIDATE_FLAG and the H5C__FLUSH_CLEAR_ONLY_FLAG. @@ -987,7 +1007,7 @@ H5C_expunge_entry(H5F_t *f, hid_t dxpl_id, const H5C_class_t *type, /* Delete the entry from the skip list on destroy */ flush_flags |= H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG; - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, flush_flags, NULL) < 0) + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, flush_flags, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTEXPUNGE, FAIL, "can't flush entry") #if H5C_DO_SANITY_CHECKS @@ -1755,6 +1775,9 @@ H5C_insert_entry(H5F_t * f, H5AC_ring_t ring = H5C_RING_UNDEFINED; hbool_t insert_pinned; hbool_t flush_last; +#ifdef H5_HAVE_PARALLEL + hbool_t coll_access = FALSE; /* whether access to the cache entry is done collectively */ +#endif /* H5_HAVE_PARALLEL */ hbool_t set_flush_marker; hbool_t write_permitted = TRUE; size_t empty_space; @@ -1892,6 +1915,11 @@ H5C_insert_entry(H5F_t * f, entry_ptr->aux_next = NULL; entry_ptr->aux_prev = NULL; +#ifdef H5_HAVE_PARALLEL + entry_ptr->coll_next = NULL; + entry_ptr->coll_prev = NULL; +#endif /* H5_HAVE_PARALLEL */ + H5C__RESET_CACHE_ENTRY_STATS(entry_ptr) if ( ( cache_ptr->flash_size_increase_possible ) && @@ -1993,6 +2021,44 @@ H5C_insert_entry(H5F_t * f, H5C__UPDATE_STATS_FOR_INSERTION(cache_ptr, entry_ptr) +#ifdef H5_HAVE_PARALLEL + /* Get the dataset transfer property list */ + if(NULL == (dxpl = (H5P_genplist_t *)H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a property list"); + + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { + coll_access = (H5P_USER_TRUE == f->coll_md_read ? TRUE : FALSE); + + if(!coll_access && H5P_FORCE_FALSE != f->coll_md_read) { + H5P_coll_md_read_flag_t prop_value; + + /* Get the property value */ + if(H5P_get(dxpl, H5_COLL_MD_READ_FLAG_NAME, &prop_value) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "Can't get collective metadata access flag") + coll_access = (H5P_USER_TRUE == prop_value ? TRUE : FALSE); + } /* end if */ + } /* end if */ + + entry_ptr->coll_access = coll_access; + if(coll_access) { + H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, FAIL) + + /* Make sure the size of the collective entries in the cache remain in check */ + if(H5P_USER_TRUE == f->coll_md_read) { + if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) { + if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries") + } /* end if */ + } /* end if */ + else { + if(cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100) { + if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries") + } /* end if */ + } /* end else */ + } /* end if */ +#endif + done: #if H5C_DO_EXTREME_SANITY_CHECKS if ( ( H5C_validate_protected_entry_list(cache_ptr) < 0 ) || @@ -2349,6 +2415,14 @@ H5C_resize_entry(void *thing, size_t new_size) (entry_ptr->size), (new_size)) } /* end if */ +#ifdef H5_HAVE_PARALLEL + if(entry_ptr->coll_access) { + H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr->coll_list_len), \ + (cache_ptr->coll_list_size), \ + (entry_ptr->size), (new_size)) + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + /* update the hash table */ H5C__UPDATE_INDEX_FOR_SIZE_CHANGE((cache_ptr), (entry_ptr->size),\ (new_size), (entry_ptr), (was_clean)); @@ -2575,6 +2649,9 @@ H5C_protect(H5F_t * f, hbool_t have_write_permitted = FALSE; hbool_t read_only = FALSE; hbool_t flush_last; +#ifdef H5_HAVE_PARALLEL + hbool_t coll_access = FALSE; /* whether access to the cache entry is done collectively */ +#endif /* H5_HAVE_PARALLEL */ hbool_t write_permitted; size_t empty_space; void * thing; @@ -2616,6 +2693,21 @@ H5C_protect(H5F_t * f, if((H5P_get(dxpl, H5AC_RING_NAME, &ring)) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "unable to query ring value") +#ifdef H5_HAVE_PARALLEL + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { + coll_access = (H5P_USER_TRUE == f->coll_md_read ? TRUE : FALSE); + + if(!coll_access && H5P_FORCE_FALSE != f->coll_md_read) { + H5P_coll_md_read_flag_t prop_value; + + /* get the property value */ + if(H5P_get(dxpl, H5_COLL_MD_READ_FLAG_NAME, &prop_value) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "Can't get collective metadata access flag") + coll_access = (H5P_USER_TRUE == prop_value ? TRUE : FALSE); + } /* end if */ + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + /* first check to see if the target is in cache */ H5C__SEARCH_INDEX(cache_ptr, addr, entry_ptr, NULL) @@ -2627,6 +2719,62 @@ H5C_protect(H5F_t * f, if(entry_ptr->type != type) HGOTO_ERROR(H5E_CACHE, H5E_BADTYPE, NULL, "incorrect cache entry type") + /* if this is a collective metadata read, the entry is not + marked as collective, and is clean, it is possible that + other processes will not have it in its cache and will + expect a bcast of the entry from process 0. So process 0 + will bcast the entry to all other ranks. Ranks that do have + the entry in their cache still have to participate in the + bcast. */ +#ifdef H5_HAVE_PARALLEL + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI) && coll_access) { + if(!(entry_ptr->is_dirty) && !(entry_ptr->coll_access)) { + MPI_Comm comm; /* File MPI Communicator */ + int mpi_code; /* MPI error code */ + int buf_size; + + if(MPI_COMM_NULL == (comm = H5F_mpi_get_comm(f))) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "get_comm request failed") + + if(entry_ptr->image_ptr == NULL) { + int mpi_rank; + size_t image_size; + + if((mpi_rank = H5F_mpi_get_rank(f)) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "Can't get MPI rank") + + if(entry_ptr->compressed) + image_size = entry_ptr->compressed_size; + else + image_size = entry_ptr->size; + HDassert(image_size > 0); + + if(NULL == (entry_ptr->image_ptr = H5MM_malloc(image_size + H5C_IMAGE_EXTRA_SPACE))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, NULL, "memory allocation failed for on disk image buffer") +#if H5C_DO_MEMORY_SANITY_CHECKS + HDmemcpy(((uint8_t *)entry_ptr->image_ptr) + image_size, + H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE); +#endif /* H5C_DO_MEMORY_SANITY_CHECKS */ + if(0 == mpi_rank) + if(H5C__generate_image(f, cache_ptr, entry_ptr, dxpl_id, NULL) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image") + } /* end if */ + HDassert(entry_ptr->image_ptr); + + H5_CHECKED_ASSIGN(buf_size, int, entry_ptr->size, size_t); + if(MPI_SUCCESS != (mpi_code = MPI_Bcast(entry_ptr->image_ptr, buf_size, MPI_BYTE, 0, comm))) + HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code) + + /* Mark the entry as collective and insert into the collective list */ + entry_ptr->coll_access = TRUE; + H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, NULL) + } /* end if */ + else if(entry_ptr->coll_access) { + H5C__MOVE_TO_TOP_IN_COLL_LIST(cache_ptr, entry_ptr, NULL) + } /* end else-if */ + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + #if H5C_DO_TAGGING_SANITY_CHECKS { haddr_t tag = HADDR_UNDEF; @@ -2659,7 +2807,11 @@ H5C_protect(H5F_t * f, hit = FALSE; - thing = H5C_load_entry(f, dxpl_id, type, addr, udata); + thing = H5C_load_entry(f, dxpl_id, +#ifdef H5_HAVE_PARALLEL + coll_access, +#endif /* H5_HAVE_PARALLEL */ + type, addr, udata); if ( thing == NULL ) { @@ -2668,6 +2820,10 @@ H5C_protect(H5F_t * f, entry_ptr = (H5C_cache_entry_t *)thing; entry_ptr->ring = ring; +#ifdef H5_HAVE_PARALLEL + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI) && entry_ptr->coll_access) + H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, NULL) +#endif /* H5_HAVE_PARALLEL */ /* Apply tag to newly protected entry */ if(H5C_tag_entry(cache_ptr, entry_ptr, dxpl_id) < 0) @@ -2901,6 +3057,24 @@ H5C_protect(H5F_t * f, } } +#ifdef H5_HAVE_PARALLEL + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { + /* Make sure the size of the collective entries in the cache remain in check */ + if(coll_access) { + if(H5P_USER_TRUE == f->coll_md_read) { + if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) + if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries") + } /* end if */ + else { + if(cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100) + if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries") + } /* end else */ + } /* end if */ + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + done: #if H5C_DO_EXTREME_SANITY_CHECKS @@ -4544,7 +4718,7 @@ H5C_unprotect(H5F_t * f, /* Delete the entry from the skip list on destroy */ flush_flags |= H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG; - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, flush_flags, NULL) < 0) + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, flush_flags, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "Can't flush entry") #if H5C_DO_SANITY_CHECKS @@ -4570,7 +4744,7 @@ H5C_unprotect(H5F_t * f, else if(test_entry_ptr != entry_ptr) HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "hash table contains multiple entries for addr?!?.") - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "Can't clear entry") } #endif /* H5_HAVE_PARALLEL */ @@ -5863,7 +6037,7 @@ H5C__autoadjust__ageout__evict_aged_out_entries(H5F_t * f, cache_ptr->entries_removed_counter = 0; cache_ptr->last_entry_removed_ptr = NULL; - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__NO_FLAGS_SET, NULL) < 0) + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__NO_FLAGS_SET, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush entry") if ( ( cache_ptr->entries_removed_counter > 1 ) || @@ -5875,7 +6049,7 @@ H5C__autoadjust__ageout__evict_aged_out_entries(H5F_t * f, bytes_evicted += entry_ptr->size; - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__FLUSH_INVALIDATE_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0 ) + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__FLUSH_INVALIDATE_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0 ) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush entry") } @@ -5955,7 +6129,7 @@ H5C__autoadjust__ageout__evict_aged_out_entries(H5F_t * f, prev_ptr = entry_ptr->prev; if ( ! (entry_ptr->is_dirty) ) { - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__FLUSH_INVALIDATE_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__FLUSH_INVALIDATE_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush clean entry") } /* just skip the entry if it is dirty, as we can't do @@ -6803,7 +6977,7 @@ H5C_flush_invalidate_ring(const H5F_t * f, hid_t dxpl_id, H5C_ring_t ring, entry_size_change = 0; #endif /* H5C_DO_SANITY_CHECKS */ - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__NO_FLAGS_SET, entry_size_change_ptr) < 0) + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__NO_FLAGS_SET, entry_size_change_ptr, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "dirty pinned entry flush failed.") #if H5C_DO_SANITY_CHECKS /* entry size may have changed during the flush. @@ -6848,9 +7022,9 @@ H5C_flush_invalidate_ring(const H5F_t * f, hid_t dxpl_id, H5C_ring_t ring, entry_size_change = 0; #endif /* H5C_DO_SANITY_CHECKS */ - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, (cooked_flags | H5C__FLUSH_INVALIDATE_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG), - entry_size_change_ptr) < 0) + entry_size_change_ptr, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "dirty entry flush destroy failed.") #if H5C_DO_SANITY_CHECKS /* entry size may have changed during the flush. @@ -6971,8 +7145,8 @@ H5C_flush_invalidate_ring(const H5F_t * f, hid_t dxpl_id, H5C_ring_t ring, entry_was_dirty = entry_ptr->is_dirty; if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, - (cooked_flags | H5C__FLUSH_INVALIDATE_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG), - NULL) < 0) + (cooked_flags | H5C__FLUSH_INVALIDATE_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG), + NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Entry flush destroy failed.") if(entry_was_dirty) { @@ -7342,7 +7516,7 @@ H5C_flush_ring(H5F_t *f, hid_t dxpl_id, H5C_ring_t ring, unsigned flags) entry_size_change = 0; #endif /* H5C_DO_SANITY_CHECKS */ - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, flags, entry_size_change_ptr) < 0) + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, flags, entry_size_change_ptr, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "dirty pinned entry flush failed.") #if H5C_DO_SANITY_CHECKS @@ -7387,7 +7561,7 @@ H5C_flush_ring(H5F_t *f, hid_t dxpl_id, H5C_ring_t ring, unsigned flags) flushed_entries_size += (int64_t)entry_ptr->size; entry_size_change = 0; #endif /* H5C_DO_SANITY_CHECKS */ - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, flags, entry_size_change_ptr) < 0) + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, flags, entry_size_change_ptr, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry.") #if H5C_DO_SANITY_CHECKS @@ -7536,7 +7710,11 @@ done: */ herr_t H5C__flush_single_entry(const H5F_t *f, hid_t dxpl_id, H5C_cache_entry_t *entry_ptr, - unsigned flags, int64_t *entry_size_change_ptr) + unsigned flags, int64_t *entry_size_change_ptr, H5SL_t +#ifndef H5_HAVE_PARALLEL + H5_ATTR_UNUSED +#endif /* NDEBUG */ + *collective_write_list) { H5C_t * cache_ptr; /* Cache for file */ hbool_t destroy; /* external flag */ @@ -7547,11 +7725,7 @@ H5C__flush_single_entry(const H5F_t *f, hid_t dxpl_id, H5C_cache_entry_t *entry_ hbool_t write_entry; /* internal flag */ hbool_t destroy_entry; /* internal flag */ hbool_t was_dirty; - haddr_t new_addr = HADDR_UNDEF; - haddr_t old_addr = HADDR_UNDEF; haddr_t entry_addr = HADDR_UNDEF; - size_t new_len = 0; - size_t new_compressed_len = 0; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_PACKAGE @@ -7582,6 +7756,10 @@ H5C__flush_single_entry(const H5F_t *f, hid_t dxpl_id, H5C_cache_entry_t *entry_ else destroy_entry = destroy; +#ifdef H5_HAVE_PARALLEL + HDassert(FALSE == entry_ptr->coll_access); +#endif + /* we will write the entry to disk if it exists, is dirty, and if the * clear only flag is not set. */ @@ -7632,7 +7810,6 @@ H5C__flush_single_entry(const H5F_t *f, hid_t dxpl_id, H5C_cache_entry_t *entry_ /* serialize the entry if necessary, and then write it to disk. */ if(write_entry) { - unsigned serialize_flags = H5C__SERIALIZE_NO_FLAGS_SET; /* The entry is dirty, and we are doing either a flush, * or a flush destroy. In either case, serialize the @@ -7672,225 +7849,9 @@ H5C__flush_single_entry(const H5F_t *f, hid_t dxpl_id, H5C_cache_entry_t *entry_ } /* end if */ if(!(entry_ptr->image_up_to_date)) { - /* reset cache_ptr->slist_changed so we can detect slist - * modifications in the pre_serialize call. - */ - cache_ptr->slist_changed = FALSE; - - /* make note of the entry's current address */ - old_addr = entry_ptr->addr; - - /* Call client's pre-serialize callback, if there's one */ - if ( ( entry_ptr->type->pre_serialize != NULL ) && - ( (entry_ptr->type->pre_serialize)(f, dxpl_id, - (void *)entry_ptr, - entry_ptr->addr, - entry_ptr->size, - entry_ptr->compressed_size, - &new_addr, &new_len, - &new_compressed_len, - &serialize_flags) < 0 ) ) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to pre-serialize entry") - - /* set cache_ptr->slist_change_in_pre_serialize if the - * slist was modified. - */ - if(cache_ptr->slist_changed) - cache_ptr->slist_change_in_pre_serialize = TRUE; - - /* Check for any flags set in the pre-serialize callback */ - if(serialize_flags != H5C__SERIALIZE_NO_FLAGS_SET) { - /* Check for unexpected flags from serialize callback */ - if(serialize_flags & ~(H5C__SERIALIZE_RESIZED_FLAG | - H5C__SERIALIZE_MOVED_FLAG | - H5C__SERIALIZE_COMPRESSED_FLAG)) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unknown serialize flag(s)") -#ifdef H5_HAVE_PARALLEL - /* In the parallel case, resizes and moves in - * the serialize operation can cause problems. - * If they occur, scream and die. - * - * At present, in the parallel case, the aux_ptr - * will only be set if there is more than one - * process. Thus we can use this to detect - * the parallel case. - * - * This works for now, but if we start using the - * aux_ptr for other purposes, we will have to - * change this test accordingly. - * - * NB: While this test detects entryies that attempt - * to resize or move themselves during a flush - * in the parallel case, it will not detect an - * entry that dirties, resizes, and/or moves - * other entries during its flush. - * - * From what Quincey tells me, this test is - * sufficient for now, as any flush routine that - * does the latter will also do the former. - * - * If that ceases to be the case, further - * tests will be necessary. - */ - if(cache_ptr->aux_ptr != NULL) - HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "resize/move in serialize occured in parallel case.") -#endif /* H5_HAVE_PARALLEL */ - - /* Resize the buffer if required */ - if ( ( ( ! entry_ptr->compressed ) && - ( serialize_flags & H5C__SERIALIZE_RESIZED_FLAG ) ) || - ( ( entry_ptr->compressed ) && - ( serialize_flags & H5C__SERIALIZE_COMPRESSED_FLAG ) ) ) - { - size_t new_image_size; - - if(entry_ptr->compressed) - new_image_size = new_compressed_len; - else - new_image_size = new_len; - HDassert(new_image_size > 0); - - /* Release the current image */ - if(entry_ptr->image_ptr) - entry_ptr->image_ptr = H5MM_xfree(entry_ptr->image_ptr); - - /* Allocate a new image buffer */ - if(NULL == (entry_ptr->image_ptr = H5MM_malloc(new_image_size + H5C_IMAGE_EXTRA_SPACE))) - HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for on disk image buffer") -#if H5C_DO_MEMORY_SANITY_CHECKS - HDmemcpy(((uint8_t *)entry_ptr->image_ptr) + new_image_size, H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE); -#endif /* H5C_DO_MEMORY_SANITY_CHECKS */ - } /* end if */ - - /* If required, update the entry and the cache data structures - * for a resize. - */ - if(serialize_flags & H5C__SERIALIZE_RESIZED_FLAG) { - H5C__UPDATE_STATS_FOR_ENTRY_SIZE_CHANGE(cache_ptr, \ - entry_ptr, new_len) - - /* update the hash table for the size change*/ - H5C__UPDATE_INDEX_FOR_SIZE_CHANGE(cache_ptr, \ - entry_ptr->size, \ - new_len, entry_ptr, \ - !(entry_ptr->is_dirty)); - - /* The entry can't be protected since we are - * in the process of flushing it. Thus we must - * update the replacement policy data - * structures for the size change. The macro - * deals with the pinned case. - */ - H5C__UPDATE_RP_FOR_SIZE_CHANGE(cache_ptr, entry_ptr, new_len); - - /* as we haven't updated the cache data structures for - * for the flush or flush destroy yet, the entry should - * be in the slist. Thus update it for the size change. - */ - HDassert(entry_ptr->in_slist); - H5C__UPDATE_SLIST_FOR_SIZE_CHANGE(cache_ptr, entry_ptr->size, \ - new_len) - - /* if defined, update *entry_size_change_ptr for the - * change in entry size. - */ - if(entry_size_change_ptr != NULL) - *entry_size_change_ptr = (int64_t)new_len - (int64_t)(entry_ptr->size); - - /* finally, update the entry for its new size */ - entry_ptr->size = new_len; - } /* end if */ - - /* If required, udate the entry and the cache data structures - * for a move - */ - if(serialize_flags & H5C__SERIALIZE_MOVED_FLAG) { -#if H5C_DO_SANITY_CHECKS - int64_t saved_slist_len_increase; - int64_t saved_slist_size_increase; -#endif /* H5C_DO_SANITY_CHECKS */ - - H5C__UPDATE_STATS_FOR_MOVE(cache_ptr, entry_ptr) - - if(entry_ptr->addr == old_addr) { - /* we must update cache data structures for the - * change in address. - */ - - /* delete the entry from the hash table and the slist */ - H5C__DELETE_FROM_INDEX(cache_ptr, entry_ptr) - H5C__REMOVE_ENTRY_FROM_SLIST(cache_ptr, entry_ptr) - - /* update the entry for its new address */ - entry_ptr->addr = new_addr; - - /* and then reinsert in the index and slist */ - H5C__INSERT_IN_INDEX(cache_ptr, entry_ptr, FAIL) - -#if H5C_DO_SANITY_CHECKS - /* save cache_ptr->slist_len_increase and - * cache_ptr->slist_size_increase before the - * reinsertion into the slist, and restore - * them afterwards to avoid skewing our sanity - * checking. - */ - saved_slist_len_increase = cache_ptr->slist_len_increase; - saved_slist_size_increase = cache_ptr->slist_size_increase; -#endif /* H5C_DO_SANITY_CHECKS */ - - H5C__INSERT_ENTRY_IN_SLIST(cache_ptr, entry_ptr, FAIL) - -#if H5C_DO_SANITY_CHECKS - cache_ptr->slist_len_increase = saved_slist_len_increase; - cache_ptr->slist_size_increase = saved_slist_size_increase; -#endif /* H5C_DO_SANITY_CHECKS */ - } - else /* move is alread done for us -- just do sanity checks */ - HDassert(entry_ptr->addr == new_addr); - } /* end if */ - - if(serialize_flags & H5C__SERIALIZE_COMPRESSED_FLAG) { - /* just save the new compressed entry size in - * entry_ptr->compressed_size. We don't need to - * do more, as compressed size is only used for I/O. - */ - HDassert(entry_ptr->compressed); - entry_ptr->compressed_size = new_compressed_len; - } - } /* end if ( serialize_flags != H5C__SERIALIZE_NO_FLAGS_SET ) */ - - /* Serialize object into buffer */ - { - size_t image_len; - - if(entry_ptr->compressed) - image_len = entry_ptr->compressed_size; - else - image_len = entry_ptr->size; - - /* reset cache_ptr->slist_changed so we can detect slist - * modifications in the serialize call. - */ - cache_ptr->slist_changed = FALSE; - - - if(entry_ptr->type->serialize(f, entry_ptr->image_ptr, - image_len, (void *)entry_ptr) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to serialize entry") - - /* set cache_ptr->slist_change_in_serialize if the - * slist was modified. - */ - if(cache_ptr->slist_changed) - cache_ptr->slist_change_in_pre_serialize = TRUE; - -#if H5C_DO_MEMORY_SANITY_CHECKS - HDassert(0 == HDmemcmp(((uint8_t *)entry_ptr->image_ptr) + image_len, - H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE)); -#endif /* H5C_DO_MEMORY_SANITY_CHECKS */ - - entry_ptr->image_up_to_date = TRUE; - } + /* Generate the entry's image */ + if(H5C__generate_image(f, cache_ptr, entry_ptr, dxpl_id, entry_size_change_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "can't generate entry's image") } /* end if ( ! (entry_ptr->image_up_to_date) ) */ /* Finally, write the image to disk. @@ -7914,6 +7875,25 @@ H5C__flush_single_entry(const H5F_t *f, hid_t dxpl_id, H5C_cache_entry_t *entry_ else image_size = entry_ptr->size; +#ifdef H5_HAVE_PARALLEL + if(collective_write_list) { + H5C_collective_write_t *item; + + if(NULL == (item = (H5C_collective_write_t *)H5FL_MALLOC(H5C_collective_write_t))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "unable to allocate skip list item") + + item->length = image_size; + item->free_buf = FALSE; + item->buf = entry_ptr->image_ptr; + item->offset = entry_ptr->addr; + + if(H5SL_insert(collective_write_list, item, &item->offset) < 0) { + H5MM_free(item); + HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "unable to insert skip list item") + } /* end if */ + } /* end if */ + else +#endif /* H5_HAVE_PARALLEL */ if(H5F_block_write(f, entry_ptr->type->mem_type, entry_ptr->addr, image_size, dxpl_id, entry_ptr->image_ptr) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't write image to file.") @@ -8170,6 +8150,9 @@ done: static void * H5C_load_entry(H5F_t * f, hid_t dxpl_id, +#ifdef H5_HAVE_PARALLEL + hbool_t coll_access, +#endif /* H5_HAVE_PARALLEL */ const H5C_class_t * type, haddr_t addr, void * udata) @@ -8188,6 +8171,11 @@ H5C_load_entry(H5F_t * f, H5C_cache_entry_t * entry; /* Alias for thing loaded, as cache entry */ size_t len; /* Size of image in file */ unsigned u; /* Local index variable */ +#ifdef H5_HAVE_PARALLEL + int mpi_rank = 0; /* MPI process rank */ + MPI_Comm comm = MPI_COMM_NULL; /* File MPI Communicator */ + int mpi_code; /* MPI error code */ +#endif /* H5_HAVE_PARALLEL */ void * ret_value = NULL; /* Return value */ FUNC_ENTER_NOAPI_NOINIT @@ -8332,10 +8320,37 @@ H5C_load_entry(H5F_t * f, HDmemcpy(image + len, H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE); #endif /* H5C_DO_MEMORY_SANITY_CHECKS */ +#ifdef H5_HAVE_PARALLEL + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { + if((mpi_rank = H5F_mpi_get_rank(f)) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "Can't get MPI rank") + if((comm = H5F_mpi_get_comm(f)) == MPI_COMM_NULL) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "get_comm request failed") + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + /* Get the on-disk entry image */ - if(0 == (type->flags & H5C__CLASS_SKIP_READS)) - if(H5F_block_read(f, type->mem_type, addr, len, dxpl_id, image) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*") + if(0 == (type->flags & H5C__CLASS_SKIP_READS)) { +#ifdef H5_HAVE_PARALLEL + if(!coll_access || 0 == mpi_rank) { +#endif /* H5_HAVE_PARALLEL */ + if(H5F_block_read(f, type->mem_type, addr, len, dxpl_id, image) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*") + +#ifdef H5_HAVE_PARALLEL + } /* end if */ + /* if the collective metadata read optimization is turned on, + bcast the metadata read from process 0 to all ranks in the file + communicator */ + if(coll_access) { + int buf_size; + + H5_CHECKED_ASSIGN(buf_size, int, len, size_t); + if(MPI_SUCCESS != (mpi_code = MPI_Bcast(image, buf_size, MPI_BYTE, 0, comm))) + HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code) + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + } /* end if */ /* Deserialize the on-disk image into the native memory form */ if(NULL == (thing = type->deserialize(image, len, udata, &dirty))) @@ -8438,11 +8453,28 @@ H5C_load_entry(H5F_t * f, HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "free_icr callback failed") +#ifdef H5_HAVE_PARALLEL + if(!coll_access || 0 == mpi_rank) { +#endif /* H5_HAVE_PARALLEL */ /* Go get the on-disk image again */ if(H5F_block_read(f, type->mem_type, addr, new_len, dxpl_id, image) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "Can't read image") +#ifdef H5_HAVE_PARALLEL + } + /* if the collective metadata read optimization is turned on, + bcast the metadata read from process 0 to all ranks in the file + communicator */ + if(coll_access) { + int buf_size; + + H5_CHECKED_ASSIGN(buf_size, int, new_len, size_t); + if(MPI_SUCCESS != (mpi_code = MPI_Bcast(image, buf_size, MPI_BYTE, 0, comm))) + HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code) + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + /* Deserialize on-disk image into native memory form again */ if(NULL == (thing = type->deserialize(image, new_len, udata, &dirty))) @@ -8531,6 +8563,7 @@ H5C_load_entry(H5F_t * f, #ifdef H5_HAVE_PARALLEL entry->clear_on_unprotect = FALSE; entry->flush_immediately = FALSE; + entry->coll_access = coll_access; #endif /* H5_HAVE_PARALLEL */ entry->flush_in_progress = FALSE; entry->destroy_in_progress = FALSE; @@ -8551,6 +8584,11 @@ H5C_load_entry(H5F_t * f, entry->aux_next = NULL; entry->aux_prev = NULL; +#ifdef H5_HAVE_PARALLEL + entry->coll_next = NULL; + entry->coll_prev = NULL; +#endif /* H5_HAVE_PARALLEL */ + H5C__RESET_CACHE_ENTRY_STATS(entry); ret_value = thing; @@ -8743,7 +8781,14 @@ H5C_make_space_in_cache(H5F_t * f, cache_ptr->entries_removed_counter = 0; cache_ptr->last_entry_removed_ptr = NULL; - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__NO_FLAGS_SET, NULL) < 0) +#ifdef H5_HAVE_PARALLEL + if(TRUE == entry_ptr->coll_access) { + entry_ptr->coll_access = FALSE; + H5C__REMOVE_FROM_COLL_LIST(cache_ptr, entry_ptr, FAIL) + } /* end if */ +#endif + + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__NO_FLAGS_SET, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush entry") if ( ( cache_ptr->entries_removed_counter > 1 ) || @@ -8751,14 +8796,16 @@ H5C_make_space_in_cache(H5F_t * f, restart_scan = TRUE; - } else if ( (cache_ptr->index_size + space_needed) - > - cache_ptr->max_cache_size ) { + } else if ( (cache_ptr->index_size + space_needed) > cache_ptr->max_cache_size +#ifdef H5_HAVE_PARALLEL + && !(entry_ptr->coll_access) +#endif /* H5_HAVE_PARALLEL */ + ) { #if H5C_COLLECT_CACHE_STATS cache_ptr->entries_scanned_to_make_space++; #endif /* H5C_COLLECT_CACHE_STATS */ - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__FLUSH_INVALIDATE_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__FLUSH_INVALIDATE_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush entry") } else { /* We have enough space so don't flush clean entry. */ @@ -8894,8 +8941,14 @@ H5C_make_space_in_cache(H5F_t * f, prev_ptr = entry_ptr->aux_prev; - if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__FLUSH_INVALIDATE_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush entry") +#ifdef H5_HAVE_PARALLEL + if(!(entry_ptr->coll_access)) { +#endif /* H5_HAVE_PARALLEL */ + if(H5C__flush_single_entry(f, dxpl_id, entry_ptr, H5C__FLUSH_INVALIDATE_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush entry") +#ifdef H5_HAVE_PARALLEL + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ /* we are scanning the clean LRU, so the serialize function * will not be called on any entry -- thus there is no @@ -9934,3 +9987,245 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* H5C_get_entry_ring() */ + +/*------------------------------------------------------------------------- + * Function: H5C__generate_image + * + * Purpose: Serialize an entry and generate its image. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Mohamad Chaarawi + * 2/10/16 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5C__generate_image(const H5F_t *f, H5C_t *cache_ptr, H5C_cache_entry_t *entry_ptr, + hid_t dxpl_id, int64_t *entry_size_change_ptr) +{ + haddr_t new_addr = HADDR_UNDEF; + haddr_t old_addr = HADDR_UNDEF; + size_t new_len = 0; + size_t new_compressed_len = 0; + unsigned serialize_flags = H5C__SERIALIZE_NO_FLAGS_SET; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + + /* Sanity check */ + HDassert(!entry_ptr->image_up_to_date); + + /* reset cache_ptr->slist_changed so we can detect slist + * modifications in the pre_serialize call. + */ + cache_ptr->slist_changed = FALSE; + + /* make note of the entry's current address */ + old_addr = entry_ptr->addr; + + /* Call client's pre-serialize callback, if there's one */ + if(entry_ptr->type->pre_serialize && + (entry_ptr->type->pre_serialize)(f, dxpl_id, + (void *)entry_ptr, entry_ptr->addr, entry_ptr->size, + entry_ptr->compressed_size, &new_addr, &new_len, + &new_compressed_len, &serialize_flags) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to pre-serialize entry") + + /* set cache_ptr->slist_change_in_pre_serialize if the + * slist was modified. + */ + if(cache_ptr->slist_changed) + cache_ptr->slist_change_in_pre_serialize = TRUE; + + /* Check for any flags set in the pre-serialize callback */ + if(serialize_flags != H5C__SERIALIZE_NO_FLAGS_SET) { + /* Check for unexpected flags from serialize callback */ + if(serialize_flags & ~(H5C__SERIALIZE_RESIZED_FLAG | + H5C__SERIALIZE_MOVED_FLAG | + H5C__SERIALIZE_COMPRESSED_FLAG)) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unknown serialize flag(s)") + +#ifdef H5_HAVE_PARALLEL + /* In the parallel case, resizes and moves in + * the serialize operation can cause problems. + * If they occur, scream and die. + * + * At present, in the parallel case, the aux_ptr + * will only be set if there is more than one + * process. Thus we can use this to detect + * the parallel case. + * + * This works for now, but if we start using the + * aux_ptr for other purposes, we will have to + * change this test accordingly. + * + * NB: While this test detects entryies that attempt + * to resize or move themselves during a flush + * in the parallel case, it will not detect an + * entry that dirties, resizes, and/or moves + * other entries during its flush. + * + * From what Quincey tells me, this test is + * sufficient for now, as any flush routine that + * does the latter will also do the former. + * + * If that ceases to be the case, further + * tests will be necessary. + */ + if(cache_ptr->aux_ptr != NULL) + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "resize/move in serialize occured in parallel case.") +#endif + + /* Resize the buffer if required */ + if(((!entry_ptr->compressed) && (serialize_flags & H5C__SERIALIZE_RESIZED_FLAG)) || + ((entry_ptr->compressed) && (serialize_flags & H5C__SERIALIZE_COMPRESSED_FLAG))) { + size_t new_image_size; + + if(entry_ptr->compressed) + new_image_size = new_compressed_len; + else + new_image_size = new_len; + HDassert(new_image_size > 0); + + /* Release the current image */ + if(entry_ptr->image_ptr) + entry_ptr->image_ptr = H5MM_xfree(entry_ptr->image_ptr); + + /* Allocate a new image buffer */ + if(NULL == (entry_ptr->image_ptr = H5MM_malloc(new_image_size + H5C_IMAGE_EXTRA_SPACE))) + HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for on disk image buffer") + +#if H5C_DO_MEMORY_SANITY_CHECKS + HDmemcpy(((uint8_t *)entry_ptr->image_ptr) + new_image_size, H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE); +#endif /* H5C_DO_MEMORY_SANITY_CHECKS */ + } /* end if */ + + /* If required, update the entry and the cache data structures + * for a resize. + */ + if(serialize_flags & H5C__SERIALIZE_RESIZED_FLAG) { + H5C__UPDATE_STATS_FOR_ENTRY_SIZE_CHANGE(cache_ptr, entry_ptr, new_len); + + /* update the hash table for the size change*/ + H5C__UPDATE_INDEX_FOR_SIZE_CHANGE(cache_ptr, entry_ptr->size, \ + new_len, entry_ptr, !(entry_ptr->is_dirty)); + + /* The entry can't be protected since we are + * in the process of flushing it. Thus we must + * update the replacement policy data + * structures for the size change. The macro + * deals with the pinned case. + */ + H5C__UPDATE_RP_FOR_SIZE_CHANGE(cache_ptr, entry_ptr, new_len); + + /* as we haven't updated the cache data structures for + * for the flush or flush destroy yet, the entry should + * be in the slist. Thus update it for the size change. + */ + HDassert(entry_ptr->in_slist); + H5C__UPDATE_SLIST_FOR_SIZE_CHANGE(cache_ptr, entry_ptr->size, new_len); + + /* if defined, update *entry_size_change_ptr for the + * change in entry size. + */ + if(entry_size_change_ptr != NULL) + *entry_size_change_ptr = (int64_t)new_len - (int64_t)(entry_ptr->size); + + /* finally, update the entry for its new size */ + entry_ptr->size = new_len; + } /* end if */ + + /* If required, udate the entry and the cache data structures + * for a move + */ + if(serialize_flags & H5C__SERIALIZE_MOVED_FLAG) { +#if H5C_DO_SANITY_CHECKS + int64_t saved_slist_len_increase; + int64_t saved_slist_size_increase; +#endif /* H5C_DO_SANITY_CHECKS */ + + H5C__UPDATE_STATS_FOR_MOVE(cache_ptr, entry_ptr); + + if(entry_ptr->addr == old_addr) { + /* we must update cache data structures for the + * change in address. + */ + + /* delete the entry from the hash table and the slist */ + H5C__DELETE_FROM_INDEX(cache_ptr, entry_ptr); + H5C__REMOVE_ENTRY_FROM_SLIST(cache_ptr, entry_ptr); + + /* update the entry for its new address */ + entry_ptr->addr = new_addr; + + /* and then reinsert in the index and slist */ + H5C__INSERT_IN_INDEX(cache_ptr, entry_ptr, FAIL); + +#if H5C_DO_SANITY_CHECKS + /* save cache_ptr->slist_len_increase and + * cache_ptr->slist_size_increase before the + * reinsertion into the slist, and restore + * them afterwards to avoid skewing our sanity + * checking. + */ + saved_slist_len_increase = cache_ptr->slist_len_increase; + saved_slist_size_increase = cache_ptr->slist_size_increase; +#endif /* H5C_DO_SANITY_CHECKS */ + + H5C__INSERT_ENTRY_IN_SLIST(cache_ptr, entry_ptr, FAIL); + +#if H5C_DO_SANITY_CHECKS + cache_ptr->slist_len_increase = saved_slist_len_increase; + cache_ptr->slist_size_increase = saved_slist_size_increase; +#endif /* H5C_DO_SANITY_CHECKS */ + } /* end if */ + else /* move is already done for us -- just do sanity checks */ + HDassert(entry_ptr->addr == new_addr); + } /* end if */ + + if(serialize_flags & H5C__SERIALIZE_COMPRESSED_FLAG) { + /* just save the new compressed entry size in + * entry_ptr->compressed_size. We don't need to + * do more, as compressed size is only used for I/O. + */ + HDassert(entry_ptr->compressed); + entry_ptr->compressed_size = new_compressed_len; + } /* end if */ + } /* end if(serialize_flags != H5C__SERIALIZE_NO_FLAGS_SET) */ + + /* Serialize object into buffer */ + { + size_t image_len; + + if(entry_ptr->compressed) + image_len = entry_ptr->compressed_size; + else + image_len = entry_ptr->size; + + /* reset cache_ptr->slist_changed so we can detect slist + * modifications in the serialize call. + */ + cache_ptr->slist_changed = FALSE; + + if(entry_ptr->type->serialize(f, entry_ptr->image_ptr, image_len, (void *)entry_ptr) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to serialize entry") + + /* set cache_ptr->slist_change_in_serialize if the + * slist was modified. + */ + if(cache_ptr->slist_changed) + cache_ptr->slist_change_in_pre_serialize = TRUE; + +#if H5C_DO_MEMORY_SANITY_CHECKS + HDassert(0 == HDmemcmp(((uint8_t *)entry_ptr->image_ptr) + image_len, + H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE)); +#endif /* H5C_DO_MEMORY_SANITY_CHECKS */ + + entry_ptr->image_up_to_date = TRUE; + } /* end block */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5C__generate_image */ + diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c index 44733c0..4e88e44 100644 --- a/src/H5Cmpio.c +++ b/src/H5Cmpio.c @@ -39,14 +39,17 @@ #include "H5private.h" /* Generic Functions */ #include "H5ACprivate.h" /* Metadata cache */ #include "H5Cpkg.h" /* Cache */ +#include "H5Dprivate.h" /* Datasets */ #include "H5Eprivate.h" /* Error handling */ #include "H5Fpkg.h" /* Files */ +#include "H5FDprivate.h" /* File drivers */ +#include "H5FLprivate.h" /* Free Lists */ #include "H5Iprivate.h" /* IDs */ #include "H5MMprivate.h" /* Memory management */ +#include "H5Pprivate.h" /* Property lists */ #ifdef H5_HAVE_PARALLEL - /****************/ /* Local Macros */ /****************/ @@ -61,6 +64,9 @@ /********************/ /* Local Prototypes */ /********************/ +static herr_t H5C__collective_write(H5F_t *f, hid_t dxpl_id, + H5SL_t *collective_write_list); +static herr_t H5C__collective_write_free(void *_item, void *key, void *op_data); /*********************/ @@ -77,6 +83,9 @@ /* Local Variables */ /*******************/ +/* Declare a free list to manage the H5C_collective_write_t struct */ +H5FL_DEFINE(H5C_collective_write_t); + /*------------------------------------------------------------------------- @@ -221,6 +230,7 @@ H5C_apply_candidate_list(H5F_t * f, H5C_cache_entry_t * entry_ptr = NULL; H5C_cache_entry_t * flush_ptr = NULL; H5C_cache_entry_t * delayed_ptr = NULL; + H5SL_t * collective_write_list = NULL; #if H5C_DO_SANITY_CHECKS haddr_t last_addr; #endif /* H5C_DO_SANITY_CHECKS */ @@ -253,6 +263,12 @@ H5C_apply_candidate_list(H5F_t * f, HDfprintf(stdout, "%s", tbl_buf); #endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */ + if(f->coll_md_write) { + /* Create skip list of entries for collective write */ + if(NULL == (collective_write_list = H5SL_create(H5SL_TYPE_HADDR, NULL))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTCREATE, FAIL, "can't create skip list for entries") + } /* end if */ + n = num_candidates / mpi_size; m = num_candidates % mpi_size; HDassert(n >= 0); @@ -357,6 +373,16 @@ H5C_apply_candidate_list(H5F_t * f, entries_to_clear++; entry_ptr->clear_on_unprotect = TRUE; } /* end else */ + + /* Entries marked as collectively accessed and are in the + candidate list to clear from the cache have to be + removed from the coll list. This is OK since the + candidate list is collective and uniform across all + ranks. */ + if(TRUE == entry_ptr->coll_access) { + entry_ptr->coll_access = FALSE; + H5C__REMOVE_FROM_COLL_LIST(cache_ptr, entry_ptr, FAIL) + } /* end if */ } /* end else */ } /* end for */ @@ -433,7 +459,7 @@ H5C_apply_candidate_list(H5F_t * f, * will not call either the pre_serialize or serialize callbacks. */ - if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.") } /* end if */ @@ -478,7 +504,8 @@ H5C_apply_candidate_list(H5F_t * f, cache_ptr->entries_removed_counter = 0; cache_ptr->last_entry_removed_ptr = NULL; - if(H5C__flush_single_entry(f, dxpl_id, flush_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + /* Add this entry to the list of entries to collectively write */ + if(H5C__flush_single_entry(f, dxpl_id, flush_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, collective_write_list) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry.") if ( ( cache_ptr->entries_removed_counter > 1 ) || @@ -636,7 +663,7 @@ H5C_apply_candidate_list(H5F_t * f, (long long)clear_ptr->addr); #endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */ - if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.") } /* end else-if */ @@ -652,7 +679,8 @@ H5C_apply_candidate_list(H5F_t * f, (long long)flush_ptr->addr); #endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */ - if(H5C__flush_single_entry(f, dxpl_id, flush_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + /* Add this entry to the list of entries to collectively write */ + if(H5C__flush_single_entry(f, dxpl_id, flush_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, collective_write_list) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.") } /* end else-if */ } /* end if */ @@ -686,20 +714,33 @@ H5C_apply_candidate_list(H5F_t * f, if (delayed_ptr) { if (delayed_ptr->clear_on_unprotect) { + if(H5C__flush_single_entry(f, dxpl_id, delayed_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG, NULL, NULL) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry.") + entry_ptr->clear_on_unprotect = FALSE; entries_cleared++; } else if (delayed_ptr->flush_immediately) { + /* Add this entry to the list of entries to collectively write */ + if(H5C__flush_single_entry(f, dxpl_id, delayed_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, collective_write_list) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry collectively.") + entry_ptr->flush_immediately = FALSE; entries_flushed++; } /* end if */ - if(H5C__flush_single_entry(f, dxpl_id, delayed_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry collectively.") - entries_flushed_collectively++; entries_flushed_or_cleared_last++; } /* end if */ + /* If we've deferred writing to do it collectively, take care of that now */ + if(f->coll_md_write) { + HDassert(collective_write_list); + + /* Write collective list */ + if(H5C__collective_write(f, dxpl_id, collective_write_list) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_WRITEERROR, FAIL, "Can't write metadata collectively") + } /* end if */ + /* ====================================================================== * * Finished flushing everything. * * ====================================================================== */ @@ -719,6 +760,10 @@ done: if(candidate_assignment_table != NULL) candidate_assignment_table = (int *)H5MM_xfree((void *)candidate_assignment_table); + if(collective_write_list) + if(H5SL_destroy(collective_write_list, H5C__collective_write_free, NULL) < 0) + HDONE_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "failed to destroy skip list") + FUNC_LEAVE_NOAPI(ret_value) } /* H5C_apply_candidate_list() */ @@ -1082,6 +1127,14 @@ H5C_mark_entries_as_clean(H5F_t * f, * scan the LRU list shortly, and clear all those entries * not currently protected. */ + + /* Make sure first that we clear the collective flag from + it so it can be cleared */ + if(TRUE == entry_ptr->coll_access) { + entry_ptr->coll_access = FALSE; + H5C__REMOVE_FROM_COLL_LIST(cache_ptr, entry_ptr, FAIL) + } /* end if */ + entry_ptr->clear_on_unprotect = TRUE; #if H5C_DO_SANITY_CHECKS if ( entry_ptr->is_protected ) { @@ -1142,7 +1195,7 @@ H5C_mark_entries_as_clean(H5F_t * f, entry_ptr = entry_ptr->prev; entries_cleared++; - if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.") } else { @@ -1170,7 +1223,7 @@ H5C_mark_entries_as_clean(H5F_t * f, entry_ptr = entry_ptr->next; entries_cleared++; - if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0 ) + if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0 ) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.") } else { @@ -1215,5 +1268,237 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* H5C_mark_entries_as_clean() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5C_clear_coll_entries + * + * Purpose: Clear half or the entire list of collective entries and + * mark them as independent. + * + * Return: FAIL if error is detected, SUCCEED otherwise. + * + * Programmer: Mohamad Chaarawi + * April, 2015 + * + *------------------------------------------------------------------------- + */ +herr_t +H5C_clear_coll_entries(H5C_t *cache_ptr, hbool_t partial) +{ + int32_t clear_cnt; + H5C_cache_entry_t * entry_ptr = NULL; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT + + entry_ptr = cache_ptr->coll_tail_ptr; + clear_cnt = (partial ? cache_ptr->coll_list_len / 2 : cache_ptr->coll_list_len); + while(entry_ptr && clear_cnt > 0) { + H5C_cache_entry_t *prev_ptr = entry_ptr->coll_prev; + + /* Sanity check */ + HDassert(entry_ptr->coll_access); + + /* Mark entry as independent */ + entry_ptr->coll_access = FALSE; + H5C__REMOVE_FROM_COLL_LIST(cache_ptr, entry_ptr, FAIL) + + /* Decrement entry count */ + clear_cnt--; + + /* Advance to next entry */ + entry_ptr = prev_ptr; + } /* end while */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5C_clear_coll_entries */ + + +/*------------------------------------------------------------------------- + * + * Function: H5C__collective_write + * + * Purpose: Perform a collective write of a list of metadata entries. + * + * Return: FAIL if error is detected, SUCCEED otherwise. + * + * Programmer: Mohamad Chaarawi + * February, 2016 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5C__collective_write(H5F_t *f, hid_t dxpl_id, H5SL_t *collective_write_list) +{ + H5P_genplist_t *plist = NULL; + H5FD_mpio_xfer_t orig_xfer_mode = H5FD_MPIO_COLLECTIVE; + int count; + int *length_array = NULL; + MPI_Aint *buf_array = NULL; + MPI_Aint *offset_array = NULL; + MPI_Datatype btype; + hbool_t btype_created = FALSE; + MPI_Datatype ftype; + hbool_t ftype_created = FALSE; + int mpi_code; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + + /* Get original transfer mode */ + if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list") + if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &orig_xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property") + + /* Get number of entries in collective write list */ + count = (int)H5SL_count(collective_write_list); + + if(count > 0) { + H5FD_mpio_xfer_t xfer_mode = H5FD_MPIO_COLLECTIVE; + H5SL_node_t *node; + H5C_collective_write_t *item; + void *base_buf; + int i; + + if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property") + + /* Allocate arrays */ + if(NULL == (length_array = (int *)H5MM_malloc((size_t)count * sizeof(int)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory allocation failed for collective write table length array") + if(NULL == (buf_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory allocation failed for collective buf table length array") + if(NULL == (offset_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory allocation failed for collective offset table length array") + + /* Fill arrays */ + node = H5SL_first(collective_write_list); + HDassert(node); + if(NULL == (item = (H5C_collective_write_t *)H5SL_item(node))) + HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item") + + /* Set up initial array position & buffer base address */ + length_array[0] = (int)item->length; + base_buf = item->buf; + buf_array[0] = (MPI_Aint)0; + offset_array[0] = (MPI_Aint)item->offset; + + node = H5SL_next(node); + i = 1; + while(node) { + if(NULL == (item = (H5C_collective_write_t *)H5SL_item(node))) + HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item") + + /* Set up array position */ + length_array[i] = (int)item->length; + buf_array[i] = (MPI_Aint)item->buf - (MPI_Aint)base_buf; + offset_array[i] = (MPI_Aint)item->offset; + + /* Advance to next node & array location */ + node = H5SL_next(node); + i++; + } /* end while */ + + /* Create memory MPI type */ + if(MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed(count, length_array, buf_array, MPI_BYTE, &btype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code) + btype_created = TRUE; + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&btype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) + + /* Create file MPI type */ + if(MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed(count, length_array, offset_array, MPI_BYTE, &ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code) + ftype_created = TRUE; + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) + + /* Pass buf type, file type to the file driver */ + if(H5FD_mpi_setup_collective(dxpl_id, &btype, &ftype) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties") + + /* Write data */ + if(H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0, (size_t)1, dxpl_id, base_buf) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to write entries collectively") + } /* end if */ + else { + MPI_Status mpi_stat; + MPI_File mpi_fh_p; + MPI_File mpi_fh; + + if(H5F_get_mpi_handle(f, (MPI_File **)&mpi_fh_p) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get mpi file handle") + mpi_fh = *(MPI_File*)mpi_fh_p; + + /* just to match up with the 1st MPI_File_set_view from H5FD_mpio_write() */ + if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + + /* just to match up with MPI_File_write_at_all from H5FD_mpio_write() */ + HDmemset(&mpi_stat, 0, sizeof(MPI_Status)); + if(MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(mpi_fh, (MPI_Offset)0, NULL, 0, MPI_BYTE, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code) + + /* just to match up with the 2nd MPI_File_set_view (reset) in H5FD_mpio_write() */ + if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + } /* end else */ + +done: + /* Free arrays */ + length_array = (int *)H5MM_xfree(length_array); + buf_array = (MPI_Aint *)H5MM_xfree(buf_array); + offset_array = (MPI_Aint *)H5MM_xfree(offset_array); + + /* Free MPI Types */ + if(btype_created && MPI_SUCCESS != (mpi_code = MPI_Type_free(&btype))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + if(ftype_created && MPI_SUCCESS != (mpi_code = MPI_Type_free(&ftype))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + + /* Reset dxpl */ + if(orig_xfer_mode != H5FD_MPIO_COLLECTIVE) { + HDassert(plist); + if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &orig_xfer_mode) < 0) + HDONE_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property") + } /* end if */ + + FUNC_LEAVE_NOAPI(ret_value); +} /* end H5C__collective_write() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5C__collective_write_free + * + * Purpose: Release node on collective write skiplist + * + * Return: FAIL if error is detected, SUCCEED otherwise. + * + * Programmer: Mohamad Chaarawi + * February, 2016 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5C__collective_write_free(void *_item, void H5_ATTR_UNUSED *key, void H5_ATTR_UNUSED *op_data) +{ + H5C_collective_write_t *item = (H5C_collective_write_t *)_item; + + FUNC_ENTER_STATIC_NOERR + + /* Sanity check */ + HDassert(item); + + if(item->free_buf) + item->buf = H5MM_xfree(item->buf); + H5FL_FREE(H5C_collective_write_t, item); + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* end H5C__collective_write_free() */ #endif /* H5_HAVE_PARALLEL */ diff --git a/src/H5Cpkg.h b/src/H5Cpkg.h index 9508b98..08be52b 100644 --- a/src/H5Cpkg.h +++ b/src/H5Cpkg.h @@ -2542,7 +2542,16 @@ if ( ( (cache_ptr)->index_size != \ HDassert( ((entry_ptr)->ro_ref_count) == 0 ); \ HDassert( (entry_ptr)->size > 0 ); \ HDassert( new_size > 0 ); \ - \ + \ + if ( (entry_ptr)->coll_access ) { \ + \ + H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->coll_list_len, \ + (cache_ptr)->coll_list_size, \ + (entry_ptr)->size, \ + (new_size)); \ + \ + } \ + \ if ( (entry_ptr)->is_pinned ) { \ \ H5C__DLL_UPDATE_FOR_SIZE_CHANGE((cache_ptr)->pel_len, \ @@ -2892,6 +2901,247 @@ if ( ( (cache_ptr)->index_size != \ #endif /* H5C_MAINTAIN_CLEAN_AND_DIRTY_LRU_LISTS */ +#ifdef H5_HAVE_PARALLEL + +#if H5C_DO_SANITY_CHECKS + +#define H5C__COLL_DLL_PRE_REMOVE_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \ +if ( ( (hd_ptr) == NULL ) || \ + ( (tail_ptr) == NULL ) || \ + ( (entry_ptr) == NULL ) || \ + ( (len) <= 0 ) || \ + ( (Size) < (entry_ptr)->size ) || \ + ( ( (Size) == (entry_ptr)->size ) && ( ! ( (len) == 1 ) ) ) || \ + ( ( (entry_ptr)->coll_prev == NULL ) && ( (hd_ptr) != (entry_ptr) ) ) || \ + ( ( (entry_ptr)->coll_next == NULL ) && ( (tail_ptr) != (entry_ptr) ) ) || \ + ( ( (len) == 1 ) && \ + ( ! ( ( (hd_ptr) == (entry_ptr) ) && ( (tail_ptr) == (entry_ptr) ) && \ + ( (entry_ptr)->coll_next == NULL ) && \ + ( (entry_ptr)->coll_prev == NULL ) && \ + ( (Size) == (entry_ptr)->size ) \ + ) \ + ) \ + ) \ + ) { \ + HDassert(0 && "coll DLL pre remove SC failed"); \ + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "coll DLL pre remove SC failed") \ +} + +#define H5C__COLL_DLL_SC(head_ptr, tail_ptr, len, Size, fv) \ +if ( ( ( ( (head_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ + ( (head_ptr) != (tail_ptr) ) \ + ) || \ + ( (len) < 0 ) || \ + ( (Size) < 0 ) || \ + ( ( (len) == 1 ) && \ + ( ( (head_ptr) != (tail_ptr) ) || ( (Size) <= 0 ) || \ + ( (head_ptr) == NULL ) || ( (head_ptr)->size != (Size) ) \ + ) \ + ) || \ + ( ( (len) >= 1 ) && \ + ( ( (head_ptr) == NULL ) || ( (head_ptr)->coll_prev != NULL ) || \ + ( (tail_ptr) == NULL ) || ( (tail_ptr)->coll_next != NULL ) \ + ) \ + ) \ + ) { \ + HDassert(0 && "COLL DLL sanity check failed"); \ + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "COLL DLL sanity check failed") \ +} + +#define H5C__COLL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) \ +if ( ( (entry_ptr) == NULL ) || \ + ( (entry_ptr)->coll_next != NULL ) || \ + ( (entry_ptr)->coll_prev != NULL ) || \ + ( ( ( (hd_ptr) == NULL ) || ( (tail_ptr) == NULL ) ) && \ + ( (hd_ptr) != (tail_ptr) ) \ + ) || \ + ( (len) < 0 ) || \ + ( ( (len) == 1 ) && \ + ( ( (hd_ptr) != (tail_ptr) ) || ( (Size) <= 0 ) || \ + ( (hd_ptr) == NULL ) || ( (hd_ptr)->size != (Size) ) \ + ) \ + ) || \ + ( ( (len) >= 1 ) && \ + ( ( (hd_ptr) == NULL ) || ( (hd_ptr)->coll_prev != NULL ) || \ + ( (tail_ptr) == NULL ) || ( (tail_ptr)->coll_next != NULL ) \ + ) \ + ) \ + ) { \ + HDassert(0 && "COLL DLL pre insert SC failed"); \ + HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, (fv), "COLL DLL pre insert SC failed") \ +} + +#else /* H5C_DO_SANITY_CHECKS */ + +#define H5C__COLL_DLL_PRE_REMOVE_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) +#define H5C__COLL_DLL_SC(head_ptr, tail_ptr, len, Size, fv) +#define H5C__COLL_DLL_PRE_INSERT_SC(entry_ptr, hd_ptr, tail_ptr, len, Size, fv) + +#endif /* H5C_DO_SANITY_CHECKS */ + + +#define H5C__COLL_DLL_APPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fail_val) \ +{ \ + H5C__COLL_DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, \ + fail_val) \ + if ( (head_ptr) == NULL ) \ + { \ + (head_ptr) = (entry_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + else \ + { \ + (tail_ptr)->coll_next = (entry_ptr); \ + (entry_ptr)->coll_prev = (tail_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + (len)++; \ + (Size) += entry_ptr->size; \ +} /* H5C__COLL_DLL_APPEND() */ + +#define H5C__COLL_DLL_PREPEND(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ +{ \ + H5C__COLL_DLL_PRE_INSERT_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv)\ + if ( (head_ptr) == NULL ) \ + { \ + (head_ptr) = (entry_ptr); \ + (tail_ptr) = (entry_ptr); \ + } \ + else \ + { \ + (head_ptr)->coll_prev = (entry_ptr); \ + (entry_ptr)->coll_next = (head_ptr); \ + (head_ptr) = (entry_ptr); \ + } \ + (len)++; \ + (Size) += entry_ptr->size; \ +} /* H5C__COLL_DLL_PREPEND() */ + +#define H5C__COLL_DLL_REMOVE(entry_ptr, head_ptr, tail_ptr, len, Size, fv) \ +{ \ + H5C__COLL_DLL_PRE_REMOVE_SC(entry_ptr, head_ptr, tail_ptr, len, Size, fv)\ + { \ + if ( (head_ptr) == (entry_ptr) ) \ + { \ + (head_ptr) = (entry_ptr)->coll_next; \ + if ( (head_ptr) != NULL ) \ + (head_ptr)->coll_prev = NULL; \ + } \ + else \ + { \ + (entry_ptr)->coll_prev->coll_next = (entry_ptr)->coll_next; \ + } \ + if ( (tail_ptr) == (entry_ptr) ) \ + { \ + (tail_ptr) = (entry_ptr)->coll_prev; \ + if ( (tail_ptr) != NULL ) \ + (tail_ptr)->coll_next = NULL; \ + } \ + else \ + (entry_ptr)->coll_next->coll_prev = (entry_ptr)->coll_prev; \ + entry_ptr->coll_next = NULL; \ + entry_ptr->coll_prev = NULL; \ + (len)--; \ + (Size) -= entry_ptr->size; \ + } \ +} /* H5C__COLL_DLL_REMOVE() */ + + +/*------------------------------------------------------------------------- + * + * Macro: H5C__INSERT_IN_COLL_LIST + * + * Purpose: Insert entry into collective entries list + * + * Return: N/A + * + * Programmer: Mohamad Chaarawi + * + *------------------------------------------------------------------------- + */ + +#define H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, fail_val) \ +{ \ + HDassert( (cache_ptr) ); \ + HDassert( (cache_ptr)->magic == H5C__H5C_T_MAGIC ); \ + HDassert( (entry_ptr) ); \ + \ + /* insert the entry at the head of the list. */ \ + \ + H5C__COLL_DLL_PREPEND((entry_ptr), (cache_ptr)->coll_head_ptr, \ + (cache_ptr)->coll_tail_ptr, \ + (cache_ptr)->coll_list_len, \ + (cache_ptr)->coll_list_size, \ + (fail_val)) \ + \ +} /* H5C__INSERT_IN_COLL_LIST */ + + +/*------------------------------------------------------------------------- + * + * Macro: H5C__REMOVE_FROM_COLL_LIST + * + * Purpose: Remove entry from collective entries list + * + * Return: N/A + * + * Programmer: Mohamad Chaarawi + * + *------------------------------------------------------------------------- + */ + +#define H5C__REMOVE_FROM_COLL_LIST(cache_ptr, entry_ptr, fail_val) \ +{ \ + HDassert( (cache_ptr) ); \ + HDassert( (cache_ptr)->magic == H5C__H5C_T_MAGIC ); \ + HDassert( (entry_ptr) ); \ + \ + /* remove the entry from the list. */ \ + \ + H5C__COLL_DLL_REMOVE((entry_ptr), (cache_ptr)->coll_head_ptr, \ + (cache_ptr)->coll_tail_ptr, \ + (cache_ptr)->coll_list_len, \ + (cache_ptr)->coll_list_size, \ + (fail_val)) \ + \ +} /* H5C__REMOVE_FROM_COLL_LIST */ + + +/*------------------------------------------------------------------------- + * + * Macro: H5C__MOVE_TO_TOP_IN_COLL_LIST + * + * Purpose: Update entry position in collective entries list + * + * Return: N/A + * + * Programmer: Mohamad Chaarawi + * + *------------------------------------------------------------------------- + */ + +#define H5C__MOVE_TO_TOP_IN_COLL_LIST(cache_ptr, entry_ptr, fail_val) \ +{ \ + HDassert( (cache_ptr) ); \ + HDassert( (cache_ptr)->magic == H5C__H5C_T_MAGIC ); \ + HDassert( (entry_ptr) ); \ + \ + /* Remove entry and insert at the head of the list. */ \ + H5C__COLL_DLL_REMOVE((entry_ptr), (cache_ptr)->coll_head_ptr, \ + (cache_ptr)->coll_tail_ptr, \ + (cache_ptr)->coll_list_len, \ + (cache_ptr)->coll_list_size, \ + (fail_val)) \ + \ + H5C__COLL_DLL_PREPEND((entry_ptr), (cache_ptr)->coll_head_ptr, \ + (cache_ptr)->coll_tail_ptr, \ + (cache_ptr)->coll_list_len, \ + (cache_ptr)->coll_list_size, \ + (fail_val)) \ + \ +} /* H5C__MOVE_TO_TOP_IN_COLL_LIST */ +#endif /* H5_HAVE_PARALLEL */ + /****************************/ /* Package Private Typedefs */ @@ -3885,6 +4135,13 @@ struct H5C_t { H5C_cache_entry_t * dLRU_head_ptr; H5C_cache_entry_t * dLRU_tail_ptr; +#ifdef H5_HAVE_PARALLEL + int32_t coll_list_len; + size_t coll_list_size; + H5C_cache_entry_t * coll_head_ptr; + H5C_cache_entry_t * coll_tail_ptr; +#endif /* H5_HAVE_PARALLEL */ + /* Fields for automatic cache size adjustment */ hbool_t size_increase_possible; hbool_t flash_size_increase_possible; @@ -3988,6 +4245,16 @@ struct H5C_t { #endif /* NDEBUG */ }; +#ifdef H5_HAVE_PARALLEL +typedef struct H5C_collective_write_t { + size_t length; + hbool_t free_buf; + void *buf; + haddr_t offset; +} H5C_collective_write_t; +#endif /* H5_HAVE_PARALLEL */ + + /*****************************/ /* Package Private Variables */ /*****************************/ @@ -3997,7 +4264,7 @@ struct H5C_t { /* Package Private Prototypes */ /******************************/ H5_DLL herr_t H5C__flush_single_entry(const H5F_t *f, hid_t dxpl_id, - H5C_cache_entry_t *entry_ptr, unsigned flags, int64_t *entry_size_change_ptr); + H5C_cache_entry_t *entry_ptr, unsigned flags, int64_t *entry_size_change_ptr, H5SL_t *collective_write_list); #endif /* _H5Cpkg_H */ diff --git a/src/H5Cprivate.h b/src/H5Cprivate.h index da9e6f5..5502e9f 100644 --- a/src/H5Cprivate.h +++ b/src/H5Cprivate.h @@ -1608,6 +1608,7 @@ typedef struct H5C_cache_entry_t { #ifdef H5_HAVE_PARALLEL hbool_t clear_on_unprotect; hbool_t flush_immediately; + hbool_t coll_access; #endif /* H5_HAVE_PARALLEL */ hbool_t flush_in_progress; hbool_t destroy_in_progress; @@ -1631,6 +1632,10 @@ typedef struct H5C_cache_entry_t { struct H5C_cache_entry_t * prev; struct H5C_cache_entry_t * aux_next; struct H5C_cache_entry_t * aux_prev; +#ifdef H5_HAVE_PARALLEL + struct H5C_cache_entry_t * coll_next; + struct H5C_cache_entry_t * coll_prev; +#endif /* H5_HAVE_PARALLEL */ #if H5C_COLLECT_CACHE_ENTRY_STATS /* cache entry stats fields */ @@ -1996,6 +2001,7 @@ H5_DLL herr_t H5C_apply_candidate_list(H5F_t *f, hid_t dxpl_id, int mpi_rank, int mpi_size); H5_DLL herr_t H5C_construct_candidate_list__clean_cache(H5C_t *cache_ptr); H5_DLL herr_t H5C_construct_candidate_list__min_clean(H5C_t *cache_ptr); +H5_DLL herr_t H5C_clear_coll_entries(H5C_t * cache_ptr, hbool_t partial); H5_DLL herr_t H5C_mark_entries_as_clean(H5F_t *f, hid_t dxpl_id, int32_t ce_array_len, haddr_t *ce_array_ptr); #endif /* H5_HAVE_PARALLEL */ diff --git a/src/H5Dchunk.c b/src/H5Dchunk.c index 5657e6e..a17e035 100644 --- a/src/H5Dchunk.c +++ b/src/H5Dchunk.c @@ -56,6 +56,7 @@ #endif /* H5_HAVE_PARALLEL */ #include "H5Dpkg.h" /* Dataset functions */ #include "H5Eprivate.h" /* Error handling */ +#include "H5Fprivate.h" /* File functions */ #include "H5FLprivate.h" /* Free Lists */ #include "H5Iprivate.h" /* IDs */ #include "H5MMprivate.h" /* Memory management */ @@ -2650,6 +2651,9 @@ H5D__chunk_lookup(const H5D_t *dset, hid_t dxpl_id, const hsize_t *scaled, /* Check for cached information */ if(!H5D__chunk_cinfo_cache_found(&dset->shared->cache.chunk.last, udata)) { H5D_chk_idx_info_t idx_info; /* Chunked index info */ +#ifdef H5_HAVE_PARALLEL + H5P_coll_md_read_flag_t temp_cmr; /* Temp value to hold the coll metadata read setting */ +#endif /* H5_HAVE_PARALLEL */ /* Compose chunked index info struct */ idx_info.f = dset->oloc.file; @@ -2658,10 +2662,26 @@ H5D__chunk_lookup(const H5D_t *dset, hid_t dxpl_id, const hsize_t *scaled, idx_info.layout = &dset->shared->layout.u.chunk; idx_info.storage = &dset->shared->layout.storage.u.chunk; +#ifdef H5_HAVE_PARALLEL + if(H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI)) { + /* disable collective metadata read for chunk indexes + as it is highly unlikely that users would read the + same chunks from all processes. MSC - might turn on + for root node? */ + temp_cmr = H5F_COLL_MD_READ(idx_info.f); + H5F_set_coll_md_read(idx_info.f, H5P_FORCE_FALSE); + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ + /* Go get the chunk information */ if((dset->shared->layout.storage.u.chunk.ops->get_addr)(&idx_info, udata) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't query chunk address") +#ifdef H5_HAVE_PARALLEL + if(H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI)) + H5F_set_coll_md_read(idx_info.f, temp_cmr); +#endif /* H5_HAVE_PARALLEL */ + /* Cache the information retrieved */ H5D__chunk_cinfo_cache_update(&dset->shared->cache.chunk.last, udata); } /* end if */ diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index bbd3eca..ed70e20 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -1696,6 +1696,7 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, int size_i; hbool_t use_view_this_time = FALSE; H5P_genplist_t *plist = NULL; /* Property list pointer */ + H5FD_mpio_xfer_t xfer_mode; /* I/O tranfer mode */ herr_t ret_value = SUCCEED; FUNC_ENTER_NOAPI_NOINIT @@ -1726,57 +1727,42 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, fprintf(stdout, "in H5FD_mpio_write mpi_off=%ld size_i=%d\n", (long)mpi_off, size_i); #endif - if(type == H5FD_MEM_DRAW) { - H5FD_mpio_xfer_t xfer_mode; /* I/O tranfer mode */ + /* Obtain the data transfer properties */ + if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list") - /* Obtain the data transfer properties */ - if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list") + /* get the transfer mode from the dxpl */ + if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode)<0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode") - /* get the transfer mode from the dxpl */ - if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode)<0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode") + /* + * Set up for a fancy xfer using complex types, or single byte block. We + * wouldn't need to rely on the use_view field if MPI semantics allowed + * us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which + * could mean "use MPI_BYTE" by convention). + */ + if(xfer_mode == H5FD_MPIO_COLLECTIVE) { + MPI_Datatype file_type; - /* - * Set up for a fancy xfer using complex types, or single byte block. We - * wouldn't need to rely on the use_view field if MPI semantics allowed - * us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which - * could mean "use MPI_BYTE" by convention). - */ - if(xfer_mode == H5FD_MPIO_COLLECTIVE) { - MPI_Datatype file_type; + /* Remember that views are used */ + use_view_this_time = TRUE; - /* Remember that views are used */ - use_view_this_time = TRUE; + /* prepare for a full-blown xfer using btype, ftype, and disp */ + if(H5P_get(plist, H5FD_MPI_XFER_MEM_MPI_TYPE_NAME, &buf_type) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property") + if(H5P_get(plist, H5FD_MPI_XFER_FILE_MPI_TYPE_NAME, &file_type) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property") - /* prepare for a full-blown xfer using btype, ftype, and disp */ - if(H5P_get(plist, H5FD_MPI_XFER_MEM_MPI_TYPE_NAME, &buf_type) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property") - if(H5P_get(plist, H5FD_MPI_XFER_FILE_MPI_TYPE_NAME, &file_type) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property") - - /* - * Set the file view when we are using MPI derived types - */ - if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, H5FD_mpi_native_g, file->info))) - HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + /* + * Set the file view when we are using MPI derived types + */ + if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, H5FD_mpi_native_g, file->info))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) - /* When using types, use the address as the displacement for - * MPI_File_set_view and reset the address for the read to zero - */ - mpi_off = 0; - } /* end if */ - } /* end if */ - else { -#if 0 /* JRM -- 3/23/10 */ /* this is no longer always the case */ - /* Only one process can do the actual metadata write */ - if(file->mpi_rank != H5_PAR_META_WRITE) -#ifdef LATER - HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "can't write metadata from non-zero rank") -#else /* LATER */ - HGOTO_DONE(SUCCEED) /* skip the actual write */ -#endif /* LATER */ -#endif /* JRM */ + /* When using types, use the address as the displacement for + * MPI_File_set_view and reset the address for the read to zero + */ + mpi_off = 0; } /* end if */ /* Write the data. */ @@ -1803,6 +1789,8 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code) } /* end if */ else { + if(type != H5FD_MEM_DRAW) + HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "Metadata Coll opt property should be collective at this point") #ifdef H5FDmpio_DEBUG if(H5FD_mpio_Debug[(int)'t']) fprintf(stdout, "H5FD_mpio_write: doing MPI independent IO\n"); diff --git a/src/H5Fint.c b/src/H5Fint.c index 14f5456..063b65f 100644 --- a/src/H5Fint.c +++ b/src/H5Fint.c @@ -172,6 +172,10 @@ H5F_get_access_plist(H5F_t *f, hbool_t app_ref) efc_size = H5F_efc_max_nfiles(f->shared->efc); if(H5P_set(new_plist, H5F_ACS_EFC_SIZE_NAME, &efc_size) < 0) HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't set elink file cache size") +#ifdef H5_HAVE_PARALLEL + if(H5P_set(new_plist, H5_COLL_MD_READ_FLAG_NAME, &(f->coll_md_read)) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't set collective metadata read flag") +#endif /* H5_HAVE_PARALLEL */ /* Prepare the driver property */ driver_prop.driver_id = f->shared->lf->driver_id; @@ -637,6 +641,12 @@ H5F_new(H5F_file_t *shared, unsigned flags, hid_t fcpl_id, hid_t fapl_id, H5FD_t if(efc_size > 0) if(NULL == (f->shared->efc = H5F_efc_create(efc_size))) HGOTO_ERROR(H5E_FILE, H5E_CANTINIT, NULL, "can't create external file cache") +#ifdef H5_HAVE_PARALLEL + if(H5P_get(plist, H5_COLL_MD_READ_FLAG_NAME, &(f->coll_md_read)) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get collective metadata read flag") + if(H5P_get(plist, H5F_ACS_COLL_MD_WRITE_FLAG_NAME, &(f->coll_md_write)) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get collective metadata write flag") +#endif /* H5_HAVE_PARALLEL */ /* Get the VFD values to cache */ f->shared->maxaddr = H5FD_get_maxaddr(lf); @@ -2100,3 +2110,33 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5F__set_eoa() */ +#ifdef H5_HAVE_PARALLEL + +/*------------------------------------------------------------------------- + * Function: H5F_set_coll_md_read + * + * Purpose: Set the coll_md_read field with a new value. + * + * Return: Success: SUCCEED + * Failure: FAIL + * + * Programmer: Quincey Koziol + * 2/10/16 + * + *------------------------------------------------------------------------- + */ +void +H5F_set_coll_md_read(H5F_t *f, H5P_coll_md_read_flag_t cmr) +{ + /* Use FUNC_ENTER_NOAPI_NOINIT_NOERR here to avoid performance issues */ + FUNC_ENTER_NOAPI_NOINIT_NOERR + + /* Sanity check */ + HDassert(f); + + f->coll_md_read = cmr; + + FUNC_LEAVE_NOAPI_VOID +} /* H5F_set_coll_md_read() */ +#endif /* H5_HAVE_PARALLEL */ + diff --git a/src/H5Fmpi.c b/src/H5Fmpi.c index 9783947..5434aa5 100644 --- a/src/H5Fmpi.c +++ b/src/H5Fmpi.c @@ -77,6 +77,35 @@ #ifdef H5_HAVE_PARALLEL + +/*------------------------------------------------------------------------- + * Function: H5F_get_mpi_handle + * + * Purpose: Retrieves MPI File handle. + * + * Return: Success: The size (positive) + * Failure: Negative + * + *------------------------------------------------------------------------- + */ +herr_t +H5F_get_mpi_handle(const H5F_t *f, MPI_File **f_handle) +{ + herr_t ret_value = SUCCEED; + hid_t fapl = -1; + + FUNC_ENTER_NOAPI(FAIL) + + assert(f && f->shared); + + /* Dispatch to driver */ + if ((ret_value = H5FD_get_vfd_handle(f->shared->lf, fapl, (void **)f_handle)) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get mpi file handle") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5F_get_mpi_handle() */ + /*------------------------------------------------------------------------- * Function: H5F_mpi_get_rank diff --git a/src/H5Fpkg.h b/src/H5Fpkg.h index c62858c..be324d0 100644 --- a/src/H5Fpkg.h +++ b/src/H5Fpkg.h @@ -311,6 +311,10 @@ struct H5F_t { hbool_t closing; /* File is in the process of being closed */ struct H5F_t *parent; /* Parent file that this file is mounted to */ unsigned nmounts; /* Number of children mounted to this file */ +#ifdef H5_HAVE_PARALLEL + H5P_coll_md_read_flag_t coll_md_read; /* Do all metadata reads collectively */ + hbool_t coll_md_write; /* Do all metadata writes collectively */ +#endif /* H5_HAVE_PARALLEL */ }; diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h index dd2877b..52649e5 100644 --- a/src/H5Fprivate.h +++ b/src/H5Fprivate.h @@ -27,6 +27,9 @@ #include "H5FDpublic.h" /* File drivers */ /* Private headers needed by this file */ +#ifdef H5_HAVE_PARALLEL +#include "H5Pprivate.h" /* Property lists */ +#endif /* H5_HAVE_PARALLEL */ #include "H5VMprivate.h" /* Vectors and arrays */ @@ -312,6 +315,9 @@ #define H5F_SET_GRP_BTREE_SHARED(F, RC) (((F)->shared->grp_btree_shared = (RC)) ? SUCCEED : FAIL) #define H5F_USE_TMP_SPACE(F) ((F)->shared->use_tmp_space) #define H5F_IS_TMP_ADDR(F, ADDR) (H5F_addr_le((F)->shared->tmp_addr, (ADDR))) +#ifdef H5_HAVE_PARALLEL +#define H5F_COLL_MD_READ(F) ((F)->coll_md_read) +#endif /* H5_HAVE_PARALLEL */ #else /* H5F_MODULE */ #define H5F_INTENT(F) (H5F_get_intent(F)) #define H5F_OPEN_NAME(F) (H5F_get_open_name(F)) @@ -354,6 +360,9 @@ #define H5F_SET_GRP_BTREE_SHARED(F, RC) (H5F_set_grp_btree_shared((F), (RC))) #define H5F_USE_TMP_SPACE(F) (H5F_use_tmp_space(F)) #define H5F_IS_TMP_ADDR(F, ADDR) (H5F_is_tmp_addr((F), (ADDR))) +#ifdef H5_HAVE_PARALLEL +#define H5F_COLL_MD_READ(F) (H5F_coll_md_read(F)) +#endif /* H5_HAVE_PARALLEL */ #endif /* H5F_MODULE */ @@ -454,6 +463,7 @@ #define H5F_ACS_FILE_IMAGE_INFO_NAME "file_image_info" /* struct containing initial file image and callback info */ #define H5F_ACS_CORE_WRITE_TRACKING_FLAG_NAME "core_write_tracking_flag" /* Whether or not core VFD backing store write tracking is enabled */ #define H5F_ACS_CORE_WRITE_TRACKING_PAGE_SIZE_NAME "core_write_tracking_page_size" /* The page size in kiB when core VFD write tracking is enabled */ +#define H5F_ACS_COLL_MD_WRITE_FLAG_NAME "collective_metadata_write" /* property indicating whether metadata writes are done collectively or not */ /* ======================== File Mount properties ====================*/ #define H5F_MNT_SYM_LOCAL_NAME "local" /* Whether absolute symlinks local to file. */ @@ -638,6 +648,10 @@ H5_DLL struct H5UC_t *H5F_grp_btree_shared(const H5F_t *f); H5_DLL herr_t H5F_set_grp_btree_shared(H5F_t *f, struct H5UC_t *rc); H5_DLL hbool_t H5F_use_tmp_space(const H5F_t *f); H5_DLL hbool_t H5F_is_tmp_addr(const H5F_t *f, haddr_t addr); +#ifdef H5_HAVE_PARALLEL +H5_DLL H5P_coll_md_read_flag_t H5F_coll_md_read(const H5F_t *f); +H5_DLL void H5F_set_coll_md_read(H5F_t *f, H5P_coll_md_read_flag_t flag); +#endif /* H5_HAVE_PARALLEL */ /* Functions that retrieve values from VFD layer */ H5_DLL hid_t H5F_get_driver_id(const H5F_t *f); @@ -676,6 +690,7 @@ H5_DLL herr_t H5F_super_dirty(H5F_t *f); /* Parallel I/O (i.e. MPI) related routines */ #ifdef H5_HAVE_PARALLEL +H5_DLL herr_t H5F_get_mpi_handle(const H5F_t *f, MPI_File **f_handle); H5_DLL int H5F_mpi_get_rank(const H5F_t *f); H5_DLL MPI_Comm H5F_mpi_get_comm(const H5F_t *f); H5_DLL int H5F_mpi_get_size(const H5F_t *f); diff --git a/src/H5Fquery.c b/src/H5Fquery.c index 75fc216..e9af300 100644 --- a/src/H5Fquery.c +++ b/src/H5Fquery.c @@ -1073,3 +1073,31 @@ H5F_use_tmp_space(const H5F_t *f) FUNC_LEAVE_NOAPI(f->shared->use_tmp_space) } /* end H5F_use_tmp_space() */ +#ifdef H5_HAVE_PARALLEL + +/*------------------------------------------------------------------------- + * Function: H5F_coll_md_read + * + * Purpose: Retrieve the 'collective metadata reads' flag for the file. + * + * Return: Success: Non-negative, the 'collective metadata reads' flag + * Failure: (can't happen) + * + * Programmer: Quincey Koziol + * koziol@hdfgroup.org + * Feb 10 2016 + * + *------------------------------------------------------------------------- + */ +H5P_coll_md_read_flag_t +H5F_coll_md_read(const H5F_t *f) +{ + /* Use FUNC_ENTER_NOAPI_NOINIT_NOERR here to avoid performance issues */ + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(f); + + FUNC_LEAVE_NOAPI(f->coll_md_read) +} /* end H5F_coll_md_read() */ +#endif /* H5_HAVE_PARALLEL */ + diff --git a/src/H5Pdxpl.c b/src/H5Pdxpl.c index 3ff71dc..9353094 100644 --- a/src/H5Pdxpl.c +++ b/src/H5Pdxpl.c @@ -179,8 +179,15 @@ #ifdef H5_DEBUG_BUILD /* dxpl I/O type - private property */ #define H5FD_DXPL_TYPE_SIZE sizeof(H5FD_dxpl_type_t) -#define H5FD_DXPL_TYPE_DEF H5FD_NOIO_DXPL #endif /* H5_DEBUG_BUILD */ +#ifdef H5_HAVE_PARALLEL +/* Definition for reading metadata collectively */ +#define H5D_XFER_COLL_MD_READ_SIZE sizeof(H5P_coll_md_read_flag_t) +#define H5D_XFER_COLL_MD_READ_DEF H5P_USER_FALSE +#define H5D_XFER_COLL_MD_READ_ENC H5P__encode_coll_md_read_flag_t +#define H5D_XFER_COLL_MD_READ_DEC H5P__decode_coll_md_read_flag_t +#endif /* H5_HAVE_PARALLEL */ + /******************/ /* Local Typedefs */ @@ -278,6 +285,7 @@ static const H5D_mpio_no_collective_cause_t H5D_def_mpio_no_collective_cause_g = #ifdef H5_HAVE_PARALLEL static const MPI_Datatype H5D_def_btype_g = H5FD_MPI_XFER_MEM_MPI_TYPE_DEF; /* Default value for MPI buffer type */ static const MPI_Datatype H5D_def_ftype_g = H5FD_MPI_XFER_FILE_MPI_TYPE_DEF; /* Default value for MPI file type */ +static const H5P_coll_md_read_flag_t H5D_def_coll_md_read_g = H5D_XFER_COLL_MD_READ_DEF; /* Default setting for the collective metedata read flag */ #endif /* H5_HAVE_PARALLEL */ static const H5Z_EDC_t H5D_def_enable_edc_g = H5D_XFER_EDC_DEF; /* Default value for EDC property */ static const H5Z_cb_t H5D_def_filter_cb_g = H5D_XFER_FILTER_CB_DEF; /* Default value for filter callback */ @@ -435,6 +443,13 @@ H5P__dxfr_reg_prop(H5P_genclass_t *pclass) if(H5P_register_real(pclass, H5FD_MPI_XFER_FILE_MPI_TYPE_NAME, H5FD_MPI_XFER_FILE_MPI_TYPE_SIZE, &H5D_def_ftype_g, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) < 0) HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class") + + /* Register the metadata collective read flag */ + if(H5P_register_real(pclass, H5_COLL_MD_READ_FLAG_NAME, H5D_XFER_COLL_MD_READ_SIZE, + &H5D_def_coll_md_read_g, + NULL, NULL, NULL, H5D_XFER_COLL_MD_READ_ENC, H5D_XFER_COLL_MD_READ_DEC, + NULL, NULL, NULL, NULL) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class") #endif /* H5_HAVE_PARALLEL */ /* Register the EDC property */ diff --git a/src/H5Pfapl.c b/src/H5Pfapl.c index b92d3e9..d90dcd5 100644 --- a/src/H5Pfapl.c +++ b/src/H5Pfapl.c @@ -178,6 +178,17 @@ #define H5F_ACS_CORE_WRITE_TRACKING_PAGE_SIZE_DEF 524288 #define H5F_ACS_CORE_WRITE_TRACKING_PAGE_SIZE_ENC H5P__encode_size_t #define H5F_ACS_CORE_WRITE_TRACKING_PAGE_SIZE_DEC H5P__decode_size_t +/* Definition of collective metadata read mode flag */ +#define H5F_ACS_COLL_MD_READ_FLAG_SIZE sizeof(H5P_coll_md_read_flag_t) +#define H5F_ACS_COLL_MD_READ_FLAG_DEF H5P_USER_FALSE +#define H5F_ACS_COLL_MD_READ_FLAG_ENC H5P__encode_coll_md_read_flag_t +#define H5F_ACS_COLL_MD_READ_FLAG_DEC H5P__decode_coll_md_read_flag_t +/* Definition of collective metadata write mode flag */ +#define H5F_ACS_COLL_MD_WRITE_FLAG_SIZE sizeof(hbool_t) +#define H5F_ACS_COLL_MD_WRITE_FLAG_DEF FALSE +#define H5F_ACS_COLL_MD_WRITE_FLAG_ENC H5P__encode_hbool_t +#define H5F_ACS_COLL_MD_WRITE_FLAG_DEC H5P__decode_hbool_t + /******************/ /* Local Typedefs */ @@ -280,6 +291,8 @@ static const unsigned H5F_def_efc_size_g = H5F_ACS_EFC_SIZE_DEF; static const H5FD_file_image_info_t H5F_def_file_image_info_g = H5F_ACS_FILE_IMAGE_INFO_DEF; /* Default file image info and callbacks */ static const hbool_t H5F_def_core_write_tracking_flag_g = H5F_ACS_CORE_WRITE_TRACKING_FLAG_DEF; /* Default setting for core VFD write tracking */ static const size_t H5F_def_core_write_tracking_page_size_g = H5F_ACS_CORE_WRITE_TRACKING_PAGE_SIZE_DEF; /* Default core VFD write tracking page size */ +static const H5P_coll_md_read_flag_t H5F_def_coll_md_read_flag_g = H5F_ACS_COLL_MD_READ_FLAG_DEF; /* Default setting for the collective metedata read flag */ +static const hbool_t H5F_def_coll_md_write_flag_g = H5F_ACS_COLL_MD_WRITE_FLAG_DEF; /* Default setting for the collective metedata write flag */ @@ -437,6 +450,18 @@ H5P__facc_reg_prop(H5P_genclass_t *pclass) NULL, NULL, NULL, NULL) < 0) HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class") + /* Register the metadata collective read flag */ + if(H5P_register_real(pclass, H5_COLL_MD_READ_FLAG_NAME, H5F_ACS_COLL_MD_READ_FLAG_SIZE, &H5F_def_coll_md_read_flag_g, + NULL, NULL, NULL, H5F_ACS_COLL_MD_READ_FLAG_ENC, H5F_ACS_COLL_MD_READ_FLAG_DEC, + NULL, NULL, NULL, NULL) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class") + + /* Register the metadata collective write flag */ + if(H5P_register_real(pclass, H5F_ACS_COLL_MD_WRITE_FLAG_NAME, H5F_ACS_COLL_MD_WRITE_FLAG_SIZE, &H5F_def_coll_md_write_flag_g, + NULL, NULL, NULL, H5F_ACS_COLL_MD_WRITE_FLAG_ENC, H5F_ACS_COLL_MD_WRITE_FLAG_DEC, + NULL, NULL, NULL, NULL) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class") + done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5P__facc_reg_prop() */ @@ -3481,3 +3506,270 @@ done: FUNC_LEAVE_API(ret_value) } /* end H5Pget_core_write_tracking() */ + +/*------------------------------------------------------------------------- + * Function: H5P__encode_coll_md_read_flag_t + * + * Purpose: Generic encoding callback routine for 'coll_md_read_flag' properties. + * + * Return: Success: Non-negative + * Failure: Negative + * + * Programmer: Mohamad Chaarawi + * Sunday, June 21, 2015 + * + *------------------------------------------------------------------------- + */ +herr_t +H5P__encode_coll_md_read_flag_t(const void *value, void **_pp, size_t *size) +{ + const H5P_coll_md_read_flag_t *coll_md_read_flag = (const H5P_coll_md_read_flag_t *)value; + uint8_t **pp = (uint8_t **)_pp; + + FUNC_ENTER_PACKAGE_NOERR + + /* Sanity checks */ + HDassert(coll_md_read_flag); + HDassert(size); + + if(NULL != *pp) { + /* Encode the value */ + HDmemcpy(*pp, coll_md_read_flag, sizeof(H5P_coll_md_read_flag_t)); + *pp += sizeof(H5P_coll_md_read_flag_t); + } /* end if */ + + /* Set size needed for encoding */ + *size += sizeof(H5P_coll_md_read_flag_t); + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* end H5P__encode_coll_md_read_flag_t() */ + + +/*------------------------------------------------------------------------- + * Function: H5P__decode_coll_md_read_flag_t + * + * Purpose: Generic decoding callback routine for 'coll_md_read_flag' properties. + * + * Return: Success: Non-negative + * Failure: Negative + * + * Programmer: Mohamad Chaarawi + * Sunday, June 21, 2015 + * + *------------------------------------------------------------------------- + */ +herr_t +H5P__decode_coll_md_read_flag_t(const void **_pp, void *_value) +{ + H5P_coll_md_read_flag_t *coll_md_read_flag = (H5P_coll_md_read_flag_t *)_value; /* File close degree */ + const uint8_t **pp = (const uint8_t **)_pp; + + FUNC_ENTER_STATIC_NOERR + + /* Sanity checks */ + HDassert(pp); + HDassert(*pp); + HDassert(coll_md_read_flag); + + /* Decode file close degree */ + *coll_md_read_flag = (H5P_coll_md_read_flag_t)*(*pp); + *pp += sizeof(H5P_coll_md_read_flag_t); + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* end H5P__decode_coll_md_read_flag_t() */ + +#ifdef H5_HAVE_PARALLEL + +/*------------------------------------------------------------------------- + * Function: H5Pset_all_coll_metadata_ops + * + * Purpose: Tell the library whether the metadata read operations will + * be done collectively (1) or not (0). Default is independent. + * With collective mode, the library will optimize access to + * metadata operations on the file. + * + * Note: This routine accepts file access property lists, link + * access property lists, attribute access property lists, + * dataset access property lists, group access property lists, + * named datatype access property lists, + * and dataset transfer property lists. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Mohamad Chaarawi + * Sunday, June 21, 2015 + * + *------------------------------------------------------------------------- + */ +herr_t +H5Pset_all_coll_metadata_ops(hid_t plist_id, hbool_t is_collective) +{ + H5P_genplist_t *plist; /* Property list pointer */ + H5P_coll_md_read_flag_t coll_meta_read; /* Property value */ + herr_t ret_value = SUCCEED; /* return value */ + + FUNC_ENTER_API(FAIL) + H5TRACE2("e", "ib", plist_id, is_collective); + + /* Compare the property list's class against the other class */ + /* (Dataset, group, attribute, and named datype access property lists + * are sub-classes of link access property lists -QAK) + */ + if(TRUE != H5P_isa_class(plist_id, H5P_LINK_ACCESS) && + TRUE != H5P_isa_class(plist_id, H5P_FILE_ACCESS) && + TRUE != H5P_isa_class(plist_id, H5P_DATASET_XFER)) + HGOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL, "property list is not an access plist") + + /* set property to either TRUE if > 0, or FALSE otherwise */ + if(is_collective) + coll_meta_read = H5P_USER_TRUE; + else + coll_meta_read = H5P_USER_FALSE; + + /* Get the plist structure */ + if(NULL == (plist = (H5P_genplist_t *)H5I_object(plist_id))) + HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID") + + /* Set values */ + if(H5P_set(plist, H5_COLL_MD_READ_FLAG_NAME, &coll_meta_read) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set collective metadata read flag") + +done: + FUNC_LEAVE_API(ret_value) +} /* end H5Pset_all_coll_metadata_ops() */ + + +/*------------------------------------------------------------------------- + * Function: H5Pget_coll_metadata_read + * + * Purpose: Gets information about collective metadata read mode. + * + * Note: This routine accepts file access property lists, link + * access property lists, attribute access property lists, + * dataset access property lists, group access property lists, + * named datatype access property lists, + * and dataset transfer property lists. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Mohamad Chaarawi + * Sunday, June 21, 2015 + * + *------------------------------------------------------------------------- + */ +herr_t +H5Pget_coll_metadata_read(hid_t plist_id, hbool_t *is_collective) +{ + herr_t ret_value = SUCCEED; /* return value */ + + FUNC_ENTER_API(FAIL) + H5TRACE2("e", "i*b", plist_id, is_collective); + + /* Compare the property list's class against the other class */ + /* (Dataset, group, attribute, and named datype access property lists + * are sub-classes of link access property lists -QAK) + */ + if(TRUE != H5P_isa_class(plist_id, H5P_LINK_ACCESS) && + TRUE != H5P_isa_class(plist_id, H5P_FILE_ACCESS) && + TRUE != H5P_isa_class(plist_id, H5P_DATASET_XFER)) + HGOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL, "property list is not an access plist") + + /* Get value */ + if(is_collective) { + H5P_coll_md_read_flag_t internal_flag; /* property setting. we need to convert to either TRUE or FALSE */ + H5P_genplist_t *plist; /* Property list pointer */ + + /* Get the plist structure */ + if(NULL == (plist = (H5P_genplist_t *)H5I_object(plist_id))) + HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID") + + if(H5P_get(plist, H5_COLL_MD_READ_FLAG_NAME, &internal_flag) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get core collective metadata read flag") + + if(internal_flag < 0) + *is_collective = FALSE; + else + *is_collective = (hbool_t)internal_flag; + } /* end if */ + +done: + FUNC_LEAVE_API(ret_value) +} /* H5Pget_coll_metadata_read */ + + +/*------------------------------------------------------------------------- + * Function: H5Pset_coll_metadata_write + * + * Purpose: Tell the library whether the metadata write operations will + * be done collectively (1) or not (0). Default is collective. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Mohamad Chaarawi + * Sunday, June 21, 2015 + * + *------------------------------------------------------------------------- + */ +herr_t +H5Pset_coll_metadata_write(hid_t plist_id, hbool_t is_collective) +{ + H5P_genplist_t *plist; /* Property list pointer */ + herr_t ret_value = SUCCEED; /* return value */ + + FUNC_ENTER_API(FAIL) + H5TRACE2("e", "ib", plist_id, is_collective); + + /* Compare the property list's class against the other class */ + if(TRUE != H5P_isa_class(plist_id, H5P_FILE_ACCESS)) + HGOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL, "property list is not a file access plist") + + /* Get the plist structure */ + if(NULL == (plist = (H5P_genplist_t *)H5I_object(plist_id))) + HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID") + + /* Set values */ + if(H5P_set(plist, H5F_ACS_COLL_MD_WRITE_FLAG_NAME, &is_collective) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set collective metadata write flag") + +done: + FUNC_LEAVE_API(ret_value) +} /* end H5Pset_coll_metadata_write() */ + + +/*------------------------------------------------------------------------- + * Function: H5Pget_coll_metadata_write + * + * Purpose: Gets information about collective metadata write mode. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Mohamad Chaarawi + * Sunday, June 21, 2015 + * + *------------------------------------------------------------------------- + */ +herr_t +H5Pget_coll_metadata_write(hid_t plist_id, hbool_t *is_collective) +{ + H5P_genplist_t *plist; /* Property list pointer */ + herr_t ret_value = SUCCEED; /* return value */ + + FUNC_ENTER_API(FAIL) + H5TRACE2("e", "i*b", plist_id, is_collective); + + /* Compare the property list's class against the other class */ + if(TRUE != H5P_isa_class(plist_id, H5P_FILE_ACCESS)) + HGOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL, "property list is not an access plist") + + /* Get the plist structure */ + if(NULL == (plist = (H5P_genplist_t *)H5I_object(plist_id))) + HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID") + + if(H5P_get(plist, H5F_ACS_COLL_MD_WRITE_FLAG_NAME, is_collective) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get collective metadata write flag") + +done: + FUNC_LEAVE_API(ret_value) +} /* end H5Pget_coll_metadata_write() */ +#endif /* H5_HAVE_PARALLEL */ + diff --git a/src/H5Pint.c b/src/H5Pint.c index 4818021..a222082 100644 --- a/src/H5Pint.c +++ b/src/H5Pint.c @@ -29,6 +29,9 @@ /* Headers */ /***********/ #include "H5private.h" /* Generic Functions */ +#ifdef H5_HAVE_PARALLEL +#include "H5ACprivate.h" /* Metadata cache */ +#endif /* H5_HAVE_PARALLEL */ #include "H5Eprivate.h" /* Error handling */ #include "H5Fprivate.h" /* File access */ #include "H5FLprivate.h" /* Free lists */ @@ -5493,11 +5496,31 @@ H5P_verify_apl_and_dxpl(hid_t *acspl_id, const H5P_libclass_t *libclass, hid_t * if(H5P_DEFAULT == *acspl_id) *acspl_id = *libclass->def_plist_id; else { +#ifdef H5_HAVE_PARALLEL + H5P_coll_md_read_flag_t md_coll_read; /* Collective metadata read flag */ + H5P_genplist_t *plist; /* Property list pointer */ +#endif /* H5_HAVE_PARALLEL */ + /* Sanity check the access property list class */ if(TRUE != H5P_isa_class(*acspl_id, *libclass->class_id)) HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not the required access property list") + +#ifdef H5_HAVE_PARALLEL + /* Get the plist structure for the access property list */ + if(NULL == (plist = (H5P_genplist_t *)H5I_object(*acspl_id))) + HGOTO_ERROR(H5E_PLIST, H5E_BADATOM, FAIL, "can't find object for ID") + + /* Get the collective metadata read flag */ + if(H5P_peek(plist, H5_COLL_MD_READ_FLAG_NAME, &md_coll_read) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get core collective metadata read flag") + + /* If collective metadata read requested and using internal DXPL, switch to internal collective DXPL */ + if(H5P_USER_TRUE == md_coll_read) + *dxpl_id = H5AC_coll_read_dxpl_id; +#endif /* H5_HAVE_PARALLEL */ } /* end else */ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5P_verify_apl_and_dxpl() */ + diff --git a/src/H5Plapl.c b/src/H5Plapl.c index 239f2a5..f2711e8 100644 --- a/src/H5Plapl.c +++ b/src/H5Plapl.c @@ -88,6 +88,12 @@ #define H5L_ACS_ELINK_CB_SIZE sizeof(H5L_elink_cb_t) #define H5L_ACS_ELINK_CB_DEF {NULL,NULL} +/* Definition for reading metadata collectively */ +#define H5L_ACS_COLL_MD_READ_SIZE sizeof(H5P_coll_md_read_flag_t) +#define H5L_ACS_COLL_MD_READ_DEF H5P_USER_FALSE +#define H5L_ACS_COLL_MD_READ_ENC H5P__encode_coll_md_read_flag_t +#define H5L_ACS_COLL_MD_READ_DEC H5P__decode_coll_md_read_flag_t + /******************/ /* Local Typedefs */ @@ -164,7 +170,7 @@ static const char *H5L_def_elink_prefix_g = H5L_ACS_ELINK_PREFIX_DEF; /* Default static const hid_t H5L_def_fapl_id_g = H5L_ACS_ELINK_FAPL_DEF; /* Default fapl for external link access */ static const unsigned H5L_def_elink_flags_g = H5L_ACS_ELINK_FLAGS_DEF; /* Default file access flags for external link traversal */ static const H5L_elink_cb_t H5L_def_elink_cb_g = H5L_ACS_ELINK_CB_DEF; /* Default external link traversal callback */ - +static const H5P_coll_md_read_flag_t H5L_def_coll_md_read_g = H5L_ACS_COLL_MD_READ_DEF; /* Default setting for the collective metedata read flag */ /*------------------------------------------------------------------------- @@ -216,6 +222,12 @@ H5P__lacc_reg_prop(H5P_genclass_t *pclass) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) < 0) HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class") + /* Register the metadata collective read flag */ + if(H5P_register_real(pclass, H5_COLL_MD_READ_FLAG_NAME, H5L_ACS_COLL_MD_READ_SIZE, &H5L_def_coll_md_read_g, + NULL, NULL, NULL, H5L_ACS_COLL_MD_READ_ENC, H5L_ACS_COLL_MD_READ_DEC, + NULL, NULL, NULL, NULL) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class") + done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5P__lacc_reg_prop() */ diff --git a/src/H5Ppkg.h b/src/H5Ppkg.h index 5997845..3662cf9 100644 --- a/src/H5Ppkg.h +++ b/src/H5Ppkg.h @@ -192,6 +192,8 @@ H5_DLL herr_t H5P__decode_unsigned(const void **_pp, void *value); H5_DLL herr_t H5P__decode_uint8_t(const void **_pp, void *value); H5_DLL herr_t H5P__decode_hbool_t(const void **_pp, void *value); H5_DLL herr_t H5P__decode_double(const void **_pp, void *value); +H5_DLL herr_t H5P__encode_coll_md_read_flag_t(const void *value, void **_pp, size_t *size); +H5_DLL herr_t H5P__decode_coll_md_read_flag_t(const void **_pp, void *value); /* Private OCPL routines */ H5_DLL herr_t H5P_get_filter(const struct H5Z_filter_info_t *filter, diff --git a/src/H5Pprivate.h b/src/H5Pprivate.h index 9be6e2a..29fb919 100644 --- a/src/H5Pprivate.h +++ b/src/H5Pprivate.h @@ -41,11 +41,19 @@ #define H5P_CLASS(P) (H5P_get_class(P)) #endif /* H5P_MODULE */ +#define H5_COLL_MD_READ_FLAG_NAME "collective_metadata_read" + /****************************/ /* Library Private Typedefs */ /****************************/ +typedef enum H5P_coll_md_read_flag_t { + H5P_FORCE_FALSE = -1, + H5P_USER_FALSE = 0, + H5P_USER_TRUE = 1 +} H5P_coll_md_read_flag_t; + /* Forward declarations (for prototypes & type definitions) */ struct H5O_fill_t; struct H5T_t; diff --git a/src/H5Ppublic.h b/src/H5Ppublic.h index e102776..9308339 100644 --- a/src/H5Ppublic.h +++ b/src/H5Ppublic.h @@ -351,6 +351,12 @@ H5_DLL herr_t H5Pget_file_image_callbacks(hid_t fapl_id, H5FD_file_image_callbacks_t *callbacks_ptr); H5_DLL herr_t H5Pset_core_write_tracking(hid_t fapl_id, hbool_t is_enabled, size_t page_size); H5_DLL herr_t H5Pget_core_write_tracking(hid_t fapl_id, hbool_t *is_enabled, size_t *page_size); +#ifdef H5_HAVE_PARALLEL +H5_DLL herr_t H5Pset_all_coll_metadata_ops(hid_t plist_id, hbool_t is_collective); +H5_DLL herr_t H5Pget_coll_metadata_read(hid_t plist_id, hbool_t *is_collective); +H5_DLL herr_t H5Pset_coll_metadata_write(hid_t plist_id, hbool_t is_collective); +H5_DLL herr_t H5Pget_coll_metadata_write(hid_t plist_id, hbool_t *is_collective); +#endif /* H5_HAVE_PARALLEL */ /* Dataset creation property list (DCPL) routines */ H5_DLL herr_t H5Pset_layout(hid_t plist_id, H5D_layout_t layout); |