diff options
Diffstat (limited to 'src/H5Cmpio.c')
-rw-r--r-- | src/H5Cmpio.c | 305 |
1 files changed, 295 insertions, 10 deletions
diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c index 44733c0..4e88e44 100644 --- a/src/H5Cmpio.c +++ b/src/H5Cmpio.c @@ -39,14 +39,17 @@ #include "H5private.h" /* Generic Functions */ #include "H5ACprivate.h" /* Metadata cache */ #include "H5Cpkg.h" /* Cache */ +#include "H5Dprivate.h" /* Datasets */ #include "H5Eprivate.h" /* Error handling */ #include "H5Fpkg.h" /* Files */ +#include "H5FDprivate.h" /* File drivers */ +#include "H5FLprivate.h" /* Free Lists */ #include "H5Iprivate.h" /* IDs */ #include "H5MMprivate.h" /* Memory management */ +#include "H5Pprivate.h" /* Property lists */ #ifdef H5_HAVE_PARALLEL - /****************/ /* Local Macros */ /****************/ @@ -61,6 +64,9 @@ /********************/ /* Local Prototypes */ /********************/ +static herr_t H5C__collective_write(H5F_t *f, hid_t dxpl_id, + H5SL_t *collective_write_list); +static herr_t H5C__collective_write_free(void *_item, void *key, void *op_data); /*********************/ @@ -77,6 +83,9 @@ /* Local Variables */ /*******************/ +/* Declare a free list to manage the H5C_collective_write_t struct */ +H5FL_DEFINE(H5C_collective_write_t); + /*------------------------------------------------------------------------- @@ -221,6 +230,7 @@ H5C_apply_candidate_list(H5F_t * f, H5C_cache_entry_t * entry_ptr = NULL; H5C_cache_entry_t * flush_ptr = NULL; H5C_cache_entry_t * delayed_ptr = NULL; + H5SL_t * collective_write_list = NULL; #if H5C_DO_SANITY_CHECKS haddr_t last_addr; #endif /* H5C_DO_SANITY_CHECKS */ @@ -253,6 +263,12 @@ H5C_apply_candidate_list(H5F_t * f, HDfprintf(stdout, "%s", tbl_buf); #endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */ + if(f->coll_md_write) { + /* Create skip list of entries for collective write */ + if(NULL == (collective_write_list = H5SL_create(H5SL_TYPE_HADDR, NULL))) + HGOTO_ERROR(H5E_DATASET, H5E_CANTCREATE, FAIL, "can't create skip list for entries") + } /* end if */ + n = num_candidates / mpi_size; m = num_candidates % mpi_size; HDassert(n >= 0); @@ -357,6 +373,16 @@ H5C_apply_candidate_list(H5F_t * f, entries_to_clear++; entry_ptr->clear_on_unprotect = TRUE; } /* end else */ + + /* Entries marked as collectively accessed and are in the + candidate list to clear from the cache have to be + removed from the coll list. This is OK since the + candidate list is collective and uniform across all + ranks. */ + if(TRUE == entry_ptr->coll_access) { + entry_ptr->coll_access = FALSE; + H5C__REMOVE_FROM_COLL_LIST(cache_ptr, entry_ptr, FAIL) + } /* end if */ } /* end else */ } /* end for */ @@ -433,7 +459,7 @@ H5C_apply_candidate_list(H5F_t * f, * will not call either the pre_serialize or serialize callbacks. */ - if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.") } /* end if */ @@ -478,7 +504,8 @@ H5C_apply_candidate_list(H5F_t * f, cache_ptr->entries_removed_counter = 0; cache_ptr->last_entry_removed_ptr = NULL; - if(H5C__flush_single_entry(f, dxpl_id, flush_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + /* Add this entry to the list of entries to collectively write */ + if(H5C__flush_single_entry(f, dxpl_id, flush_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, collective_write_list) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry.") if ( ( cache_ptr->entries_removed_counter > 1 ) || @@ -636,7 +663,7 @@ H5C_apply_candidate_list(H5F_t * f, (long long)clear_ptr->addr); #endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */ - if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.") } /* end else-if */ @@ -652,7 +679,8 @@ H5C_apply_candidate_list(H5F_t * f, (long long)flush_ptr->addr); #endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */ - if(H5C__flush_single_entry(f, dxpl_id, flush_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + /* Add this entry to the list of entries to collectively write */ + if(H5C__flush_single_entry(f, dxpl_id, flush_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, collective_write_list) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.") } /* end else-if */ } /* end if */ @@ -686,20 +714,33 @@ H5C_apply_candidate_list(H5F_t * f, if (delayed_ptr) { if (delayed_ptr->clear_on_unprotect) { + if(H5C__flush_single_entry(f, dxpl_id, delayed_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG, NULL, NULL) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry.") + entry_ptr->clear_on_unprotect = FALSE; entries_cleared++; } else if (delayed_ptr->flush_immediately) { + /* Add this entry to the list of entries to collectively write */ + if(H5C__flush_single_entry(f, dxpl_id, delayed_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, collective_write_list) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry collectively.") + entry_ptr->flush_immediately = FALSE; entries_flushed++; } /* end if */ - if(H5C__flush_single_entry(f, dxpl_id, delayed_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry collectively.") - entries_flushed_collectively++; entries_flushed_or_cleared_last++; } /* end if */ + /* If we've deferred writing to do it collectively, take care of that now */ + if(f->coll_md_write) { + HDassert(collective_write_list); + + /* Write collective list */ + if(H5C__collective_write(f, dxpl_id, collective_write_list) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_WRITEERROR, FAIL, "Can't write metadata collectively") + } /* end if */ + /* ====================================================================== * * Finished flushing everything. * * ====================================================================== */ @@ -719,6 +760,10 @@ done: if(candidate_assignment_table != NULL) candidate_assignment_table = (int *)H5MM_xfree((void *)candidate_assignment_table); + if(collective_write_list) + if(H5SL_destroy(collective_write_list, H5C__collective_write_free, NULL) < 0) + HDONE_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "failed to destroy skip list") + FUNC_LEAVE_NOAPI(ret_value) } /* H5C_apply_candidate_list() */ @@ -1082,6 +1127,14 @@ H5C_mark_entries_as_clean(H5F_t * f, * scan the LRU list shortly, and clear all those entries * not currently protected. */ + + /* Make sure first that we clear the collective flag from + it so it can be cleared */ + if(TRUE == entry_ptr->coll_access) { + entry_ptr->coll_access = FALSE; + H5C__REMOVE_FROM_COLL_LIST(cache_ptr, entry_ptr, FAIL) + } /* end if */ + entry_ptr->clear_on_unprotect = TRUE; #if H5C_DO_SANITY_CHECKS if ( entry_ptr->is_protected ) { @@ -1142,7 +1195,7 @@ H5C_mark_entries_as_clean(H5F_t * f, entry_ptr = entry_ptr->prev; entries_cleared++; - if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0) + if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.") } else { @@ -1170,7 +1223,7 @@ H5C_mark_entries_as_clean(H5F_t * f, entry_ptr = entry_ptr->next; entries_cleared++; - if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0 ) + if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0 ) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.") } else { @@ -1215,5 +1268,237 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* H5C_mark_entries_as_clean() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5C_clear_coll_entries + * + * Purpose: Clear half or the entire list of collective entries and + * mark them as independent. + * + * Return: FAIL if error is detected, SUCCEED otherwise. + * + * Programmer: Mohamad Chaarawi + * April, 2015 + * + *------------------------------------------------------------------------- + */ +herr_t +H5C_clear_coll_entries(H5C_t *cache_ptr, hbool_t partial) +{ + int32_t clear_cnt; + H5C_cache_entry_t * entry_ptr = NULL; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT + + entry_ptr = cache_ptr->coll_tail_ptr; + clear_cnt = (partial ? cache_ptr->coll_list_len / 2 : cache_ptr->coll_list_len); + while(entry_ptr && clear_cnt > 0) { + H5C_cache_entry_t *prev_ptr = entry_ptr->coll_prev; + + /* Sanity check */ + HDassert(entry_ptr->coll_access); + + /* Mark entry as independent */ + entry_ptr->coll_access = FALSE; + H5C__REMOVE_FROM_COLL_LIST(cache_ptr, entry_ptr, FAIL) + + /* Decrement entry count */ + clear_cnt--; + + /* Advance to next entry */ + entry_ptr = prev_ptr; + } /* end while */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5C_clear_coll_entries */ + + +/*------------------------------------------------------------------------- + * + * Function: H5C__collective_write + * + * Purpose: Perform a collective write of a list of metadata entries. + * + * Return: FAIL if error is detected, SUCCEED otherwise. + * + * Programmer: Mohamad Chaarawi + * February, 2016 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5C__collective_write(H5F_t *f, hid_t dxpl_id, H5SL_t *collective_write_list) +{ + H5P_genplist_t *plist = NULL; + H5FD_mpio_xfer_t orig_xfer_mode = H5FD_MPIO_COLLECTIVE; + int count; + int *length_array = NULL; + MPI_Aint *buf_array = NULL; + MPI_Aint *offset_array = NULL; + MPI_Datatype btype; + hbool_t btype_created = FALSE; + MPI_Datatype ftype; + hbool_t ftype_created = FALSE; + int mpi_code; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + + /* Get original transfer mode */ + if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list") + if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &orig_xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property") + + /* Get number of entries in collective write list */ + count = (int)H5SL_count(collective_write_list); + + if(count > 0) { + H5FD_mpio_xfer_t xfer_mode = H5FD_MPIO_COLLECTIVE; + H5SL_node_t *node; + H5C_collective_write_t *item; + void *base_buf; + int i; + + if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property") + + /* Allocate arrays */ + if(NULL == (length_array = (int *)H5MM_malloc((size_t)count * sizeof(int)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory allocation failed for collective write table length array") + if(NULL == (buf_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory allocation failed for collective buf table length array") + if(NULL == (offset_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory allocation failed for collective offset table length array") + + /* Fill arrays */ + node = H5SL_first(collective_write_list); + HDassert(node); + if(NULL == (item = (H5C_collective_write_t *)H5SL_item(node))) + HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item") + + /* Set up initial array position & buffer base address */ + length_array[0] = (int)item->length; + base_buf = item->buf; + buf_array[0] = (MPI_Aint)0; + offset_array[0] = (MPI_Aint)item->offset; + + node = H5SL_next(node); + i = 1; + while(node) { + if(NULL == (item = (H5C_collective_write_t *)H5SL_item(node))) + HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item") + + /* Set up array position */ + length_array[i] = (int)item->length; + buf_array[i] = (MPI_Aint)item->buf - (MPI_Aint)base_buf; + offset_array[i] = (MPI_Aint)item->offset; + + /* Advance to next node & array location */ + node = H5SL_next(node); + i++; + } /* end while */ + + /* Create memory MPI type */ + if(MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed(count, length_array, buf_array, MPI_BYTE, &btype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code) + btype_created = TRUE; + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&btype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) + + /* Create file MPI type */ + if(MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed(count, length_array, offset_array, MPI_BYTE, &ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code) + ftype_created = TRUE; + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&ftype))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) + + /* Pass buf type, file type to the file driver */ + if(H5FD_mpi_setup_collective(dxpl_id, &btype, &ftype) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties") + + /* Write data */ + if(H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0, (size_t)1, dxpl_id, base_buf) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to write entries collectively") + } /* end if */ + else { + MPI_Status mpi_stat; + MPI_File mpi_fh_p; + MPI_File mpi_fh; + + if(H5F_get_mpi_handle(f, (MPI_File **)&mpi_fh_p) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get mpi file handle") + mpi_fh = *(MPI_File*)mpi_fh_p; + + /* just to match up with the 1st MPI_File_set_view from H5FD_mpio_write() */ + if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + + /* just to match up with MPI_File_write_at_all from H5FD_mpio_write() */ + HDmemset(&mpi_stat, 0, sizeof(MPI_Status)); + if(MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(mpi_fh, (MPI_Offset)0, NULL, 0, MPI_BYTE, &mpi_stat))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code) + + /* just to match up with the 2nd MPI_File_set_view (reset) in H5FD_mpio_write() */ + if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + } /* end else */ + +done: + /* Free arrays */ + length_array = (int *)H5MM_xfree(length_array); + buf_array = (MPI_Aint *)H5MM_xfree(buf_array); + offset_array = (MPI_Aint *)H5MM_xfree(offset_array); + + /* Free MPI Types */ + if(btype_created && MPI_SUCCESS != (mpi_code = MPI_Type_free(&btype))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + if(ftype_created && MPI_SUCCESS != (mpi_code = MPI_Type_free(&ftype))) + HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) + + /* Reset dxpl */ + if(orig_xfer_mode != H5FD_MPIO_COLLECTIVE) { + HDassert(plist); + if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &orig_xfer_mode) < 0) + HDONE_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property") + } /* end if */ + + FUNC_LEAVE_NOAPI(ret_value); +} /* end H5C__collective_write() */ + + +/*------------------------------------------------------------------------- + * + * Function: H5C__collective_write_free + * + * Purpose: Release node on collective write skiplist + * + * Return: FAIL if error is detected, SUCCEED otherwise. + * + * Programmer: Mohamad Chaarawi + * February, 2016 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5C__collective_write_free(void *_item, void H5_ATTR_UNUSED *key, void H5_ATTR_UNUSED *op_data) +{ + H5C_collective_write_t *item = (H5C_collective_write_t *)_item; + + FUNC_ENTER_STATIC_NOERR + + /* Sanity check */ + HDassert(item); + + if(item->free_buf) + item->buf = H5MM_xfree(item->buf); + H5FL_FREE(H5C_collective_write_t, item); + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* end H5C__collective_write_free() */ #endif /* H5_HAVE_PARALLEL */ |