summaryrefslogtreecommitdiffstats
path: root/src/H5Cmpio.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/H5Cmpio.c')
-rw-r--r--src/H5Cmpio.c305
1 files changed, 295 insertions, 10 deletions
diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c
index 44733c0..4e88e44 100644
--- a/src/H5Cmpio.c
+++ b/src/H5Cmpio.c
@@ -39,14 +39,17 @@
#include "H5private.h" /* Generic Functions */
#include "H5ACprivate.h" /* Metadata cache */
#include "H5Cpkg.h" /* Cache */
+#include "H5Dprivate.h" /* Datasets */
#include "H5Eprivate.h" /* Error handling */
#include "H5Fpkg.h" /* Files */
+#include "H5FDprivate.h" /* File drivers */
+#include "H5FLprivate.h" /* Free Lists */
#include "H5Iprivate.h" /* IDs */
#include "H5MMprivate.h" /* Memory management */
+#include "H5Pprivate.h" /* Property lists */
#ifdef H5_HAVE_PARALLEL
-
/****************/
/* Local Macros */
/****************/
@@ -61,6 +64,9 @@
/********************/
/* Local Prototypes */
/********************/
+static herr_t H5C__collective_write(H5F_t *f, hid_t dxpl_id,
+ H5SL_t *collective_write_list);
+static herr_t H5C__collective_write_free(void *_item, void *key, void *op_data);
/*********************/
@@ -77,6 +83,9 @@
/* Local Variables */
/*******************/
+/* Declare a free list to manage the H5C_collective_write_t struct */
+H5FL_DEFINE(H5C_collective_write_t);
+
/*-------------------------------------------------------------------------
@@ -221,6 +230,7 @@ H5C_apply_candidate_list(H5F_t * f,
H5C_cache_entry_t * entry_ptr = NULL;
H5C_cache_entry_t * flush_ptr = NULL;
H5C_cache_entry_t * delayed_ptr = NULL;
+ H5SL_t * collective_write_list = NULL;
#if H5C_DO_SANITY_CHECKS
haddr_t last_addr;
#endif /* H5C_DO_SANITY_CHECKS */
@@ -253,6 +263,12 @@ H5C_apply_candidate_list(H5F_t * f,
HDfprintf(stdout, "%s", tbl_buf);
#endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */
+ if(f->coll_md_write) {
+ /* Create skip list of entries for collective write */
+ if(NULL == (collective_write_list = H5SL_create(H5SL_TYPE_HADDR, NULL)))
+ HGOTO_ERROR(H5E_DATASET, H5E_CANTCREATE, FAIL, "can't create skip list for entries")
+ } /* end if */
+
n = num_candidates / mpi_size;
m = num_candidates % mpi_size;
HDassert(n >= 0);
@@ -357,6 +373,16 @@ H5C_apply_candidate_list(H5F_t * f,
entries_to_clear++;
entry_ptr->clear_on_unprotect = TRUE;
} /* end else */
+
+ /* Entries marked as collectively accessed and are in the
+ candidate list to clear from the cache have to be
+ removed from the coll list. This is OK since the
+ candidate list is collective and uniform across all
+ ranks. */
+ if(TRUE == entry_ptr->coll_access) {
+ entry_ptr->coll_access = FALSE;
+ H5C__REMOVE_FROM_COLL_LIST(cache_ptr, entry_ptr, FAIL)
+ } /* end if */
} /* end else */
} /* end for */
@@ -433,7 +459,7 @@ H5C_apply_candidate_list(H5F_t * f,
* will not call either the pre_serialize or serialize callbacks.
*/
- if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0)
+ if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.")
} /* end if */
@@ -478,7 +504,8 @@ H5C_apply_candidate_list(H5F_t * f,
cache_ptr->entries_removed_counter = 0;
cache_ptr->last_entry_removed_ptr = NULL;
- if(H5C__flush_single_entry(f, dxpl_id, flush_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0)
+ /* Add this entry to the list of entries to collectively write */
+ if(H5C__flush_single_entry(f, dxpl_id, flush_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, collective_write_list) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry.")
if ( ( cache_ptr->entries_removed_counter > 1 ) ||
@@ -636,7 +663,7 @@ H5C_apply_candidate_list(H5F_t * f,
(long long)clear_ptr->addr);
#endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */
- if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0)
+ if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.")
} /* end else-if */
@@ -652,7 +679,8 @@ H5C_apply_candidate_list(H5F_t * f,
(long long)flush_ptr->addr);
#endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */
- if(H5C__flush_single_entry(f, dxpl_id, flush_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0)
+ /* Add this entry to the list of entries to collectively write */
+ if(H5C__flush_single_entry(f, dxpl_id, flush_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, collective_write_list) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.")
} /* end else-if */
} /* end if */
@@ -686,20 +714,33 @@ H5C_apply_candidate_list(H5F_t * f,
if (delayed_ptr) {
if (delayed_ptr->clear_on_unprotect) {
+ if(H5C__flush_single_entry(f, dxpl_id, delayed_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG, NULL, NULL) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry.")
+
entry_ptr->clear_on_unprotect = FALSE;
entries_cleared++;
} else if (delayed_ptr->flush_immediately) {
+ /* Add this entry to the list of entries to collectively write */
+ if(H5C__flush_single_entry(f, dxpl_id, delayed_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, collective_write_list) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry collectively.")
+
entry_ptr->flush_immediately = FALSE;
entries_flushed++;
} /* end if */
- if(H5C__flush_single_entry(f, dxpl_id, delayed_ptr, H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush entry collectively.")
-
entries_flushed_collectively++;
entries_flushed_or_cleared_last++;
} /* end if */
+ /* If we've deferred writing to do it collectively, take care of that now */
+ if(f->coll_md_write) {
+ HDassert(collective_write_list);
+
+ /* Write collective list */
+ if(H5C__collective_write(f, dxpl_id, collective_write_list) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_WRITEERROR, FAIL, "Can't write metadata collectively")
+ } /* end if */
+
/* ====================================================================== *
* Finished flushing everything. *
* ====================================================================== */
@@ -719,6 +760,10 @@ done:
if(candidate_assignment_table != NULL)
candidate_assignment_table = (int *)H5MM_xfree((void *)candidate_assignment_table);
+ if(collective_write_list)
+ if(H5SL_destroy(collective_write_list, H5C__collective_write_free, NULL) < 0)
+ HDONE_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "failed to destroy skip list")
+
FUNC_LEAVE_NOAPI(ret_value)
} /* H5C_apply_candidate_list() */
@@ -1082,6 +1127,14 @@ H5C_mark_entries_as_clean(H5F_t * f,
* scan the LRU list shortly, and clear all those entries
* not currently protected.
*/
+
+ /* Make sure first that we clear the collective flag from
+ it so it can be cleared */
+ if(TRUE == entry_ptr->coll_access) {
+ entry_ptr->coll_access = FALSE;
+ H5C__REMOVE_FROM_COLL_LIST(cache_ptr, entry_ptr, FAIL)
+ } /* end if */
+
entry_ptr->clear_on_unprotect = TRUE;
#if H5C_DO_SANITY_CHECKS
if ( entry_ptr->is_protected ) {
@@ -1142,7 +1195,7 @@ H5C_mark_entries_as_clean(H5F_t * f,
entry_ptr = entry_ptr->prev;
entries_cleared++;
- if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0)
+ if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.")
} else {
@@ -1170,7 +1223,7 @@ H5C_mark_entries_as_clean(H5F_t * f,
entry_ptr = entry_ptr->next;
entries_cleared++;
- if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL) < 0 )
+ if(H5C__flush_single_entry(f, dxpl_id, clear_ptr, H5C__FLUSH_CLEAR_ONLY_FLAG | H5C__DEL_FROM_SLIST_ON_DESTROY_FLAG, NULL, NULL) < 0 )
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't clear entry.")
} else {
@@ -1215,5 +1268,237 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* H5C_mark_entries_as_clean() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5C_clear_coll_entries
+ *
+ * Purpose: Clear half or the entire list of collective entries and
+ * mark them as independent.
+ *
+ * Return: FAIL if error is detected, SUCCEED otherwise.
+ *
+ * Programmer: Mohamad Chaarawi
+ * April, 2015
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5C_clear_coll_entries(H5C_t *cache_ptr, hbool_t partial)
+{
+ int32_t clear_cnt;
+ H5C_cache_entry_t * entry_ptr = NULL;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI_NOINIT
+
+ entry_ptr = cache_ptr->coll_tail_ptr;
+ clear_cnt = (partial ? cache_ptr->coll_list_len / 2 : cache_ptr->coll_list_len);
+ while(entry_ptr && clear_cnt > 0) {
+ H5C_cache_entry_t *prev_ptr = entry_ptr->coll_prev;
+
+ /* Sanity check */
+ HDassert(entry_ptr->coll_access);
+
+ /* Mark entry as independent */
+ entry_ptr->coll_access = FALSE;
+ H5C__REMOVE_FROM_COLL_LIST(cache_ptr, entry_ptr, FAIL)
+
+ /* Decrement entry count */
+ clear_cnt--;
+
+ /* Advance to next entry */
+ entry_ptr = prev_ptr;
+ } /* end while */
+
+done:
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* H5C_clear_coll_entries */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5C__collective_write
+ *
+ * Purpose: Perform a collective write of a list of metadata entries.
+ *
+ * Return: FAIL if error is detected, SUCCEED otherwise.
+ *
+ * Programmer: Mohamad Chaarawi
+ * February, 2016
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5C__collective_write(H5F_t *f, hid_t dxpl_id, H5SL_t *collective_write_list)
+{
+ H5P_genplist_t *plist = NULL;
+ H5FD_mpio_xfer_t orig_xfer_mode = H5FD_MPIO_COLLECTIVE;
+ int count;
+ int *length_array = NULL;
+ MPI_Aint *buf_array = NULL;
+ MPI_Aint *offset_array = NULL;
+ MPI_Datatype btype;
+ hbool_t btype_created = FALSE;
+ MPI_Datatype ftype;
+ hbool_t ftype_created = FALSE;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_STATIC
+
+ /* Get original transfer mode */
+ if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id)))
+ HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list")
+ if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &orig_xfer_mode) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property")
+
+ /* Get number of entries in collective write list */
+ count = (int)H5SL_count(collective_write_list);
+
+ if(count > 0) {
+ H5FD_mpio_xfer_t xfer_mode = H5FD_MPIO_COLLECTIVE;
+ H5SL_node_t *node;
+ H5C_collective_write_t *item;
+ void *base_buf;
+ int i;
+
+ if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property")
+
+ /* Allocate arrays */
+ if(NULL == (length_array = (int *)H5MM_malloc((size_t)count * sizeof(int))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory allocation failed for collective write table length array")
+ if(NULL == (buf_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory allocation failed for collective buf table length array")
+ if(NULL == (offset_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory allocation failed for collective offset table length array")
+
+ /* Fill arrays */
+ node = H5SL_first(collective_write_list);
+ HDassert(node);
+ if(NULL == (item = (H5C_collective_write_t *)H5SL_item(node)))
+ HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item")
+
+ /* Set up initial array position & buffer base address */
+ length_array[0] = (int)item->length;
+ base_buf = item->buf;
+ buf_array[0] = (MPI_Aint)0;
+ offset_array[0] = (MPI_Aint)item->offset;
+
+ node = H5SL_next(node);
+ i = 1;
+ while(node) {
+ if(NULL == (item = (H5C_collective_write_t *)H5SL_item(node)))
+ HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item")
+
+ /* Set up array position */
+ length_array[i] = (int)item->length;
+ buf_array[i] = (MPI_Aint)item->buf - (MPI_Aint)base_buf;
+ offset_array[i] = (MPI_Aint)item->offset;
+
+ /* Advance to next node & array location */
+ node = H5SL_next(node);
+ i++;
+ } /* end while */
+
+ /* Create memory MPI type */
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed(count, length_array, buf_array, MPI_BYTE, &btype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
+ btype_created = TRUE;
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&btype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+ /* Create file MPI type */
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed(count, length_array, offset_array, MPI_BYTE, &ftype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
+ ftype_created = TRUE;
+ if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&ftype)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+ /* Pass buf type, file type to the file driver */
+ if(H5FD_mpi_setup_collective(dxpl_id, &btype, &ftype) < 0)
+ HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties")
+
+ /* Write data */
+ if(H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0, (size_t)1, dxpl_id, base_buf) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to write entries collectively")
+ } /* end if */
+ else {
+ MPI_Status mpi_stat;
+ MPI_File mpi_fh_p;
+ MPI_File mpi_fh;
+
+ if(H5F_get_mpi_handle(f, (MPI_File **)&mpi_fh_p) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get mpi file handle")
+ mpi_fh = *(MPI_File*)mpi_fh_p;
+
+ /* just to match up with the 1st MPI_File_set_view from H5FD_mpio_write() */
+ if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+
+ /* just to match up with MPI_File_write_at_all from H5FD_mpio_write() */
+ HDmemset(&mpi_stat, 0, sizeof(MPI_Status));
+ if(MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(mpi_fh, (MPI_Offset)0, NULL, 0, MPI_BYTE, &mpi_stat)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code)
+
+ /* just to match up with the 2nd MPI_File_set_view (reset) in H5FD_mpio_write() */
+ if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
+ } /* end else */
+
+done:
+ /* Free arrays */
+ length_array = (int *)H5MM_xfree(length_array);
+ buf_array = (MPI_Aint *)H5MM_xfree(buf_array);
+ offset_array = (MPI_Aint *)H5MM_xfree(offset_array);
+
+ /* Free MPI Types */
+ if(btype_created && MPI_SUCCESS != (mpi_code = MPI_Type_free(&btype)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+ if(ftype_created && MPI_SUCCESS != (mpi_code = MPI_Type_free(&ftype)))
+ HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+
+ /* Reset dxpl */
+ if(orig_xfer_mode != H5FD_MPIO_COLLECTIVE) {
+ HDassert(plist);
+ if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &orig_xfer_mode) < 0)
+ HDONE_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property")
+ } /* end if */
+
+ FUNC_LEAVE_NOAPI(ret_value);
+} /* end H5C__collective_write() */
+
+
+/*-------------------------------------------------------------------------
+ *
+ * Function: H5C__collective_write_free
+ *
+ * Purpose: Release node on collective write skiplist
+ *
+ * Return: FAIL if error is detected, SUCCEED otherwise.
+ *
+ * Programmer: Mohamad Chaarawi
+ * February, 2016
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5C__collective_write_free(void *_item, void H5_ATTR_UNUSED *key, void H5_ATTR_UNUSED *op_data)
+{
+ H5C_collective_write_t *item = (H5C_collective_write_t *)_item;
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Sanity check */
+ HDassert(item);
+
+ if(item->free_buf)
+ item->buf = H5MM_xfree(item->buf);
+ H5FL_FREE(H5C_collective_write_t, item);
+
+ FUNC_LEAVE_NOAPI(SUCCEED)
+} /* end H5C__collective_write_free() */
#endif /* H5_HAVE_PARALLEL */