From 94c34773ceae5b30c4afb227d0385ebf4ab6ce28 Mon Sep 17 00:00:00 2001 From: mainzer Date: Thu, 6 Apr 2017 18:11:21 -0500 Subject: Checkin of fix for CGNS bug (https://jira.hdfgroup.org/browse/HDFFV-10055). Briefly, in H5C_collective_write() in H5Cmpio.c, the metadata cache attempts to perform a collective write of metadata cache entries. This worked fine as long as all processes had at least one entry to write. However, when the process has no entries, the function tries to participate in the collective write by calling MPI_File_set_view(), MPI_File_write_all() and then MPI_File_set_view() again, to match the calls in H5FD_mpio_write(). After pull request 183, the CGNS test benchmark_hdf5 started failing. On investigation, I determined that the failure occurred in the first call to MPI_File_set_view() in the "no data to write" path through H5C_collective_write(). Note that pull request 183 did not create the problem, it only exposed it. The bug can be observed after pull request 182 if one executes the CGNS progam src/ptests/benchmark_hdf5 with 90 processes. The problem appears to have been that the calls to MPI_File_set_view() in H5C_collective_write() and H5FD_mpio_write() were using different values for the info parameter. I patched the problem by adding a MPI specific VFD call allowing me to get the MPI_Info used in H5FD_mpio_write() for use in MPI_File_set_view() calls in H5C_collective_write(). Tested serial & parallel, debug & production on Jelly. --- src/H5Cdbg.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++- src/H5Cmpio.c | 102 ++++++++++++++++++++++++++++++++++++++----------- src/H5Cprivate.h | 5 ++- src/H5FDmpi.c | 39 +++++++++++++++++++ src/H5FDmpio.c | 37 +++++++++++++++++- src/H5FDprivate.h | 2 + src/H5Fmpi.c | 26 +++++++++++++ src/H5Fprivate.h | 1 + 8 files changed, 298 insertions(+), 26 deletions(-) diff --git a/src/H5Cdbg.c b/src/H5Cdbg.c index 0a98406..455b653 100644 --- a/src/H5Cdbg.c +++ b/src/H5Cdbg.c @@ -30,12 +30,16 @@ #include "H5Cmodule.h" /* This source code file is part of the H5C module */ +#define H5AC_FRIEND + + + /***********/ /* Headers */ /***********/ #include "H5private.h" /* Generic Functions */ -#include "H5ACprivate.h" /* Metadata Cache */ +#include "H5ACpkg.h" /* Metadata Cache */ #include "H5Cpkg.h" /* Cache */ #include "H5Eprivate.h" /* Error Handling */ @@ -340,6 +344,112 @@ H5C_dump_cache_skip_list(H5C_t * cache_ptr, char * calling_fcn) /*------------------------------------------------------------------------- + * Function: H5C_dump_coll_write_list + * + * Purpose: Debugging routine that prints a summary of the contents of + * the collective write skip list used by the metadata cache + * in the parallel case to maintain a list of entries to write + * collectively at a sync point. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: John Mainzer + * 4/1/17 + * + *------------------------------------------------------------------------- + */ +#ifdef H5_HAVE_PARALLEL +#ifndef NDEBUG +herr_t +H5C_dump_coll_write_list(H5C_t * cache_ptr, char * calling_fcn) +{ + herr_t ret_value = SUCCEED; /* Return value */ + int i; + int list_len; + H5AC_aux_t * aux_ptr = NULL; + H5C_cache_entry_t * entry_ptr = NULL; + H5SL_node_t * node_ptr = NULL; + + FUNC_ENTER_NOAPI_NOERR + + HDassert(cache_ptr != NULL); + HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC); + HDassert(cache_ptr->aux_ptr); + + aux_ptr = (H5AC_aux_t *)cache_ptr->aux_ptr; + + HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC); + + HDassert(calling_fcn != NULL); + + list_len = (int)H5SL_count(cache_ptr->coll_write_list); + + HDfprintf(stdout, "\n\nDumping MDC coll write list from %d:%s.\n", + aux_ptr->mpi_rank, calling_fcn); + HDfprintf(stdout, " slist len = %u.\n", cache_ptr->slist_len); + + if ( list_len > 0 ) { + + /* scan the collective write list generating the desired output */ + HDfprintf(stdout, + "Num: Addr: Len: Prot/Pind: Dirty: Type:\n"); + + i = 0; + + node_ptr = H5SL_first(cache_ptr->coll_write_list); + + if ( node_ptr != NULL ) + + entry_ptr = (H5C_cache_entry_t *)H5SL_item(node_ptr); + + else + + entry_ptr = NULL; + + while ( entry_ptr != NULL ) { + + HDassert(entry_ptr->magic == H5C__H5C_CACHE_ENTRY_T_MAGIC); + + HDfprintf(stdout, + "%s%d 0x%016llx %4lld %d/%d %d %s\n", + cache_ptr->prefix, i, + (long long)(entry_ptr->addr), + (long long)(entry_ptr->size), + (int)(entry_ptr->is_protected), + (int)(entry_ptr->is_pinned), + (int)(entry_ptr->is_dirty), + entry_ptr->type->name); + + /* HDfprintf(stdout, " node_ptr = 0x%llx, item = %p\n", + (unsigned long long)node_ptr, + H5SL_item(node_ptr)); + */ + + node_ptr = H5SL_next(node_ptr); + + if ( node_ptr != NULL ) + + entry_ptr = (H5C_cache_entry_t *)H5SL_item(node_ptr); + + else + + entry_ptr = NULL; + + i++; + + } /* end while */ + } /* end if */ + + HDfprintf(stdout, "\n\n"); + + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5C_dump_coll_write_list() */ +#endif /* NDEBUG */ +#endif /* H5_HAVE_PARALLEL */ + + +/*------------------------------------------------------------------------- * Function: H5C_set_prefix * * Purpose: Set the values of the prefix field of H5C_t. This diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c index 06ce714..0d1a3ff 100644 --- a/src/H5Cmpio.c +++ b/src/H5Cmpio.c @@ -950,12 +950,15 @@ H5C__collective_write(H5F_t *f, hid_t dxpl_id) /* Get original transfer mode */ if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list") + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, \ + "not a data transfer property list") + if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &orig_xfer_mode) < 0) HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property") /* Get number of entries in collective write list */ count = (int)H5SL_count(cache_ptr->coll_write_list); + if(count > 0) { H5FD_mpio_xfer_t xfer_mode = H5FD_MPIO_COLLECTIVE; H5SL_node_t *node; @@ -964,21 +967,34 @@ H5C__collective_write(H5F_t *f, hid_t dxpl_id) int i; if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property") + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, \ + "can't set MPI-I/O property") /* Allocate arrays */ - if(NULL == (length_array = (int *)H5MM_malloc((size_t)count * sizeof(int)))) - HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory allocation failed for collective write table length array") - if(NULL == (buf_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint)))) - HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory allocation failed for collective buf table length array") - if(NULL == (offset_array = (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint)))) - HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "memory allocation failed for collective offset table length array") + if ( NULL == (length_array = + (int *)H5MM_malloc((size_t)count * sizeof(int))) ) + + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, \ + "memory allocation failed for collective write table length array") + + if ( NULL == (buf_array = + (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint))) ) + + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, \ + "memory allocation failed for collective buf table length array") + + if(NULL == (offset_array = + (MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint))) ) + + HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, \ + "memory allocation failed for collective offset table length array") /* Fill arrays */ node = H5SL_first(cache_ptr->coll_write_list); HDassert(node); if(NULL == (entry_ptr = (H5C_cache_entry_t *)H5SL_item(node))) - HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item") + HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, \ + "can't retrieve skip list item") /* Set up initial array position & buffer base address */ length_array[0] = (int)entry_ptr->size; @@ -989,8 +1005,10 @@ H5C__collective_write(H5F_t *f, hid_t dxpl_id) node = H5SL_next(node); i = 1; while(node) { + if(NULL == (entry_ptr = (H5C_cache_entry_t *)H5SL_item(node))) - HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, "can't retrieve skip list item") + HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, \ + "can't retrieve skip list item") /* Set up array position */ length_array[i] = (int)entry_ptr->size; @@ -1003,48 +1021,85 @@ H5C__collective_write(H5F_t *f, hid_t dxpl_id) } /* end while */ /* Create memory MPI type */ - if(MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed(count, length_array, buf_array, MPI_BYTE, &btype))) + if(MPI_SUCCESS != (mpi_code = + MPI_Type_create_hindexed(count, length_array, + buf_array, MPI_BYTE, + &btype))) HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code) + btype_created = TRUE; + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&btype))) HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) /* Create file MPI type */ - if(MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed(count, length_array, offset_array, MPI_BYTE, &ftype))) + if(MPI_SUCCESS != (mpi_code = + MPI_Type_create_hindexed(count, length_array, + offset_array, MPI_BYTE, + &ftype))) HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code) + ftype_created = TRUE; + if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&ftype))) HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) /* Pass buf type, file type to the file driver */ if(H5FD_mpi_setup_collective(dxpl_id, &btype, &ftype) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O properties") + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, \ + "can't set MPI-I/O properties") /* Write data */ - if(H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0, (size_t)1, dxpl_id, base_buf) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to write entries collectively") + if(H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0, + (size_t)1, dxpl_id, base_buf) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \ + "unable to write entries collectively") + } /* end if */ else { MPI_Status mpi_stat; - MPI_File mpi_fh_p; + MPI_File *mpi_fh_p; MPI_File mpi_fh; + MPI_Info *info_p; + MPI_Info info; if(H5F_get_mpi_handle(f, (MPI_File **)&mpi_fh_p) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get mpi file handle") + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, \ + "can't get mpi file handle") + mpi_fh = *(MPI_File*)mpi_fh_p; - /* just to match up with the 1st MPI_File_set_view from H5FD_mpio_write() */ - if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL))) + if (H5F_get_mpi_info(f, &info_p) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, \ + "can't get mpi file info") + + info = *info_p; + + /* just to match up with the 1st MPI_File_set_view from + * H5FD_mpio_write() + */ + if(MPI_SUCCESS != (mpi_code = + MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE, + MPI_BYTE, "native", + info))) HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) /* just to match up with MPI_File_write_at_all from H5FD_mpio_write() */ HDmemset(&mpi_stat, 0, sizeof(MPI_Status)); - if(MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(mpi_fh, (MPI_Offset)0, NULL, 0, MPI_BYTE, &mpi_stat))) + if(MPI_SUCCESS != (mpi_code = + MPI_File_write_at_all(mpi_fh, (MPI_Offset)0, + NULL, 0, MPI_BYTE, &mpi_stat))) HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code) - /* just to match up with the 2nd MPI_File_set_view (reset) in H5FD_mpio_write() */ - if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL))) + /* just to match up with the 2nd MPI_File_set_view (reset) in + * H5FD_mpio_write() + */ + if(MPI_SUCCESS != (mpi_code = + MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE, + MPI_BYTE, "native", + info))) HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) + } /* end else */ done: @@ -1063,7 +1118,8 @@ done: if(orig_xfer_mode != H5FD_MPIO_COLLECTIVE) { HDassert(plist); if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &orig_xfer_mode) < 0) - HDONE_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property") + HDONE_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, \ + "can't set MPI-I/O property") } /* end if */ FUNC_LEAVE_NOAPI(ret_value); diff --git a/src/H5Cprivate.h b/src/H5Cprivate.h index bdfb23e..5c5a666 100644 --- a/src/H5Cprivate.h +++ b/src/H5Cprivate.h @@ -2340,9 +2340,12 @@ H5_DLL herr_t H5C_dump_cache_LRU(H5C_t *cache_ptr, const char *cache_name); H5_DLL hbool_t H5C_get_serialization_in_progress(const H5C_t *cache_ptr); H5_DLL hbool_t H5C_cache_is_clean(const H5C_t *cache_ptr, H5C_ring_t inner_ring); H5_DLL herr_t H5C_dump_cache_skip_list(H5C_t *cache_ptr, char *calling_fcn); +#ifdef H5_HAVE_PARALLEL +H5_DLL herr_t H5C_dump_coll_write_list(H5C_t * cache_ptr, char * calling_fcn); +#endif /* H5_HAVE_PARALLEL */ H5_DLL herr_t H5C_get_entry_ptr_from_addr(H5C_t *cache_ptr, haddr_t addr, void **entry_ptr_ptr); -H5_DLL herr_t H5C_flush_dependency_exists(H5C_t *cache_ptr, haddr_t parent_addr, +H5_DLL herr_t H5C_flush_dependency_exists(H5C_t *cache_ptr, haddr_t parent_addr, haddr_t child_addr, hbool_t *fd_exists_ptr); H5_DLL herr_t H5C_verify_entry_type(H5C_t *cache_ptr, haddr_t addr, const H5C_class_t *expected_type, hbool_t *in_cache_ptr, diff --git a/src/H5FDmpi.c b/src/H5FDmpi.c index fdc4eca..bf4e03a 100644 --- a/src/H5FDmpi.c +++ b/src/H5FDmpi.c @@ -148,6 +148,45 @@ done: /*------------------------------------------------------------------------- + * Function: H5FD_get_mpi_info + * + * Purpose: Retrieves the file's mpi info + * + * Return: Success: SUCCEED + * + * Failure: Negative + * + * Programmer: John Mainzer + * 4/4/17 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +herr_t +H5FD_get_mpi_info(H5FD_t *file, void** mpi_info) +{ + const H5FD_class_mpi_t *cls; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT + + HDassert(file); + cls = (const H5FD_class_mpi_t *)(file->cls); + HDassert(cls); + HDassert(cls->get_mpi_info); /* All MPI drivers must implement this */ + + /* Dispatch to driver */ + if ((ret_value=(cls->get_mpi_info)(file, mpi_info)) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, MPI_COMM_NULL, \ + "driver get_mpi_info request failed") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD_get_mpi_info() */ + + +/*------------------------------------------------------------------------- * Function: H5FD_mpi_MPIOff_to_haddr * * Purpose: Convert an MPI_Offset value to haddr_t. diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index a3a404f..9417d46 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -95,6 +95,7 @@ static herr_t H5FD_mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); static int H5FD_mpio_mpi_rank(const H5FD_t *_file); static int H5FD_mpio_mpi_size(const H5FD_t *_file); static MPI_Comm H5FD_mpio_communicator(const H5FD_t *_file); +static herr_t H5FD_mpio_get_info(H5FD_t *_file, void** mpi_info); /* The MPIO file driver information */ static const H5FD_class_mpi_t H5FD_mpio_g = { @@ -134,7 +135,8 @@ static const H5FD_class_mpi_t H5FD_mpio_g = { }, /* End of superclass information */ H5FD_mpio_mpi_rank, /*get_rank */ H5FD_mpio_mpi_size, /*get_size */ - H5FD_mpio_communicator /*get_comm */ + H5FD_mpio_communicator, /*get_comm */ + H5FD_mpio_get_info /*get_info */ }; #ifdef H5FDmpio_DEBUG @@ -1308,6 +1310,39 @@ done: /*------------------------------------------------------------------------- + * Function: H5FD_mpio_get_info + * + * Purpose: Returns the file info of MPIO file driver. + * + * Returns: Non-negative if succeed or negative if fails. + * + * Programmer: John Mainzer + * April 4, 2017 + * + * Modifications: + * + *------------------------------------------------------------------------- +*/ +static herr_t +H5FD_mpio_get_info(H5FD_t *_file, void** mpi_info) +{ + H5FD_mpio_t *file = (H5FD_mpio_t *)_file; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT + + if(!mpi_info) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "mpi info not valid") + + *mpi_info = &(file->info); + +done: + FUNC_LEAVE_NOAPI(ret_value) + +} /* H5FD_mpio_get_info() */ + + +/*------------------------------------------------------------------------- * Function: H5FD_mpio_read * * Purpose: Reads SIZE bytes of data from FILE beginning at address ADDR diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h index c64ec30..fb7c43c 100644 --- a/src/H5FDprivate.h +++ b/src/H5FDprivate.h @@ -53,6 +53,7 @@ typedef struct H5FD_class_mpi_t { int (*get_rank)(const H5FD_t *file); /* Get the MPI rank of a process */ int (*get_size)(const H5FD_t *file); /* Get the MPI size of a communicator */ MPI_Comm (*get_comm)(const H5FD_t *file); /* Get the communicator for a file */ + herr_t (*get_mpi_info)(H5FD_t *file, void** mpi_info); /* get MPI_Info for a file */ } H5FD_class_mpi_t; #endif @@ -202,6 +203,7 @@ H5_DLL herr_t H5FD_get_mpio_atomicity(H5FD_t *file, hbool_t *flag); H5_DLL int H5FD_mpi_get_rank(const H5FD_t *file); H5_DLL int H5FD_mpi_get_size(const H5FD_t *file); H5_DLL MPI_Comm H5FD_mpi_get_comm(const H5FD_t *_file); +H5_DLL herr_t H5FD_get_mpi_info(H5FD_t *file, void** file_info); #endif /* H5_HAVE_PARALLEL */ #endif /* !_H5FDprivate_H */ diff --git a/src/H5Fmpi.c b/src/H5Fmpi.c index 5434aa5..60593a8 100644 --- a/src/H5Fmpi.c +++ b/src/H5Fmpi.c @@ -356,5 +356,31 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5F_mpi_retrieve_comm */ +/*------------------------------------------------------------------------- + * Function: H5F_get_mpi_info + * + * Purpose: Retrieves MPI File info. + * + * Return: Success: The size (positive) + * Failure: Negative + * + *------------------------------------------------------------------------- + */ +herr_t +H5F_get_mpi_info(const H5F_t *f, MPI_Info **f_info) +{ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI(FAIL) + + HDassert(f && f->shared); + + /* Dispatch to driver */ + if ((ret_value = H5FD_get_mpi_info(f->shared->lf, (void **)f_info)) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get mpi file info") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5F_get_mpi_info() */ #endif /* H5_HAVE_PARALLEL */ diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h index 886063a..8ef353a 100644 --- a/src/H5Fprivate.h +++ b/src/H5Fprivate.h @@ -852,6 +852,7 @@ H5_DLL int H5F_mpi_get_rank(const H5F_t *f); H5_DLL MPI_Comm H5F_mpi_get_comm(const H5F_t *f); H5_DLL int H5F_mpi_get_size(const H5F_t *f); H5_DLL herr_t H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mpi_comm); +H5_DLL herr_t H5F_get_mpi_info(const H5F_t *f, MPI_Info **f_info); #endif /* H5_HAVE_PARALLEL */ /* External file cache routines */ -- cgit v0.12