From 8aa520584463a5151699dc5768bcdef39f5564a6 Mon Sep 17 00:00:00 2001 From: "M. Scot Breitenfeld" Date: Mon, 18 Dec 2017 13:42:09 -0600 Subject: Optimized version of avoid truncate patch. --- src/H5FDmpio.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/H5FDprivate.h | 3 + src/H5Fint.c | 11 +++ 3 files changed, 212 insertions(+) diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index f594d8e..3fe6ed9 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -56,6 +56,7 @@ static char H5FD_mpi_native_g[] = "native"; * library to determine whether the file is empty, truncated, or okay. The MPIO * driver doesn't bother to keep it updated since it's an expensive operation. */ +#if 0 /* original version */ /* JRM */ typedef struct H5FD_mpio_t { H5FD_t pub; /*public stuff, must be first */ MPI_File f; /*MPIO file handle */ @@ -68,6 +69,26 @@ typedef struct H5FD_mpio_t { haddr_t last_eoa; /* Last known end-of-address marker */ haddr_t local_eof; /* Local end-of-file address for each process */ } H5FD_mpio_t; +#else /* modified version */ /* JRM */ +typedef struct H5FD_mpio_t { + H5FD_t pub; /*public stuff, must be first */ + MPI_File f; /*MPIO file handle */ + MPI_Comm comm; /*communicator */ + MPI_Info info; /*file information */ + int mpi_rank; /* This process's rank */ + int mpi_size; /* Total number of processes */ + haddr_t eof; /*end-of-file marker */ + haddr_t eoa; /*end-of-address marker */ + haddr_t last_eoa; /* Last known end-of-address marker */ + haddr_t local_eof; /* Local end-of-file address for each process */ + herr_t do_pre_trunc_barrier; /* hack to allow us to skip */ + /* unnecessary barriers in */ + /* H5FD_mpio_trucate() without a VFD */ + /* API change. This should be removed */ + /* as soon as be make the necessary */ + /* VFD API change. */ +} H5FD_mpio_t; +#endif /* modified version */ /* JRM */ /* Private Prototypes */ @@ -1039,6 +1060,11 @@ H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id, file->eof = H5FD_mpi_MPIOff_to_haddr(size); file->local_eof = file->eof; +#if 1 /* JRM */ + /* Mark initial barriers in H5FD_mpio_truncate() as necessary */ + file->do_pre_trunc_barrier = TRUE; +#endif /* JRM */ + /* Set return value */ ret_value=(H5FD_t*)file; @@ -1930,6 +1956,7 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5FD_mpio_flush() */ +#if 0 /* original version */ /*------------------------------------------------------------------------- * Function: H5FD_mpio_truncate @@ -1996,6 +2023,177 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5FD_mpio_truncate() */ +#else /* modified versin */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mark_pre_trunc_barrier_unecessary + * + * Purpose: Hack to allow us to avoid most unnecessary barriers + * prior in H5FD_mpio_truncate(). + * + * This function should be deleted when next we modify the + * VFD interface. This change should allow us to tell the + * truncate function to omit the initial barrier if no + * file I/O has occurred since the last barrier. + * + * Return: void + * + * + * Programmer: John Mainzer + * 12/14/17 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +void +H5FD_mpio_mark_pre_trunc_barrier_unecessary(H5FD_t *_file) +{ + H5FD_mpio_t *file = (H5FD_mpio_t*)_file; + + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(file); + HDassert(H5FD_MPIO == file->pub.driver_id); + + file->do_pre_trunc_barrier = FALSE; + + FUNC_LEAVE_NOAPI_VOID + +} /* end H5FD_mpio_mark_pre_trunc_barrier_unecessary() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpio_truncate + * + * Purpose: Make certain the file's size matches it's allocated size + * + * This is a little sticky in the mpio case, as it is not + * easy for us to track the current EOF by extracting it from + * write calls. + * + * Instead, we first check to see if the eoa has changed since + * the last call to this function. If it has, we call + * MPI_File_get_size() to determine the current EOF, and + * only call MPI_File_set_size() if this value disagrees + * with the current eoa. + * + * Return: Success: Non-negative + * Failure: Negative + * + * Programmer: Quincey Koziol + * January 31, 2008 + * + * Changes: Heavily reworked to avoid unnecessary MPI_File_set_size() + * calls. The hope is that these calls are superfluous in the + * typical case, allowing us to avoid truncates most of the + * time. + * + * The basic idea is to query the file system to get the + * current eof, and only truncate if the file systems + * conception of the eof disagrees with our eoa. + * + * JRM -- 10/27/17 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD_mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing) +{ + H5FD_mpio_t *file = (H5FD_mpio_t*)_file; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT + +#ifdef H5FDmpio_DEBUG + if(H5FD_mpio_Debug[(int)'t']) + HDfprintf(stdout, "Entering %s\n", FUNC); +#endif + HDassert(file); + HDassert(H5FD_MPIO == file->pub.driver_id); + + if ( !H5F_addr_eq(file->eoa, file->last_eoa) ) { + + int mpi_code; /* mpi return code */ + MPI_Offset size; + MPI_Offset needed_eof; + + /* In principle, it is possible for the size returned by the + * call to MPI_File_get_size() to depend on whether writes from + * all proceeses have completed at the time process 0 makes the + * call. + * + * In practice, most (all?) truncate calls will come after a barrier + * and with no interviening writes to the file (with the possible + * exception of sueprblock / superblock extension message updates). + * + * Unfortunately, the current VFD API doesn't let us pass in a + * flag indicating whether this particular call is unnecessary. + * To work around this, I have added the new function + * H5FD_mpio_mark_pre_trunc_barrier_unecessary() allow us to + * set a flag in H5FD_mpio_t indicating that we can skip the + * barrier. + * + * This is a pretty ugly hack, but until we revise the VFD API, + * it is about the best we can do. + */ + if (file->do_pre_trunc_barrier) { + if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed(1)", mpi_code) + } + + /* Only processor p0 will get the filesize and broadcast it. */ + if (file->mpi_rank == 0) { + if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(file->f, &size))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code) + } /* end if */ + + /* Broadcast file size */ + if(MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), + MPI_BYTE, 0, file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code) + + if(H5FD_mpi_haddr_to_MPIOff(file->eoa, &needed_eof) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, \ + "cannot convert from haddr_t to MPI_Offset") + + if (size != needed_eof) /* eoa != eof. Set eof to eoa */ { + + /* Extend the file's size */ + if(MPI_SUCCESS != (mpi_code=MPI_File_set_size(file->f, needed_eof))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code) + + /* In general, we must wait until all processes have finished + * the truncate before any process can continue, since it is + * possible that a process would write at the end of the + * file, and this write would be discarded by the truncate. + * + * While this is an issue for a user initiated flush, it may + * not be an issue at file close. If so, we may be able to + * optimize out the following barrier in that case. + */ + if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) + } + + /* Update the 'last' eoa value */ + file->last_eoa = file->eoa; + } /* end if */ + +done: + file->do_pre_trunc_barrier = TRUE; + +#ifdef H5FDmpio_DEBUG + if(H5FD_mpio_Debug[(int)'t']) + HDfprintf(stdout, "Leaving %s\n", FUNC); +#endif + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD_mpio_truncate() */ + +#endif /* modified version */ + /*------------------------------------------------------------------------- * Function: H5FD_mpio_mpi_rank diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h index e758951..f7be327 100644 --- a/src/H5FDprivate.h +++ b/src/H5FDprivate.h @@ -203,6 +203,9 @@ H5_DLL int H5FD_mpi_get_rank(const H5FD_t *file); H5_DLL int H5FD_mpi_get_size(const H5FD_t *file); H5_DLL MPI_Comm H5FD_mpi_get_comm(const H5FD_t *_file); H5_DLL herr_t H5FD_get_mpi_info(H5FD_t *file, void** file_info); +#if 1 /* JRM */ +H5_DLL void H5FD_mpio_mark_pre_trunc_barrier_unecessary(H5FD_t *_file); +#endif /* JRM */ #endif /* H5_HAVE_PARALLEL */ #endif /* !_H5FDprivate_H */ diff --git a/src/H5Fint.c b/src/H5Fint.c index 8212eb5..25df964 100644 --- a/src/H5Fint.c +++ b/src/H5Fint.c @@ -1532,6 +1532,17 @@ H5F__flush_phase2(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t closi HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush metadata cache") /* Truncate the file to the current allocated size */ +#if 1 /* JRM */ +#ifdef H5_HAVE_PARALLEL + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { + /* Since we just returned from a call to H5AC_flush(), we just + * passed through a barrier. Hence we can skip the barrier on + * entry to the mpio file driver call below. + */ + H5FD_mpio_mark_pre_trunc_barrier_unecessary(f->shared->lf); + } +#endif /* H5_HAVE_PARALLEL */ +#endif /* JRM */ if(H5FD_truncate(f->shared->lf, meta_dxpl_id, closing) < 0) /* Push error, but keep going*/ HDONE_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "low level truncate failed") -- cgit v0.12 From 4823abf3ee0d9df1c3f23ae7e36ce4d1d146afd6 Mon Sep 17 00:00:00 2001 From: mainzer Date: Sun, 11 Mar 2018 22:32:09 -0500 Subject: Removed commented out code from H5FDmpio.c, H5FDprivate.h and H5Fint.c Tested parallel (debug and production) and serial (production) --- src/H5FDmpio.c | 88 ------------------------------------------------------- src/H5FDprivate.h | 2 -- src/H5Fint.c | 2 -- 3 files changed, 92 deletions(-) diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index 3fe6ed9..08b1a3f 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -56,20 +56,6 @@ static char H5FD_mpi_native_g[] = "native"; * library to determine whether the file is empty, truncated, or okay. The MPIO * driver doesn't bother to keep it updated since it's an expensive operation. */ -#if 0 /* original version */ /* JRM */ -typedef struct H5FD_mpio_t { - H5FD_t pub; /*public stuff, must be first */ - MPI_File f; /*MPIO file handle */ - MPI_Comm comm; /*communicator */ - MPI_Info info; /*file information */ - int mpi_rank; /* This process's rank */ - int mpi_size; /* Total number of processes */ - haddr_t eof; /*end-of-file marker */ - haddr_t eoa; /*end-of-address marker */ - haddr_t last_eoa; /* Last known end-of-address marker */ - haddr_t local_eof; /* Local end-of-file address for each process */ -} H5FD_mpio_t; -#else /* modified version */ /* JRM */ typedef struct H5FD_mpio_t { H5FD_t pub; /*public stuff, must be first */ MPI_File f; /*MPIO file handle */ @@ -88,7 +74,6 @@ typedef struct H5FD_mpio_t { /* as soon as be make the necessary */ /* VFD API change. */ } H5FD_mpio_t; -#endif /* modified version */ /* JRM */ /* Private Prototypes */ @@ -1060,10 +1045,8 @@ H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id, file->eof = H5FD_mpi_MPIOff_to_haddr(size); file->local_eof = file->eof; -#if 1 /* JRM */ /* Mark initial barriers in H5FD_mpio_truncate() as necessary */ file->do_pre_trunc_barrier = TRUE; -#endif /* JRM */ /* Set return value */ ret_value=(H5FD_t*)file; @@ -1956,75 +1939,6 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5FD_mpio_flush() */ -#if 0 /* original version */ - -/*------------------------------------------------------------------------- - * Function: H5FD_mpio_truncate - * - * Purpose: Make certain the file's size matches it's allocated size - * - * Return: Success: Non-negative - * Failure: Negative - * - * Programmer: Quincey Koziol - * January 31, 2008 - * - *------------------------------------------------------------------------- - */ -static herr_t -H5FD_mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing) -{ - H5FD_mpio_t *file = (H5FD_mpio_t*)_file; - herr_t ret_value = SUCCEED; - - FUNC_ENTER_NOAPI_NOINIT - -#ifdef H5FDmpio_DEBUG - if(H5FD_mpio_Debug[(int)'t']) - HDfprintf(stdout, "Entering %s\n", FUNC); -#endif - HDassert(file); - HDassert(H5FD_MPIO == file->pub.driver_id); - - /* Extend the file to make sure it's large enough, then sync. - * Unfortunately, keeping track of EOF is an expensive operation, so - * we can't just check whether EOFeoa > file->last_eoa) { - int mpi_code; /* mpi return code */ - MPI_Offset mpi_off; - - if(H5FD_mpi_haddr_to_MPIOff(file->eoa, &mpi_off) < 0) - HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset") - - /* Extend the file's size */ - if(MPI_SUCCESS != (mpi_code = MPI_File_set_size(file->f, mpi_off))) - HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code) - - /* Don't let any proc return until all have extended the file. - * (Prevents race condition where some processes go ahead and write - * more data to the file before all the processes have finished making - * it the shorter length, potentially truncating the file and dropping - * the new data written) - */ - if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) - - /* Update the 'last' eoa value */ - file->last_eoa = file->eoa; - } /* end if */ - -done: -#ifdef H5FDmpio_DEBUG - if(H5FD_mpio_Debug[(int)'t']) - HDfprintf(stdout, "Leaving %s\n", FUNC); -#endif - - FUNC_LEAVE_NOAPI(ret_value) -} /* end H5FD_mpio_truncate() */ - -#else /* modified versin */ - /*------------------------------------------------------------------------- * Function: H5FD_mark_pre_trunc_barrier_unecessary @@ -2192,8 +2106,6 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5FD_mpio_truncate() */ -#endif /* modified version */ - /*------------------------------------------------------------------------- * Function: H5FD_mpio_mpi_rank diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h index f7be327..f7cd931 100644 --- a/src/H5FDprivate.h +++ b/src/H5FDprivate.h @@ -203,9 +203,7 @@ H5_DLL int H5FD_mpi_get_rank(const H5FD_t *file); H5_DLL int H5FD_mpi_get_size(const H5FD_t *file); H5_DLL MPI_Comm H5FD_mpi_get_comm(const H5FD_t *_file); H5_DLL herr_t H5FD_get_mpi_info(H5FD_t *file, void** file_info); -#if 1 /* JRM */ H5_DLL void H5FD_mpio_mark_pre_trunc_barrier_unecessary(H5FD_t *_file); -#endif /* JRM */ #endif /* H5_HAVE_PARALLEL */ #endif /* !_H5FDprivate_H */ diff --git a/src/H5Fint.c b/src/H5Fint.c index 135d878..cc5931a 100644 --- a/src/H5Fint.c +++ b/src/H5Fint.c @@ -1788,7 +1788,6 @@ H5F__flush_phase2(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t closi HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush metadata cache") /* Truncate the file to the current allocated size */ -#if 1 /* JRM */ #ifdef H5_HAVE_PARALLEL if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { /* Since we just returned from a call to H5AC_flush(), we just @@ -1798,7 +1797,6 @@ H5F__flush_phase2(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t closi H5FD_mpio_mark_pre_trunc_barrier_unecessary(f->shared->lf); } #endif /* H5_HAVE_PARALLEL */ -#endif /* JRM */ if(H5FD_truncate(f->shared->lf, meta_dxpl_id, closing) < 0) /* Push error, but keep going*/ HDONE_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "low level truncate failed") -- cgit v0.12