From 8aa520584463a5151699dc5768bcdef39f5564a6 Mon Sep 17 00:00:00 2001 From: "M. Scot Breitenfeld" Date: Mon, 18 Dec 2017 13:42:09 -0600 Subject: Optimized version of avoid truncate patch. --- src/H5FDmpio.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/H5FDprivate.h | 3 + src/H5Fint.c | 11 +++ 3 files changed, 212 insertions(+) diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index f594d8e..3fe6ed9 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -56,6 +56,7 @@ static char H5FD_mpi_native_g[] = "native"; * library to determine whether the file is empty, truncated, or okay. The MPIO * driver doesn't bother to keep it updated since it's an expensive operation. */ +#if 0 /* original version */ /* JRM */ typedef struct H5FD_mpio_t { H5FD_t pub; /*public stuff, must be first */ MPI_File f; /*MPIO file handle */ @@ -68,6 +69,26 @@ typedef struct H5FD_mpio_t { haddr_t last_eoa; /* Last known end-of-address marker */ haddr_t local_eof; /* Local end-of-file address for each process */ } H5FD_mpio_t; +#else /* modified version */ /* JRM */ +typedef struct H5FD_mpio_t { + H5FD_t pub; /*public stuff, must be first */ + MPI_File f; /*MPIO file handle */ + MPI_Comm comm; /*communicator */ + MPI_Info info; /*file information */ + int mpi_rank; /* This process's rank */ + int mpi_size; /* Total number of processes */ + haddr_t eof; /*end-of-file marker */ + haddr_t eoa; /*end-of-address marker */ + haddr_t last_eoa; /* Last known end-of-address marker */ + haddr_t local_eof; /* Local end-of-file address for each process */ + herr_t do_pre_trunc_barrier; /* hack to allow us to skip */ + /* unnecessary barriers in */ + /* H5FD_mpio_trucate() without a VFD */ + /* API change. This should be removed */ + /* as soon as be make the necessary */ + /* VFD API change. */ +} H5FD_mpio_t; +#endif /* modified version */ /* JRM */ /* Private Prototypes */ @@ -1039,6 +1060,11 @@ H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id, file->eof = H5FD_mpi_MPIOff_to_haddr(size); file->local_eof = file->eof; +#if 1 /* JRM */ + /* Mark initial barriers in H5FD_mpio_truncate() as necessary */ + file->do_pre_trunc_barrier = TRUE; +#endif /* JRM */ + /* Set return value */ ret_value=(H5FD_t*)file; @@ -1930,6 +1956,7 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5FD_mpio_flush() */ +#if 0 /* original version */ /*------------------------------------------------------------------------- * Function: H5FD_mpio_truncate @@ -1996,6 +2023,177 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5FD_mpio_truncate() */ +#else /* modified versin */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mark_pre_trunc_barrier_unecessary + * + * Purpose: Hack to allow us to avoid most unnecessary barriers + * prior in H5FD_mpio_truncate(). + * + * This function should be deleted when next we modify the + * VFD interface. This change should allow us to tell the + * truncate function to omit the initial barrier if no + * file I/O has occurred since the last barrier. + * + * Return: void + * + * + * Programmer: John Mainzer + * 12/14/17 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +void +H5FD_mpio_mark_pre_trunc_barrier_unecessary(H5FD_t *_file) +{ + H5FD_mpio_t *file = (H5FD_mpio_t*)_file; + + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(file); + HDassert(H5FD_MPIO == file->pub.driver_id); + + file->do_pre_trunc_barrier = FALSE; + + FUNC_LEAVE_NOAPI_VOID + +} /* end H5FD_mpio_mark_pre_trunc_barrier_unecessary() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpio_truncate + * + * Purpose: Make certain the file's size matches it's allocated size + * + * This is a little sticky in the mpio case, as it is not + * easy for us to track the current EOF by extracting it from + * write calls. + * + * Instead, we first check to see if the eoa has changed since + * the last call to this function. If it has, we call + * MPI_File_get_size() to determine the current EOF, and + * only call MPI_File_set_size() if this value disagrees + * with the current eoa. + * + * Return: Success: Non-negative + * Failure: Negative + * + * Programmer: Quincey Koziol + * January 31, 2008 + * + * Changes: Heavily reworked to avoid unnecessary MPI_File_set_size() + * calls. The hope is that these calls are superfluous in the + * typical case, allowing us to avoid truncates most of the + * time. + * + * The basic idea is to query the file system to get the + * current eof, and only truncate if the file systems + * conception of the eof disagrees with our eoa. + * + * JRM -- 10/27/17 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD_mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing) +{ + H5FD_mpio_t *file = (H5FD_mpio_t*)_file; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT + +#ifdef H5FDmpio_DEBUG + if(H5FD_mpio_Debug[(int)'t']) + HDfprintf(stdout, "Entering %s\n", FUNC); +#endif + HDassert(file); + HDassert(H5FD_MPIO == file->pub.driver_id); + + if ( !H5F_addr_eq(file->eoa, file->last_eoa) ) { + + int mpi_code; /* mpi return code */ + MPI_Offset size; + MPI_Offset needed_eof; + + /* In principle, it is possible for the size returned by the + * call to MPI_File_get_size() to depend on whether writes from + * all proceeses have completed at the time process 0 makes the + * call. + * + * In practice, most (all?) truncate calls will come after a barrier + * and with no interviening writes to the file (with the possible + * exception of sueprblock / superblock extension message updates). + * + * Unfortunately, the current VFD API doesn't let us pass in a + * flag indicating whether this particular call is unnecessary. + * To work around this, I have added the new function + * H5FD_mpio_mark_pre_trunc_barrier_unecessary() allow us to + * set a flag in H5FD_mpio_t indicating that we can skip the + * barrier. + * + * This is a pretty ugly hack, but until we revise the VFD API, + * it is about the best we can do. + */ + if (file->do_pre_trunc_barrier) { + if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed(1)", mpi_code) + } + + /* Only processor p0 will get the filesize and broadcast it. */ + if (file->mpi_rank == 0) { + if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(file->f, &size))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code) + } /* end if */ + + /* Broadcast file size */ + if(MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), + MPI_BYTE, 0, file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code) + + if(H5FD_mpi_haddr_to_MPIOff(file->eoa, &needed_eof) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, \ + "cannot convert from haddr_t to MPI_Offset") + + if (size != needed_eof) /* eoa != eof. Set eof to eoa */ { + + /* Extend the file's size */ + if(MPI_SUCCESS != (mpi_code=MPI_File_set_size(file->f, needed_eof))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code) + + /* In general, we must wait until all processes have finished + * the truncate before any process can continue, since it is + * possible that a process would write at the end of the + * file, and this write would be discarded by the truncate. + * + * While this is an issue for a user initiated flush, it may + * not be an issue at file close. If so, we may be able to + * optimize out the following barrier in that case. + */ + if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) + } + + /* Update the 'last' eoa value */ + file->last_eoa = file->eoa; + } /* end if */ + +done: + file->do_pre_trunc_barrier = TRUE; + +#ifdef H5FDmpio_DEBUG + if(H5FD_mpio_Debug[(int)'t']) + HDfprintf(stdout, "Leaving %s\n", FUNC); +#endif + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD_mpio_truncate() */ + +#endif /* modified version */ + /*------------------------------------------------------------------------- * Function: H5FD_mpio_mpi_rank diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h index e758951..f7be327 100644 --- a/src/H5FDprivate.h +++ b/src/H5FDprivate.h @@ -203,6 +203,9 @@ H5_DLL int H5FD_mpi_get_rank(const H5FD_t *file); H5_DLL int H5FD_mpi_get_size(const H5FD_t *file); H5_DLL MPI_Comm H5FD_mpi_get_comm(const H5FD_t *_file); H5_DLL herr_t H5FD_get_mpi_info(H5FD_t *file, void** file_info); +#if 1 /* JRM */ +H5_DLL void H5FD_mpio_mark_pre_trunc_barrier_unecessary(H5FD_t *_file); +#endif /* JRM */ #endif /* H5_HAVE_PARALLEL */ #endif /* !_H5FDprivate_H */ diff --git a/src/H5Fint.c b/src/H5Fint.c index 8212eb5..25df964 100644 --- a/src/H5Fint.c +++ b/src/H5Fint.c @@ -1532,6 +1532,17 @@ H5F__flush_phase2(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t closi HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush metadata cache") /* Truncate the file to the current allocated size */ +#if 1 /* JRM */ +#ifdef H5_HAVE_PARALLEL + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { + /* Since we just returned from a call to H5AC_flush(), we just + * passed through a barrier. Hence we can skip the barrier on + * entry to the mpio file driver call below. + */ + H5FD_mpio_mark_pre_trunc_barrier_unecessary(f->shared->lf); + } +#endif /* H5_HAVE_PARALLEL */ +#endif /* JRM */ if(H5FD_truncate(f->shared->lf, meta_dxpl_id, closing) < 0) /* Push error, but keep going*/ HDONE_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "low level truncate failed") -- cgit v0.12