summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorM. Scot Breitenfeld <brtnfld@hdfgroup.org>2017-12-18 19:42:09 (GMT)
committerM. Scot Breitenfeld <brtnfld@hdfgroup.org>2017-12-18 19:42:09 (GMT)
commit8aa520584463a5151699dc5768bcdef39f5564a6 (patch)
tree5c905b6022d0d3286f957fb78710e58335f3e2c7 /src
parent758b9667b3e07a7562107f19f7dabee027e5bd53 (diff)
downloadhdf5-8aa520584463a5151699dc5768bcdef39f5564a6.zip
hdf5-8aa520584463a5151699dc5768bcdef39f5564a6.tar.gz
hdf5-8aa520584463a5151699dc5768bcdef39f5564a6.tar.bz2
Optimized version of avoid truncate patch.
Diffstat (limited to 'src')
-rw-r--r--src/H5FDmpio.c198
-rw-r--r--src/H5FDprivate.h3
-rw-r--r--src/H5Fint.c11
3 files changed, 212 insertions, 0 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index f594d8e..3fe6ed9 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -56,6 +56,7 @@ static char H5FD_mpi_native_g[] = "native";
* library to determine whether the file is empty, truncated, or okay. The MPIO
* driver doesn't bother to keep it updated since it's an expensive operation.
*/
+#if 0 /* original version */ /* JRM */
typedef struct H5FD_mpio_t {
H5FD_t pub; /*public stuff, must be first */
MPI_File f; /*MPIO file handle */
@@ -68,6 +69,26 @@ typedef struct H5FD_mpio_t {
haddr_t last_eoa; /* Last known end-of-address marker */
haddr_t local_eof; /* Local end-of-file address for each process */
} H5FD_mpio_t;
+#else /* modified version */ /* JRM */
+/*
+ * Per-file state of the MPIO driver (modified version). Carries the
+ * do_pre_trunc_barrier flag consulted by H5FD_mpio_truncate().
+ */
+typedef struct H5FD_mpio_t {
+ H5FD_t pub; /*public stuff, must be first */
+ MPI_File f; /*MPIO file handle */
+ MPI_Comm comm; /*communicator */
+ MPI_Info info; /*file information */
+ int mpi_rank; /* This process's rank */
+ int mpi_size; /* Total number of processes */
+ haddr_t eof; /*end-of-file marker */
+ haddr_t eoa; /*end-of-address marker */
+ haddr_t last_eoa; /* Last known end-of-address marker */
+ haddr_t local_eof; /* Local end-of-file address for each process */
+ hbool_t do_pre_trunc_barrier; /* Boolean flag (only ever TRUE or */
+ /* FALSE): hack to allow us to skip */
+ /* unnecessary barriers in */
+ /* H5FD_mpio_truncate() without a VFD */
+ /* API change. This should be removed */
+ /* as soon as we make the necessary */
+ /* VFD API change. */
+} H5FD_mpio_t;
+#endif /* modified version */ /* JRM */
/* Private Prototypes */
@@ -1039,6 +1060,11 @@ H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id,
file->eof = H5FD_mpi_MPIOff_to_haddr(size);
file->local_eof = file->eof;
+#if 1 /* JRM */
+ /* Mark initial barriers in H5FD_mpio_truncate() as necessary */
+ file->do_pre_trunc_barrier = TRUE;
+#endif /* JRM */
+
/* Set return value */
ret_value=(H5FD_t*)file;
@@ -1930,6 +1956,7 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpio_flush() */
+#if 0 /* original version */
/*-------------------------------------------------------------------------
* Function: H5FD_mpio_truncate
@@ -1996,6 +2023,177 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpio_truncate() */
+#else /* modified version */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_mark_pre_trunc_barrier_unecessary
+ *
+ * Purpose: Hack to allow us to avoid most unnecessary barriers
+ * on entry to H5FD_mpio_truncate().
+ *
+ * Sets the do_pre_trunc_barrier field of the supplied MPIO
+ * file to FALSE. H5FD_mpio_truncate() tests this field before
+ * issuing its initial MPI_Barrier() and sets it back to TRUE
+ * on exit, so a call to this function only affects the next
+ * call to H5FD_mpio_truncate() on the same file.
+ *
+ * This function should be deleted when next we modify the
+ * VFD interface. This change should allow us to tell the
+ * truncate function to omit the initial barrier if no
+ * file I/O has occurred since the last barrier.
+ *
+ * Parameters: _file - the file to mark; it is cast to H5FD_mpio_t * and
+ * asserted to be non-NULL and to have the H5FD_MPIO
+ * driver id.
+ *
+ * Return: void
+ *
+ *
+ * Programmer: John Mainzer
+ * 12/14/17
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+void
+H5FD_mpio_mark_pre_trunc_barrier_unecessary(H5FD_t *_file)
+{
+ H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
+
+ FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+ HDassert(file);
+ HDassert(H5FD_MPIO == file->pub.driver_id);
+
+ file->do_pre_trunc_barrier = FALSE;
+
+ FUNC_LEAVE_NOAPI_VOID
+
+} /* end H5FD_mpio_mark_pre_trunc_barrier_unecessary() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpio_truncate
+ *
+ * Purpose: Make certain the file's size matches its allocated size
+ *
+ * This is a little sticky in the mpio case, as it is not
+ * easy for us to track the current EOF by extracting it from
+ * write calls.
+ *
+ * Instead, we first check to see if the eoa has changed since
+ * the last call to this function. If it has, rank 0 calls
+ * MPI_File_get_size() to determine the current EOF and
+ * broadcasts it to the other ranks, and we only call
+ * MPI_File_set_size() if this value disagrees
+ * with the current eoa.
+ *
+ * The dxpl_id and closing parameters are unused.
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ * Programmer: Quincey Koziol
+ * January 31, 2008
+ *
+ * Changes: Heavily reworked to avoid unnecessary MPI_File_set_size()
+ * calls. The hope is that these calls are superfluous in the
+ * typical case, allowing us to avoid truncates most of the
+ * time.
+ *
+ * The basic idea is to query the file system to get the
+ * current eof, and only truncate if the file system's
+ * conception of the eof disagrees with our eoa.
+ *
+ * JRM -- 10/27/17
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing)
+{
+ H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
+ herr_t ret_value = SUCCEED;
+
+ FUNC_ENTER_NOAPI_NOINIT
+
+#ifdef H5FDmpio_DEBUG
+ if(H5FD_mpio_Debug[(int)'t'])
+ HDfprintf(stdout, "Entering %s\n", FUNC);
+#endif
+ HDassert(file);
+ HDassert(H5FD_MPIO == file->pub.driver_id);
+
+ if ( !H5F_addr_eq(file->eoa, file->last_eoa) ) {
+
+ int mpi_code; /* mpi return code */
+ MPI_Offset size;
+ MPI_Offset needed_eof;
+
+ /* In principle, it is possible for the size returned by the
+ * call to MPI_File_get_size() to depend on whether writes from
+ * all processes have completed at the time process 0 makes the
+ * call.
+ *
+ * In practice, most (all?) truncate calls will come after a barrier
+ * and with no intervening writes to the file (with the possible
+ * exception of superblock / superblock extension message updates).
+ *
+ * Unfortunately, the current VFD API doesn't let us pass in a
+ * flag indicating whether this particular barrier is unnecessary.
+ * To work around this, I have added the new function
+ * H5FD_mpio_mark_pre_trunc_barrier_unecessary() to allow us to
+ * set a flag in H5FD_mpio_t indicating that we can skip the
+ * barrier.
+ *
+ * This is a pretty ugly hack, but until we revise the VFD API,
+ * it is about the best we can do.
+ */
+ if (file->do_pre_trunc_barrier) {
+ if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed(1)", mpi_code)
+ }
+
+ /* Only processor p0 will get the filesize and broadcast it.
+ *
+ * NOTE(review): if MPI_File_get_size() fails on rank 0, the error
+ * macro presumably jumps to done before rank 0 reaches the
+ * MPI_Bcast() below, while the other ranks still enter it --
+ * confirm whether this can leave them blocked.
+ */
+ if (file->mpi_rank == 0) {
+ if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(file->f, &size)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code)
+ } /* end if */
+
+ /* Broadcast file size (sent as raw bytes of an MPI_Offset from rank 0) */
+ if(MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset),
+ MPI_BYTE, 0, file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+
+ if(H5FD_mpi_haddr_to_MPIOff(file->eoa, &needed_eof) < 0)
+ HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, \
+ "cannot convert from haddr_t to MPI_Offset")
+
+ if (size != needed_eof) /* eoa != eof. Set eof to eoa */ {
+
+ /* Set the file's size to the eoa (the reported size may be
+ * either smaller or larger than the eoa at this point)
+ */
+ if(MPI_SUCCESS != (mpi_code=MPI_File_set_size(file->f, needed_eof)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code)
+
+ /* In general, we must wait until all processes have finished
+ * the truncate before any process can continue, since it is
+ * possible that a process would write at the end of the
+ * file, and this write would be discarded by the truncate.
+ *
+ * While this is an issue for a user initiated flush, it may
+ * not be an issue at file close. If so, we may be able to
+ * optimize out the following barrier in that case.
+ */
+ if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
+ }
+
+ /* Update the 'last' eoa value */
+ file->last_eoa = file->eoa;
+ } /* end if */
+
+done:
+ file->do_pre_trunc_barrier = TRUE; /* re-arm the entry barrier for the next call */
+
+#ifdef H5FDmpio_DEBUG
+ if(H5FD_mpio_Debug[(int)'t'])
+ HDfprintf(stdout, "Leaving %s\n", FUNC);
+#endif
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_mpio_truncate() */
+
+#endif /* modified version */
+
/*-------------------------------------------------------------------------
* Function: H5FD_mpio_mpi_rank
diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h
index e758951..f7be327 100644
--- a/src/H5FDprivate.h
+++ b/src/H5FDprivate.h
@@ -203,6 +203,9 @@ H5_DLL int H5FD_mpi_get_rank(const H5FD_t *file);
H5_DLL int H5FD_mpi_get_size(const H5FD_t *file);
H5_DLL MPI_Comm H5FD_mpi_get_comm(const H5FD_t *_file);
H5_DLL herr_t H5FD_get_mpi_info(H5FD_t *file, void** file_info);
+#if 1 /* JRM */
+H5_DLL void H5FD_mpio_mark_pre_trunc_barrier_unecessary(H5FD_t *_file);
+#endif /* JRM */
#endif /* H5_HAVE_PARALLEL */
#endif /* !_H5FDprivate_H */
diff --git a/src/H5Fint.c b/src/H5Fint.c
index 8212eb5..25df964 100644
--- a/src/H5Fint.c
+++ b/src/H5Fint.c
@@ -1532,6 +1532,17 @@ H5F__flush_phase2(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t closi
HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "unable to flush metadata cache")
/* Truncate the file to the current allocated size */
+#if 1 /* JRM */
+#ifdef H5_HAVE_PARALLEL
+ if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
+ /* Since we just returned from a call to H5AC_flush(), we just
+ * passed through a barrier. Hence we can skip the barrier on
+ * entry to the mpio file driver call below.
+ */
+ H5FD_mpio_mark_pre_trunc_barrier_unecessary(f->shared->lf);
+ }
+#endif /* H5_HAVE_PARALLEL */
+#endif /* JRM */
if(H5FD_truncate(f->shared->lf, meta_dxpl_id, closing) < 0)
/* Push error, but keep going*/
HDONE_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "low level truncate failed")