summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/H5B.c4
-rw-r--r--src/H5F.c15
-rw-r--r--src/H5Flow.c9
-rw-r--r--src/H5Fmpio.c81
-rw-r--r--src/H5Fmpioprivate.h25
-rw-r--r--src/H5Fprivate.h2
-rw-r--r--src/H5Gnode.c4
-rw-r--r--src/H5HL.c10
-rw-r--r--src/H5O.c8
9 files changed, 155 insertions, 3 deletions
diff --git a/src/H5B.c b/src/H5B.c
index 7491ed2..aeb0879 100644
--- a/src/H5B.c
+++ b/src/H5B.c
@@ -383,6 +383,7 @@ H5B_load(H5F_t *f, const haddr_t *addr, const void *_type, void *udata)
* Jun 23 1997
*
* Modifications:
+ * rky 980828 Only p0 writes metadata to disk.
*
*-------------------------------------------------------------------------
*/
@@ -453,6 +454,9 @@ H5B_flush(H5F_t *f, hbool_t destroy, const haddr_t *addr, H5B_t *bt)
* bother writing data for the child entries that don't exist or
* for the final unchanged children.
*/
+#ifdef HAVE_PARALLEL
+ H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 will write */
+#endif /* HAVE_PARALLEL */
if (H5F_block_write(f, addr, (hsize_t)size, H5D_XFER_DFLT,
bt->page) < 0) {
HRETURN_ERROR(H5E_BTREE, H5E_CANTFLUSH, FAIL,
diff --git a/src/H5F.c b/src/H5F.c
index b0fdf4e..8191747 100644
--- a/src/H5F.c
+++ b/src/H5F.c
@@ -47,6 +47,7 @@ static char RcsId[] = "@(#)$Revision$";
#include <H5MMprivate.h> /*core memory management */
#include <H5Pprivate.h> /*property lists */
#include <H5Tprivate.h> /*data types */
+#include <H5Fmpioprivate.h> /*MPI-IO parallel stuff */
#include <ctype.h>
#include <sys/types.h>
@@ -138,6 +139,16 @@ H5F_init_interface(void)
interface_initialize_g = TRUE;
FUNC_ENTER(H5F_init_interface, FAIL);
+#ifdef HAVE_PARALLEL
+ {
+ /* Let only p0 write metadata, skipping the redundant writes by other procs? */
+ const char *s = getenv ("HDF5_MPI_1_METAWRITE");
+ if (s && isdigit(*s)) {
+ H5_mpi_1_metawrite_g = (int)HDstrtol (s, NULL, 0);
+ }
+ }
+#endif
+
/* Initialize the atom group for the file IDs */
if (H5I_init_group(H5_FILE, H5I_FILEID_HASHSIZE, 0,
(herr_t (*)(void*))H5F_close)<0 ||
@@ -1416,6 +1427,7 @@ H5Fflush(hid_t object_id)
* Aug 29 1997
*
* Modifications:
+ * rky 980828 Only p0 writes metadata to disk.
*
*-------------------------------------------------------------------------
*/
@@ -1481,6 +1493,9 @@ H5F_flush(H5F_t *f, hbool_t invalidate)
}
/* write the boot block to disk */
+#ifdef HAVE_PARALLEL
+ H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 will write */
+#endif
if (H5F_low_write(f->shared->lf, f->shared->access_parms,
H5D_XFER_DFLT,
&(f->shared->boot_addr), (size_t)(p-buf), buf)<0) {
diff --git a/src/H5Flow.c b/src/H5Flow.c
index eec5714..bd1661a 100644
--- a/src/H5Flow.c
+++ b/src/H5Flow.c
@@ -368,6 +368,7 @@ H5F_low_write(H5F_low_t *lf, const H5F_access_t *access_parms,
* Monday, November 10, 1997
*
* Modifications:
+ * rky 980828 Only p0 writes metadata to disk.
*
*-------------------------------------------------------------------------
*/
@@ -382,14 +383,18 @@ H5F_low_flush(H5F_low_t *lf, const H5F_access_t *access_parms)
assert(lf && lf->type);
/* Make sure the last block of the file has been allocated on disk */
+ /* rky 980828 NOTE
+ * Is this really necessary? Could this be eliminated for MPI-IO files? */
H5F_addr_reset(&last_byte);
if (addr_defined(&(lf->eof)) && H5F_addr_gt(&(lf->eof), &last_byte)) {
last_byte = lf->eof;
last_byte.offset -= 1;
if (H5F_low_read(lf, access_parms, H5D_XFER_DFLT, &last_byte,
1, buf) >= 0) {
- H5F_low_write(lf, access_parms, H5D_XFER_DFLT, &last_byte,
- 1, buf);
+#ifdef HAVE_PARALLEL
+ H5F_mpio_tas_allsame( lf, TRUE ); /* only p0 will write */
+#endif /* HAVE_PARALLEL */
+ H5F_low_write(lf, access_parms, H5D_XFER_DFLT, &last_byte, 1, buf);
}
}
/* Invoke the subclass the flush method */
diff --git a/src/H5Fmpio.c b/src/H5Fmpio.c
index 50e7ddc..7214d6b 100644
--- a/src/H5Fmpio.c
+++ b/src/H5Fmpio.c
@@ -69,6 +69,10 @@
static hbool_t interface_initialize_g = FALSE; /* rky??? */
#define INTERFACE_INIT NULL
+/* Global var to allow elimination of redundant metadata writes
+ * to be controlled by the value of an environment variable. */
+hbool_t H5_mpi_1_metawrite_g = FALSE;
+
#define H5F_MPIO_DEV 0xfffe /*pseudo dev for MPI-IO until we fix things */
/* Make sure this differs from H5F_CORE_DEV */
@@ -285,6 +289,7 @@ H5F_mpio_access(const char *name, const H5F_access_t *access_parms, int mode,
* rky, 11 Jun 1998
* Added H5F_mpio_Debug debug flags controlled by MPI_Info.
*
+ * rky 980828 Init flag controlling redundant metadata writes to disk.
*-------------------------------------------------------------------------
*/
static H5F_low_t *
@@ -351,6 +356,7 @@ H5F_mpio_open(const char *name, const H5F_access_t *access_parms, uintn flags,
"memory allocation failed");
}
lf->u.mpio.f = fh;
+ H5F_mpio_tas_allsame( lf, FALSE ); /* initialize */
H5F_addr_reset(&(lf->eof));
mpierr = MPI_File_get_size( fh, &size );
if (MPI_SUCCESS != mpierr) {
@@ -627,6 +633,45 @@ H5F_mpio_read(H5F_low_t *lf, H5F_access_t *access_parms,
} /* H5F_mpio_read */
/*-------------------------------------------------------------------------
+ * Function: H5F_mpio_tas_allsame
+ *
+ * Purpose: Test and set the allsame parameter.
+ *
+ * Errors:
+ *
+ * Return: Success: the old value of the allsame flag
+ *
+ * Failure: assert fails if lf is NULL.
+ *
+ * Programmer: rky 980828
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+hbool_t
+H5F_mpio_tas_allsame(H5F_low_t *lf, hbool_t newval )
+{
+ hbool_t oldval; /* value of the allsame flag before this call; returned to the caller */
+
+ FUNC_ENTER(H5F_mpio_tas_allsame, FALSE);
+#ifdef H5Fmpio_DEBUG
+ if (H5F_mpio_Debug[(int)'t']) /* entry trace is printed only when the 't' debug entry is set */
+ fprintf(stdout, "Entering H5F_mpio_tas_allsame, newval=%d\n", newval );
+#endif
+
+ assert(lf); /* the only argument check made here: lf must be non-NULL */
+ oldval = lf->u.mpio.allsame; /* "test": remember the current flag value */
+ lf->u.mpio.allsame = newval; /* "set": store the caller's new value */
+
+#ifdef H5Fmpio_DEBUG
+ if (H5F_mpio_Debug[(int)'t']) /* exit trace, gated by the same 't' debug entry */
+ fprintf(stdout, "Leaving H5F_mpio_tas_allsame, oldval=%d\n", oldval );
+#endif
+ FUNC_LEAVE(oldval); /* hand back the previous flag value */
+}
+
+/*-------------------------------------------------------------------------
* Function: H5F_mpio_write
*
* Purpose: Depending on a field in access params, either:
@@ -637,6 +682,19 @@ H5F_mpio_read(H5F_low_t *lf, H5F_access_t *access_parms,
* This can allow MPI to coalesce requests from
* different processes (collective or independent).
*
+ * rky 980828
+ * If the allsame flag is set, we assume that all the procs
+ * in the relevant MPI communicator will write identical data
+ * at identical offsets in the file, so only proc 0 will write,
+ * and all other procs will wait for p0 to finish.
+ * This is useful for writing metadata, for example.
+ * Note that we don't _check_ that the data is identical.
+ * Also, the mechanism we use to eliminate the redundant writes
+ * is by requiring a call to H5F_mpio_tas_allsame before the write,
+ * which is rather klugey.
+ * Would it be better to pass a parameter to low-level writes
+ * like H5F_block_write and H5F_low_write, instead? Or...???
+ *
* Errors:
* IO WRITEERROR MPI_File_write_at failed.
*
@@ -666,6 +724,9 @@ H5F_mpio_read(H5F_low_t *lf, H5F_access_t *access_parms,
* The guts of H5F_mpio_read and H5F_mpio_write
* should be replaced by a single dual-purpose routine.
*
+ * rky, 980828
+ * Added allsame flag (set via H5F_mpio_tas_allsame) to make all but proc 0 skip the actual write.
+ *
*-------------------------------------------------------------------------
*/
static herr_t
@@ -676,9 +737,10 @@ H5F_mpio_write(H5F_low_t *lf, H5F_access_t *access_parms,
MPI_Offset mpi_off, mpi_disp;
MPI_Status mpi_stat;
MPI_Datatype buf_type, file_type;
- int mpierr, msglen, size_i, bytes_written;
+ int mpierr, msglen, size_i, bytes_written, mpi_rank;
int use_types_this_time, used_types_last_time;
char mpierrmsg[MPI_MAX_ERROR_STRING];
+ hbool_t allsame;
FUNC_ENTER(H5F_mpio_write, FAIL);
#ifdef H5Fmpio_DEBUG
@@ -707,6 +769,22 @@ H5F_mpio_write(H5F_low_t *lf, H5F_access_t *access_parms,
mpi_off, size_i );
#endif
+ /* Only p0 will do the actual write if all procs in comm write same data */
+ allsame = H5F_mpio_tas_allsame( lf, FALSE );
+ if (allsame && H5_mpi_1_metawrite_g) {
+ mpierr = MPI_Comm_rank( access_parms->u.mpio.comm, &mpi_rank );
+ if (mpierr != MPI_SUCCESS)
+ HRETURN_ERROR(H5E_IO, H5E_MPI, FAIL, "MPI_Comm_rank failed" );
+ if (mpi_rank != 0) {
+#ifdef H5Fmpio_DEBUG
+ if (H5F_mpio_Debug[(int)'w']) {
+ fprintf(stdout, " in H5F_mpio_write (write omitted)\n" );
+ }
+#endif
+ goto done; /* skip the actual write */
+ }
+ }
+
/* Set up for a fancy xfer using complex types, or single byte block.
* We wouldn't need to rely on the use_types field
* if MPI semantics allowed us to test that btype=ftype=MPI_BYTE
@@ -797,6 +875,7 @@ H5F_mpio_write(H5F_low_t *lf, H5F_access_t *access_parms,
"MPI_Get_count returned invalid count" );
}
+ done:
#ifdef H5Fmpio_DEBUG
if (H5F_mpio_Debug[(int)'t'])
fprintf(stdout, "Leaving H5F_mpio_write\n" );
diff --git a/src/H5Fmpioprivate.h b/src/H5Fmpioprivate.h
new file mode 100644
index 0000000..a51eb36
--- /dev/null
+++ b/src/H5Fmpioprivate.h
@@ -0,0 +1,25 @@
+/****************************************************************************
+ * NCSA HDF *
+ * Software Development Group *
+ * National Center for Supercomputing Applications *
+ * University of Illinois at Urbana-Champaign *
+ * 605 E. Springfield, Champaign IL 61820 *
+ * *
+ * For conditions of distribution and use, see the accompanying *
+ * hdf/COPYING file. *
+ * *
+ ****************************************************************************/
+
+/*
+ * This file contains macros & information for MPI-IO file access
+ */
+
+#ifndef _H5Fmpioprivate_H
+#define _H5Fmpioprivate_H
+
+#ifdef HAVE_PARALLEL
+extern hbool_t H5_mpi_1_metawrite_g; /* defined in H5Fmpio.c; set from the HDF5_MPI_1_METAWRITE environment variable in H5F_init_interface; when nonzero (and allsame is set) only p0 performs the write */
+hbool_t H5F_mpio_tas_allsame(H5F_low_t *lf, hbool_t newval ); /* stores newval in lf's allsame flag and returns the flag's previous value */
+#endif /* HAVE_PARALLEL */
+
+#endif
diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h
index 396b448..5e60efe 100644
--- a/src/H5Fprivate.h
+++ b/src/H5Fprivate.h
@@ -392,6 +392,8 @@ typedef struct H5F_low_t {
/* MPI-IO */
struct {
MPI_File f; /* MPI-IO file handle */
+ hbool_t allsame;/* all procs should write same data, *
+ * so only p0 will do the actual write */
} mpio;
#endif
diff --git a/src/H5Gnode.c b/src/H5Gnode.c
index 842e6a0..3c8005e 100644
--- a/src/H5Gnode.c
+++ b/src/H5Gnode.c
@@ -296,6 +296,7 @@ H5G_node_create(H5F_t *f, H5B_ins_t __unused__ op, void *_lt_key,
* Jun 23 1997
*
* Modifications:
+ * rky 980828 Only p0 writes metadata to disk.
*
*-------------------------------------------------------------------------
*/
@@ -351,6 +352,9 @@ H5G_node_flush(H5F_t *f, hbool_t destroy, const haddr_t *addr,
H5G_ent_encode_vec(f, &p, sym->entry, sym->nsyms);
HDmemset(p, 0, size - (p - buf));
+#ifdef HAVE_PARALLEL
+ H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 will write */
+#endif /* HAVE_PARALLEL */
status = H5F_block_write(f, addr, (hsize_t)size, H5D_XFER_DFLT, buf);
buf = H5MM_xfree(buf);
if (status < 0)
diff --git a/src/H5HL.c b/src/H5HL.c
index a960c80..9e157b0 100644
--- a/src/H5HL.c
+++ b/src/H5HL.c
@@ -305,6 +305,7 @@ H5HL_load(H5F_t *f, const haddr_t *addr, const void __unused__ *udata1,
* Jul 17 1997
*
* Modifications:
+ * rky 980828 Only p0 writes metadata to disk.
*
*-------------------------------------------------------------------------
*/
@@ -376,6 +377,9 @@ H5HL_flush(H5F_t *f, hbool_t destroy, const haddr_t *addr, H5HL_t *heap)
H5F_addr_inc(&hdr_end_addr, (hsize_t)H5HL_SIZEOF_HDR(f));
if (H5F_addr_eq(&(heap->addr), &hdr_end_addr)) {
/* The header and data are contiguous */
+#ifdef HAVE_PARALLEL
+ H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 writes */
+#endif /* HAVE_PARALLEL */
if (H5F_block_write(f, addr,
(hsize_t)(H5HL_SIZEOF_HDR(f)+heap->disk_alloc),
H5D_XFER_DFLT, heap->chunk) < 0) {
@@ -383,11 +387,17 @@ H5HL_flush(H5F_t *f, hbool_t destroy, const haddr_t *addr, H5HL_t *heap)
"unable to write heap header and data to file");
}
} else {
+#ifdef HAVE_PARALLEL
+ H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 writes */
+#endif /* HAVE_PARALLEL */
if (H5F_block_write(f, addr, (hsize_t)H5HL_SIZEOF_HDR(f),
H5D_XFER_DFLT, heap->chunk)<0) {
HRETURN_ERROR(H5E_HEAP, H5E_WRITEERROR, FAIL,
"unable to write heap header to file");
}
+#ifdef HAVE_PARALLEL
+ H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 writes */
+#endif /* HAVE_PARALLEL */
if (H5F_block_write(f, &(heap->addr), (hsize_t)(heap->disk_alloc),
H5D_XFER_DFLT,
heap->chunk + H5HL_SIZEOF_HDR(f)) < 0) {
diff --git a/src/H5O.c b/src/H5O.c
index addd14b..322862c 100644
--- a/src/H5O.c
+++ b/src/H5O.c
@@ -506,6 +506,8 @@ H5O_load(H5F_t *f, const haddr_t *addr, const void __unused__ *_udata1,
* Robb Matzke, 7 Jan 1998
* Handles constant vs non-constant messages.
*
+ * rky 980828 Only p0 writes metadata to disk.
+ *
*-------------------------------------------------------------------------
*/
static herr_t
@@ -546,6 +548,9 @@ H5O_flush(H5F_t *f, hbool_t destroy, const haddr_t *addr, H5O_t *oh)
HDmemset (p, 0, H5O_SIZEOF_HDR(f)-12);
/* write the object header header */
+#ifdef HAVE_PARALLEL
+ H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 will write */
+#endif /* HAVE_PARALLEL */
if (H5F_block_write(f, addr, (hsize_t)H5O_SIZEOF_HDR(f),
H5D_XFER_DFLT, buf) < 0) {
HRETURN_ERROR(H5E_OHDR, H5E_WRITEERROR, FAIL,
@@ -618,6 +623,9 @@ H5O_flush(H5F_t *f, hbool_t destroy, const haddr_t *addr, H5O_t *oh)
for (i = 0; i < oh->nchunks; i++) {
if (oh->chunk[i].dirty) {
assert(H5F_addr_defined(&(oh->chunk[i].addr)));
+#ifdef HAVE_PARALLEL
+ H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 write */
+#endif /* HAVE_PARALLEL */
if (H5F_block_write(f, &(oh->chunk[i].addr),
(hsize_t)(oh->chunk[i].size), H5D_XFER_DFLT,
oh->chunk[i].image) < 0) {