From 1373661a78b7a6d3c7526cd8f80b9ba3460a1d74 Mon Sep 17 00:00:00 2001 From: Robert Kim Yates Date: Fri, 28 Aug 1998 18:37:58 -0500 Subject: [svn-r630] Added code to eliminate redundant writes of metadata, so only proc 0 writes it to disk. The elimination is activated only when the environment variable HDF5_MPI_1_METAWRITE is nonzero; otherwise all processes that opened the file will write the metadata to disk. --- src/H5B.c | 4 +++ src/H5F.c | 15 ++++++++++ src/H5Flow.c | 9 ++++-- src/H5Fmpio.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++- src/H5Fmpioprivate.h | 25 ++++++++++++++++ src/H5Fprivate.h | 2 ++ src/H5Gnode.c | 4 +++ src/H5HL.c | 10 +++++++ src/H5O.c | 8 ++++++ 9 files changed, 155 insertions(+), 3 deletions(-) create mode 100644 src/H5Fmpioprivate.h diff --git a/src/H5B.c b/src/H5B.c index 7491ed2..aeb0879 100644 --- a/src/H5B.c +++ b/src/H5B.c @@ -383,6 +383,7 @@ H5B_load(H5F_t *f, const haddr_t *addr, const void *_type, void *udata) * Jun 23 1997 * * Modifications: + * rky 980828 Only p0 writes metadata to disk. * *------------------------------------------------------------------------- */ @@ -453,6 +454,9 @@ H5B_flush(H5F_t *f, hbool_t destroy, const haddr_t *addr, H5B_t *bt) * bother writing data for the child entries that don't exist or * for the final unchanged children. 
*/ +#ifdef HAVE_PARALLEL + H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 will write */ +#endif /* HAVE_PARALLEL */ if (H5F_block_write(f, addr, (hsize_t)size, H5D_XFER_DFLT, bt->page) < 0) { HRETURN_ERROR(H5E_BTREE, H5E_CANTFLUSH, FAIL, diff --git a/src/H5F.c b/src/H5F.c index b0fdf4e..8191747 100644 --- a/src/H5F.c +++ b/src/H5F.c @@ -47,6 +47,7 @@ static char RcsId[] = "@(#)$Revision$"; #include <H5MMprivate.h> /*core memory management */ #include <H5Pprivate.h> /*property lists */ #include <H5Tprivate.h> /*data types */ +#include <H5Fmpioprivate.h> /*MPI-IO parallel stuff */ #include #include @@ -138,6 +139,16 @@ H5F_init_interface(void) interface_initialize_g = TRUE; FUNC_ENTER(H5F_init_interface, FAIL); +#ifdef HAVE_PARALLEL + { + /* Eliminate redundant metadata writes (only p0 writes)? */ + const char *s = getenv ("HDF5_MPI_1_METAWRITE"); + if (s && isdigit(*s)) { + H5_mpi_1_metawrite_g = (int)HDstrtol (s, NULL, 0); + } + } +#endif + /* Initialize the atom group for the file IDs */ if (H5I_init_group(H5_FILE, H5I_FILEID_HASHSIZE, 0, (herr_t (*)(void*))H5F_close)<0 || @@ -1416,6 +1427,7 @@ H5Fflush(hid_t object_id) * Aug 29 1997 * * Modifications: + * rky 980828 Only p0 writes metadata to disk. * *------------------------------------------------------------------------- */ @@ -1481,6 +1493,9 @@ H5F_flush(H5F_t *f, hbool_t invalidate) } /* write the boot block to disk */ +#ifdef HAVE_PARALLEL + H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 will write */ +#endif if (H5F_low_write(f->shared->lf, f->shared->access_parms, H5D_XFER_DFLT, &(f->shared->boot_addr), (size_t)(p-buf), buf)<0) { diff --git a/src/H5Flow.c b/src/H5Flow.c index eec5714..bd1661a 100644 --- a/src/H5Flow.c +++ b/src/H5Flow.c @@ -368,6 +368,7 @@ H5F_low_write(H5F_low_t *lf, const H5F_access_t *access_parms, * Monday, November 10, 1997 * * Modifications: + * rky 980828 Only p0 writes metadata to disk.
* *------------------------------------------------------------------------- */ @@ -382,14 +383,18 @@ H5F_low_flush(H5F_low_t *lf, const H5F_access_t *access_parms) assert(lf && lf->type); /* Make sure the last block of the file has been allocated on disk */ + /* rky 980828 NOTE + * Is this really necessary? Could this be eliminated for MPI-IO files? */ H5F_addr_reset(&last_byte); if (addr_defined(&(lf->eof)) && H5F_addr_gt(&(lf->eof), &last_byte)) { last_byte = lf->eof; last_byte.offset -= 1; if (H5F_low_read(lf, access_parms, H5D_XFER_DFLT, &last_byte, 1, buf) >= 0) { - H5F_low_write(lf, access_parms, H5D_XFER_DFLT, &last_byte, - 1, buf); +#ifdef HAVE_PARALLEL + H5F_mpio_tas_allsame( lf, TRUE ); /* only p0 will write */ +#endif /* HAVE_PARALLEL */ + H5F_low_write(lf, access_parms, H5D_XFER_DFLT, &last_byte, 1, buf); } } /* Invoke the subclass the flush method */ diff --git a/src/H5Fmpio.c b/src/H5Fmpio.c index 50e7ddc..7214d6b 100644 --- a/src/H5Fmpio.c +++ b/src/H5Fmpio.c @@ -69,6 +69,10 @@ static hbool_t interface_initialize_g = FALSE; /* rky??? */ #define INTERFACE_INIT NULL +/* Global var to allow elimination of redundant metadata writes + * to be controlled by the value of an environment variable. */ +hbool_t H5_mpi_1_metawrite_g = FALSE; + #define H5F_MPIO_DEV 0xfffe /*pseudo dev for MPI-IO until we fix things */ /* Make sure this differs from H5F_CORE_DEV */ @@ -285,6 +289,7 @@ H5F_mpio_access(const char *name, const H5F_access_t *access_parms, int mode, * rky, 11 Jun 1998 * Added H5F_mpio_Debug debug flags controlled by MPI_Info. * + * rky 980828 Init flag controlling redundant metadata writes to disk. 
*------------------------------------------------------------------------- */ static H5F_low_t * @@ -351,6 +356,7 @@ H5F_mpio_open(const char *name, const H5F_access_t *access_parms, uintn flags, "memory allocation failed"); } lf->u.mpio.f = fh; + H5F_mpio_tas_allsame( lf, FALSE ); /* initialize */ H5F_addr_reset(&(lf->eof)); mpierr = MPI_File_get_size( fh, &size ); if (MPI_SUCCESS != mpierr) { @@ -627,6 +633,45 @@ H5F_mpio_read(H5F_low_t *lf, H5F_access_t *access_parms, } /* H5F_mpio_read */ /*------------------------------------------------------------------------- + * Function: H5F_mpio_tas_allsame + * + * Purpose: Test and set the allsame parameter. + * + * Errors: + * + * Return: Success: the old value of the allsame flag + * + * Failure: assert fails if lf is NULL. + * + * Programmer: rky 980828 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +hbool_t +H5F_mpio_tas_allsame(H5F_low_t *lf, hbool_t newval ) +{ + hbool_t oldval; + + FUNC_ENTER(H5F_mpio_tas_allsame, FALSE); +#ifdef H5Fmpio_DEBUG + if (H5F_mpio_Debug[(int)'t']) + fprintf(stdout, "Entering H5F_mpio_tas_allsame, newval=%d\n", newval ); +#endif + + assert(lf); + oldval = lf->u.mpio.allsame; + lf->u.mpio.allsame = newval; + +#ifdef H5Fmpio_DEBUG + if (H5F_mpio_Debug[(int)'t']) + fprintf(stdout, "Leaving H5F_mpio_tas_allsame, oldval=%d\n", oldval ); +#endif + FUNC_LEAVE(oldval); +} + +/*------------------------------------------------------------------------- * Function: H5F_mpio_write * * Purpose: Depending on a field in access params, either: @@ -637,6 +682,19 @@ H5F_mpio_read(H5F_low_t *lf, H5F_access_t *access_parms, * This can allow MPI to coalesce requests from * different processes (collective or independent).
* + * rky 980828 + * If the allsame flag is set, we assume that all the procs + * in the relevant MPI communicator will write identical data + * at identical offsets in the file, so only proc 0 will write, + * and all other procs will wait for p0 to finish. + * This is useful for writing metadata, for example. + * Note that we don't _check_ that the data is identical. + * Also, the mechanism we use to eliminate the redundant writes + * is by requiring a call to H5F_mpio_tas_allsame before the write, + * which is rather klugey. + * Would it be better to pass a parameter to low-level writes + * like H5F_block_write and H5F_low_write, instead? Or...??? + * * Errors: * IO WRITEERROR MPI_File_write_at failed. * @@ -666,6 +724,9 @@ H5F_mpio_read(H5F_low_t *lf, H5F_access_t *access_parms, * The guts of H5F_mpio_read and H5F_mpio_write * should be replaced by a single dual-purpose routine. * + * rky, 980828 + * Added allsame flag to make all but proc 0 skip the actual write. + * *------------------------------------------------------------------------- */ static herr_t @@ -676,9 +737,10 @@ H5F_mpio_write(H5F_low_t *lf, H5F_access_t *access_parms, MPI_Offset mpi_off, mpi_disp; MPI_Status mpi_stat; MPI_Datatype buf_type, file_type; - int mpierr, msglen, size_i, bytes_written; + int mpierr, msglen, size_i, bytes_written, mpi_rank; int use_types_this_time, used_types_last_time; char mpierrmsg[MPI_MAX_ERROR_STRING]; + hbool_t allsame; FUNC_ENTER(H5F_mpio_write, FAIL); #ifdef H5Fmpio_DEBUG @@ -707,6 +769,22 @@ H5F_mpio_write(H5F_low_t *lf, H5F_access_t *access_parms, mpi_off, size_i ); #endif + /* Only p0 will do the actual write if all procs in comm write same data */ + allsame = H5F_mpio_tas_allsame( lf, FALSE ); + if (allsame && H5_mpi_1_metawrite_g) { + mpierr = MPI_Comm_rank( access_parms->u.mpio.comm, &mpi_rank ); + if (mpierr != MPI_SUCCESS) + HRETURN_ERROR(H5E_IO, H5E_MPI, FAIL, "MPI_Comm_rank failed" ); + if (mpi_rank != 0) { +#ifdef H5Fmpio_DEBUG + if
(H5F_mpio_Debug[(int)'w']) { + fprintf(stdout, " in H5F_mpio_write (write omitted)\n" ); + } +#endif + goto done; /* skip the actual write */ + } + } + /* Set up for a fancy xfer using complex types, or single byte block. * We wouldn't need to rely on the use_types field * if MPI semantics allowed us to test that btype=ftype=MPI_BYTE @@ -797,6 +875,7 @@ H5F_mpio_write(H5F_low_t *lf, H5F_access_t *access_parms, "MPI_Get_count returned invalid count" ); } + done: #ifdef H5Fmpio_DEBUG if (H5F_mpio_Debug[(int)'t']) fprintf(stdout, "Leaving H5F_mpio_write\n" ); diff --git a/src/H5Fmpioprivate.h b/src/H5Fmpioprivate.h new file mode 100644 index 0000000..a51eb36 --- /dev/null +++ b/src/H5Fmpioprivate.h @@ -0,0 +1,25 @@ +/**************************************************************************** + * NCSA HDF * + * Software Development Group * + * National Center for Supercomputing Applications * + * University of Illinois at Urbana-Champaign * + * 605 E. Springfield, Champaign IL 61820 * + * * + * For conditions of distribution and use, see the accompanying * + * hdf/COPYING file. 
* + * * + ****************************************************************************/ + +/* + * This file contains macros & information for MPI-IO file access + */ + +#ifndef _H5Fmpioprivate_H +#define _H5Fmpioprivate_H + +#ifdef HAVE_PARALLEL +extern hbool_t H5_mpi_1_metawrite_g; +hbool_t H5F_mpio_tas_allsame(H5F_low_t *lf, hbool_t newval ); +#endif /* HAVE_PARALLEL */ + +#endif diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h index 396b448..5e60efe 100644 --- a/src/H5Fprivate.h +++ b/src/H5Fprivate.h @@ -392,6 +392,8 @@ typedef struct H5F_low_t { /* MPI-IO */ struct { MPI_File f; /* MPI-IO file handle */ + hbool_t allsame;/* all procs should write same data, * + * so only p0 will do the actual write */ } mpio; #endif diff --git a/src/H5Gnode.c b/src/H5Gnode.c index 842e6a0..3c8005e 100644 --- a/src/H5Gnode.c +++ b/src/H5Gnode.c @@ -296,6 +296,7 @@ H5G_node_create(H5F_t *f, H5B_ins_t __unused__ op, void *_lt_key, * Jun 23 1997 * * Modifications: + * rky 980828 Only p0 writes metadata to disk. * *------------------------------------------------------------------------- */ @@ -351,6 +352,9 @@ H5G_node_flush(H5F_t *f, hbool_t destroy, const haddr_t *addr, H5G_ent_encode_vec(f, &p, sym->entry, sym->nsyms); HDmemset(p, 0, size - (p - buf)); +#ifdef HAVE_PARALLEL + H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 will write */ +#endif /* HAVE_PARALLEL */ status = H5F_block_write(f, addr, (hsize_t)size, H5D_XFER_DFLT, buf); buf = H5MM_xfree(buf); if (status < 0) diff --git a/src/H5HL.c b/src/H5HL.c index a960c80..9e157b0 100644 --- a/src/H5HL.c +++ b/src/H5HL.c @@ -305,6 +305,7 @@ H5HL_load(H5F_t *f, const haddr_t *addr, const void __unused__ *udata1, * Jul 17 1997 * * Modifications: + * rky 980828 Only p0 writes metadata to disk. 
* *------------------------------------------------------------------------- */ @@ -376,6 +377,9 @@ H5HL_flush(H5F_t *f, hbool_t destroy, const haddr_t *addr, H5HL_t *heap) H5F_addr_inc(&hdr_end_addr, (hsize_t)H5HL_SIZEOF_HDR(f)); if (H5F_addr_eq(&(heap->addr), &hdr_end_addr)) { /* The header and data are contiguous */ +#ifdef HAVE_PARALLEL + H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 writes */ +#endif /* HAVE_PARALLEL */ if (H5F_block_write(f, addr, (hsize_t)(H5HL_SIZEOF_HDR(f)+heap->disk_alloc), H5D_XFER_DFLT, heap->chunk) < 0) { @@ -383,11 +387,17 @@ H5HL_flush(H5F_t *f, hbool_t destroy, const haddr_t *addr, H5HL_t *heap) "unable to write heap header and data to file"); } } else { +#ifdef HAVE_PARALLEL + H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 writes */ +#endif /* HAVE_PARALLEL */ if (H5F_block_write(f, addr, (hsize_t)H5HL_SIZEOF_HDR(f), H5D_XFER_DFLT, heap->chunk)<0) { HRETURN_ERROR(H5E_HEAP, H5E_WRITEERROR, FAIL, "unable to write heap header to file"); } +#ifdef HAVE_PARALLEL + H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 writes */ +#endif /* HAVE_PARALLEL */ if (H5F_block_write(f, &(heap->addr), (hsize_t)(heap->disk_alloc), H5D_XFER_DFLT, heap->chunk + H5HL_SIZEOF_HDR(f)) < 0) { diff --git a/src/H5O.c b/src/H5O.c index addd14b..322862c 100644 --- a/src/H5O.c +++ b/src/H5O.c @@ -506,6 +506,8 @@ H5O_load(H5F_t *f, const haddr_t *addr, const void __unused__ *_udata1, * Robb Matzke, 7 Jan 1998 * Handles constant vs non-constant messages. * + * rky 980828 Only p0 writes metadata to disk. 
+ * *------------------------------------------------------------------------- */ static herr_t @@ -546,6 +548,9 @@ H5O_flush(H5F_t *f, hbool_t destroy, const haddr_t *addr, H5O_t *oh) HDmemset (p, 0, H5O_SIZEOF_HDR(f)-12); /* write the object header header */ +#ifdef HAVE_PARALLEL + H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 will write */ +#endif /* HAVE_PARALLEL */ if (H5F_block_write(f, addr, (hsize_t)H5O_SIZEOF_HDR(f), H5D_XFER_DFLT, buf) < 0) { HRETURN_ERROR(H5E_OHDR, H5E_WRITEERROR, FAIL, @@ -618,6 +623,9 @@ H5O_flush(H5F_t *f, hbool_t destroy, const haddr_t *addr, H5O_t *oh) for (i = 0; i < oh->nchunks; i++) { if (oh->chunk[i].dirty) { assert(H5F_addr_defined(&(oh->chunk[i].addr))); +#ifdef HAVE_PARALLEL + H5F_mpio_tas_allsame( f->shared->lf, TRUE ); /* only p0 write */ +#endif /* HAVE_PARALLEL */ if (H5F_block_write(f, &(oh->chunk[i].addr), (hsize_t)(oh->chunk[i].size), H5D_XFER_DFLT, oh->chunk[i].image) < 0) { -- cgit v0.12