From 34b2fedf24ce57d0a5b1634060db1ca29a87d8b4 Mon Sep 17 00:00:00 2001 From: Quincey Koziol Date: Fri, 30 Jan 2004 20:37:08 -0500 Subject: [svn-r8125] Purpose: Bug fix/optimization Description: Address slowdown in MPI-I/O file metadata operations that was introduced mid-stream. We now _require_ a POSIX compliant parallel file system for the MPI-I/O file driver (as well as for the MPI-POSIX file driver). Also optimized file open operation when the file is being created by reducing the number of collective & syncronizing calls. Platforms tested: FreeBSD 4.9 (sleipnir) w/parallel h5committest --- doc/html/Environment.html | 11 ++- release_docs/RELEASE.txt | 3 + src/H5F.c | 12 ---- src/H5FDmpio.c | 179 ++++++++++++++++++++++++---------------------- src/H5FDmpiposix.c | 30 ++------ src/H5Fpkg.h | 8 --- 6 files changed, 105 insertions(+), 138 deletions(-) diff --git a/doc/html/Environment.html b/doc/html/Environment.html index af306f3..a00998b 100644 --- a/doc/html/Environment.html +++ b/doc/html/Environment.html @@ -87,6 +87,10 @@ their defined effects are as follows: directory do not remove temporary HDF5 data files. The default is for each test to remove the files before exit. +
HDF5_DISABLE_VERSION_CHECK +
When set to 1, HDF5 will not abort when the version + of the HDF5 headers doesn't match the version of the HDF5 library. +
HDF5_MPI_OPT_TYPES   (for parallel beta version only)
When set to 1, PHDF5 will use the MPI optimized @@ -94,13 +98,6 @@ their defined effects are as follows: Currently, this optimization fails when accessing extendable datasets. The default is not to use the optimized code. -
HDF5_MPI_1_METAWRITE -   (for parallel beta version only) -
When set to 1, PHDF5 will write the metadata - via process 0 of each opened parallel HDF5 file. This should - improve I/O throughput. The default is not to use this - optimization. - diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index 8d9842f..c8f8304 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -128,6 +128,9 @@ Bug Fixes since HDF5-1.6.1 release Configuration ------------- + - Parallel I/O with the MPI-I/O driver will no longer work if the + filesystem is not POSIX compliant. The "HDF5_MPI_1_METAWRITE" + environment variable has been removed. QAK - 2004/01/30 Performance ------------- diff --git a/src/H5F.c b/src/H5F.c index 322fe3c..042c5e8 100644 --- a/src/H5F.c +++ b/src/H5F.c @@ -232,18 +232,6 @@ H5F_init_interface(void) FUNC_ENTER_NOAPI_NOINIT(H5F_init_interface); -#ifdef OLD_METADATA_WRITE -#ifdef H5_HAVE_PARALLEL - { - /* Allow MPI buf-and-file-type optimizations? */ - const char *s = HDgetenv ("HDF5_MPI_1_METAWRITE"); - if (s && HDisdigit(*s)) { - H5_mpiposix_1_metawrite_g = H5_mpi_1_metawrite_g = (int)HDstrtol (s, NULL, 0); - } - } -#endif /* H5_HAVE_PARALLEL */ -#endif /* OLD_METADATA_WRITE */ - /* * Initialize the atom group for the file IDs. There are two groups: * the H5I_FILE group contains all the ID's for files which are currently diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index f791ddb..99b9110 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -52,6 +52,7 @@ typedef struct H5FD_mpio_t { MPI_Info info; /*file information */ int mpi_rank; /* This process's rank */ int mpi_size; /* Total number of processes */ + hbool_t truncate_pending; /* Whether a file truncation is pending */ haddr_t eof; /*end-of-file marker */ haddr_t eoa; /*end-of-address marker */ haddr_t last_eoa; /* Last known end-of-address marker */ @@ -141,17 +142,6 @@ static int H5FD_mpio_Debug[256] = 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; #endif -#ifdef OLD_METADATA_WRITE -/* Global var to allow elimination of redundant metadata writes - * to be controlled by the value of an environment variable. */ -/* Use the elimination by default unless this is the Intel Red machine */ -#ifndef __PUMAGON__ -hbool_t H5_mpi_1_metawrite_g = TRUE; -#else -hbool_t H5_mpi_1_metawrite_g = FALSE; -#endif -#endif /* OLD_METADATA_WRITE */ - /* Interface initialization */ #define PABLO_MASK H5FD_mpio_mask #define INTERFACE_INIT H5FD_mpio_init @@ -1006,10 +996,9 @@ H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id, const H5FD_mpio_fapl_t *fa=NULL; H5FD_mpio_fapl_t _fa; H5P_genplist_t *plist; /* Property list pointer */ - H5FD_t *ret_value; /* Return value */ MPI_Comm comm_dup=MPI_COMM_NULL; MPI_Info info_dup=MPI_INFO_NULL; - + H5FD_t *ret_value; /* Return value */ FUNC_ENTER_NOAPI(H5FD_mpio_open, NULL); @@ -1063,8 +1052,7 @@ H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id, #endif /*OKAY: CAST DISCARDS CONST*/ - mpi_code=MPI_File_open(comm_dup, (char*)name, mpi_amode, info_dup, &fh); - if (MPI_SUCCESS != mpi_code) + if (MPI_SUCCESS != (mpi_code=MPI_File_open(comm_dup, (char*)name, mpi_amode, info_dup, &fh))) HMPI_GOTO_ERROR(NULL, "MPI_File_open failed", mpi_code); file_opened=1; @@ -1074,39 +1062,63 @@ H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id, if (MPI_SUCCESS != (mpi_code=MPI_Comm_size (comm_dup, &mpi_size))) HMPI_GOTO_ERROR(NULL, "MPI_Comm_size failed", mpi_code); -/* Following changes in handling file-truncation made be rkyates and ppweidhaas, sep 99 */ - - /* Only processor p0 will get the filesize and broadcast it. */ - if (mpi_rank == 0) { - /* Get current file size */ - if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(fh, &size))) - HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code); - } - - /* Broadcast file-size */ - if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&size, sizeof(MPI_Offset), MPI_BYTE, 0, comm_dup))) - HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code); - - /* Only if size > 0, truncate the file - if requested */ - if (size && (flags & H5F_ACC_TRUNC)) { - if (MPI_SUCCESS != (mpi_code=MPI_File_set_size(fh, (MPI_Offset)0))) - HMPI_GOTO_ERROR(NULL, "MPI_File_set_size failed", mpi_code); - - /* Don't let any proc return until all have truncated the file. */ - if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(comm_dup))) - HMPI_GOTO_ERROR(NULL, "MPI_Barrier failed", mpi_code); - size = 0; - } - /* Build the return value and initialize it */ if (NULL==(file=H5MM_calloc(sizeof(H5FD_mpio_t)))) HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed"); - file->f = fh; file->comm = comm_dup; file->info = info_dup; file->mpi_rank = mpi_rank; file->mpi_size = mpi_size; + + /* Determine if the file should be truncated */ + if(flags & H5F_ACC_TRUNC) { +#ifdef H5_MPI_FILE_SET_SIZE_BIG + /* Indicate that a 'truncate' operation is pending on the file */ + file->truncate_pending=TRUE; + + /* File is treated as zero size now */ + size=0; +#else /* H5_MPI_FILE_SET_SIZE_BIG */ + /* Only processor p0 will get the filesize and broadcast it. */ + if (mpi_rank == 0) { + /* Get current file size */ + if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(fh, &size))) + HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code) + } /* end if */ + + /* Broadcast file-size */ + if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&size, sizeof(MPI_Offset), MPI_BYTE, 0, comm_dup))) + HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code) + + /* Only truncate the file if it is non-zero length */ + if(size) { + if (MPI_SUCCESS != (mpi_code=MPI_File_set_size(fh, (MPI_Offset)0))) + HMPI_GOTO_ERROR(NULL, "MPI_File_set_size failed", mpi_code) + + /* Don't let any proc return until all have truncated the file. */ + if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(comm_dup))) + HMPI_GOTO_ERROR(NULL, "MPI_Barrier failed", mpi_code) + + /* File is zero size now */ + size = 0; + } /* end if */ +#endif /* H5_MPI_FILE_SET_SIZE_BIG */ + } /* end if */ + else { + /* Only processor p0 will get the filesize and broadcast it. */ + if (mpi_rank == 0) { + /* Get current file size */ + if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(fh, &size))) + HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code) + } /* end if */ + + /* Broadcast file size */ + if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&size, sizeof(MPI_Offset), MPI_BYTE, 0, comm_dup))) + HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code) + } /* end else */ + + /* Set the size of the file (from library's perspective) */ file->eof = H5FD_mpio_MPIOff_to_haddr(size); /* Set return value */ @@ -1159,8 +1171,8 @@ static herr_t H5FD_mpio_close(H5FD_t *_file) { H5FD_mpio_t *file = (H5FD_mpio_t*)_file; - int mpi_code; /* mpi return code */ - herr_t ret_value=SUCCEED; /* Return value */ + int mpi_code; /* MPI return code */ + herr_t ret_value=SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(H5FD_mpio_close, FAIL); @@ -1170,6 +1182,25 @@ H5FD_mpio_close(H5FD_t *_file) #endif assert(file); assert(H5FD_MPIO==file->pub.driver_id); + assert(file->eoa>0); + +#ifdef H5_MPI_FILE_SET_SIZE_BIG + /* Check if we should truncate the file */ + if(file->truncate_pending) { + MPI_Offset mpi_off; /* Offset to write test data at */ + + /* Some numeric conversions */ + if (H5FD_mpio_haddr_to_MPIOff(file->eoa, &mpi_off)<0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off") + + /* Truncate the extra data off the end */ + /* (Don't worry about a barrier after the call, since we are closing) */ + if (MPI_SUCCESS != (mpi_code=MPI_File_set_size(file->f, mpi_off))) + HMPI_DONE_ERROR(NULL, "MPI_File_set_size failed", mpi_code) + } /* end if */ +#else /* H5_MPI_FILE_SET_SIZE_BIG */ + assert(!file->truncate_pending); +#endif /* H5_MPI_FILE_SET_SIZE_BIG */ /* MPI_File_close sets argument to MPI_FILE_NULL */ if (MPI_SUCCESS != (mpi_code=MPI_File_close(&(file->f)/*in,out*/))) @@ -1820,36 +1851,17 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); -#ifdef OLD_METADATA_WRITE - /* Only p will do the actual write if all procs in comm write same metadata */ - if (H5_mpi_1_metawrite_g) { - if (file->mpi_rank != H5_PAR_META_WRITE) { + /* Only one process will do the actual write if all procs in comm write same metadata */ + if (file->mpi_rank != H5_PAR_META_WRITE) { #ifdef H5FDmpio_DEBUG - if (H5FD_mpio_Debug[(int)'w']) { - fprintf(stdout, - " proc %d: in H5FD_mpio_write (write omitted)\n", - file->mpi_rank ); - } -#endif - HGOTO_DONE(SUCCEED) /* skip the actual write */ + if (H5FD_mpio_Debug[(int)'w']) { + fprintf(stdout, + " proc %d: in H5FD_mpio_write (write omitted)\n", + file->mpi_rank ); } +#endif + HGOTO_DONE(SUCCEED) /* skip the actual write */ } -#else /* OLD_METADATA_WRITE */ - /* Remember that views are used */ - use_view_this_time=TRUE; - - /* - * Set the file view when we are using MPI derived types - */ - /*OKAY: CAST DISCARDS CONST QUALIFIER*/ - if (MPI_SUCCESS != (mpi_code=MPI_File_set_view(file->f, mpi_off, MPI_BYTE, MPI_BYTE, (char*)"native", file->info))) - HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) - - /* When using types, use the address as the displacement for - * MPI_File_set_view and reset the address for the read to zero - */ - mpi_off=0; -#endif /* OLD_METADATA_WRITE */ } /* end if */ /* Write the data. */ @@ -1896,20 +1908,17 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, file->eof = HADDR_UNDEF; done: -#ifdef OLD_METADATA_WRITE - /* if only p writes, need to broadcast the ret_value to other processes */ - if ((type!=H5FD_MEM_DRAW) && H5_mpi_1_metawrite_g) { - if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, H5_PAR_META_WRITE, file->comm))) - HMPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code) + /* if only one process writes, need to broadcast the ret_value to other processes */ + if (type!=H5FD_MEM_DRAW) { + if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, H5_PAR_META_WRITE, file->comm))) + HMPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code) } /* end if */ -#endif /* OLD_METADATA_WRITE */ #ifdef H5FDmpio_DEBUG if (H5FD_mpio_Debug[(int)'t']) fprintf(stdout, "proc %d: Leaving H5FD_mpio_write with ret_value=%d\n", file->mpi_rank, ret_value ); #endif - FUNC_LEAVE_NOAPI(ret_value); } @@ -1950,10 +1959,6 @@ H5FD_mpio_flush(H5FD_t *_file, hid_t UNUSED dxpl_id, unsigned closing) int mpi_code; /* mpi return code */ MPI_Offset mpi_off; herr_t ret_value=SUCCEED; -#ifndef H5_MPI_FILE_SET_SIZE_BIG - uint8_t byte=0; - MPI_Status mpi_stat; -#endif /* OLD_WAY */ FUNC_ENTER_NOAPI(H5FD_mpio_flush, FAIL); @@ -1964,11 +1969,6 @@ H5FD_mpio_flush(H5FD_t *_file, hid_t UNUSED dxpl_id, unsigned closing) assert(file); assert(H5FD_MPIO==file->pub.driver_id); -#ifndef H5_MPI_FILE_SET_SIZE_BIG - /* Portably initialize MPI status variable */ - HDmemset(&mpi_stat,0,sizeof(MPI_Status)); -#endif /* OLD_WAY */ - /* Extend the file to make sure it's large enough, then sync. * Unfortunately, keeping track of EOF is an expensive operation, so * we can't just check whether EOFf, mpi_off))) HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code); + + /* File does not need to be truncated now */ + file->truncate_pending=FALSE; #else /* H5_MPI_FILE_SET_SIZE_BIG */ if (0==file->mpi_rank) { + uint8_t byte=0; + MPI_Status mpi_stat; + + /* Portably initialize MPI status variable */ + HDmemset(&mpi_stat,0,sizeof(MPI_Status)); + if (H5FD_mpio_haddr_to_MPIOff(file->eoa-1, &mpi_off)<0) HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset"); if (MPI_SUCCESS != (mpi_code=MPI_File_read_at(file->f, mpi_off, &byte, 1, MPI_BYTE, &mpi_stat))) diff --git a/src/H5FDmpiposix.c b/src/H5FDmpiposix.c index 8bca0bf..a51945d 100644 --- a/src/H5FDmpiposix.c +++ b/src/H5FDmpiposix.c @@ -224,17 +224,6 @@ static const H5FD_class_t H5FD_mpiposix_g = { H5FD_FLMAP_SINGLE /*fl_map */ }; -#ifdef OLD_METADATA_WRITE -/* Global var to allow elimination of redundant metadata writes - * to be controlled by the value of an environment variable. */ -/* Use the elimination by default unless this is the Intel Red machine */ -#ifndef __PUMAGON__ -hbool_t H5_mpiposix_1_metawrite_g = TRUE; -#else -hbool_t H5_mpiposix_1_metawrite_g = FALSE; -#endif -#endif /* OLD_METADATA_WRITE */ - /* Interface initialization */ #define PABLO_MASK H5FD_mpiposix_mask #define INTERFACE_INIT H5FD_mpiposix_init @@ -1425,12 +1414,9 @@ H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); - /* Only p will do the actual write if all procs in comm write same metadata */ -#ifdef OLD_METADATA_WRITE - if (H5_mpiposix_1_metawrite_g) -#endif /* OLD_METADATA_WRITE */ - if (file->mpi_rank != H5_PAR_META_WRITE) - HGOTO_DONE(SUCCEED) /* skip the actual write */ + /* Only one process will do the actual write if all procs in comm write same metadata */ + if (file->mpi_rank != H5_PAR_META_WRITE) + HGOTO_DONE(SUCCEED) /* skip the actual write */ } /* end if */ #ifdef REPORT_IO @@ -1502,19 +1488,11 @@ done: } /* end if */ /* Guard against getting into metadata broadcast in failure cases */ else { -#ifdef OLD_METADATA_WRITE - /* if only p writes, need to broadcast the ret_value to other processes */ - if ((type!=H5FD_MEM_DRAW) && H5_mpiposix_1_metawrite_g) { - if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, H5_PAR_META_WRITE, file->comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); - } /* end if */ -#else /* OLD_METADATA_WRITE */ - /* if only p writes, need to broadcast the ret_value to other processes */ + /* if only one process writes, need to broadcast the ret_value to other processes */ if (type!=H5FD_MEM_DRAW) { if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, H5_PAR_META_WRITE, file->comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); } /* end if */ -#endif /* OLD_METADATA_WRITE */ } /* end else */ FUNC_LEAVE_NOAPI(ret_value); diff --git a/src/H5Fpkg.h b/src/H5Fpkg.h index 257e990..2956002 100644 --- a/src/H5Fpkg.h +++ b/src/H5Fpkg.h @@ -185,14 +185,6 @@ struct H5F_t { H5F_mtab_t mtab; /* File mount table */ }; -#ifdef OLD_METADATA_WRITE -#ifdef H5_HAVE_PARALLEL -/* Whether a single process writes metadata */ -H5_DLLVAR hbool_t H5_mpi_1_metawrite_g; -H5_DLLVAR hbool_t H5_mpiposix_1_metawrite_g; -#endif /* H5_HAVE_PARALLEL */ -#endif /* OLD_METADATA_WRITE */ - /* Private functions, not part of the publicly documented API */ #ifdef NOT_YET H5_DLL void H5F_encode_length_unusual(const H5F_t *f, uint8_t **p, uint8_t *l); -- cgit v0.12