From 5db6c61f18198ac4477a6ba99d405ff82cf467a7 Mon Sep 17 00:00:00 2001 From: Quincey Koziol Date: Mon, 29 Dec 2003 14:34:11 -0500 Subject: [svn-r7993] Purpose: Code cleanup, bug fixes Description: Wrap up rest of changes necessary for fixing the "short" MPI-I/O read problem that Robb reported. Platforms tested: FreeBSD 4.9 (sleipnir) too minor to require h5committest --- configure | 1 - configure.in | 29 ----------- release_docs/RELEASE.txt | 3 ++ src/H5FDfphdf5.c | 125 ++++++++++++++++++++++++++--------------------- src/H5FDmpio.c | 18 +++++-- src/H5FDmpiposix.c | 2 +- src/H5FDsec2.c | 2 +- src/H5FPclient.c | 4 +- src/H5FPprivate.h | 2 +- testpar/testphdf5.h | 1 + 10 files changed, 89 insertions(+), 98 deletions(-) diff --git a/configure b/configure index 722a680..50cbb5a 100755 --- a/configure +++ b/configure @@ -32326,7 +32326,6 @@ fi rm -f conftest.$ac_objext conftest$ac_exeext conftest.$ac_ext fi - MPE=yes # Check whether --with-mpe or --without-mpe was given. diff --git a/configure.in b/configure.in index 17669ee..c2e9ef8 100644 --- a/configure.in +++ b/configure.in @@ -2194,35 +2194,6 @@ if test -n "$PARALLEL"; then ) fi -dnl ---------------------------------------------------------------------- -dnl Block the MPI_Get_count code since it does not work -dnl dnl Check whether MPI_Get_count actually works correctly on this -dnl dnl platform. -dnl AC_MSG_CHECKING(whether a MPI_Get_count works correctly) -dnl AC_TRY_RUN([ -dnl #include -dnl -dnl int main(int argc, char **argv) -dnl { -dnl MPI_Status mpi_stat; -dnl int bytes_read = 0, ret; -dnl -dnl MPI_Init(&argc, &argv); -dnl memset(&mpi_stat, 0, sizeof(MPI_Status)); /* zero out status */ -dnl ret = MPI_Get_count(&mpi_stat, MPI_BYTE, &bytes_read); -dnl MPI_Finalize(); -dnl -dnl /* this returns TRUE if bytes_read is 0...the shell thinks that the -dnl * program fails, but we want it to fail of course so switch the -dnl * "true"/"false" parts of the TRY_RUN macro */ -dnl return bytes_read == 0; -dnl } -dnl ], -dnl AC_MSG_RESULT(no), -dnl AC_MSG_RESULT(yes) -dnl CPPFLAGS="$CPPFLAGS -DMPI_GET_COUNT_WORKS",AC_MSG_RESULT(no)) -dnl ---------------------------------------------------------------------- - dnl -------------------------------------------------------------------- dnl Do we want MPE instrumentation feature on? dnl diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index ca689d9..c2e23dd 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -95,6 +95,9 @@ Bug Fixes since HDF5-1.6.0 release ------- - Fixed bug with variable-length datatypes used in compound datatypes. SLU - 2003/12/29 + - Fixed bug in parallel I/O routines that would cause reads from + "short datasets" (datasets which were only partially written out) + to return invalid data. QAK & AKC - 2003/12/19 - Fixed bug where scalar dataspaces for attributes were reporting as simple dataspaces. QAK - 2003/12/13 - Fixed problem with selection offsets of hyperslab selections in diff --git a/src/H5FDfphdf5.c b/src/H5FDfphdf5.c index 6ee76f6..8198165 100644 --- a/src/H5FDfphdf5.c +++ b/src/H5FDfphdf5.c @@ -1160,14 +1160,11 @@ H5FD_fphdf5_read(H5FD_t *_file, H5FD_mem_t mem_type, hid_t dxpl_id, haddr_t addr, size_t size, void *buf) { H5FD_fphdf5_t *file = (H5FD_fphdf5_t*)_file; - MPI_Offset mpi_off; MPI_Status status; - int mrc; - MPI_Datatype buf_type; - int size_i; - int bytes_read; - int n; - unsigned use_view_this_time = 0; + MPI_Offset mpi_off; + int size_i; /* Integer copy of 'size' to read */ + int bytes_read; /* Number of bytes read in */ + int mrc; /* MPI return code */ herr_t ret_value = SUCCEED; FUNC_ENTER_NOAPI(H5FD_fphdf5_read, FAIL) @@ -1189,14 +1186,18 @@ H5FD_fphdf5_read(H5FD_t *_file, H5FD_mem_t mem_type, hid_t dxpl_id, HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr_t to MPI offset") size_i = (int)size; - if ((hsize_t)size_i != size) HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from size_t to int") - /* Only look for MPI views for raw data transfers */ - if (mem_type == H5FD_MEM_DRAW) { + /* Only do real MPI stuff for raw data transfers */ + if(mem_type==H5FD_MEM_DRAW) { H5P_genplist_t *plist; - H5FD_mpio_xfer_t xfer_mode; /* I/O tranfer mode */ + H5FD_mpio_xfer_t xfer_mode; /* I/O tranfer mode */ + MPI_Datatype buf_type; /* MPI datatype for I/O operation */ + int n; + int type_size; /* MPI datatype used for I/O's size */ + int io_size; /* Actual number of bytes requested */ + unsigned use_view_this_time = 0; /* Flag to indicate we are using an MPI view */ /* Obtain the data transfer properties */ if ((plist = H5I_object(dxpl_id)) == NULL) @@ -1239,6 +1240,13 @@ H5FD_fphdf5_read(H5FD_t *_file, H5FD_mem_t mem_type, hid_t dxpl_id, if ((mrc = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &status)) != MPI_SUCCESS) HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mrc) + + /* + * Reset the file view when we used MPI derived types + */ + if ((mrc = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, + H5FD_mpio_native, file->info)) != MPI_SUCCESS) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mrc) } else { /* * Prepare for a simple xfer of a contiguous block of bytes. The @@ -1252,21 +1260,33 @@ H5FD_fphdf5_read(H5FD_t *_file, H5FD_mem_t mem_type, hid_t dxpl_id, HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mrc) } - /* - * MPI_Get_count incorrectly returns negative count; fake a complete - * read. + /* How many bytes were actually read? */ + /* [This works because the "basic elements" we use for all our MPI derived + * types are MPI_BYTE. We should be using the 'buf_type' for the MPI + * datatype in this call though... (We aren't because using it causes + * the LANL "qsc" machine to dump core - 12/19/03) - QAK] */ - bytes_read = size_i; + if (MPI_SUCCESS != (mrc=MPI_Get_elements(&status, MPI_BYTE, &bytes_read))) + HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mrc) + + /* Get the type's size */ + if (MPI_SUCCESS != (mrc=MPI_Type_size(buf_type,&type_size))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mrc) + + /* Compute the actual number of bytes requested */ + io_size=type_size*size_i; + + /* Check for read failure */ + if (bytes_read<0 || bytes_read>io_size) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed") /* - * Reset the file view when we used MPI derived types + * This gives us zeroes beyond end of physical MPI file. */ - if (use_view_this_time) - if ((mrc = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, - H5FD_mpio_native, file->info)) != MPI_SUCCESS) - HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mrc) - - } else { + if ((n=(io_size-bytes_read)) > 0) + HDmemset((char*)buf+bytes_read, 0, (size_t)n); + } /* end if */ + else { /* * This is metadata - we want to try to read it from the SAP * first. @@ -1276,7 +1296,7 @@ H5FD_fphdf5_read(H5FD_t *_file, H5FD_mem_t mem_type, hid_t dxpl_id, if (H5FP_request_read_metadata(_file, file->file_id, dxpl_id, mem_type, addr, size, (uint8_t**)&buf, - &bytes_read, &req_id, &sap_status) != SUCCEED) { + &req_id, &sap_status) != SUCCEED) { /* FIXME: The read failed, for some reason */ HDfprintf(stderr, "%s:%d: Metadata cache read failed!\n", FUNC, __LINE__); } @@ -1297,27 +1317,6 @@ HDfprintf(stderr, "%s:%d: Metadata cache read failed!\n", FUNC, __LINE__); } } /* end else */ - /* Check for read failure */ - if (bytes_read < 0 || bytes_read > size_i) - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed") - - /* - * This gives us zeroes beyond end of physical MPI file. What about - * reading past logical end of HDF5 file??? - */ - n = size_i - bytes_read; - - if (n > 0) { - if (use_view_this_time) - /* - * INCOMPLETE rky 1998-09-18 - * Haven't implemented reading zeros beyond EOF. What to do??? - */ - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "eof file read failed") - - memset((char*)buf + bytes_read, 0, (size_t)n); - } - done: FUNC_LEAVE_NOAPI(ret_value) } @@ -1440,6 +1439,8 @@ H5FD_fphdf5_write_real(H5FD_t *_file, H5FD_mem_t mem_type, H5P_genplist_t *plist MPI_Offset mpi_off; int mrc; int bytes_written; + int type_size; /* MPI datatype used for I/O's size */ + int io_size; /* Actual number of bytes requested */ unsigned use_view_this_time = 0; herr_t ret_value = SUCCEED; @@ -1504,28 +1505,38 @@ H5FD_fphdf5_write_real(H5FD_t *_file, H5FD_mem_t mem_type, H5P_genplist_t *plist if ((mrc = MPI_File_write_at_all(file->f, mpi_off, (void*)buf, size, buf_type, &status)) != MPI_SUCCESS) HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mrc) - } else { + + /* Reset the file view when we used MPI derived types */ + if ((mrc = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, + H5FD_mpio_native, file->info)) != MPI_SUCCESS) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mrc) + } /* end if */ + else { /*OKAY: CAST DISCARDS CONST QUALIFIER*/ if ((mrc = MPI_File_write_at(file->f, mpi_off, (void*)buf, size, buf_type, &status)) != MPI_SUCCESS) HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mrc) } - /* Reset the file view when we used MPI derived types */ - if (use_view_this_time) - if ((mrc = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, - H5FD_mpio_native, file->info)) != MPI_SUCCESS) - HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mrc) - - /* - * MPI_Get_count incorrectly returns negative count; fake a complete - * write (use size for both parameters). + /* How many bytes were actually written? */ + /* [This works because the "basic elements" we use for all our MPI derived + * types are MPI_BYTE. We should be using the 'buf_type' for the MPI + * datatype in this call though... (We aren't because using it causes + * the LANL "qsc" machine to dump core - 12/19/03) - QAK] */ - bytes_written = size; + if (MPI_SUCCESS != (mrc=MPI_Get_elements(&status, MPI_BYTE, &bytes_written))) + HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mrc) + + /* Get the type's size */ + if (MPI_SUCCESS != (mrc=MPI_Type_size(buf_type,&type_size))) + HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mrc) + + /* Compute the actual number of bytes requested */ + io_size=type_size*size; /* Check for write failure */ - if (bytes_written < 0 || bytes_written > size) - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file write failed") + if (bytes_written!=io_size) + HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed") /* Forget the EOF value (see H5FD_fphdf5_get_eof()) */ file->eof = HADDR_UNDEF; diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index 1b0f461..61d97da 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -40,8 +40,7 @@ /* * The driver identification number, initialized at runtime if H5_HAVE_PARALLEL * is defined. This allows applications to still have the H5FD_MPIO - * "constants" in their source code (it also makes this file strictly ANSI - * compliant when H5_HAVE_PARALLEL isn't defined) + * "constants" in their source code. */ static hid_t H5FD_MPIO_g = 0; @@ -1323,6 +1322,10 @@ done: * to HADDR_UNDEF which is the error return value of this * function. * + * Keeping the EOF updated (during write calls) is expensive + * because any process may extend the physical end of the + * file. -QAK + * * Return: Success: The end-of-address marker. * * Failure: HADDR_UNDEF @@ -1563,7 +1566,9 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add /* How many bytes were actually read? */ /* [This works because the "basic elements" we use for all our MPI derived - * types are MPI_BYTE - QAK] + * types are MPI_BYTE. We should be using the 'buf_type' for the MPI + * datatype in this call though... (We aren't because using it causes + * the LANL "qsc" machine to dump core - 12/19/03) - QAK] */ if (MPI_SUCCESS != (mpi_code=MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code) @@ -1879,7 +1884,9 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, /* How many bytes were actually written? */ /* [This works because the "basic elements" we use for all our MPI derived - * types are MPI_BYTE - QAK] + * types are MPI_BYTE. We should be using the 'buf_type' for the MPI + * datatype in this call though... (We aren't because using it causes + * the LANL "qsc" machine to dump core - 12/19/03) - QAK] */ if (MPI_SUCCESS != (mpi_code=MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_written))) HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code) @@ -1903,7 +1910,7 @@ done: /* if only p writes, need to broadcast the ret_value to other processes */ if ((type!=H5FD_MEM_DRAW) && H5_mpi_1_metawrite_g) { if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, H5_PAR_META_WRITE, file->comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code) + HMPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code) } /* end if */ #endif /* OLD_METADATA_WRITE */ @@ -1912,6 +1919,7 @@ done: fprintf(stdout, "proc %d: Leaving H5FD_mpio_write with ret_value=%d\n", file->mpi_rank, ret_value ); #endif + FUNC_LEAVE_NOAPI(ret_value) } diff --git a/src/H5FDmpiposix.c b/src/H5FDmpiposix.c index 647fb35..4dadb1b 100644 --- a/src/H5FDmpiposix.c +++ b/src/H5FDmpiposix.c @@ -1209,7 +1209,7 @@ H5FD_mpiposix_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t UNUSED dxpl_id, if (0==nbytes) { /* end of file but not end of format address space */ HDmemset(buf, 0, size); - size = 0; + break; } /* end if */ assert(nbytes>=0); assert((size_t)nbytes<=size); diff --git a/src/H5FDsec2.c b/src/H5FDsec2.c index 5b73475..5924c0a 100644 --- a/src/H5FDsec2.c +++ b/src/H5FDsec2.c @@ -682,7 +682,7 @@ H5FD_sec2_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t UNUSED dxpl_id, hadd if (0==nbytes) { /* end of file but not end of format address space */ HDmemset(buf, 0, size); - size = 0; + break; } assert(nbytes>=0); assert((size_t)nbytes<=size); diff --git a/src/H5FPclient.c b/src/H5FPclient.c index e80288d..cdc22e0 100644 --- a/src/H5FPclient.c +++ b/src/H5FPclient.c @@ -286,7 +286,7 @@ done: herr_t H5FP_request_read_metadata(H5FD_t *file, unsigned file_id, hid_t dxpl_id, H5FD_mem_t UNUSED mem_type, haddr_t addr, - size_t size, uint8_t **buf, int *bytes_read, + size_t size, uint8_t **buf, unsigned *req_id, H5FP_status_t *status) { H5FP_request_t req; @@ -300,7 +300,6 @@ H5FP_request_read_metadata(H5FD_t *file, unsigned file_id, hid_t dxpl_id, /* check args */ assert(file); assert(buf); - assert(bytes_read); assert(req_id); assert(status); @@ -352,7 +351,6 @@ HDfprintf(stderr, "Buffer not big enough to hold metadata!!!!\n"); assert(0); } - *bytes_read = size; break; case H5FP_STATUS_DUMPING: /* diff --git a/src/H5FPprivate.h b/src/H5FPprivate.h index 0abd9cf..aa85969 100644 --- a/src/H5FPprivate.h +++ b/src/H5FPprivate.h @@ -305,7 +305,7 @@ extern herr_t H5FP_request_release_lock(unsigned sap_file_id, hobj_ref_t oid, H5FP_status_t *status); extern herr_t H5FP_request_read_metadata(H5FD_t *file, unsigned sap_file_id, hid_t dxpl_id, H5FD_mem_t mem_type, haddr_t addr, - size_t size, uint8_t **buf, int *bytes_read, + size_t size, uint8_t **buf, unsigned *req_id, H5FP_status_t *status); extern herr_t H5FP_request_write_metadata(H5FD_t *file, unsigned file_id, hid_t dxpl_id, H5FD_mem_t mem_type, haddr_t addr, diff --git a/testpar/testphdf5.h b/testpar/testphdf5.h index 711d189..bb5164a 100644 --- a/testpar/testphdf5.h +++ b/testpar/testphdf5.h @@ -187,6 +187,7 @@ void extend_readInd(char *filename); void extend_readAll(char *filename); void compact_dataset(char *filename); void big_dataset(const char *filename); +void short_dataset(const char *filename); int dataset_vrfy(hssize_t start[], hsize_t count[], hsize_t stride[], hsize_t block[], DATATYPE *dataset, DATATYPE *original); -- cgit v0.12