From f4f2e83d2233699aaf32f2e7f715504b43542d42 Mon Sep 17 00:00:00 2001 From: Larry Knox Date: Mon, 27 Aug 2018 16:24:20 -0500 Subject: Merge pull request #1224 in HDFFV/hdf5 from ~JHENDERSON/hdf5:develop to develop * commit '763b95c6b0f027025a5a5bfb203b77317ffa2368': Update RELEASE.txt with suggested changes Update MANIFEST file for new t_coll_md_read.c file Remove now-unused local variable Add fix for HDFFV-10501 --- MANIFEST | 1 + release_docs/RELEASE.txt | 63 +++++++++++++++ src/H5Dmpio.c | 18 +---- testpar/CMakeLists.txt | 1 + testpar/Makefile.am | 2 +- testpar/t_coll_md_read.c | 199 +++++++++++++++++++++++++++++++++++++++++++++++ testpar/testphdf5.c | 3 + testpar/testphdf5.h | 1 + 8 files changed, 270 insertions(+), 18 deletions(-) create mode 100644 testpar/t_coll_md_read.c diff --git a/MANIFEST b/MANIFEST index 257a7c7..3b92b1a 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1248,6 +1248,7 @@ ./testpar/t_cache_image.c ./testpar/t_chunk_alloc.c ./testpar/t_coll_chunk.c +./testpar/t_coll_md_read.c ./testpar/t_dset.c ./testpar/t_file.c ./testpar/t_file_image.c diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index 872bced..c022d73 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -34,6 +34,7 @@ CONTENTS - New Features - Support for new platforms and languages +- Bug Fixes since HDF5-1.10.3 - Bug Fixes since HDF5-1.10.2 - Supported Platforms - Tested Configuration Features Summary @@ -229,6 +230,68 @@ Support for new platforms, languages and compilers. ======================================= - +Bug Fixes since HDF5-1.10.3 release +================================== + + Library + ------- + - A bug was discovered in the parallel library where an application + would hang if a collective read/write of a chunked dataset occurred + when collective metadata reads were enabled and some of the ranks + had no selection in the dataset's dataspace. The ranks which had no + selection in the dataset's dataspace called H5D__chunk_addrmap() to + retrieve the lowest chunk address in the dataset. This is because we + require reads/writes to be performed in strictly non-decreasing order + of chunk address in the file. + + When the chunk index used was a version 1 or 2 B-tree, these + non-participating ranks would issue a collective MPI_Bcast() call + that the participating ranks would not issue, causing the hang. Since + the non-participating ranks are not actually reading/writing anything, + the H5D__chunk_addrmap() call can be safely removed and the address used + for the read/write can be set to an arbitrary number (0 was chosen). + + (JTH - 2018/08/25, HDFFV-10501) + + Configuration + ------------- + - + + Performance + ------------- + - + + Fortran + -------- + - + + Tools + ----- + - + + High-Level APIs: + ------ + - + + Fortran High-Level APIs: + ------ + - + + Documentation + ------------- + - + + F90 APIs + -------- + - + + C++ APIs + -------- + - + + Testing + ------- + Bug Fixes since HDF5-1.10.2 release ================================== diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c index 9bffd8b..a669593 100644 --- a/src/H5Dmpio.c +++ b/src/H5Dmpio.c @@ -925,7 +925,6 @@ H5D__link_chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *typ hbool_t chunk_final_ftype_is_derived = FALSE; H5D_storage_t ctg_store; /* Storage info for "fake" contiguous dataset */ size_t total_chunks; - haddr_t *total_chunk_addr_array = NULL; MPI_Datatype *chunk_mtype = NULL; MPI_Datatype *chunk_ftype = NULL; MPI_Aint *chunk_disp_array = NULL; @@ -1146,20 +1145,7 @@ if(H5DEBUG(D)) mpi_buf_count = (hsize_t)1; } /* end if */ else { /* no selection at all for this process */ - /* Allocate chunking information */ - if(NULL == (total_chunk_addr_array = (haddr_t *)H5MM_malloc(sizeof(haddr_t) * total_chunks))) - HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate total chunk address arraybuffer") - - /* Retrieve chunk address map */ - if(H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address") - - /* Get chunk with lowest address */ - ctg_store.contig.dset_addr = HADDR_MAX; - for(u = 0; u < total_chunks; u++) - if(total_chunk_addr_array[u] < ctg_store.contig.dset_addr) - ctg_store.contig.dset_addr = total_chunk_addr_array[u]; - HDassert(ctg_store.contig.dset_addr != HADDR_MAX); + ctg_store.contig.dset_addr = 0; /* Set the MPI datatype */ chunk_final_ftype = MPI_BYTE; @@ -1187,8 +1173,6 @@ if(H5DEBUG(D)) HDfprintf(H5DEBUG(D),"before freeing memory inside H5D_link_collective_io ret_value = %d\n", ret_value); #endif /* Release resources */ - if(total_chunk_addr_array) - H5MM_xfree(total_chunk_addr_array); if(chunk_addr_info_array) H5MM_xfree(chunk_addr_info_array); if(chunk_mtype) diff --git a/testpar/CMakeLists.txt b/testpar/CMakeLists.txt index c08a69e..0b3cbe3 100644 --- a/testpar/CMakeLists.txt +++ b/testpar/CMakeLists.txt @@ -17,6 +17,7 @@ set (testphdf5_SOURCES ${HDF5_TEST_PAR_SOURCE_DIR}/t_chunk_alloc.c ${HDF5_TEST_PAR_SOURCE_DIR}/t_filter_read.c ${HDF5_TEST_PAR_SOURCE_DIR}/t_prop.c + ${HDF5_TEST_PAR_SOURCE_DIR}/t_coll_md_read.c ) #-- Adding test for testhdf5 diff --git a/testpar/Makefile.am b/testpar/Makefile.am index 5c7cb26..7262ca6 100644 --- a/testpar/Makefile.am +++ b/testpar/Makefile.am @@ -29,7 +29,7 @@ check_PROGRAMS = $(TEST_PROG_PARA) testphdf5_SOURCES=testphdf5.c t_dset.c t_file.c t_file_image.c t_mdset.c \ t_ph5basic.c t_coll_chunk.c t_span_tree.c t_chunk_alloc.c t_filter_read.c \ - t_prop.c + t_prop.c t_coll_md_read.c # The tests all depend on the hdf5 library and the test library LDADD = $(LIBH5TEST) $(LIBHDF5) diff --git a/testpar/t_coll_md_read.c b/testpar/t_coll_md_read.c new file mode 100644 index 0000000..f945d2b --- /dev/null +++ b/testpar/t_coll_md_read.c @@ -0,0 +1,199 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* + * A test suite to test HDF5's collective metadata read capabilities, as enabled + * by making a call to H5Pset_all_coll_metadata_ops(). + */ + +#include "testphdf5.h" + +#include +#include +#include + +/* + * Define the non-participating process as the "last" + * rank to avoid any weirdness potentially caused by + * an if (mpi_rank == 0) check. + */ +#define PARTIAL_NO_SELECTION_NO_SEL_PROCESS (mpi_rank == mpi_size - 1) +#define PARTIAL_NO_SELECTION_DATASET_NAME "partial_no_selection_dset" +#define PARTIAL_NO_SELECTION_DATASET_NDIMS 2 +#define PARTIAL_NO_SELECTION_Y_DIM_SCALE 5 +#define PARTIAL_NO_SELECTION_X_DIM_SCALE 5 + +/* + * A test for issue HDFFV-10501. A parallel hang was reported which occurred + * in linked-chunk I/O when collective metadata reads are enabled and some ranks + * do not have any selection in a dataset's dataspace, while others do. The ranks + * which have no selection during the read/write operation called H5D__chunk_addrmap() + * to retrieve the lowest chunk address, since we require that the read/write be done + * in strictly non-decreasing order of chunk address. For version 1 and 2 B-trees, + * this caused the non-participating ranks to issue a collective MPI_Bcast() call + * which the other ranks did not issue, thus causing a hang. + * + * However, since these ranks are not actually reading/writing anything, this call + * can simply be removed and the address used for the read/write can be set to an + * arbitrary number (0 was chosen). + */ +void test_partial_no_selection_coll_md_read(void) +{ + const char *filename; + hsize_t *dataset_dims = NULL; + hsize_t max_dataset_dims[PARTIAL_NO_SELECTION_DATASET_NDIMS]; + hsize_t sel_dims[1]; + hsize_t chunk_dims[PARTIAL_NO_SELECTION_DATASET_NDIMS] = { PARTIAL_NO_SELECTION_Y_DIM_SCALE, PARTIAL_NO_SELECTION_X_DIM_SCALE }; + hsize_t start[PARTIAL_NO_SELECTION_DATASET_NDIMS]; + hsize_t stride[PARTIAL_NO_SELECTION_DATASET_NDIMS]; + hsize_t count[PARTIAL_NO_SELECTION_DATASET_NDIMS]; + hsize_t block[PARTIAL_NO_SELECTION_DATASET_NDIMS]; + hid_t file_id = -1; + hid_t fapl_id = -1; + hid_t dset_id = -1; + hid_t dcpl_id = -1; + hid_t dxpl_id = -1; + hid_t fspace_id = -1; + hid_t mspace_id = -1; + int mpi_rank, mpi_size; + void *data = NULL; + void *read_buf = NULL; + + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + + filename = GetTestParameters(); + + fapl_id = create_faccess_plist(MPI_COMM_WORLD, MPI_INFO_NULL, facc_type); + VRFY((fapl_id >= 0), "create_faccess_plist succeeded"); + + /* + * Even though the testphdf5 framework currently sets collective metadata reads + * on the FAPL, we call it here just to be sure this is futureproof, since + * demonstrating this issue relies upon it. + */ + VRFY((H5Pset_all_coll_metadata_ops(fapl_id, true) >= 0), "Set collective metadata reads succeeded"); + + file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + VRFY((file_id >= 0), "H5Fcreate succeeded"); + + dataset_dims = malloc(PARTIAL_NO_SELECTION_DATASET_NDIMS * sizeof(*dataset_dims)); + VRFY((dataset_dims != NULL), "malloc succeeded"); + + dataset_dims[0] = PARTIAL_NO_SELECTION_Y_DIM_SCALE * mpi_size; + dataset_dims[1] = PARTIAL_NO_SELECTION_X_DIM_SCALE * mpi_size; + max_dataset_dims[0] = H5S_UNLIMITED; + max_dataset_dims[1] = H5S_UNLIMITED; + + fspace_id = H5Screate_simple(PARTIAL_NO_SELECTION_DATASET_NDIMS, dataset_dims, max_dataset_dims); + VRFY((fspace_id >= 0), "H5Screate_simple succeeded"); + + /* + * Set up chunking on the dataset in order to reproduce the problem. + */ + dcpl_id = H5Pcreate(H5P_DATASET_CREATE); + VRFY((dcpl_id >= 0), "H5Pcreate succeeded"); + + VRFY((H5Pset_chunk(dcpl_id, PARTIAL_NO_SELECTION_DATASET_NDIMS, chunk_dims) >= 0), "H5Pset_chunk succeeded"); + + dset_id = H5Dcreate2(file_id, PARTIAL_NO_SELECTION_DATASET_NAME, H5T_NATIVE_INT, fspace_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); + VRFY((dset_id >= 0), "H5Dcreate2 succeeded"); + + /* + * Setup hyperslab selection to split the dataset among the ranks. + * + * The ranks will write rows across the dataset. + */ + start[0] = PARTIAL_NO_SELECTION_Y_DIM_SCALE * mpi_rank; + start[1] = 0; + stride[0] = PARTIAL_NO_SELECTION_Y_DIM_SCALE; + stride[1] = PARTIAL_NO_SELECTION_X_DIM_SCALE; + count[0] = 1; + count[1] = mpi_size; + block[0] = PARTIAL_NO_SELECTION_Y_DIM_SCALE; + block[1] = PARTIAL_NO_SELECTION_X_DIM_SCALE; + + VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0), "H5Sselect_hyperslab succeeded"); + + sel_dims[0] = count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE); + + mspace_id = H5Screate_simple(1, sel_dims, NULL); + VRFY((mspace_id >= 0), "H5Screate_simple succeeded"); + + data = calloc(1, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int)); + VRFY((data != NULL), "calloc succeeded"); + + dxpl_id = H5Pcreate(H5P_DATASET_XFER); + VRFY((dxpl_id >= 0), "H5Pcreate succeeded"); + + /* + * Enable collective access for the data transfer. + */ + VRFY((H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE) >= 0), "H5Pset_dxpl_mpio succeeded"); + + VRFY((H5Dwrite(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, data) >= 0), "H5Dwrite succeeded"); + + VRFY((H5Fflush(file_id, H5F_SCOPE_GLOBAL) >= 0), "H5Fflush succeeded"); + + /* + * Ensure that linked-chunk I/O is performed since this is + * the particular code path where the issue lies and we don't + * want the library doing multi-chunk I/O behind our backs. + */ + VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO) >= 0), "H5Pset_dxpl_mpio_chunk_opt succeeded"); + + read_buf = malloc(count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int)); + VRFY((read_buf != NULL), "malloc succeeded"); + + /* + * Make sure to call H5Sselect_none() on the non-participating process. + */ + if (PARTIAL_NO_SELECTION_NO_SEL_PROCESS) { + VRFY((H5Sselect_none(fspace_id) >= 0), "H5Sselect_none succeeded"); + VRFY((H5Sselect_none(mspace_id) >= 0), "H5Sselect_none succeeded"); + } + + /* + * Finally have each rank read their section of data back from the dataset. + */ + VRFY((H5Dread(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, read_buf) >= 0), "H5Dread succeeded"); + + /* + * Check data integrity just to be sure. + */ + if (!PARTIAL_NO_SELECTION_NO_SEL_PROCESS) { + VRFY((!memcmp(data, read_buf, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int))), "memcmp succeeded"); + } + + if (dataset_dims) { + free(dataset_dims); + dataset_dims = NULL; + } + + if (data) { + free(data); + data = NULL; + } + + if (read_buf) { + free(read_buf); + read_buf = NULL; + } + + VRFY((H5Sclose(fspace_id) >= 0), "H5Sclose succeeded"); + VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded"); + VRFY((H5Pclose(dcpl_id) >= 0), "H5Pclose succeeded"); + VRFY((H5Pclose(dxpl_id) >= 0), "H5Pclose succeeded"); + VRFY((H5Dclose(dset_id) >= 0), "H5Dclose succeeded"); + VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded"); + VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded"); +} diff --git a/testpar/testphdf5.c b/testpar/testphdf5.c index 87d9056..69b66ae 100644 --- a/testpar/testphdf5.c +++ b/testpar/testphdf5.c @@ -547,6 +547,9 @@ int main(int argc, char **argv) AddTest("denseattr", test_dense_attr, NULL, "Store Dense Attributes", PARATESTFILE); + AddTest("noselcollmdread", test_partial_no_selection_coll_md_read, NULL, + "Collective Metadata read with some ranks having no selection", PARATESTFILE); + /* Display testing information */ TestInfo(argv[0]); diff --git a/testpar/testphdf5.h b/testpar/testphdf5.h index 322cb9b..176574e 100644 --- a/testpar/testphdf5.h +++ b/testpar/testphdf5.h @@ -294,6 +294,7 @@ void file_image_daisy_chain_test(void); void compress_readAll(void); #endif /* H5_HAVE_FILTER_DEFLATE */ void test_dense_attr(void); +void test_partial_no_selection_coll_md_read(void); /* commonly used prototypes */ hid_t create_faccess_plist(MPI_Comm comm, MPI_Info info, int l_facc_type); -- cgit v0.12