diff options
Diffstat (limited to 'testpar')
-rw-r--r-- | testpar/CMakeLists.txt | 6 | ||||
-rw-r--r-- | testpar/CMakeTests.cmake | 78 | ||||
-rw-r--r-- | testpar/CMakeVFDTests.cmake | 30 | ||||
-rw-r--r-- | testpar/t_cache.c | 16 | ||||
-rw-r--r-- | testpar/t_coll_md_read.c | 336 | ||||
-rw-r--r-- | testpar/t_dset.c | 2 | ||||
-rw-r--r-- | testpar/t_shapesame.c | 32 | ||||
-rw-r--r-- | testpar/t_span_tree.c | 28 | ||||
-rw-r--r-- | testpar/testphdf5.c | 5 | ||||
-rw-r--r-- | testpar/testphdf5.h | 2 |
10 files changed, 464 insertions, 71 deletions
diff --git a/testpar/CMakeLists.txt b/testpar/CMakeLists.txt index 0b3cbe3..96ce0c0 100644 --- a/testpar/CMakeLists.txt +++ b/testpar/CMakeLists.txt @@ -47,7 +47,7 @@ set (H5P_TESTS t_mpi t_bigio t_cache - t_cache_image + #t_cache_image t_pflush1 t_pflush2 t_pread @@ -58,8 +58,8 @@ set (H5P_TESTS t_filters_parallel ) -foreach (testp ${H5P_TESTS}) - ADD_H5P_EXE(${testp}) +foreach (h5_testp ${H5P_TESTS}) + ADD_H5P_EXE(${h5_testp}) endforeach () include (CMakeTests.cmake) diff --git a/testpar/CMakeTests.cmake b/testpar/CMakeTests.cmake index a0d7f59..214801b 100644 --- a/testpar/CMakeTests.cmake +++ b/testpar/CMakeTests.cmake @@ -15,17 +15,83 @@ ### T E S T I N G ### ############################################################################## ############################################################################## +# Remove any output file left over from previous test run +add_test (NAME MPI_TEST-clear-testphdf5-objects + COMMAND ${CMAKE_COMMAND} + -E remove + ParaTest.h5 + WORKING_DIRECTORY + ${HDF5_TEST_PAR_BINARY_DIR} +) +set_tests_properties (MPI_TEST-clear-testphdf5-objects PROPERTIES FIXTURES_SETUP par_clear_testphdf5) -add_test (NAME TEST_PAR_testphdf5 COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ${MPIEXEC_PREFLAGS} $<TARGET_FILE:testphdf5> ${MPIEXEC_POSTFLAGS}) +set (SKIP_testphdf5 "") +#if (HDF5_OPENMPI_VERSION_SKIP) +# set (SKIP_testphdf5 "${SKIP_testphdf5};-x;ecdsetw") +#endif () -foreach (testp ${H5P_TESTS}) - add_test (NAME TEST_PAR_${testp} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ${MPIEXEC_PREFLAGS} $<TARGET_FILE:${testp}> ${MPIEXEC_POSTFLAGS}) +add_test (NAME MPI_TEST_testphdf5 COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ${MPIEXEC_PREFLAGS} $<TARGET_FILE:testphdf5> ${MPIEXEC_POSTFLAGS} ${SKIP_testphdf5}) +set_tests_properties (MPI_TEST_testphdf5 PROPERTIES + FIXTURES_REQUIRED par_clear_testphdf5 + ENVIRONMENT "HDF5_ALARM_SECONDS=3600;srcdir=${HDF5_TEST_PAR_BINARY_DIR}" + WORKING_DIRECTORY ${HDF5_TEST_PAR_BINARY_DIR} +) +if (last_test) + set_tests_properties (MPI_TEST_testphdf5 PROPERTIES DEPENDS ${last_test}) +endif () +set (last_test "MPI_TEST_testphdf5") + +#if (HDF5_OPENMPI_VERSION_SKIP) +# list (REMOVE_ITEM H5P_TESTS t_shapesame) +#endif () + +set (test_par_CLEANFILES + t_cache_image_00.h5 + t_cache_image_01.h5 + t_cache_image_02.h5 + flush.h5 + noflush.h5 + reloc_t_pread_data_file.h5 + reloc_t_pread_group_0_file.h5 + reloc_t_pread_group_1_file.h5 + shutdown.h5 + after_mpi_fin.h5 + #the following should have been removed by the programs + bigio_test.h5 + CacheTestDummy.h5 + t_filters_parallel.h5 + MPItest.h5 + ShapeSameTest.h5 +) + +# Remove any output file left over from previous test run +add_test (NAME MPI_TEST-clear-objects + COMMAND ${CMAKE_COMMAND} + -E remove + ${test_par_CLEANFILES} + WORKING_DIRECTORY + ${HDF5_TEST_PAR_BINARY_DIR} +) +set_tests_properties (MPI_TEST-clear-objects PROPERTIES FIXTURES_SETUP par_clear_objects) + +foreach (h5_testp ${H5P_TESTS}) + add_test (NAME MPI_TEST_${h5_testp} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ${MPIEXEC_PREFLAGS} $<TARGET_FILE:${h5_testp}> ${MPIEXEC_POSTFLAGS}) + set_tests_properties (MPI_TEST_${h5_testp} PROPERTIES + FIXTURES_REQUIRED par_clear_objects + ENVIRONMENT "HDF5_ALARM_SECONDS=3600;srcdir=${HDF5_TEST_PAR_BINARY_DIR}" + WORKING_DIRECTORY ${HDF5_TEST_PAR_BINARY_DIR} + ) + if (last_test) + set_tests_properties (MPI_TEST_${h5_testp} PROPERTIES DEPENDS ${last_test}) + endif () + set (last_test "MPI_TEST_${h5_testp}") endforeach () # The t_pflush1 test is hard-coded to fail. -set_tests_properties (TEST_PAR_t_pflush1 PROPERTIES WILL_FAIL "true") -#set_property (TEST TEST_PAR_t_pflush1 PROPERTY PASS_REGULAR_EXPRESSION "PASSED") -set_tests_properties (TEST_PAR_t_pflush2 PROPERTIES DEPENDS TEST_PAR_t_pflush1) +set_tests_properties (MPI_TEST_t_pflush1 PROPERTIES WILL_FAIL "true") +#set_property (TEST MPI_TEST_t_pflush1 PROPERTY PASS_REGULAR_EXPRESSION "PASSED") +set_tests_properties (MPI_TEST_t_pflush2 PROPERTIES DEPENDS MPI_TEST_t_pflush1) +set_tests_properties (MPI_TEST_t_prestart PROPERTIES DEPENDS MPI_TEST_t_pshutdown) ############################################################################## ############################################################################## diff --git a/testpar/CMakeVFDTests.cmake b/testpar/CMakeVFDTests.cmake index b6b065f..c0b848b 100644 --- a/testpar/CMakeVFDTests.cmake +++ b/testpar/CMakeVFDTests.cmake @@ -33,25 +33,41 @@ set (VFD_LIST ${VFD_LIST} direct) endif () +foreach (vfdtest ${VFD_LIST}) + file (MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/${vfdtest}") +endforeach () + macro (ADD_VFD_TEST vfdname resultcode) if (NOT HDF5_ENABLE_USING_MEMCHECKER) - foreach (test ${H5P_VFD_TESTS}) + foreach (h5_test ${H5P_VFD_TESTS}) + add_test ( + NAME MPI_TEST_VFD-${vfdname}-${h5_test}-clear-objects + COMMAND ${CMAKE_COMMAND} + -E remove + ${vfdname}-shared/${vfdname}-${h5_test}.out + ${vfdname}-shared/${vfdname}-${h5_test}.out.err + ) add_test ( - NAME TEST_PAR_VFD-${vfdname}-${test} + NAME MPI_TEST_VFD-${vfdname}-${h5_test} COMMAND "${CMAKE_COMMAND}" - -D "TEST_PROGRAM=$<TARGET_FILE:${test}>" + -D "TEST_PROGRAM=$<TARGET_FILE:${h5_test}>" -D "TEST_ARGS:STRING=" -D "TEST_VFD:STRING=${vfdname}" -D "TEST_EXPECT=${resultcode}" - -D "TEST_OUTPUT=${test}" - -D "TEST_FOLDER=${PROJECT_BINARY_DIR}" + -D "TEST_OUTPUT=${vfdname}-${h5_test}.out" + -D "TEST_FOLDER=${PROJECT_BINARY_DIR}/${vfdname}" -P "${HDF_RESOURCES_DIR}/vfdTest.cmake" ) + set_tests_properties (MPI_TEST_VFD-${vfdname}-${h5_test} PROPERTIES + DEPENDS MPI_TEST_VFD-${vfdname}-${h5_test}-clear-objects + ENVIRONMENT "srcdir=${HDF5_TEST_PAR_BINARY_DIR}/${vfdname}" + WORKING_DIRECTORY ${HDF5_TEST_PAR_BINARY_DIR}/${vfdname} + ) endforeach () endif () endmacro () # Run test with different Virtual File Driver - foreach (vfd ${VFD_LIST}) - ADD_VFD_TEST (${vfd} 0) + foreach (h5_vfd ${VFD_LIST}) + ADD_VFD_TEST (${h5_vfd} 0) endforeach () diff --git a/testpar/t_cache.c b/testpar/t_cache.c index 7488728..41c95e2 100644 --- a/testpar/t_cache.c +++ b/testpar/t_cache.c @@ -7225,7 +7225,7 @@ smoke_check_6(int metadata_write_strategy) /* some error occured in the server -- report failure */ nerrors++; if ( verbose ) { - HDfprintf(stdout, "%d:%s: server_main() failed.\n", + HDfprintf(stdout, "%d:%s: server_main() failed.\n", world_mpi_rank, FUNC); } } @@ -7241,7 +7241,7 @@ smoke_check_6(int metadata_write_strategy) fid = -1; cache_ptr = NULL; if ( verbose ) { - HDfprintf(stdout, "%d:%s: setup_cache_for_test() failed.\n", + HDfprintf(stdout, "%d:%s: setup_cache_for_test() failed.\n", world_mpi_rank, FUNC); } } @@ -7250,7 +7250,7 @@ smoke_check_6(int metadata_write_strategy) virt_num_data_entries = NUM_DATA_ENTRIES; /* insert the first half collectively */ - file_ptr->coll_md_read = H5P_USER_TRUE; + H5CX_set_coll_metadata_read(TRUE); for ( i = 0; i < virt_num_data_entries/2; i++ ) { struct datum * entry_ptr; @@ -7271,7 +7271,7 @@ smoke_check_6(int metadata_write_strategy) } /* insert the other half independently */ - file_ptr->coll_md_read = H5P_USER_FALSE; + H5CX_set_coll_metadata_read(FALSE); for ( i = virt_num_data_entries/2; i < virt_num_data_entries; i++ ) { struct datum * entry_ptr; @@ -7291,7 +7291,7 @@ smoke_check_6(int metadata_write_strategy) HDassert(cache_ptr->max_cache_size*0.8 > cache_ptr->coll_list_size); } - /* flush the file */ + /* flush the file */ if ( H5Fflush(fid, H5F_SCOPE_GLOBAL) < 0 ) { nerrors++; if ( verbose ) { @@ -7301,7 +7301,7 @@ smoke_check_6(int metadata_write_strategy) } /* Protect the first half of the entries collectively */ - file_ptr->coll_md_read = H5P_USER_TRUE; + H5CX_set_coll_metadata_read(TRUE); for ( i = 0; i < (virt_num_data_entries / 2); i++ ) { struct datum * entry_ptr; @@ -7322,13 +7322,13 @@ smoke_check_6(int metadata_write_strategy) } /* protect the other half independently */ - file_ptr->coll_md_read = H5P_USER_FALSE; + H5CX_set_coll_metadata_read(FALSE); for ( i = virt_num_data_entries/2; i < virt_num_data_entries; i++ ) { struct datum * entry_ptr; entry_ptr = &(data[i]); - lock_entry(file_ptr, i); + lock_entry(file_ptr, i); if(FALSE != entry_ptr->header.coll_access) { nerrors++; diff --git a/testpar/t_coll_md_read.c b/testpar/t_coll_md_read.c index f945d2b..912388c 100644 --- a/testpar/t_coll_md_read.c +++ b/testpar/t_coll_md_read.c @@ -32,6 +32,14 @@ #define PARTIAL_NO_SELECTION_Y_DIM_SCALE 5 #define PARTIAL_NO_SELECTION_X_DIM_SCALE 5 +#define MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS 2 + +#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_NO_SEL_PROCESS (mpi_rank == mpi_size - 1) +#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue" +#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE 20000 +#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE 1 +#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS 1 + /* * A test for issue HDFFV-10501. A parallel hang was reported which occurred * in linked-chunk I/O when collective metadata reads are enabled and some ranks @@ -57,13 +65,13 @@ void test_partial_no_selection_coll_md_read(void) hsize_t stride[PARTIAL_NO_SELECTION_DATASET_NDIMS]; hsize_t count[PARTIAL_NO_SELECTION_DATASET_NDIMS]; hsize_t block[PARTIAL_NO_SELECTION_DATASET_NDIMS]; - hid_t file_id = -1; - hid_t fapl_id = -1; - hid_t dset_id = -1; - hid_t dcpl_id = -1; - hid_t dxpl_id = -1; - hid_t fspace_id = -1; - hid_t mspace_id = -1; + hid_t file_id = H5I_INVALID_HID; + hid_t fapl_id = H5I_INVALID_HID; + hid_t dset_id = H5I_INVALID_HID; + hid_t dcpl_id = H5I_INVALID_HID; + hid_t dxpl_id = H5I_INVALID_HID; + hid_t fspace_id = H5I_INVALID_HID; + hid_t mspace_id = H5I_INVALID_HID; int mpi_rank, mpi_size; void *data = NULL; void *read_buf = NULL; @@ -86,7 +94,7 @@ void test_partial_no_selection_coll_md_read(void) file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); VRFY((file_id >= 0), "H5Fcreate succeeded"); - dataset_dims = malloc(PARTIAL_NO_SELECTION_DATASET_NDIMS * sizeof(*dataset_dims)); + dataset_dims = HDmalloc(PARTIAL_NO_SELECTION_DATASET_NDIMS * sizeof(*dataset_dims)); VRFY((dataset_dims != NULL), "malloc succeeded"); dataset_dims[0] = PARTIAL_NO_SELECTION_Y_DIM_SCALE * mpi_size; @@ -129,7 +137,7 @@ void test_partial_no_selection_coll_md_read(void) mspace_id = H5Screate_simple(1, sel_dims, NULL); VRFY((mspace_id >= 0), "H5Screate_simple succeeded"); - data = calloc(1, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int)); + data = HDcalloc(1, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int)); VRFY((data != NULL), "calloc succeeded"); dxpl_id = H5Pcreate(H5P_DATASET_XFER); @@ -151,7 +159,7 @@ void test_partial_no_selection_coll_md_read(void) */ VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO) >= 0), "H5Pset_dxpl_mpio_chunk_opt succeeded"); - read_buf = malloc(count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int)); + read_buf = HDmalloc(count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int)); VRFY((read_buf != NULL), "malloc succeeded"); /* @@ -171,21 +179,321 @@ void test_partial_no_selection_coll_md_read(void) * Check data integrity just to be sure. */ if (!PARTIAL_NO_SELECTION_NO_SEL_PROCESS) { - VRFY((!memcmp(data, read_buf, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int))), "memcmp succeeded"); + VRFY((!HDmemcmp(data, read_buf, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int))), "memcmp succeeded"); + } + + if (dataset_dims) { + HDfree(dataset_dims); + dataset_dims = NULL; } + if (data) { + HDfree(data); + data = NULL; + } + + if (read_buf) { + HDfree(read_buf); + read_buf = NULL; + } + + VRFY((H5Sclose(fspace_id) >= 0), "H5Sclose succeeded"); + VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded"); + VRFY((H5Pclose(dcpl_id) >= 0), "H5Pclose succeeded"); + VRFY((H5Pclose(dxpl_id) >= 0), "H5Pclose succeeded"); + VRFY((H5Dclose(dset_id) >= 0), "H5Dclose succeeded"); + VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded"); + VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded"); +} + +/* + * A test for HDFFV-10562 which attempts to verify that using multi-chunk + * I/O with collective metadata reads enabled doesn't causes issues due to + * collective metadata reads being made only by process 0 in H5D__chunk_addrmap(). + * + * Failure in this test may either cause a hang, or, due to how the MPI calls + * pertaining to this issue might mistakenly match up, may cause an MPI error + * message similar to: + * + * #008: H5Dmpio.c line 2546 in H5D__obtain_mpio_mode(): MPI_BCast failed + * major: Internal error (too specific to document in detail) + * minor: Some MPI function failed + * #009: H5Dmpio.c line 2546 in H5D__obtain_mpio_mode(): Message truncated, error stack: + *PMPI_Bcast(1600)..................: MPI_Bcast(buf=0x1df98e0, count=18, MPI_BYTE, root=0, comm=0x84000006) failed + *MPIR_Bcast_impl(1452).............: + *MPIR_Bcast(1476)..................: + *MPIR_Bcast_intra(1249)............: + *MPIR_SMP_Bcast(1088)..............: + *MPIR_Bcast_binomial(239)..........: + *MPIDI_CH3U_Receive_data_found(131): Message from rank 0 and tag 2 truncated; 2616 bytes received but buffer size is 18 + * major: Internal error (too specific to document in detail) + * minor: MPI Error String + * + */ +void test_multi_chunk_io_addrmap_issue(void) +{ + const char *filename; + hsize_t start[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS]; + hsize_t stride[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS]; + hsize_t count[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS]; + hsize_t block[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS]; + hsize_t dims[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS] = {10, 5}; + hsize_t chunk_dims[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS] = {5, 5}; + hsize_t max_dims[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS] = {H5S_UNLIMITED, H5S_UNLIMITED}; + hid_t file_id = H5I_INVALID_HID; + hid_t fapl_id = H5I_INVALID_HID; + hid_t dset_id = H5I_INVALID_HID; + hid_t dcpl_id = H5I_INVALID_HID; + hid_t dxpl_id = H5I_INVALID_HID; + hid_t space_id = H5I_INVALID_HID; + void *read_buf = NULL; + int mpi_rank; + int data[5][5] = { {0, 1, 2, 3, 4}, + {0, 1, 2, 3, 4}, + {0, 1, 2, 3, 4}, + {0, 1, 2, 3, 4}, + {0, 1, 2, 3, 4} }; + + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + filename = GetTestParameters(); + + fapl_id = create_faccess_plist(MPI_COMM_WORLD, MPI_INFO_NULL, facc_type); + VRFY((fapl_id >= 0), "create_faccess_plist succeeded"); + + /* + * Even though the testphdf5 framework currently sets collective metadata reads + * on the FAPL, we call it here just to be sure this is futureproof, since + * demonstrating this issue relies upon it. + */ + VRFY((H5Pset_all_coll_metadata_ops(fapl_id, true) >= 0), "Set collective metadata reads succeeded"); + + file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + VRFY((file_id >= 0), "H5Fcreate succeeded"); + + space_id = H5Screate_simple(MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS, dims, max_dims); + VRFY((space_id >= 0), "H5Screate_simple succeeded"); + + dcpl_id = H5Pcreate(H5P_DATASET_CREATE); + VRFY((dcpl_id >= 0), "H5Pcreate succeeded"); + + VRFY((H5Pset_chunk(dcpl_id, MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS, chunk_dims) >= 0), "H5Pset_chunk succeeded"); + + dset_id = H5Dcreate2(file_id, "dset", H5T_NATIVE_INT, space_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); + VRFY((dset_id >= 0), "H5Dcreate2 succeeded"); + + dxpl_id = H5Pcreate(H5P_DATASET_XFER); + VRFY((dxpl_id >= 0), "H5Pcreate succeeded"); + + VRFY((H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE) >= 0), "H5Pset_dxpl_mpio succeeded"); + VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_MULTI_IO) >= 0), "H5Pset_dxpl_mpio_chunk_opt succeeded"); + + start[1] = 0; + stride[0] = stride[1] = 1; + count[0] = count[1] = 5; + block[0] = block[1] = 1; + + if (mpi_rank == 0) + start[0] = 0; + else + start[0] = 5; + + VRFY((H5Sselect_hyperslab(space_id, H5S_SELECT_SET, start, stride, count, block) >= 0), "H5Sselect_hyperslab succeeded"); + if (mpi_rank != 0) + VRFY((H5Sselect_none(space_id) >= 0), "H5Sselect_none succeeded"); + + VRFY((H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, space_id, dxpl_id, data) >= 0), "H5Dwrite succeeded"); + + VRFY((H5Fflush(file_id, H5F_SCOPE_GLOBAL) >= 0), "H5Fflush succeeded"); + + read_buf = HDmalloc(50 * sizeof(int)); + VRFY((read_buf != NULL), "malloc succeeded"); + + VRFY((H5Dread(dset_id, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, dxpl_id, read_buf) >= 0), "H5Dread succeeded"); + + if (read_buf) { + HDfree(read_buf); + read_buf = NULL; + } + + VRFY((H5Sclose(space_id) >= 0), "H5Sclose succeeded"); + VRFY((H5Pclose(dcpl_id) >= 0), "H5Pclose succeeded"); + VRFY((H5Pclose(dxpl_id) >= 0), "H5Pclose succeeded"); + VRFY((H5Dclose(dset_id) >= 0), "H5Dclose succeeded"); + VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded"); + VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded"); +} + +/* + * A test for HDFFV-10562 which attempts to verify that using linked-chunk + * I/O with collective metadata reads enabled doesn't cause issues due to + * collective metadata reads being made only by process 0 in H5D__sort_chunk(). + * + * NOTE: Due to the way that the threshold value which pertains to this test + * is currently calculated within HDF5, there are several conditions that this + * test must maintain. Refer to the function H5D__sort_chunk in H5Dmpio.c for + * a better idea of why. + * + * Condition 1: We need to make sure that the test always selects every single + * chunk in the dataset. It is fine if the selection is split up among multiple + * ranks, but their combined selection must cover the whole dataset. + * + * Condition 2: The number of chunks in the dataset divided by the number of MPI + * ranks must exceed or equal 10000. In other words, each MPI rank must be + * responsible for 10000 or more unique chunks. + * + * Condition 3: This test will currently only be reliably reproducable for 2 or 3 + * MPI ranks. The threshold value calculated reduces to a constant 100 / mpi_size, + * and is compared against a default value of 30%. + * + * Failure in this test may either cause a hang, or, due to how the MPI calls + * pertaining to this issue might mistakenly match up, may cause an MPI error + * message similar to: + * + * #008: H5Dmpio.c line 2338 in H5D__sort_chunk(): MPI_BCast failed + * major: Internal error (too specific to document in detail) + * minor: Some MPI function failed + * #009: H5Dmpio.c line 2338 in H5D__sort_chunk(): Other MPI error, error stack: + *PMPI_Bcast(1600)........: MPI_Bcast(buf=0x7eae610, count=320000, MPI_BYTE, root=0, comm=0x84000006) failed + *MPIR_Bcast_impl(1452)...: + *MPIR_Bcast(1476)........: + *MPIR_Bcast_intra(1249)..: + *MPIR_SMP_Bcast(1088)....: + *MPIR_Bcast_binomial(250): message sizes do not match across processes in the collective routine: Received 2096 but expected 320000 + * major: Internal error (too specific to document in detail) + * minor: MPI Error String + */ +void test_link_chunk_io_sort_chunk_issue(void) +{ + const char *filename; + hsize_t *dataset_dims = NULL; + hsize_t max_dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS]; + hsize_t sel_dims[1]; + hsize_t chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS] = { LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS }; + hsize_t start[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS]; + hsize_t stride[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS]; + hsize_t count[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS]; + hsize_t block[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS]; + hid_t file_id = H5I_INVALID_HID; + hid_t fapl_id = H5I_INVALID_HID; + hid_t dset_id = H5I_INVALID_HID; + hid_t dcpl_id = H5I_INVALID_HID; + hid_t dxpl_id = H5I_INVALID_HID; + hid_t fspace_id = H5I_INVALID_HID; + hid_t mspace_id = H5I_INVALID_HID; + int mpi_rank, mpi_size; + void *data = NULL; + void *read_buf = NULL; + + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + + filename = GetTestParameters(); + + fapl_id = create_faccess_plist(MPI_COMM_WORLD, MPI_INFO_NULL, facc_type); + VRFY((fapl_id >= 0), "create_faccess_plist succeeded"); + + /* + * Even though the testphdf5 framework currently sets collective metadata reads + * on the FAPL, we call it here just to be sure this is futureproof, since + * demonstrating this issue relies upon it. + */ + VRFY((H5Pset_all_coll_metadata_ops(fapl_id, true) >= 0), "Set collective metadata reads succeeded"); + + file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + VRFY((file_id >= 0), "H5Fcreate succeeded"); + + dataset_dims = HDmalloc(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS * sizeof(*dataset_dims)); + VRFY((dataset_dims != NULL), "malloc succeeded"); + + dataset_dims[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE * mpi_size * LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE; + max_dataset_dims[0] = H5S_UNLIMITED; + + fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, max_dataset_dims); + VRFY((fspace_id >= 0), "H5Screate_simple succeeded"); + + /* + * Set up chunking on the dataset in order to reproduce the problem. + */ + dcpl_id = H5Pcreate(H5P_DATASET_CREATE); + VRFY((dcpl_id >= 0), "H5Pcreate succeeded"); + + VRFY((H5Pset_chunk(dcpl_id, LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, chunk_dims) >= 0), "H5Pset_chunk succeeded"); + + dset_id = H5Dcreate2(file_id, LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME, H5T_NATIVE_INT, fspace_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); + VRFY((dset_id >= 0), "H5Dcreate2 succeeded"); + + /* + * Setup hyperslab selection to split the dataset among the ranks. + * + * The ranks will write rows across the dataset. + */ + stride[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE; + count[0] = (dataset_dims[0] / LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) / mpi_size; + start[0] = count[0] * mpi_rank; + block[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE; + + VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0), "H5Sselect_hyperslab succeeded"); + + sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE); + + mspace_id = H5Screate_simple(1, sel_dims, NULL); + VRFY((mspace_id >= 0), "H5Screate_simple succeeded"); + + data = HDcalloc(1, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int)); + VRFY((data != NULL), "calloc succeeded"); + + dxpl_id = H5Pcreate(H5P_DATASET_XFER); + VRFY((dxpl_id >= 0), "H5Pcreate succeeded"); + + /* + * Enable collective access for the data transfer. + */ + VRFY((H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE) >= 0), "H5Pset_dxpl_mpio succeeded"); + + VRFY((H5Dwrite(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, data) >= 0), "H5Dwrite succeeded"); + + VRFY((H5Fflush(file_id, H5F_SCOPE_GLOBAL) >= 0), "H5Fflush succeeded"); + + /* + * Ensure that linked-chunk I/O is performed since this is + * the particular code path where the issue lies and we don't + * want the library doing multi-chunk I/O behind our backs. + */ + VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO) >= 0), "H5Pset_dxpl_mpio_chunk_opt succeeded"); + + read_buf = HDmalloc(count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int)); + VRFY((read_buf != NULL), "malloc succeeded"); + + VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0), "H5Sselect_hyperslab succeeded"); + + sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE); + + VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded"); + + mspace_id = H5Screate_simple(1, sel_dims, NULL); + VRFY((mspace_id >= 0), "H5Screate_simple succeeded"); + + read_buf = HDrealloc(read_buf, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int)); + VRFY((read_buf != NULL), "realloc succeeded"); + + /* + * Finally have each rank read their section of data back from the dataset. + */ + VRFY((H5Dread(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, read_buf) >= 0), "H5Dread succeeded"); + if (dataset_dims) { - free(dataset_dims); + HDfree(dataset_dims); dataset_dims = NULL; } if (data) { - free(data); + HDfree(data); data = NULL; } if (read_buf) { - free(read_buf); + HDfree(read_buf); read_buf = NULL; } diff --git a/testpar/t_dset.c b/testpar/t_dset.c index 35501d8..d4e556d 100644 --- a/testpar/t_dset.c +++ b/testpar/t_dset.c @@ -2659,8 +2659,10 @@ compress_readAll(void) nerrors++; } +#if MPI_VERSION >= 3 ret = H5Dwrite(dataset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, xfer_plist, data_read); VRFY((ret >= 0), "H5Dwrite succeeded"); +#endif ret = H5Pclose(xfer_plist); VRFY((ret >= 0), "H5Pclose succeeded"); diff --git a/testpar/t_shapesame.c b/testpar/t_shapesame.c index eddbada..f5282bd 100644 --- a/testpar/t_shapesame.c +++ b/testpar/t_shapesame.c @@ -948,9 +948,9 @@ contig_hs_dr_pio_test__d2m_l2s(struct hs_dr_pio_test_vars_t * tv_ptr) /* verify that H5S_select_shape_same() reports the two * selections as having the same shape. */ - check = H5S_select_shape_same_test(tv_ptr->small_ds_slice_sid, + check = H5S__select_shape_same_test(tv_ptr->small_ds_slice_sid, tv_ptr->file_large_ds_sid_0); - VRFY((check == TRUE), "H5S_select_shape_same_test passed"); + VRFY((check == TRUE), "H5S__select_shape_same_test passed"); /* Read selection from disk */ @@ -1216,9 +1216,9 @@ contig_hs_dr_pio_test__d2m_s2l(struct hs_dr_pio_test_vars_t * tv_ptr) /* verify that H5S_select_shape_same() reports the two * selections as having the same shape. */ - check = H5S_select_shape_same_test(tv_ptr->file_small_ds_sid_0, + check = H5S__select_shape_same_test(tv_ptr->file_small_ds_sid_0, tv_ptr->mem_large_ds_sid); - VRFY((check == TRUE), "H5S_select_shape_same_test passed"); + VRFY((check == TRUE), "H5S__select_shape_same_test passed"); /* Read selection from disk */ @@ -1531,9 +1531,9 @@ contig_hs_dr_pio_test__m2d_l2s(struct hs_dr_pio_test_vars_t * tv_ptr) * memory slice through the cube selection and the * on disk full square selections as having the same shape. */ - check = H5S_select_shape_same_test(tv_ptr->file_small_ds_sid_0, + check = H5S__select_shape_same_test(tv_ptr->file_small_ds_sid_0, tv_ptr->mem_large_ds_sid); - VRFY((check == TRUE), "H5S_select_shape_same_test passed."); + VRFY((check == TRUE), "H5S__select_shape_same_test passed."); /* write the slice from the in memory large data set to the @@ -1864,9 +1864,9 @@ contig_hs_dr_pio_test__m2d_s2l(struct hs_dr_pio_test_vars_t * tv_ptr) * on disk slice through the large data set selection * as having the same shape. */ - check = H5S_select_shape_same_test(tv_ptr->mem_small_ds_sid, + check = H5S__select_shape_same_test(tv_ptr->mem_small_ds_sid, tv_ptr->file_large_ds_sid_0); - VRFY((check == TRUE), "H5S_select_shape_same_test passed"); + VRFY((check == TRUE), "H5S__select_shape_same_test passed"); /* write the small data set slice from memory to the @@ -3149,9 +3149,9 @@ ckrbrd_hs_dr_pio_test__d2m_l2s(struct hs_dr_pio_test_vars_t * tv_ptr) /* verify that H5S_select_shape_same() reports the two * selections as having the same shape. */ - check = H5S_select_shape_same_test(tv_ptr->small_ds_slice_sid, + check = H5S__select_shape_same_test(tv_ptr->small_ds_slice_sid, tv_ptr->file_large_ds_sid_0); - VRFY((check == TRUE), "H5S_select_shape_same_test passed"); + VRFY((check == TRUE), "H5S__select_shape_same_test passed"); /* Read selection from disk */ @@ -3415,9 +3415,9 @@ ckrbrd_hs_dr_pio_test__d2m_s2l(struct hs_dr_pio_test_vars_t * tv_ptr) /* verify that H5S_select_shape_same() reports the two * selections as having the same shape. */ - check = H5S_select_shape_same_test(tv_ptr->file_small_ds_sid_0, + check = H5S__select_shape_same_test(tv_ptr->file_small_ds_sid_0, tv_ptr->mem_large_ds_sid); - VRFY((check == TRUE), "H5S_select_shape_same_test passed"); + VRFY((check == TRUE), "H5S__select_shape_same_test passed"); /* Read selection from disk */ @@ -3800,9 +3800,9 @@ ckrbrd_hs_dr_pio_test__m2d_l2s(struct hs_dr_pio_test_vars_t * tv_ptr) * large dataset and the checkerboard selection of the process * slice of the small data set as having the same shape. */ - check = H5S_select_shape_same_test(tv_ptr->file_small_ds_sid_1, + check = H5S__select_shape_same_test(tv_ptr->file_small_ds_sid_1, tv_ptr->mem_large_ds_sid); - VRFY((check == TRUE), "H5S_select_shape_same_test passed."); + VRFY((check == TRUE), "H5S__select_shape_same_test passed."); /* write the checker board selection of the slice from the in @@ -4155,9 +4155,9 @@ ckrbrd_hs_dr_pio_test__m2d_s2l(struct hs_dr_pio_test_vars_t * tv_ptr) * on disk slice through the large data set selection * as having the same shape. */ - check = H5S_select_shape_same_test(tv_ptr->mem_small_ds_sid, + check = H5S__select_shape_same_test(tv_ptr->mem_small_ds_sid, tv_ptr->file_large_ds_sid_1); - VRFY((check == TRUE), "H5S_select_shape_same_test passed"); + VRFY((check == TRUE), "H5S__select_shape_same_test passed"); /* write the small data set slice from memory to the diff --git a/testpar/t_span_tree.c b/testpar/t_span_tree.c index 02d2cca..cc79af2 100644 --- a/testpar/t_span_tree.c +++ b/testpar/t_span_tree.c @@ -2334,9 +2334,9 @@ lower_dim_size_comp_test__run_test(const int chunk_edge_size, /* verify that H5S_select_shape_same() reports the two * selections as having the same shape. */ - check = H5S_select_shape_same_test(mem_large_ds_sid, + check = H5S__select_shape_same_test(mem_large_ds_sid, file_small_ds_sid); - VRFY((check == TRUE), "H5S_select_shape_same_test passed (1)"); + VRFY((check == TRUE), "H5S__select_shape_same_test passed (1)"); ret = H5Dread(small_dataset, @@ -2455,9 +2455,9 @@ lower_dim_size_comp_test__run_test(const int chunk_edge_size, /* verify that H5S_select_shape_same() reports the two * selections as having the same shape. */ - check = H5S_select_shape_same_test(mem_small_ds_sid, + check = H5S__select_shape_same_test(mem_small_ds_sid, file_large_ds_sid); - VRFY((check == TRUE), "H5S_select_shape_same_test passed (2)"); + VRFY((check == TRUE), "H5S__select_shape_same_test passed (2)"); ret = H5Dread(large_dataset, @@ -2622,31 +2622,27 @@ void lower_dim_size_comp_test(void) { /* const char *fcnName = "lower_dim_size_comp_test()"; */ - int chunk_edge_size = 0; - int use_collective_io = 1; - hid_t dset_type = H5T_NATIVE_UINT; + int chunk_edge_size = 0; + int use_collective_io; + #if 0 HDsleep(60); #endif - HDcompile_assert(sizeof(uint32_t) == sizeof(unsigned)); - for ( use_collective_io = (hbool_t)0; - (int)use_collective_io <= 1; - (hbool_t)(use_collective_io++) ) { + HDcompile_assert(sizeof(uint32_t) == sizeof(unsigned)); + for(use_collective_io = 0; use_collective_io <= 1; use_collective_io++) { chunk_edge_size = 0; lower_dim_size_comp_test__run_test(chunk_edge_size, (hbool_t)use_collective_io, - dset_type); - + H5T_NATIVE_UINT); chunk_edge_size = 5; lower_dim_size_comp_test__run_test(chunk_edge_size, (hbool_t)use_collective_io, - dset_type); - } + H5T_NATIVE_UINT); + } /* end for */ return; - } /* lower_dim_size_comp_test() */ diff --git a/testpar/testphdf5.c b/testpar/testphdf5.c index 69b66ae..999e17a 100644 --- a/testpar/testphdf5.c +++ b/testpar/testphdf5.c @@ -549,7 +549,10 @@ int main(int argc, char **argv) AddTest("noselcollmdread", test_partial_no_selection_coll_md_read, NULL, "Collective Metadata read with some ranks having no selection", PARATESTFILE); - + AddTest("MC coll MD read", test_multi_chunk_io_addrmap_issue, NULL, + "Collective MD read with multi chunk I/O (H5D__chunk_addrmap)", PARATESTFILE); + AddTest("LC coll MD read", test_link_chunk_io_sort_chunk_issue, NULL, + "Collective MD read with link chunk I/O (H5D__sort_chunk)", PARATESTFILE); /* Display testing information */ TestInfo(argv[0]); diff --git a/testpar/testphdf5.h b/testpar/testphdf5.h index 176574e..4409221 100644 --- a/testpar/testphdf5.h +++ b/testpar/testphdf5.h @@ -295,6 +295,8 @@ void compress_readAll(void); #endif /* H5_HAVE_FILTER_DEFLATE */ void test_dense_attr(void); void test_partial_no_selection_coll_md_read(void); +void test_multi_chunk_io_addrmap_issue(void); +void test_link_chunk_io_sort_chunk_issue(void); /* commonly used prototypes */ hid_t create_faccess_plist(MPI_Comm comm, MPI_Info info, int l_facc_type); |