diff options
-rw-r--r-- | MANIFEST | 1 | ||||
-rw-r--r-- | release_docs/RELEASE.txt | 25 | ||||
-rw-r--r-- | src/H5Fsuper.c | 133 | ||||
-rw-r--r-- | testpar/CMakeLists.txt | 1 | ||||
-rw-r--r-- | testpar/Makefile.am | 2 | ||||
-rw-r--r-- | testpar/t_pread.c | 904 |
6 files changed, 1022 insertions, 44 deletions
@@ -1244,6 +1244,7 @@ ./testpar/t_ph5basic.c ./testpar/t_pflush1.c ./testpar/t_pflush2.c +./testpar/t_pread.c ./testpar/t_prop.c ./testpar/t_shapesame.c ./testpar/t_pshutdown.c diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index db946a0..db3e4e0 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -62,6 +62,31 @@ New Features Parallel Library: ----------------- + - Optimize parallel open/location of the HDF5 super-block + + Previous releases of PHDF5 required all parallel ranks to + search for the HDF5 superblock signature when opening the + file. As this is accomplished more or less as a synchronous + operation, a large number of processes can experience a + slowdown in the file open due to filesystem contention. + + As a first step in improving the startup/file-open performance, + we allow MPI rank 0 of the associated MPI communicator to locate + the base offset of the super-block and then broadcast that result + to the remaining ranks in the parallel group. Note that this + approach is utilized ONLY during file opens which employ the MPIO + file driver in HDF5 by previously having called H5Pset_fapl_mpio(). + + HDF5 parallel file operations which do not employ multiple ranks + e.g. specifiying MPI_COMM_SELF (whose MPI_Comm_size == 1) + as opposed to MPI_COMM_WORLD, will not be affected by this + optimization. Conversely, parallel file operations on subgroups + of MPI_COMM_WORLD are allowed to be run in parallel with each + subgroup operating as an independant collection of processes. + + (RAW – 2017/10/10, HDFFV-10294) + + - Large MPI-IO transfers Previous releases of PHDF5 would fail when attempting to diff --git a/src/H5Fsuper.c b/src/H5Fsuper.c index 942a7ed..0c6f9cd 100644 --- a/src/H5Fsuper.c +++ b/src/H5Fsuper.c @@ -21,15 +21,15 @@ /***********/ /* Headers */ /***********/ -#include "H5private.h" /* Generic Functions */ +#include "H5private.h" /* Generic Functions */ #include "H5ACprivate.h" /* Metadata cache */ -#include "H5Eprivate.h" /* Error handling */ +#include "H5Eprivate.h" /* Error handling */ #include "H5Fpkg.h" /* File access */ -#include "H5FDprivate.h" /* File drivers */ -#include "H5Iprivate.h" /* IDs */ +#include "H5FDprivate.h" /* File drivers */ +#include "H5Iprivate.h" /* IDs */ #include "H5MFprivate.h" /* File memory management */ -#include "H5MMprivate.h" /* Memory management */ -#include "H5Pprivate.h" /* Property lists */ +#include "H5MMprivate.h" /* Memory management */ +#include "H5Pprivate.h" /* Property lists */ #include "H5SMprivate.h" /* Shared Object Header Messages */ @@ -158,7 +158,7 @@ H5F_super_ext_open(H5F_t *f, haddr_t ext_addr, H5O_loc_t *ext_ptr) /* Open the superblock extension object header */ if(H5O_open(ext_ptr) < 0) - HGOTO_ERROR(H5E_OHDR, H5E_CANTOPENOBJ, FAIL, "unable to open superblock extension") + HGOTO_ERROR(H5E_OHDR, H5E_CANTOPENOBJ, FAIL, "unable to open superblock extension") done: FUNC_LEAVE_NOAPI(ret_value) @@ -224,12 +224,12 @@ done: /*------------------------------------------------------------------------- * Function: H5F__update_super_ext_driver_msg * - * Purpose: Update the superblock extension file driver info message if - * we are using a V 2 superblock. Observe that the function - * is a NO-OP if the file driver info message does not exist. + * Purpose: Update the superblock extension file driver info message if + * we are using a V 2 superblock. Observe that the function + * is a NO-OP if the file driver info message does not exist. * This is necessary, as the function is called whenever the - * EOA is updated, and were it to create the file driver info - * message, it would find itself in an infinite recursion. + * EOA is updated, and were it to create the file driver info + * message, it would find itself in an infinite recursion. * * Return: Success: SUCCEED * Failure: FAIL @@ -267,7 +267,7 @@ H5F__update_super_ext_driver_msg(H5F_t *f, hid_t dxpl_id) /* Check for driver info */ H5_CHECKED_ASSIGN(driver_size, size_t, H5FD_sb_size(f->shared->lf), hsize_t); - /* Nothing to do unless there is both driver info and + /* Nothing to do unless there is both driver info and * the driver info superblock extension message has * already been created. */ @@ -330,9 +330,13 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial unsigned sblock_flags = H5AC__NO_FLAGS_SET; /* flags used in superblock unprotect call */ haddr_t super_addr; /* Absolute address of superblock */ haddr_t eof; /* End of file address */ - unsigned rw_flags; /* Read/write permissions for file */ - hbool_t skip_eof_check = FALSE; /* Whether to skip checking the EOF value */ + unsigned rw_flags; /* Read/write permissions for file */ + hbool_t skip_eof_check = FALSE; /* Whether to skip checking the EOF value */ herr_t ret_value = SUCCEED; /* Return value */ +#ifdef H5_HAVE_PARALLEL + int mpi_rank = 0, mpi_size = 1; + int mpi_result; +#endif /* H5_HAVE_PARALLEL */ FUNC_ENTER_PACKAGE_TAG(meta_dxpl_id, H5AC__SUPERBLOCK_TAG, FAIL) @@ -354,8 +358,51 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "can't get property list") /* Find the superblock */ - if(H5FD_locate_signature(&fdio_info, &super_addr) < 0) - HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature") +#ifdef H5_HAVE_PARALLEL + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { + + if((mpi_rank = H5F_mpi_get_rank(f)) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "Can't get MPI rank") + + if((mpi_size = H5F_mpi_get_size(f)) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't retrieve MPI communicator size") + } + + /* If we are an MPI application with at least two processes, the + * following superblock signature location optimization is applicable. + * + * Note:: For parallel applications which don't setup for using the + * HDF5 MPIO driver, we will arrive here with mpi_size == 1. + * This occurs because of the variable initialization (above) and the + * fact that we have skipped actually calling MPI functions to determine + * our MPI rank and size. + */ + if ( mpi_size > 1 ) { + MPI_Comm this_comm = MPI_COMM_NULL; + + if ( mpi_rank == 0 ) { + if (H5FD_locate_signature(&fdio_info, &super_addr) < 0) + HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature") + } + HDassert(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)); + + if ( MPI_COMM_NULL == (this_comm = H5F_mpi_get_comm(f)) ) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get MPI communicator") + + if ( MPI_SUCCESS != + (mpi_result = MPI_Bcast(&super_addr,sizeof(super_addr), MPI_BYTE, 0, this_comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result) + } + else { + /* Locate the signature as per per the serial library */ +#endif /* H5_HAVE_PARALLEL */ + + if (H5FD_locate_signature(&fdio_info, &super_addr) < 0) + HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature") + +#ifdef H5_HAVE_PARALLEL + } +#endif /* H5_HAVE_PARALLEL */ if(HADDR_UNDEF == super_addr) HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "file signature not found") @@ -406,12 +453,12 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial HGOTO_ERROR(H5E_FILE, H5E_CANTPROTECT, FAIL, "unable to load superblock") if(H5F_INTENT(f) & H5F_ACC_SWMR_WRITE) - if(sblock->super_vers < HDF5_SUPERBLOCK_VERSION_3) - HGOTO_ERROR(H5E_FILE, H5E_CANTPROTECT, FAIL, "invalid superblock version for SWMR_WRITE") + if(sblock->super_vers < HDF5_SUPERBLOCK_VERSION_3) + HGOTO_ERROR(H5E_FILE, H5E_CANTPROTECT, FAIL, "invalid superblock version for SWMR_WRITE") /* Enable all latest version support when file has v3 superblock */ if(sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_3) - f->shared->latest_flags |= H5F_LATEST_ALL_FLAGS; + f->shared->latest_flags |= H5F_LATEST_ALL_FLAGS; /* Pin the superblock in the cache */ if(H5AC_pin_protected_entry(sblock) < 0) @@ -511,15 +558,15 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial * been flushed to disk by the SWMR writer process. */ if(H5F_INTENT(f) & H5F_ACC_SWMR_READ) { - /* - * When the file is opened for SWMR read access, skip the check if: - * --the file is already marked for SWMR writing and - * --the file has version 3 superblock for SWMR support - */ - if((sblock->status_flags & H5F_SUPER_SWMR_WRITE_ACCESS) && + /* + * When the file is opened for SWMR read access, skip the check if: + * --the file is already marked for SWMR writing and + * --the file has version 3 superblock for SWMR support + */ + if((sblock->status_flags & H5F_SUPER_SWMR_WRITE_ACCESS) && (sblock->status_flags & H5F_SUPER_WRITE_ACCESS) && sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_3) - skip_eof_check = TRUE; + skip_eof_check = TRUE; } /* end if */ if(!skip_eof_check && initial_read) { if(HADDR_UNDEF == (eof = H5FD_get_eof(f->shared->lf, H5FD_MEM_DEFAULT))) @@ -593,7 +640,7 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial H5O_loc_t ext_loc; /* "Object location" for superblock extension */ H5O_btreek_t btreek; /* v1 B-tree 'K' value message from superblock extension */ H5O_drvinfo_t drvinfo; /* Driver info message from superblock extension */ - size_t u; /* Local index variable */ + size_t u; /* Local index variable */ htri_t status; /* Status for message existing */ /* Sanity check - superblock extension should only be defined for @@ -614,7 +661,7 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial } /* end if */ /* Open the superblock extension */ - if(H5F_super_ext_open(f, sblock->ext_addr, &ext_loc) < 0) + if(H5F_super_ext_open(f, sblock->ext_addr, &ext_loc) < 0) HGOTO_ERROR(H5E_FILE, H5E_CANTOPENOBJ, FAIL, "unable to open file's superblock extension") /* Check for the extension having a 'driver info' message */ @@ -637,8 +684,8 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial /* Reset driver info message */ H5O_msg_reset(H5O_DRVINFO_ID, &drvinfo); - HDassert(FALSE == f->shared->drvinfo_sb_msg_exists); - f->shared->drvinfo_sb_msg_exists = TRUE; + HDassert(FALSE == f->shared->drvinfo_sb_msg_exists); + f->shared->drvinfo_sb_msg_exists = TRUE; } /* end else */ } /* end if */ @@ -764,37 +811,37 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial } /* end if not marked "unknown" */ } /* end if */ - /* Check for the extension having a 'metadata cache image' message */ + /* Check for the extension having a 'metadata cache image' message */ if((status = H5O_msg_exists(&ext_loc, H5O_MDCI_MSG_ID, meta_dxpl_id)) < 0) HGOTO_ERROR(H5E_FILE, H5E_EXISTS, FAIL, "unable to read object header") if(status) { - hbool_t rw = ((rw_flags & H5AC__READ_ONLY_FLAG) == 0); - H5O_mdci_t mdci_msg; + hbool_t rw = ((rw_flags & H5AC__READ_ONLY_FLAG) == 0); + H5O_mdci_t mdci_msg; - /* if the metadata cache image superblock extension message exists, + /* if the metadata cache image superblock extension message exists, * read its contents and pass the data on to the metadata cache. * Given this data, the cache will load and decode the metadata - * cache image block, decoded it and load its contents into the - * the cache on the test protect call. + * cache image block, decoded it and load its contents into the + * the cache on the test protect call. * * Further, if the file is opened R/W, the metadata cache will - * delete the metadata cache image superblock extension and free - * the cache image block. Don't do this now as f->shared - * is not fully setup, which complicates matters. + * delete the metadata cache image superblock extension and free + * the cache image block. Don't do this now as f->shared + * is not fully setup, which complicates matters. */ /* Retrieve the 'metadata cache image message' structure */ - if(NULL == H5O_msg_read(&ext_loc, H5O_MDCI_MSG_ID, &mdci_msg, meta_dxpl_id)) + if(NULL == H5O_msg_read(&ext_loc, H5O_MDCI_MSG_ID, &mdci_msg, meta_dxpl_id)) HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "unable to get metadata cache image message") /* Indicate to the cache that there's an image to load on first protect call */ if(H5AC_load_cache_image_on_next_protect(f, mdci_msg.addr, mdci_msg.size, rw) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTLOAD, FAIL, "call to H5AC_load_cache_image_on_next_protect failed"); + HGOTO_ERROR(H5E_FILE, H5E_CANTLOAD, FAIL, "call to H5AC_load_cache_image_on_next_protect failed"); } /* end if */ /* Close superblock extension */ if(H5F_super_ext_close(f, &ext_loc, meta_dxpl_id, FALSE) < 0) - HGOTO_ERROR(H5E_FILE, H5E_CANTCLOSEOBJ, FAIL, "unable to close file's superblock extension") + HGOTO_ERROR(H5E_FILE, H5E_CANTCLOSEOBJ, FAIL, "unable to close file's superblock extension") } /* end if */ /* Update the driver info if VFD indicated to do so */ diff --git a/testpar/CMakeLists.txt b/testpar/CMakeLists.txt index 39d23a9..0c9f70e 100644 --- a/testpar/CMakeLists.txt +++ b/testpar/CMakeLists.txt @@ -47,6 +47,7 @@ set (H5P_TESTS t_cache t_pflush1 t_pflush2 + t_pread t_pshutdown t_prestart t_init_term diff --git a/testpar/Makefile.am b/testpar/Makefile.am index b0fe0cd..1f15830 100644 --- a/testpar/Makefile.am +++ b/testpar/Makefile.am @@ -23,7 +23,7 @@ AM_CPPFLAGS+=-I$(top_srcdir)/src -I$(top_srcdir)/test # Test programs. These are our main targets. # -TEST_PROG_PARA=t_mpi t_bigio testphdf5 t_cache t_cache_image t_pflush1 t_pflush2 t_pshutdown t_prestart t_init_term t_shapesame t_filters_parallel +TEST_PROG_PARA=t_mpi t_bigio testphdf5 t_cache t_cache_image t_pflush1 t_pflush2 t_pread t_pshutdown t_prestart t_init_term t_shapesame t_filters_parallel check_PROGRAMS = $(TEST_PROG_PARA) diff --git a/testpar/t_pread.c b/testpar/t_pread.c new file mode 100644 index 0000000..7f23b9b --- /dev/null +++ b/testpar/t_pread.c @@ -0,0 +1,904 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * Copyright by the Board of Trustees of the University of Illinois. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* + * Collective file open optimization tests + * + */ + +#include "h5test.h" +#include "testpar.h" + +/* The collection of files is included below to aid + * an external "cleanup" process if required. + * + * Note that the code below relies on the ordering of this array + * since each set of three is used by the tests either to construct + * or to read and validate. + */ +#define NFILENAME 9 +const char *FILENAMES[NFILENAME + 1]={"t_pread_data_file", + "reloc_t_pread_data_file", + "prefix_file", + "t_pread_group_0_file", + "reloc_t_pread_group_0_file", + "prefix_file_0", + "t_pread_group_1_file", + "reloc_t_pread_group_1_file", + "prefix_file_1", + NULL}; +#define FILENAME_BUF_SIZE 1024 + +#define COUNT 1000 + +hbool_t pass = true; +static const char *random_hdf5_text = +"Now is the time for all first-time-users of HDF5 to read their \ +manual or go thru the tutorials!\n\ +While you\'re at it, now is also the time to read up on MPI-IO."; + +static const char *hitchhiker_quote = +"A common mistake that people make when trying to design something\n\ +completely foolproof is to underestimate the ingenuity of complete\n\ +fools.\n"; + +static int generate_test_file(MPI_Comm comm, int mpi_rank, int group); +static int test_parallel_read(MPI_Comm comm, int mpi_rank, int group); + + +/*------------------------------------------------------------------------- + * Function: generate_test_file + * + * Purpose: This function is called to produce an HDF5 data file + * whose superblock is relocated to a non-zero offset by + * utilizing the 'h5jam' utility to write random text + * at the start of the file. Unlike simple concatenation + * of files, h5jam is used to place the superblock on a + * power-of-2 boundary. + * + * Since data will be read back and validated, we generate + * data in a predictable manner rather than randomly. + * For now, we simply use the global mpi_rank of the writing + * process as a starting component for the data generation. + * Subsequent writes are increments from the initial start + * value. + * + * In the overall scheme of running the test, we'll call + * this function twice: first as a collection of all MPI + * processes and then a second time with the processes split + * more or less in half. Each sub group will operate + * collectively on their assigned file. This split into + * subgroups validates that parallel groups can successfully + * open and read data independantly from the other parallel + * operations taking place. + * + * Return: Success: 0 + * + * Failure: 1 + * + * Programmer: Richard Warren + * 10/1/17 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static int +generate_test_file( MPI_Comm comm, int mpi_rank, int group_id ) +{ + FILE *header = NULL; + const char *fcn_name = "generate_test_file()"; + const char *failure_mssg = NULL; + const char *group_filename = NULL; + char data_filename[FILENAME_BUF_SIZE]; + char reloc_data_filename[FILENAME_BUF_SIZE]; + char prolog_filename[FILENAME_BUF_SIZE]; + int file_index; + int group_size; + int group_rank; + int local_failure = 0; + int global_failures = 0; + hsize_t count = COUNT; + hsize_t i; + hsize_t offset; + hsize_t dims[1] = {0}; + hid_t file_id = -1; + hid_t memspace = -1; + hid_t filespace = -1; + hid_t fapl_id = -1; + hid_t dxpl_id = -1; + hid_t dset_id = -1; + float nextValue; + float *data_slice = NULL; + + pass = true; + + HDassert(comm != MPI_COMM_NULL); + + if ( (MPI_Comm_rank(comm, &group_rank)) != MPI_SUCCESS) { + pass = FALSE; + failure_mssg = "generate_test_file: MPI_Comm_rank failed.\n"; + } + + if ( (MPI_Comm_size(comm, &group_size)) != MPI_SUCCESS) { + pass = FALSE; + failure_mssg = "generate_test_file: MPI_Comm_size failed.\n"; + } + + if ( mpi_rank == 0 ) { + + HDfprintf(stdout, "Constructing test files..."); + } + + /* Setup the file names + * The test specfic filenames are stored as consecutive + * array entries in the global 'FILENAMES' array above. + * Here, we simply decide on the starting index for + * file construction. The reading portion of the test + * will have a similar setup process... + */ + if ( pass ) { + if ( comm == MPI_COMM_WORLD ) { /* Test 1 */ + file_index = 0; + } + else if ( group_id == 0 ) { /* Test 2 group 0 */ + file_index = 3; + } + else { /* Test 2 group 1 */ + file_index = 6; + } + + /* The 'group_filename' is just a temp variable and + * is used to call into the h5_fixname function. No + * need to worry that we reassign it for each file! + */ + HDassert((group_filename = FILENAMES[file_index])); + + /* Assign the 'data_filename' */ + if ( h5_fixname(group_filename, H5P_DEFAULT, data_filename, + sizeof(data_filename)) == NULL ) { + pass = FALSE; + failure_mssg = "h5_fixname(0) failed.\n"; + } + } + + if ( pass ) { + + HDassert( (group_filename = FILENAMES[file_index+1]) ); + + /* Assign the 'reloc_data_filename' */ + if ( h5_fixname(group_filename, H5P_DEFAULT, reloc_data_filename, + sizeof(reloc_data_filename)) == NULL ) { + + pass = FALSE; + failure_mssg = "h5_fixname(1) failed.\n"; + } + } + + if ( pass ) { + + HDassert( (group_filename = FILENAMES[file_index+2]) ); + + /* Assign the 'prolog_filename' */ + if ( h5_fixname(group_filename, H5P_DEFAULT, prolog_filename, + sizeof(prolog_filename)) == NULL ) { + pass = FALSE; + failure_mssg = "h5_fixname(2) failed.\n"; + } + } + + /* setup data to write */ + if ( pass ) { + if ( (data_slice = (float *)HDmalloc(COUNT * sizeof(float))) == NULL ) { + pass = FALSE; + failure_mssg = "malloc of data_slice failed.\n"; + } + } + + if ( pass ) { + nextValue = (float)(mpi_rank * COUNT); + + for(i=0; i<COUNT; i++) { + data_slice[i] = nextValue; + nextValue += 1; + } + } + + /* setup FAPL */ + if ( pass ) { + if ( (fapl_id = H5Pcreate(H5P_FILE_ACCESS)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Pcreate(H5P_FILE_ACCESS) failed.\n"; + } + } + + if ( pass ) { + if ( (H5Pset_fapl_mpio(fapl_id, comm, MPI_INFO_NULL)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Pset_fapl_mpio() failed\n"; + } + } + + /* create the data file */ + if ( pass ) { + if ( (file_id = H5Fcreate(data_filename, H5F_ACC_TRUNC, + H5P_DEFAULT, fapl_id)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Fcreate() failed.\n"; + } + } + + /* create and write the dataset */ + if ( pass ) { + if ( (dxpl_id = H5Pcreate(H5P_DATASET_XFER)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Pcreate(H5P_DATASET_XFER) failed.\n"; + } + } + + if ( pass ) { + if ( (H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Pset_dxpl_mpio() failed.\n"; + } + } + + if ( pass ) { + dims[0] = COUNT; + if ( (memspace = H5Screate_simple(1, dims, NULL)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Screate_simple(1, dims, NULL) failed (1).\n"; + } + } + + if ( pass ) { + dims[0] *= (hsize_t)group_size; + if ( (filespace = H5Screate_simple(1, dims, NULL)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Screate_simple(1, dims, NULL) failed (2).\n"; + } + } + + if ( pass ) { + offset = (hsize_t)group_rank * (hsize_t)COUNT; + if ( (H5Sselect_hyperslab(filespace, H5S_SELECT_SET, &offset, + NULL, &count, NULL)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Sselect_hyperslab() failed.\n"; + } + } + + if ( pass ) { + if ( (dset_id = H5Dcreate2(file_id, "dataset0", H5T_NATIVE_FLOAT, + filespace, H5P_DEFAULT, H5P_DEFAULT, + H5P_DEFAULT)) < 0 ) { + pass = false; + failure_mssg = "H5Dcreate2() failed.\n"; + } + } + + if ( pass ) { + if ( (H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, + filespace, dxpl_id, data_slice)) < 0 ) { + pass = false; + failure_mssg = "H5Dwrite() failed.\n"; + } + } + + /* close file, etc. */ + if ( pass || (dset_id != -1)) { + if ( H5Dclose(dset_id) < 0 ) { + pass = false; + failure_mssg = "H5Dclose(dset_id) failed.\n"; + } + } + + if ( pass || (memspace != -1) ) { + if ( H5Sclose(memspace) < 0 ) { + pass = false; + failure_mssg = "H5Sclose(memspace) failed.\n"; + } + } + + if ( pass || (filespace != -1) ) { + if ( H5Sclose(filespace) < 0 ) { + pass = false; + failure_mssg = "H5Sclose(filespace) failed.\n"; + } + } + + if ( pass || (file_id != -1) ) { + if ( H5Fclose(file_id) < 0 ) { + pass = false; + failure_mssg = "H5Fclose(file_id) failed.\n"; + } + } + + if ( pass || (dxpl_id != -1) ) { + if ( H5Pclose(dxpl_id) < 0 ) { + pass = false; + failure_mssg = "H5Pclose(dxpl_id) failed.\n"; + } + } + + if ( pass || (fapl_id != -1) ) { + if ( H5Pclose(fapl_id) < 0 ) { + pass = false; + failure_mssg = "H5Pclose(fapl_id) failed.\n"; + } + } + + /* Add a userblock to the head of the datafile. + * We will use this to for a functional test of the + * file open optimization. This is superblock + * relocation is done by the rank 0 process associated + * with the communicator being used. For test 1, we + * utilize MPI_COMM_WORLD, so group_rank 0 is the + * same as mpi_rank 0. For test 2 which utilizes + * two groups resulting from an MPI_Comm_split, we + * will have parallel groups and hence two + * group_rank(0) processes. Each parallel group + * will create a unique file with different text + * headers and different data. + * + * We also delete files that are no longer needed. + */ + if ( group_rank == 0 ) { + + const char *text_to_write; + size_t bytes_to_write; + + if (group_id == 0) + text_to_write = random_hdf5_text; + else + text_to_write = hitchhiker_quote; + + bytes_to_write = strlen(text_to_write); + + if ( pass ) { + if ( (header = HDfopen(prolog_filename, "w+")) == NULL ) { + pass = FALSE; + failure_mssg = "HDfopen(prolog_filename, \"w+\") failed.\n"; + } + } + + if ( pass ) { + + if ( HDfwrite(text_to_write, 1, bytes_to_write, header) != + bytes_to_write ) { + pass = FALSE; + failure_mssg = "Unable to write header file.\n"; + } + } + + if ( pass || (header != NULL) ) { + if ( HDfclose(header) != 0 ) { + pass = FALSE; + failure_mssg = "HDfclose() failed.\n"; + } + } + + if ( pass ) { + char cmd[256]; + + HDsprintf(cmd, "../tools/src/h5jam/h5jam -i %s -u %s -o %s", + data_filename, prolog_filename, reloc_data_filename); + + if ( system(cmd) != 0 ) { + pass = FALSE; + failure_mssg = "invocation of h5jam failed.\n"; + } + } + + HDremove(prolog_filename); + HDremove(data_filename); + } + + /* collect results from other processes. + * Only overwrite the failure message if no preveious error + * has been detected + */ + local_failure = ( pass ? 0 : 1 ); + + /* This is a global all reduce (NOT group specific) */ + if ( MPI_Allreduce(&local_failure, &global_failures, 1, + MPI_INT, MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS ) { + if ( pass ) { + pass = FALSE; + failure_mssg = "MPI_Allreduce() failed.\n"; + } + } else if ( ( pass ) && ( global_failures > 0 ) ) { + pass = FALSE; + failure_mssg = "One or more processes report failure.\n"; + } + + /* report results */ + if ( mpi_rank == 0 ) { + if ( pass ) { + HDfprintf(stdout, "Done.\n"); + } else { + HDfprintf(stdout, "FAILED.\n"); + HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", + fcn_name, failure_mssg); + } + } + + /* free data_slice if it has been allocated */ + if ( data_slice != NULL ) { + HDfree(data_slice); + data_slice = NULL; + } + + return(! pass); + +} /* generate_test_file() */ + + +/*------------------------------------------------------------------------- + * Function: test_parallel_read + * + * Purpose: This actually tests the superblock optimization + * and covers the two primary cases we're interested in. + * 1). That HDF5 files can be opened in parallel by + * the rank 0 process and that the superblock + * offset is correctly broadcast to the other + * parallel file readers. + * 2). That a parallel application can correctly + * handle reading multiple files by using + * subgroups of MPI_COMM_WORLD and that each + * subgroup operates as described in (1) to + * collectively read the data. + * + * The global MPI rank is used for reading and + * writing data for process specific data in the + * dataset. We do this rather simplisticly, i.e. + * rank 0: writes/reads 0-9999 + * rank 1: writes/reads 1000-1999 + * rank 2: writes/reads 2000-2999 + * ... + * + * Return: Success: 0 + * + * Failure: 1 + * + * Programmer: Richard Warren + * 10/1/17 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static int +test_parallel_read(MPI_Comm comm, int mpi_rank, int group_id) +{ + const char *failure_mssg; + const char *fcn_name = "test_parallel_read()"; + const char *group_filename = NULL; + char reloc_data_filename[FILENAME_BUF_SIZE]; + int local_failure = 0; + int global_failures = 0; + int group_size; + int group_rank; + hid_t fapl_id = -1; + hid_t file_id = -1; + hid_t dset_id = -1; + hid_t memspace = -1; + hid_t filespace = -1; + hsize_t i; + hsize_t offset; + hsize_t count = COUNT; + hsize_t dims[1] = {0}; + float nextValue; + float *data_slice = NULL; + + pass = TRUE; + + HDassert(comm != MPI_COMM_NULL); + + if ( (MPI_Comm_rank(comm, &group_rank)) != MPI_SUCCESS) { + pass = FALSE; + failure_mssg = "test_parallel_read: MPI_Comm_rank failed.\n"; + } + + if ( (MPI_Comm_size(comm, &group_size)) != MPI_SUCCESS) { + pass = FALSE; + failure_mssg = "test_parallel_read: MPI_Comm_size failed.\n"; + } + + if ( mpi_rank == 0 ) { + if ( comm == MPI_COMM_WORLD ) { + TESTING("parallel file open test 1"); + } + else { + TESTING("parallel file open test 2"); + } + } + + /* allocate space for the data_slice array */ + if ( pass ) { + if ( (data_slice = (float *)HDmalloc(COUNT * sizeof(float))) == NULL ) { + pass = FALSE; + failure_mssg = "malloc of data_slice failed.\n"; + } + } + + + /* Select the file file name to read + * Please see the comments in the 'generate_test_file' function + * for more details... + */ + if ( pass ) { + + if ( comm == MPI_COMM_WORLD ) /* test 1 */ + group_filename = FILENAMES[1]; + else if ( group_id == 0 ) /* test 2 group 0 */ + group_filename = FILENAMES[4]; + else /* test 2 group 1 */ + group_filename = FILENAMES[7]; + + HDassert(group_filename); + if ( h5_fixname(group_filename, H5P_DEFAULT, reloc_data_filename, + sizeof(reloc_data_filename)) == NULL ) { + + pass = FALSE; + failure_mssg = "h5_fixname(1) failed.\n"; + } + } + + /* setup FAPL */ + if ( pass ) { + if ( (fapl_id = H5Pcreate(H5P_FILE_ACCESS)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Pcreate(H5P_FILE_ACCESS) failed.\n"; + } + } + + if ( pass ) { + if ( (H5Pset_fapl_mpio(fapl_id, comm, MPI_INFO_NULL)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Pset_fapl_mpio() failed\n"; + } + } + + /* open the file -- should have user block, exercising the optimization */ + if ( pass ) { + if ( (file_id = H5Fopen(reloc_data_filename, + H5F_ACC_RDONLY, fapl_id)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Fopen() failed\n"; + } + } + + /* open the data set */ + if ( pass ) { + if ( (dset_id = H5Dopen2(file_id, "dataset0", H5P_DEFAULT)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Dopen2() failed\n"; + } + } + + /* setup memspace */ + if ( pass ) { + dims[0] = count; + if ( (memspace = H5Screate_simple(1, dims, NULL)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Screate_simple(1, dims, NULL) failed\n"; + } + } + + /* setup filespace */ + if ( pass ) { + if ( (filespace = H5Dget_space(dset_id)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Dget_space(dataset) failed\n"; + } + } + + if ( pass ) { + offset = (hsize_t)group_rank * count; + if ( (H5Sselect_hyperslab(filespace, H5S_SELECT_SET, + &offset, NULL, &count, NULL)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Sselect_hyperslab() failed\n"; + } + } + + /* read this processes section of the data */ + if ( pass ) { + if ( (H5Dread(dset_id, H5T_NATIVE_FLOAT, memspace, + filespace, H5P_DEFAULT, data_slice)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Dread() failed\n"; + } + } + + /* verify the data */ + if ( pass ) { + nextValue = (float)((hsize_t)mpi_rank * count); + i = 0; + while ( ( pass ) && ( i < count ) ) { + /* what we really want is data_slice[i] != nextValue -- + * the following is a circumlocution to shut up the + * the compiler. + */ + if ( ( data_slice[i] > nextValue ) || + ( data_slice[i] < nextValue ) ) { + pass = FALSE; + failure_mssg = "Unexpected dset contents.\n"; + } + nextValue += 1; + i++; + } + } + + /* close file, etc. */ + if ( pass || (dset_id != -1) ) { + if ( H5Dclose(dset_id) < 0 ) { + pass = false; + failure_mssg = "H5Dclose(dset_id) failed.\n"; + } + } + + if ( pass || (memspace != -1) ) { + if ( H5Sclose(memspace) < 0 ) { + pass = false; + failure_mssg = "H5Sclose(memspace) failed.\n"; + } + } + + if ( pass || (filespace != -1) ) { + if ( H5Sclose(filespace) < 0 ) { + pass = false; + failure_mssg = "H5Sclose(filespace) failed.\n"; + } + } + + if ( pass || (file_id != -1) ) { + if ( H5Fclose(file_id) < 0 ) { + pass = false; + failure_mssg = "H5Fclose(file_id) failed.\n"; + } + } + + if ( pass || (fapl_id != -1) ) { + if ( H5Pclose(fapl_id) < 0 ) { + pass = false; + failure_mssg = "H5Pclose(fapl_id) failed.\n"; + } + } + + /* collect results from other processes. + * Only overwrite the failure message if no preveious error + * has been detected + */ + local_failure = ( pass ? 0 : 1 ); + + if ( MPI_Allreduce( &local_failure, &global_failures, 1, + MPI_INT, MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS ) { + if ( pass ) { + pass = FALSE; + failure_mssg = "MPI_Allreduce() failed.\n"; + } + } else if ( ( pass ) && ( global_failures > 0 ) ) { + pass = FALSE; + failure_mssg = "One or more processes report failure.\n"; + } + + /* report results and finish cleanup */ + if ( group_rank == 0 ) { + if ( pass ) { + PASSED(); + } else { + H5_FAILED(); + HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", + fcn_name, failure_mssg); + } + + HDremove(reloc_data_filename); + } + + /* free data_slice if it has been allocated */ + if ( data_slice != NULL ) { + HDfree(data_slice); + data_slice = NULL; + } + + + return( ! pass ); + +} /* test_parallel_read() */ + + +/*------------------------------------------------------------------------- + * Function: main + * + * Purpose: To implement a parallel test which validates whether the + * new superblock lookup functionality is working correctly. + * + * The test consists of creating two seperate HDF datasets + * in which random text is inserted at the start of each + * file using the 'j5jam' application. This forces the + * HDF5 file superblock to a non-zero offset. + * Having created the two independant files, we create two + * non-overlapping MPI groups, each of which is then tasked + * with the opening and validation of the data contained + * therein. + * + * WARNING: This test uses fork() and execve(), and + * therefore will not run on Windows. + * + * Return: Success: 0 + * + * Failure: 1 + * + * Programmer: Richard Warren + * 10/1/17 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +int +main( int argc, char **argv) +{ + int nerrs = 0; + int which_group = 0; + int mpi_rank; + int mpi_size; + int split_size; + MPI_Comm group_comm = MPI_COMM_WORLD; + + if ( (MPI_Init(&argc, &argv)) != MPI_SUCCESS) { + HDfprintf(stderr, "FATAL: Unable to initialize MPI\n"); + HDexit(EXIT_FAILURE); + } + + if ( (MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank)) != MPI_SUCCESS) { + HDfprintf(stderr, "FATAL: MPI_Comm_rank returned an error\n"); + HDexit(EXIT_FAILURE); + } + + if ( (MPI_Comm_size(MPI_COMM_WORLD, &mpi_size)) != MPI_SUCCESS) { + HDfprintf(stderr, "FATAL: MPI_Comm_size returned an error\n"); + HDexit(EXIT_FAILURE); + } + + H5open(); + + if ( mpi_rank == 0 ) { + HDfprintf(stdout, "========================================\n"); + HDfprintf(stdout, "Collective file open optimization tests\n"); + HDfprintf(stdout, " mpi_size = %d\n", mpi_size); + HDfprintf(stdout, "========================================\n"); + } + + if ( mpi_size < 4 ) { + + if ( mpi_rank == 0 ) { + + HDprintf(" Need at least 4 processes. Exiting.\n"); + } + goto finish; + } + + /* ------ Create two (2) MPI groups ------ + * + * We split MPI_COMM_WORLD into 2 more or less equal sized + * groups. The resulting communicators will be used to generate + * two HDF files which in turn will be opened in parallel and the + * contents verified in the second read test below. + */ + split_size = mpi_size / 2; + which_group = (mpi_rank < split_size ? 0 : 1); + + if ( (MPI_Comm_split(MPI_COMM_WORLD, + which_group, + 0, + &group_comm)) != MPI_SUCCESS) { + + HDfprintf(stderr, "FATAL: MPI_Comm_split returned an error\n"); + HDexit(EXIT_FAILURE); + } + + /* ------ Generate all files ------ */ + + /* We generate the file used for test 1 */ + nerrs += generate_test_file( MPI_COMM_WORLD, mpi_rank, which_group ); + + if ( nerrs > 0 ) { + if ( mpi_rank == 0 ) { + HDprintf(" Test(1) file construction failed -- skipping tests.\n"); + } + goto finish; + } + + /* We generate the file used for test 2 */ + nerrs += generate_test_file( group_comm, mpi_rank, which_group ); + + if ( nerrs > 0 ) { + if ( mpi_rank == 0 ) { + HDprintf(" Test(2) file construction failed -- skipping tests.\n"); + } + goto finish; + } + + /* Now read the generated test file (stil using MPI_COMM_WORLD) */ + nerrs += test_parallel_read( MPI_COMM_WORLD, mpi_rank, which_group); + + if ( nerrs > 0 ) { + if ( mpi_rank == 0 ) { + HDprintf(" Parallel read test(1) failed -- skipping tests.\n"); + } + goto finish; + } + + /* Update the user on our progress so far. */ + if ( mpi_rank == 0 ) { + HDprintf(" Test 1 of 2 succeeded\n"); + HDprintf(" -- Starting multi-group parallel read test.\n"); + } + + /* run the 2nd set of tests */ + nerrs += test_parallel_read(group_comm, mpi_rank, which_group); + + if ( nerrs > 0 ) { + if ( mpi_rank == 0 ) { + HDprintf(" Multi-group read test(2) failed\n"); + } + goto finish; + } + + if ( mpi_rank == 0 ) { + HDprintf(" Test 2 of 2 succeeded\n"); + } + +finish: + + if ((group_comm != MPI_COMM_NULL) && + (MPI_Comm_free(&group_comm)) != MPI_SUCCESS) { + HDfprintf(stderr, "MPI_Comm_free failed!\n"); + } + + + /* make sure all processes are finished before final report, cleanup + * and exit. + */ + MPI_Barrier(MPI_COMM_WORLD); + + if ( mpi_rank == 0 ) { /* only process 0 reports */ + const char *header = "Collective file open optimization tests"; + + HDfprintf(stdout, "===================================\n"); + if ( nerrs > 0 ) { + + HDfprintf(stdout, "***%s detected %d failures***\n", header, nerrs); + } + else { + HDfprintf(stdout, "%s finished with no failures\n", header); + } + HDfprintf(stdout, "===================================\n"); + } + + /* close HDF5 library */ + if (H5close() != SUCCEED) { + HDfprintf(stdout, "H5close() failed. (Ignoring)\n"); + } + + /* MPI_Finalize must be called AFTER H5close which may use MPI calls */ + MPI_Finalize(); + + /* cannot just return (nerrs) because exit code is limited to 1byte */ + return((nerrs > 0) ? EXIT_FAILURE : EXIT_SUCCESS ); + +} /* main() */ |