From 81094ac3cfbf5785c0006516133d0dc34665b81c Mon Sep 17 00:00:00 2001
From: Richard Warren
Date: Thu, 28 Sep 2017 16:27:29 -0400
Subject: The initial coding for the superblock read optimization

---
 src/H5FDmpio.h         |  21 ++++++
 src/H5Fsuper.c         |  30 +++++++-
 testpar/CMakeLists.txt |   1 +
 testpar/Makefile.am    |   2 +-
 testpar/t_pread.c      | 196 +++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 247 insertions(+), 3 deletions(-)
 create mode 100644 testpar/t_pread.c

diff --git a/src/H5FDmpio.h b/src/H5FDmpio.h
index 6ee0a1a..9bcc182 100644
--- a/src/H5FDmpio.h
+++ b/src/H5FDmpio.h
@@ -29,6 +29,27 @@
 #endif /* H5_HAVE_PARALLEL */

 #ifdef H5_HAVE_PARALLEL
+#define H5FD_GET_MPI_RANK_AND_SIZE(rank,size, f) {      \
+    (rank) = 0; (size) = 1;                             \
+    if (H5F_HAS_FEATURE((f), H5FD_FEAT_HAS_MPI)) {      \
+        (rank) = H5F_mpi_get_rank((f));                 \
+        (size) = H5F_mpi_get_size((f));                 \
+    } else {                                            \
+        int mpi_initialized = 0, mpi_finalized = 0;     \
+        MPI_Initialized(&mpi_initialized);              \
+        MPI_Finalized(&mpi_finalized);                  \
+        if (mpi_initialized && !mpi_finalized) {        \
+            MPI_Comm_rank(MPI_COMM_WORLD, &(rank));     \
+            MPI_Comm_size(MPI_COMM_WORLD, &(size));     \
+        }                                               \
+    }}
+
+#define H5FD_GET_MPI_COMM(comm, f) {                    \
+    if (H5F_HAS_FEATURE((f), H5FD_FEAT_HAS_MPI))        \
+        (comm) = H5F_mpi_get_comm((f));                 \
+    else (comm) = MPI_COMM_WORLD;                       \
+    }
+
 /*Turn on H5FDmpio_debug if H5F_DEBUG is on */
 #ifdef H5F_DEBUG
 #ifndef H5FDmpio_DEBUG

diff --git a/src/H5Fsuper.c b/src/H5Fsuper.c
index 7c70a64..32051f3 100644
--- a/src/H5Fsuper.c
+++ b/src/H5Fsuper.c
@@ -333,6 +333,7 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
     unsigned rw_flags;                  /* Read/write permissions for file */
     hbool_t skip_eof_check = FALSE;     /* Whether to skip checking the EOF value */
     herr_t ret_value = SUCCEED;         /* Return value */
+    int mpi_rank = 0, mpi_size = 1;

     FUNC_ENTER_PACKAGE_TAG(meta_dxpl_id, H5AC__SUPERBLOCK_TAG, FAIL)

@@ -354,8 +355,33 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
         HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "can't get property list")

     /* Find the superblock */
-    if(H5FD_locate_signature(&fdio_info, &super_addr) < 0)
-        HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
+#ifdef H5_HAVE_PARALLEL
+    H5FD_GET_MPI_RANK_AND_SIZE(mpi_rank, mpi_size, f);
+    /* If we are an MPI application with at least two processes, the
+     * following superblock signature location optimization is applicable.
+     */
+    if ( mpi_size > 1 ) {
+        MPI_Comm this_comm = MPI_COMM_NULL;
+
+        if ( mpi_rank == 0 ) {
+            if(H5FD_locate_signature(&fdio_info, &super_addr) < 0)
+                HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
+        }
+        H5FD_GET_MPI_COMM(this_comm, f);
+        if (( this_comm == MPI_COMM_NULL ) ||
+            ( MPI_Bcast(&super_addr,sizeof(super_addr), MPI_BYTE, 0, this_comm) != MPI_SUCCESS))
+            HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
+    }
+    else {
+        /* Locate the signature as per the serial library */
+#endif /* H5_HAVE_PARALLEL */
+
+    if(H5FD_locate_signature(&fdio_info, &super_addr) < 0)
+        HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
+
+#ifdef H5_HAVE_PARALLEL
+    }
+#endif

     if(HADDR_UNDEF == super_addr)
         HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "file signature not found")

diff --git a/testpar/CMakeLists.txt b/testpar/CMakeLists.txt
index 39d23a9..0c9f70e 100644
--- a/testpar/CMakeLists.txt
+++ b/testpar/CMakeLists.txt
@@ -47,6 +47,7 @@ set (H5P_TESTS
     t_cache
     t_pflush1
     t_pflush2
+    t_pread
     t_pshutdown
     t_prestart
     t_init_term

diff --git a/testpar/Makefile.am b/testpar/Makefile.am
index b0fe0cd..1f15830 100644
--- a/testpar/Makefile.am
+++ b/testpar/Makefile.am
@@ -23,7 +23,7 @@ AM_CPPFLAGS+=-I$(top_srcdir)/src -I$(top_srcdir)/test
 # Test programs.  These are our main targets.
 #
-TEST_PROG_PARA=t_mpi t_bigio testphdf5 t_cache t_cache_image t_pflush1 t_pflush2 t_pshutdown t_prestart t_init_term t_shapesame t_filters_parallel
+TEST_PROG_PARA=t_mpi t_bigio testphdf5 t_cache t_cache_image t_pflush1 t_pflush2 t_pread t_pshutdown t_prestart t_init_term t_shapesame t_filters_parallel

 check_PROGRAMS = $(TEST_PROG_PARA)

diff --git a/testpar/t_pread.c b/testpar/t_pread.c
new file mode 100644
index 0000000..4512185
--- /dev/null
+++ b/testpar/t_pread.c
@@ -0,0 +1,196 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "mpi.h"
+#include "hdf5.h"
+
+static char *random_hdf5_text =
+ "Now is the time for all first-time-users of HDF5 to read their manual or go thru the tutorials!\n\
+While you\'re at it, now is also the time to read up on MPI-IO.";
+
+static char *datafile_relocated = "relocated_super.h5";
+hbool_t pass = true;
+
+
+static void
+generate_test_file( int mpi_rank, int mpi_size )
+{
+    FILE *header;
+    char *datafile_base = "mytestfile.h5";
+    char *prologue_file = "hdf5_readme.txt";
+    hid_t file_id, memspace, filespace, attr_id, fapl_id, dxpl_id, dset_id;
+    hsize_t i, offset, count = 1000;
+    hsize_t dims[1] = {0};
+    float nextValue, data_slice[count];
+
+    pass = true;
+
+    nextValue = (float)(mpi_rank * count);
+    for(i=0; i < count; i++) {
+        data_slice[i] = nextValue;
+        nextValue += 1;
+    }
+
+    if ( mpi_rank > 0 ) return;
+
+    /* ---- mpi_rank 0 ------*/
+    header = fopen( prologue_file, "w+");
+    if (header == NULL) {
+        pass = false;
+        HDfprintf(stderr, "FATAL: Unable to create a simple txt file\n");
+        return;
+    }
+    else {
+        size_t bytes_written, bytes_to_write = strlen(random_hdf5_text);
+        bytes_written = fwrite( random_hdf5_text, 1, bytes_to_write , header);
+        if (bytes_written == 0) {
+            pass = false;
+            HDfprintf(stderr, "FATAL: Unable to write a simple txt file\n");
+        }
+        fclose(header);
+    }
+
+    if ( pass ) {
+        char cmd[256];
+        sprintf(cmd, "../tools/src/h5jam/h5jam -i %s -u %s -o %s",
+                datafile_base, prologue_file, datafile_relocated);
+        system(cmd);
+        unlink(datafile_base);
+        unlink(prologue_file);
+    }
+}
+
+
+static void
+test_parallel_read( int mpi_rank, int mpi_size )
+{
+    int status, errors = 0;
+    hid_t access_plist = -1, dataset = -1;
+    hid_t file_id = -1, memspace = -1, dataspace = -1;
+    hsize_t i, offset, count =
1000; + hsize_t dims[1] = {0}; + float nextValue, data_slice[count]; + herr_t ret; + + access_plist = H5Pcreate(H5P_FILE_ACCESS); + if (access_plist >= 0) { + ret = H5Pset_fapl_mpio(access_plist, MPI_COMM_WORLD, MPI_INFO_NULL); + } else pass = false; + if (ret >= 0) { + file_id = H5Fopen(datafile_relocated,H5F_ACC_RDONLY,access_plist); + } else pass = false; + if (file_id >= 0) { + dataset = H5Dopen2(file_id, "dataset0", H5P_DEFAULT); + } else pass = false; + if (dataset >= 0) { + dims[0] = count; + memspace = H5Screate_simple(1, dims, NULL); + } else pass = false; + if ( memspace >= 0 ) { + dataspace = H5Dget_space(dataset); + } else pass = false; + if ( dataspace >= 0 ) { + offset = mpi_rank * count; + ret = H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, &offset, NULL, &count, NULL); + } else pass = false; + if ( ret >= 0 ) { + ret = H5Dread(dataset, H5T_NATIVE_FLOAT, memspace, dataspace, H5P_DEFAULT, data_slice); + } else pass = false; + if (ret >= 0) { + nextValue = (float)(mpi_rank * count); + for (i=0; i < count; i++) { + if (data_slice[i] != nextValue) pass = false; + nextValue += 1; + } + } else pass = false; + + status = ( pass ? 0 : -1 ); + MPI_Allreduce( &status, &errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); + + if ( mpi_rank == 0) + HDfprintf(stderr, "H5Fopen/H5Dread/data_validation %s\n", ((errors == 0) ? "succeeded" : "FAILED")); + + H5Pclose(access_plist); + H5Dclose(dataset); + H5Fclose(file_id); + + /* Cleanup */ + unlink(datafile_relocated); + + return; +} + + +int +main( int argc, char **argv) +{ + int status, errors, mpi_rank, mpi_size; + + if ((status = MPI_Init(&argc, &argv)) != MPI_SUCCESS) { + HDfprintf(stderr, "FATAL: Unable to initialize MPI\n"); + exit(1); + } + if ((status = MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank)) != MPI_SUCCESS) { + HDfprintf(stderr, "FATAL: MPI_Comm_rank returned an error\n"); + exit(2); + } + if ((status = MPI_Comm_size(MPI_COMM_WORLD, &mpi_size)) != MPI_SUCCESS) { + HDfprintf(stderr, "FATAL: MPI_Comm_size returned an error\n"); + exit(2); + } + + generate_test_file( mpi_rank, mpi_size ); + status = ( pass ? 0 : -1 ); + + /* Synch all ranks before attempting the parallel read */ + if ( MPI_Allreduce( &status, &errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ) != MPI_SUCCESS) { + pass = false; + if (mpi_rank == 0) HDfprintf(stderr, "FATAL: MPI_Allreduce returned an error\n"); + } + + if ( errors == 0 ) { + test_parallel_read( mpi_rank, mpi_size ); + } + + MPI_Finalize(); + return 0; +} -- cgit v0.12 From ceab5a51766f3e32d02bc65d59374c79910483bd Mon Sep 17 00:00:00 2001 From: mainzer Date: Thu, 5 Oct 2017 17:25:57 -0500 Subject: Edits to the file open optimization and associated test code to bring them closer to the HDF5 library's unwritten coding standards. Also bug fix to repair a hang in testphdf5. Tested parallel/debug on Charis and Jelly, parallel/production on Jelly. 
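For context, the optimized superblock search only engages when a file is
opened through the MPIO file driver. The following minimal sketch is
illustrative only and is not part of this patch (the file name
"example.h5" is a placeholder); it shows the kind of open that exercises
the optimization:

    #include <mpi.h>
    #include "hdf5.h"

    int main(int argc, char **argv)
    {
        hid_t fapl_id, file_id;

        MPI_Init(&argc, &argv);

        /* Opening through the MPIO driver lets rank 0 locate the
         * superblock signature and broadcast its address to the
         * remaining ranks, instead of every rank searching the file.
         */
        fapl_id = H5Pcreate(H5P_FILE_ACCESS);
        H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);

        file_id = H5Fopen("example.h5", H5F_ACC_RDONLY, fapl_id);

        H5Fclose(file_id);
        H5Pclose(fapl_id);
        MPI_Finalize();
        return 0;
    }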
--- src/H5FDmpio.h | 2 + src/H5Fsuper.c | 47 ++- testpar/t_pread.c | 835 +++++++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 704 insertions(+), 180 deletions(-) diff --git a/src/H5FDmpio.h b/src/H5FDmpio.h index 9bcc182..f02afe6 100644 --- a/src/H5FDmpio.h +++ b/src/H5FDmpio.h @@ -29,6 +29,7 @@ #endif /* H5_HAVE_PARALLEL */ #ifdef H5_HAVE_PARALLEL +#if 0 /* delete this eventually */ #define H5FD_GET_MPI_RANK_AND_SIZE(rank,size, f) { \ (rank) = 0; (size) = 1; \ if (H5F_HAS_FEATURE((f), H5FD_FEAT_HAS_MPI)) { \ @@ -49,6 +50,7 @@ (comm) = H5F_mpi_get_comm((f)); \ else (comm) = MPI_COMM_WORLD; \ } +#endif /* delete this eventually */ /*Turn on H5FDmpio_debug if H5F_DEBUG is on */ #ifdef H5F_DEBUG diff --git a/src/H5Fsuper.c b/src/H5Fsuper.c index 4750223..a34a7fd 100644 --- a/src/H5Fsuper.c +++ b/src/H5Fsuper.c @@ -333,7 +333,10 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial unsigned rw_flags; /* Read/write permissions for file */ hbool_t skip_eof_check = FALSE; /* Whether to skip checking the EOF value */ herr_t ret_value = SUCCEED; /* Return value */ +#ifdef H5_HAVE_PARALLEL int mpi_rank = 0, mpi_size = 1; + int mpi_result; +#endif /* H5_HAVE_PARALLEL */ FUNC_ENTER_PACKAGE_TAG(meta_dxpl_id, H5AC__SUPERBLOCK_TAG, FAIL) @@ -356,21 +359,43 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial /* Find the superblock */ #ifdef H5_HAVE_PARALLEL +#if 0 H5FD_GET_MPI_RANK_AND_SIZE(mpi_rank, mpi_size, f); +#else + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { + + if((mpi_rank = H5F_mpi_get_rank(f)) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "Can't get MPI rank") + + if((mpi_size = H5F_mpi_get_size(f)) < 0) + HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "can't retrieve MPI communicator size") + } +#endif /* If we are an MPI application with at least two processes, the * following superblock signature location optimization is applicable. 
 */
     if ( mpi_size > 1 ) {
-        MPI_Comm this_comm = MPI_COMM_NULL;
-
-        if ( mpi_rank == 0 ) {
-            if(H5FD_locate_signature(&fdio_info, &super_addr) < 0)
-                HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
-        }
-        H5FD_GET_MPI_COMM(this_comm, f);
-        if (( this_comm == MPI_COMM_NULL ) ||
-            ( MPI_Bcast(&super_addr,sizeof(super_addr), MPI_BYTE, 0, this_comm) != MPI_SUCCESS))
-            HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
+        MPI_Comm this_comm = MPI_COMM_NULL;
+
+        if ( mpi_rank == 0 ) {
+            if(H5FD_locate_signature(&fdio_info, &super_addr) < 0)
+                HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
+        }
+#if 0
+        H5FD_GET_MPI_COMM(this_comm, f);
+        if (( this_comm == MPI_COMM_NULL ) ||
+            ( MPI_Bcast(&super_addr,sizeof(super_addr), MPI_BYTE, 0, this_comm) != MPI_SUCCESS))
+            HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
+#else
+        HDassert(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI));
+
+        if ( MPI_COMM_NULL == (this_comm = H5F_mpi_get_comm(f)) )
+            HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI communicator")
+
+        if ( MPI_SUCCESS !=
+             (mpi_result = MPI_Bcast(&super_addr,sizeof(super_addr), MPI_BYTE, 0, this_comm)))
+            HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
+#endif
     }
     else {
         /* Locate the signature as per the serial library */
@@ -381,7 +406,7 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial

 #ifdef H5_HAVE_PARALLEL
     }
-#endif
+#endif /* H5_HAVE_PARALLEL */

     if(HADDR_UNDEF == super_addr)
         HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "file signature not found")

diff --git a/testpar/t_pread.c b/testpar/t_pread.c
index 4512185..ecc7360 100644
--- a/testpar/t_pread.c
+++ b/testpar/t_pread.c
@@ -1,196 +1,693 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group.                                               *
+ * Copyright by the Board of Trustees of the University of Illinois.         *
+ * All rights reserved.                                                      *
+ *                                                                           *
+ * This file is part of HDF5.  The full HDF5 copyright notice, including    *
+ * terms governing use, modification, and redistribution, is contained in   *
+ * the COPYING file, which can be found at the root of the source code      *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from    *
+ * help@hdfgroup.org.
 *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

-#include "mpi.h"
-#include "hdf5.h"
+/*
+ * Collective file open optimization tests
+ *
+ */

-static char *random_hdf5_text =
- "Now is the time for all first-time-users of HDF5 to read their manual or go thru the tutorials!\n\
-While you\'re at it, now is also the time to read up on MPI-IO.";

-static char *datafile_relocated = "relocated_super.h5";
+#include "h5test.h"
+#include "testpar.h"
+
+#define NFILENAME 3
+const char *FILENAMES[NFILENAME + 1]={"t_pread_data_file",
+                                      "reloc_t_pread_data_file",
+                                      "prefix_file",
+                                      NULL};
+#define FILENAME_BUF_SIZE 1024
+
+#define COUNT 1000
+
 hbool_t pass = true;
+static const char *random_hdf5_text =
+"Now is the time for all first-time-users of HDF5 to read their \
+manual or go thru the tutorials!\n\
+While you\'re at it, now is also the time to read up on MPI-IO.";

+static int generate_test_file(int mpi_rank, int mpi_size);
+static int test_parallel_read(int mpi_rank);

-static void
+/*-------------------------------------------------------------------------
+ * Function:    generate_test_file
+ *
+ * Purpose:     *** Richard -- please fill this in ***
+ *
+ *
+ * Return:      Success: 0
+ *
+ *              Failure: 1
+ *
+ * Programmer:  Richard Warren
+ *              10/1/17
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
 generate_test_file( int mpi_rank, int mpi_size )
 {
+    FILE *header;
+    const char *fcn_name = "generate_test_file()";
+    const char *failure_mssg = NULL;
+    char data_filename[FILENAME_BUF_SIZE];
+    char reloc_data_filename[FILENAME_BUF_SIZE];
+    char prolog_filename[FILENAME_BUF_SIZE];
+    int local_failure = 0;
+    int global_failures = 0;
+    hsize_t count = COUNT;
+    hsize_t i;
+    hsize_t offset;
+    hsize_t dims[1] = {0};
+    hid_t file_id = -1;
+    hid_t memspace = -1;
+    hid_t filespace = -1;
+    hid_t fapl_id = -1;
+    hid_t dset_id = -1;
+    float nextValue;
+    float *data_slice = NULL;
+
+    pass = true;
+
+    if ( mpi_rank == 0 ) {
+
+        HDfprintf(stdout, "Constructing test files...");
+    }
+
+    /* setup the file names */
+    if ( pass ) {
+        HDassert(FILENAMES[0]);
+
+        if ( h5_fixname(FILENAMES[0], H5P_DEFAULT, data_filename,
+                        sizeof(data_filename)) == NULL ) {
+            pass = FALSE;
+            failure_mssg = "h5_fixname(0) failed.\n";
+        }
+    }
+
+    if ( pass ) {
+        HDassert(FILENAMES[1]);
+
+        if ( h5_fixname(FILENAMES[1], H5P_DEFAULT, reloc_data_filename,
+                        sizeof(reloc_data_filename)) == NULL ) {
+
+            pass = FALSE;
+            failure_mssg = "h5_fixname(1) failed.\n";
+        }
+    }
+
+    if ( pass ) {
+        HDassert(FILENAMES[2]);
+
+        if ( h5_fixname(FILENAMES[2], H5P_DEFAULT, prolog_filename,
+                        sizeof(prolog_filename)) == NULL ) {
+            pass = FALSE;
+            failure_mssg = "h5_fixname(2) failed.\n";
+        }
+    }
+
+    /* setup data to write */
+    if ( pass ) {
+        if ( (data_slice = (float *)HDmalloc(COUNT * sizeof(float))) == NULL ) {
+            pass = FALSE;
+            failure_mssg = "malloc of data_slice failed.\n";
+        }
+    }
+
+    if ( pass ) {
+        nextValue = (float)(mpi_rank * COUNT);
+
+        for(i=0; i < COUNT; i++) {
+            data_slice[i] = nextValue;
+            nextValue += 1;
+        }
+    }
+
+    local_failure = ( pass ? 0 : 1 );
+
+    if ( MPI_Allreduce( &local_failure, &global_failures, 1,
+                        MPI_INT, MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS ) {
+        if ( pass ) {
+            pass = FALSE;
+            failure_mssg = "MPI_Allreduce() failed.\n";
+        }
+    } else if ( ( pass ) && ( global_failures > 0 ) ) {
+        pass = FALSE;
+        failure_mssg = "One or more processes report failure.\n";
+    }
+
/* report results */ + if ( mpi_rank == 0 ) { + if ( pass ) { + HDfprintf(stdout, "Done.\n"); + } else { + HDfprintf(stdout, "FAILED.\n"); + HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", + fcn_name, failure_mssg); + } + } + /* free data_slice if it has been allocated */ + if ( data_slice != NULL ) { + HDfree(data_slice); + data_slice = NULL; + } -static void -test_parallel_read( int mpi_rank, int mpi_size ) + return(! pass); + +} /* generate_test_file() */ + + +/*------------------------------------------------------------------------- + * Function: test_parallel_read + * + * Purpose: *** Richard -- please fill this in *** + * + * + * Return: Success: 0 + * + * Failure: 1 + * + * Programmer: Richard Warren + * 10/1/17 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static int +test_parallel_read(int mpi_rank) { - int status, errors = 0; - hid_t access_plist = -1, dataset = -1; - hid_t file_id = -1, memspace = -1, dataspace = -1; - hsize_t i, offset, count = 1000; - hsize_t dims[1] = {0}; - float nextValue, data_slice[count]; - herr_t ret; - - access_plist = H5Pcreate(H5P_FILE_ACCESS); - if (access_plist >= 0) { - ret = H5Pset_fapl_mpio(access_plist, MPI_COMM_WORLD, MPI_INFO_NULL); - } else pass = false; - if (ret >= 0) { - file_id = H5Fopen(datafile_relocated,H5F_ACC_RDONLY,access_plist); - } else pass = false; - if (file_id >= 0) { - dataset = H5Dopen2(file_id, "dataset0", H5P_DEFAULT); - } else pass = false; - if (dataset >= 0) { - dims[0] = count; - memspace = H5Screate_simple(1, dims, NULL); - } else pass = false; - if ( memspace >= 0 ) { - dataspace = H5Dget_space(dataset); - } else pass = false; - if ( dataspace >= 0 ) { - offset = mpi_rank * count; - ret = H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, &offset, NULL, &count, NULL); - } else pass = false; - if ( ret >= 0 ) { - ret = H5Dread(dataset, H5T_NATIVE_FLOAT, memspace, dataspace, H5P_DEFAULT, data_slice); - } else pass = false; - if (ret >= 0) { - nextValue = (float)(mpi_rank * count); - for (i=0; i < count; i++) { - if (data_slice[i] != nextValue) pass = false; - nextValue += 1; - } - } else pass = false; - - status = ( pass ? 0 : -1 ); - MPI_Allreduce( &status, &errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); - - if ( mpi_rank == 0) - HDfprintf(stderr, "H5Fopen/H5Dread/data_validation %s\n", ((errors == 0) ? 
"succeeded" : "FAILED")); - - H5Pclose(access_plist); - H5Dclose(dataset); - H5Fclose(file_id); - - /* Cleanup */ - unlink(datafile_relocated); - - return; -} + const char *failure_mssg; + const char *fcn_name = "test_parallel_read()"; + char reloc_data_filename[FILENAME_BUF_SIZE]; + int local_failure = 0; + int global_failures = 0; + hid_t fapl_id; + hid_t file_id; + hid_t dset_id; + hid_t memspace = -1; + hid_t filespace = -1; + hsize_t i; + hsize_t offset; + hsize_t count = COUNT; + hsize_t dims[1] = {0}; + float nextValue; + float *data_slice = NULL; + + pass = TRUE; + + if ( mpi_rank == 0 ) { + + TESTING("parallel file open test 1"); + } + + /* allocate space for the data_slice array */ + if ( pass ) { + if ( (data_slice = (float *)HDmalloc(COUNT * sizeof(float))) == NULL ) { + pass = FALSE; + failure_mssg = "malloc of data_slice failed.\n"; + } + } + + + /* construct file file name */ + if ( pass ) { + HDassert(FILENAMES[1]); + + if ( h5_fixname(FILENAMES[1], H5P_DEFAULT, reloc_data_filename, + sizeof(reloc_data_filename)) == NULL ) { + + pass = FALSE; + failure_mssg = "h5_fixname(1) failed.\n"; + } + } + + /* setup FAPL */ + if ( pass ) { + if ( (fapl_id = H5Pcreate(H5P_FILE_ACCESS)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Pcreate(H5P_FILE_ACCESS) failed.\n"; + } + } + + if ( pass ) { + if ( (H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Pset_fapl_mpio() failed\n"; + } + } + + /* open the file -- should have user block, exercising the optimization */ + if ( pass ) { + if ( (file_id = H5Fopen(reloc_data_filename, + H5F_ACC_RDONLY, fapl_id)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Fopen() failed\n"; + } + } + + /* open the data set */ + if ( pass ) { + if ( (dset_id = H5Dopen2(file_id, "dataset0", H5P_DEFAULT)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Dopen2() failed\n"; + } + } + + /* setup memspace */ + if ( pass ) { + dims[0] = count; + if ( (memspace = H5Screate_simple(1, dims, NULL)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Screate_simple(1, dims, NULL) failed\n"; + } + } + + /* setup filespace */ + if ( pass ) { + if ( (filespace = H5Dget_space(dset_id)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Dget_space(dataset) failed\n"; + } + } + + if ( pass ) { + offset = (hsize_t)mpi_rank * count; + if ( (H5Sselect_hyperslab(filespace, H5S_SELECT_SET, + &offset, NULL, &count, NULL)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Sselect_hyperslab() failed\n"; + } + } + + /* read this processes section of the data */ + if ( pass ) { + if ( (H5Dread(dset_id, H5T_NATIVE_FLOAT, memspace, + filespace, H5P_DEFAULT, data_slice)) < 0 ) { + pass = FALSE; + failure_mssg = "H5Dread() failed\n"; + } + } + + /* verify the data */ + if ( pass ) { + nextValue = (float)((hsize_t)mpi_rank * count); + i = 0; + while ( ( pass ) && ( i < count ) ) { + /* what we really want is data_slice[i] != nextValue -- + * the following is a circumlocution to shut up the + * the compiler. + */ + if ( ( data_slice[i] > nextValue ) || + ( data_slice[i] < nextValue ) ) { + pass = FALSE; + failure_mssg = "Unexpected dset contents.\n"; + } + nextValue += 1; + i++; + } + } + + /* close file, etc. 
+     */
+    if ( pass ) {
+        if ( H5Dclose(dset_id) < 0 ) {
+            pass = false;
+            failure_mssg = "H5Dclose(dset_id) failed.\n";
+        }
+    }
+
+    if ( pass ) {
+        if ( H5Sclose(memspace) < 0 ) {
+            pass = false;
+            failure_mssg = "H5Sclose(memspace) failed.\n";
+        }
+    }
+
+    if ( pass ) {
+        if ( H5Sclose(filespace) < 0 ) {
+            pass = false;
+            failure_mssg = "H5Sclose(filespace) failed.\n";
+        }
+    }
+
+    if ( pass ) {
+        if ( H5Fclose(file_id) < 0 ) {
+            pass = false;
+            failure_mssg = "H5Fclose(file_id) failed.\n";
+        }
+    }
+
+    if ( pass ) {
+        if ( H5Pclose(fapl_id) < 0 ) {
+            pass = false;
+            failure_mssg = "H5Pclose(fapl_id) failed.\n";
+        }
+    }
+
+    /* collect results from other processes.
+     * Only overwrite the failure message if no previous error
+     * has been detected
+     */
+    local_failure = ( pass ? 0 : 1 );
+
+    if ( MPI_Allreduce( &local_failure, &global_failures, 1,
+                        MPI_INT, MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS ) {
+        if ( pass ) {
+            pass = FALSE;
+            failure_mssg = "MPI_Allreduce() failed.\n";
+        }
+    } else if ( ( pass ) && ( global_failures > 0 ) ) {
+        pass = FALSE;
+        failure_mssg = "One or more processes report failure.\n";
+    }
+
+    /* report results and finish cleanup */
+    if ( mpi_rank == 0 ) {
+        if ( pass ) {
+            PASSED();
+        } else {
+            H5_FAILED();
+            HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n",
+                      fcn_name, failure_mssg);
+        }
+
+        HDremove(reloc_data_filename);
+    }
+
+    /* free data_slice if it has been allocated */
+    if ( data_slice != NULL ) {
+        HDfree(data_slice);
+        data_slice = NULL;
+    }
+
+
+    return( ! pass );
+
+} /* test_parallel_read() */
+
+
+/*-------------------------------------------------------------------------
+ * Function:    main
+ *
+ * Purpose:     *** Richard -- please fill this in ***
+ *
+ *
+ *              WARNING: This test uses fork() and execve(), and
+ *                       therefore will not run on Windows.
+ *
+ * Return:      Success: 0
+ *
+ *              Failure: 1
+ *
+ * Programmer:  Richard Warren
+ *              10/1/17
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */

 int
 main( int argc, char **argv)
 {
-    int status, errors, mpi_rank, mpi_size;
+    int nerrs = 0;
+    int mpi_rank;
+    int mpi_size;

-    if ((status = MPI_Init(&argc, &argv)) != MPI_SUCCESS) {
-        HDfprintf(stderr, "FATAL: Unable to initialize MPI\n");
-        exit(1);
-    }
-    if ((status = MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank)) != MPI_SUCCESS) {
-        HDfprintf(stderr, "FATAL: MPI_Comm_rank returned an error\n");
-        exit(2);
-    }
-    if ((status = MPI_Comm_size(MPI_COMM_WORLD, &mpi_size)) != MPI_SUCCESS) {
-        HDfprintf(stderr, "FATAL: MPI_Comm_size returned an error\n");
-        exit(2);
-    }
+    if ( (MPI_Init(&argc, &argv)) != MPI_SUCCESS) {
+        HDfprintf(stderr, "FATAL: Unable to initialize MPI\n");
+        exit(1);
+    }

-    generate_test_file( mpi_rank, mpi_size );
-    status = ( pass ?
0 : -1 ); + if ( (MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank)) != MPI_SUCCESS) { + HDfprintf(stderr, "FATAL: MPI_Comm_rank returned an error\n"); + exit(2); + } - /* Synch all ranks before attempting the parallel read */ - if ( MPI_Allreduce( &status, &errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ) != MPI_SUCCESS) { - pass = false; - if (mpi_rank == 0) HDfprintf(stderr, "FATAL: MPI_Allreduce returned an error\n"); - } + if ( (MPI_Comm_size(MPI_COMM_WORLD, &mpi_size)) != MPI_SUCCESS) { + HDfprintf(stderr, "FATAL: MPI_Comm_size returned an error\n"); + exit(2); + } + + H5open(); + + if ( mpi_rank == 0 ) { + HDfprintf(stdout, "========================================\n"); + HDfprintf(stdout, "Collective file open optimization tests\n"); + HDfprintf(stdout, " mpi_size = %d\n", mpi_size); + HDfprintf(stdout, "========================================\n"); + } + + if ( mpi_size < 2 ) { - if ( errors == 0 ) { - test_parallel_read( mpi_rank, mpi_size ); + if ( mpi_rank == 0 ) { + + HDprintf(" Need at least 2 processes. Exiting.\n"); + } + goto finish; + } + + /* create the test files & verify that the process + * succeeded. If not, abort the remaining tests as + * they depend on the test files. + */ + + nerrs += generate_test_file( mpi_rank, mpi_size ); + + /* abort tests if there were any errors in test file construction */ + if ( nerrs > 0 ) { + if ( mpi_rank == 0 ) { + HDprintf(" Test file construction failed -- skipping tests.\n"); + } + goto finish; } - MPI_Finalize(); - return 0; -} + /* run the tests */ + nerrs += test_parallel_read(mpi_rank); + +finish: + + /* make sure all processes are finished before final report, cleanup + * and exit. + */ + MPI_Barrier(MPI_COMM_WORLD); + + if ( mpi_rank == 0 ) { /* only process 0 reports */ + const char *header = "Collective file open optimization tests"; + + HDfprintf(stdout, "===================================\n"); + if ( nerrs > 0 ) { + + HDfprintf(stdout, "***%s detected %d failures***\n", header, nerrs); + } + else { + HDfprintf(stdout, "%s finished with no failures\n", header); + } + HDfprintf(stdout, "===================================\n"); + } + + /* close HDF5 library */ + H5close(); + + /* MPI_Finalize must be called AFTER H5close which may use MPI calls */ + MPI_Finalize(); + + /* cannot just return (nerrs) because exit code is limited to 1byte */ + return(nerrs > 0); + +} /* main() */ -- cgit v0.12 From 837624b9cd33e95560377659aede1d065a858726 Mon Sep 17 00:00:00 2001 From: Richard Warren Date: Mon, 9 Oct 2017 16:47:21 -0400 Subject: Add a test for parallel reads of independent files using MPI subgroups --- src/H5FDmpio.h | 23 ------- src/H5Fsuper.c | 18 +++--- testpar/t_pread.c | 183 +++++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 159 insertions(+), 65 deletions(-) diff --git a/src/H5FDmpio.h b/src/H5FDmpio.h index f02afe6..6ee0a1a 100644 --- a/src/H5FDmpio.h +++ b/src/H5FDmpio.h @@ -29,29 +29,6 @@ #endif /* H5_HAVE_PARALLEL */ #ifdef H5_HAVE_PARALLEL -#if 0 /* delete this eventually */ -#define H5FD_GET_MPI_RANK_AND_SIZE(rank,size, f) { \ - (rank) = 0; (size) = 1; \ - if (H5F_HAS_FEATURE((f), H5FD_FEAT_HAS_MPI)) { \ - (rank) = H5F_mpi_get_rank((f)); \ - (size) = H5F_mpi_get_size((f)); \ - } else { \ - int mpi_initialized = 0, mpi_finalized = 0; \ - MPI_Initialized(&mpi_initialized); \ - MPI_Finalized(&mpi_finalized); \ - if (mpi_initialized && !mpi_finalized) { \ - MPI_Comm_rank(MPI_COMM_WORLD, &(rank)); \ - MPI_Comm_size(MPI_COMM_WORLD, &(size)); \ - } \ - }} - -#define H5FD_GET_MPI_COMM(comm, f) { \ - if 
(H5F_HAS_FEATURE((f), H5FD_FEAT_HAS_MPI))        \
-        (comm) = H5F_mpi_get_comm((f));             \
-    else (comm) = MPI_COMM_WORLD;                   \
-    }
-#endif /* delete this eventually */
-
 /*Turn on H5FDmpio_debug if H5F_DEBUG is on */
 #ifdef H5F_DEBUG
 #ifndef H5FDmpio_DEBUG

diff --git a/src/H5Fsuper.c b/src/H5Fsuper.c
index a34a7fd..a3b1fed 100644
--- a/src/H5Fsuper.c
+++ b/src/H5Fsuper.c
@@ -359,9 +359,6 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial

     /* Find the superblock */
 #ifdef H5_HAVE_PARALLEL
-#if 0
-    H5FD_GET_MPI_RANK_AND_SIZE(mpi_rank, mpi_size, f);
-#else
     if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {

         if((mpi_rank = H5F_mpi_get_rank(f)) < 0)
@@ -370,9 +367,15 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
         if((mpi_size = H5F_mpi_get_size(f)) < 0)
             HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "can't retrieve MPI communicator size")
     }
-#endif
+
     /* If we are an MPI application with at least two processes, the
      * following superblock signature location optimization is applicable.
+     *
+     * Note: For parallel applications which don't set up to use the
+     * HDF5 MPIO driver, we will arrive here with mpi_size == 1.
+     * This occurs because of the variable initialization (above) and the
+     * fact that we have skipped actually calling MPI functions to determine
+     * our MPI rank and size.
      */
     if ( mpi_size > 1 ) {
         MPI_Comm this_comm = MPI_COMM_NULL;
@@ -381,12 +384,6 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
             if(H5FD_locate_signature(&fdio_info, &super_addr) < 0)
                 HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
         }
-#if 0
-        H5FD_GET_MPI_COMM(this_comm, f);
-        if (( this_comm == MPI_COMM_NULL ) ||
-            ( MPI_Bcast(&super_addr,sizeof(super_addr), MPI_BYTE, 0, this_comm) != MPI_SUCCESS))
-            HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
-#else
         HDassert(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI));

         if ( MPI_COMM_NULL == (this_comm = H5F_mpi_get_comm(f)) )
@@ -395,7 +392,6 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
         if ( MPI_SUCCESS !=
              (mpi_result = MPI_Bcast(&super_addr,sizeof(super_addr), MPI_BYTE, 0, this_comm)))
             HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
-#endif
     }
     else {
         /* Locate the signature as per the serial library */

diff --git a/testpar/t_pread.c b/testpar/t_pread.c
index ecc7360..48c14a9 100644
--- a/testpar/t_pread.c
+++ b/testpar/t_pread.c
@@ -35,15 +35,36 @@ static const char *random_hdf5_text =
 manual or go thru the tutorials!\n\
 While you\'re at it, now is also the time to read up on MPI-IO.";

-static int generate_test_file(int mpi_rank, int mpi_size);
-static int test_parallel_read(int mpi_rank);
+static const char *hitchhiker_quote =
+"A common mistake that people make when trying to design something\n\
+completely foolproof is to underestimate the ingenuity of complete\n\
+fools.\n";
+
+static int generate_test_file(MPI_Comm comm, int mpi_rank, int group);
+static int test_parallel_read(MPI_Comm comm, int mpi_rank, int group);


 /*-------------------------------------------------------------------------
  * Function:    generate_test_file
  *
- * Purpose:     *** Richard -- please fill this in ***
+ * Purpose:     This function is called to produce an HDF5 data file
+ *              whose superblock is relocated to a non-zero offset by
+ *              utilizing the 'h5jam' utility to write random text
+ *              at the start of the file.  Unlike simple concatenation
+ *              of files, h5jam is used to place the superblock on a
+ *              power-of-2 boundary.
+ *
+ *              Since data will be read back and validated, we generate
+ *              data in a predictable manner rather than randomly.
+ *              For now, we simply use the mpi_rank of the writing
+ *              process as a starting component of the data generation.
+ *              Subsequent writes are increments from the initial start
+ *              value.
+ *
+ *              In the overall scheme of running the test, we'll call
+ *              this function twice so as to create two separate files.
+ *              Each file will serve as the input data for two
+ *              independent parallel reads.
  *
  * Return:      Success: 0
  *
@@ -57,14 +78,17 @@ static int test_parallel_read(int mpi_rank);
  *-------------------------------------------------------------------------
  */
 static int
-generate_test_file( int mpi_rank, int mpi_size )
+generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
 {
     FILE *header;
     const char *fcn_name = "generate_test_file()";
     const char *failure_mssg = NULL;
+    char group_file[FILENAME_BUF_SIZE];
     char data_filename[FILENAME_BUF_SIZE];
     char reloc_data_filename[FILENAME_BUF_SIZE];
     char prolog_filename[FILENAME_BUF_SIZE];
+    int group_size;
+    int group_rank;
     int local_failure = 0;
     int global_failures = 0;
     hsize_t count = COUNT;
@@ -82,6 +106,18 @@ generate_test_file( int mpi_rank, int mpi_size )

     pass = true;

+    HDassert(comm != MPI_COMM_NULL);
+
+    if ( (MPI_Comm_rank(comm, &group_rank)) != MPI_SUCCESS) {
+        pass = FALSE;
+        failure_mssg = "generate_test_file: MPI_Comm_rank failed.\n";
+    }
+
+    if ( (MPI_Comm_size(comm, &group_size)) != MPI_SUCCESS) {
+        pass = FALSE;
+        failure_mssg = "generate_test_file: MPI_Comm_size failed.\n";
+    }
+
     if ( mpi_rank == 0 ) {

         HDfprintf(stdout, "Constructing test files...");
@@ -90,8 +126,11 @@ generate_test_file( int mpi_rank, int mpi_size )
     /* setup the file names */
     if ( pass ) {
         HDassert(FILENAMES[0]);
-
-        if ( h5_fixname(FILENAMES[0], H5P_DEFAULT, data_filename,
+        if ( HDsprintf(group_file, "%s_%d", FILENAMES[0], group_id) < 0) {
+            pass = FALSE;
+            failure_mssg = "HDsprintf(0) failed.\n";
+        }
+        else if ( h5_fixname(group_file, H5P_DEFAULT, data_filename,
                         sizeof(data_filename)) == NULL ) {
             pass = FALSE;
             failure_mssg = "h5_fixname(0) failed.\n";
@@ -100,8 +139,11 @@ generate_test_file( int mpi_rank, int mpi_size )

     if ( pass ) {
         HDassert(FILENAMES[1]);
-
-        if ( h5_fixname(FILENAMES[1], H5P_DEFAULT, reloc_data_filename,
+        if ( HDsprintf(group_file, "%s_%d", FILENAMES[1], group_id) < 0) {
+            pass = FALSE;
+            failure_mssg = "HDsprintf(1) failed.\n";
+        }
+        else if ( h5_fixname(group_file, H5P_DEFAULT, reloc_data_filename,
                         sizeof(reloc_data_filename)) == NULL ) {

             pass = FALSE;
@@ -111,8 +153,11 @@ generate_test_file( int mpi_rank, int mpi_size )

     if ( pass ) {
         HDassert(FILENAMES[2]);
-
-        if ( h5_fixname(FILENAMES[2], H5P_DEFAULT, prolog_filename,
+        if ( HDsprintf(group_file, "%s_%d", FILENAMES[2], group_id) < 0) {
+            pass = FALSE;
+            failure_mssg = "HDsprintf(2) failed.\n";
+        }
+        else if ( h5_fixname(group_file, H5P_DEFAULT, prolog_filename,
                         sizeof(prolog_filename)) == NULL ) {
             pass = FALSE;
             failure_mssg = "h5_fixname(2) failed.\n";
@@ -145,7 +190,7 @@ generate_test_file( int mpi_rank, int mpi_size )
     }

     if ( pass ) {
-        if ( (H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL)) < 0 ) {
+        if ( (H5Pset_fapl_mpio(fapl_id, comm, MPI_INFO_NULL)) < 0 ) {
             pass = FALSE;
             failure_mssg = "H5Pset_fapl_mpio() failed\n";
         }
@@ -184,7 +229,7 @@ generate_test_file( int mpi_rank, int mpi_size )
     }

     if ( pass ) {
-        dims[0] *= (hsize_t)mpi_size;
+        dims[0] *= (hsize_t)group_size;
         if ( (filespace = H5Screate_simple(1, dims, NULL)) < 0 ) {
             pass = FALSE;
             failure_mssg =
"H5Screate_simple(1, dims, NULL) failed (2).\n"; @@ -192,7 +237,7 @@ generate_test_file( int mpi_rank, int mpi_size ) } if ( pass ) { - offset = (hsize_t)mpi_rank * (hsize_t)COUNT; + offset = (hsize_t)group_rank * (hsize_t)COUNT; if ( (H5Sselect_hyperslab(filespace, H5S_SELECT_SET, &offset, NULL, &count, NULL)) < 0 ) { pass = FALSE; @@ -266,11 +311,17 @@ generate_test_file( int mpi_rank, int mpi_size ) * * Also delete files that are no longer needed. */ - if ( mpi_rank == 0 ) { + if ( group_rank == 0 ) { + const char *text_to_write; size_t bytes_to_write; - bytes_to_write = strlen(random_hdf5_text); + if (group_id == 0) + text_to_write = random_hdf5_text; + else + text_to_write = hitchhiker_quote; + + bytes_to_write = strlen(text_to_write); if ( pass ) { if ( (header = HDfopen(prolog_filename, "w+")) == NULL ) { @@ -280,8 +331,8 @@ generate_test_file( int mpi_rank, int mpi_size ) } if ( pass ) { - bytes_to_write = strlen(random_hdf5_text); - if ( HDfwrite(random_hdf5_text, 1, bytes_to_write, header) != + + if ( HDfwrite(text_to_write, 1, bytes_to_write, header) != bytes_to_write ) { pass = FALSE; failure_mssg = "Unable to write header file.\n"; @@ -319,6 +370,7 @@ generate_test_file( int mpi_rank, int mpi_size ) */ local_failure = ( pass ? 0 : 1 ); + /* This is a global all reduce (NOT group specific) */ if ( MPI_Allreduce(&local_failure, &global_failures, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS ) { if ( pass ) { @@ -355,8 +407,25 @@ generate_test_file( int mpi_rank, int mpi_size ) /*------------------------------------------------------------------------- * Function: test_parallel_read * - * Purpose: *** Richard -- please fill this in *** + * Purpose: This actually tests the superblock optimization + * and covers the two primary cases we're interested in. + * 1). That HDF5 files can be opened in parallel by + * the rank 0 process and that the superblock + * offset is correctly broadcast to the other + * parallel file readers. + * 2). That a parallel application can correctly + * handle reading multiple files by using + * subgroups of MPI_COMM_WORLD and that each + * subgroup operates as described in (1) to + * collectively read the data. * + * The global MPI rank is used for reading and + * writing data for process specific data in the + * dataset. We do this rather simplisticly, i.e. + * rank 0: writes/reads 0-9999 + * rank 1: writes/reads 1000-1999 + * rank 2: writes/reads 2000-2999 + * ... 
 *
 * Return:      Success: 0
 *
 *              Failure: 1
 *
 * Programmer:  Richard Warren
 *              10/1/17
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
 static int
-test_parallel_read(int mpi_rank)
+test_parallel_read(MPI_Comm comm, int mpi_rank, int group_id)
 {
     const char *failure_mssg;
     const char *fcn_name = "test_parallel_read()";
+    char group_file[FILENAME_BUF_SIZE];
     char reloc_data_filename[FILENAME_BUF_SIZE];
     int local_failure = 0;
     int global_failures = 0;
+    int group_size;
+    int group_rank;
     hid_t fapl_id;
     hid_t file_id;
     hid_t dset_id;
@@ -391,6 +463,18 @@ test_parallel_read(int mpi_rank)

     pass = TRUE;

+    HDassert(comm != MPI_COMM_NULL);
+
+    if ( (MPI_Comm_rank(comm, &group_rank)) != MPI_SUCCESS) {
+        pass = FALSE;
+        failure_mssg = "test_parallel_read: MPI_Comm_rank failed.\n";
+    }
+
+    if ( (MPI_Comm_size(comm, &group_size)) != MPI_SUCCESS) {
+        pass = FALSE;
+        failure_mssg = "test_parallel_read: MPI_Comm_size failed.\n";
+    }
+
     if ( mpi_rank == 0 ) {

         TESTING("parallel file open test 1");
@@ -408,8 +492,11 @@ test_parallel_read(int mpi_rank)
     /* construct the file name */
     if ( pass ) {
         HDassert(FILENAMES[1]);
-
-        if ( h5_fixname(FILENAMES[1], H5P_DEFAULT, reloc_data_filename,
+        if ( HDsprintf(group_file, "%s_%d", FILENAMES[1], group_id) < 0) {
+            pass = FALSE;
+            failure_mssg = "HDsprintf(0) failed.\n";
+        }
+        else if ( h5_fixname(group_file, H5P_DEFAULT, reloc_data_filename,
                         sizeof(reloc_data_filename)) == NULL ) {

             pass = FALSE;
@@ -426,7 +513,7 @@ test_parallel_read(int mpi_rank)

     if ( pass ) {
-        if ( (H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL)) < 0 ) {
+        if ( (H5Pset_fapl_mpio(fapl_id, comm, MPI_INFO_NULL)) < 0 ) {
             pass = FALSE;
             failure_mssg = "H5Pset_fapl_mpio() failed\n";
         }
@@ -467,7 +554,7 @@ test_parallel_read(int mpi_rank)

     if ( pass ) {
-        offset = (hsize_t)mpi_rank * count;
+        offset = (hsize_t)group_rank * count;
         if ( (H5Sselect_hyperslab(filespace, H5S_SELECT_SET,
                                   &offset, NULL, &count, NULL)) < 0 ) {
             pass = FALSE;
@@ -557,7 +644,7 @@ test_parallel_read(int mpi_rank)
     }

     /* report results and finish cleanup */
-    if ( mpi_rank == 0 ) {
+    if ( group_rank == 0 ) {
         if ( pass ) {
             PASSED();
         } else {
@@ -584,8 +671,17 @@ test_parallel_read(int mpi_rank)
 /*-------------------------------------------------------------------------
  * Function:    main
  *
- * Purpose:     *** Richard -- please fill this in ***
+ * Purpose:     To implement a parallel test which validates whether the
+ *              new superblock lookup functionality is working correctly.
  *
+ *              The test consists of creating two separate HDF5 files
+ *              in which random text is inserted at the start of each
+ *              file using the 'h5jam' application.  This forces the
+ *              HDF5 file superblock to a non-zero offset.
+ *              Having created the two independent files, we create two
+ *              non-overlapping MPI groups, each of which is then tasked
+ *              with the opening and validation of the data contained
+ *              therein.
  *
  *              WARNING: This test uses fork() and execve(), and
  *                       therefore will not run on Windows.
@@ -606,8 +702,11 @@ int
 main( int argc, char **argv)
 {
     int nerrs = 0;
+    int which_group;
     int mpi_rank;
     int mpi_size;
+    int split_size;
+    MPI_Comm group_comm = MPI_COMM_NULL;

     if ( (MPI_Init(&argc, &argv)) != MPI_SUCCESS) {
         HDfprintf(stderr, "FATAL: Unable to initialize MPI\n");
         exit(1);
@@ -633,21 +732,37 @@ main( int argc, char **argv)
         HDfprintf(stdout, "========================================\n");
     }

-    if ( mpi_size < 2 ) {
+    if ( mpi_size < 4 ) {

         if ( mpi_rank == 0 ) {

-            HDprintf("    Need at least 2 processes.
Exiting.\n"); + HDprintf(" Need at least 4 processes. Exiting.\n"); } goto finish; } + /* Divide the available processes into two groups + * that are the same size (plus or minus). + */ + split_size = mpi_size / 2; + which_group = (mpi_rank < split_size ? 0 : 1); + + if ( (MPI_Comm_split(MPI_COMM_WORLD, + which_group, + 0, + &group_comm)) != MPI_SUCCESS) { + + HDfprintf(stderr, "FATAL: MPI_Comm_split returned an error\n"); + exit(2); + } + + /* create the test files & verify that the process * succeeded. If not, abort the remaining tests as * they depend on the test files. */ - nerrs += generate_test_file( mpi_rank, mpi_size ); + nerrs += generate_test_file( group_comm, mpi_rank, which_group ); /* abort tests if there were any errors in test file construction */ if ( nerrs > 0 ) { @@ -655,13 +770,19 @@ main( int argc, char **argv) HDprintf(" Test file construction failed -- skipping tests.\n"); } goto finish; - } + } - /* run the tests */ - nerrs += test_parallel_read(mpi_rank); + /* run the tests */ + nerrs += test_parallel_read(group_comm, mpi_rank, which_group); finish: + if ((group_comm != MPI_COMM_NULL) && + (MPI_Comm_free(&group_comm)) != MPI_SUCCESS) { + HDfprintf(stderr, "MPI_Comm_free failed!\n"); + } + + /* make sure all processes are finished before final report, cleanup * and exit. */ -- cgit v0.12 From 1f0194fb641ff348a6415f56596f17dcb7e223b5 Mon Sep 17 00:00:00 2001 From: Richard Warren Date: Tue, 10 Oct 2017 09:47:22 -0400 Subject: Update the MANIFEST and release_docs/RELEASE files --- MANIFEST | 1 + release_docs/RELEASE.txt | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/MANIFEST b/MANIFEST index fa3de21..9dca971 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1244,6 +1244,7 @@ ./testpar/t_ph5basic.c ./testpar/t_pflush1.c ./testpar/t_pflush2.c +./testpar/t_pread.c ./testpar/t_prop.c ./testpar/t_shapesame.c ./testpar/t_pshutdown.c diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index d0de3ee..ed1b6cc 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -62,6 +62,31 @@ New Features Parallel Library: ----------------- + - Optimize parallel open/location of the HDF5 super-block + + Previous releases of PHDF5 allow all parallel ranks to + read the starting elements in a file to validate and process + the HDF5 super-block. As this is accomplished more or less as + a synchronous operation, a large number of processes will + likely experience a slowdown due to filesystem contention. + + As a first step in improving the startup/file-open performance, + we allow MPI rank 0 of the associated MPI communicator to locate + the base offset of the super-block and then broadcast that result + to the remaining ranks in the parallel group. Note that this + approach is utilized ONLY during file opens which employ the MPIO + file driver in HDF5 by previously having called H5Pset_fapl_mpio(). + + HDF5 parallel file operations which do not employ multiple ranks + e.g. specifiying MPI_COMM_SELF (whose MPI_Comm_size == 1) + as opposed to MPI_COMM_WORLD, will not be affected by this + optimization. Conversely, parallel file operations on subgroups + of MPI_COMM_WORLD are allowed to be run in parallel with each + subgroup operating as an independant collection of processes. 
+
+      (RAW – 2017/10/10, HDFFV-10294)
+

 - Large MPI-IO transfers

   Previous releases of PHDF5 would fail when attempting to
-- cgit v0.12

From 9849d61344faf01f366fff2e67026e468e184f06 Mon Sep 17 00:00:00 2001
From: Richard Warren
Date: Wed, 11 Oct 2017 10:15:33 -0400
Subject: Made edits suggested by John as part of the code review

---
 testpar/t_pread.c | 200 +++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 146 insertions(+), 54 deletions(-)

diff --git a/testpar/t_pread.c b/testpar/t_pread.c
index 48c14a9..b40fc09 100644
--- a/testpar/t_pread.c
+++ b/testpar/t_pread.c
@@ -20,11 +20,24 @@
 #include "h5test.h"
 #include "testpar.h"

-#define NFILENAME 3
-const char *FILENAMES[NFILENAME + 1]={"t_pread_data_file",
-                                      "reloc_t_pread_data_file",
-                                      "prefix_file",
-                                      NULL};
+/* The collection of files is included below to aid
+ * an external "cleanup" process if required.
+ *
+ * Note that the code below relies on the ordering of this array
+ * since each set of three is used by the tests either to construct
+ * or to read and validate.
+ */
+#define NFILENAME 9
+const char *FILENAMES[NFILENAME + 1]={"t_pread_data_file",
+                                      "reloc_t_pread_data_file",
+                                      "prefix_file",
+                                      "t_pread_group_0_file",
+                                      "reloc_t_pread_group_0_file",
+                                      "prefix_file_0",
+                                      "t_pread_group_1_file",
+                                      "reloc_t_pread_group_1_file",
+                                      "prefix_file_1",
+                                      NULL};
 #define FILENAME_BUF_SIZE 1024

 #define COUNT 1000
@@ -56,15 +69,19 @@ static int test_parallel_read(MPI_Comm comm, int mpi_rank, int group);
  *
  *              Since data will be read back and validated, we generate
  *              data in a predictable manner rather than randomly.
- *              For now, we simply use the mpi_rank of the writing
- *              process as a starting component of the data generation.
+ *              For now, we simply use the global mpi_rank of the writing
+ *              process as a starting component for the data generation.
  *              Subsequent writes are increments from the initial start
  *              value.
  *
  *              In the overall scheme of running the test, we'll call
- *              this function twice so as to create two separate files.
- *              Each file will serve as the input data for two
- *              independent parallel reads.
+ *              this function twice: first as a collection of all MPI
+ *              processes and then a second time with the processes split
+ *              more or less in half.  Each subgroup will operate
+ *              collectively on its assigned file.  This split into
+ *              subgroups validates that parallel groups can successfully
+ *              open and read data independently from the other parallel
+ *              operations taking place.
  *
  * Return:      Success: 0
  *
@@ -83,10 +100,11 @@ generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
 {
     FILE *header;
     const char *fcn_name = "generate_test_file()";
     const char *failure_mssg = NULL;
-    char group_file[FILENAME_BUF_SIZE];
+    const char *group_filename = NULL;
     char data_filename[FILENAME_BUF_SIZE];
     char reloc_data_filename[FILENAME_BUF_SIZE];
     char prolog_filename[FILENAME_BUF_SIZE];
+    int file_index;
     int group_size;
     int group_rank;
     int local_failure = 0;
     int global_failures = 0;
     hsize_t count = COUNT;
@@ -123,14 +141,32 @@ generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
         HDfprintf(stdout, "Constructing test files...");
     }

-    /* setup the file names */
+    /* Setup the file names
+     * The test specific filenames are stored as consecutive
+     * array entries in the global 'FILENAMES' array above.
+     * Here, we simply decide on the starting index for
+     * file construction.  The reading portion of the test
+     * will have a similar setup process...
+     */
     if ( pass ) {
-        HDassert(FILENAMES[0]);
-        if ( HDsprintf(group_file, "%s_%d", FILENAMES[0], group_id) < 0) {
-            pass = FALSE;
-            failure_mssg = "HDsprintf(0) failed.\n";
-        }
-        else if ( h5_fixname(group_file, H5P_DEFAULT, data_filename,
+        if ( comm == MPI_COMM_WORLD ) {    /* Test 1 */
+            file_index = 0;
+        }
+        else if ( group_id == 0 ) {        /* Test 2 group 0 */
+            file_index = 3;
+        }
+        else {                             /* Test 2 group 1 */
+            file_index = 6;
+        }
+
+        /* The 'group_filename' is just a temp variable and
+         * is used to call into the h5_fixname function.  No
+         * need to worry that we reassign it for each file!
+         */
+        HDassert((group_filename = FILENAMES[file_index]));
+
+        /* Assign the 'data_filename' */
+        if ( h5_fixname(group_filename, H5P_DEFAULT, data_filename,
                         sizeof(data_filename)) == NULL ) {
             pass = FALSE;
             failure_mssg = "h5_fixname(0) failed.\n";
@@ -138,12 +174,11 @@ generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
     }

     if ( pass ) {
-        HDassert(FILENAMES[1]);
-        if ( HDsprintf(group_file, "%s_%d", FILENAMES[1], group_id) < 0) {
-            pass = FALSE;
-            failure_mssg = "HDsprintf(1) failed.\n";
-        }
-        else if ( h5_fixname(group_file, H5P_DEFAULT, reloc_data_filename,
+
+        HDassert( (group_filename = FILENAMES[file_index+1]) );
+
+        /* Assign the 'reloc_data_filename' */
+        if ( h5_fixname(group_filename, H5P_DEFAULT, reloc_data_filename,
                         sizeof(reloc_data_filename)) == NULL ) {

             pass = FALSE;
@@ -152,12 +187,11 @@ generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
     }

     if ( pass ) {
-        HDassert(FILENAMES[2]);
-        if ( HDsprintf(group_file, "%s_%d", FILENAMES[2], group_id) < 0) {
-            pass = FALSE;
-            failure_mssg = "HDsprintf(2) failed.\n";
-        }
-        else if ( h5_fixname(group_file, H5P_DEFAULT, prolog_filename,
+
+        HDassert( (group_filename = FILENAMES[file_index+2]) );
+
+        /* Assign the 'prolog_filename' */
+        if ( h5_fixname(group_filename, H5P_DEFAULT, prolog_filename,
                         sizeof(prolog_filename)) == NULL ) {
             pass = FALSE;
             failure_mssg = "h5_fixname(2) failed.\n";
@@ -305,11 +339,20 @@ generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
     }

-    /* add a userblock to the head of the datafile.
+    /* Add a userblock to the head of the datafile.
      * We will use this for a functional test of the
-     * file open optimization.
+     * file open optimization.  This superblock
+     * relocation is done by the rank 0 process associated
+     * with the communicator being used.  For test 1, we
+     * utilize MPI_COMM_WORLD, so group_rank 0 is the
+     * same as mpi_rank 0.  For test 2 which utilizes
+     * two groups resulting from an MPI_Comm_split, we
+     * will have parallel groups and hence two
+     * group_rank(0) processes.  Each parallel group
+     * will create a unique file with different text
+     * headers and different data.
      *
-     * Also delete files that are no longer needed.
+     * We also delete files that are no longer needed.
+     */
     if ( group_rank == 0 ) {

         const char *text_to_write;
         size_t bytes_to_write;
@@ -443,7 +486,7 @@ test_parallel_read(MPI_Comm comm, int mpi_rank, int group_id)
 {
     const char *failure_mssg;
     const char *fcn_name = "test_parallel_read()";
-    char group_file[FILENAME_BUF_SIZE];
+    const char *group_filename = NULL;
     char reloc_data_filename[FILENAME_BUF_SIZE];
     int local_failure = 0;
     int global_failures = 0;
@@ -476,8 +519,12 @@ test_parallel_read(MPI_Comm comm, int mpi_rank, int group_id)
     }

     if ( mpi_rank == 0 ) {
-
-        TESTING("parallel file open test 1");
+        if ( comm == MPI_COMM_WORLD ) {
+            TESTING("parallel file open test 1");
+        }
+        else {
+            TESTING("parallel file open test 2");
+        }
     }

     /* allocate space for the data_slice array */
@@ -489,14 +536,21 @@ test_parallel_read(MPI_Comm comm, int mpi_rank, int group_id)
     }

-    /* construct the file name */
+    /* Select the file name to read
+     * Please see the comments in the 'generate_test_file' function
+     * for more details...
+     */
     if ( pass ) {
-        HDassert(FILENAMES[1]);
-        if ( HDsprintf(group_file, "%s_%d", FILENAMES[1], group_id) < 0) {
-            pass = FALSE;
-            failure_mssg = "HDsprintf(0) failed.\n";
-        }
-        else if ( h5_fixname(group_file, H5P_DEFAULT, reloc_data_filename,
+
+        if ( comm == MPI_COMM_WORLD )    /* test 1 */
+            group_filename = FILENAMES[1];
+        else if ( group_id == 0 )        /* test 2 group 0 */
+            group_filename = FILENAMES[4];
+        else                             /* test 2 group 1 */
+            group_filename = FILENAMES[7];
+
+        HDassert(group_filename);
+        if ( h5_fixname(group_filename, H5P_DEFAULT, reloc_data_filename,
                         sizeof(reloc_data_filename)) == NULL ) {

             pass = FALSE;
@@ -702,11 +756,11 @@ int
 main( int argc, char **argv)
 {
     int nerrs = 0;
-    int which_group;
+    int which_group = 0;
     int mpi_rank;
     int mpi_size;
     int split_size;
-    MPI_Comm group_comm = MPI_COMM_NULL;
+    MPI_Comm group_comm = MPI_COMM_WORLD;

     if ( (MPI_Init(&argc, &argv)) != MPI_SUCCESS) {
         HDfprintf(stderr, "FATAL: Unable to initialize MPI\n");
         exit(1);
@@ -741,9 +795,42 @@ main( int argc, char **argv)
         goto finish;
     }

-    /* Divide the available processes into two groups
-     * that are the same size (plus or minus).
+    /* ------  Test 1 of 2 ------
+     * In this test we utilize all processes which make up MPI_COMM_WORLD.
+     * We generate the test file which we'll shortly try to read.
      */
+    nerrs += generate_test_file( group_comm, mpi_rank, which_group );
+
+    /* abort tests if there were any errors in test file construction */
+    if ( nerrs > 0 ) {
+        if ( mpi_rank == 0 ) {
+            HDprintf("    Test file construction failed -- skipping tests.\n");
+        }
+        goto finish;
+    }
+
+    /* Now read the generated test file (still using MPI_COMM_WORLD) */
+    nerrs += test_parallel_read( group_comm, mpi_rank, which_group);
+
+    if ( nerrs > 0 ) {
+        if ( mpi_rank == 0 ) {
+            HDprintf("    Parallel read test failed -- skipping tests.\n");
+        }
+        goto finish;
+    }
+
+    /* Update the user on our progress so far. */
+    if ( mpi_rank == 0 ) {
+        HDprintf("    Test 1 of 2 succeeded\n");
+        HDprintf("    -- Starting multi-group parallel read test.\n");
+    }
+
+    /* ------  Test 2 of 2 ------
+     * Create two more or less equal MPI groups to
+     * initialize the test files and then verify that parallel
+     * operations by independent groups succeed.
+     */
+
     split_size = mpi_size / 2;
     which_group = (mpi_rank < split_size ? 0 : 1);

     if ( (MPI_Comm_split(MPI_COMM_WORLD,
                          which_group,
                          0,
                          &group_comm)) != MPI_SUCCESS) {

         HDfprintf(stderr, "FATAL: MPI_Comm_split returned an error\n");
         exit(2);
     }

-
-    /* create the test files & verify that the process
-     * succeeded.  If not, abort the remaining tests as
-     * they depend on the test files.
- */ - nerrs += generate_test_file( group_comm, mpi_rank, which_group ); /* abort tests if there were any errors in test file construction */ @@ -772,9 +853,20 @@ main( int argc, char **argv) goto finish; } - /* run the tests */ + /* run the 2nd set of tests */ nerrs += test_parallel_read(group_comm, mpi_rank, which_group); + if ( nerrs > 0 ) { + if ( mpi_rank == 0 ) { + HDprintf(" Multi-group read test failed\n"); + } + goto finish; + } + + if ( mpi_rank == 0 ) { + HDprintf(" Test 2 of 2 succeeded\n"); + } + finish: if ((group_comm != MPI_COMM_NULL) && -- cgit v0.12 From 157398107e334e3dafbdcd25f34da391510e45f2 Mon Sep 17 00:00:00 2001 From: Richard Warren Date: Wed, 11 Oct 2017 13:40:54 -0400 Subject: Try to address most of the issues raised by Dana in the code review --- src/H5Fsuper.c | 102 ++++++++++++++++++++++++++-------------------------- testpar/t_pread.c | 104 ++++++++++++++++++++++++++---------------------------- 2 files changed, 101 insertions(+), 105 deletions(-) diff --git a/src/H5Fsuper.c b/src/H5Fsuper.c index a3b1fed..0c6f9cd 100644 --- a/src/H5Fsuper.c +++ b/src/H5Fsuper.c @@ -21,15 +21,15 @@ /***********/ /* Headers */ /***********/ -#include "H5private.h" /* Generic Functions */ +#include "H5private.h" /* Generic Functions */ #include "H5ACprivate.h" /* Metadata cache */ -#include "H5Eprivate.h" /* Error handling */ +#include "H5Eprivate.h" /* Error handling */ #include "H5Fpkg.h" /* File access */ -#include "H5FDprivate.h" /* File drivers */ -#include "H5Iprivate.h" /* IDs */ +#include "H5FDprivate.h" /* File drivers */ +#include "H5Iprivate.h" /* IDs */ #include "H5MFprivate.h" /* File memory management */ -#include "H5MMprivate.h" /* Memory management */ -#include "H5Pprivate.h" /* Property lists */ +#include "H5MMprivate.h" /* Memory management */ +#include "H5Pprivate.h" /* Property lists */ #include "H5SMprivate.h" /* Shared Object Header Messages */ @@ -158,7 +158,7 @@ H5F_super_ext_open(H5F_t *f, haddr_t ext_addr, H5O_loc_t *ext_ptr) /* Open the superblock extension object header */ if(H5O_open(ext_ptr) < 0) - HGOTO_ERROR(H5E_OHDR, H5E_CANTOPENOBJ, FAIL, "unable to open superblock extension") + HGOTO_ERROR(H5E_OHDR, H5E_CANTOPENOBJ, FAIL, "unable to open superblock extension") done: FUNC_LEAVE_NOAPI(ret_value) @@ -224,12 +224,12 @@ done: /*------------------------------------------------------------------------- * Function: H5F__update_super_ext_driver_msg * - * Purpose: Update the superblock extension file driver info message if - * we are using a V 2 superblock. Observe that the function - * is a NO-OP if the file driver info message does not exist. + * Purpose: Update the superblock extension file driver info message if + * we are using a V 2 superblock. Observe that the function + * is a NO-OP if the file driver info message does not exist. * This is necessary, as the function is called whenever the - * EOA is updated, and were it to create the file driver info - * message, it would find itself in an infinite recursion. + * EOA is updated, and were it to create the file driver info + * message, it would find itself in an infinite recursion. * * Return: Success: SUCCEED * Failure: FAIL @@ -267,7 +267,7 @@ H5F__update_super_ext_driver_msg(H5F_t *f, hid_t dxpl_id) /* Check for driver info */ H5_CHECKED_ASSIGN(driver_size, size_t, H5FD_sb_size(f->shared->lf), hsize_t); - /* Nothing to do unless there is both driver info and + /* Nothing to do unless there is both driver info and * the driver info superblock extension message has * already been created. 
      */
@@ -330,13 +330,13 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
     unsigned sblock_flags = H5AC__NO_FLAGS_SET; /* flags used in superblock unprotect call */
     haddr_t super_addr; /* Absolute address of superblock */
     haddr_t eof; /* End of file address */
-    unsigned rw_flags; /* Read/write permissions for file */
-    hbool_t skip_eof_check = FALSE; /* Whether to skip checking the EOF value */
+    unsigned rw_flags; /* Read/write permissions for file */
+    hbool_t skip_eof_check = FALSE; /* Whether to skip checking the EOF value */
     herr_t ret_value = SUCCEED; /* Return value */
 #ifdef H5_HAVE_PARALLEL
     int mpi_rank = 0, mpi_size = 1;
     int mpi_result;
-#endif /* H5_HAVE_PARALLEL */
+#endif /* H5_HAVE_PARALLEL */
 
     FUNC_ENTER_PACKAGE_TAG(meta_dxpl_id, H5AC__SUPERBLOCK_TAG, FAIL)
@@ -365,7 +365,7 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
             HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "Can't get MPI rank")
 
         if((mpi_size = H5F_mpi_get_size(f)) < 0)
-            HGOTO_ERROR(H5E_PAGEBUF, H5E_CANTGET, FAIL, "can't retrieve MPI communicator size")
+            HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't retrieve MPI communicator size")
     }
 
     /* If we are an MPI application with at least two processes, the
@@ -380,14 +380,14 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
     if ( mpi_size > 1 ) {
         MPI_Comm this_comm = MPI_COMM_NULL;
 
-        if ( mpi_rank == 0 ) {
-            if(H5FD_locate_signature(&fdio_info, &super_addr) < 0)
-                HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
+        if ( mpi_rank == 0 ) {
+            if (H5FD_locate_signature(&fdio_info, &super_addr) < 0)
+                HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
         }
         HDassert(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI));
 
         if ( MPI_COMM_NULL == (this_comm = H5F_mpi_get_comm(f)) )
-            HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI communicator")
+            HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get MPI communicator")
 
         if ( MPI_SUCCESS != (mpi_result = MPI_Bcast(&super_addr,sizeof(super_addr),
                              MPI_BYTE, 0, this_comm)))
     }
     else {
         /* Locate the signature as per per the serial library */
-#endif /* H5_HAVE_PARALLEL */
+#endif /* H5_HAVE_PARALLEL */
 
-        if(H5FD_locate_signature(&fdio_info, &super_addr) < 0)
-            HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
+        if (H5FD_locate_signature(&fdio_info, &super_addr) < 0)
+            HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
 
 #ifdef H5_HAVE_PARALLEL
     }
-#endif /* H5_HAVE_PARALLEL */
+#endif /* H5_HAVE_PARALLEL */
 
     if(HADDR_UNDEF == super_addr)
         HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "file signature not found")
@@ -453,12 +453,12 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
         HGOTO_ERROR(H5E_FILE, H5E_CANTPROTECT, FAIL, "unable to load superblock")
 
     if(H5F_INTENT(f) & H5F_ACC_SWMR_WRITE)
-        if(sblock->super_vers < HDF5_SUPERBLOCK_VERSION_3)
-            HGOTO_ERROR(H5E_FILE, H5E_CANTPROTECT, FAIL, "invalid superblock version for SWMR_WRITE")
+        if(sblock->super_vers < HDF5_SUPERBLOCK_VERSION_3)
+            HGOTO_ERROR(H5E_FILE, H5E_CANTPROTECT, FAIL, "invalid superblock version for SWMR_WRITE")
 
     /* Enable all latest version support when file has v3 superblock */
     if(sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_3)
-        f->shared->latest_flags |= H5F_LATEST_ALL_FLAGS;
+        f->shared->latest_flags |= H5F_LATEST_ALL_FLAGS;
 
     /* Pin the superblock in the cache */
     if(H5AC_pin_protected_entry(sblock) < 0)
@@ -558,15 +558,15 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
      * been flushed to disk by the SWMR writer process.
      */
     if(H5F_INTENT(f) & H5F_ACC_SWMR_READ) {
-        /*
-         * When the file is opened for SWMR read access, skip the check if:
-         * --the file is already marked for SWMR writing and
-         * --the file has version 3 superblock for SWMR support
-         */
-        if((sblock->status_flags & H5F_SUPER_SWMR_WRITE_ACCESS) &&
+        /*
+         * When the file is opened for SWMR read access, skip the check if:
+         * --the file is already marked for SWMR writing and
+         * --the file has version 3 superblock for SWMR support
+         */
+        if((sblock->status_flags & H5F_SUPER_SWMR_WRITE_ACCESS) &&
                 (sblock->status_flags & H5F_SUPER_WRITE_ACCESS) &&
                 sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_3)
-            skip_eof_check = TRUE;
+            skip_eof_check = TRUE;
     } /* end if */
 
     if(!skip_eof_check && initial_read) {
         if(HADDR_UNDEF == (eof = H5FD_get_eof(f->shared->lf, H5FD_MEM_DEFAULT)))
@@ -640,7 +640,7 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
         H5O_loc_t ext_loc; /* "Object location" for superblock extension */
         H5O_btreek_t btreek; /* v1 B-tree 'K' value message from superblock extension */
         H5O_drvinfo_t drvinfo; /* Driver info message from superblock extension */
-        size_t u; /* Local index variable */
+        size_t u; /* Local index variable */
         htri_t status; /* Status for message existing */
 
         /* Sanity check - superblock extension should only be defined for
@@ -661,7 +661,7 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
         } /* end if */
 
         /* Open the superblock extension */
-        if(H5F_super_ext_open(f, sblock->ext_addr, &ext_loc) < 0)
+        if(H5F_super_ext_open(f, sblock->ext_addr, &ext_loc) < 0)
             HGOTO_ERROR(H5E_FILE, H5E_CANTOPENOBJ, FAIL, "unable to open file's superblock extension")
 
         /* Check for the extension having a 'driver info' message */
@@ -684,8 +684,8 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
                 /* Reset driver info message */
                 H5O_msg_reset(H5O_DRVINFO_ID, &drvinfo);
 
-                HDassert(FALSE == f->shared->drvinfo_sb_msg_exists);
-                f->shared->drvinfo_sb_msg_exists = TRUE;
+                HDassert(FALSE == f->shared->drvinfo_sb_msg_exists);
+                f->shared->drvinfo_sb_msg_exists = TRUE;
             } /* end else */
         } /* end if */
 
@@ -811,37 +811,37 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
             } /* end if not marked "unknown" */
         } /* end if */
 
-        /* Check for the extension having a 'metadata cache image' message */
+        /* Check for the extension having a 'metadata cache image' message */
         if((status = H5O_msg_exists(&ext_loc, H5O_MDCI_MSG_ID, meta_dxpl_id)) < 0)
             HGOTO_ERROR(H5E_FILE, H5E_EXISTS, FAIL, "unable to read object header")
         if(status) {
-            hbool_t rw = ((rw_flags & H5AC__READ_ONLY_FLAG) == 0);
-            H5O_mdci_t mdci_msg;
+            hbool_t rw = ((rw_flags & H5AC__READ_ONLY_FLAG) == 0);
+            H5O_mdci_t mdci_msg;
 
-            /* if the metadata cache image superblock extension message exists,
+            /* if the metadata cache image superblock extension message exists,
              * read its contents and pass the data on to the metadata cache.
              * Given this data, the cache will load and decode the metadata
-             * cache image block, decoded it and load its contents into the
-             * the cache on the test protect call.
+             * cache image block, decode it and load its contents into
+             * the cache on the test protect call.
              *
              * Further, if the file is opened R/W, the metadata cache will
-             * delete the metadata cache image superblock extension and free
-             * the cache image block. Don't do this now as f->shared
-             * is not fully setup, which complicates matters.
+             * delete the metadata cache image superblock extension and free
+             * the cache image block. Don't do this now as f->shared
+             * is not fully set up, which complicates matters.
              */
 
             /* Retrieve the 'metadata cache image message' structure */
-            if(NULL == H5O_msg_read(&ext_loc, H5O_MDCI_MSG_ID, &mdci_msg, meta_dxpl_id))
+            if(NULL == H5O_msg_read(&ext_loc, H5O_MDCI_MSG_ID, &mdci_msg, meta_dxpl_id))
                 HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "unable to get metadata cache image message")
 
             /* Indicate to the cache that there's an image to load on first protect call */
             if(H5AC_load_cache_image_on_next_protect(f, mdci_msg.addr, mdci_msg.size, rw) < 0)
-                HGOTO_ERROR(H5E_FILE, H5E_CANTLOAD, FAIL, "call to H5AC_load_cache_image_on_next_protect failed");
+                HGOTO_ERROR(H5E_FILE, H5E_CANTLOAD, FAIL, "call to H5AC_load_cache_image_on_next_protect failed");
         } /* end if */
 
         /* Close superblock extension */
         if(H5F_super_ext_close(f, &ext_loc, meta_dxpl_id, FALSE) < 0)
-            HGOTO_ERROR(H5E_FILE, H5E_CANTCLOSEOBJ, FAIL, "unable to close file's superblock extension")
+            HGOTO_ERROR(H5E_FILE, H5E_CANTCLOSEOBJ, FAIL, "unable to close file's superblock extension")
     } /* end if */
 
     /* Update the driver info if VFD indicated to do so */
diff --git a/testpar/t_pread.c b/testpar/t_pread.c
index b40fc09..f0cad3d 100644
--- a/testpar/t_pread.c
+++ b/testpar/t_pread.c
@@ -87,17 +86,13 @@ static int test_parallel_read(MPI_Comm comm, int mpi_rank, int group);
  *
  *              Failure: 1
  *
- * Programmer:  Richard Warren
- *              10/1/17
- *
- * Modifications:
  *
  *-------------------------------------------------------------------------
  */
 static int
 generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
 {
-    FILE *header;
+    FILE *header = NULL;
     const char *fcn_name = "generate_test_file()";
     const char *failure_mssg = NULL;
     const char *group_filename = NULL;
@@ -113,12 +109,12 @@ generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
     hsize_t i;
     hsize_t offset;
     hsize_t dims[1] = {0};
-    hid_t file_id;
-    hid_t memspace;
-    hid_t filespace;
-    hid_t fapl_id;
-    hid_t dxpl_id;
-    hid_t dset_id;
+    hid_t file_id = -1;
+    hid_t memspace = -1;
+    hid_t filespace = -1;
+    hid_t fapl_id = -1;
+    hid_t dxpl_id = -1;
+    hid_t dset_id = -1;
     float nextValue;
     float *data_slice = NULL;
@@ -155,17 +151,17 @@ generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
     else if ( group_id == 0 ) { /* Test 2 group 0 */
         file_index = 3;
     }
-    else { /* Test 2 group 1 */
+    else { /* Test 2 group 1 */
         file_index = 6;
     }
 
-    /* The 'group_filename' is just a temp variable and
+    /* The 'group_filename' is just a temp variable and
      * is used to call into the h5_fixname function. No
      * need to worry that we reassign it for each file!
-     */
+     */
 
     HDassert((group_filename = FILENAMES[file_index]));
 
-    /* Assign the 'data_filename' */
+    /* Assign the 'data_filename' */
     if ( h5_fixname(group_filename, H5P_DEFAULT, data_filename,
                     sizeof(data_filename)) == NULL ) {
         pass = FALSE;
@@ -177,7 +173,7 @@ generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
 
     HDassert( (group_filename = FILENAMES[file_index+1]) );
 
-    /* Assign the 'reloc_data_filename' */
+    /* Assign the 'reloc_data_filename' */
     if ( h5_fixname(group_filename, H5P_DEFAULT, reloc_data_filename,
                     sizeof(reloc_data_filename)) == NULL ) {
 
@@ -190,7 +186,7 @@ generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
 
     HDassert( (group_filename = FILENAMES[file_index+2]) );
 
-    /* Assing the 'prolog_filename' */
+    /* Assign the 'prolog_filename' */
     if ( h5_fixname(group_filename, H5P_DEFAULT, prolog_filename,
                     sizeof(prolog_filename)) == NULL ) {
         pass = FALSE;
@@ -211,7 +207,7 @@ generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
     for(i=0; i<count; i++) {
-    return(nerrs > 0);
+    return((nerrs > 0) ? FAIL : SUCCEED );
 
 } /* main() */
-- 
cgit v0.12


From 3dde6d0e32461f46630f814a2fdfbd4c813703bf Mon Sep 17 00:00:00 2001
From: Richard Warren
Date: Wed, 11 Oct 2017 16:22:50 -0400
Subject: Updated the code and RELEASE.txt note per comments from John Mainzer

---
 release_docs/RELEASE.txt | 10 +++---
 testpar/t_pread.c        | 84 +++++++++++++++++++++++++-----------------------
 2 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt
index ed1b6cc..e561983 100644
--- a/release_docs/RELEASE.txt
+++ b/release_docs/RELEASE.txt
@@ -64,11 +64,11 @@ New Features
     -----------------
     - Optimize parallel open/location of the HDF5 super-block
 
-      Previous releases of PHDF5 allow all parallel ranks to
-      read the starting elements in a file to validate and process
-      the HDF5 super-block. As this is accomplished more or less as
-      a synchronous operation, a large number of processes will
-      likely experience a slowdown due to filesystem contention.
+      Previous releases of PHDF5 required all parallel ranks to
+      search for the HDF5 superblock signature when opening the
+      file. As this is accomplished more or less as a synchronous
+      operation, a large number of processes can experience a
+      slowdown in the file open due to filesystem contention.
 
       As a first step in improving the startup/file-open performance,
       we allow MPI rank 0 of the associated MPI communicator to locate
diff --git a/testpar/t_pread.c b/testpar/t_pread.c
index f0cad3d..7f23b9b 100644
--- a/testpar/t_pread.c
+++ b/testpar/t_pread.c
@@ -16,7 +16,6 @@
  *
  */
 
-
 #include "h5test.h"
 #include "testpar.h"
 
@@ -87,7 +86,11 @@ static int test_parallel_read(MPI_Comm comm, int mpi_rank, int group);
  *
  *              Failure: 1
  *
+ * Programmer:  Richard Warren
+ *              10/1/17
  *
+ * Modifications:
+ *
  *-------------------------------------------------------------------------
  */
 static int
@@ -758,17 +761,17 @@ main( int argc, char **argv)
 
     if ( (MPI_Init(&argc, &argv)) != MPI_SUCCESS) {
         HDfprintf(stderr, "FATAL: Unable to initialize MPI\n");
-        HDexit(FAIL);
+        HDexit(EXIT_FAILURE);
     }
 
     if ( (MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank)) != MPI_SUCCESS) {
         HDfprintf(stderr, "FATAL: MPI_Comm_rank returned an error\n");
-        HDexit(FAIL);
+        HDexit(EXIT_FAILURE);
     }
 
     if ( (MPI_Comm_size(MPI_COMM_WORLD, &mpi_size)) != MPI_SUCCESS) {
         HDfprintf(stderr, "FATAL: MPI_Comm_size returned an error\n");
-        HDexit(FAIL);
+        HDexit(EXIT_FAILURE);
     }
 
     H5open();
@@ -789,26 +792,53 @@ main( int argc, char **argv)
         goto finish;
     }
 
-    /* ------ Test 1 of 2 ------
-     * In this test we utilize all processes which make up MPI_COMM_WORLD.
-     * We generate the test file which we'll shortly try to read.
+    /* ------ Create two (2) MPI groups ------
+     *
+     * We split MPI_COMM_WORLD into 2 more or less equal-sized
+     * groups. The resulting communicators will be used to generate
+     * two HDF5 files which in turn will be opened in parallel and the
+     * contents verified in the second read test below.
      */
+    split_size = mpi_size / 2;
+    which_group = (mpi_rank < split_size ? 0 : 1);
+
+    if ( (MPI_Comm_split(MPI_COMM_WORLD,
+                         which_group,
+                         0,
+                         &group_comm)) != MPI_SUCCESS) {
+
+        HDfprintf(stderr, "FATAL: MPI_Comm_split returned an error\n");
+        HDexit(EXIT_FAILURE);
+    }
+
+    /* ------ Generate all files ------ */
+
+    /* We generate the file used for test 1 */
+    nerrs += generate_test_file( MPI_COMM_WORLD, mpi_rank, which_group );
+
+    if ( nerrs > 0 ) {
+        if ( mpi_rank == 0 ) {
+            HDprintf(" Test(1) file construction failed -- skipping tests.\n");
+        }
+        goto finish;
+    }
+
+    /* We generate the file used for test 2 */
     nerrs += generate_test_file( group_comm, mpi_rank, which_group );
 
-    /* abort tests if there were any errors in test file construction */
     if ( nerrs > 0 ) {
         if ( mpi_rank == 0 ) {
-            HDprintf(" Test file construction failed -- skipping tests.\n");
+            HDprintf(" Test(2) file construction failed -- skipping tests.\n");
         }
         goto finish;
     }
 
     /* Now read the generated test file (still using MPI_COMM_WORLD) */
-    nerrs += test_parallel_read( group_comm, mpi_rank, which_group);
+    nerrs += test_parallel_read( MPI_COMM_WORLD, mpi_rank, which_group);
 
     if ( nerrs > 0 ) {
         if ( mpi_rank == 0 ) {
-            HDprintf(" Parallel read test failed -- skipping tests.\n");
+            HDprintf(" Parallel read test(1) failed -- skipping tests.\n");
         }
         goto finish;
     }
@@ -819,40 +849,12 @@ main( int argc, char **argv)
         HDprintf(" -- Starting multi-group parallel read test.\n");
     }
 
-    /* ------ Test 2 of 2 ------
-     * Create two more or less equal MPI groups to
-     * initialize the test files and then verify that parallel
-     * operations by independent groups succeed.
-     */
-
-    split_size = mpi_size / 2;
-    which_group = (mpi_rank < split_size ? 0 : 1);
-
-    if ( (MPI_Comm_split(MPI_COMM_WORLD,
-                         which_group,
-                         0,
-                         &group_comm)) != MPI_SUCCESS) {
-
-        HDfprintf(stderr, "FATAL: MPI_Comm_split returned an error\n");
-        HDexit(FAIL);
-    }
-
-    nerrs += generate_test_file( group_comm, mpi_rank, which_group );
-
-    /* abort tests if there were any errors in test file construction */
-    if ( nerrs > 0 ) {
-        if ( mpi_rank == 0 ) {
-            HDprintf(" Test file construction failed -- skipping tests.\n");
-        }
-        goto finish;
-    }
-
     /* run the 2nd set of tests */
     nerrs += test_parallel_read(group_comm, mpi_rank, which_group);
 
     if ( nerrs > 0 ) {
         if ( mpi_rank == 0 ) {
-            HDprintf(" Multi-group read test failed\n");
+            HDprintf(" Multi-group read test(2) failed\n");
         }
         goto finish;
     }
 
@@ -897,6 +899,6 @@ finish:
     MPI_Finalize();
 
     /* cannot just return (nerrs) because exit code is limited to 1byte */
-    return((nerrs > 0) ? FAIL : SUCCEED );
+    return((nerrs > 0) ? EXIT_FAILURE : EXIT_SUCCESS );
 
 } /* main() */
-- 
cgit v0.12
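
For reference, the pattern these patches introduce in H5F__super_read() can be
illustrated outside the library. What follows is a minimal, self-contained
sketch, not HDF5 source: plain stdio stands in for the H5FD virtual file
layer, the file name relocated_super.h5 is borrowed from the test above purely
for illustration, and only standard MPI calls are used. The signature search
follows the documented HDF5 layout rule that the superblock may begin at byte
offset 0, 512, 1024, 2048, and so on.

#include <stdio.h>
#include <string.h>
#include <mpi.h>

/* The 8-byte HDF5 file signature: \211 H D F \r \n \032 \n */
static const unsigned char HDF5_SIG[8] =
    {0x89, 'H', 'D', 'F', '\r', '\n', 0x1a, '\n'};

/* Search byte offsets 0, 512, 1024, 2048, ... for the signature.
 * Returns the offset on success, or -1 if the signature is not found.
 */
static long
locate_signature(const char *filename)
{
    FILE *fp = fopen(filename, "rb");
    unsigned char buf[8];
    long offset = 0;

    if (fp == NULL)
        return -1;
    while (fseek(fp, offset, SEEK_SET) == 0 && fread(buf, 1, 8, fp) == 8) {
        if (memcmp(buf, HDF5_SIG, 8) == 0) {
            fclose(fp);
            return offset;              /* found the superblock */
        }
        offset = (offset == 0) ? 512 : offset * 2;
    }
    fclose(fp);
    return -1;                          /* hit EOF without a match */
}

int
main(int argc, char **argv)
{
    int  rank = 0, size = 1;
    long super_addr = -1;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size > 1) {
        /* Only rank 0 touches the file system... */
        if (rank == 0)
            super_addr = locate_signature("relocated_super.h5");
        /* ...and every other rank receives the result, so N-1 ranks
         * never contend for the same few blocks of the file. */
        MPI_Bcast(&super_addr, 1, MPI_LONG, 0, MPI_COMM_WORLD);
    }
    else {
        /* Serial case: search exactly as before. */
        super_addr = locate_signature("relocated_super.h5");
    }

    printf("rank %d: superblock signature at offset %ld\n", rank, super_addr);
    MPI_Finalize();
    return 0;
}

As in H5F__super_read(), only rank 0 performs the search when more than one
process is present; the serial path is unchanged. Error propagation is
simplified here: a sentinel offset of -1 is broadcast so that all ranks fail
consistently, whereas the library raises HGOTO_ERROR around the MPI_Bcast.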