From 16aa2dbaa0e70bf81f4329a70a45c601433549bb Mon Sep 17 00:00:00 2001 From: jhendersonHDF Date: Fri, 16 Sep 2022 11:17:30 -0500 Subject: Subfiling VFD updates (#2106) --- CMakeLists.txt | 6 + configure.ac | 32 +- examples/ph5_subfiling.c | 251 ++- src/H5FDsubfiling/H5FDioc.c | 805 ++++----- src/H5FDsubfiling/H5FDioc.h | 21 +- src/H5FDsubfiling/H5FDioc_int.c | 295 ++-- src/H5FDsubfiling/H5FDioc_priv.h | 37 +- src/H5FDsubfiling/H5FDioc_threads.c | 300 ++-- src/H5FDsubfiling/H5FDsubfile_int.c | 186 +- src/H5FDsubfiling/H5FDsubfiling.c | 832 +++++---- src/H5FDsubfiling/H5FDsubfiling.h | 124 +- src/H5FDsubfiling/H5subfiling_common.c | 2912 +++++++++++++++++++++----------- src/H5FDsubfiling/H5subfiling_common.h | 181 +- testpar/CMakeTests.cmake | 8 + testpar/t_subfiling_vfd.c | 1916 +++++++++++++++++++-- testpar/t_vfd.c | 4 +- utils/subfiling_vfd/CMakeLists.txt | 14 +- utils/subfiling_vfd/h5fuse.sh.in | 4 +- 18 files changed, 5502 insertions(+), 2426 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 35b345c..d1ef0ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -740,6 +740,12 @@ option (HDF5_ENABLE_SUBFILING_VFD "Build Parallel HDF5 Subfiling VFD" OFF) if (HDF5_ENABLE_SUBFILING_VFD) if (NOT HDF5_ENABLE_PARALLEL) message (FATAL_ERROR "Subfiling VFD requires a parallel HDF5 build") + else () + # Check for MPI_Comm_split_type + CHECK_SYMBOL_EXISTS (MPI_Comm_split_type "mpi.h" H5_HAVE_MPI_Comm_split_type) + if (NOT H5_HAVE_MPI_Comm_split_type) + message (FATAL_ERROR "Subfiling VFD requires MPI-3 support for MPI_Comm_split_type") + endif () endif() if (NOT DEFINED Threads_FOUND) diff --git a/configure.ac b/configure.ac index ab177fc..cdbdadb 100644 --- a/configure.ac +++ b/configure.ac @@ -3213,6 +3213,32 @@ if test "X$SUBFILING_VFD" = "Xyes"; then AC_DEFINE([HAVE_IOC_VFD], [1], [Define if the I/O Concentrator virtual file driver (VFD) should be compiled]) + if test "X${PARALLEL}" != "Xyes"; then + AC_MSG_ERROR([--enable-parallel is required for 
--enable-subfiling-vfd]) + fi + + ## ---------------------------------------------------------------------- + ## Check for MPI_Comm_split_type availability + ## + AC_MSG_CHECKING([for MPI_Comm_split_type function]) + + AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [[ + #include <mpi.h> + ]], + [[ + MPI_Comm intra_comm; + MPI_Init(0, (void *) 0); + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &intra_comm); + ]] + )], + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no]) + AC_MSG_ERROR([unable to link MPI program that uses MPI_Comm_split_type]) + ] + ) + # Set-up mercury HAVE_MERCURY="yes" mercury_dir="$ac_abs_confdir/src/H5FDsubfiling/mercury" @@ -3221,17 +3247,13 @@ if test "X$SUBFILING_VFD" = "Xyes"; then CPPFLAGS="$CPPFLAGS -I$mercury_inc" AM_CPPFLAGS="$AM_CPPFLAGS -I$mercury_inc" - if test "X${PARALLEL}" != "Xyes"; then - AC_MSG_ERROR([--enable-parallel is required for --enable-subfiling-vfd]) - fi - HAVE_STDATOMIC_H="yes" AC_CHECK_HEADERS([stdatomic.h],,[HAVE_STDATOMIC_H="no"]) if test "x$HAVE_STDATOMIC_H" = "xno"; then AC_MSG_ERROR([Subfiling VFD requires atomic operations support. C11 stdatomic.h NOT available.]) fi -# Checks for libraries. + # Checks for libraries. AC_SEARCH_LIBS([shm_open], [rt]) AC_CHECK_LIB([pthread], [pthread_self],[], [echo "Error: Required library pthread not found." 
&& exit 1]) diff --git a/examples/ph5_subfiling.c b/examples/ph5_subfiling.c index 9142749..7d72448 100644 --- a/examples/ph5_subfiling.c +++ b/examples/ph5_subfiling.c @@ -12,7 +12,7 @@ /* * Example of using HDF5's Subfiling VFD to write to an - * HDF5 file that is striped across multiple sub-files + * HDF5 file that is striped across multiple subfiles * * If the HDF5_NOCLEANUP environment variable is set, the * files that this example creates will not be removed as @@ -35,12 +35,13 @@ #define EXAMPLE_FILE "h5_subfiling_default_example.h5" #define EXAMPLE_FILE2 "h5_subfiling_custom_example.h5" +#define EXAMPLE_FILE3 "h5_subfiling_precreate_example.h5" #define EXAMPLE_DSET_NAME "DSET" #define EXAMPLE_DSET_DIMS 2 -/* Have each MPI rank write 64MiB of data */ -#define EXAMPLE_DSET_NY 16777216 +/* Have each MPI rank write 16MiB of data */ +#define EXAMPLE_DSET_NY 4194304 /* Dataset datatype */ #define EXAMPLE_DSET_DATATYPE H5T_NATIVE_INT @@ -56,6 +57,11 @@ cleanup(char *filename, hid_t fapl_id) H5Fdelete(filename, fapl_id); } +/* + * An example of using the HDF5 Subfiling VFD with + * its default settings of 1 subfile per node, with + * a stripe size of 32MiB + */ static void subfiling_write_default(hid_t fapl_id, int mpi_size, int mpi_rank) { @@ -64,12 +70,19 @@ subfiling_write_default(hid_t fapl_id, int mpi_size, int mpi_rank) hsize_t start[EXAMPLE_DSET_DIMS]; hsize_t count[EXAMPLE_DSET_DIMS]; hid_t file_id; + hid_t subfiling_fapl; hid_t dset_id; hid_t filespace; char filename[512]; char *par_prefix; /* + * Make a copy of the FAPL so we don't disturb + * it for the other examples + */ + subfiling_fapl = H5Pcopy(fapl_id); + + /* * Set Subfiling VFD on FAPL using default settings * (use IOC VFD, 1 IOC per node, 32MiB stripe size) * @@ -77,7 +90,7 @@ subfiling_write_default(hid_t fapl_id, int mpi_size, int mpi_rank) * can be adjusted with environment variables as well * in this case. 
*/ - H5Pset_fapl_subfiling(fapl_id, NULL); + H5Pset_fapl_subfiling(subfiling_fapl, NULL); /* * OPTIONAL: Set alignment of objects in HDF5 file to @@ -94,7 +107,7 @@ subfiling_write_default(hid_t fapl_id, int mpi_size, int mpi_rank) * files, so it is a good idea to keep an eye * on this. */ - H5Pset_alignment(fapl_id, 0, 33554432); /* Align to default 32MiB stripe size */ + H5Pset_alignment(subfiling_fapl, 0, 33554432); /* Align to default 32MiB stripe size */ /* Parse any parallel prefix and create filename */ par_prefix = getenv("HDF5_PARAPREFIX"); @@ -105,7 +118,7 @@ subfiling_write_default(hid_t fapl_id, int mpi_size, int mpi_rank) /* * Create a new file collectively */ - file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, subfiling_fapl); /* * Create the dataspace for the dataset. The first @@ -155,9 +168,15 @@ subfiling_write_default(hid_t fapl_id, int mpi_size, int mpi_rank) H5Sclose(filespace); H5Fclose(file_id); - cleanup(EXAMPLE_FILE, fapl_id); + cleanup(EXAMPLE_FILE, subfiling_fapl); + + H5Pclose(subfiling_fapl); } +/* + * An example of using the HDF5 Subfiling VFD with + * custom settings + */ static void subfiling_write_custom(hid_t fapl_id, int mpi_size, int mpi_rank) { @@ -168,17 +187,23 @@ subfiling_write_custom(hid_t fapl_id, int mpi_size, int mpi_rank) hsize_t start[EXAMPLE_DSET_DIMS]; hsize_t count[EXAMPLE_DSET_DIMS]; hid_t file_id; - hid_t ioc_fapl; + hid_t subfiling_fapl; hid_t dset_id; hid_t filespace; char filename[512]; char *par_prefix; /* + * Make a copy of the FAPL so we don't disturb + * it for the other examples + */ + subfiling_fapl = H5Pcopy(fapl_id); + + /* * Get a default Subfiling and IOC configuration */ - H5Pget_fapl_subfiling(fapl_id, &subf_config); - H5Pget_fapl_ioc(fapl_id, &ioc_config); + H5Pget_fapl_subfiling(subfiling_fapl, &subf_config); + H5Pget_fapl_ioc(subfiling_fapl, &ioc_config); /* * Set Subfiling configuration to use a 1MiB @@ -198,32 +223,18 
@@ subfiling_write_custom(hid_t fapl_id, int mpi_size, int mpi_rank) * configuration. */ ioc_config.thread_pool_size = 2; - ioc_config.subf_config = subf_config.shared_cfg; - - /* - * Create a File Access Property List for - * the IOC VFD and set our new configuration - * on it. We make a copy of the original - * FAPL here so we get the MPI parameters - * set on it - */ - ioc_fapl = H5Pcopy(fapl_id); - H5Pset_fapl_ioc(ioc_fapl, &ioc_config); /* - * Close FAPLs in the default configurations - * we retrieved and update the subfiling - * configuration with our new IOC FAPL + * Set our new configuration on the IOC + * FAPL used for Subfiling */ - H5Pclose(ioc_config.under_fapl_id); - H5Pclose(subf_config.ioc_fapl_id); - subf_config.ioc_fapl_id = ioc_fapl; + H5Pset_fapl_ioc(subf_config.ioc_fapl_id, &ioc_config); /* * Finally, set our new Subfiling configuration * on the original FAPL */ - H5Pset_fapl_subfiling(fapl_id, &subf_config); + H5Pset_fapl_subfiling(subfiling_fapl, &subf_config); /* * OPTIONAL: Set alignment of objects in HDF5 file to @@ -240,7 +251,7 @@ subfiling_write_custom(hid_t fapl_id, int mpi_size, int mpi_rank) * files, so it is a good idea to keep an eye * on this. */ - H5Pset_alignment(fapl_id, 0, 1048576); /* Align to custom 1MiB stripe size */ + H5Pset_alignment(subfiling_fapl, 0, 1048576); /* Align to custom 1MiB stripe size */ /* Parse any parallel prefix and create filename */ par_prefix = getenv("HDF5_PARAPREFIX"); @@ -251,7 +262,7 @@ subfiling_write_custom(hid_t fapl_id, int mpi_size, int mpi_rank) /* * Create a new file collectively */ - file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, subfiling_fapl); /* * Create the dataspace for the dataset. 
The first @@ -301,7 +312,179 @@ subfiling_write_custom(hid_t fapl_id, int mpi_size, int mpi_rank) H5Sclose(filespace); H5Fclose(file_id); - cleanup(EXAMPLE_FILE2, fapl_id); + cleanup(EXAMPLE_FILE2, subfiling_fapl); + + H5Pclose(subfiling_fapl); +} + +/* + * An example of pre-creating an HDF5 file on MPI rank + * 0 when using the HDF5 Subfiling VFD. In this case, + * the subfiling stripe count must be set so that rank + * 0 knows how many subfiles to pre-create. + */ +static void +subfiling_write_precreate(hid_t fapl_id, int mpi_size, int mpi_rank) +{ + EXAMPLE_DSET_C_DATATYPE *data; + H5FD_subfiling_config_t subf_config; + hsize_t dset_dims[EXAMPLE_DSET_DIMS]; + hsize_t start[EXAMPLE_DSET_DIMS]; + hsize_t count[EXAMPLE_DSET_DIMS]; + hid_t file_id; + hid_t subfiling_fapl; + hid_t dset_id; + hid_t filespace; + char filename[512]; + char *par_prefix; + + /* + * Make a copy of the FAPL so we don't disturb + * it for the other examples + */ + subfiling_fapl = H5Pcopy(fapl_id); + + /* + * Get a default Subfiling and IOC configuration + */ + H5Pget_fapl_subfiling(subfiling_fapl, &subf_config); + + /* + * Set the Subfiling stripe count so that rank + * 0 knows how many subfiles the logical HDF5 + * file should consist of. In this case, use + * 5 subfiles with a default stripe size of + * 32MiB. + */ + subf_config.shared_cfg.stripe_count = 5; + + /* + * OPTIONAL: Set alignment of objects in HDF5 file to + * be equal to the Subfiling stripe size. + * Choosing a Subfiling stripe size and HDF5 + * object alignment value that are some + * multiple of the disk block size can + * generally help performance by ensuring + * that I/O is well-aligned and doesn't + * excessively cross stripe boundaries. + * + * Note that this option can substantially + * increase the size of the resulting HDF5 + * files, so it is a good idea to keep an eye + * on this. 
+ */ + H5Pset_alignment(subfiling_fapl, 0, 1048576); /* Align to 1MiB; note the stripe size here is the default 32MiB, not a custom 1MiB */ + + /* Parse any parallel prefix and create filename */ + par_prefix = getenv("HDF5_PARAPREFIX"); + + snprintf(filename, sizeof(filename), "%s%s%s", par_prefix ? par_prefix : "", par_prefix ? "/" : "", + EXAMPLE_FILE3); + + /* Set dataset dimensionality */ + dset_dims[0] = mpi_size; + dset_dims[1] = EXAMPLE_DSET_NY; + + if (mpi_rank == 0) { + /* + * Make sure only this rank opens the file + */ + H5Pset_mpi_params(subfiling_fapl, MPI_COMM_SELF, MPI_INFO_NULL); + + /* + * Set the Subfiling VFD on our FAPL using + * our custom configuration + */ + H5Pset_fapl_subfiling(subfiling_fapl, &subf_config); + + /* + * Create a new file on rank 0 + */ + file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, subfiling_fapl); + + /* + * Create the dataspace for the dataset. The first + * dimension varies with the number of MPI ranks + * while the second dimension is fixed. + */ + filespace = H5Screate_simple(EXAMPLE_DSET_DIMS, dset_dims, NULL); + + /* + * Create the dataset with default properties + */ + dset_id = H5Dcreate2(file_id, EXAMPLE_DSET_NAME, EXAMPLE_DSET_DATATYPE, filespace, H5P_DEFAULT, + H5P_DEFAULT, H5P_DEFAULT); + + /* + * Initialize data buffer + */ + data = malloc(dset_dims[0] * dset_dims[1] * sizeof(EXAMPLE_DSET_C_DATATYPE)); + for (size_t i = 0; i < dset_dims[0] * dset_dims[1]; i++) { + data[i] = i; + } + + /* + * Rank 0 writes to the whole dataset + */ + H5Dwrite(dset_id, EXAMPLE_DSET_DATATYPE, H5S_BLOCK, filespace, H5P_DEFAULT, data); + + /* + * Close/release resources. 
+ */ + + free(data); + + H5Dclose(dset_id); + H5Sclose(filespace); + H5Fclose(file_id); + } + + MPI_Barrier(MPI_COMM_WORLD); + + /* + * Use all MPI ranks to re-open the file and + * read back the dataset that was created + */ + H5Pset_mpi_params(subfiling_fapl, MPI_COMM_WORLD, MPI_INFO_NULL); + + /* + * Use the same subfiling configuration as rank 0 + * used to create the file + */ + H5Pset_fapl_subfiling(subfiling_fapl, &subf_config); + + /* + * Re-open the file on all ranks + */ + file_id = H5Fopen(filename, H5F_ACC_RDONLY, subfiling_fapl); + + /* + * Open the dataset that was created + */ + dset_id = H5Dopen2(file_id, EXAMPLE_DSET_NAME, H5P_DEFAULT); + + /* + * Initialize data buffer + */ + data = malloc(dset_dims[0] * dset_dims[1] * sizeof(EXAMPLE_DSET_C_DATATYPE)); + + /* + * Read the dataset on all ranks + */ + H5Dread(dset_id, EXAMPLE_DSET_DATATYPE, H5S_BLOCK, H5S_ALL, H5P_DEFAULT, data); + + /* + * Close/release resources. + */ + + free(data); + + H5Dclose(dset_id); + H5Fclose(file_id); + + cleanup(EXAMPLE_FILE3, subfiling_fapl); + + H5Pclose(subfiling_fapl); } int @@ -338,6 +521,12 @@ main(int argc, char **argv) /* Use Subfiling VFD with custom settings */ subfiling_write_custom(fapl_id, mpi_size, mpi_rank); + /* + * Use Subfiling VFD to precreate the HDF5 + * file on MPI rank 0 + */ + subfiling_write_precreate(fapl_id, mpi_size, mpi_rank); + H5Pclose(fapl_id); if (mpi_rank == 0) diff --git a/src/H5FDsubfiling/H5FDioc.c b/src/H5FDsubfiling/H5FDioc.c index 78d060f..11d51de 100644 --- a/src/H5FDsubfiling/H5FDioc.c +++ b/src/H5FDsubfiling/H5FDioc.c @@ -47,15 +47,21 @@ typedef struct H5FD_ioc_t { int fd; /* the filesystem file descriptor */ H5FD_ioc_config_t fa; /* driver-specific file access properties */ + H5FD_subfiling_params_t subf_config; + /* MPI Info */ MPI_Comm comm; MPI_Info info; int mpi_rank; int mpi_size; - H5FD_t *ioc_file; /* native HDF5 file pointer */ + uint64_t file_id; + int64_t context_id; /* The value used to lookup a subfiling context for 
the file */ - int64_t context_id; /* The value used to lookup a subfiling context for the file */ + haddr_t eof; + haddr_t eoa; + haddr_t last_eoa; + haddr_t local_eof; char *file_dir; /* Directory where we find files */ char *file_path; /* The user defined filename */ @@ -130,9 +136,8 @@ static herr_t H5FD__ioc_ctl(H5FD_t *file, uint64_t op_code, uint64_t flags, const void *input, void **result); */ -static herr_t H5FD__ioc_get_default_config(hid_t fapl_id, H5FD_ioc_config_t *config_out); +static herr_t H5FD__ioc_get_default_config(H5FD_ioc_config_t *config_out); static herr_t H5FD__ioc_validate_config(const H5FD_ioc_config_t *fa); -static int H5FD__copy_plist(hid_t fapl_id, hid_t *id_out_ptr); static herr_t H5FD__ioc_close_int(H5FD_ioc_t *file_ptr); @@ -330,10 +335,9 @@ H5Pset_fapl_ioc(hid_t fapl_id, H5FD_ioc_config_t *vfd_config) if (NULL == (ioc_conf = H5FL_CALLOC(H5FD_ioc_config_t))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate IOC VFD configuration"); - ioc_conf->under_fapl_id = H5I_INVALID_HID; /* Get IOC VFD defaults */ - if (H5FD__ioc_get_default_config(fapl_id, ioc_conf) < 0) + if (H5FD__ioc_get_default_config(ioc_conf) < 0) H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't get default IOC VFD configuration"); vfd_config = ioc_conf; @@ -346,9 +350,6 @@ H5Pset_fapl_ioc(hid_t fapl_id, H5FD_ioc_config_t *vfd_config) done: if (ioc_conf) { - if (ioc_conf->under_fapl_id >= 0 && H5I_dec_ref(ioc_conf->under_fapl_id) < 0) - H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTDEC, FAIL, "can't close IOC under FAPL"); - ioc_conf->under_fapl_id = H5I_INVALID_HID; H5FL_FREE(H5FD_ioc_config_t, ioc_conf); } @@ -393,16 +394,12 @@ H5Pget_fapl_ioc(hid_t fapl_id, H5FD_ioc_config_t *config_out) } if (use_default_config) { - if (H5FD__ioc_get_default_config(fapl_id, config_out) < 0) + if (H5FD__ioc_get_default_config(config_out) < 0) H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get default IOC VFD configuration"); } else { /* 
Copy the IOC fapl data out */ HDmemcpy(config_out, config_ptr, sizeof(H5FD_ioc_config_t)); - - /* Copy the driver info value */ - if (H5FD__copy_plist(config_ptr->under_fapl_id, &(config_out->under_fapl_id)) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't copy IOC under FAPL"); } done: @@ -421,56 +418,18 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD__ioc_get_default_config(hid_t fapl_id, H5FD_ioc_config_t *config_out) +H5FD__ioc_get_default_config(H5FD_ioc_config_t *config_out) { - MPI_Comm comm = MPI_COMM_NULL; - MPI_Info info = MPI_INFO_NULL; - herr_t ret_value = SUCCEED; + herr_t ret_value = SUCCEED; HDassert(config_out); HDmemset(config_out, 0, sizeof(*config_out)); - config_out->magic = H5FD_IOC_FAPL_MAGIC; - config_out->version = H5FD_IOC_CURR_FAPL_VERSION; - config_out->under_fapl_id = H5I_INVALID_HID; - - /* - * Use default subfiling configuration. Do NOT call - * H5Pget_fapl_subfiling here as that can cause issues - */ - config_out->subf_config.ioc_selection = SELECT_IOC_ONE_PER_NODE; - config_out->subf_config.stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; - config_out->subf_config.stripe_count = 0; - - /* Create a default FAPL and choose an appropriate underlying driver */ - if ((config_out->under_fapl_id = H5Pcreate(H5P_FILE_ACCESS)) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTCREATE, FAIL, "can't create default FAPL"); - - /* Check if any MPI parameters were set on the FAPL */ - if (H5Pget_mpi_params(fapl_id, &comm, &info) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI Comm/Info"); - if (comm == MPI_COMM_NULL) - comm = MPI_COMM_WORLD; - - /* Hardwire MPI I/O VFD for now */ - if (H5Pset_fapl_mpio(config_out->under_fapl_id, comm, info) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI I/O VFD on IOC under FAPL"); - - /* Specific to this I/O Concentrator */ + config_out->magic = H5FD_IOC_FAPL_MAGIC; + 
config_out->version = H5FD_IOC_CURR_FAPL_VERSION; config_out->thread_pool_size = H5FD_IOC_DEFAULT_THREAD_POOL_SIZE; -done: - if (H5_mpi_comm_free(&comm) < 0) - H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTFREE, FAIL, "can't free MPI Communicator"); - if (H5_mpi_info_free(&info) < 0) - H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTFREE, FAIL, "can't free MPI Info object"); - - if (ret_value < 0) { - if (config_out->under_fapl_id >= 0 && H5Pclose(config_out->under_fapl_id) < 0) - H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTCLOSEOBJ, FAIL, "can't close FAPL"); - } - H5_SUBFILING_FUNC_LEAVE; } @@ -504,13 +463,6 @@ H5FD__ioc_validate_config(const H5FD_ioc_config_t *fa) if (fa->magic != H5FD_IOC_FAPL_MAGIC) H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid H5FD_ioc_config_t magic value"); - if (fa->under_fapl_id < 0) - H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid under FAPL ID"); - - if (fa->subf_config.ioc_selection < SELECT_IOC_ONE_PER_NODE || - fa->subf_config.ioc_selection >= ioc_selection_options) - H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid IOC selection method"); - done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_validate_config() */ @@ -518,31 +470,37 @@ done: /*------------------------------------------------------------------------- * Function: H5FD__ioc_sb_size * - * Purpose: Obtains the number of bytes required to store the driver file - * access data in the HDF5 superblock. + * Purpose: Obtains the number of bytes required to store the driver + * file access data in the HDF5 superblock. * * Return: Success: Number of bytes required. * * Failure: 0 if an error occurs or if the driver has no * data to store in the superblock. 
* - * NOTE: no public API for H5FD_sb_size, it needs to be added *------------------------------------------------------------------------- */ static hsize_t -H5FD__ioc_sb_size(H5FD_t *_file) +H5FD__ioc_sb_size(H5FD_t H5_ATTR_UNUSED *_file) { - H5FD_ioc_t *file = (H5FD_ioc_t *)_file; - hsize_t ret_value = 0; + hsize_t ret_value = 0; H5FD_IOC_LOG_CALL(__func__); - /* Sanity check */ - HDassert(file); - HDassert(file->ioc_file); + /* Configuration structure magic number */ + ret_value += sizeof(uint32_t); + + /* Configuration structure version number */ + ret_value += sizeof(uint32_t); + + /* IOC thread pool size */ + ret_value += sizeof(int32_t); + + /* Subfiling stripe size */ + ret_value += sizeof(int64_t); - if (file->ioc_file) - ret_value = H5FD_sb_size(file->ioc_file); + /* Subfiling stripe count (encoded as int64_t for future) */ + ret_value += sizeof(int64_t); H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_sb_size */ @@ -552,23 +510,42 @@ H5FD__ioc_sb_size(H5FD_t *_file) * * Purpose: Encode driver-specific data into the output arguments. 
* - * Return: SUCCEED/FAIL + * Return: Non-negative on success/Negative on failure *------------------------------------------------------------------------- */ static herr_t -H5FD__ioc_sb_encode(H5FD_t *_file, char *name /*out*/, unsigned char *buf /*out*/) +H5FD__ioc_sb_encode(H5FD_t *_file, char *name, unsigned char *buf) { - H5FD_ioc_t *file = (H5FD_ioc_t *)_file; - herr_t ret_value = SUCCEED; /* Return value */ + subfiling_context_t *sf_context = NULL; + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + uint8_t *p = (uint8_t *)buf; + int64_t tmp64; + herr_t ret_value = SUCCEED; H5FD_IOC_LOG_CALL(__func__); - /* Sanity check */ - HDassert(file); - HDassert(file->ioc_file); + if (NULL == (sf_context = H5_get_subfiling_object(file->context_id))) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling context object"); - if (file->ioc_file && H5FD_sb_encode(file->ioc_file, name, buf) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTENCODE, FAIL, "unable to encode the superblock in R/W file"); + /* Encode driver name */ + HDstrncpy(name, "IOC", 9); + name[8] = '\0'; + + /* Encode configuration structure magic number */ + UINT32ENCODE(p, file->fa.magic); + + /* Encode configuration structure version number */ + UINT32ENCODE(p, file->fa.version); + + /* Encode thread pool size field */ + INT32ENCODE(p, file->fa.thread_pool_size); + + /* Encode subfiling stripe size */ + INT64ENCODE(p, sf_context->sf_stripe_size); + + /* Encode subfiling stripe count (number of subfiles) */ + tmp64 = sf_context->sf_num_subfiles; + INT64ENCODE(p, tmp64); done: H5_SUBFILING_FUNC_LEAVE; @@ -579,25 +556,62 @@ done: * * Purpose: Decodes the driver information block. 
* - * Return: SUCCEED/FAIL - * - * NOTE: no public API for H5FD_sb_size, need to add + * Return: Non-negative on success/Negative on failure *------------------------------------------------------------------------- */ static herr_t H5FD__ioc_sb_decode(H5FD_t *_file, const char *name, const unsigned char *buf) { - H5FD_ioc_t *file = (H5FD_ioc_t *)_file; - herr_t ret_value = SUCCEED; /* Return value */ + subfiling_context_t *sf_context = NULL; + const uint8_t *p = (const uint8_t *)buf; + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + int64_t tmp64; + herr_t ret_value = SUCCEED; H5FD_IOC_LOG_CALL(__func__); - /* Sanity check */ - HDassert(file); - HDassert(file->ioc_file); + if (NULL == (sf_context = H5_get_subfiling_object(file->context_id))) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling context object"); + + if (HDstrncmp(name, "IOC", 9)) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid driver name in superblock"); + + /* Decode configuration structure magic number */ + UINT32DECODE(p, file->fa.magic); + + /* Decode configuration structure version number */ + UINT32DECODE(p, file->fa.version); + + /* Decode thread pool size field */ + INT32DECODE(p, file->fa.thread_pool_size); + + /* Decode subfiling stripe size */ + INT64DECODE(p, file->subf_config.stripe_size); - if (H5FD_sb_load(file->ioc_file, name, buf) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTDECODE, FAIL, "unable to decode the superblock in R/W file"); + /* Decode subfiling stripe count */ + INT64DECODE(p, tmp64); + H5_CHECK_OVERFLOW(tmp64, int64_t, int32_t); + file->subf_config.stripe_count = (int32_t)tmp64; + + /* Validate the decoded configuration */ + if (H5FD__ioc_validate_config(&file->fa) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "decoded IOC VFD configuration info is invalid"); + + if (H5_subfiling_validate_config(&file->subf_config) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "decoded subfiling configuration parameters 
are invalid"); + + if (file->subf_config.stripe_size != sf_context->sf_stripe_size) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "specified subfiling stripe size (%" PRId64 + ") doesn't match value stored in file (%" PRId64 ")", + sf_context->sf_stripe_size, file->subf_config.stripe_size); + + if (file->subf_config.stripe_count != sf_context->sf_num_subfiles) + H5_SUBFILING_GOTO_ERROR( + H5E_VFL, H5E_BADVALUE, FAIL, + "specified subfiling stripe count (%d) doesn't match value stored in file (%" PRId32 ")", + sf_context->sf_num_subfiles, file->subf_config.stripe_count); done: H5_SUBFILING_FUNC_LEAVE; @@ -629,40 +643,6 @@ H5FD__ioc_fapl_get(H5FD_t *_file) } /* end H5FD__ioc_fapl_get() */ /*------------------------------------------------------------------------- - * Function: H5FD__copy_plist - * - * Purpose: Sanity-wrapped H5P_copy_plist() for each channel. - * Utility function for operation in multiple locations. - * - * Return: 0 on success, -1 on error. - *------------------------------------------------------------------------- - */ -static int -H5FD__copy_plist(hid_t fapl_id, hid_t *id_out_ptr) -{ - int ret_value = 0; - H5P_genplist_t *plist_ptr = NULL; - - H5FD_IOC_LOG_CALL(__func__); - - HDassert(id_out_ptr != NULL); - - if (FALSE == H5P_isa_class(fapl_id, H5P_FILE_ACCESS)) - H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, -1, "not a file access property list"); - - plist_ptr = (H5P_genplist_t *)H5I_object(fapl_id); - if (NULL == plist_ptr) - H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, -1, "unable to get property list"); - - *id_out_ptr = H5P_copy_plist(plist_ptr, FALSE); - if (H5I_INVALID_HID == *id_out_ptr) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADTYPE, -1, "unable to copy file access property list"); - -done: - H5_SUBFILING_FUNC_LEAVE; -} /* end H5FD__copy_plist() */ - -/*------------------------------------------------------------------------- * Function: H5FD__ioc_fapl_copy * * Purpose: Copies the file access properties. 
@@ -688,10 +668,6 @@ H5FD__ioc_fapl_copy(const void *_old_fa) HDmemcpy(new_fa_ptr, old_fa_ptr, sizeof(H5FD_ioc_config_t)); - /* Copy the FAPL */ - if (H5FD__copy_plist(old_fa_ptr->under_fapl_id, &(new_fa_ptr->under_fapl_id)) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "can't copy the IOC under FAPL"); - ret_value = (void *)new_fa_ptr; done: @@ -721,14 +697,9 @@ H5FD__ioc_fapl_free(void *_fapl) /* Check arguments */ HDassert(fapl); - if (fapl->under_fapl_id >= 0 && H5I_dec_ref(fapl->under_fapl_id) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTDEC, FAIL, "can't close IOC under FAPL ID"); - fapl->under_fapl_id = H5I_INVALID_HID; - /* Free the property list */ fapl = H5FL_FREE(H5FD_ioc_config_t, fapl); -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_fapl_free() */ @@ -748,10 +719,10 @@ H5FD__ioc_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr) { H5FD_ioc_t *file_ptr = NULL; /* Ioc VFD info */ const H5FD_ioc_config_t *config_ptr = NULL; /* Driver-specific property list */ + subfiling_context_t *sf_context = NULL; H5FD_ioc_config_t default_config; - H5FD_class_t *driver = NULL; /* VFD for file */ H5P_genplist_t *plist_ptr = NULL; - H5FD_driver_prop_t driver_prop; /* Property for driver ID & info */ + int ioc_flags; int mpi_inited = 0; int mpi_code; /* MPI return code */ H5FD_t *ret_value = NULL; @@ -768,10 +739,15 @@ H5FD__ioc_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr) if (NULL == (file_ptr = (H5FD_ioc_t *)H5FL_CALLOC(H5FD_ioc_t))) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTALLOC, NULL, "unable to allocate file struct"); - file_ptr->comm = MPI_COMM_NULL; - file_ptr->info = MPI_INFO_NULL; - file_ptr->context_id = -1; - file_ptr->fa.under_fapl_id = H5I_INVALID_HID; + file_ptr->comm = MPI_COMM_NULL; + file_ptr->info = MPI_INFO_NULL; + file_ptr->file_id = UINT64_MAX; + file_ptr->context_id = -1; + + /* Initialize file pointer's subfiling parameters */ + file_ptr->subf_config.ioc_selection = 
SELECT_IOC_ONE_PER_NODE; + file_ptr->subf_config.stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; + file_ptr->subf_config.stripe_count = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT; /* Get the driver-specific file access properties */ if (NULL == (plist_ptr = (H5P_genplist_t *)H5I_object(fapl_id))) @@ -808,7 +784,7 @@ H5FD__ioc_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr) config_ptr = H5P_peek_driver_info(plist_ptr); if (!config_ptr || (H5P_FILE_ACCESS_DEFAULT == fapl_id)) { - if (H5FD__ioc_get_default_config(fapl_id, &default_config) < 0) + if (H5FD__ioc_get_default_config(&default_config) < 0) H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get default IOC VFD configuration"); config_ptr = &default_config; } @@ -816,117 +792,87 @@ H5FD__ioc_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr) /* Fill in the file config values */ HDmemcpy(&file_ptr->fa, config_ptr, sizeof(H5FD_ioc_config_t)); - /* Copy the ioc FAPL. */ - if (H5FD__copy_plist(config_ptr->under_fapl_id, &(file_ptr->fa.under_fapl_id)) < 0) { - file_ptr->fa.under_fapl_id = H5I_INVALID_HID; - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "can't copy IOC under FAPL"); - } - - if (NULL != (file_ptr->file_path = HDrealpath(name, NULL))) { - if (H5_dirname(file_ptr->file_path, &file_ptr->file_dir) < 0) { - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "couldn't get subfile dirname"); - } - } - else { - if (ENOENT == errno) { - if (NULL == (file_ptr->file_path = HDstrdup(name))) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCOPY, NULL, "can't copy file name"); - if (NULL == (file_ptr->file_dir = H5MM_strdup("."))) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "can't set subfile directory path"); - } - else - H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't resolve subfile path"); - } + /* Fully resolve the given filepath and get its dirname */ + if (H5_resolve_pathname(name, file_ptr->comm, &file_ptr->file_path) < 
0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't resolve filepath"); + if (H5_dirname(file_ptr->file_path, &file_ptr->file_dir) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get filepath dirname"); + + /* Translate the HDF5 file open flags into standard POSIX open flags */ + ioc_flags = (H5F_ACC_RDWR & flags) ? O_RDWR : O_RDONLY; + if (H5F_ACC_TRUNC & flags) + ioc_flags |= O_TRUNC; + if (H5F_ACC_CREAT & flags) + ioc_flags |= O_CREAT; + if (H5F_ACC_EXCL & flags) + ioc_flags |= O_EXCL; - /* Check the underlying driver (sec2/mpio/etc.) */ - if (NULL == (plist_ptr = (H5P_genplist_t *)H5I_object(file_ptr->fa.under_fapl_id))) + if (NULL == (plist_ptr = (H5P_genplist_t *)H5I_object(fapl_id))) H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list"); - if (H5P_peek(plist_ptr, H5F_ACS_FILE_DRV_NAME, &driver_prop) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get driver ID & info"); - if (NULL == (driver = (H5FD_class_t *)H5I_object(driver_prop.driver_id))) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, - "invalid driver ID in file access property list"); + /* Retrieve the subfiling configuration for the current file */ + if (H5_subfiling_get_config_prop(plist_ptr, &file_ptr->subf_config) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get subfiling configuration from FAPL"); + if (H5_subfiling_validate_config(&file_ptr->subf_config) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_BADVALUE, NULL, "invalid subfiling configuration"); - if (driver->value != H5_VFD_MPIO) { - H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, - "unable to open file '%s' - only MPI I/O VFD is currently supported", name); - } - else { - subfiling_context_t *sf_context = NULL; - void *file_handle = NULL; - int ioc_flags; - int l_error = 0; - int g_error = 0; - - /* Translate the HDF5 file open flags into standard POSIX open flags */ - ioc_flags = (H5F_ACC_RDWR & flags) ? 
O_RDWR : O_RDONLY; - if (H5F_ACC_TRUNC & flags) - ioc_flags |= O_TRUNC; - if (H5F_ACC_CREAT & flags) - ioc_flags |= O_CREAT; - if (H5F_ACC_EXCL & flags) - ioc_flags |= O_EXCL; - - file_ptr->ioc_file = H5FD_open(file_ptr->file_path, flags, file_ptr->fa.under_fapl_id, HADDR_UNDEF); - if (file_ptr->ioc_file) { - if (H5FDget_vfd_handle(file_ptr->ioc_file, file_ptr->fa.under_fapl_id, &file_handle) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get file handle"); - } - else { - l_error = 1; - } - - /* Check if any ranks had an issue opening the file */ - if (MPI_SUCCESS != - (mpi_code = MPI_Allreduce(&l_error, &g_error, 1, MPI_INT, MPI_SUM, file_ptr->comm))) - H5_SUBFILING_MPI_GOTO_ERROR(NULL, "MPI_Allreduce failed", mpi_code); - if (g_error) - H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, - "one or more MPI ranks were unable to open file '%s'", name); + /* Retrieve the HDF5 stub file ID for the current file */ + if (H5_subfiling_get_file_id_prop(plist_ptr, &file_ptr->file_id) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get stub file ID from FAPL"); - /* - * Open the subfiles for this HDF5 file. A subfiling - * context ID will be returned, which is used for - * further interactions with this file's subfiles. - */ - if (H5_open_subfiles(file_ptr->file_path, file_handle, &file_ptr->fa.subf_config, ioc_flags, - file_ptr->comm, &file_ptr->context_id) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open subfiles for file '%s'", - name); - - /* Initialize I/O concentrator threads if this MPI rank is an I/O concentrator */ - sf_context = H5_get_subfiling_object(file_ptr->context_id); - if (sf_context && sf_context->topology->rank_is_ioc) { - if (initialize_ioc_threads(sf_context) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTINIT, NULL, - "unable to initialize I/O concentrator threads"); - } + /* + * Open the subfiles for this HDF5 file. 
A subfiling + * context ID will be returned, which is used for + * further interactions with this file's subfiles. + */ + if (H5_open_subfiles(file_ptr->file_path, file_ptr->file_id, &file_ptr->subf_config, ioc_flags, + file_ptr->comm, &file_ptr->context_id) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open subfiles for file '%s'", + name); + + /* Initialize I/O concentrator threads if this MPI rank is an I/O concentrator */ + sf_context = H5_get_subfiling_object(file_ptr->context_id); + if (sf_context && sf_context->topology->rank_is_ioc) { + if (initialize_ioc_threads(sf_context) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTINIT, NULL, + "unable to initialize I/O concentrator threads"); } ret_value = (H5FD_t *)file_ptr; done: - /* run a barrier just before exit. The objective is to - * ensure that the IOCs are fully up and running before - * we proceed. Note that this barrier is not sufficient - * by itself -- we also need code in initialize_ioc_threads() - * to wait until the main IOC thread has finished its - * initialization. + /* + * Check if any ranks failed before exit. The objective + * here is twofold: + * + * - prevent possible hangs caused by ranks sending + * messages to I/O concentrators that failed and + * didn't spin up + * - use the barrier semantics of MPI_Allreduce to + * ensure that the I/O concentrators are fully up + * and running before proceeding. 
*/ if (mpi_inited) { - MPI_Comm barrier_comm = MPI_COMM_WORLD; + MPI_Comm reduce_comm = MPI_COMM_WORLD; + int mpi_size = -1; + int err_result = (ret_value == NULL); if (file_ptr && (file_ptr->comm != MPI_COMM_NULL)) - barrier_comm = file_ptr->comm; + reduce_comm = file_ptr->comm; - if (MPI_SUCCESS != (mpi_code = MPI_Barrier(barrier_comm))) - H5_SUBFILING_MPI_DONE_ERROR(NULL, "MPI_Barrier failed", mpi_code); - } + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(reduce_comm, &mpi_size))) + H5_SUBFILING_MPI_DONE_ERROR(NULL, "MPI_Comm_size failed", mpi_code); - if (config_ptr == &default_config) - if (H5I_dec_ref(config_ptr->under_fapl_id) < 0) - H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTCLOSEOBJ, NULL, "can't close IOC under FAPL"); + if (mpi_size > 1) { + if (MPI_SUCCESS != + (mpi_code = MPI_Allreduce(MPI_IN_PLACE, &err_result, 1, MPI_INT, MPI_MAX, reduce_comm))) + H5_SUBFILING_MPI_DONE_ERROR(NULL, "MPI_Allreduce failed", mpi_code); + } + + if (err_result) + H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, + "one or more MPI ranks were unable to open file '%s'", name); + } if (NULL == ret_value) { if (file_ptr) { @@ -945,39 +891,14 @@ H5FD__ioc_close_int(H5FD_ioc_t *file_ptr) HDassert(file_ptr); -#ifdef H5FD_IOC_DEBUG - { - subfiling_context_t *sf_context = H5_get_subfiling_object(file_ptr->context_id); - if (sf_context) { - if (sf_context->topology->rank_is_ioc) - HDprintf("[%s %d] fd=%d\n", __func__, file_ptr->mpi_rank, sf_context->sf_fid); - else - HDprintf("[%s %d] fd=*\n", __func__, file_ptr->mpi_rank); - } - else - HDprintf("[%s %d] invalid subfiling context", __func__, file_ptr->mpi_rank); - HDfflush(stdout); - } -#endif - - if (file_ptr->fa.under_fapl_id >= 0 && H5I_dec_ref(file_ptr->fa.under_fapl_id) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_ARGS, FAIL, "can't close IOC under FAPL"); - file_ptr->fa.under_fapl_id = H5I_INVALID_HID; - - /* Close underlying file */ - if (file_ptr->ioc_file) { - if (H5FD_close(file_ptr->ioc_file) < 0) - 
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCLOSEFILE, FAIL, "unable to close HDF5 file"); - file_ptr->ioc_file = NULL; - } - if (file_ptr->context_id >= 0) { subfiling_context_t *sf_context = H5_get_subfiling_object(file_ptr->context_id); int mpi_code; /* Don't allow IOC threads to be finalized until everyone gets here */ - if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_ptr->comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + if (file_ptr->mpi_size > 1) + if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_ptr->comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); if (sf_context && sf_context->topology->rank_is_ioc) { if (finalize_ioc_threads(sf_context) < 0) @@ -985,7 +906,7 @@ H5FD__ioc_close_int(H5FD_ioc_t *file_ptr) H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTCLOSEFILE, FAIL, "unable to finalize IOC threads"); } - if (H5_close_subfiles(file_ptr->context_id) < 0) + if (H5_close_subfiles(file_ptr->context_id, file_ptr->comm) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCLOSEFILE, FAIL, "unable to close subfiling file(s)"); file_ptr->context_id = -1; } @@ -1053,31 +974,8 @@ H5FD__ioc_cmp(const H5FD_t *_f1, const H5FD_t *_f2) HDassert(f1); HDassert(f2); - if (f1->ioc_file && f1->ioc_file->cls && f1->ioc_file->cls->cmp && f2->ioc_file && f2->ioc_file->cls && - f2->ioc_file->cls->cmp) { - ret_value = H5FD_cmp(f1->ioc_file, f2->ioc_file); - } - else { - h5_stat_t st1; - h5_stat_t st2; - - /* - * If under VFD has no compare routine, get - * inode of HDF5 stub file and compare them - * - * Note that the compare callback doesn't - * allow for failure, so we just return -1 - * if stat fails. 
- */ - if (HDstat(f1->file_path, &st1) < 0) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, -1, "couldn't stat file"); - if (HDstat(f2->file_path, &st2) < 0) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, -1, "couldn't stat file"); - - ret_value = (st1.st_ino > st2.st_ino) - (st1.st_ino < st2.st_ino); - } + ret_value = (f1->file_id > f2->file_id) - (f1->file_id < f2->file_id); -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_cmp */ @@ -1091,30 +989,20 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD__ioc_query(const H5FD_t *_file, unsigned long *flags /* out */) +H5FD__ioc_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out */) { - const H5FD_ioc_t *file_ptr = (const H5FD_ioc_t *)_file; - herr_t ret_value = SUCCEED; + herr_t ret_value = SUCCEED; H5FD_IOC_LOG_CALL(__func__); - if (file_ptr == NULL) { - if (flags) - *flags = 0; - } - else if (file_ptr->ioc_file) { - if (H5FDquery(file_ptr->ioc_file, flags) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTLOCK, FAIL, "unable to query R/W file"); - } - else { - /* There is no file. Because this is a pure passthrough VFD, - * it has no features of its own. 
- */ - if (flags) - *flags = 0; + /* Set the VFL feature flags that this driver supports */ + if (flags) { + *flags = 0; + *flags |= H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */ + *flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */ + *flags |= H5FD_FEAT_HAS_MPI; /* This driver uses MPI */ } -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_query() */ @@ -1127,22 +1015,14 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD__ioc_get_type_map(const H5FD_t *_file, H5FD_mem_t *type_map) +H5FD__ioc_get_type_map(const H5FD_t H5_ATTR_UNUSED *_file, H5FD_mem_t H5_ATTR_UNUSED *type_map) { - const H5FD_ioc_t *file = (const H5FD_ioc_t *)_file; - herr_t ret_value = SUCCEED; + herr_t ret_value = SUCCEED; H5FD_IOC_LOG_CALL(__func__); - /* Check arguments */ - HDassert(file); - HDassert(file->ioc_file); - - /* Retrieve memory type mapping for R/W channel only */ - if (H5FD_get_fs_type_map(file->ioc_file, type_map) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "unable to allocate for R/W file"); + /* TODO: placeholder for now */ -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_get_type_map() */ @@ -1155,23 +1035,15 @@ done: *------------------------------------------------------------------------- */ static haddr_t -H5FD__ioc_alloc(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, hsize_t size) +H5FD__ioc_alloc(H5FD_t H5_ATTR_UNUSED *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNUSED dxpl_id, + hsize_t H5_ATTR_UNUSED size) { - H5FD_ioc_t *file = (H5FD_ioc_t *)_file; /* VFD file struct */ - haddr_t ret_value = HADDR_UNDEF; /* Return value */ + haddr_t ret_value = HADDR_UNDEF; /* Return value */ H5FD_IOC_LOG_CALL(__func__); - /* Check arguments */ - HDassert(file); - HDassert(file->ioc_file); - - /* Allocate memory for each file, only return the return value for R/W file. 
- */ - if ((ret_value = H5FDalloc(file->ioc_file, type, dxpl_id, size)) == HADDR_UNDEF) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, HADDR_UNDEF, "unable to allocate for R/W file"); + /* TODO: placeholder for now */ -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_alloc() */ @@ -1184,21 +1056,15 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD__ioc_free(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, hsize_t size) +H5FD__ioc_free(H5FD_t H5_ATTR_UNUSED *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNUSED dxpl_id, + haddr_t H5_ATTR_UNUSED addr, hsize_t H5_ATTR_UNUSED size) { - H5FD_ioc_t *file = (H5FD_ioc_t *)_file; /* VFD file struct */ - herr_t ret_value = SUCCEED; /* Return value */ + herr_t ret_value = SUCCEED; /* Return value */ H5FD_IOC_LOG_CALL(__func__); - /* Check arguments */ - HDassert(file); - HDassert(file->ioc_file); - - if (H5FDfree(file->ioc_file, type, dxpl_id, addr, size) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free for R/W file"); + /* TODO: placeholder for now */ -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_free() */ @@ -1224,12 +1090,9 @@ H5FD__ioc_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type) /* Sanity check */ HDassert(file); - HDassert(file->ioc_file); - if ((ret_value = H5FD_get_eoa(file->ioc_file, type)) == HADDR_UNDEF) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, HADDR_UNDEF, "unable to get eoa"); + ret_value = file->eoa; -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_get_eoa */ @@ -1253,13 +1116,9 @@ H5FD__ioc_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t addr) /* Sanity check */ HDassert(file); - HDassert(file->ioc_file); - HDassert(file->ioc_file); - if (H5FD_set_eoa(file->ioc_file, type, addr) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTSET, FAIL, "H5FDset_eoa failed for R/W file"); + file->eoa = addr; -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_set_eoa() */ @@ 
-1286,16 +1145,14 @@ H5FD__ioc_get_eof(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type) /* Sanity check */ HDassert(file); - HDassert(file->ioc_file); sf_context = H5_get_subfiling_object(file->context_id); if (sf_context) { ret_value = sf_context->sf_eof; goto done; } - - if (HADDR_UNDEF == (ret_value = H5FD_get_eof(file->ioc_file, type))) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, HADDR_UNDEF, "unable to get eof"); + else + ret_value = file->eof; done: H5_SUBFILING_FUNC_LEAVE; @@ -1311,22 +1168,15 @@ done: *-------------------------------------------------------------------------- */ static herr_t -H5FD__ioc_get_handle(H5FD_t *_file, hid_t H5_ATTR_UNUSED fapl, void **file_handle) +H5FD__ioc_get_handle(H5FD_t H5_ATTR_UNUSED *_file, hid_t H5_ATTR_UNUSED fapl, + void H5_ATTR_UNUSED **file_handle) { - H5FD_ioc_t *file = (H5FD_ioc_t *)_file; - herr_t ret_value = SUCCEED; /* Return value */ + herr_t ret_value = SUCCEED; H5FD_IOC_LOG_CALL(__func__); - /* Check arguments */ - HDassert(file); - HDassert(file->ioc_file); - HDassert(file_handle); + /* TODO: placeholder for now */ - if (H5FD_get_vfd_handle(file->ioc_file, file->fa.under_fapl_id, file_handle) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "unable to get handle of R/W file"); - -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_get_handle */ @@ -1362,9 +1212,7 @@ H5FD__ioc_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNUS if (REGION_OVERFLOW(addr, size)) H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow, addr = %" PRIuHADDR, addr); - /* Public API for dxpl "context" */ - if (H5FDread(file->ioc_file, type, dxpl_id, addr, size, buf) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "Reading from R/W channel failed"); + ret_value = H5FD__ioc_read_vector_internal(_file, 1, &addr, &size, &buf); done: H5_SUBFILING_FUNC_LEAVE; @@ -1381,19 +1229,15 @@ done: *------------------------------------------------------------------------- */ static herr_t 
-H5FD__ioc_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, const void *buf) +H5FD__ioc_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size, + const void *buf) { - H5P_genplist_t *plist_ptr = NULL; - herr_t ret_value = SUCCEED; - - if (NULL == (plist_ptr = (H5P_genplist_t *)H5I_object(dxpl_id))) - H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a property list"); + herr_t ret_value = SUCCEED; addr += _file->base_addr; ret_value = H5FD__ioc_write_vector_internal(_file, 1, &type, &addr, &size, &buf); -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_write() */ @@ -1492,17 +1336,14 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD__ioc_flush(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t closing) +H5FD__ioc_flush(H5FD_t H5_ATTR_UNUSED *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing) { - H5FD_ioc_t *file = (H5FD_ioc_t *)_file; - herr_t ret_value = SUCCEED; /* Return value */ + herr_t ret_value = SUCCEED; H5FD_IOC_LOG_CALL(__func__); - if (H5FDflush(file->ioc_file, dxpl_id, closing) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFLUSH, FAIL, "unable to flush R/W file"); + /* TODO: placeholder for now */ -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_flush() */ @@ -1515,21 +1356,20 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD__ioc_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing) +H5FD__ioc_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing) { H5FD_ioc_t *file = (H5FD_ioc_t *)_file; - herr_t ret_value = SUCCEED; /* Return value */ + herr_t ret_value = SUCCEED; H5FD_IOC_LOG_CALL(__func__); HDassert(file); - HDassert(file->ioc_file); - HDassert(file->ioc_file); - if (H5FDtruncate(file->ioc_file, dxpl_id, closing) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "unable to truncate R/W 
file"); + /* TODO: placeholder for now since Subfiling does the truncation */ + if (!H5F_addr_eq(file->eoa, file->last_eoa)) { + file->last_eoa = file->eoa; + } -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_truncate */ @@ -1542,20 +1382,14 @@ done: *-------------------------------------------------------------------------- */ static herr_t -H5FD__ioc_lock(H5FD_t *_file, hbool_t rw) +H5FD__ioc_lock(H5FD_t H5_ATTR_UNUSED *_file, hbool_t H5_ATTR_UNUSED rw) { - H5FD_ioc_t *file = (H5FD_ioc_t *)_file; /* VFD file struct */ - herr_t ret_value = SUCCEED; /* Return value */ + herr_t ret_value = SUCCEED; H5FD_IOC_LOG_CALL(__func__); - HDassert(file); - HDassert(file->ioc_file); - - if (H5FD_lock(file->ioc_file, rw) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTLOCKFILE, FAIL, "unable to lock file"); + /* TODO: placeholder for now */ -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_lock */ @@ -1568,21 +1402,14 @@ done: *-------------------------------------------------------------------------- */ static herr_t -H5FD__ioc_unlock(H5FD_t *_file) +H5FD__ioc_unlock(H5FD_t H5_ATTR_UNUSED *_file) { - H5FD_ioc_t *file = (H5FD_ioc_t *)_file; /* VFD file struct */ - herr_t ret_value = SUCCEED; /* Return value */ + herr_t ret_value = SUCCEED; H5FD_IOC_LOG_CALL(__func__); - /* Check arguments */ - HDassert(file); - HDassert(file->ioc_file); + /* TODO: placeholder for now */ - if (H5FD_unlock(file->ioc_file) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUNLOCKFILE, FAIL, "unable to unlock file"); - -done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__ioc_unlock */ @@ -1626,8 +1453,9 @@ H5FD__ioc_del(const char *name, hid_t fapl) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); if (mpi_rank == 0) { - int n_io_concentrators = 0; - int num_digits = 0; + int64_t read_n_subfiles = 0; + int32_t n_subfiles = 0; + int num_digits = 0; if (HDstat(name, &st) < 0) H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SYSERRSTR, FAIL, "HDstat failed"); @@ -1643,7 +1471,7 @@ 
H5FD__ioc_del(const char *name, hid_t fapl) "can't allocate config file name buffer"); /* TODO: No support for subfile directory prefix currently */ - HDsnprintf(tmp_filename, PATH_MAX, "%s/%s" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, file_dirname, + HDsnprintf(tmp_filename, PATH_MAX, "%s/" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, file_dirname, base_filename, (uint64_t)st.st_ino); if (NULL == (config_file = HDfopen(tmp_filename, "r"))) { @@ -1659,9 +1487,12 @@ H5FD__ioc_del(const char *name, hid_t fapl) "can't open subfiling config file"); } - if (H5_get_num_iocs_from_config_file(config_file, &n_io_concentrators) < 0) + if (H5_get_subfiling_config_from_file(config_file, NULL, &read_n_subfiles) < 0) H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_READERROR, FAIL, "can't read subfiling config file"); + H5_CHECK_OVERFLOW(read_n_subfiles, int64_t, int32_t); + n_subfiles = (int32_t)read_n_subfiles; + /* Delete the Subfiling configuration file */ if (EOF == HDfclose(config_file)) { config_file = NULL; @@ -1676,12 +1507,12 @@ H5FD__ioc_del(const char *name, hid_t fapl) "can't delete subfiling config file"); /* Try to delete each of the subfiles */ - num_digits = (int)(HDlog10(n_io_concentrators) + 1); + num_digits = (int)(HDlog10(n_subfiles) + 1); - for (int i = 0; i < n_io_concentrators; i++) { + for (int i = 0; i < n_subfiles; i++) { /* TODO: No support for subfile directory prefix currently */ - HDsnprintf(tmp_filename, PATH_MAX, "%s/%s" H5FD_SUBFILING_FILENAME_TEMPLATE, file_dirname, - base_filename, (uint64_t)st.st_ino, num_digits, i + 1, n_io_concentrators); + HDsnprintf(tmp_filename, PATH_MAX, "%s/" H5FD_SUBFILING_FILENAME_TEMPLATE, file_dirname, + base_filename, (uint64_t)st.st_ino, num_digits, i + 1, n_subfiles); if (HDremove(tmp_filename) < 0) { #ifdef H5FD_IOC_DEBUG @@ -1704,8 +1535,16 @@ done: H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "can't close subfiling config file"); /* Set up a barrier (don't want processes to run ahead of the delete) */ - if 
(MPI_SUCCESS != (mpi_code = MPI_Barrier(comm))) - H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + if (comm != MPI_COMM_NULL) { + int comm_size = -1; + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &comm_size))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + + if (comm_size > 1) + if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + } /* Free duplicated MPI Communicator and Info objects */ if (H5_mpi_comm_free(&comm) < 0) @@ -1737,19 +1576,15 @@ done: *-------------------------------------------------------------------------- */ static herr_t -H5FD__ioc_write_vector_internal(H5FD_t *_file, uint32_t count, H5FD_mem_t types[], haddr_t addrs[], - size_t sizes[], const void *bufs[] /* in */) +H5FD__ioc_write_vector_internal(H5FD_t *_file, uint32_t count, H5FD_mem_t H5_ATTR_UNUSED types[], + haddr_t addrs[], size_t sizes[], const void *bufs[] /* in */) { subfiling_context_t *sf_context = NULL; - MPI_Request *active_reqs = NULL; + MPI_Request *mpi_reqs = NULL; H5FD_ioc_t *file_ptr = (H5FD_ioc_t *)_file; - io_req_t **sf_async_reqs = NULL; + io_req_t **sf_io_reqs = NULL; int64_t sf_context_id = -1; herr_t ret_value = SUCCEED; - struct __mpi_req { - int n_reqs; - MPI_Request *active_reqs; - } *mpi_reqs = NULL; HDassert(_file); HDassert(addrs); @@ -1764,22 +1599,20 @@ H5FD__ioc_write_vector_internal(H5FD_t *_file, uint32_t count, H5FD_mem_t types[ if (NULL == (sf_context = H5_get_subfiling_object(sf_context_id))) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "can't get subfiling context from ID"); HDassert(sf_context->topology); - HDassert(sf_context->topology->n_io_concentrators); - - if (NULL == (active_reqs = HDcalloc((size_t)(count + 2), sizeof(struct __mpi_req)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "can't allocate active I/O requests array"); - - if (NULL == (sf_async_reqs = HDcalloc((size_t)count, 
sizeof(*sf_async_reqs)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate I/O request array"); /* - * Note: We allocated extra space in the active_requests (above). - * The extra should be enough for an integer plus a pointer. + * Allocate an array of I/O requests and an array twice that size for + * MPI_Request objects. Each write I/O request has an MPI_Request + * object for the I/O data transfer and an MPI_Request object that, + * when waited on until completion, signifies that the actual I/O + * call (currently, HDpwrite) has completed. This is needed for ensuring + * that blocking write calls do not return early before the data is + * actually written. */ - mpi_reqs = (struct __mpi_req *)&active_reqs[count]; - mpi_reqs->n_reqs = (int)count; - mpi_reqs->active_reqs = active_reqs; + if (NULL == (sf_io_reqs = HDcalloc((size_t)count, sizeof(*sf_io_reqs)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate I/O request array"); + if (NULL == (mpi_reqs = HDmalloc(2 * (size_t)count * sizeof(*mpi_reqs)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate MPI request array"); /* Each pass thru the following should queue an MPI write * to a new IOC. 
Both the IOC selection and offset within the @@ -1794,47 +1627,30 @@ H5FD__ioc_write_vector_internal(H5FD_t *_file, uint32_t count, H5FD_mem_t types[ H5_CHECK_OVERFLOW(addrs[i], haddr_t, int64_t); H5_CHECK_OVERFLOW(sizes[i], size_t, int64_t); - write_status = - ioc__write_independent_async(sf_context_id, sf_context->topology->n_io_concentrators, - (int64_t)addrs[i], (int64_t)sizes[i], bufs[i], &sf_async_reqs[i]); + write_status = ioc__write_independent_async(sf_context_id, (int64_t)addrs[i], (int64_t)sizes[i], + bufs[i], &sf_io_reqs[i]); if (write_status < 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "couldn't queue write operation"); - mpi_reqs->active_reqs[i] = sf_async_reqs[i]->completion_func.io_args.io_req; - } - - /* - * Mirror superblock writes to the stub file so that - * legacy HDF5 applications can check what type of - * file they are reading - */ - for (size_t i = 0; i < (size_t)count; i++) { - if (types[i] == H5FD_MEM_SUPER) { - if (H5FDwrite(file_ptr->ioc_file, H5FD_MEM_SUPER, H5P_DEFAULT, addrs[i], sizes[i], bufs[i]) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, - "couldn't write superblock information to stub file"); - } + mpi_reqs[(2 * i)] = sf_io_reqs[i]->io_transfer_req; + mpi_reqs[(2 * i) + 1] = sf_io_reqs[i]->io_comp_req; } /* Here, we should have queued 'count' async requests. * We can can now try to complete those before returning * to the caller for the next set of IO operations. 
*/ - if (sf_async_reqs[0]->completion_func.io_function) - ret_value = (*sf_async_reqs[0]->completion_func.io_function)(mpi_reqs); + if (ioc__async_completion(mpi_reqs, 2 * (size_t)count) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "can't complete I/O requests"); done: - if (active_reqs) - HDfree(active_reqs); + HDfree(mpi_reqs); - if (sf_async_reqs) { - for (size_t i = 0; i < (size_t)count; i++) { - if (sf_async_reqs[i]) { - HDfree(sf_async_reqs[i]); - } - } - HDfree(sf_async_reqs); + if (sf_io_reqs) { + for (size_t i = 0; i < count; i++) + HDfree(sf_io_reqs[i]); + HDfree(sf_io_reqs); } H5_SUBFILING_FUNC_LEAVE; @@ -1845,15 +1661,11 @@ H5FD__ioc_read_vector_internal(H5FD_t *_file, uint32_t count, haddr_t addrs[], s void *bufs[] /* out */) { subfiling_context_t *sf_context = NULL; - MPI_Request *active_reqs = NULL; + MPI_Request *mpi_reqs = NULL; H5FD_ioc_t *file_ptr = (H5FD_ioc_t *)_file; - io_req_t **sf_async_reqs = NULL; + io_req_t **sf_io_reqs = NULL; int64_t sf_context_id = -1; herr_t ret_value = SUCCEED; - struct __mpi_req { - int n_reqs; - MPI_Request *active_reqs; - } *mpi_reqs = NULL; HDassert(_file); HDassert(addrs); @@ -1868,36 +1680,31 @@ H5FD__ioc_read_vector_internal(H5FD_t *_file, uint32_t count, haddr_t addrs[], s if (NULL == (sf_context = H5_get_subfiling_object(sf_context_id))) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "can't get subfiling context from ID"); HDassert(sf_context->topology); - HDassert(sf_context->topology->n_io_concentrators); - - if (NULL == (active_reqs = HDcalloc((size_t)(count + 2), sizeof(struct __mpi_req)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "can't allocate active I/O requests array"); - - if (NULL == (sf_async_reqs = HDcalloc((size_t)count, sizeof(*sf_async_reqs)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate I/O request array"); /* - * Note: We allocated extra space in the active_requests (above). 
- * The extra should be enough for an integer plus a pointer. + * Allocate an array of I/O requests and an array for MPI_Request + * objects. Each read I/O request has an MPI_Request object for the + * I/O data transfer that, when waited on until completion, signifies + * that the actual I/O call (currently, HDpread) has completed and + * the data read from the file has been transferred to the caller. */ - mpi_reqs = (struct __mpi_req *)&active_reqs[count]; - mpi_reqs->n_reqs = (int)count; - mpi_reqs->active_reqs = active_reqs; + if (NULL == (sf_io_reqs = HDcalloc((size_t)count, sizeof(*sf_io_reqs)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate I/O request array"); + if (NULL == (mpi_reqs = HDmalloc((size_t)count * sizeof(*mpi_reqs)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate MPI request array"); for (size_t i = 0; i < (size_t)count; i++) { int read_status; H5_CHECK_OVERFLOW(addrs[i], haddr_t, int64_t); H5_CHECK_OVERFLOW(sizes[i], size_t, int64_t); - read_status = - ioc__read_independent_async(sf_context_id, sf_context->topology->n_io_concentrators, - (int64_t)addrs[i], (int64_t)sizes[i], bufs[i], &sf_async_reqs[i]); + read_status = ioc__read_independent_async(sf_context_id, (int64_t)addrs[i], (int64_t)sizes[i], + bufs[i], &sf_io_reqs[i]); if (read_status < 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't queue read operation"); - mpi_reqs->active_reqs[i] = sf_async_reqs[i]->completion_func.io_args.io_req; + mpi_reqs[i] = sf_io_reqs[i]->io_transfer_req; } /* Here, we should have queued 'count' async requests @@ -1906,20 +1713,16 @@ H5FD__ioc_read_vector_internal(H5FD_t *_file, uint32_t count, haddr_t addrs[], s * We can can now try to complete those before returning * to the caller for the next set of IO operations. 
*/ - if (sf_async_reqs[0]->completion_func.io_function) - ret_value = (*sf_async_reqs[0]->completion_func.io_function)(mpi_reqs); + if (ioc__async_completion(mpi_reqs, (size_t)count) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "can't complete I/O requests"); done: - if (active_reqs) - HDfree(active_reqs); + HDfree(mpi_reqs); - if (sf_async_reqs) { - for (size_t i = 0; i < count; i++) { - if (sf_async_reqs[i]) { - HDfree(sf_async_reqs[i]); - } - } - HDfree(sf_async_reqs); + if (sf_io_reqs) { + for (size_t i = 0; i < count; i++) + HDfree(sf_io_reqs[i]); + HDfree(sf_io_reqs); } H5_SUBFILING_FUNC_LEAVE; diff --git a/src/H5FDsubfiling/H5FDioc.h b/src/H5FDsubfiling/H5FDioc.h index 7173aa9..2b68d9c 100644 --- a/src/H5FDsubfiling/H5FDioc.h +++ b/src/H5FDsubfiling/H5FDioc.h @@ -84,11 +84,6 @@ * Property List. A pointer to an instance of this structure is * a parameter to H5Pset_fapl_ioc() and H5Pget_fapl_ioc(). * - * The #H5FD_IOC driver shares much of its configuration with the - * #H5FD_SUBFILING driver and so its configuration structure - * contains an instance of a H5FD_subfiling_shared_config_t - * configuration structure. - * * \var uint32_t H5FD_ioc_config_t::magic * A somewhat unique number which distinguishes the #H5FD_IOC driver * from other drivers. Used in combination with a version number, it @@ -101,31 +96,17 @@ * number or an error will be raised. Currently, this field should be set * to #H5FD_IOC_CURR_FAPL_VERSION. * - * \var hid_t H5FD_ioc_config_t::under_fapl_id - * The File Access Property List which is setup with the file driver - * to use for I/O to the HDF5 stub file. The stub file looks like a - * typical HDF5 file, but currently only contains the superblock metadata - * for compatibility with legacy HDF5 applications. The default driver used - * is currently the #H5FD_MPIO driver. - * * \var int32_t H5FD_ioc_config_t::thread_pool_size * The number of I/O concentrator worker threads to use. 
* * This value can also be set or adjusted with the #H5FD_IOC_THREAD_POOL_SIZE * environment variable. * - * \var H5FD_subfiling_shared_config_t H5FD_ioc_config_t::subf_config - * Subfiling configuration data for the parent #H5FD_SUBFILING driver. This - * includes the sub-file stripe size, number of I/O concentrators, IOC - * selection method, etc. - * */ typedef struct H5FD_ioc_config_t { uint32_t magic; /* Must be set to H5FD_IOC_FAPL_MAGIC */ uint32_t version; /* Must be set to H5FD_IOC_CURR_FAPL_VERSION */ - hid_t under_fapl_id; /* FAPL setup with the VFD to use for I/O to the HDF5 stub file */ int32_t thread_pool_size; /* Number of I/O concentrator worker threads to use */ - H5FD_subfiling_shared_config_t subf_config; /* Subfiling driver configuration */ } H5FD_ioc_config_t; //! @@ -152,7 +133,7 @@ H5_DLL hid_t H5FD_ioc_init(void); * * The #H5FD_IOC driver is a reference implementation of an "I/O concentrator" * file driver that works in conjunction with the #H5FD_SUBFILING driver and - * provides the I/O backend for servicing I/O requests to sub-files. + * provides the I/O backend for servicing I/O requests to subfiles. * * Typically, an HDF5 application won't need to call this routine directly. 
* The #H5FD_IOC driver is usually set up as a side effect of an HDF5 application diff --git a/src/H5FDsubfiling/H5FDioc_int.c b/src/H5FDsubfiling/H5FDioc_int.c index 71afef4..e2ba95a 100644 --- a/src/H5FDsubfiling/H5FDioc_int.c +++ b/src/H5FDsubfiling/H5FDioc_int.c @@ -16,31 +16,36 @@ #include "H5FDioc_priv.h" -static int async_completion(void *arg); - /* - * Given a file offset, the stripe size and - * the number of IOCs, calculate the target - * IOC for I/O and the file offset for the - * subfile that IOC controls + * Given a file offset, the stripe size, the + * number of IOCs and the number of subfiles, + * calculate the target IOC for I/O, the index + * of the target subfile out of the subfiles + * that the IOC controls and the file offset + * into that subfile */ static inline void -calculate_target_ioc(int64_t file_offset, int64_t stripe_size, int n_io_concentrators, int64_t *target_ioc, - int64_t *ioc_file_offset) +calculate_target_ioc(int64_t file_offset, int64_t stripe_size, int num_io_concentrators, int num_subfiles, + int64_t *target_ioc, int64_t *ioc_file_offset, int64_t *ioc_subfile_idx) { int64_t stripe_idx; int64_t subfile_row; + int64_t subfile_idx; + HDassert(stripe_size > 0); + HDassert(num_io_concentrators > 0); + HDassert(num_subfiles > 0); HDassert(target_ioc); HDassert(ioc_file_offset); - HDassert(stripe_size > 0); - HDassert(n_io_concentrators > 0); + HDassert(ioc_subfile_idx); stripe_idx = file_offset / stripe_size; - subfile_row = stripe_idx / n_io_concentrators; + subfile_row = stripe_idx / num_subfiles; + subfile_idx = (stripe_idx % num_subfiles) / num_io_concentrators; - *target_ioc = stripe_idx % n_io_concentrators; + *target_ioc = (stripe_idx % num_subfiles) % num_io_concentrators; *ioc_file_offset = (subfile_row * stripe_size) + (file_offset % stripe_size); + *ioc_subfile_idx = subfile_idx; } /* @@ -90,17 +95,20 @@ cast_to_void(const void *data) *------------------------------------------------------------------------- */ herr_t 
-ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t offset, int64_t elements, - const void *data, io_req_t **io_req) +ioc__write_independent_async(int64_t context_id, int64_t offset, int64_t elements, const void *data, + io_req_t **io_req) { subfiling_context_t *sf_context = NULL; MPI_Request ack_request = MPI_REQUEST_NULL; io_req_t *sf_io_request = NULL; int64_t ioc_start; int64_t ioc_offset; + int64_t ioc_subfile_idx; int64_t msg[3] = {0}; int *io_concentrators = NULL; - int data_tag = 0; + int num_io_concentrators; + int num_subfiles; + int data_tag = 0; int mpi_code; herr_t ret_value = SUCCEED; @@ -111,13 +119,16 @@ ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t HDassert(sf_context->topology); HDassert(sf_context->topology->io_concentrators); - io_concentrators = sf_context->topology->io_concentrators; + io_concentrators = sf_context->topology->io_concentrators; + num_io_concentrators = sf_context->topology->n_io_concentrators; + num_subfiles = sf_context->sf_num_subfiles; /* * Calculate the IOC that we'll send the I/O request to * and the offset within that IOC's subfile */ - calculate_target_ioc(offset, sf_context->sf_stripe_size, n_io_concentrators, &ioc_start, &ioc_offset); + calculate_target_ioc(offset, sf_context->sf_stripe_size, num_io_concentrators, num_subfiles, &ioc_start, + &ioc_offset, &ioc_subfile_idx); /* * Wait for memory to be allocated on the target IOC before @@ -141,37 +152,43 @@ ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t */ msg[0] = elements; msg[1] = ioc_offset; - msg[2] = context_id; - if (MPI_SUCCESS != (mpi_code = MPI_Send(msg, 3, MPI_INT64_T, io_concentrators[ioc_start], WRITE_INDEP, - sf_context->sf_msg_comm))) + msg[2] = ioc_subfile_idx; + if (MPI_SUCCESS != (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type, io_concentrators[ioc_start], + WRITE_INDEP, sf_context->sf_msg_comm))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", 
mpi_code); - /* Wait to receive data tag */ + /* Wait to receive the data tag from the IOC */ if (MPI_SUCCESS != (mpi_code = MPI_Wait(&ack_request, MPI_STATUS_IGNORE))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Wait failed", mpi_code); if (data_tag == 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "received NACK from IOC"); - /* At this point in the new implementation, we should queue - * the async write so that when the top level VFD tells us - * to complete all pending IO requests, we have all the info - * we need to accomplish that. + /* + * Allocate the I/O request object that will + * be returned to the caller */ if (NULL == (sf_io_request = HDmalloc(sizeof(io_req_t)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_WRITEERROR, FAIL, "couldn't allocate I/O request"); H5_CHECK_OVERFLOW(ioc_start, int64_t, int); - sf_io_request->completion_func.io_args.ioc = (int)ioc_start; - sf_io_request->completion_func.io_args.context_id = context_id; - sf_io_request->completion_func.io_args.offset = offset; - sf_io_request->completion_func.io_args.elements = elements; - sf_io_request->completion_func.io_args.data = cast_to_void(data); - sf_io_request->completion_func.io_args.io_req = MPI_REQUEST_NULL; - sf_io_request->completion_func.io_function = async_completion; - sf_io_request->completion_func.pending = 0; + sf_io_request->ioc = (int)ioc_start; + sf_io_request->context_id = context_id; + sf_io_request->offset = offset; + sf_io_request->elements = elements; + sf_io_request->data = cast_to_void(data); + sf_io_request->io_transfer_req = MPI_REQUEST_NULL; + sf_io_request->io_comp_req = MPI_REQUEST_NULL; + sf_io_request->io_comp_tag = -1; - sf_io_request->prev = sf_io_request->next = NULL; + /* + * Start a non-blocking receive from the IOC that signifies + * when the actual write is complete + */ + if (MPI_SUCCESS != + (mpi_code = MPI_Irecv(&sf_io_request->io_comp_tag, 1, MPI_INT, io_concentrators[ioc_start], + WRITE_DATA_DONE, sf_context->sf_data_comm, 
&sf_io_request->io_comp_req))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Irecv failed", mpi_code); /* * Start the actual data transfer using the ack received @@ -180,7 +197,7 @@ ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t H5_CHECK_OVERFLOW(elements, int64_t, int); if (MPI_SUCCESS != (mpi_code = MPI_Isend(data, (int)elements, MPI_BYTE, io_concentrators[ioc_start], data_tag, - sf_context->sf_data_comm, &sf_io_request->completion_func.io_args.io_req))) + sf_context->sf_data_comm, &sf_io_request->io_transfer_req))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Isend failed", mpi_code); /* @@ -193,14 +210,23 @@ ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t * to the caller. */ - sf_io_request->completion_func.pending = 1; - *io_req = sf_io_request; + *io_req = sf_io_request; done: if (ret_value < 0) { if (ack_request != MPI_REQUEST_NULL) { - if (MPI_SUCCESS != (mpi_code = MPI_Cancel(&ack_request))) - H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Cancel failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Wait(&ack_request, MPI_STATUS_IGNORE))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Wait failed", mpi_code); + } + if (sf_io_request) { + if (sf_io_request->io_transfer_req != MPI_REQUEST_NULL) { + if (MPI_SUCCESS != (mpi_code = MPI_Wait(&sf_io_request->io_transfer_req, MPI_STATUS_IGNORE))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Wait failed", mpi_code); + } + if (sf_io_request->io_comp_req != MPI_REQUEST_NULL) { + if (MPI_SUCCESS != (mpi_code = MPI_Wait(&sf_io_request->io_comp_req, MPI_STATUS_IGNORE))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Wait failed", mpi_code); + } } HDfree(sf_io_request); @@ -241,81 +267,141 @@ done: *------------------------------------------------------------------------- */ herr_t -ioc__read_independent_async(int64_t context_id, int n_io_concentrators, int64_t offset, int64_t elements, - void *data, io_req_t **io_req) +ioc__read_independent_async(int64_t context_id, int64_t offset, 
int64_t elements, void *data, + io_req_t **io_req) { subfiling_context_t *sf_context = NULL; + MPI_Request ack_request = MPI_REQUEST_NULL; io_req_t *sf_io_request = NULL; + hbool_t need_data_tag = FALSE; int64_t ioc_start; int64_t ioc_offset; + int64_t ioc_subfile_idx; int64_t msg[3] = {0}; int *io_concentrators = NULL; + int num_io_concentrators; + int num_subfiles; + int data_tag = 0; int mpi_code; herr_t ret_value = SUCCEED; HDassert(io_req); + H5_CHECK_OVERFLOW(elements, int64_t, int); + if (NULL == (sf_context = H5_get_subfiling_object(context_id))) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "can't get subfiling context from ID"); HDassert(sf_context->topology); HDassert(sf_context->topology->io_concentrators); - io_concentrators = sf_context->topology->io_concentrators; + io_concentrators = sf_context->topology->io_concentrators; + num_io_concentrators = sf_context->topology->n_io_concentrators; + num_subfiles = sf_context->sf_num_subfiles; + + /* + * If we are using 1 subfile per IOC, we can optimize reads + * a little since each read will go to a separate IOC and we + * won't be in danger of data being received in an + * unpredictable order. However, if some IOCs own more than + * 1 subfile, we need to associate each read with a unique + * message tag to make sure the data is received in the + * correct order. 
+ */ + need_data_tag = num_subfiles != num_io_concentrators; + if (!need_data_tag) + data_tag = READ_INDEP_DATA; /* * Calculate the IOC that we'll send the I/O request to * and the offset within that IOC's subfile */ - calculate_target_ioc(offset, sf_context->sf_stripe_size, n_io_concentrators, &ioc_start, &ioc_offset); + calculate_target_ioc(offset, sf_context->sf_stripe_size, num_io_concentrators, num_subfiles, &ioc_start, + &ioc_offset, &ioc_subfile_idx); /* - * At this point in the new implementation, we should queue - * the non-blocking recv so that when the top level VFD tells - * us to complete all pending IO requests, we have all the info - * we need to accomplish that. - * - * Post the early non-blocking receive here. + * Allocate the I/O request object that will + * be returned to the caller */ if (NULL == (sf_io_request = HDmalloc(sizeof(io_req_t)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_READERROR, FAIL, "couldn't allocate I/O request"); H5_CHECK_OVERFLOW(ioc_start, int64_t, int); - sf_io_request->completion_func.io_args.ioc = (int)ioc_start; - sf_io_request->completion_func.io_args.context_id = context_id; - sf_io_request->completion_func.io_args.offset = offset; - sf_io_request->completion_func.io_args.elements = elements; - sf_io_request->completion_func.io_args.data = data; - sf_io_request->completion_func.io_args.io_req = MPI_REQUEST_NULL; - sf_io_request->completion_func.io_function = async_completion; - sf_io_request->completion_func.pending = 0; - - sf_io_request->prev = sf_io_request->next = NULL; + sf_io_request->ioc = (int)ioc_start; + sf_io_request->context_id = context_id; + sf_io_request->offset = offset; + sf_io_request->elements = elements; + sf_io_request->data = data; + sf_io_request->io_transfer_req = MPI_REQUEST_NULL; + sf_io_request->io_comp_req = MPI_REQUEST_NULL; + sf_io_request->io_comp_tag = -1; + + if (need_data_tag) { + /* + * Post an early non-blocking receive for IOC to send an ACK + * (or NACK) message with a data tag 
that we will use for + * receiving data + */ + if (MPI_SUCCESS != (mpi_code = MPI_Irecv(&data_tag, 1, MPI_INT, io_concentrators[ioc_start], + READ_INDEP_ACK, sf_context->sf_data_comm, &ack_request))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Irecv failed", mpi_code); + + /* + * Prepare and send an I/O request to the IOC identified + * by the file offset + */ + msg[0] = elements; + msg[1] = ioc_offset; + msg[2] = ioc_subfile_idx; + if (MPI_SUCCESS != + (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type, io_concentrators[ioc_start], READ_INDEP, + sf_context->sf_msg_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code); + + /* Wait to receive the data tag from the IOC */ + if (MPI_SUCCESS != (mpi_code = MPI_Wait(&ack_request, MPI_STATUS_IGNORE))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Wait failed", mpi_code); + + if (data_tag == 0) + H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "received NACK from IOC"); + } - H5_CHECK_OVERFLOW(elements, int64_t, int); + /* + * Post a non-blocking receive for the data from the IOC + * using the selected data tag (either the one received + * from the IOC or the static READ_INDEP_DATA tag) + */ if (MPI_SUCCESS != - (mpi_code = MPI_Irecv(data, (int)elements, MPI_BYTE, io_concentrators[ioc_start], READ_INDEP_DATA, - sf_context->sf_data_comm, &sf_io_request->completion_func.io_args.io_req))) + (mpi_code = MPI_Irecv(data, (int)elements, MPI_BYTE, io_concentrators[ioc_start], data_tag, + sf_context->sf_data_comm, &sf_io_request->io_transfer_req))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Irecv failed", mpi_code); - sf_io_request->completion_func.pending = 1; - *io_req = sf_io_request; + if (!need_data_tag) { + /* + * Prepare and send an I/O request to the IOC identified + * by the file offset + */ + msg[0] = elements; + msg[1] = ioc_offset; + msg[2] = ioc_subfile_idx; + if (MPI_SUCCESS != + (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type, io_concentrators[ioc_start], READ_INDEP, + 
sf_context->sf_msg_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code); + } - /* - * Prepare and send an I/O request to the IOC identified - * by the file offset - */ - msg[0] = elements; - msg[1] = ioc_offset; - msg[2] = context_id; - if (MPI_SUCCESS != (mpi_code = MPI_Send(msg, 3, MPI_INT64_T, io_concentrators[ioc_start], READ_INDEP, - sf_context->sf_msg_comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code); + *io_req = sf_io_request; done: if (ret_value < 0) { - if (sf_io_request && sf_io_request->completion_func.io_args.io_req != MPI_REQUEST_NULL) { - if (MPI_SUCCESS != (mpi_code = MPI_Cancel(&sf_io_request->completion_func.io_args.io_req))) - H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Cancel failed", mpi_code); + if (ack_request != MPI_REQUEST_NULL) { + if (MPI_SUCCESS != (mpi_code = MPI_Wait(&ack_request, MPI_STATUS_IGNORE))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Wait failed", mpi_code); + } + if (sf_io_request) { + if (sf_io_request->io_transfer_req != MPI_REQUEST_NULL) { + if (MPI_SUCCESS != (mpi_code = MPI_Wait(&sf_io_request->io_transfer_req, MPI_STATUS_IGNORE))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Wait failed", mpi_code); + } } HDfree(sf_io_request); @@ -326,56 +412,27 @@ done: } /* end ioc__read_independent_async() */ /*------------------------------------------------------------------------- - * Function: async_completion + * Function: ioc__async_completion * - * Purpose: Given a single io_func_t structure containing the function - * pointer and it's input arguments and a single MPI_Request - * argument which needs to be completed, we make progress - * by calling MPI_Test. In this initial example, we loop - * until the request is completed as indicated by a non-zero - * flag variable. + * Purpose: IOC function to complete outstanding I/O requests. + * Currently just a wrapper around MPI_Waitall on the given + * MPI_Request array. 
* - * As we go further with the implementation, we anticipate that - * rather than testing a single request variable, we will - * deal with a collection of all pending IO requests (on - * this rank). + * Return: Non-negative on success/Negative on failure * - * Return: an integer status. Zero(0) indicates success. Negative - * values (-1) indicates an error. *------------------------------------------------------------------------- */ -static int -async_completion(void *arg) +herr_t +ioc__async_completion(MPI_Request *mpi_reqs, size_t num_reqs) { - int n_reqs; - int mpi_code; - int ret_value = 0; - struct async_arg { - int n_reqs; - MPI_Request *sf_reqs; - } *in_progress = (struct async_arg *)arg; - - HDassert(arg); - - n_reqs = in_progress->n_reqs; + herr_t ret_value = SUCCEED; + int mpi_code; - if (n_reqs < 0) { -#ifdef H5FD_IOC_DEBUG - HDprintf("%s: invalid number of in progress I/O requests\n", __func__); -#endif + HDassert(mpi_reqs); - ret_value = -1; - goto done; - } - - if (MPI_SUCCESS != (mpi_code = MPI_Waitall(n_reqs, in_progress->sf_reqs, MPI_STATUSES_IGNORE))) { -#ifdef H5FD_IOC_DEBUG - HDprintf("%s: MPI_Waitall failed with rc %d\n", __func__, mpi_code); -#endif - - ret_value = -1; - goto done; - } + H5_CHECK_OVERFLOW(num_reqs, size_t, int); + if (MPI_SUCCESS != (mpi_code = MPI_Waitall((int)num_reqs, mpi_reqs, MPI_STATUSES_IGNORE))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Waitall failed", mpi_code); done: H5_SUBFILING_FUNC_LEAVE; diff --git a/src/H5FDsubfiling/H5FDioc_priv.h b/src/H5FDsubfiling/H5FDioc_priv.h index a86810c..3b0c4d0 100644 --- a/src/H5FDsubfiling/H5FDioc_priv.h +++ b/src/H5FDsubfiling/H5FDioc_priv.h @@ -394,26 +394,15 @@ typedef struct ioc_io_queue { * input arguments for the functions which were originally * invoked. See below. */ -typedef struct _client_io_args { - int ioc; /* ID of the IO Concentrator handling this IO. 
*/ - int64_t context_id; /* The context id provided for the read or write */ - int64_t offset; /* The file offset for the IO operation */ - int64_t elements; /* How many bytes */ - void *data; /* A pointer to the (contiguous) data segment */ - MPI_Request io_req; /* An MPI request to allow the code to loop while */ - /* making progress on multiple IOs */ -} io_args_t; - -typedef struct _client_io_func { - int (*io_function)(void *this_io); /* pointer to a completion function */ - io_args_t io_args; /* arguments passed to the completion function */ - int pending; /* The function is complete (0) or pending (1)? */ -} io_func_t; - typedef struct _io_req { - struct _io_req *prev; /* A simple list structure containing completion */ - struct _io_req *next; /* functions. These should get removed as IO ops */ - io_func_t completion_func; /* are completed */ + int ioc; /* ID of the IO Concentrator handling this IO. */ + int64_t context_id; /* The context id provided for the read or write */ + int64_t offset; /* The file offset for the IO operation */ + int64_t elements; /* How many bytes */ + void *data; /* A pointer to the (contiguous) data segment */ + MPI_Request io_transfer_req; /* MPI request for Isend/Irecv of I/O data */ + MPI_Request io_comp_req; /* MPI request signifying when actual I/O is finished */ + int io_comp_tag; /* MPI tag value used for completed I/O request */ } io_req_t; extern int *H5FD_IOC_tag_ub_val_ptr; @@ -425,10 +414,12 @@ extern "C" { H5_DLL int initialize_ioc_threads(void *_sf_context); H5_DLL int finalize_ioc_threads(void *_sf_context); -H5_DLL herr_t ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t offset, - int64_t elements, const void *data, io_req_t **io_req); -H5_DLL herr_t ioc__read_independent_async(int64_t context_id, int n_io_concentrators, int64_t offset, - int64_t elements, void *data, io_req_t **io_req); +H5_DLL herr_t ioc__write_independent_async(int64_t context_id, int64_t offset, int64_t elements, + 
const void *data, io_req_t **io_req); +H5_DLL herr_t ioc__read_independent_async(int64_t context_id, int64_t offset, int64_t elements, void *data, + io_req_t **io_req); + +H5_DLL herr_t ioc__async_completion(MPI_Request *mpi_reqs, size_t num_reqs); H5_DLL int wait_for_thread_main(void); diff --git a/src/H5FDsubfiling/H5FDioc_threads.c b/src/H5FDsubfiling/H5FDioc_threads.c index 813fb3f..b3e8ebc 100644 --- a/src/H5FDsubfiling/H5FDioc_threads.c +++ b/src/H5FDsubfiling/H5FDioc_threads.c @@ -72,16 +72,16 @@ static double sf_queue_delay_time = 0.0; static HG_THREAD_RETURN_TYPE ioc_thread_main(void *arg); static int ioc_main(ioc_data_t *ioc_data); -static int ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm, +static int ioc_file_queue_write_indep(sf_work_request_t *msg, int ioc_idx, int source, MPI_Comm comm, uint32_t counter); -static int ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); +static int ioc_file_queue_read_indep(sf_work_request_t *msg, int ioc_idx, int source, MPI_Comm comm, + uint32_t counter); static int ioc_file_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, - int subfile_rank); -static int ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, - int subfile_rank); -static int ioc_file_truncate(int fd, int64_t length, int subfile_rank); -static int ioc_file_report_eof(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); + int ioc_idx); +static int ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int ioc_idx); +static int ioc_file_truncate(sf_work_request_t *msg); +static int ioc_file_report_eof(sf_work_request_t *msg, MPI_Comm comm); static ioc_io_queue_entry_t *ioc_io_queue_alloc_entry(void); static void ioc_io_queue_complete_entry(ioc_data_t *ioc_data, ioc_io_queue_entry_t *entry_ptr); @@ -156,6 +156,8 @@ initialize_ioc_threads(void *_sf_context) 
#endif }; + sf_context->ioc_data = ioc_data; + /* Initialize atomic vars */ atomic_init(&ioc_data->sf_ioc_ready, 0); atomic_init(&ioc_data->sf_shutdown_flag, 0); @@ -194,7 +196,7 @@ initialize_ioc_threads(void *_sf_context) t_end = MPI_Wtime(); #ifdef H5FD_IOC_DEBUG - if (sf_context->topology->subfile_rank == 0) { + if (sf_context->topology->ioc_idx == 0) { HDprintf("%s: time = %lf seconds\n", __func__, (t_end - t_start)); HDfflush(stdout); } @@ -202,8 +204,6 @@ initialize_ioc_threads(void *_sf_context) #endif - sf_context->ioc_data = ioc_data; - done: H5_SUBFILING_FUNC_LEAVE; } @@ -245,6 +245,7 @@ finalize_ioc_threads(void *_sf_context) ioc_data->io_queue.num_failed); HDfree(ioc_data); + sf_context->ioc_data = NULL; H5_SUBFILING_FUNC_LEAVE; } @@ -346,7 +347,6 @@ ioc_main(ioc_data_t *ioc_data) { subfiling_context_t *context = NULL; sf_work_request_t wk_req; - int subfile_rank; int shutdown_requested; int ret_value = 0; @@ -362,8 +362,6 @@ ioc_main(ioc_data_t *ioc_data) * represent an open file). 
*/ - subfile_rank = context->sf_group_rank; - /* tell initialize_ioc_threads() that ioc_main() is ready to enter its main loop */ atomic_store(&ioc_data->sf_ioc_ready, 1); @@ -415,11 +413,11 @@ ioc_main(ioc_data_t *ioc_data) queue_start_time = MPI_Wtime(); - wk_req.tag = tag; - wk_req.source = source; - wk_req.subfile_rank = subfile_rank; - wk_req.context_id = ioc_data->sf_context_id; - wk_req.start_time = queue_start_time; + wk_req.tag = tag; + wk_req.source = source; + wk_req.ioc_idx = context->topology->ioc_idx; + wk_req.context_id = ioc_data->sf_context_id; + wk_req.start_time = queue_start_time; ioc_io_queue_add_entry(ioc_data, &wk_req); @@ -506,7 +504,7 @@ handle_work_request(void *arg) subfiling_context_t *sf_context = NULL; sf_work_request_t *msg = &(q_entry_ptr->wk_req); ioc_data_t *ioc_data = NULL; - int64_t file_context_id = msg->header[2]; + int64_t file_context_id = msg->context_id; int op_ret; hg_thread_ret_t ret_value = 0; @@ -524,27 +522,27 @@ handle_work_request(void *arg) switch (msg->tag) { case WRITE_INDEP: - op_ret = ioc_file_queue_write_indep(msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm, + op_ret = ioc_file_queue_write_indep(msg, msg->ioc_idx, msg->source, sf_context->sf_data_comm, q_entry_ptr->counter); break; case READ_INDEP: - op_ret = ioc_file_queue_read_indep(msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm); + op_ret = ioc_file_queue_read_indep(msg, msg->ioc_idx, msg->source, sf_context->sf_data_comm, + q_entry_ptr->counter); break; case TRUNC_OP: - op_ret = ioc_file_truncate(sf_context->sf_fid, q_entry_ptr->wk_req.header[0], - sf_context->topology->subfile_rank); + op_ret = ioc_file_truncate(msg); break; case GET_EOF_OP: - op_ret = ioc_file_report_eof(msg, msg->subfile_rank, msg->source, sf_context->sf_eof_comm); + op_ret = ioc_file_report_eof(msg, sf_context->sf_eof_comm); break; default: #ifdef H5_SUBFILING_DEBUG H5_subfiling_log(file_context_id, "%s: IOC %d received unknown message with tag %x from rank 
%d", - __func__, msg->subfile_rank, msg->tag, msg->source); + __func__, msg->ioc_idx, msg->tag, msg->source); #endif op_ret = -1; @@ -555,11 +553,11 @@ handle_work_request(void *arg) if (op_ret < 0) { #ifdef H5_SUBFILING_DEBUG - H5_subfiling_log( - file_context_id, - "%s: IOC %d request(%s) filename=%s from rank(%d), size=%ld, offset=%ld FAILED with ret %d", - __func__, msg->subfile_rank, translate_opcode((io_op_t)msg->tag), sf_context->sf_filename, - msg->source, msg->header[0], msg->header[1], op_ret); + H5_subfiling_log(file_context_id, + "%s: IOC %d request(%s) from rank(%d), (%" PRId64 ", %" PRId64 ", %" PRId64 + ") FAILED with ret %d", + __func__, msg->ioc_idx, translate_opcode((io_op_t)msg->tag), msg->source, + msg->header[0], msg->header[1], msg->header[2], op_ret); #endif q_entry_ptr->wk_ret = op_ret; @@ -686,15 +684,15 @@ from the thread pool threads... *------------------------------------------------------------------------- */ static int -ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm, - uint32_t counter) +ioc_file_queue_write_indep(sf_work_request_t *msg, int ioc_idx, int source, MPI_Comm comm, uint32_t counter) { subfiling_context_t *sf_context = NULL; MPI_Status msg_status; hbool_t send_nack = FALSE; + int64_t file_context_id; int64_t data_size; int64_t file_offset; - int64_t file_context_id; + int64_t subfile_idx; int64_t stripe_id; haddr_t sf_eof; #ifdef H5FD_IOC_COLLECT_STATS @@ -714,10 +712,12 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, HDassert(msg); + file_context_id = msg->context_id; + /* Retrieve the fields of the RPC message for the write operation */ - data_size = msg->header[0]; - file_offset = msg->header[1]; - file_context_id = msg->header[2]; + data_size = msg->header[0]; + file_offset = msg->header[1]; + subfile_idx = msg->header[2]; if (data_size < 0) { send_nack = TRUE; @@ -746,7 +746,7 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int 
subfile_rank, int source, #ifdef H5_SUBFILING_DEBUG H5_subfiling_log(file_context_id, "[ioc(%d) %s]: msg from %d: datasize=%ld\toffset=%ld, queue_delay = %lf seconds\n", - subfile_rank, __func__, source, data_size, file_offset, t_queue_delay); + ioc_idx, __func__, source, data_size, file_offset, t_queue_delay); #endif #endif @@ -764,12 +764,12 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, * allows us to distinguish between multiple concurrent * writes from a single rank. */ - HDassert(H5FD_IOC_tag_ub_val_ptr && (*H5FD_IOC_tag_ub_val_ptr >= WRITE_TAG_BASE)); - rcv_tag = (int)(counter % (INT_MAX - WRITE_TAG_BASE)); - rcv_tag %= (*H5FD_IOC_tag_ub_val_ptr - WRITE_TAG_BASE); - rcv_tag += WRITE_TAG_BASE; + HDassert(H5FD_IOC_tag_ub_val_ptr && (*H5FD_IOC_tag_ub_val_ptr >= IO_TAG_BASE)); + rcv_tag = (int)(counter % (INT_MAX - IO_TAG_BASE)); + rcv_tag %= (*H5FD_IOC_tag_ub_val_ptr - IO_TAG_BASE); + rcv_tag += IO_TAG_BASE; - if (send_ack_to_client(rcv_tag, source, subfile_rank, WRITE_INDEP_ACK, comm) < 0) + if (send_ack_to_client(rcv_tag, source, ioc_idx, WRITE_INDEP_ACK, comm) < 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, -1, "couldn't send ACK to client"); /* Receive data from client */ @@ -794,13 +794,14 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, t_start = t_end; #ifdef H5_SUBFILING_DEBUG - H5_subfiling_log(file_context_id, "[ioc(%d) %s] MPI_Recv(%ld bytes, from = %d) status = %d\n", - subfile_rank, __func__, data_size, source, mpi_code); + H5_subfiling_log(file_context_id, "[ioc(%d) %s] MPI_Recv(%ld bytes, from = %d) status = %d\n", ioc_idx, + __func__, data_size, source, mpi_code); #endif #endif - sf_fid = sf_context->sf_fid; + HDassert(subfile_idx < sf_context->sf_num_fids); + sf_fid = sf_context->sf_fids[subfile_idx]; #ifdef H5_SUBFILING_DEBUG if (sf_fid < 0) @@ -810,7 +811,7 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, if (sf_fid >= 0) { /* 
Actually write data received from client into subfile */ - if ((write_ret = ioc_file_write_data(sf_fid, file_offset, recv_buf, data_size, subfile_rank)) < 0) + if ((write_ret = ioc_file_write_data(sf_fid, file_offset, recv_buf, data_size, ioc_idx)) < 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, -1, "write function(FID=%d, Source=%d) returned an error (%d)", sf_fid, source, write_ret); @@ -834,10 +835,17 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, H5FD_ioc_end_thread_exclusive(); + /* + * Send a message back to the client that the I/O call has + * completed and it is safe to return from the write call + */ + if (MPI_SUCCESS != (mpi_code = MPI_Send(&rcv_tag, 1, MPI_INT, source, WRITE_DATA_DONE, comm))) + H5_SUBFILING_MPI_GOTO_ERROR(-1, "MPI_Send failed", mpi_code); + done: if (send_nack) { /* Send NACK back to client so client can handle failure gracefully */ - if (send_nack_to_client(source, subfile_rank, WRITE_INDEP_ACK, comm) < 0) + if (send_nack_to_client(source, ioc_idx, WRITE_INDEP_ACK, comm) < 0) H5_SUBFILING_DONE_ERROR(H5E_IO, H5E_WRITEERROR, -1, "couldn't send NACK to client"); } @@ -867,13 +875,16 @@ done: *------------------------------------------------------------------------- */ static int -ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) +ioc_file_queue_read_indep(sf_work_request_t *msg, int ioc_idx, int source, MPI_Comm comm, uint32_t counter) { subfiling_context_t *sf_context = NULL; hbool_t send_empty_buf = TRUE; + hbool_t send_nack = FALSE; + hbool_t need_data_tag = FALSE; + int64_t file_context_id; int64_t data_size; int64_t file_offset; - int64_t file_context_id; + int64_t subfile_idx; #ifdef H5FD_IOC_COLLECT_STATS double t_start; double t_end; @@ -881,6 +892,7 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, double t_queue_delay; #endif char *send_buf = NULL; + int send_tag; int sf_fid; int read_ret; int mpi_code; @@ 
-888,17 +900,37 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, HDassert(msg); - /* Retrieve the fields of the RPC message for the read operation */ - data_size = msg->header[0]; - file_offset = msg->header[1]; - file_context_id = msg->header[2]; - - if (data_size < 0) - H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_BADVALUE, -1, "invalid data size for read"); + file_context_id = msg->context_id; sf_context = H5_get_subfiling_object(file_context_id); HDassert(sf_context); + /* + * If we are using 1 subfile per IOC, we can optimize reads + * a little since each read will go to a separate IOC and we + * won't be in danger of data being received in an + * unpredictable order. However, if some IOCs own more than + * 1 subfile, we need to associate each read with a unique + * message tag to make sure the data is received in the + * correct order. + */ + need_data_tag = sf_context->sf_num_subfiles != sf_context->topology->n_io_concentrators; + if (!need_data_tag) + send_tag = READ_INDEP_DATA; + + /* Retrieve the fields of the RPC message for the read operation */ + data_size = msg->header[0]; + file_offset = msg->header[1]; + subfile_idx = msg->header[2]; + + if (data_size < 0) { + if (need_data_tag) { + send_nack = TRUE; + send_empty_buf = FALSE; + } + H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_BADVALUE, -1, "invalid data size for read"); + } + /* Flag that we've attempted to read data from the file */ sf_context->sf_read_count++; @@ -911,22 +943,48 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, #ifdef H5_SUBFILING_DEBUG H5_subfiling_log(file_context_id, - "[ioc(%d) %s] msg from %d: datasize=%ld\toffset=%ld queue_delay=%lf seconds\n", - subfile_rank, __func__, source, data_size, file_offset, t_queue_delay); + "[ioc(%d) %s] msg from %d: datasize=%ld\toffset=%ld queue_delay=%lf seconds\n", ioc_idx, + __func__, source, data_size, file_offset, t_queue_delay); #endif #endif /* Allocate space to send data read from file to 
client */ - if (NULL == (send_buf = HDmalloc((size_t)data_size))) + if (NULL == (send_buf = HDmalloc((size_t)data_size))) { + if (need_data_tag) { + send_nack = TRUE; + send_empty_buf = FALSE; + } H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, -1, "couldn't allocate send buffer for data"); + } + + if (need_data_tag) { + /* + * Calculate message tag for the client to use for receiving + * data, then send an ACK message to the client with the + * calculated message tag. This calculated message tag + * allows us to distinguish between multiple concurrent + * reads from a single rank, which can happen when a rank + * owns multiple subfiles. + */ + HDassert(H5FD_IOC_tag_ub_val_ptr && (*H5FD_IOC_tag_ub_val_ptr >= IO_TAG_BASE)); + send_tag = (int)(counter % (INT_MAX - IO_TAG_BASE)); + send_tag %= (*H5FD_IOC_tag_ub_val_ptr - IO_TAG_BASE); + send_tag += IO_TAG_BASE; + + if (send_ack_to_client(send_tag, source, ioc_idx, READ_INDEP_ACK, comm) < 0) { + send_empty_buf = FALSE; + H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, -1, "couldn't send ACK to client"); + } + } - sf_fid = sf_context->sf_fid; + /* Read data from the subfile */ + HDassert(subfile_idx < sf_context->sf_num_fids); + sf_fid = sf_context->sf_fids[subfile_idx]; if (sf_fid < 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_BADVALUE, -1, "subfile file descriptor %d is invalid", sf_fid); - /* Read data from the subfile */ - if ((read_ret = ioc_file_read_data(sf_fid, file_offset, send_buf, data_size, subfile_rank)) < 0) { + if ((read_ret = ioc_file_read_data(sf_fid, file_offset, send_buf, data_size, ioc_idx)) < 0) { H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, read_ret, "read function(FID=%d, Source=%d) returned an error (%d)", sf_fid, source, read_ret); @@ -936,8 +994,7 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, /* Send read data to the client */ H5_CHECK_OVERFLOW(data_size, int64_t, int); - if (MPI_SUCCESS != - (mpi_code = MPI_Send(send_buf, (int)data_size, MPI_BYTE, source, 
READ_INDEP_DATA, comm))) + if (MPI_SUCCESS != (mpi_code = MPI_Send(send_buf, (int)data_size, MPI_BYTE, source, send_tag, comm))) H5_SUBFILING_MPI_GOTO_ERROR(-1, "MPI_Send failed", mpi_code); #ifdef H5FD_IOC_COLLECT_STATS @@ -947,19 +1004,24 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, sf_queue_delay_time += t_queue_delay; #ifdef H5_SUBFILING_DEBUG - H5_subfiling_log(sf_context->sf_context_id, "[ioc(%d)] MPI_Send to source(%d) completed\n", subfile_rank, + H5_subfiling_log(sf_context->sf_context_id, "[ioc(%d)] MPI_Send to source(%d) completed\n", ioc_idx, source); #endif #endif done: + if (need_data_tag && send_nack) { + /* Send NACK back to client so client can handle failure gracefully */ + if (send_nack_to_client(source, ioc_idx, READ_INDEP_ACK, comm) < 0) + H5_SUBFILING_DONE_ERROR(H5E_IO, H5E_READERROR, -1, "couldn't send NACK to client"); + } if (send_empty_buf) { /* * Send an empty message back to client on failure. The client will * likely get a message truncation error, but at least shouldn't hang. */ - if (MPI_SUCCESS != (mpi_code = MPI_Send(NULL, 0, MPI_BYTE, source, READ_INDEP_DATA, comm))) + if (MPI_SUCCESS != (mpi_code = MPI_Send(NULL, 0, MPI_BYTE, source, send_tag, comm))) H5_SUBFILING_MPI_DONE_ERROR(-1, "MPI_Send failed", mpi_code); } @@ -978,7 +1040,7 @@ being thread safe. 
*/ static int -ioc_file_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank) +ioc_file_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int ioc_idx) { ssize_t bytes_remaining = (ssize_t)data_size; ssize_t bytes_written = 0; @@ -986,7 +1048,7 @@ ioc_file_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data int ret_value = 0; #ifndef H5FD_IOC_DEBUG - (void)subfile_rank; + (void)ioc_idx; #endif HDcompile_assert(H5_SIZEOF_OFF_T == sizeof(file_offset)); @@ -1000,7 +1062,7 @@ ioc_file_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data bytes_remaining -= bytes_written; #ifdef H5FD_IOC_DEBUG - HDprintf("[ioc(%d) %s]: wrote %ld bytes, remaining=%ld, file_offset=%" PRId64 "\n", subfile_rank, + HDprintf("[ioc(%d) %s]: wrote %ld bytes, remaining=%ld, file_offset=%" PRId64 "\n", ioc_idx, __func__, bytes_written, bytes_remaining, file_offset); #endif @@ -1024,7 +1086,7 @@ done: } /* end ioc_file_write_data() */ static int -ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank) +ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int ioc_idx) { useconds_t delay = 100; ssize_t bytes_remaining = (ssize_t)data_size; @@ -1034,7 +1096,7 @@ ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_ int ret_value = 0; #ifndef H5FD_IOC_DEBUG - (void)subfile_rank; + (void)ioc_idx; #endif HDcompile_assert(H5_SIZEOF_OFF_T == sizeof(file_offset)); @@ -1052,7 +1114,7 @@ ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_ bytes_remaining -= bytes_read; #ifdef H5FD_IOC_DEBUG - HDprintf("[ioc(%d) %s]: read %ld bytes, remaining=%ld, file_offset=%" PRId64 "\n", subfile_rank, + HDprintf("[ioc(%d) %s]: read %ld bytes, remaining=%ld, file_offset=%" PRId64 "\n", ioc_idx, __func__, bytes_read, bytes_remaining, file_offset); #endif @@ -1069,8 +1131,8 @@ 
ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_ else { if (retries == 0) { #ifdef H5FD_IOC_DEBUG - HDprintf("[ioc(%d) %s]: TIMEOUT: file_offset=%" PRId64 ", data_size=%ld\n", subfile_rank, - __func__, file_offset, data_size); + HDprintf("[ioc(%d) %s]: TIMEOUT: file_offset=%" PRId64 ", data_size=%ld\n", ioc_idx, __func__, + file_offset, data_size); #endif H5_SUBFILING_SYS_GOTO_ERROR(H5E_IO, H5E_READERROR, -1, "HDpread failed"); @@ -1087,19 +1149,40 @@ done: } /* end ioc_file_read_data() */ static int -ioc_file_truncate(int fd, int64_t length, int subfile_rank) +ioc_file_truncate(sf_work_request_t *msg) { - int ret_value = 0; + subfiling_context_t *sf_context = NULL; + int64_t file_context_id; + int64_t length; + int64_t subfile_idx; + int fd; + int ioc_idx; + int ret_value = 0; + + HDassert(msg); + + file_context_id = msg->context_id; + ioc_idx = msg->ioc_idx; + + length = msg->header[0]; + subfile_idx = msg->header[1]; #ifndef H5FD_IOC_DEBUG - (void)subfile_rank; + (void)ioc_idx; #endif + if (NULL == (sf_context = H5_get_subfiling_object(file_context_id))) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, -1, "couldn't retrieve subfiling context"); + + HDassert(subfile_idx < sf_context->sf_num_fids); + + fd = sf_context->sf_fids[subfile_idx]; + if (HDftruncate(fd, (off_t)length) != 0) H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SEEKERROR, -1, "HDftruncate failed"); #ifdef H5FD_IOC_DEBUG - HDprintf("[ioc(%d) %s]: truncated subfile to %lld bytes. ret = %d\n", subfile_rank, __func__, + HDprintf("[ioc(%d) %s]: truncated subfile to %lld bytes. ret = %d\n", ioc_idx, __func__, (long long)length, errno); HDfflush(stdout); #endif @@ -1111,7 +1194,7 @@ done: /*------------------------------------------------------------------------- * Function: ioc_file_report_eof * - * Purpose: Determine the target sub-file's eof and report this value + * Purpose: Determine the target subfile's eof and report this value * to the requesting rank. 
* * Notes: This function will have to be reworked once we solve @@ -1131,40 +1214,48 @@ done: */ static int -ioc_file_report_eof(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) +ioc_file_report_eof(sf_work_request_t *msg, MPI_Comm comm) { subfiling_context_t *sf_context = NULL; h5_stat_t sb; int64_t eof_req_reply[3]; int64_t file_context_id; + int64_t subfile_idx; int fd; + int source; + int ioc_idx; int mpi_code; int ret_value = 0; HDassert(msg); - /* first get the EOF of the target file. */ + file_context_id = msg->context_id; + source = msg->source; + ioc_idx = msg->ioc_idx; - file_context_id = msg->header[2]; + subfile_idx = msg->header[0]; if (NULL == (sf_context = H5_get_subfiling_object(file_context_id))) H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, -1, "couldn't retrieve subfiling context"); - fd = sf_context->sf_fid; + HDassert(subfile_idx < sf_context->sf_num_fids); + + fd = sf_context->sf_fids[subfile_idx]; if (HDfstat(fd, &sb) < 0) H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SYSERRSTR, -1, "HDfstat failed"); - eof_req_reply[0] = (int64_t)subfile_rank; + eof_req_reply[0] = (int64_t)ioc_idx; eof_req_reply[1] = (int64_t)(sb.st_size); - eof_req_reply[2] = 0; /* not used */ + eof_req_reply[2] = subfile_idx; #ifdef H5_SUBFILING_DEBUG H5_subfiling_log(file_context_id, "%s: reporting file EOF as %" PRId64 ".", __func__, eof_req_reply[1]); #endif /* return the subfile EOF to the querying rank */ - if (MPI_SUCCESS != (mpi_code = MPI_Send(eof_req_reply, 3, MPI_INT64_T, source, GET_EOF_COMPLETED, comm))) + if (MPI_SUCCESS != + (mpi_code = MPI_Send(eof_req_reply, 1, H5_subfiling_rpc_msg_type, source, GET_EOF_COMPLETED, comm))) H5_SUBFILING_MPI_GOTO_ERROR(-1, "MPI_Send", mpi_code); done: @@ -1272,12 +1363,13 @@ ioc_io_queue_add_entry(ioc_data_t *ioc_data, sf_work_request_t *wk_req_ptr) atomic_fetch_add(&ioc_data->sf_io_ops_pending, 1); #ifdef H5_SUBFILING_DEBUG - H5_subfiling_log(wk_req_ptr->context_id, - "%s: request %d queued. 
op = %d, offset/len = %lld/%lld, q-ed/disp/ops_pend = %d/%d/%d.", - __func__, entry_ptr->counter, (entry_ptr->wk_req.tag), - (long long)(entry_ptr->wk_req.header[1]), (long long)(entry_ptr->wk_req.header[0]), - ioc_data->io_queue.num_pending, ioc_data->io_queue.num_in_progress, - atomic_load(&ioc_data->sf_io_ops_pending)); + H5_subfiling_log( + wk_req_ptr->context_id, + "%s: request %d queued. op = %d, req = (%lld, %lld, %lld), q-ed/disp/ops_pend = %d/%d/%d.", __func__, + entry_ptr->counter, (entry_ptr->wk_req.tag), (long long)(entry_ptr->wk_req.header[0]), + (long long)(entry_ptr->wk_req.header[1]), (long long)(entry_ptr->wk_req.header[2]), + ioc_data->io_queue.num_pending, ioc_data->io_queue.num_in_progress, + atomic_load(&ioc_data->sf_io_ops_pending)); #endif HDassert(ioc_data->io_queue.num_pending + ioc_data->io_queue.num_in_progress == ioc_data->io_queue.q_len); @@ -1478,14 +1570,14 @@ ioc_io_queue_dispatch_eligible_entries(ioc_data_t *ioc_data, hbool_t try_lock) entry_ptr->thread_wk.args = entry_ptr; #ifdef H5_SUBFILING_DEBUG - H5_subfiling_log(entry_ptr->wk_req.context_id, - "%s: request %d dispatched. op = %d, offset/len = %lld/%lld, " - "q-ed/disp/ops_pend = %d/%d/%d.", - __func__, entry_ptr->counter, (entry_ptr->wk_req.tag), - (long long)(entry_ptr->wk_req.header[1]), - (long long)(entry_ptr->wk_req.header[0]), ioc_data->io_queue.num_pending, - ioc_data->io_queue.num_in_progress, - atomic_load(&ioc_data->sf_io_ops_pending)); + H5_subfiling_log( + entry_ptr->wk_req.context_id, + "%s: request %d dispatched. 
op = %d, req = (%lld, %lld, %lld), " + "q-ed/disp/ops_pend = %d/%d/%d.", + __func__, entry_ptr->counter, (entry_ptr->wk_req.tag), + (long long)(entry_ptr->wk_req.header[0]), (long long)(entry_ptr->wk_req.header[1]), + (long long)(entry_ptr->wk_req.header[2]), ioc_data->io_queue.num_pending, + ioc_data->io_queue.num_in_progress, atomic_load(&ioc_data->sf_io_ops_pending)); #endif #ifdef H5FD_IOC_COLLECT_STATS @@ -1564,12 +1656,12 @@ ioc_io_queue_complete_entry(ioc_data_t *ioc_data, ioc_io_queue_entry_t *entry_pt #ifdef H5_SUBFILING_DEBUG H5_subfiling_log(entry_ptr->wk_req.context_id, - "%s: request %d completed with ret %d. op = %d, offset/len = %lld/%lld, " + "%s: request %d completed with ret %d. op = %d, req = (%lld, %lld, %lld), " "q-ed/disp/ops_pend = %d/%d/%d.", __func__, entry_ptr->counter, entry_ptr->wk_ret, (entry_ptr->wk_req.tag), - (long long)(entry_ptr->wk_req.header[1]), (long long)(entry_ptr->wk_req.header[0]), - ioc_data->io_queue.num_pending, ioc_data->io_queue.num_in_progress, - atomic_load(&ioc_data->sf_io_ops_pending)); + (long long)(entry_ptr->wk_req.header[0]), (long long)(entry_ptr->wk_req.header[1]), + (long long)(entry_ptr->wk_req.header[2]), ioc_data->io_queue.num_pending, + ioc_data->io_queue.num_in_progress, atomic_load(&ioc_data->sf_io_ops_pending)); /* * If this I/O request is a truncate or "get eof" op, make sure diff --git a/src/H5FDsubfiling/H5FDsubfile_int.c b/src/H5FDsubfiling/H5FDsubfile_int.c index 22a5bd0..c089509 100644 --- a/src/H5FDsubfiling/H5FDsubfile_int.c +++ b/src/H5FDsubfiling/H5FDsubfile_int.c @@ -30,11 +30,11 @@ * Note: This code should be moved -- most likely to the IOC * code files. * - * Purpose: Apply a truncate operation to the sub-files. + * Purpose: Apply a truncate operation to the subfiles. * * In the context of the I/O concentrators, the eof must be * translated into the appropriate value for each of the - * sub-files, and then applied to same. + * subfiles, and then applied to same. 
* * Further, we must ensure that all prior I/O requests complete * before the truncate is applied. @@ -44,7 +44,7 @@ * 1) Run a barrier on entry. * * 2) Determine if this rank is a IOC. If it is, compute - * the correct EOF for this sub-file, and send a truncate + * the correct EOF for this subfile, and send a truncate * request to the IOC. * * 3) On the IOC thread, allow all pending I/O requests @@ -72,50 +72,61 @@ herr_t H5FD__subfiling__truncate_sub_files(hid_t context_id, int64_t logical_file_eof, MPI_Comm comm) { - int mpi_code; /* MPI return code */ subfiling_context_t *sf_context = NULL; - int64_t msg[3] = { - 0, - }; - herr_t ret_value = SUCCEED; /* Return value */ + int64_t msg[3] = {0}; + int mpi_size; + int mpi_code; + herr_t ret_value = SUCCEED; + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); /* Barrier on entry */ - if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + if (mpi_size > 1) + if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); if (NULL == (sf_context = (subfiling_context_t *)H5_get_subfiling_object(context_id))) H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "can't get subfile context"); - /* Test to see if this rank is running an I/O concentrator. 
*/ - if (sf_context->topology->rank_is_ioc) { - - int i; - int64_t subfile_eof; int64_t num_full_stripes; + int64_t num_leftover_stripes; int64_t partial_stripe_len; -#ifndef NDEBUG - int64_t test_file_eof; -#endif /* NDEBUG */ - /* if it is, first compute the sub-file EOF */ + num_full_stripes = logical_file_eof / sf_context->sf_blocksize_per_stripe; + partial_stripe_len = logical_file_eof % sf_context->sf_blocksize_per_stripe; + num_leftover_stripes = partial_stripe_len / sf_context->sf_stripe_size; - num_full_stripes = logical_file_eof / sf_context->sf_blocksize_per_stripe; - partial_stripe_len = logical_file_eof % sf_context->sf_blocksize_per_stripe; + /* Compute the EOF for each subfile this IOC owns */ + for (int i = 0; i < sf_context->sf_num_fids; i++) { + int64_t subfile_eof = num_full_stripes * sf_context->sf_stripe_size; + int64_t global_subfile_idx; - subfile_eof = num_full_stripes * sf_context->sf_stripe_size; + global_subfile_idx = + (i * sf_context->topology->n_io_concentrators) + sf_context->topology->ioc_idx; - if (sf_context->topology->subfile_rank < (partial_stripe_len / sf_context->sf_stripe_size)) { + if (global_subfile_idx < num_leftover_stripes) { + subfile_eof += sf_context->sf_stripe_size; + } + else if (global_subfile_idx == num_leftover_stripes) { + subfile_eof += partial_stripe_len % sf_context->sf_stripe_size; + } - subfile_eof += sf_context->sf_stripe_size; - } - else if (sf_context->topology->subfile_rank == (partial_stripe_len / sf_context->sf_stripe_size)) { + /* Direct the IOC to truncate this subfile to the correct EOF */ + msg[0] = subfile_eof; + msg[1] = i; + msg[2] = -1; /* padding -- not used in this message */ - subfile_eof += partial_stripe_len % sf_context->sf_stripe_size; + if (MPI_SUCCESS != + (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type, + sf_context->topology->io_concentrators[sf_context->topology->ioc_idx], + TRUNC_OP, sf_context->sf_msg_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code); 
} /* sanity check -- compute the file eof using the same mechanism used to - * compute the sub-file eof. Assert that the computed value and the + * compute the subfile eof. Assert that the computed value and the * actual value match. * * Do this only for debug builds -- probably delete this before release. @@ -124,40 +135,29 @@ H5FD__subfiling__truncate_sub_files(hid_t context_id, int64_t logical_file_eof, */ #ifndef NDEBUG - test_file_eof = 0; - - for (i = 0; i < sf_context->topology->n_io_concentrators; i++) { - - test_file_eof += num_full_stripes * sf_context->sf_stripe_size; - - if (i < (partial_stripe_len / sf_context->sf_stripe_size)) { - - test_file_eof += sf_context->sf_stripe_size; + { + int64_t test_file_eof = 0; + + for (int i = 0; i < sf_context->sf_num_subfiles; i++) { + test_file_eof += num_full_stripes * sf_context->sf_stripe_size; + + if (i < num_leftover_stripes) { + test_file_eof += sf_context->sf_stripe_size; + } + else if (i == num_leftover_stripes) { + test_file_eof += partial_stripe_len % sf_context->sf_stripe_size; + } } - else if (i == (partial_stripe_len / sf_context->sf_stripe_size)) { - test_file_eof += partial_stripe_len % sf_context->sf_stripe_size; - } + HDassert(test_file_eof == logical_file_eof); } - HDassert(test_file_eof == logical_file_eof); #endif /* NDEBUG */ - - /* then direct the IOC to truncate the sub-file to the correct EOF */ - - msg[0] = subfile_eof; - msg[1] = 0; /* padding -- not used in this message */ - msg[2] = context_id; - - if (MPI_SUCCESS != - (mpi_code = MPI_Send(msg, 3, MPI_INT64_T, - sf_context->topology->io_concentrators[sf_context->topology->subfile_rank], - TRUNC_OP, sf_context->sf_msg_comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code); } /* Barrier on exit */ - if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + if (mpi_size > 1) + if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm))) + 
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); done: @@ -176,9 +176,10 @@ done: * Do this as follows: * * 1) allocate an array of int64_t of length equal to the - * the number of IOCs, and initialize all fields to -1. + * the number of subfiles, and initialize all fields to -1. * - * 2) Send each IOC a message requesting that sub-file's EOF. + * 2) Send each subfile's IOC a message requesting that + * subfile's EOF. * * 3) Await reply from each IOC, storing the reply in * the appropriate entry in the array allocated in 1. @@ -197,13 +198,13 @@ done: * than for the more traditional HDF5 file implementations. * This statement derives from the fact that unlike "normal" * HDF5 files, subfiling introduces a multi-file representation - * of a single HDF5 file. The plurality of sub-files represents - * a software RAID-0 based HDF5 file. As such, each sub-file + * of a single HDF5 file. The plurality of subfiles represents + * a software RAID-0 based HDF5 file. As such, each subfile * contains a designated portion of the address space of the * virtual HDF5 storage. We have no notion of HDF5 datatypes, * datasets, metadata, or other HDF5 structures; only BYTES. * - * The organization of the bytes within sub-files is consistent + * The organization of the bytes within subfiles is consistent * with the RAID-0 striping, i.e. there are IO Concentrators * (IOCs) which correspond to a stripe-count (in Lustre) as * well as a stripe_size. The combination of these two @@ -220,7 +221,7 @@ done: * follows. * 1. 
At file creation, each IOC is assigned a rank value * (0 to N-1, where N is the total number of IOCs) and - * a 'sf_base_addr' = 'subfile_rank' * 'sf_stripe_size') + * a 'sf_base_addr' = 'ioc_idx' * 'sf_stripe_size') * we also determine the 'sf_blocksize_per_stripe' which * is simply the 'sf_stripe_size' * 'n_ioc_concentrators' * @@ -263,9 +264,10 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr) int64_t msg[3] = {0, 0, 0}; int64_t logical_eof = 0; int64_t sf_logical_eof; - int n_io_concentrators = 0; /* copy of value in topology */ - int mpi_code; /* MPI return code */ - herr_t ret_value = SUCCEED; /* Return value */ + int n_io_concentrators = 0; + int num_subfiles = 0; + int mpi_code; /* MPI return code */ + herr_t ret_value = SUCCEED; /* Return value */ HDassert(logical_eof_ptr); @@ -275,56 +277,60 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr) HDassert(sf_context->topology); n_io_concentrators = sf_context->topology->n_io_concentrators; + num_subfiles = sf_context->sf_num_subfiles; HDassert(n_io_concentrators > 0); + HDassert(num_subfiles >= n_io_concentrators); - if (NULL == (sf_eofs = HDmalloc((size_t)n_io_concentrators * sizeof(int64_t)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate sub-file EOFs array"); - if (NULL == (recv_reqs = HDmalloc((size_t)n_io_concentrators * sizeof(*recv_reqs)))) + if (NULL == (sf_eofs = HDmalloc((size_t)num_subfiles * sizeof(int64_t)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate subfile EOFs array"); + if (NULL == (recv_reqs = HDmalloc((size_t)num_subfiles * sizeof(*recv_reqs)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate receive requests array"); - if (NULL == (recv_msg = HDmalloc((size_t)n_io_concentrators * 3 * sizeof(*recv_msg)))) + if (NULL == (recv_msg = HDmalloc((size_t)num_subfiles * sizeof(msg)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't 
allocate message array"); - for (int i = 0; i < n_io_concentrators; i++) { + for (int i = 0; i < num_subfiles; i++) { sf_eofs[i] = -1; recv_reqs[i] = MPI_REQUEST_NULL; } - /* Post early non-blocking receives for replies from each IOC */ - for (int i = 0; i < n_io_concentrators; i++) { - int ioc_rank = sf_context->topology->io_concentrators[i]; + /* Post early non-blocking receives for the EOF of each subfile */ + for (int i = 0; i < num_subfiles; i++) { + int ioc_rank = sf_context->topology->io_concentrators[i % n_io_concentrators]; - if (MPI_SUCCESS != (mpi_code = MPI_Irecv(&recv_msg[3 * i], 3, MPI_INT64_T, ioc_rank, + if (MPI_SUCCESS != (mpi_code = MPI_Irecv(&recv_msg[3 * i], 1, H5_subfiling_rpc_msg_type, ioc_rank, GET_EOF_COMPLETED, sf_context->sf_eof_comm, &recv_reqs[i]))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Irecv", mpi_code); } - /* Send each IOC a message requesting that subfile's EOF */ + /* Send each subfile's IOC a message requesting that subfile's EOF */ + + msg[1] = -1; /* padding -- not used in this message */ + msg[2] = -1; /* padding -- not used in this message */ - msg[0] = 0; /* padding -- not used in this message */ - msg[1] = 0; /* padding -- not used in this message */ - msg[2] = context_id; + for (int i = 0; i < num_subfiles; i++) { + int ioc_rank = sf_context->topology->io_concentrators[i % n_io_concentrators]; - for (int i = 0; i < n_io_concentrators; i++) { - int ioc_rank = sf_context->topology->io_concentrators[i]; + /* Set subfile index for receiving IOC */ + msg[0] = i / n_io_concentrators; - if (MPI_SUCCESS != - (mpi_code = MPI_Send(msg, 3, MPI_INT64_T, ioc_rank, GET_EOF_OP, sf_context->sf_msg_comm))) + if (MPI_SUCCESS != (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type, ioc_rank, GET_EOF_OP, + sf_context->sf_msg_comm))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send", mpi_code); } /* Wait for EOF communication to complete */ - if (MPI_SUCCESS != (mpi_code = MPI_Waitall(n_io_concentrators, recv_reqs, MPI_STATUSES_IGNORE))) + if 
(MPI_SUCCESS != (mpi_code = MPI_Waitall(num_subfiles, recv_reqs, MPI_STATUSES_IGNORE))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Waitall", mpi_code); - for (int i = 0; i < n_io_concentrators; i++) { + for (int i = 0; i < num_subfiles; i++) { int ioc_rank = (int)recv_msg[3 * i]; HDassert(ioc_rank >= 0); HDassert(ioc_rank < n_io_concentrators); - HDassert(sf_eofs[ioc_rank] == -1); + HDassert(sf_eofs[i] == -1); - sf_eofs[ioc_rank] = recv_msg[(3 * i) + 1]; + sf_eofs[i] = recv_msg[(3 * i) + 1]; } /* 4) After all IOCs have replied, compute the offset of @@ -333,21 +339,21 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr) * EOF. */ - for (int i = 0; i < n_io_concentrators; i++) { + for (int i = 0; i < num_subfiles; i++) { /* compute number of complete stripes */ sf_logical_eof = sf_eofs[i] / sf_context->sf_stripe_size; /* multiply by stripe size */ - sf_logical_eof *= sf_context->sf_stripe_size * n_io_concentrators; + sf_logical_eof *= sf_context->sf_stripe_size * num_subfiles; - /* if the sub-file doesn't end on a stripe size boundary, must add in a partial stripe */ + /* if the subfile doesn't end on a stripe size boundary, must add in a partial stripe */ if (sf_eofs[i] % sf_context->sf_stripe_size > 0) { /* add in the size of the partial stripe up to but not including this subfile */ sf_logical_eof += i * sf_context->sf_stripe_size; - /* finally, add in the number of bytes in the last partial stripe depth in the sub-file */ + /* finally, add in the number of bytes in the last partial stripe depth in the subfile */ sf_logical_eof += sf_eofs[i] % sf_context->sf_stripe_size; } @@ -365,7 +371,7 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr) done: if (ret_value < 0) { - for (int i = 0; i < n_io_concentrators; i++) { + for (int i = 0; i < num_subfiles; i++) { if (recv_reqs && (recv_reqs[i] != MPI_REQUEST_NULL)) { if (MPI_SUCCESS != (mpi_code = MPI_Cancel(&recv_reqs[i]))) H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Cancel", 
mpi_code); diff --git a/src/H5FDsubfiling/H5FDsubfiling.c b/src/H5FDsubfiling/H5FDsubfiling.c index 8fe8f77..33a57e9 100644 --- a/src/H5FDsubfiling/H5FDsubfiling.c +++ b/src/H5FDsubfiling/H5FDsubfiling.c @@ -91,7 +91,6 @@ static hbool_t H5FD_mpi_self_initialized = FALSE; typedef struct H5FD_subfiling_t { H5FD_t pub; /* public stuff, must be first */ - int fd; /* the filesystem file descriptor */ H5FD_subfiling_config_t fa; /* driver-specific file access properties */ /* MPI Info */ @@ -102,8 +101,10 @@ typedef struct H5FD_subfiling_t { int mpi_size; H5FD_t *sf_file; + H5FD_t *stub_file; - int64_t context_id; /* The value used to lookup a subfiling context for the file */ + uint64_t file_id; + int64_t context_id; /* The value used to lookup a subfiling context for the file */ char *file_dir; /* Directory where we find files */ char *file_path; /* The user defined filename */ @@ -146,6 +147,9 @@ typedef struct H5FD_subfiling_t { /* Prototypes */ static herr_t H5FD__subfiling_term(void); +static hsize_t H5FD__subfiling_sb_size(H5FD_t *_file); +static herr_t H5FD__subfiling_sb_encode(H5FD_t *_file, char *name, unsigned char *buf); +static herr_t H5FD__subfiling_sb_decode(H5FD_t *_file, const char *name, const unsigned char *buf); static void *H5FD__subfiling_fapl_get(H5FD_t *_file); static void *H5FD__subfiling_fapl_copy(const void *_old_fa); static herr_t H5FD__subfiling_fapl_free(void *_fa); @@ -182,8 +186,8 @@ static herr_t H5FD__subfiling_close_int(H5FD_subfiling_t *file_ptr); static herr_t init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_nelemts, size_t dtype_extent, size_t max_iovec_len, int64_t *mem_buf_offset, - int64_t *target_file_offset, int64_t *io_block_len, int *first_ioc_index, - int *n_iocs_used, int64_t *max_io_req_per_ioc); + int64_t *target_file_offset, int64_t *io_block_len, int *first_subfile_index, + int *n_subfiles_used, int64_t *max_io_req_per_subfile); static herr_t iovec_fill_first(subfiling_context_t *sf_context, 
int64_t iovec_depth, int64_t target_datasize, int64_t start_mem_offset, int64_t start_file_offset, int64_t first_io_len, int64_t *mem_offset_out, int64_t *target_file_offset_out, @@ -211,9 +215,9 @@ static const H5FD_class_t H5FD_subfiling_g = { MAXADDR, /* maxaddr */ H5F_CLOSE_WEAK, /* fc_degree */ H5FD__subfiling_term, /* terminate */ - NULL, /* sb_size */ - NULL, /* sb_encode */ - NULL, /* sb_decode */ + H5FD__subfiling_sb_size, /* sb_size */ + H5FD__subfiling_sb_encode, /* sb_encode */ + H5FD__subfiling_sb_decode, /* sb_decode */ sizeof(H5FD_subfiling_config_t), /* fapl_size */ H5FD__subfiling_fapl_get, /* fapl_get */ H5FD__subfiling_fapl_copy, /* fapl_copy */ @@ -326,6 +330,18 @@ H5FD_subfiling_init(void) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, H5I_INVALID_HID, "can't register atexit handler for MPI_Finalize"); } + + /* + * Create the MPI Datatype that will be used + * for sending/receiving RPC messages + */ + HDcompile_assert(sizeof(((sf_work_request_t *)NULL)->header) == 3 * sizeof(int64_t)); + if (H5_subfiling_rpc_msg_type == MPI_DATATYPE_NULL) { + if (MPI_SUCCESS != (mpi_code = MPI_Type_contiguous(3, MPI_INT64_T, &H5_subfiling_rpc_msg_type))) + H5_SUBFILING_MPI_GOTO_ERROR(H5I_INVALID_HID, "MPI_Type_contiguous failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&H5_subfiling_rpc_msg_type))) + H5_SUBFILING_MPI_GOTO_ERROR(H5I_INVALID_HID, "MPI_Type_commit failed", mpi_code); + } } /* Set return value */ @@ -350,6 +366,18 @@ H5FD__subfiling_term(void) herr_t ret_value = SUCCEED; if (H5FD_SUBFILING_g >= 0) { + int mpi_code; + + /* Free RPC message MPI Datatype */ + if (H5_subfiling_rpc_msg_type != MPI_DATATYPE_NULL) + if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&H5_subfiling_rpc_msg_type))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Type_free failed", mpi_code); + + /* Clean up resources */ + if (H5_subfiling_terminate() < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, + "can't cleanup internal subfiling resources"); + /* 
Unregister from HDF5 error API */ if (H5subfiling_err_class_g >= 0) { if (H5Eunregister_class(H5subfiling_err_class_g) < 0) @@ -402,6 +430,9 @@ H5Pset_fapl_subfiling(hid_t fapl_id, const H5FD_subfiling_config_t *vfd_config) { H5FD_subfiling_config_t *subfiling_conf = NULL; H5P_genplist_t *plist = NULL; + H5P_genplist_t *ioc_plist = NULL; + MPI_Comm comm = MPI_COMM_NULL; + MPI_Info info = MPI_INFO_NULL; herr_t ret_value = SUCCEED; /*NO TRACE*/ @@ -427,12 +458,38 @@ H5Pset_fapl_subfiling(hid_t fapl_id, const H5FD_subfiling_config_t *vfd_config) vfd_config = subfiling_conf; } + /* Check if any MPI parameters were set on the FAPL */ + if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI communicator from plist"); + if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI info from plist"); + if (comm == MPI_COMM_NULL) + comm = MPI_COMM_WORLD; + + /* Set MPI parameters on IOC FAPL */ + if (NULL == (ioc_plist = H5P_object_verify(vfd_config->ioc_fapl_id, H5P_FILE_ACCESS))) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list"); + if (H5P_set(ioc_plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI communicator on plist"); + if (H5P_set(ioc_plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI info on plist"); + if (H5FD__subfiling_validate_config(vfd_config) < 0) H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling VFD configuration"); + /* Set Subfiling configuration on IOC FAPL */ + if (H5_subfiling_set_config_prop(ioc_plist, &vfd_config->shared_cfg) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, + "can't set subfiling configuration on IOC FAPL"); + ret_value = H5P_set_driver(plist, H5FD_SUBFILING, vfd_config, NULL); 
done: + if (H5_mpi_comm_free(&comm) < 0) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI Communicator"); + if (H5_mpi_info_free(&info) < 0) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI Info object"); + if (subfiling_conf) { if (subfiling_conf->ioc_fapl_id >= 0 && H5I_dec_ref(subfiling_conf->ioc_fapl_id) < 0) H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTDEC, FAIL, "can't close IOC FAPL"); @@ -516,7 +573,7 @@ H5FD__subfiling_get_default_config(hid_t fapl_id, H5FD_subfiling_config_t *confi config_out->shared_cfg.ioc_selection = SELECT_IOC_ONE_PER_NODE; config_out->shared_cfg.stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; - config_out->shared_cfg.stripe_count = 0; + config_out->shared_cfg.stripe_count = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT; if ((h5_require_ioc = HDgetenv("H5_REQUIRE_IOC")) != NULL) { int value_check = HDatoi(h5_require_ioc); @@ -553,9 +610,9 @@ H5FD__subfiling_get_default_config(hid_t fapl_id, H5FD_subfiling_config_t *confi done: if (H5_mpi_comm_free(&comm) < 0) - H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTFREE, FAIL, "can't free MPI Communicator"); + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI Communicator"); if (H5_mpi_info_free(&info) < 0) - H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTFREE, FAIL, "can't free MPI Info object"); + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI Info object"); if (ret_value < 0) { if (config_out->ioc_fapl_id >= 0 && H5Pclose(config_out->ioc_fapl_id) < 0) @@ -603,15 +660,193 @@ H5FD__subfiling_validate_config(const H5FD_subfiling_config_t *fa) H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "Subfiling VFD currently always requires IOC VFD to be used"); - if (fa->shared_cfg.ioc_selection < SELECT_IOC_ONE_PER_NODE || - fa->shared_cfg.ioc_selection >= ioc_selection_options) - H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid IOC selection method"); + if (H5_subfiling_validate_config(&fa->shared_cfg) < 0) + 
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling configuration parameters"); done: H5_SUBFILING_FUNC_LEAVE; } /* end H5FD__subfiling_validate_config() */ /*------------------------------------------------------------------------- + * Function: H5FD__subfiling_sb_size + * + * Purpose: Returns the size of the subfiling configuration information + * to be stored in the superblock. + * + * Return: Size of subfiling configuration information (never fails) + *------------------------------------------------------------------------- + */ +static hsize_t +H5FD__subfiling_sb_size(H5FD_t *_file) +{ + H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; + hsize_t ret_value = 0; + + HDassert(file); + + /* Configuration structure magic number */ + ret_value += sizeof(uint32_t); + + /* Configuration structure version number */ + ret_value += sizeof(uint32_t); + + /* "Require IOC" field */ + ret_value += sizeof(int32_t); + + /* Subfiling stripe size */ + ret_value += sizeof(int64_t); + + /* Subfiling stripe count (encoded as int64_t for future) */ + ret_value += sizeof(int64_t); + + /* Add superblock information from IOC file if necessary */ + if (file->sf_file) { + /* Encode the IOC's name into the subfiling information */ + ret_value += 9; + + ret_value += H5FD_sb_size(file->sf_file); + } + + H5_SUBFILING_FUNC_LEAVE; +} /* end H5FD__subfiling_sb_size() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__subfiling_sb_encode + * + * Purpose: Encodes the subfiling configuration information into the + * specified buffer. 
+ * + * Return: Non-negative on success/Negative on failure + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__subfiling_sb_encode(H5FD_t *_file, char *name, unsigned char *buf) +{ + subfiling_context_t *sf_context = NULL; + H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; + uint8_t *p = (uint8_t *)buf; + int64_t tmp64; + int32_t tmp32; + herr_t ret_value = SUCCEED; + + if (NULL == (sf_context = H5_get_subfiling_object(file->context_id))) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling context object"); + + /* Encode driver name */ + HDstrncpy(name, "Subfilin", 9); + name[8] = '\0'; + + /* Encode configuration structure magic number */ + UINT32ENCODE(p, file->fa.magic); + + /* Encode configuration structure version number */ + UINT32ENCODE(p, file->fa.version); + + /* Encode "require IOC" field */ + tmp32 = (int32_t)file->fa.require_ioc; + INT32ENCODE(p, tmp32); + + /* Encode subfiling stripe size */ + INT64ENCODE(p, sf_context->sf_stripe_size); + + /* Encode subfiling stripe count (number of subfiles) */ + tmp64 = sf_context->sf_num_subfiles; + INT64ENCODE(p, tmp64); + + /* Encode IOC VFD configuration information if necessary */ + if (file->sf_file) { + char ioc_name[9]; + + HDmemset(ioc_name, 0, sizeof(ioc_name)); + + if (H5FD_sb_encode(file->sf_file, ioc_name, p + 9) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTENCODE, FAIL, + "unable to encode IOC VFD's superblock information"); + + /* Copy the IOC VFD's name into our buffer */ + HDmemcpy(p, ioc_name, 9); + } + +done: + H5_SUBFILING_FUNC_LEAVE; +} /* end H5FD__subfiling_sb_encode() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__subfiling_sb_decode + * + * Purpose: Decodes the subfiling configuration information from the + * specified buffer. 
+ * + * Return: Non-negative on success/Negative on failure + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__subfiling_sb_decode(H5FD_t *_file, const char *name, const unsigned char *buf) +{ + subfiling_context_t *sf_context = NULL; + H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; + const uint8_t *p = (const uint8_t *)buf; + int64_t tmp64; + int32_t tmp32; + herr_t ret_value = SUCCEED; + + if (NULL == (sf_context = H5_get_subfiling_object(file->context_id))) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling context object"); + + if (HDstrncmp(name, "Subfilin", 9)) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid driver name in superblock"); + + /* Decode configuration structure magic number */ + UINT32DECODE(p, file->fa.magic); + + /* Decode configuration structure version number */ + UINT32DECODE(p, file->fa.version); + + /* Decode "require IOC" field */ + INT32DECODE(p, tmp32); + file->fa.require_ioc = (hbool_t)tmp32; + + /* Decode subfiling stripe size */ + INT64DECODE(p, file->fa.shared_cfg.stripe_size); + + /* Decode subfiling stripe count */ + INT64DECODE(p, tmp64); + H5_CHECK_OVERFLOW(tmp64, int64_t, int32_t); + file->fa.shared_cfg.stripe_count = (int32_t)tmp64; + + if (file->sf_file) { + char ioc_name[9]; + + HDmemcpy(ioc_name, p, 9); + p += 9; + + if (H5FD_sb_load(file->sf_file, ioc_name, p) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTDECODE, FAIL, + "unable to decode IOC VFD's superblock information"); + } + + /* Validate the decoded configuration */ + if (H5FD__subfiling_validate_config(&file->fa) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "decoded subfiling configuration info is invalid"); + + if (file->fa.shared_cfg.stripe_size != sf_context->sf_stripe_size) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "specified subfiling stripe size (%" PRId64 + ") doesn't match value stored in file (%" PRId64 ")", + 
sf_context->sf_stripe_size, file->fa.shared_cfg.stripe_size); + + if (file->fa.shared_cfg.stripe_count != sf_context->sf_num_subfiles) + H5_SUBFILING_GOTO_ERROR( + H5E_VFL, H5E_BADVALUE, FAIL, + "specified subfiling stripe count (%d) doesn't match value stored in file (%" PRId32 ")", + sf_context->sf_num_subfiles, file->fa.shared_cfg.stripe_count); + +done: + H5_SUBFILING_FUNC_LEAVE; +} /* end H5FD__subfiling_sb_decode() */ + +/*------------------------------------------------------------------------- * Function: H5FD__subfiling_fapl_get * * Purpose: Gets a file access property list which could be used to @@ -797,7 +1032,6 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma H5FD_driver_prop_t driver_prop; /* Property for driver ID & info */ hbool_t bcasted_eof = FALSE; int64_t sf_eof = -1; - void *file_handle = NULL; int mpi_code; /* MPI return code */ H5FD_t *ret_value = NULL; @@ -813,6 +1047,7 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTALLOC, NULL, "unable to allocate file struct"); file_ptr->comm = MPI_COMM_NULL; file_ptr->info = MPI_INFO_NULL; + file_ptr->file_id = UINT64_MAX; file_ptr->context_id = -1; file_ptr->fa.ioc_fapl_id = H5I_INVALID_HID; file_ptr->ext_comm = MPI_COMM_NULL; @@ -868,33 +1103,6 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "can't copy FAPL"); } - if (NULL != (file_ptr->file_path = HDrealpath(name, NULL))) { - char *path = NULL; - - if (NULL == (path = H5MM_strdup(file_ptr->file_path))) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCOPY, NULL, "can't copy subfiling subfile path"); - if (H5_dirname(path, &file_ptr->file_dir) < 0) { - H5MM_free(path); - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "couldn't get subfile dirname"); - } - - H5MM_free(path); - } - else { - if (ENOENT == errno) { - if (NULL == (file_ptr->file_path = 
HDstrdup(name))) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCOPY, NULL, "can't copy file name"); - if (NULL == (file_ptr->file_dir = H5MM_strdup("."))) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "can't set subfile directory path"); - } - else - H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't resolve subfile path"); - } - - file_ptr->sf_file = H5FD_open(name, flags, file_ptr->fa.ioc_fapl_id, HADDR_UNDEF); - if (!file_ptr->sf_file) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "unable to open IOC file"); - /* Check the "native" driver (IOC/sec2/etc.) */ if (NULL == (plist_ptr = H5I_object(file_ptr->fa.ioc_fapl_id))) H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_BADVALUE, NULL, "invalid IOC FAPL"); @@ -905,17 +1113,36 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "invalid driver ID in file access property list"); - if (driver->value != H5_VFD_IOC && driver->value != H5_VFD_SEC2) - H5_SUBFILING_GOTO_ERROR( - H5E_FILE, H5E_CANTOPENFILE, NULL, - "unable to open file '%s' - only IOC and Sec2 VFDs are currently supported for subfiles", name); + if (driver->value != H5_VFD_IOC) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, + "unable to open file '%s' - only IOC VFD is currently supported for subfiles", + name); + + /* Fully resolve the given filepath and get its dirname */ + if (H5_resolve_pathname(name, file_ptr->comm, &file_ptr->file_path) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't resolve filepath"); + if (H5_dirname(file_ptr->file_path, &file_ptr->file_dir) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get filepath dirname"); + + /* + * Create/open the HDF5 stub file and get its inode value for + * the internal mapping from file inode to subfiling context. 
+ */ + if (H5_open_subfiling_stub_file(file_ptr->file_path, flags, file_ptr->comm, &file_ptr->stub_file, + &file_ptr->file_id) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "can't open HDF5 stub file"); - if (H5FDget_vfd_handle(file_ptr->sf_file, file_ptr->fa.ioc_fapl_id, &file_handle) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "can't get file handle"); + /* Set stub file ID on IOC fapl so it can reuse on open */ + if (H5_subfiling_set_file_id_prop(plist_ptr, file_ptr->file_id) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, NULL, "can't set stub file ID on FAPL"); + + /* Open the HDF5 file's subfiles */ + if (NULL == (file_ptr->sf_file = H5FD_open(name, flags, file_ptr->fa.ioc_fapl_id, HADDR_UNDEF))) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "unable to open IOC file"); if (driver->value == H5_VFD_IOC) { /* Get a copy of the context ID for later use */ - file_ptr->context_id = H5_subfile_fhandle_to_context(file_handle); + file_ptr->context_id = H5_subfile_fid_to_context(file_ptr->file_id); file_ptr->fa.require_ioc = true; } else if (driver->value == H5_VFD_SEC2) { @@ -935,7 +1162,7 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma * context ID will be returned, which is used for * further interactions with this file's subfiles. 
*/ - if (H5_open_subfiles(file_ptr->file_path, file_handle, &file_ptr->fa.shared_cfg, ioc_flags, + if (H5_open_subfiles(file_ptr->file_path, file_ptr->file_id, &file_ptr->fa.shared_cfg, ioc_flags, file_ptr->comm, &file_ptr->context_id) < 0) H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open subfiling files = %s\n", name); @@ -946,8 +1173,10 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma sf_eof = -1; } - if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file_ptr->comm))) - H5_SUBFILING_MPI_GOTO_ERROR(NULL, "MPI_Bcast", mpi_code); + if (file_ptr->mpi_size > 1) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file_ptr->comm))) + H5_SUBFILING_MPI_GOTO_ERROR(NULL, "MPI_Bcast", mpi_code); + } bcasted_eof = TRUE; @@ -971,8 +1200,10 @@ done: if (!bcasted_eof) { sf_eof = -1; - if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file_ptr->comm))) - H5_SUBFILING_MPI_DONE_ERROR(NULL, "MPI_Bcast failed", mpi_code); + if (file_ptr->mpi_size > 1) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file_ptr->comm))) + H5_SUBFILING_MPI_DONE_ERROR(NULL, "MPI_Bcast failed", mpi_code); + } } } @@ -993,11 +1224,8 @@ H5FD__subfiling_close_int(H5FD_subfiling_t *file_ptr) if (file_ptr->sf_file && H5FD_close(file_ptr->sf_file) < 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close subfile"); - - if (!file_ptr->fa.require_ioc) { - if (file_ptr->context_id >= 0 && H5_free_subfiling_object(file_ptr->context_id) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free subfiling context object"); - } + if (file_ptr->stub_file && H5FD_close(file_ptr->stub_file) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close HDF5 stub file"); /* if set, close the copy of the plist for the underlying VFD. 
*/ if ((file_ptr->fa.ioc_fapl_id >= 0) && (H5I_dec_ref(file_ptr->fa.ioc_fapl_id) < 0)) @@ -1107,7 +1335,6 @@ H5FD__subfiling_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags / *flags |= H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */ *flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */ *flags |= H5FD_FEAT_HAS_MPI; /* This driver uses MPI */ - *flags |= H5FD_FEAT_ALLOCATE_EARLY; /* Allocate space early instead of late */ } H5_SUBFILING_FUNC_LEAVE_API; @@ -1151,15 +1378,22 @@ H5FD__subfiling_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type) *------------------------------------------------------------------------- */ static herr_t -H5FD__subfiling_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t addr) +H5FD__subfiling_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr) { H5FD_subfiling_t *file_ptr = (H5FD_subfiling_t *)_file; herr_t ret_value = SUCCEED; file_ptr->eoa = addr; + /* Set EOA for HDF5 stub file */ + if (file_ptr->mpi_rank == 0) { + if (H5FD_set_eoa(file_ptr->stub_file, type, addr) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "can't set HDF5 stub file EOA"); + } + ret_value = H5FD_set_eoa(file_ptr->sf_file, type, addr); +done: H5_SUBFILING_FUNC_LEAVE_API; } /* end H5FD__subfiling_set_eoa() */ @@ -1208,7 +1442,7 @@ H5FD__subfiling_get_handle(H5FD_t *_file, hid_t H5_ATTR_UNUSED fapl, void **file if (!file_handle) H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file handle not valid"); - *file_handle = &(file->fd); + H5FD_get_vfd_handle(file->sf_file, file->fa.ioc_fapl_id, file_handle); done: H5_SUBFILING_FUNC_LEAVE_API; @@ -1230,7 +1464,7 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, +H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, 
size_t size, void *buf /*out*/) { subfiling_context_t *sf_context = NULL; @@ -1243,7 +1477,7 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr int64_t *sf_data_size = NULL; int64_t *sf_offset = NULL; hbool_t rank0_bcast = FALSE; - int ioc_total; + int num_subfiles; herr_t ret_value = SUCCEED; HDassert(file_ptr && file_ptr->pub.cls); @@ -1286,7 +1520,7 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr /* * Retrieve the subfiling context object and the number - * of I/O concentrators. + * of subfiles. * * Given the current I/O and the I/O concentrator info, * we can determine some I/O transaction parameters. @@ -1300,50 +1534,50 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr HDassert(sf_context); HDassert(sf_context->topology); - ioc_total = sf_context->topology->n_io_concentrators; + num_subfiles = sf_context->sf_num_subfiles; - if (ioc_total == 0) { - H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid number of I/O concentrators (%d)", - ioc_total); + if (num_subfiles <= 0) { + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid number of subfiles (%d)", + num_subfiles); } - else if (ioc_total == 1) { - /*********************************** - * No striping - just a single IOC * - ***********************************/ + else if (num_subfiles == 1) { + /*************************************** + * No striping - just a single subfile * + ***************************************/ /* Make vector read call to subfile */ - if (H5FDread_vector(file_ptr->sf_file, dxpl_id, 1, &type, &addr, &size, &buf) < 0) + if (H5FD_read_vector(file_ptr->sf_file, 1, &type, &addr, &size, &buf) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "read from subfile failed"); } else { - int64_t max_io_req_per_ioc; + int64_t max_io_req_per_subfile; int64_t file_offset; int64_t block_size; size_t max_depth; herr_t status; - int ioc_count = 0; - int ioc_start = -1; + 
int num_subfiles_used = 0; + int first_subfile_idx = -1; - /********************************* - * Striping across multiple IOCs * - *********************************/ + /************************************* + * Striping across multiple subfiles * + *************************************/ block_size = sf_context->sf_blocksize_per_stripe; max_depth = (size / (size_t)block_size) + 2; /* - * Given the number of I/O concentrators, allocate vectors (one per IOC) - * to contain the translation of the I/O request into a collection of I/O - * requests. + * Given the number of subfiles, allocate vectors (one per subfile) + * to contain the translation of the I/O request into a collection of + * I/O requests. */ - if (NULL == - (source_data_offset = HDcalloc(1, (size_t)ioc_total * max_depth * sizeof(*source_data_offset)))) + if (NULL == (source_data_offset = + HDcalloc(1, (size_t)num_subfiles * max_depth * sizeof(*source_data_offset)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate source data offset I/O vector"); - if (NULL == (sf_data_size = HDcalloc(1, (size_t)ioc_total * max_depth * sizeof(*sf_data_size)))) + if (NULL == (sf_data_size = HDcalloc(1, (size_t)num_subfiles * max_depth * sizeof(*sf_data_size)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate subfile data size I/O vector"); - if (NULL == (sf_offset = HDcalloc(1, (size_t)ioc_total * max_depth * sizeof(*sf_offset)))) + if (NULL == (sf_offset = HDcalloc(1, (size_t)num_subfiles * max_depth * sizeof(*sf_offset)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate subfile offset I/O vector"); @@ -1351,31 +1585,27 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr /* * Get the potential set of IOC transactions; e.g., data sizes, - * offsets and datatypes. These can all be used by either the - * underlying IOC or by the sec2 driver. - * - * For now, assume we're dealing with contiguous datasets. 
Vector - * I/O will probably handle the non-contiguous case. + * offsets and datatypes. */ - status = init_indep_io(sf_context, /* IN: Context used to look up config info */ - file_offset, /* IN: Starting file offset */ - size, /* IN: I/O size */ - 1, /* IN: Data extent of the 'type' assumes byte */ - max_depth, /* IN: Maximum stripe depth */ - source_data_offset, /* OUT: Memory offset */ - sf_offset, /* OUT: File offset */ - sf_data_size, /* OUT: Length of this contiguous block */ - &ioc_start, /* OUT: IOC index corresponding to starting offset */ - &ioc_count, /* OUT: Number of actual IOCs used */ - &max_io_req_per_ioc); /* OUT: Maximum number of requests to any IOC */ + status = init_indep_io(sf_context, /* IN: Context used to look up config info */ + file_offset, /* IN: Starting file offset */ + size, /* IN: I/O size */ + 1, /* IN: Data extent of the 'type' assumes byte */ + max_depth, /* IN: Maximum stripe depth */ + source_data_offset, /* OUT: Memory offset */ + sf_offset, /* OUT: File offset */ + sf_data_size, /* OUT: Length of this contiguous block */ + &first_subfile_idx, /* OUT: Subfile index corresponding to starting offset */ + &num_subfiles_used, /* OUT: Number of actual subfiles used */ + &max_io_req_per_subfile); /* OUT: Maximum number of requests to any subfile */ if (status < 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't initialize IOC transactions"); - if (max_io_req_per_ioc > 0) { + if (max_io_req_per_subfile > 0) { uint32_t vector_len; - H5_CHECKED_ASSIGN(vector_len, uint32_t, ioc_count, int); + H5_CHECKED_ASSIGN(vector_len, uint32_t, num_subfiles_used, int); /* Allocate I/O vectors */ if (NULL == (io_types = HDmalloc(vector_len * sizeof(*io_types)))) @@ -1391,20 +1621,20 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate subfile I/O buffers vector"); - for (int64_t i = 0; i < max_io_req_per_ioc; i++) { - uint32_t 
final_vec_len = vector_len; - int next_ioc = ioc_start; + for (int64_t i = 0; i < max_io_req_per_subfile; i++) { + uint32_t final_vec_len = vector_len; + int next_subfile_idx = first_subfile_idx; /* Fill in I/O types, offsets, sizes and buffers vectors */ for (uint32_t k = 0, vec_idx = 0; k < vector_len; k++) { - size_t idx = (size_t)next_ioc * max_depth + (size_t)i; + size_t idx = (size_t)next_subfile_idx * max_depth + (size_t)i; io_types[vec_idx] = type; H5_CHECKED_ASSIGN(io_addrs[vec_idx], haddr_t, sf_offset[idx], int64_t); H5_CHECKED_ASSIGN(io_sizes[vec_idx], size_t, sf_data_size[idx], int64_t); io_bufs[vec_idx] = ((char *)buf + source_data_offset[idx]); - next_ioc = (next_ioc + 1) % ioc_total; + next_subfile_idx = (next_subfile_idx + 1) % num_subfiles; /* Skip 0-sized I/Os */ if (io_sizes[vec_idx] == 0) { @@ -1417,13 +1647,13 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr if (!rank0_bcast || (file_ptr->mpi_rank == 0)) { /* Make vector read call to subfile */ - if (H5FDread_vector(file_ptr->sf_file, dxpl_id, final_vec_len, io_types, io_addrs, - io_sizes, io_bufs) < 0) + if (H5FD_read_vector(file_ptr->sf_file, final_vec_len, io_types, io_addrs, io_sizes, + io_bufs) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "read from subfile failed"); } } - if (rank0_bcast) { + if (rank0_bcast && (file_ptr->mpi_size > 1)) { H5_CHECK_OVERFLOW(size, size_t, int); if (MPI_SUCCESS != MPI_Bcast(buf, (int)size, MPI_BYTE, 0, file_ptr->comm)) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "can't broadcast data from rank 0"); @@ -1470,7 +1700,7 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, +H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size, const void *buf /*in*/) { subfiling_context_t *sf_context = NULL; @@ -1482,7 
+1712,7 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add int64_t *source_data_offset = NULL; int64_t *sf_data_size = NULL; int64_t *sf_offset = NULL; - int ioc_total; + int num_subfiles; herr_t ret_value = SUCCEED; HDassert(file_ptr && file_ptr->pub.cls); @@ -1522,7 +1752,7 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add /* * Retrieve the subfiling context object and the number - * of I/O concentrators. + * of subfiles. * * Given the current I/O and the I/O concentrator info, * we can determine some I/O transaction parameters. @@ -1536,50 +1766,61 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add HDassert(sf_context); HDassert(sf_context->topology); - ioc_total = sf_context->topology->n_io_concentrators; + num_subfiles = sf_context->sf_num_subfiles; - if (ioc_total == 0) { - H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid number of I/O concentrators (%d)", - ioc_total); + if (num_subfiles <= 0) { + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid number of subfiles (%d)", + num_subfiles); } - else if (ioc_total == 1) { - /*********************************** - * No striping - just a single IOC * - ***********************************/ + else if (num_subfiles == 1) { + /*************************************** + * No striping - just a single subfile * + ***************************************/ /* Make vector write call to subfile */ - if (H5FDwrite_vector(file_ptr->sf_file, dxpl_id, 1, &type, &addr, &size, &buf) < 0) + if (H5FD_write_vector(file_ptr->sf_file, 1, &type, &addr, &size, &buf) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "write to subfile failed"); + + /* + * Mirror superblock writes to the stub file so that + * legacy HDF5 applications can check what type of + * file they are reading + */ + if ((type == H5FD_MEM_SUPER) && (file_ptr->mpi_rank == 0)) { + if (H5FD_write_vector(file_ptr->stub_file, 1, &type, 
&addr, &size, &buf) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, + "couldn't write superblock information to stub file"); + } } else { - int64_t max_io_req_per_ioc; + int64_t max_io_req_per_subfile; int64_t file_offset; int64_t block_size; size_t max_depth; herr_t status; - int ioc_count = 0; - int ioc_start = -1; + int num_subfiles_used = 0; + int first_subfile_idx = -1; - /********************************* - * Striping across multiple IOCs * - *********************************/ + /************************************* + * Striping across multiple subfiles * + *************************************/ block_size = sf_context->sf_blocksize_per_stripe; max_depth = (size / (size_t)block_size) + 2; /* - * Given the number of I/O concentrators, allocate vectors (one per IOC) - * to contain the translation of the I/O request into a collection of I/O - * requests. + * Given the number of subfiles, allocate vectors (one per subfile) + * to contain the translation of the I/O request into a collection of + * I/O requests. 
*/ - if (NULL == - (source_data_offset = HDcalloc(1, (size_t)ioc_total * max_depth * sizeof(*source_data_offset)))) + if (NULL == (source_data_offset = + HDcalloc(1, (size_t)num_subfiles * max_depth * sizeof(*source_data_offset)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate source data offset I/O vector"); - if (NULL == (sf_data_size = HDcalloc(1, (size_t)ioc_total * max_depth * sizeof(*sf_data_size)))) + if (NULL == (sf_data_size = HDcalloc(1, (size_t)num_subfiles * max_depth * sizeof(*sf_data_size)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate subfile data size I/O vector"); - if (NULL == (sf_offset = HDcalloc(1, (size_t)ioc_total * max_depth * sizeof(*sf_offset)))) + if (NULL == (sf_offset = HDcalloc(1, (size_t)num_subfiles * max_depth * sizeof(*sf_offset)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate subfile offset I/O vector"); @@ -1587,31 +1828,27 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add /* * Get the potential set of IOC transactions; e.g., data sizes, - * offsets and datatypes. These can all be used by either the - * underlying IOC or by the sec2 driver. - * - * For now, assume we're dealing with contiguous datasets. Vector - * I/O will probably handle the non-contiguous case. + * offsets and datatypes. 
*/ - status = init_indep_io(sf_context, /* IN: Context used to look up config info */ - file_offset, /* IN: Starting file offset */ - size, /* IN: I/O size */ - 1, /* IN: Data extent of the 'type' assumes byte */ - max_depth, /* IN: Maximum stripe depth */ - source_data_offset, /* OUT: Memory offset */ - sf_offset, /* OUT: File offset */ - sf_data_size, /* OUT: Length of this contiguous block */ - &ioc_start, /* OUT: IOC index corresponding to starting offset */ - &ioc_count, /* OUT: Number of actual IOCs used */ - &max_io_req_per_ioc); /* OUT: Maximum number of requests to any IOC */ + status = init_indep_io(sf_context, /* IN: Context used to look up config info */ + file_offset, /* IN: Starting file offset */ + size, /* IN: I/O size */ + 1, /* IN: Data extent of the 'type' assumes byte */ + max_depth, /* IN: Maximum stripe depth */ + source_data_offset, /* OUT: Memory offset */ + sf_offset, /* OUT: File offset */ + sf_data_size, /* OUT: Length of this contiguous block */ + &first_subfile_idx, /* OUT: Subfile index corresponding to starting offset */ + &num_subfiles_used, /* OUT: Number of actual subfiles used */ + &max_io_req_per_subfile); /* OUT: Maximum number of requests to any subfile */ if (status < 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't initialize IOC transactions"); - if (max_io_req_per_ioc > 0) { + if (max_io_req_per_subfile > 0) { uint32_t vector_len; - H5_CHECKED_ASSIGN(vector_len, uint32_t, ioc_count, int); + H5_CHECKED_ASSIGN(vector_len, uint32_t, num_subfiles_used, int); /* Allocate I/O vectors */ if (NULL == (io_types = HDmalloc(vector_len * sizeof(*io_types)))) @@ -1627,20 +1864,20 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate subfile I/O buffers vector"); - for (int64_t i = 0; i < max_io_req_per_ioc; i++) { - uint32_t final_vec_len = vector_len; - int next_ioc = ioc_start; + for (int64_t i = 0; i < 
max_io_req_per_subfile; i++) { + uint32_t final_vec_len = vector_len; + int next_subfile_idx = first_subfile_idx; /* Fill in I/O types, offsets, sizes and buffers vectors */ for (uint32_t k = 0, vec_idx = 0; k < vector_len; k++) { - size_t idx = (size_t)next_ioc * max_depth + (size_t)i; + size_t idx = (size_t)next_subfile_idx * max_depth + (size_t)i; io_types[vec_idx] = type; H5_CHECKED_ASSIGN(io_addrs[vec_idx], haddr_t, sf_offset[idx], int64_t); H5_CHECKED_ASSIGN(io_sizes[vec_idx], size_t, sf_data_size[idx], int64_t); io_bufs[vec_idx] = ((const char *)buf + source_data_offset[idx]); - next_ioc = (next_ioc + 1) % ioc_total; + next_subfile_idx = (next_subfile_idx + 1) % num_subfiles; /* Skip 0-sized I/Os */ if (io_sizes[vec_idx] == 0) { @@ -1652,9 +1889,25 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add } /* Make vector write call to subfile */ - if (H5FDwrite_vector(file_ptr->sf_file, dxpl_id, final_vec_len, io_types, io_addrs, io_sizes, - io_bufs) < 0) + if (H5FD_write_vector(file_ptr->sf_file, final_vec_len, io_types, io_addrs, io_sizes, + io_bufs) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "write to subfile failed"); + + /* + * Mirror superblock writes to the stub file so that + * legacy HDF5 applications can check what type of + * file they are reading + */ + if (file_ptr->mpi_rank == 0) { + for (size_t count_idx = 0; count_idx < (size_t)final_vec_len; count_idx++) { + if (io_types[count_idx] == H5FD_MEM_SUPER) { + if (H5FD_write(file_ptr->stub_file, H5FD_MEM_SUPER, io_addrs[count_idx], + io_sizes[count_idx], io_bufs[count_idx]) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, + "couldn't write superblock information to stub file"); + } + } + } } } } @@ -2044,31 +2297,43 @@ H5FD__subfiling_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5 int64_t eoa; int mpi_code; - if (!H5CX_get_mpi_file_flushing()) - if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm))) - 
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + if (!H5CX_get_mpi_file_flushing()) { + if (file->mpi_size > 1) + if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + } if (0 == file->mpi_rank) { if (H5FD__subfiling__get_real_eof(file->context_id, &sf_eof) < 0) H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get EOF"); } - if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file->comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + if (file->mpi_size > 1) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file->comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } if (sf_eof < 0) H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid EOF"); H5_CHECKED_ASSIGN(eoa, int64_t, file->eoa, haddr_t); - /* truncate sub-files */ - /* This is a hack. We should be doing the truncate of the sub-files via calls to + /* truncate subfiles */ + /* This is a hack. We should be doing the truncate of the subfiles via calls to * H5FD_truncate() with the IOC. However, that system is messed up at present. * thus the following hack. 
* JRM -- 12/18/21 */ if (H5FD__subfiling__truncate_sub_files(file->context_id, eoa, file->comm) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "sub-file truncate request failed"); + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "subfile truncate request failed"); + +#if 0 /* TODO: Should be truncated only to size of superblock metadata */ + /* Truncate the HDF5 stub file */ + if (file->mpi_rank == 0) { + if (H5FD_truncate(file->stub_file, closing) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "stub file truncate request failed"); + } +#endif /* Reset last file I/O information */ file->pos = HADDR_UNDEF; @@ -2271,24 +2536,24 @@ done: * As a consequence of not allowing use of MPI derived * datatypes in the VFD layer, we need to accommodate the * possibility that large I/O transactions will be required to - * use multiple I/Os per IOC. + * use multiple I/Os per subfile. * - * Example: Using 4 IOCs, each with 1M stripe-depth; when - * presented an I/O request for 8MB then at a minimum each IOC - * will require 2 I/Os of 1MB each. Depending on the starting - * file offset, the 2 I/Os can instead be 3... + * Example: Using 4 subfiles, each with 1M stripe-depth; when + * presented an I/O request for 8MB then at a minimum each + * subfile will require 2 I/Os of 1MB each. Depending on the + * starting file offset, the 2 I/Os can instead be 3... * * To fully describe the I/O transactions for reads and writes * the output arrays are therefore arrays of I/O vectors, * where each vector has a length of which corresponds to the - * max number of I/O transactions per IOC. In the example + * max number of I/O transactions per subfile. In the example * above, these vector lengths can be 2 or 3. The actual * length is determined by the 'container_depth' variable. * - * For I/O operations which involve a subset of I/O - * concentrators, the vector entries for the unused I/O - * concentrators IOCs will have lengths of zero and be empty. 
- * The 'container_depth' in this case will always be 1. + * For I/O operations which involve a subset of subfiles, the + * vector entries for the unused subfiles will have lengths of + * zero and be empty. The 'container_depth' in this case will + * always be 1. * * sf_context (IN) * - the subfiling context for the file @@ -2308,37 +2573,37 @@ done: * the output arrays `mem_buf_offset`, `io_block_len` * and `sf_offset`. NOTE that this routine expects each * of these output arrays to have enough space allocated - * for one I/O vector PER I/O concentrator. Therefore, - * the total size of each output array should be at least - * `max_iovec_len * n_io_concentrators`. + * for one I/O vector PER subfile. Therefore, the total + * size of each output array should be at least + * `max_iovec_len * num_subfiles`. * * mem_buf_offset (OUT) - * - output array of vectors (one vector for each IOC) + * - output array of vectors (one vector for each subfile) * containing the set of offsets into the memory buffer * for I/O * * target_file_offset (OUT) - * - output array of vectors (one vector for each IOC) + * - output array of vectors (one vector for each subfile) * containing the set of offsets into the target file * * io_block_len (OUT) - * - output array of vectors (one vector for each IOC) + * - output array of vectors (one vector for each subfile) * containing the set of block lengths for each source * buffer/target file offset. 
* - * first_ioc_index (OUT) - * - the index of the first I/O concentrator that this I/O - * operation begins at + * first_subfile_index (OUT) + * - the index of the first subfile that this I/O operation + * begins at * - * n_iocs_used (OUT) - * - the number of I/O concentrators actually used for this - * I/O operation, which may be different from the total - * number of I/O concentrators for the file + * n_subfiles_used (OUT) + * - the number of subfiles actually used for this I/O + * operation, which may be different from the total + * number of subfiles for the file * - * max_io_req_per_ioc (OUT) + * max_io_req_per_subfile (OUT) * - the maximum number of I/O requests to any particular - * I/O concentrator, or the maximum "depth" of each I/O - * vector in the output arrays. + * subfile, or the maximum "depth" of each I/O vector + * in the output arrays. * * Return: Non-negative on success/Negative on failure * @@ -2347,7 +2612,8 @@ done: static herr_t init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_nelemts, size_t dtype_extent, size_t max_iovec_len, int64_t *mem_buf_offset, int64_t *target_file_offset, - int64_t *io_block_len, int *first_ioc_index, int *n_iocs_used, int64_t *max_io_req_per_ioc) + int64_t *io_block_len, int *first_subfile_index, int *n_subfiles_used, + int64_t *max_io_req_per_subfile) { int64_t stripe_size = 0; int64_t block_size = 0; @@ -2360,8 +2626,8 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne int64_t final_offset = 0; int64_t start_length = 0; int64_t final_length = 0; - int64_t ioc_start = 0; - int64_t ioc_final = 0; + int64_t first_subfile = 0; + int64_t last_subfile = 0; int64_t start_row = 0; int64_t row_offset = 0; int64_t row_stripe_idx_start = 0; @@ -2370,41 +2636,44 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne int64_t curr_max_iovec_depth = 0; int64_t total_bytes = 0; int64_t mem_offset = 0; - int ioc_count = 0; + int 
num_subfiles = 0; herr_t ret_value = SUCCEED; HDassert(sf_context); - HDassert(sf_context->topology); - HDassert(sf_context->topology->n_io_concentrators > 0); HDassert(sf_context->sf_stripe_size > 0); HDassert(sf_context->sf_blocksize_per_stripe > 0); + HDassert(sf_context->sf_num_subfiles > 0); + HDassert(sf_context->topology); HDassert(mem_buf_offset); HDassert(target_file_offset); HDassert(io_block_len); - HDassert(first_ioc_index); - HDassert(n_iocs_used); - HDassert(max_io_req_per_ioc); + HDassert(first_subfile_index); + HDassert(n_subfiles_used); + HDassert(max_io_req_per_subfile); - *first_ioc_index = 0; - *n_iocs_used = 0; - *max_io_req_per_ioc = 0; + *first_subfile_index = 0; + *n_subfiles_used = 0; + *max_io_req_per_subfile = 0; /* * Retrieve the needed fields from the subfiling context. * - * ioc_count - * - the total number of I/O concentrators in the - * application topology * stripe_size * - the size of the data striping across the file's subfiles * block_size * - the size of a "block" across the IOCs, as calculated - * by the stripe size multiplied by the number of I/O - * concentrators + * by the stripe size multiplied by the number of + * subfiles + * num_subfiles + * - the total number of subfiles for the logical + * HDF5 file + * num_io_concentrators + * - the number of I/O concentrators currently being + * used */ - ioc_count = sf_context->topology->n_io_concentrators; - stripe_size = sf_context->sf_stripe_size; - block_size = sf_context->sf_blocksize_per_stripe; + stripe_size = sf_context->sf_stripe_size; + block_size = sf_context->sf_blocksize_per_stripe; + num_subfiles = sf_context->sf_num_subfiles; H5_CHECKED_ASSIGN(data_size, int64_t, (io_nelemts * dtype_extent), size_t); @@ -2415,16 +2684,16 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne * - a stripe "index" given by the file offset divided by the * stripe size. Note that when the file offset equals or exceeds * the block size, we simply wrap around. 
So, for example, if 4 - * I/O concentrators are being used with a stripe size of 1MiB, - * the block size would be 4MiB and file offset 4096 would have - * a stripe index of 4 and reside in the same subfile as stripe - * index 0 (offsets 0-1023) + * subfiles are being used with a stripe size of 1MiB, the block + * size would be 4MiB and file offset 4096 would have a stripe + * index of 4 and reside in the same subfile as stripe index 0 + * (offsets 0-1023) * offset_in_stripe * - the relative offset in the stripe that the starting file * offset resides in * offset_in_block - * - the relative offset in the "block" of stripes across the I/O - * concentrators + * - the relative offset in the "block" of stripes across the + * subfiles * final_offset * - the last offset in the virtual file covered by this I/O * operation. Simply the I/O size added to the starting file @@ -2442,19 +2711,18 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne HDassert(final_length <= stripe_size); /* - * Determine which I/O concentrator the I/O request begins - * in and which "row" the I/O request begins in within the - * "block" of stripes across the I/O concentrators. Note that - * "row" here is just a conceptual way to think of how a block - * of data stripes is laid out across the I/O concentrator - * subfiles. A block's "column" size in bytes is equal to the - * stripe size multiplied the number of I/O concentrators. - * Therefore, file offsets that are multiples of the block size - * begin a new "row". + * Determine which subfile the I/O request begins in and which + * "row" the I/O request begins in within the "block" of stripes + * across the subfiles. Note that "row" here is just a conceptual + * way to think of how a block of data stripes is laid out across + * the subfiles. A block's "column" size in bytes is equal to the + * stripe size multiplied by the number of subfiles. 
Therefore, + * file offsets that are multiples of the block size begin a new + * "row". */ - start_row = stripe_idx / ioc_count; - ioc_start = stripe_idx % ioc_count; - H5_CHECK_OVERFLOW(ioc_start, int64_t, int); + start_row = stripe_idx / num_subfiles; + first_subfile = stripe_idx % num_subfiles; + H5_CHECK_OVERFLOW(first_subfile, int64_t, int); /* * Set initial file offset for starting "row" @@ -2464,53 +2732,52 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne /* * Determine the stripe "index" of the last offset in the - * virtual file and, from that, determine the I/O concentrator - * that the I/O request ends in. + * virtual file and, from that, determine the subfile that + * the I/O request ends in. */ final_stripe_idx = final_offset / stripe_size; - ioc_final = final_stripe_idx % ioc_count; + last_subfile = final_stripe_idx % num_subfiles; /* * Determine how "deep" the resulting I/O vectors are at * most by calculating the maximum number of "rows" spanned * for any particular subfile; e.g. 
the maximum number of - * I/O requests for any particular I/O concentrator + * I/O requests for any particular subfile */ - row_stripe_idx_start = stripe_idx - ioc_start; - row_stripe_idx_final = final_stripe_idx - ioc_final; - max_iovec_depth = ((row_stripe_idx_final - row_stripe_idx_start) / ioc_count) + 1; + row_stripe_idx_start = stripe_idx - first_subfile; + row_stripe_idx_final = final_stripe_idx - last_subfile; + max_iovec_depth = ((row_stripe_idx_final - row_stripe_idx_start) / num_subfiles) + 1; - if (ioc_final < ioc_start) + if (last_subfile < first_subfile) max_iovec_depth--; /* Set returned parameters early */ - *first_ioc_index = (int)ioc_start; - *n_iocs_used = ioc_count; - *max_io_req_per_ioc = max_iovec_depth; + *first_subfile_index = (int)first_subfile; + *n_subfiles_used = num_subfiles; + *max_io_req_per_subfile = max_iovec_depth; #ifdef H5_SUBFILING_DEBUG H5_subfiling_log(sf_context->sf_context_id, "%s: FILE OFFSET = %" PRId64 ", DATA SIZE = %zu, STRIPE SIZE = %" PRId64, __func__, file_offset, io_nelemts, stripe_size); H5_subfiling_log(sf_context->sf_context_id, - "%s: IOC START = %" PRId64 ", IOC FINAL = %" PRId64 ", " + "%s: FIRST SUBFILE = %" PRId64 ", LAST SUBFILE = %" PRId64 ", " "MAX IOVEC DEPTH = %" PRId64 ", START LENGTH = %" PRId64 ", FINAL LENGTH = %" PRId64, - __func__, ioc_start, ioc_final, max_iovec_depth, start_length, final_length); + __func__, first_subfile, last_subfile, max_iovec_depth, start_length, final_length); #endif /* - * Loop through the set of I/O concentrators to determine - * the various vector components for each. I/O concentrators - * whose data size is zero will not have I/O requests passed - * to them. + * Loop through the set of subfiles to determine the various + * vector components for each. Subfiles whose data size is + * zero will not have I/O requests passed to them. 
*/ curr_stripe_idx = stripe_idx; curr_max_iovec_depth = max_iovec_depth; - for (int i = 0, k = (int)ioc_start; i < ioc_count; i++) { + for (int i = 0, k = (int)first_subfile; i < num_subfiles; i++) { int64_t *_mem_buf_offset; int64_t *_target_file_offset; int64_t *_io_block_len; - int64_t ioc_bytes = 0; + int64_t subfile_bytes = 0; int64_t iovec_depth; hbool_t is_first = FALSE; hbool_t is_last = FALSE; @@ -2532,14 +2799,14 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne HDmemset(_io_block_len, 0, (max_iovec_len * sizeof(*_io_block_len))); if (total_bytes == data_size) { - *n_iocs_used = i; + *n_subfiles_used = i; goto done; } if (total_bytes < data_size) { int64_t num_full_stripes = iovec_depth; - if (k == ioc_start) { + if (k == first_subfile) { is_first = TRUE; /* @@ -2547,12 +2814,12 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne * starting on a stripe boundary */ if (start_length < stripe_size) { - ioc_bytes += start_length; + subfile_bytes += start_length; num_full_stripes--; } } - if (k == ioc_final) { + if (k == last_subfile) { is_last = TRUE; /* @@ -2560,34 +2827,35 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne * ending on a stripe boundary */ if (final_length < stripe_size) { - ioc_bytes += final_length; + subfile_bytes += final_length; if (num_full_stripes) num_full_stripes--; } } - /* Account for IOCs with uniform segments */ + /* Account for subfiles with uniform segments */ if (!is_first && !is_last) { hbool_t thin_uniform_section = FALSE; - if (ioc_final >= ioc_start) { + if (last_subfile >= first_subfile) { /* - * When an IOC has an index value that is greater - * than both the starting IOC and ending IOC indices, - * it is a "thinner" section with a smaller I/O vector - * depth. 
+ * When a subfile has an index value that is greater + * than both the starting subfile and ending subfile + * indices, it is a "thinner" section with a smaller + * I/O vector depth. */ - thin_uniform_section = (k > ioc_start) && (k > ioc_final); + thin_uniform_section = (k > first_subfile) && (k > last_subfile); } - if (ioc_final < ioc_start) { + if (last_subfile < first_subfile) { /* - * This can also happen when the IOC with the final - * data segment has a smaller IOC index than the IOC - * with the first data segment and the current IOC - * index falls between the two. + * This can also happen when the subfile with the final + * data segment has a smaller subfile index than the + * subfile with the first data segment and the current + * subfile index falls between the two. */ - thin_uniform_section = thin_uniform_section || ((ioc_final < k) && (k < ioc_start)); + thin_uniform_section = + thin_uniform_section || ((last_subfile < k) && (k < first_subfile)); } if (thin_uniform_section) { @@ -2605,45 +2873,45 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne * size of the fully selected I/O stripes to the * running bytes total */ - ioc_bytes += num_full_stripes * stripe_size; - total_bytes += ioc_bytes; + subfile_bytes += num_full_stripes * stripe_size; + total_bytes += subfile_bytes; } _mem_buf_offset[0] = mem_offset; _target_file_offset[0] = row_offset + offset_in_block; - _io_block_len[0] = ioc_bytes; + _io_block_len[0] = subfile_bytes; - if (ioc_count > 1) { + if (num_subfiles > 1) { int64_t curr_file_offset = row_offset + offset_in_block; /* Fill the I/O vectors */ if (is_first) { if (is_last) { /* First + Last */ - if (iovec_fill_first_last(sf_context, iovec_depth, ioc_bytes, mem_offset, + if (iovec_fill_first_last(sf_context, iovec_depth, subfile_bytes, mem_offset, curr_file_offset, start_length, final_length, _mem_buf_offset, _target_file_offset, _io_block_len) < 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, 
"can't fill I/O vectors"); } else { /* First ONLY */ - if (iovec_fill_first(sf_context, iovec_depth, ioc_bytes, mem_offset, curr_file_offset, + if (iovec_fill_first(sf_context, iovec_depth, subfile_bytes, mem_offset, curr_file_offset, start_length, _mem_buf_offset, _target_file_offset, _io_block_len) < 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't fill I/O vectors"); } /* Move the memory pointer to the starting location - * for next IOC request. + * for next subfile I/O request. */ mem_offset += start_length; } else if (is_last) { /* Last ONLY */ - if (iovec_fill_last(sf_context, iovec_depth, ioc_bytes, mem_offset, curr_file_offset, + if (iovec_fill_last(sf_context, iovec_depth, subfile_bytes, mem_offset, curr_file_offset, final_length, _mem_buf_offset, _target_file_offset, _io_block_len) < 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't fill I/O vectors"); mem_offset += stripe_size; } else { /* Everything else (uniform) */ - if (iovec_fill_uniform(sf_context, iovec_depth, ioc_bytes, mem_offset, curr_file_offset, + if (iovec_fill_uniform(sf_context, iovec_depth, subfile_bytes, mem_offset, curr_file_offset, _mem_buf_offset, _target_file_offset, _io_block_len) < 0) H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't fill I/O vectors"); @@ -2656,10 +2924,10 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne k++; curr_stripe_idx++; - if (k == ioc_count) { + if (k == num_subfiles) { k = 0; offset_in_block = 0; - curr_max_iovec_depth = ((final_stripe_idx - curr_stripe_idx) / ioc_count) + 1; + curr_max_iovec_depth = ((final_stripe_idx - curr_stripe_idx) / num_subfiles) + 1; row_offset += block_size; } diff --git a/src/H5FDsubfiling/H5FDsubfiling.h b/src/H5FDsubfiling/H5FDsubfiling.h index 3bc448b..93d0c3e 100644 --- a/src/H5FDsubfiling/H5FDsubfiling.h +++ b/src/H5FDsubfiling/H5FDsubfiling.h @@ -48,21 +48,51 @@ /** * \def H5FD_SUBFILING_DEFAULT_STRIPE_SIZE - * The default stripe size (in bytes) for data 
stripes in sub-files + * The default stripe size (in bytes) for data stripes in subfiles */ #define H5FD_SUBFILING_DEFAULT_STRIPE_SIZE (32 * 1024 * 1024) /** + * \def H5FD_SUBFILING_DEFAULT_STRIPE_COUNT + * Macro for the default Subfiling stripe count value. The default + * is currently to use one subfile per node. + */ +#define H5FD_SUBFILING_DEFAULT_STRIPE_COUNT -1 + +/** * \def H5FD_SUBFILING_FILENAME_TEMPLATE - * The basic template for a sub-file filename + * The basic template for a subfile filename. The format specifiers + * correspond to: + * + * %s -> base filename, e.g. "file.h5" + * %PRIu64 -> file inode, e.g. 11273556 + * %0*d -> number (starting at 1) signifying the Nth (out of total + * number of subfiles) subfile. Zero-padded according + * to the number of digits in the number of subfiles + * (calculated by log10(num_subfiles) + 1) + * %d -> number of subfiles + * + * yielding filenames such as: + * + * file.h5.subfile_11273556_01_of_10 + * file.h5.subfile_11273556_02_of_10 + * file.h5.subfile_11273556_10_of_10 */ -#define H5FD_SUBFILING_FILENAME_TEMPLATE ".subfile_%" PRIu64 "_%0*d_of_%d" +#define H5FD_SUBFILING_FILENAME_TEMPLATE "%s.subfile_%" PRIu64 "_%0*d_of_%d" /** * \def H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE - * The basic template for a #H5FD_SUBFILING driver configuration filename + * The basic template for a #H5FD_SUBFILING driver configuration filename. + * The format specifiers correspond to: + * + * %s -> base filename, e.g. "file.h5" + * %PRIu64 -> file inode, e.g. 
11273556 + * + * yielding a filename such as: + * + * file.h5.subfile_11273556.config */ -#define H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE ".subfile_%" PRIu64 ".config" +#define H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE "%s.subfile_%" PRIu64 ".config" /* * Environment variables interpreted by the HDF5 Subfiling feature @@ -71,7 +101,7 @@ /** * \def H5FD_SUBFILING_STRIPE_SIZE * Macro for name of the environment variable that specifies the size - * (in bytes) for data stripes in sub-files + * (in bytes) for data stripes in subfiles * * The value set for this environment variable is interpreted as a * long long value and must be > 0. @@ -112,7 +142,7 @@ /** * \def H5FD_SUBFILING_SUBFILE_PREFIX * Macro for name of the environment variable that specifies a prefix - * to apply to the filenames generated for sub-files + * to apply to the filenames generated for subfiles * * The value set for this environment variable is interpreted as a * pathname. @@ -153,53 +183,56 @@ * Unused. Sentinel value */ typedef enum { - SELECT_IOC_ONE_PER_NODE = 0, /* Default */ + SELECT_IOC_ONE_PER_NODE = 0, /* Default */ SELECT_IOC_EVERY_NTH_RANK, /* Starting at rank 0, select-next += N */ - SELECT_IOC_WITH_CONFIG, /* NOT IMPLEMENTED: Read-from-file */ - SELECT_IOC_TOTAL, /* Starting at rank 0, mpi_size / total */ - ioc_selection_options /* Sentinel value */ + SELECT_IOC_WITH_CONFIG, /* NOT IMPLEMENTED: Read-from-file */ + SELECT_IOC_TOTAL, /* Starting at rank 0, mpi_size / total */ + ioc_selection_options /* Sentinel value */ } H5FD_subfiling_ioc_select_t; /** - * \struct H5FD_subfiling_shared_config_t - * \brief Subfiling configuration structure that is shared between the #H5FD_SUBFILING + * \struct H5FD_subfiling_params_t + * \brief Subfiling parameter structure that is shared between the #H5FD_SUBFILING * and #H5FD_IOC drivers * - * \var H5FD_subfiling_ioc_select_t H5FD_subfiling_shared_config_t::ioc_selection + * \var H5FD_subfiling_ioc_select_t H5FD_subfiling_params_t::ioc_selection * The 
method to use for selecting MPI ranks to be I/O concentrators. The * current default is to select one MPI rank per node to be an I/O concentrator. * Refer to #H5FD_subfiling_ioc_select_t for a description of the algorithms * available for use. * - * \var int64_t H5FD_subfiling_shared_config_t::stripe_size + * \var int64_t H5FD_subfiling_params_t::stripe_size * The stripe size defines the size (in bytes) of the data stripes in the - * sub-files for the logical HDF5 file. Data is striped across the sub-files + * subfiles for the logical HDF5 file. Data is striped across the subfiles * in a round-robin wrap-around fashion in segments equal to the stripe size. * - * For example, in an HDF5 file consisting of four sub-files with a 1MiB stripe - * size, the first and fifth 1MiB of data would reside in the first sub-file, - * the second and sixth 1MiB of data would reside in the second sub-file and so + * For example, in an HDF5 file consisting of four subfiles with a 1MiB stripe + * size, the first and fifth 1MiB of data would reside in the first subfile, + * the second and sixth 1MiB of data would reside in the second subfile and so * on. * * This value can also be set or adjusted with the #H5FD_SUBFILING_STRIPE_SIZE * environment variable. * - * \var int32_t H5FD_subfiling_shared_config_t::stripe_count - * The number of I/O concentrators (and, currently, the number of sub-files) - * to use for the logical HDF5 file. This value is used in conjunction with - * the IOC selection method to determine which MPI ranks will be assigned as - * I/O concentrators. - * - * Alternatively, the mapping between MPI ranks and I/O concentrators can be - * set or adjusted with a combination of the #ioc_selection field and the - * #H5FD_SUBFILING_IOC_PER_NODE and #H5FD_SUBFILING_IOC_SELECTION_CRITERIA - * environment variables. + * \var int32_t H5FD_subfiling_params_t::stripe_count + * The target number of subfiles to use for the logical HDF5 file. 
The current + * default is to use one subfile per node, but it can be useful to set a + * different target number of subfiles, especially if the HDF5 application will + * pre-create the HDF5 file on a single MPI rank. In that particular case, the + * single rank will need to know how many subfiles the logical HDF5 file will + * consist of in order to properly pre-create the file. + * + * This value is used in conjunction with the IOC selection method to determine + * which MPI ranks will be assigned as I/O concentrators. Alternatively, the + * mapping between MPI ranks and I/O concentrators can be set or adjusted with a + * combination of the #ioc_selection field and the #H5FD_SUBFILING_IOC_PER_NODE + * and #H5FD_SUBFILING_IOC_SELECTION_CRITERIA environment variables. */ -typedef struct H5FD_subfiling_shared_config_t { - H5FD_subfiling_ioc_select_t ioc_selection; /* Method to select I/O concentrators */ - int64_t stripe_size; /* Size (in bytes) of data stripes in sub-files */ - int32_t stripe_count; /* Number of I/O concentrators to use */ -} H5FD_subfiling_shared_config_t; +typedef struct H5FD_subfiling_params_t { + H5FD_subfiling_ioc_select_t ioc_selection; /* Method to select I/O concentrators */ + int64_t stripe_size; /* Size (in bytes) of data stripes in subfiles */ + int32_t stripe_count; /* Target number of subfiles to use */ +} H5FD_subfiling_params_t; //! /** @@ -226,7 +259,7 @@ typedef struct H5FD_subfiling_shared_config_t { * \var hid_t H5FD_subfiling_config_t::ioc_fapl_id * The File Access Property List which is setup with the file driver that * the #H5FD_SUBFILING driver will use for servicing I/O requests to the - * sub-files. Currently, the File Access Property List must be setup with + * subfiles. Currently, the File Access Property List must be setup with * the #H5FD_IOC driver by calling H5Pset_fapl_ioc(), but future development * may allow other file drivers to be used. 
* @@ -235,19 +268,18 @@ typedef struct H5FD_subfiling_shared_config_t { * use the #H5FD_IOC driver for its I/O operations. This field should currently * always be set to TRUE. * - * \var H5FD_subfiling_shared_config_t H5FD_subfiling_config_t::shared_cfg + * \var H5FD_subfiling_params_t H5FD_subfiling_config_t::shared_cfg * A structure which contains the subfiling parameters that are shared between - * the #H5FD_SUBFILING and #H5FD_IOC drivers. This includes the sub-file stripe - * size, number of I/O concentrators, IOC selection method, etc. + * the #H5FD_SUBFILING and #H5FD_IOC drivers. This includes the subfile stripe + * size, stripe count, IOC selection method, etc. * */ typedef struct H5FD_subfiling_config_t { - uint32_t magic; /* Must be set to H5FD_SUBFILING_FAPL_MAGIC */ - uint32_t version; /* Must be set to H5FD_SUBFILING_CURR_FAPL_VERSION */ - hid_t ioc_fapl_id; /* The FAPL setup with the stacked VFD to use for I/O concentrators */ - hbool_t require_ioc; /* Whether to use the IOC VFD (currently must always be TRUE) */ - H5FD_subfiling_shared_config_t - shared_cfg; /* Subfiling/IOC parameters (stripe size, stripe count, etc.) */ + uint32_t magic; /* Must be set to H5FD_SUBFILING_FAPL_MAGIC */ + uint32_t version; /* Must be set to H5FD_SUBFILING_CURR_FAPL_VERSION */ + hid_t ioc_fapl_id; /* The FAPL setup with the stacked VFD to use for I/O concentrators */ + hbool_t require_ioc; /* Whether to use the IOC VFD (currently must always be TRUE) */ + H5FD_subfiling_params_t shared_cfg; /* Subfiling/IOC parameters (stripe size, stripe count, etc.) */ } H5FD_subfiling_config_t; //! @@ -274,8 +306,8 @@ H5_DLL hid_t H5FD_subfiling_init(void); * * The #H5FD_SUBFILING driver is an MPI-based file driver that allows an * HDF5 application to distribute a logical HDF5 file across a collection - * of "sub-files" in equal-sized data segment "stripes". 
I/O to the logical - * HDF5 file is then directed to the appropriate "sub-file" according to the + * of "subfiles" in equal-sized data segment "stripes". I/O to the logical + * HDF5 file is then directed to the appropriate "subfile" according to the * #H5FD_SUBFILING configuration and a system of I/O concentrators, which * are MPI ranks operating worker threads. * diff --git a/src/H5FDsubfiling/H5subfiling_common.c b/src/H5FDsubfiling/H5subfiling_common.c index 9cc2c65..a1cca65 100644 --- a/src/H5FDsubfiling/H5subfiling_common.c +++ b/src/H5FDsubfiling/H5subfiling_common.c @@ -19,9 +19,9 @@ #include "H5MMprivate.h" -typedef struct { /* Format of a context map entry */ - void *file_handle; /* key value (linear search of the cache) */ - int64_t sf_context_id; /* The return value if matching file_handle */ +typedef struct { /* Format of a context map entry */ + uint64_t file_id; /* key value (linear search of the cache) */ + int64_t sf_context_id; /* The return value if matching file_handle */ } file_map_to_context_t; /* Identifiers for HDF5's error API */ @@ -30,423 +30,52 @@ hid_t H5subfiling_err_class_g = H5I_INVALID_HID; char H5subfiling_mpi_error_str[MPI_MAX_ERROR_STRING]; int H5subfiling_mpi_error_str_len; -static subfiling_context_t *sf_context_cache = NULL; -static sf_topology_t *sf_topology_cache = NULL; +/* MPI Datatype used to send/receive an RPC message */ +MPI_Datatype H5_subfiling_rpc_msg_type = MPI_DATATYPE_NULL; -static size_t sf_context_cache_limit = 16; -static size_t sf_topology_cache_limit = 4; +static subfiling_context_t **sf_context_cache = NULL; +static sf_topology_t **sf_topology_cache = NULL; + +static size_t sf_context_cache_size = 0; +static size_t sf_topology_cache_size = 0; +static size_t sf_context_cache_num_entries = 0; +static size_t sf_topology_cache_num_entries = 0; static file_map_to_context_t *sf_open_file_map = NULL; static int sf_file_map_size = 0; -#define DEFAULT_FILE_MAP_ENTRIES 8 +#define DEFAULT_CONTEXT_CACHE_SIZE 16 +#define 
DEFAULT_TOPOLOGY_CACHE_SIZE 4 +#define DEFAULT_FILE_MAP_ENTRIES 8 + +static herr_t H5_free_subfiling_object(int64_t object_id); static herr_t H5_free_subfiling_object_int(subfiling_context_t *sf_context); static herr_t H5_free_subfiling_topology(sf_topology_t *topology); -static herr_t init_subfiling(H5FD_subfiling_shared_config_t *subfiling_config, MPI_Comm comm, +static herr_t init_subfiling(const char *base_filename, uint64_t file_id, + H5FD_subfiling_params_t *subfiling_config, int file_acc_flags, MPI_Comm comm, int64_t *context_id_out); -static herr_t init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm, +static herr_t init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm, sf_topology_t **app_topology_out); -static herr_t init_subfiling_context(subfiling_context_t *sf_context, - H5FD_subfiling_shared_config_t *subfiling_config, +static herr_t get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type, + char **ioc_sel_info_str); +static herr_t find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config, + long iocs_per_node, sf_topology_t **app_topology); +static herr_t init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm); +static herr_t gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm); +static int compare_layout_nodelocal(const void *layout1, const void *layout2); +static herr_t identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride); +static herr_t init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename, + uint64_t file_id, H5FD_subfiling_params_t *subfiling_config, sf_topology_t *app_topology, MPI_Comm file_comm); static herr_t open_subfile_with_context(subfiling_context_t *sf_context, int file_acc_flags); -static herr_t record_fid_to_subfile(void *file_handle, int64_t subfile_context_id, int *next_index); -static herr_t ioc_open_file(int64_t 
file_context_id, int file_acc_flags); -static herr_t generate_subfile_name(subfiling_context_t *sf_context, int file_acc_flags, char *filename_out, - size_t filename_out_len, char **filename_basename_out, - char **subfile_dir_out); +static herr_t record_fid_to_subfile(uint64_t file_id, int64_t subfile_context_id, int *next_index); +static void clear_fid_map_entry(uint64_t file_id, int64_t sf_context_id); +static herr_t ioc_open_files(int64_t file_context_id, int file_acc_flags); static herr_t create_config_file(subfiling_context_t *sf_context, const char *base_filename, const char *subfile_dir, hbool_t truncate_if_exists); -static herr_t open_config_file(subfiling_context_t *sf_context, const char *base_filename, - const char *subfile_dir, const char *mode, FILE **config_file_out); - -static int get_next_fid_map_index(void); -static void clear_fid_map_entry(void *file_handle, int64_t sf_context_id); -static int compare_hostid(const void *h1, const void *h2); -static herr_t get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type, - char **ioc_sel_info_str); -static int count_nodes(sf_topology_t *info, MPI_Comm comm); -static herr_t gather_topology_info(sf_topology_t *info, MPI_Comm comm); -static int identify_ioc_ranks(sf_topology_t *info, int node_count, int iocs_per_node); -static inline void assign_ioc_ranks(sf_topology_t *app_topology, int ioc_count, int rank_multiple); - -static int -get_next_fid_map_index(void) -{ - int index = 0; - - HDassert(sf_open_file_map || (sf_file_map_size == 0)); - - for (int i = 0; i < sf_file_map_size; i++) { - if (sf_open_file_map[i].file_handle == NULL) { - index = i; - break; - } - } - - /* A valid index should always be found here */ - HDassert(index >= 0); - HDassert((sf_file_map_size == 0) || (index < sf_file_map_size)); - - return index; -} - -/*------------------------------------------------------------------------- - * Function: clear_fid_map_entry - * - * Purpose: Remove the map entry 
associated with the file->inode. - * This is done at file close. - * - * Return: None - * Errors: Cannot fail. - * - * Programmer: Richard Warren - * 7/17/2020 - * - * Changes: Initial Version/None. - * - *------------------------------------------------------------------------- - */ -static void -clear_fid_map_entry(void *file_handle, int64_t sf_context_id) -{ - if (sf_open_file_map) { - for (int i = 0; i < sf_file_map_size; i++) { - if ((sf_open_file_map[i].file_handle == file_handle) && - (sf_open_file_map[i].sf_context_id == sf_context_id)) { - sf_open_file_map[i].file_handle = NULL; - sf_open_file_map[i].sf_context_id = -1; - return; - } - } - } -} /* end clear_fid_map_entry() */ - -/* - * --------------------------------------------------- - * Topology discovery related functions for choosing - * I/O Concentrator (IOC) ranks. - * Currently, the default approach for assigning an IOC - * is select the lowest MPI rank on each node. - * - * The approach collectively generates N tuples - * consisting of the MPI rank and hostid. This - * collection is then sorted by hostid and scanned - * to identify the IOC ranks. - * - * As time permits, addition assignment methods will - * be implemented, e.g. 1-per-Nranks or via a config - * option. Additional selection methodologies can - * be included as users get more experience using the - * subfiling implementation. - * --------------------------------------------------- - */ - -/*------------------------------------------------------------------------- - * Function: compare_hostid - * - * Purpose: qsort sorting function. - * Compares tuples of 'layout_t'. The sorting is based on - * the long hostid values. - * - * Return: result of: (hostid1 > hostid2) - * - * Programmer: Richard Warren - * 7/17/2020 - * - * Changes: Initial Version/None. 
- * - *------------------------------------------------------------------------- - */ -static int -compare_hostid(const void *h1, const void *h2) -{ - const layout_t *host1 = (const layout_t *)h1; - const layout_t *host2 = (const layout_t *)h2; - return (host1->hostid > host2->hostid); -} - -/* -------------------------------------------------------------------------- - Programmer: Richard Warren - Purpose: Return a character string which represents either the - default selection method: SELECT_IOC_ONE_PER_NODE; or - if the user has selected a method via the environment - variable (H5FD_SUBFILING_IOC_SELECTION_CRITERIA), we - return that along with any optional qualifier with for - that method. - - Errors: None. - - Revision History -- Initial implementation -------------------------------------------------------------------------- -*/ -static herr_t -get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type, char **ioc_sel_info_str) -{ - char *opt_value = NULL; - char *env_value = HDgetenv(H5FD_SUBFILING_IOC_SELECTION_CRITERIA); - herr_t ret_value = SUCCEED; - - HDassert(ioc_selection_type); - HDassert(ioc_sel_info_str); - - *ioc_sel_info_str = NULL; - - if (env_value) { - long check_value; - - /* - * For non-default options, the environment variable - * should have the following form: integer:[integer|string] - * In particular, EveryNthRank == 1:64 or every 64 ranks assign an IOC - * or WithConfig == 2:/ - */ - if ((opt_value = HDstrchr(env_value, ':'))) - *opt_value++ = '\0'; - - errno = 0; - check_value = HDstrtol(env_value, NULL, 0); - - if (errno == ERANGE) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, - "couldn't parse value from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA - " environment variable"); - - if ((check_value < 0) || (check_value >= ioc_selection_options)) - H5_SUBFILING_GOTO_ERROR( - H5E_VFL, H5E_BADVALUE, FAIL, - "invalid IOC selection type value %ld from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA - " environment 
variable", - check_value); - - *ioc_selection_type = (H5FD_subfiling_ioc_select_t)check_value; - *ioc_sel_info_str = opt_value; - } - -done: - H5_SUBFILING_FUNC_LEAVE; -} - -/*------------------------------------------------------------------------- - * Function: count_nodes - * - * Purpose: Initializes the sorted collection of hostid+mpi_rank - * tuples. After initialization, the collection is scanned - * to determine the number of unique hostid entries. This - * value will determine the number of actual I/O concentrators - * that available to the application. A side effect is to - * identify the 'node_index' of the current process. - * - * Return: The number of unique hostid's (nodes). - * Errors: MPI_Abort if memory cannot be allocated. - * - * Programmer: Richard Warren - * 7/17/2020 - * - * Changes: Initial Version/None. - * - *------------------------------------------------------------------------- - */ -static int -count_nodes(sf_topology_t *info, MPI_Comm comm) -{ - app_layout_t *app_layout = NULL; - long nextid; - int node_count; - int hostid_index = -1; - int my_rank; - int mpi_code; - int ret_value = 0; - - HDassert(info); - HDassert(info->app_layout); - HDassert(info->app_layout->layout); - HDassert(info->app_layout->node_ranks); - HDassert(MPI_COMM_NULL != comm); - - if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &my_rank))) - H5_SUBFILING_MPI_GOTO_ERROR(-1, "MPI_Comm_rank failed", mpi_code); - - app_layout = info->app_layout; - node_count = app_layout->node_count; - - nextid = app_layout->layout[0].hostid; - /* Possibly record my hostid_index */ - if (app_layout->layout[0].rank == my_rank) { - hostid_index = 0; - } - - app_layout->node_ranks[0] = 0; /* Add index */ - node_count = 1; - - /* Recall that the topology array has been sorted! 
*/ - for (int k = 1; k < app_layout->world_size; k++) { - /* Possibly record my hostid_index */ - if (app_layout->layout[k].rank == my_rank) - hostid_index = k; - if (app_layout->layout[k].hostid != nextid) { - nextid = app_layout->layout[k].hostid; - /* Record the index of new hostid */ - app_layout->node_ranks[node_count++] = k; - } - } - - /* Mark the end of the node_ranks */ - app_layout->node_ranks[node_count] = app_layout->world_size; - /* Save the index where we first located my hostid */ - app_layout->node_index = hostid_index; - - app_layout->node_count = node_count; - - ret_value = node_count; - -done: - H5_SUBFILING_FUNC_LEAVE; -} - -/*------------------------------------------------------------------------- - * Function: gather_topology_info - * - * Purpose: Collectively generate a sorted collection of hostid+mpi_rank - * tuples. The result is returned in the 'topology' field - * of the sf_topology_t structure. - * - * Return: Non-negative on success/Negative on failure - * - * Programmer: Richard Warren - * 7/17/2020 - * - * Changes: Initial Version/None. 
- * - *------------------------------------------------------------------------- - */ -static herr_t -gather_topology_info(sf_topology_t *info, MPI_Comm comm) -{ - app_layout_t *app_layout = NULL; - layout_t my_hostinfo; - long hostid; - int sf_world_size; - int sf_world_rank; - herr_t ret_value = SUCCEED; - - HDassert(info); - HDassert(info->app_layout); - HDassert(info->app_layout->layout); - HDassert(MPI_COMM_NULL != comm); - - app_layout = info->app_layout; - sf_world_size = app_layout->world_size; - sf_world_rank = app_layout->world_rank; - - hostid = gethostid(); - - my_hostinfo.hostid = hostid; - my_hostinfo.rank = sf_world_rank; - - app_layout->hostid = hostid; - app_layout->layout[sf_world_rank] = my_hostinfo; - - if (sf_world_size > 1) { - int mpi_code; - - if (MPI_SUCCESS != - (mpi_code = MPI_Allgather(&my_hostinfo, 2, MPI_LONG, app_layout->layout, 2, MPI_LONG, comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code); - - HDqsort(app_layout->layout, (size_t)sf_world_size, sizeof(layout_t), compare_hostid); - } - -done: - H5_SUBFILING_FUNC_LEAVE; -} - -/*------------------------------------------------------------------------- - * Function: identify_ioc_ranks - * - * Purpose: We've already identified the number of unique nodes and - * have a sorted list layout_t structures. Under normal - * conditions, we only utilize a single IOC per node. Under - * that circumstance, we only need to fill the io_concentrator - * vector from the node_ranks array (which contains the index - * into the layout array of lowest MPI rank on each node) into - * the io_concentrator vector; - * Otherwise, while determining the number of local_peers per - * node, we can also select one or more additional IOCs. - * - * As a side effect, we fill the 'ioc_concentrator' vector - * and set the 'rank_is_ioc' flag to TRUE if our rank is - * identified as owning an I/O Concentrator (IOC). 
- * - *------------------------------------------------------------------------- - */ -static int -identify_ioc_ranks(sf_topology_t *info, int node_count, int iocs_per_node) -{ - app_layout_t *app_layout = NULL; - int total_ioc_count = 0; - - HDassert(info); - HDassert(info->app_layout); - - app_layout = info->app_layout; - - for (int n = 0; n < node_count; n++) { - int node_index = app_layout->node_ranks[n]; - int local_peer_count = app_layout->node_ranks[n + 1] - app_layout->node_ranks[n]; - - info->io_concentrators[total_ioc_count++] = (int)(app_layout->layout[node_index++].rank); - - if (app_layout->layout[node_index - 1].rank == app_layout->world_rank) { - info->subfile_rank = total_ioc_count - 1; - info->rank_is_ioc = TRUE; - } - - for (int k = 1; k < iocs_per_node; k++) { - if (k < local_peer_count) { - if (app_layout->layout[node_index].rank == app_layout->world_rank) { - info->rank_is_ioc = TRUE; - info->subfile_rank = total_ioc_count; - } - info->io_concentrators[total_ioc_count++] = (int)(app_layout->layout[node_index++].rank); - } - } - } - - info->n_io_concentrators = total_ioc_count; - - return total_ioc_count; -} /* end identify_ioc_ranks() */ - -static inline void -assign_ioc_ranks(sf_topology_t *app_topology, int ioc_count, int rank_multiple) -{ - app_layout_t *app_layout = NULL; - int *io_concentrators = NULL; - - HDassert(app_topology); - HDassert(app_topology->app_layout); - HDassert(app_topology->io_concentrators); - - app_layout = app_topology->app_layout; - io_concentrators = app_topology->io_concentrators; - - /* fill the io_concentrators values based on the application layout */ - if (io_concentrators) { - int ioc_index; - for (int k = 0, ioc_next = 0; ioc_next < ioc_count; ioc_next++) { - ioc_index = rank_multiple * k++; - io_concentrators[ioc_next] = (int)(app_layout->layout[ioc_index].rank); - if (io_concentrators[ioc_next] == app_layout->world_rank) { - app_topology->subfile_rank = ioc_next; - app_topology->rank_is_ioc = TRUE; - } - } - 
app_topology->n_io_concentrators = ioc_count; - } -} /* end assign_ioc_ranks() */ +static herr_t open_config_file(const char *base_filename, const char *subfile_dir, uint64_t file_id, + const char *mode, FILE **config_file_out); /*------------------------------------------------------------------------- * Function: H5_new_subfiling_object_id @@ -459,10 +88,19 @@ assign_ioc_ranks(sf_topology_t *app_topology, int ioc_count, int rank_multiple) *------------------------------------------------------------------------- */ int64_t -H5_new_subfiling_object_id(sf_obj_type_t obj_type, int64_t index_val) +H5_new_subfiling_object_id(sf_obj_type_t obj_type) { - if (obj_type != SF_CONTEXT && obj_type != SF_TOPOLOGY) + int64_t index_val = 0; + + if (obj_type == SF_CONTEXT) { + index_val = (int64_t)sf_context_cache_num_entries; + } + else if (obj_type == SF_TOPOLOGY) { + index_val = (int64_t)sf_topology_cache_num_entries; + } + else return -1; + if (index_val < 0) return -1; @@ -492,12 +130,6 @@ H5_new_subfiling_object_id(sf_obj_type_t obj_type, int64_t index_val) * *------------------------------------------------------------------------- */ -/* - * TODO: we don't appear to ever use this for retrieving a subfile topology - * object. Might be able to refactor to just return a subfile context - * object. - */ -/* TODO: no way of freeing caches on close currently */ void * H5_get_subfiling_object(int64_t object_id) { @@ -512,7 +144,7 @@ H5_get_subfiling_object(int64_t object_id) if (obj_type == SF_CONTEXT) { /* Contexts provide information principally about * the application and how the data layout is managed - * over some number of sub-files. The important + * over some number of subfiles. The important * parameters are the number of subfiles (or in the * context of IOCs, the MPI ranks and counts of the * processes which host an I/O Concentrator. 
We @@ -522,63 +154,126 @@ H5_get_subfiling_object(int64_t object_id) /* Create subfiling context cache if it doesn't exist */ if (!sf_context_cache) { - if (NULL == (sf_context_cache = HDcalloc(sf_context_cache_limit, sizeof(subfiling_context_t)))) + if (NULL == (sf_context_cache = HDcalloc(DEFAULT_CONTEXT_CACHE_SIZE, sizeof(*sf_context_cache)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "couldn't allocate space for subfiling context cache"); + sf_context_cache_size = DEFAULT_CONTEXT_CACHE_SIZE; + sf_context_cache_num_entries = 0; } /* Make more space in context cache if needed */ - if ((size_t)obj_index == sf_context_cache_limit) { + if ((size_t)obj_index >= sf_context_cache_size) { size_t old_num_entries; + size_t new_size; void *tmp_realloc; - old_num_entries = sf_context_cache_limit; + old_num_entries = sf_context_cache_num_entries; - sf_context_cache_limit *= 2; + new_size = (sf_context_cache_size * 3) / 2; - if (NULL == (tmp_realloc = HDrealloc(sf_context_cache, - sf_context_cache_limit * sizeof(subfiling_context_t)))) + if (NULL == (tmp_realloc = HDrealloc(sf_context_cache, new_size * sizeof(*sf_context_cache)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "couldn't allocate space for subfiling context cache"); - sf_context_cache = tmp_realloc; + sf_context_cache = tmp_realloc; + sf_context_cache_size = new_size; /* Clear newly-allocated entries */ - HDmemset(&sf_context_cache[obj_index], 0, - (sf_context_cache_limit - old_num_entries) * sizeof(subfiling_context_t)); + HDmemset(&sf_context_cache[old_num_entries], 0, + (sf_context_cache_size - old_num_entries) * sizeof(*sf_context_cache)); + + /* + * If we had to make more space, the given object index + * should always fall within range after a single re-allocation + */ + HDassert((size_t)obj_index < sf_context_cache_size); } - /* Return direct pointer to the context cache entry */ - return (void *)&sf_context_cache[obj_index]; + /* + * Since this cache currently just keeps 
all entries until + * application exit, context entry indices should just be + * consecutive + */ + HDassert((size_t)obj_index <= sf_context_cache_num_entries); + if ((size_t)obj_index < sf_context_cache_num_entries) + ret_value = sf_context_cache[obj_index]; + else { + HDassert(!sf_context_cache[sf_context_cache_num_entries]); + + /* Allocate a new subfiling context object */ + if (NULL == (ret_value = HDcalloc(1, sizeof(subfiling_context_t)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, + "couldn't allocate subfiling context object"); + + sf_context_cache[sf_context_cache_num_entries++] = ret_value; + } } else if (obj_type == SF_TOPOLOGY) { /* Create subfiling topology cache if it doesn't exist */ if (!sf_topology_cache) { - if (NULL == (sf_topology_cache = HDcalloc(sf_topology_cache_limit, sizeof(sf_topology_t)))) + if (NULL == + (sf_topology_cache = HDcalloc(DEFAULT_TOPOLOGY_CACHE_SIZE, sizeof(*sf_topology_cache)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "couldn't allocate space for subfiling topology cache"); + sf_topology_cache_size = DEFAULT_TOPOLOGY_CACHE_SIZE; + sf_topology_cache_num_entries = 0; } - /* We will likely only cache a single topology - * which is that of the original parallel application. - * In that context, we will identify the number of - * nodes along with the number of MPI ranks on a node. 
- */ - if ((size_t)obj_index >= sf_topology_cache_limit) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, - "invalid object index for subfiling topology object ID"); - - /* Return direct pointer to the topology cache entry */ - return (void *)&sf_topology_cache[obj_index]; - } + /* Make more space in topology cache if needed */ + if ((size_t)obj_index >= sf_topology_cache_size) { + size_t old_num_entries; + size_t new_size; + void *tmp_realloc; -#ifdef H5_SUBFILING_DEBUG - HDprintf("%s: Unknown subfiling object type for ID %" PRId64 "\n", __func__, object_id); -#endif + old_num_entries = sf_topology_cache_num_entries; -done: - H5_SUBFILING_FUNC_LEAVE; -} + new_size = (sf_topology_cache_size * 3) / 2; + + if (NULL == (tmp_realloc = HDrealloc(sf_topology_cache, new_size * sizeof(*sf_topology_cache)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, + "couldn't allocate space for subfiling topology cache"); + + sf_topology_cache = tmp_realloc; + sf_topology_cache_size = new_size; + + /* Clear newly-allocated entries */ + HDmemset(&sf_topology_cache[old_num_entries], 0, + (sf_topology_cache_size - old_num_entries) * sizeof(*sf_topology_cache)); + + /* + * If we had to make more space, the given object index + * should always fall within range after a single re-allocation + */ + HDassert((size_t)obj_index < sf_topology_cache_size); + } + + /* + * Since this cache currently just keeps all entries until + * application exit, topology entry indices should just be + * consecutive + */ + HDassert((size_t)obj_index <= sf_topology_cache_num_entries); + if ((size_t)obj_index < sf_topology_cache_num_entries) + ret_value = sf_topology_cache[obj_index]; + else { + HDassert(!sf_topology_cache[sf_topology_cache_num_entries]); + + /* Allocate a new subfiling topology object */ + if (NULL == (ret_value = HDmalloc(sizeof(sf_topology_t)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, + "couldn't allocate subfiling topology object"); + + 
sf_topology_cache[sf_topology_cache_num_entries++] = ret_value; + } + } +#ifdef H5_SUBFILING_DEBUG + else + HDprintf("%s: Unknown subfiling object type for ID %" PRId64 "\n", __func__, object_id); +#endif + +done: + H5_SUBFILING_FUNC_LEAVE; +} /*------------------------------------------------------------------------- * Function: H5_free_subfiling_object @@ -586,27 +281,55 @@ done: * Purpose: Frees the underlying subfiling object for a given subfiling * object ID. * + * NOTE: Currently we assume that all created subfiling + * objects are cached in the (very simple) context/topology + * cache until application exit, so the only time a subfiling + * object should be freed by this routine is if something + * fails right after creating one. Otherwise, the internal + * indexing for the relevant cache will be invalid. + * * Return: Non-negative on success/Negative on failure * *------------------------------------------------------------------------- */ -herr_t +static herr_t H5_free_subfiling_object(int64_t object_id) { - subfiling_context_t *sf_context = NULL; - int64_t obj_type = (object_id >> 32) & 0x0FFFF; - herr_t ret_value = SUCCEED; + int64_t obj_type = (object_id >> 32) & 0x0FFFF; + herr_t ret_value = SUCCEED; - if (obj_type != SF_CONTEXT) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid subfiling object type for ID %" PRId64, - object_id); + if (obj_type == SF_CONTEXT) { + subfiling_context_t *sf_context; - if (NULL == (sf_context = H5_get_subfiling_object(object_id))) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, - "couldn't get subfiling context for subfiling object ID"); + if (NULL == (sf_context = H5_get_subfiling_object(object_id))) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "couldn't get subfiling context for subfiling object ID"); + + if (H5_free_subfiling_object_int(sf_context) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object"); + + HDassert(sf_context_cache_num_entries > 0); 
+ HDassert(sf_context == sf_context_cache[sf_context_cache_num_entries - 1]); + sf_context_cache[sf_context_cache_num_entries - 1] = NULL; + sf_context_cache_num_entries--; + } + else { + sf_topology_t *sf_topology; + + HDassert(obj_type == SF_TOPOLOGY); - if (H5_free_subfiling_object_int(sf_context) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object"); + if (NULL == (sf_topology = H5_get_subfiling_object(object_id))) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "couldn't get subfiling context for subfiling object ID"); + + if (H5_free_subfiling_topology(sf_topology) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling topology"); + + HDassert(sf_topology_cache_num_entries > 0); + HDassert(sf_topology == sf_topology_cache[sf_topology_cache_num_entries - 1]); + sf_topology_cache[sf_topology_cache_num_entries - 1] = NULL; + sf_topology_cache_num_entries--; + } done: H5_SUBFILING_FUNC_LEAVE; @@ -617,25 +340,10 @@ H5_free_subfiling_object_int(subfiling_context_t *sf_context) { HDassert(sf_context); -#ifdef H5_SUBFILING_DEBUG - if (sf_context->sf_logfile) { - struct tm *tm = NULL; - time_t cur_time; - - cur_time = time(NULL); - tm = localtime(&cur_time); - - H5_subfiling_log(sf_context->sf_context_id, "\n-- LOGGING FINISH - %s", asctime(tm)); - - HDfclose(sf_context->sf_logfile); - sf_context->sf_logfile = NULL; - } -#endif - sf_context->sf_context_id = -1; sf_context->h5_file_id = UINT64_MAX; - sf_context->h5_file_handle = NULL; - sf_context->sf_fid = -1; + sf_context->sf_num_fids = 0; + sf_context->sf_num_subfiles = -1; sf_context->sf_write_count = 0; sf_context->sf_read_count = 0; sf_context->sf_eof = HADDR_UNDEF; @@ -658,52 +366,63 @@ H5_free_subfiling_object_int(subfiling_context_t *sf_context) return FAIL; sf_context->sf_eof_comm = MPI_COMM_NULL; } - if (sf_context->sf_barrier_comm != MPI_COMM_NULL) { - if (H5_mpi_comm_free(&sf_context->sf_barrier_comm) < 0) + if 
(sf_context->sf_node_comm != MPI_COMM_NULL) { + if (H5_mpi_comm_free(&sf_context->sf_node_comm) < 0) return FAIL; - sf_context->sf_barrier_comm = MPI_COMM_NULL; + sf_context->sf_node_comm = MPI_COMM_NULL; } if (sf_context->sf_group_comm != MPI_COMM_NULL) { if (H5_mpi_comm_free(&sf_context->sf_group_comm) < 0) return FAIL; sf_context->sf_group_comm = MPI_COMM_NULL; } - if (sf_context->sf_intercomm != MPI_COMM_NULL) { - if (H5_mpi_comm_free(&sf_context->sf_intercomm) < 0) - return FAIL; - sf_context->sf_intercomm = MPI_COMM_NULL; - } - sf_context->sf_group_size = -1; - sf_context->sf_group_rank = -1; - sf_context->sf_intercomm_root = -1; + sf_context->sf_group_size = -1; + sf_context->sf_group_rank = -1; HDfree(sf_context->subfile_prefix); sf_context->subfile_prefix = NULL; - HDfree(sf_context->sf_filename); - sf_context->sf_filename = NULL; - HDfree(sf_context->h5_filename); sf_context->h5_filename = NULL; - if (H5_free_subfiling_topology(sf_context->topology) < 0) - return FAIL; + HDfree(sf_context->sf_fids); + sf_context->sf_fids = NULL; + + /* + * Currently we assume that all created application topology + * objects are cached until application exit and may be shared + * among multiple subfiling contexts, so we free them separately + * from here to avoid issues with stale pointers. 
+ */ sf_context->topology = NULL; + HDfree(sf_context); + return SUCCEED; } static herr_t H5_free_subfiling_topology(sf_topology_t *topology) { + herr_t ret_value = SUCCEED; + HDassert(topology); - topology->subfile_rank = -1; - topology->n_io_concentrators = 0; +#ifndef NDEBUG + { + hbool_t topology_cached = FALSE; - HDfree(topology->subfile_fd); - topology->subfile_fd = NULL; + /* Make sure this application topology object is in the cache */ + for (size_t i = 0; i < sf_topology_cache_num_entries; i++) + if (topology == sf_topology_cache[i]) + topology_cached = TRUE; + HDassert(topology_cached); + } +#endif + + topology->ioc_idx = -1; + topology->n_io_concentrators = 0; if (topology->app_layout) { HDfree(topology->app_layout->layout); @@ -720,9 +439,134 @@ H5_free_subfiling_topology(sf_topology_t *topology) HDfree(topology->io_concentrators); topology->io_concentrators = NULL; + if (H5_mpi_comm_free(&topology->app_comm) < 0) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator"); + HDfree(topology); - return SUCCEED; + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: H5_open_subfiling_stub_file + * + * Purpose: Opens the stub file for an HDF5 file created with the + * Subfiling VFD. This stub file only contains some superblock + * metadata that can allow HDF5 applications to determine that + * the file is an HDF5 file and was created with the Subfiling + * VFD. + * + * This routine is collective across `file_comm`; once the + * stub file has been opened, the inode value for the file is + * retrieved and broadcasted to all MPI ranks in `file_comm` + * for future use. + * + * To avoid unnecessary overhead from a large-scale file open, + * this stub file is currently only opened on MPI rank 0. Note + * that this assumes that all the relevant metadata will be + * written from MPI rank 0. 
This should be fine for now since + * the HDF file signature and Subfiling driver info is really + * all that's needed, but this should be revisited since the + * file metadata can and will come from other MPI ranks as + * well. + * + * Return: Non-negative on success/Negative on failure + *------------------------------------------------------------------------- + */ +herr_t +H5_open_subfiling_stub_file(const char *name, unsigned flags, MPI_Comm file_comm, H5FD_t **file_ptr, + uint64_t *file_id) +{ + H5P_genplist_t *plist = NULL; + uint64_t stub_file_id = UINT64_MAX; + hbool_t bcasted_inode = FALSE; + H5FD_t *stub_file = NULL; + hid_t fapl_id = H5I_INVALID_HID; + int mpi_rank = 0; + int mpi_size = 1; + int mpi_code; + herr_t ret_value = SUCCEED; + + if (!name) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling stub file name"); + if (file_comm == MPI_COMM_NULL) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid MPI communicator"); + if (!file_id) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL file ID pointer"); + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(file_comm, &mpi_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_comm, &mpi_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + + if (!file_ptr && (mpi_rank == 0)) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL stub file pointer"); + + /* Open stub file on MPI rank 0 only */ + if (mpi_rank == 0) { + h5_stat_t st; + MPI_Comm stub_comm = MPI_COMM_SELF; + MPI_Info stub_info = MPI_INFO_NULL; + + if ((fapl_id = H5P_create_id(H5P_CLS_FILE_ACCESS_g, FALSE)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL, "can't create FAPL for stub file"); + if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS))) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access property list"); + + /* Use MPI I/O driver 
for stub file to allow access to vector I/O */ + if (H5P_set(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &stub_comm) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI communicator"); + if (H5P_set(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &stub_info) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI info object"); + if (H5P_set_driver(plist, H5FD_MPIO, NULL, NULL) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI I/O driver on FAPL"); + + if (NULL == (stub_file = H5FD_open(name, flags, fapl_id, HADDR_UNDEF))) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "couldn't open HDF5 stub file"); + + HDcompile_assert(sizeof(uint64_t) >= sizeof(ino_t)); + + /* Retrieve Inode value for stub file */ + if (HDstat(name, &st) < 0) { + stub_file_id = UINT64_MAX; + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "couldn't stat HDF5 stub file, errno = %d, error message = '%s'", errno, + HDstrerror(errno)); + } + else + stub_file_id = (uint64_t)st.st_ino; + } + + bcasted_inode = TRUE; + + if (mpi_size > 1) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&stub_file_id, 1, MPI_UINT64_T, 0, file_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + + if (stub_file_id == UINT64_MAX) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "couldn't get inode value for HDF5 stub file"); + + if (file_ptr) + *file_ptr = stub_file; + *file_id = stub_file_id; + +done: + if (fapl_id >= 0 && H5I_dec_ref(fapl_id) < 0) + H5_SUBFILING_DONE_ERROR(H5E_ID, H5E_CANTDEC, FAIL, "can't close FAPL ID"); + + if (ret_value < 0) { + if (!bcasted_inode && (mpi_size > 1)) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&stub_file_id, 1, MPI_UINT64_T, 0, file_comm))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + if (stub_file) { + if (H5FD_close(stub_file) < 0) + H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "couldn't close HDF5 stub file"); + } + } + + 
H5_SUBFILING_FUNC_LEAVE; } /*------------------------------------------------------------------------- @@ -752,16 +596,12 @@ H5_free_subfiling_topology(sf_topology_t *topology) * Changes: Initial Version/None. *------------------------------------------------------------------------- */ -/* TODO: revise description */ herr_t -H5_open_subfiles(const char *base_filename, void *file_handle, - H5FD_subfiling_shared_config_t *subfiling_config, int file_acc_flags, MPI_Comm file_comm, - int64_t *context_id_out) +H5_open_subfiles(const char *base_filename, uint64_t file_id, H5FD_subfiling_params_t *subfiling_config, + int file_acc_flags, MPI_Comm file_comm, int64_t *context_id_out) { subfiling_context_t *sf_context = NULL; int64_t context_id = -1; - int l_errors = 0; - int g_errors = 0; int mpi_code; herr_t ret_value = SUCCEED; @@ -775,20 +615,13 @@ H5_open_subfiles(const char *base_filename, void *file_handle, H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "invalid subfiling context ID pointer"); /* Initialize new subfiling context ID based on configuration information */ - if (init_subfiling(subfiling_config, file_comm, &context_id) < 0) + if (init_subfiling(base_filename, file_id, subfiling_config, file_acc_flags, file_comm, &context_id) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize subfiling context"); /* Retrieve the subfiling object for the newly-created context ID */ if (NULL == (sf_context = H5_get_subfiling_object(context_id))) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't get subfiling object from context ID"); - /* Save some basic things in the new subfiling context */ - sf_context->h5_file_handle = file_handle; - - if (NULL == (sf_context->h5_filename = HDstrdup(base_filename))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate space for subfiling filename"); - /* * If we're actually using the IOCs, we will * start the service threads on the identified @@ -802,7 +635,6 
@@ H5_open_subfiles(const char *base_filename, void *file_handle, struct tm *tm = NULL; time_t cur_time; int mpi_rank; - int mpi_code; /* Open debugging logfile */ @@ -825,24 +657,30 @@ H5_open_subfiles(const char *base_filename, void *file_handle, *context_id_out = context_id; done: - if (ret_value < 0) { - l_errors = 1; - } - /* * Form consensus on whether opening subfiles was * successful */ - if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&l_errors, &g_errors, 1, MPI_INT, MPI_SUM, file_comm))) - H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Allreduce failed", mpi_code); + { + int mpi_size = -1; + int err_result = (ret_value < 0); + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_comm, &mpi_size))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + + if (mpi_size > 1) { + if (MPI_SUCCESS != + (mpi_code = MPI_Allreduce(MPI_IN_PLACE, &err_result, 1, MPI_INT, MPI_MAX, file_comm))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Allreduce failed", mpi_code); + } - if (g_errors > 0) { - H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, - "one or more IOC ranks couldn't open subfiles"); + if (err_result) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, + "one or more IOC ranks couldn't open subfiles"); } if (ret_value < 0) { - clear_fid_map_entry(file_handle, context_id); + clear_fid_map_entry(file_id, context_id); if (context_id >= 0 && H5_free_subfiling_object(context_id) < 0) H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object"); @@ -873,54 +711,175 @@ done: ------------------------------------------------------------------------- */ static herr_t -init_subfiling(H5FD_subfiling_shared_config_t *subfiling_config, MPI_Comm comm, int64_t *context_id_out) +init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_params_t *subfiling_config, + int file_acc_flags, MPI_Comm comm, int64_t *context_id_out) { - subfiling_context_t *new_context = NULL; - sf_topology_t *app_topology = NULL; - int64_t 
context_id = -1; - int file_index = -1; - herr_t ret_value = SUCCEED; + subfiling_context_t *new_context = NULL; + sf_topology_t *app_topology = NULL; + MPI_Comm node_comm = MPI_COMM_NULL; + int64_t context_id = -1; + FILE *config_file = NULL; + char *file_basename = NULL; + char *subfile_dir = NULL; + int mpi_rank; + int mpi_size; + int mpi_code; + herr_t ret_value = SUCCEED; HDassert(context_id_out); - file_index = get_next_fid_map_index(); - HDassert(file_index >= 0); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); /* Use the file's index to create a new subfiling context ID */ - if ((context_id = H5_new_subfiling_object_id(SF_CONTEXT, file_index)) < 0) + if ((context_id = H5_new_subfiling_object_id(SF_CONTEXT)) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't create new subfiling context ID"); /* Create a new subfiling context object with the created context ID */ if (NULL == (new_context = H5_get_subfiling_object(context_id))) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't create new subfiling object"); + new_context->sf_context_id = -1; + new_context->topology = NULL; + new_context->sf_msg_comm = MPI_COMM_NULL; + new_context->sf_data_comm = MPI_COMM_NULL; + new_context->sf_eof_comm = MPI_COMM_NULL; + new_context->sf_node_comm = MPI_COMM_NULL; + new_context->sf_group_comm = MPI_COMM_NULL; + + /* + * If there's an existing subfiling configuration file for + * this file, read the stripe size and number of subfiles + * from it + */ + if (0 == (file_acc_flags & O_CREAT)) { + int64_t config[2] = {0, 0}; /* {stripe size, num subfiles} */ + + if (mpi_rank == 0) { + /* TODO: currently no support for subfile prefix */ + if (H5_dirname(base_filename, &subfile_dir) < 0) + config[0] = -1; + + if (config[0] >= 0) { + 
if (H5_basename(base_filename, &file_basename) < 0) + config[0] = -1; + } + + if (config[0] >= 0) { + if (open_config_file(file_basename, subfile_dir, file_id, "r", &config_file) < 0) + config[0] = -1; + } + + if (config[0] >= 0) { + if (!config_file) + config[0] = -2; /* No config file; use setting from configuration */ + else { + /* + * If a subfiling configuration file exists and we aren't truncating + * it, read the number of subfiles used at file creation time. + */ + if (H5_get_subfiling_config_from_file(config_file, &config[0], &config[1]) < 0) + config[0] = -1; + } + } + } + + if (mpi_size > 1) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(config, 2, MPI_INT64_T, 0, comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + + /* + * Override the stripe size and stripe count settings in the + * application's subfiling configuration if we read values + * from an existing subfiling configuration file + */ + if (config[0] == -1) + H5_SUBFILING_GOTO_ERROR( + H5E_FILE, H5E_CANTOPENFILE, FAIL, + "lead process couldn't read the number of subfiles from subfiling configuration file"); + else { + if (config[0] > 0) + subfiling_config->stripe_size = config[0]; + if (config[1] > 0) { + H5_CHECK_OVERFLOW(config[1], int64_t, int32_t); + subfiling_config->stripe_count = (int32_t)config[1]; + } + } + } + else { + char *env_value = NULL; + + /* Check for a subfiling stripe size setting from the environment */ + if ((env_value = HDgetenv(H5FD_SUBFILING_STRIPE_SIZE))) { + long long stripe_size = -1; + + errno = 0; + + stripe_size = HDstrtoll(env_value, NULL, 0); + if (ERANGE == errno) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "invalid stripe size setting for " H5FD_SUBFILING_STRIPE_SIZE); + + if (stripe_size > 0) { + subfiling_config->stripe_size = (int64_t)stripe_size; + } + } + } + +#if H5_CHECK_MPI_VERSION(3, 0) + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank 
failed", mpi_code); + + /* Create an MPI sub-communicator for intra-node communications */ + if (MPI_SUCCESS != + (mpi_code = MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, mpi_rank, MPI_INFO_NULL, &node_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_split_type failed", mpi_code); + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(node_comm, MPI_ERRORS_RETURN))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code); +#else +#error "MPI-3 required for MPI_Comm_split_type" +#endif /* * Setup the application topology information, including the computed * number and distribution map of the set of I/O concentrators */ - if (init_app_topology(subfiling_config->ioc_selection, comm, &app_topology) < 0) + if (init_app_topology(subfiling_config, comm, node_comm, &app_topology) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application topology"); new_context->sf_context_id = context_id; - if (init_subfiling_context(new_context, subfiling_config, app_topology, comm) < 0) + if (init_subfiling_context(new_context, base_filename, file_id, subfiling_config, app_topology, comm) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize subfiling application topology object"); - - new_context->sf_base_addr = 0; - if (new_context->topology->rank_is_ioc) { - new_context->sf_base_addr = - (int64_t)(new_context->topology->subfile_rank * new_context->sf_stripe_size); - } + new_context->sf_node_comm = node_comm; *context_id_out = context_id; done: + if (config_file && (EOF == HDfclose(config_file))) + H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, + "couldn't close subfiling configuration file"); + + H5MM_free(file_basename); + H5MM_free(subfile_dir); + if (ret_value < 0) { - HDfree(app_topology); + if (app_topology) { + if (H5_free_subfiling_topology(app_topology) < 0) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling topology"); + } + + if 
(H5_mpi_comm_free(&node_comm) < 0) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free MPI communicator"); if (context_id >= 0 && H5_free_subfiling_object(context_id) < 0) H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object"); + + *context_id_out = -1; } H5_SUBFILING_FUNC_LEAVE; @@ -929,76 +888,89 @@ done: /*------------------------------------------------------------------------- * Function: init_app_topology * - * Purpose: Once a sorted collection of hostid/mpi_rank tuples has been - * created and the number of unique hostids (nodes) has - * been determined, we may modify this "default" value for - * the number of IO Concentrators for this application. + * Purpose: Determine the topology of the application so that MPI ranks + * can be assigned as I/O concentrators. The default is to use + * 1 MPI rank per node as an I/O concentrator, but this can be + * changed by the application's subfiling configuration, or by + * an environment variable (H5FD_SUBFILING_IOC_PER_NODE). * - * The default of one(1) IO concentrator per node can be - * changed (principally for testing) by environment variable. - * if IOC_COUNT_PER_NODE is defined, then that integer value - * is utilized as a multiplier to modify the set of - * IO Concentrator ranks. - * - * The cached results will be replicated within the - * subfiling_context_t structure and is utilized as a map from - * io concentrator rank to MPI communicator rank for message - * sends and receives. - * - * Return: The number of IO Concentrator ranks. We also cache - * the MPI ranks in the 'io_concentrator' vector variable. - * The length of this vector is cached as 'n_io_concentrators'. - * Errors: MPI_Abort if memory cannot be allocated. - * - * Programmer: Richard Warren - * 7/17/2020 - * - * Changes: - Initial Version/None. - * - Updated the API to allow a variety of methods for - * determining the number and MPI ranks that will have - * IO Concentrators. 
The default approach will define - * a single IOC per node. + * Return: Non-negative on success/Negative on failure * *------------------------------------------------------------------------- */ static herr_t -init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm, +init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm, sf_topology_t **app_topology_out) { - sf_topology_t *app_topology = NULL; - app_layout_t *app_layout = NULL; - char *env_value = NULL; - char *ioc_sel_str = NULL; - long ioc_select_val = -1; - long iocs_per_node = 1; - int ioc_count = 0; - int comm_rank; - int comm_size; - int mpi_code; - herr_t ret_value = SUCCEED; + H5FD_subfiling_ioc_select_t ioc_selection_type; + sf_topology_t *app_topology = NULL; + int64_t topology_id = -1; + char *env_value = NULL; + char *ioc_sel_str = NULL; + long ioc_select_val = -1; + long iocs_per_node = 1; + int ioc_count = 0; + int rank_multiple = 1; + int comm_rank; + int comm_size; + int mpi_code; + herr_t ret_value = SUCCEED; + HDassert(subfiling_config); HDassert(MPI_COMM_NULL != comm); + HDassert(MPI_COMM_NULL != node_comm); HDassert(app_topology_out); HDassert(!*app_topology_out); if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &comm_rank))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); - if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &comm_size))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + ioc_selection_type = subfiling_config->ioc_selection; + /* Check if an IOC selection type was specified by environment variable */ if (get_ioc_selection_criteria_from_env(&ioc_selection_type, &ioc_sel_str) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't get IOC selection type from environment"); - /* Sanity checking on different IOC selection strategies */ + /* + * Check parameters for the specified IOC selection strategy + * and determine the maximum number of I/O concentrators + */ 
switch (ioc_selection_type) { - case SELECT_IOC_EVERY_NTH_RANK: { - errno = 0; + case SELECT_IOC_ONE_PER_NODE: { + if (comm_size > 1) { + /* Check for an IOC-per-node value set in the environment */ + if ((env_value = HDgetenv(H5FD_SUBFILING_IOC_PER_NODE))) { + errno = 0; + ioc_select_val = HDstrtol(env_value, NULL, 0); + if ((ERANGE == errno)) { + HDprintf("invalid value '%s' for " H5FD_SUBFILING_IOC_PER_NODE "\n", env_value); + ioc_select_val = 1; + } + + if (ioc_select_val > 0) + iocs_per_node = ioc_select_val; + } + } + /* IOC count will be adjusted after number of nodes is determined */ + H5_CHECK_OVERFLOW(iocs_per_node, long, int); + ioc_count = (int)iocs_per_node; + + break; + } + + case SELECT_IOC_EVERY_NTH_RANK: { + /* + * User specifies a rank multiple value. Selection starts + * with rank 0 and then the user-specified stride is applied + * to identify other IOC ranks. + */ ioc_select_val = 1; if (ioc_sel_str) { + errno = 0; ioc_select_val = HDstrtol(ioc_sel_str, NULL, 0); if ((ERANGE == errno) || (ioc_select_val <= 0)) { HDprintf("invalid IOC selection strategy string '%s' for strategy " @@ -1009,137 +981,655 @@ init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm, } } - break; - } + H5_CHECK_OVERFLOW(ioc_select_val, long, int); + ioc_count = (comm_size / (int)ioc_select_val); + + if ((comm_size % ioc_select_val) != 0) { + ioc_count++; + } + + break; + } + + case SELECT_IOC_TOTAL: { + /* + * User specifies a total number of I/O concentrators. + * Starting with rank 0, a stride of (mpi_size / total) + * is applied to identify other IOC ranks. 
+ */ + ioc_select_val = 1; + if (ioc_sel_str) { + errno = 0; + ioc_select_val = HDstrtol(ioc_sel_str, NULL, 0); + if ((ERANGE == errno) || (ioc_select_val <= 0) || (ioc_select_val >= comm_size)) { + HDprintf("invalid IOC selection strategy string '%s' for strategy SELECT_IOC_TOTAL; " + "defaulting to SELECT_IOC_ONE_PER_NODE\n", + ioc_sel_str); + ioc_select_val = 1; + ioc_selection_type = SELECT_IOC_ONE_PER_NODE; + } + } + + H5_CHECK_OVERFLOW(ioc_select_val, long, int); + ioc_count = (int)ioc_select_val; + + rank_multiple = (comm_size / ioc_count); + + break; + } + + case SELECT_IOC_WITH_CONFIG: + default: + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid IOC selection strategy"); + break; + } + + /* + * TODO: A different IOC selection string from the environment than what was + * used originally will cause the IOCs to be assigned differently than + * expected. While this generally shouldn't cause issues (other than + * for the SELECT_IOC_TOTAL case), this should still be dealt with + * eventually. 
+ */ + /* Check the subfiling topology cache to see if there's a matching object */ + if (find_cached_topology_info(comm, subfiling_config, iocs_per_node, &app_topology) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "can't check for cached subfiling topology object"); + HDassert(!app_topology || (app_topology->selection_type == ioc_selection_type)); + + if (!app_topology) { + /* Generate an ID for the application topology object */ + if ((topology_id = H5_new_subfiling_object_id(SF_TOPOLOGY)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get ID for subfiling topology object"); + + /* Get a new application topology object from the cache */ + if (NULL == (app_topology = H5_get_subfiling_object(topology_id))) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling topology object"); + app_topology->app_layout = NULL; + app_topology->app_comm = MPI_COMM_NULL; + app_topology->rank_is_ioc = FALSE; + app_topology->ioc_idx = -1; + app_topology->n_io_concentrators = ioc_count; + app_topology->io_concentrators = NULL; + app_topology->selection_type = ioc_selection_type; + + if (H5_mpi_comm_dup(comm, &app_topology->app_comm) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTCOPY, FAIL, "can't duplicate MPI communicator"); + + if (init_app_layout(app_topology, comm, node_comm) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application layout"); + HDassert(app_topology->app_layout); + HDassert(app_topology->app_layout->layout); + HDassert(app_topology->app_layout->node_ranks); + HDassert(app_topology->app_layout->node_count > 0); + + /* + * Now that the application node count has been determined, adjust the + * number of I/O concentrators for the SELECT_IOC_ONE_PER_NODE case + */ + if (app_topology->selection_type == SELECT_IOC_ONE_PER_NODE) + app_topology->n_io_concentrators = (int)iocs_per_node * app_topology->app_layout->node_count; + + /* + * Make sure the number of I/O concentrators 
doesn't + * exceed the specified number of subfiles + */ + if (subfiling_config->stripe_count != H5FD_SUBFILING_DEFAULT_STRIPE_COUNT) { + if (app_topology->n_io_concentrators > subfiling_config->stripe_count) + app_topology->n_io_concentrators = subfiling_config->stripe_count; + } + + /* + * Determine which ranks are I/O concentrator ranks, based on the + * given IOC selection strategy and MPI information. + */ + if (identify_ioc_ranks(app_topology, rank_multiple) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, + "couldn't determine which MPI ranks are I/O concentrators"); + } + + *app_topology_out = app_topology; + +done: + if (ret_value < 0) { + if (app_topology && (topology_id >= 0)) { + if (H5_free_subfiling_object(topology_id) < 0) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free subfiling topology object"); + } + } + + H5_SUBFILING_FUNC_LEAVE; +} + +/* +------------------------------------------------------------------------- + Programmer: Richard Warren + Purpose: Return a character string which represents either the + default selection method: SELECT_IOC_ONE_PER_NODE; or + if the user has selected a method via the environment + variable (H5FD_SUBFILING_IOC_SELECTION_CRITERIA), we + return that along with any optional qualifier with for + that method. + + Errors: None. 
+ + Revision History -- Initial implementation +------------------------------------------------------------------------- +*/ +static herr_t +get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type, char **ioc_sel_info_str) +{ + char *opt_value = NULL; + char *env_value = HDgetenv(H5FD_SUBFILING_IOC_SELECTION_CRITERIA); + herr_t ret_value = SUCCEED; + + HDassert(ioc_selection_type); + HDassert(ioc_sel_info_str); + + *ioc_sel_info_str = NULL; + + if (env_value) { + long check_value; + + /* + * For non-default options, the environment variable + * should have the following form: integer:[integer|string] + * In particular, EveryNthRank == 1:64 or every 64 ranks assign an IOC + * or WithConfig == 2:/ + */ + if ((opt_value = HDstrchr(env_value, ':'))) + *opt_value++ = '\0'; + + errno = 0; + check_value = HDstrtol(env_value, NULL, 0); + + if (errno == ERANGE) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "couldn't parse value from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA + " environment variable"); + + if ((check_value < 0) || (check_value >= ioc_selection_options)) + H5_SUBFILING_GOTO_ERROR( + H5E_VFL, H5E_BADVALUE, FAIL, + "invalid IOC selection type value %ld from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA + " environment variable", + check_value); + + *ioc_selection_type = (H5FD_subfiling_ioc_select_t)check_value; + *ioc_sel_info_str = opt_value; + } + +done: + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: find_cached_topology_info + * + * Purpose: Given an MPI communicator and IOC selection strategy, + * checks the subfiling topology cached to see if any matching + * topology objects have been cached. 
+ * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +static herr_t +find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config, long iocs_per_node, + sf_topology_t **app_topology) +{ + H5FD_subfiling_ioc_select_t ioc_selection_type; + int32_t stripe_count; + herr_t ret_value = SUCCEED; + + HDassert(subf_config); + + ioc_selection_type = subf_config->ioc_selection; + stripe_count = subf_config->stripe_count; + + for (size_t i = 0; i < sf_topology_cache_num_entries; i++) { + sf_topology_t *cached_topology = sf_topology_cache[i]; + int result; + int mpi_code; + + HDassert(cached_topology); + + /* + * If the selection types differ, just reject the cached topology + * for now rather than checking if the mapping is equivalent + */ + if (ioc_selection_type != cached_topology->selection_type) + continue; + + /* + * If the number of I/O concentrators in the cached topology + * is greater than the specified target number of subfiles, + * reject the cached topology + */ + if (stripe_count != H5FD_SUBFILING_DEFAULT_STRIPE_COUNT) { + if (stripe_count < cached_topology->n_io_concentrators) + continue; + } + + if (cached_topology->selection_type == SELECT_IOC_ONE_PER_NODE) { + HDassert(iocs_per_node >= 1); + HDassert(cached_topology->app_layout->node_count > 0); + + /* + * If a IOCs-per-node setting was set in the environment and would + * cause the application topology to differ from the cached topology + * we found, don't reuse the cached topology + */ + if (cached_topology->n_io_concentrators != + (iocs_per_node * cached_topology->app_layout->node_count)) + continue; + } + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_compare(comm, cached_topology->app_comm, &result))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_compare failed", mpi_code); + + if (MPI_IDENT == result || MPI_CONGRUENT == result) { + *app_topology = cached_topology; + break; + } + } + +done: + 
H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: init_app_layout + * + * Purpose: Determines the layout of MPI ranks across nodes in order to + * figure out the final application topology + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +static herr_t +init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm) +{ + app_layout_t *app_layout = NULL; + int mpi_code; + herr_t ret_value = SUCCEED; + + HDassert(app_topology); + HDassert(!app_topology->app_layout); + HDassert(MPI_COMM_NULL != comm); + HDassert(MPI_COMM_NULL != node_comm); + + if (NULL == (app_layout = HDcalloc(1, sizeof(*app_layout)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "couldn't allocate application layout structure"); + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &app_layout->world_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &app_layout->world_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(node_comm, &app_layout->node_local_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(node_comm, &app_layout->node_local_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + + if (NULL == (app_layout->layout = HDmalloc((size_t)app_layout->world_size * sizeof(*app_layout->layout)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "couldn't allocate application layout array"); + + /* Gather the list of layout_t pairs to all ranks */ + if (gather_topology_info(app_layout, comm, node_comm) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "can't gather application topology info"); + + /* Sort the list 
according to the node local lead rank values */ + HDqsort(app_layout->layout, (size_t)app_layout->world_size, sizeof(layout_t), compare_layout_nodelocal); + + /* + * Count the number of nodes by checking how many + * entries have a node local rank value of 0 + */ + app_layout->node_count = 0; + for (size_t i = 0; i < (size_t)app_layout->world_size; i++) + if (app_layout->layout[i].node_local_rank == 0) + app_layout->node_count++; + + HDassert(app_layout->node_count > 0); + + if (NULL == + (app_layout->node_ranks = HDmalloc((size_t)app_layout->node_count * sizeof(*app_layout->node_ranks)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "couldn't allocate application layout node rank array"); + + /* + * Record the rank value of the "lead" + * MPI rank on each node for later use + */ + for (size_t i = 0, node_rank_index = 0; i < (size_t)app_layout->world_size; i++) { + if (app_layout->layout[i].node_local_rank == 0) { + HDassert(node_rank_index < (size_t)app_layout->node_count); + app_layout->node_ranks[node_rank_index++] = app_layout->layout[i].rank; + } + } + + app_topology->app_layout = app_layout; + +done: + if (ret_value < 0) { + if (app_layout) { + HDfree(app_layout->layout); + HDfree(app_layout->node_ranks); + HDfree(app_layout); + } + } + + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: gather_topology_info + * + * Purpose: Collectively generate a list of layout_t structures + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +static herr_t +gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm) +{ + MPI_Group file_group = MPI_GROUP_NULL; + MPI_Group node_group = MPI_GROUP_NULL; + layout_t my_layout_info; + layout_t *layout_info_partial = NULL; + MPI_Comm aggr_comm = MPI_COMM_NULL; + int *recv_counts = NULL; + int *recv_displs = NULL; + int 
sf_world_size; + int sf_world_rank; + int node_local_rank; + int node_local_size; + int mpi_code; + herr_t ret_value = SUCCEED; + + HDassert(app_layout); + HDassert(app_layout->layout); + HDassert(MPI_COMM_NULL != comm); + + sf_world_rank = app_layout->world_rank; + sf_world_size = app_layout->world_size; + node_local_rank = app_layout->node_local_rank; + node_local_size = app_layout->node_local_size; + + my_layout_info.rank = sf_world_rank; + my_layout_info.node_local_rank = node_local_rank; + my_layout_info.node_local_size = node_local_size; + + /* + * Get the rank value for the "lead" rank on this + * rank's node so that we can group the layout_t + * information for all node-local ranks together + */ + { + const int local_lead = 0; + int lead_rank; + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_group(comm, &file_group))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_group failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_group(intra_comm, &node_group))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_group failed", mpi_code); + if (MPI_SUCCESS != + (mpi_code = MPI_Group_translate_ranks(node_group, 1, &local_lead, file_group, &lead_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Group_translate_ranks failed", mpi_code); + + if (MPI_UNDEFINED == lead_rank) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't determine lead rank on node"); + + my_layout_info.node_lead_rank = lead_rank; + + if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&node_group))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Group_free failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&file_group))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Group_free failed", mpi_code); + } - case SELECT_IOC_WITH_CONFIG: - HDprintf("SELECT_IOC_WITH_CONFIG IOC selection strategy not supported yet; defaulting to " - "SELECT_IOC_ONE_PER_NODE\n"); - ioc_selection_type = SELECT_IOC_ONE_PER_NODE; - break; + app_layout->layout[sf_world_rank] = my_layout_info; - case SELECT_IOC_TOTAL: { - 
errno = 0; + if (sf_world_size > 1) { +#ifdef H5_SUBFILING_PREFER_ALLGATHER_TOPOLOGY + (void)intra_comm; - ioc_select_val = 1; - if (ioc_sel_str) { - ioc_select_val = HDstrtol(ioc_sel_str, NULL, 0); - if ((ERANGE == errno) || (ioc_select_val <= 0) || (ioc_select_val >= comm_size)) { - HDprintf("invalid IOC selection strategy string '%s' for strategy SELECT_IOC_TOTAL; " - "defaulting to SELECT_IOC_ONE_PER_NODE\n", - ioc_sel_str); - ioc_select_val = 1; - ioc_selection_type = SELECT_IOC_ONE_PER_NODE; - } - } + if (MPI_SUCCESS != + (mpi_code = MPI_Allgather(&my_layout_info, 4, MPI_INT, app_layout->layout, 4, MPI_INT, comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code); +#else + int aggr_comm_size = 0; - break; + HDassert(MPI_COMM_NULL != intra_comm); + + /* Split the file communicator into a sub-group of one rank per node */ + if (MPI_SUCCESS != (mpi_code = MPI_Comm_split(comm, node_local_rank, sf_world_rank, &aggr_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_split failed", mpi_code); + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(aggr_comm, &aggr_comm_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + + /* Allocate a partial layout info array to aggregate into from node-local ranks */ + if (node_local_rank == 0) { + if (NULL == + (layout_info_partial = HDmalloc((size_t)node_local_size * sizeof(*layout_info_partial)))) + /* Push error, but participate in gather operation */ + H5_SUBFILING_DONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "can't allocate layout info array"); } - default: - break; + /* Gather node-local layout info to single master rank on each node */ + if (MPI_SUCCESS != (mpi_code = MPI_Gather(&my_layout_info, 4, MPI_INT, layout_info_partial, 4, + MPI_INT, 0, intra_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code); + + /* Gather total layout info from/to each master rank on each node */ + if (node_local_rank == 0) { + int send_size = 4 * node_local_size; + + if 
(NULL == (recv_counts = HDmalloc((size_t)aggr_comm_size * sizeof(*recv_counts)))) + H5_SUBFILING_DONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "can't allocate receive counts array"); + if (NULL == (recv_displs = HDmalloc((size_t)aggr_comm_size * sizeof(*recv_displs)))) + H5_SUBFILING_DONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "can't allocate receive displacements array"); + + if (MPI_SUCCESS != + (mpi_code = MPI_Allgather(&send_size, 1, MPI_INT, recv_counts, 1, MPI_INT, aggr_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code); + + recv_displs[0] = 0; + for (int i = 1; i < aggr_comm_size; i++) + recv_displs[i] = recv_displs[i - 1] + recv_counts[i - 1]; + + if (MPI_SUCCESS != + (mpi_code = MPI_Allgatherv(layout_info_partial, send_size, MPI_INT, app_layout->layout, + recv_counts, recv_displs, MPI_INT, aggr_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgatherv failed", mpi_code); + + HDfree(recv_displs); + HDfree(recv_counts); + recv_displs = NULL; + recv_counts = NULL; + } + + /* + * Each master rank on each node distributes the total + * layout info back to other node-local ranks + */ + if (MPI_SUCCESS != + (mpi_code = MPI_Bcast(app_layout->layout, 4 * sf_world_size, MPI_INT, 0, intra_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); +#endif } - /* Allocate new application topology information object */ - if (NULL == (app_topology = HDcalloc(1, sizeof(*app_topology)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't create new subfiling topology object"); +done: + HDfree(recv_displs); + HDfree(recv_counts); + HDfree(layout_info_partial); - app_topology->subfile_rank = -1; - app_topology->selection_type = ioc_selection_type; + if (H5_mpi_comm_free(&aggr_comm) < 0) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator"); - if (NULL == (app_topology->io_concentrators = HDcalloc((size_t)comm_size, sizeof(int)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, 
H5E_CANTALLOC, FAIL, - "couldn't allocate array of I/O concentrator ranks"); + if (node_group != MPI_GROUP_NULL) + if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&node_group))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Group_free failed", mpi_code); + if (file_group != MPI_GROUP_NULL) + if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&file_group))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Group_free failed", mpi_code); - if (!app_layout) { - if (NULL == (app_layout = HDcalloc(1, sizeof(*app_layout)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate application layout structure"); + H5_SUBFILING_FUNC_LEAVE; +} - if (NULL == (app_layout->node_ranks = HDcalloc(1, ((size_t)comm_size + 1) * sizeof(int)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate application layout node rank array"); +/*------------------------------------------------------------------------- + * Function: compare_layout_nodelocal + * + * Purpose: Qsort sorting callback that sorts layout_t structures + * according to their node local lead MPI rank values. 
Ties + * are broken according to their regular node local MPI rank + * values + * + *------------------------------------------------------------------------- + */ +static int +compare_layout_nodelocal(const void *layout1, const void *layout2) +{ + const layout_t *l1 = (const layout_t *)layout1; + const layout_t *l2 = (const layout_t *)layout2; - if (NULL == (app_layout->layout = HDcalloc(1, ((size_t)comm_size + 1) * sizeof(layout_t)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate application layout array"); + if (l1->node_lead_rank == l2->node_lead_rank) { + return (l1->node_local_rank > l2->node_local_rank) - (l1->node_local_rank < l2->node_local_rank); } + else + return (l1->node_lead_rank > l2->node_lead_rank) - (l1->node_lead_rank < l2->node_lead_rank); +} + +/*------------------------------------------------------------------------- + * Function: identify_ioc_ranks + * + * Purpose: We've already identified the number of unique nodes and + * have a sorted list of layout_t structures. Under normal + * conditions, we only utilize a single IOC per node. Under + * that circumstance, we only need to fill the + * io_concentrators vector from the node_ranks array (which + * contains the index into the layout array of lowest MPI rank + * on each node) into the io_concentrators vector; Otherwise, + * while determining the number of local ranks per node, we + * can also select one or more additional IOCs. + * + * As a side effect, we fill the 'io_concentrators' vector + * and set the 'rank_is_ioc' flag to TRUE if our rank is + * identified as owning an I/O Concentrator (IOC). 
+ * + *------------------------------------------------------------------------- + */ +static herr_t +identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride) +{ + app_layout_t *app_layout = NULL; + int *io_concentrators = NULL; + int max_iocs = 0; + herr_t ret_value = SUCCEED; - app_layout->world_size = comm_size; - app_layout->world_rank = comm_rank; + HDassert(app_topology); + HDassert(!app_topology->io_concentrators); + HDassert(app_topology->n_io_concentrators > 0); + HDassert(app_topology->app_layout); + HDassert(app_topology->app_layout->layout); + HDassert(app_topology->app_layout->node_count > 0); - app_topology->app_layout = app_layout; + app_layout = app_topology->app_layout; - gather_topology_info(app_topology, comm); + max_iocs = app_topology->n_io_concentrators; - /* - * Determine which ranks are I/O concentrator ranks, based on the - * given IOC selection strategy and MPI information. - */ - switch (ioc_selection_type) { + if (NULL == (app_topology->io_concentrators = HDmalloc((size_t)app_topology->n_io_concentrators * + sizeof(*app_topology->io_concentrators)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "couldn't allocate array of I/O concentrator ranks"); + + io_concentrators = app_topology->io_concentrators; + + switch (app_topology->selection_type) { case SELECT_IOC_ONE_PER_NODE: { - int node_count; + int total_ioc_count = 0; + int iocs_per_node = 1; - app_topology->selection_type = SELECT_IOC_ONE_PER_NODE; + if (app_topology->n_io_concentrators > app_layout->node_count) + iocs_per_node = app_topology->n_io_concentrators / app_layout->node_count; - if ((node_count = count_nodes(app_topology, comm)) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, - "couldn't determine number of nodes used"); + HDassert(app_layout->node_ranks); - /* Check for an IOC-per-node value set in the environment */ - if ((env_value = HDgetenv(H5FD_SUBFILING_IOC_PER_NODE))) { - errno = 0; - ioc_select_val = HDstrtol(env_value, NULL, 
0); - if ((ERANGE == errno)) { - HDprintf("invalid value '%s' for " H5FD_SUBFILING_IOC_PER_NODE "\n", env_value); - ioc_select_val = 1; + for (size_t i = 0; i < (size_t)app_layout->node_count; i++) { + int node_index = app_layout->node_ranks[i]; + int local_size = app_layout->layout[node_index].node_local_size; + + HDassert(total_ioc_count < app_topology->n_io_concentrators); + io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank; + + if (app_layout->world_rank == io_concentrators[total_ioc_count]) { + app_topology->ioc_idx = total_ioc_count; + app_topology->rank_is_ioc = TRUE; } - if (ioc_select_val > 0) - iocs_per_node = ioc_select_val; - } + total_ioc_count++; - H5_CHECK_OVERFLOW(iocs_per_node, long, int); - ioc_count = identify_ioc_ranks(app_topology, node_count, (int)iocs_per_node); + for (size_t j = 1; j < (size_t)iocs_per_node; j++) { + if (total_ioc_count >= max_iocs) + break; + if (j >= (size_t)local_size) + break; - break; - } + HDassert(total_ioc_count < app_topology->n_io_concentrators); + io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank; - case SELECT_IOC_EVERY_NTH_RANK: { - /* - * User specifies a rank multiple value. Selection starts - * with rank 0 and then the user-specified stride is applied\ - * to identify other IOC ranks. 
- */ + if (app_layout->world_rank == io_concentrators[total_ioc_count]) { + app_topology->ioc_idx = total_ioc_count; + app_topology->rank_is_ioc = TRUE; + } - H5_CHECK_OVERFLOW(ioc_select_val, long, int); - ioc_count = (comm_size / (int)ioc_select_val); + total_ioc_count++; + } - if ((comm_size % ioc_select_val) != 0) { - ioc_count++; + if (total_ioc_count >= max_iocs) + break; } - assign_ioc_ranks(app_topology, ioc_count, (int)ioc_select_val); + /* Set final number of I/O concentrators after adjustments */ + app_topology->n_io_concentrators = total_ioc_count; break; } + case SELECT_IOC_EVERY_NTH_RANK: case SELECT_IOC_TOTAL: { - int rank_multiple = 0; + int world_size = app_layout->world_size; + int ioc_next = 0; - /* - * User specifies a total number of I/O concentrators. - * Starting with rank 0, a stride of (mpi_size / total) - * is applied to identify other IOC ranks. - */ + HDassert(rank_stride > 0); - H5_CHECK_OVERFLOW(ioc_select_val, long, int); - ioc_count = (int)ioc_select_val; + for (int i = 0; ioc_next < app_topology->n_io_concentrators; ioc_next++) { + int ioc_index = rank_stride * i++; - rank_multiple = (comm_size / ioc_count); + if (ioc_index >= world_size) + break; + + io_concentrators[ioc_next] = app_layout->layout[ioc_index].rank; + + if (app_layout->world_rank == io_concentrators[ioc_next]) { + app_topology->ioc_idx = ioc_next; + app_topology->rank_is_ioc = TRUE; + } + + if (ioc_next + 1 >= max_iocs) + break; + } - assign_ioc_ranks(app_topology, ioc_count, rank_multiple); + /* Set final number of I/O concentrators after adjustments */ + app_topology->n_io_concentrators = ioc_next; break; } @@ -1150,31 +1640,10 @@ init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm, break; } - HDassert(ioc_count > 0); - app_topology->n_io_concentrators = ioc_count; - - /* - * Create a vector of "potential" file descriptors - * which can be indexed by the IOC ID - */ - if (NULL == (app_topology->subfile_fd = HDcalloc((size_t)ioc_count, 
sizeof(int)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate subfile file descriptor array"); - - *app_topology_out = app_topology; - done: if (ret_value < 0) { - if (app_layout) { - HDfree(app_layout->layout); - HDfree(app_layout->node_ranks); - HDfree(app_layout); - } - if (app_topology) { - HDfree(app_topology->subfile_fd); + if (app_topology) HDfree(app_topology->io_concentrators); - HDfree(app_topology); - } } H5_SUBFILING_FUNC_LEAVE; @@ -1196,77 +1665,104 @@ done: *------------------------------------------------------------------------- */ static herr_t -init_subfiling_context(subfiling_context_t *sf_context, H5FD_subfiling_shared_config_t *subfiling_config, - sf_topology_t *app_topology, MPI_Comm file_comm) +init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename, uint64_t file_id, + H5FD_subfiling_params_t *subfiling_config, sf_topology_t *app_topology, + MPI_Comm file_comm) { char *env_value = NULL; - int comm_rank; + int mpi_rank; int mpi_code; herr_t ret_value = SUCCEED; HDassert(sf_context); HDassert(sf_context->topology == NULL); + HDassert(sf_context->sf_context_id >= 0); + HDassert(base_filename); + HDassert(file_id != UINT64_MAX); HDassert(subfiling_config); HDassert(app_topology); HDassert(app_topology->n_io_concentrators > 0); HDassert(MPI_COMM_NULL != file_comm); - sf_context->topology = app_topology; + sf_context->h5_file_id = file_id; + sf_context->sf_fids = NULL; + sf_context->sf_num_fids = 0; + sf_context->sf_num_subfiles = subfiling_config->stripe_count; + sf_context->sf_write_count = 0; + sf_context->sf_read_count = 0; + sf_context->sf_eof = HADDR_UNDEF; + sf_context->sf_stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; + sf_context->sf_base_addr = 0; sf_context->sf_msg_comm = MPI_COMM_NULL; sf_context->sf_data_comm = MPI_COMM_NULL; sf_context->sf_eof_comm = MPI_COMM_NULL; - sf_context->sf_barrier_comm = MPI_COMM_NULL; + sf_context->sf_node_comm = MPI_COMM_NULL; 
sf_context->sf_group_comm = MPI_COMM_NULL; - sf_context->sf_intercomm = MPI_COMM_NULL; - sf_context->sf_stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; - sf_context->sf_write_count = 0; - sf_context->sf_read_count = 0; - sf_context->sf_eof = HADDR_UNDEF; - sf_context->h5_file_handle = NULL; - sf_context->sf_fid = -1; sf_context->sf_group_size = 1; sf_context->sf_group_rank = 0; - sf_context->h5_filename = NULL; - sf_context->sf_filename = NULL; sf_context->subfile_prefix = NULL; + sf_context->h5_filename = NULL; sf_context->ioc_data = NULL; + sf_context->topology = app_topology; #ifdef H5_SUBFILING_DEBUG sf_context->sf_logfile = NULL; #endif + if (NULL == (sf_context->h5_filename = HDstrdup(base_filename))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "couldn't allocate space for subfiling filename"); + + /* Check for a subfile name prefix setting in the environment */ + if ((env_value = HDgetenv(H5FD_SUBFILING_SUBFILE_PREFIX))) { + if (NULL == (sf_context->subfile_prefix = HDstrdup(env_value))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix value"); + } + /* - * Set IOC stripe size from subfiling configuration, then check - * for a setting from the environment + * Set IOC stripe size from subfiling configuration */ if (subfiling_config->stripe_size > 0) sf_context->sf_stripe_size = subfiling_config->stripe_size; - if ((env_value = HDgetenv(H5FD_SUBFILING_STRIPE_SIZE))) { - long long stripe_size = -1; - - errno = 0; - - stripe_size = HDstrtoll(env_value, NULL, 0); - if (ERANGE == errno) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, - "invalid stripe size setting for " H5FD_SUBFILING_STRIPE_SIZE); - - if (stripe_size > 0) { - sf_context->sf_stripe_size = (int64_t)stripe_size; - } - } + /* + * If still set to the default, set the number of subfiles + * according to the default mapping of 1 I/O concentrator + * -> 1 subfile + */ + if (sf_context->sf_num_subfiles == 
H5FD_SUBFILING_DEFAULT_STRIPE_COUNT) + sf_context->sf_num_subfiles = app_topology->n_io_concentrators; /* * Set blocksize per stripe value after possibly adjusting - * for user-specified subfile stripe size + * for user-specified subfile stripe size and number of + * subfiles */ - sf_context->sf_blocksize_per_stripe = sf_context->sf_stripe_size * app_topology->n_io_concentrators; + sf_context->sf_blocksize_per_stripe = sf_context->sf_stripe_size * sf_context->sf_num_subfiles; - /* Check for a subfile name prefix setting in the environment */ - if ((env_value = HDgetenv(H5FD_SUBFILING_SUBFILE_PREFIX))) { - if (NULL == (sf_context->subfile_prefix = HDstrdup(env_value))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix value"); + if (app_topology->rank_is_ioc) { + int leftover_subfiles; + + /* Adjust base address after stripe size is set, if necessary */ + sf_context->sf_base_addr = (int64_t)(app_topology->ioc_idx * sf_context->sf_stripe_size); + + /* + * Calculate the number of subfiles this rank owns by + * round-robining them across the available IOCs and + * then allocate an array for the subfile IDs + */ + sf_context->sf_num_fids = sf_context->sf_num_subfiles / app_topology->n_io_concentrators; + + leftover_subfiles = sf_context->sf_num_subfiles % app_topology->n_io_concentrators; + if (leftover_subfiles && (leftover_subfiles > app_topology->ioc_idx)) + sf_context->sf_num_fids++; + + if (NULL == + (sf_context->sf_fids = HDmalloc((size_t)sf_context->sf_num_fids * sizeof(*sf_context->sf_fids)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate subfile IDs array"); + + for (int i = 0; i < sf_context->sf_num_fids; i++) + sf_context->sf_fids[i] = -1; } /* @@ -1274,7 +1770,7 @@ init_subfiling_context(subfiling_context_t *sf_context, H5FD_subfiling_shared_co * to/from IOC ranks */ - if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(file_comm, &comm_rank))) + if (MPI_SUCCESS != (mpi_code = 
MPI_Comm_rank(file_comm, &mpi_rank))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(file_comm, &sf_context->sf_msg_comm))) @@ -1295,15 +1791,9 @@ init_subfiling_context(subfiling_context_t *sf_context, H5FD_subfiling_shared_co if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(sf_context->sf_eof_comm, MPI_ERRORS_RETURN))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code); - if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(file_comm, &sf_context->sf_barrier_comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code); - - if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(sf_context->sf_barrier_comm, MPI_ERRORS_RETURN))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code); - /* Create an MPI sub-communicator for IOC ranks */ if (app_topology->n_io_concentrators > 1) { - if (MPI_SUCCESS != (mpi_code = MPI_Comm_split(file_comm, app_topology->rank_is_ioc, comm_rank, + if (MPI_SUCCESS != (mpi_code = MPI_Comm_split(file_comm, app_topology->rank_is_ioc, mpi_rank, &sf_context->sf_group_comm))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_split failed", mpi_code); @@ -1314,11 +1804,18 @@ init_subfiling_context(subfiling_context_t *sf_context, H5FD_subfiling_shared_co H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); } -done: - if (ret_value < 0) { - H5_free_subfiling_object_int(sf_context); - } + /* Perform some final validation of subfiling configuration */ + if (sf_context->sf_stripe_size <= 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid subfiling stripe size (%" PRId64 ")", + sf_context->sf_stripe_size); + + if (sf_context->sf_num_subfiles <= 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid subfiling stripe count (%d)", + sf_context->sf_num_subfiles); + + HDassert(sf_context->sf_num_subfiles >= app_topology->n_io_concentrators); +done: H5_SUBFILING_FUNC_LEAVE; } @@ -1362,37 
+1859,29 @@ open_subfile_with_context(subfiling_context_t *sf_context, int file_acc_flags) herr_t ret_value = SUCCEED; HDassert(sf_context); + HDassert(sf_context->h5_file_id != UINT64_MAX); /* - * Save the HDF5 file ID (fid) to subfile context mapping. + * Save the HDF5 file ID (e.g., inode) to subfile context mapping. * There shouldn't be any issue, but check the status and * return if there was a problem. */ - if (record_fid_to_subfile(sf_context->h5_file_handle, sf_context->sf_context_id, NULL) < 0) + if (record_fid_to_subfile(sf_context->h5_file_id, sf_context->sf_context_id, NULL) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't record HDF5 file ID to subfile context mapping"); /* * If this rank is an I/O concentrator, actually open - * the subfile belonging to this IOC rank + * the subfiles belonging to this IOC rank */ if (sf_context->topology->rank_is_ioc) { - h5_stat_t st; - - /* Retrieve Inode value for HDF5 stub file */ - if (HDstat(sf_context->h5_filename, &st) < 0) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't stat HDF5 stub file"); - - HDcompile_assert(sizeof(uint64_t) >= sizeof(ino_t)); - sf_context->h5_file_id = (uint64_t)st.st_ino; - - if (ioc_open_file(sf_context->sf_context_id, file_acc_flags) < 0) + if (ioc_open_files(sf_context->sf_context_id, file_acc_flags) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, "IOC couldn't open subfile"); } done: if (ret_value < 0) { - clear_fid_map_entry(sf_context->h5_file_handle, sf_context->sf_context_id); + clear_fid_map_entry(sf_context->h5_file_id, sf_context->sf_context_id); } H5_SUBFILING_FUNC_LEAVE; @@ -1429,29 +1918,29 @@ done: *------------------------------------------------------------------------- */ static herr_t -record_fid_to_subfile(void *file_handle, int64_t subfile_context_id, int *next_index) +record_fid_to_subfile(uint64_t file_id, int64_t subfile_context_id, int *next_index) { int index; herr_t ret_value = SUCCEED; - if 
(sf_file_map_size == 0) { + if (!sf_open_file_map) { if (NULL == (sf_open_file_map = HDmalloc((size_t)DEFAULT_FILE_MAP_ENTRIES * sizeof(*sf_open_file_map)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate open file mapping"); sf_file_map_size = DEFAULT_FILE_MAP_ENTRIES; for (int i = 0; i < sf_file_map_size; i++) { - sf_open_file_map[i].file_handle = NULL; + sf_open_file_map[i].file_id = UINT64_MAX; sf_open_file_map[i].sf_context_id = -1; } } for (index = 0; index < sf_file_map_size; index++) { - if (sf_open_file_map[index].file_handle == file_handle) + if (sf_open_file_map[index].file_id == file_id) goto done; - if (sf_open_file_map[index].file_handle == NULL) { - sf_open_file_map[index].file_handle = file_handle; + if (sf_open_file_map[index].file_id == UINT64_MAX) { + sf_open_file_map[index].file_id = file_id; sf_open_file_map[index].sf_context_id = subfile_context_id; if (next_index) { @@ -1474,14 +1963,14 @@ record_fid_to_subfile(void *file_handle, int64_t subfile_context_id, int *next_i sf_file_map_size *= 2; for (int i = index; i < sf_file_map_size; i++) { - sf_open_file_map[i].file_handle = NULL; + sf_open_file_map[i].file_id = UINT64_MAX; } if (next_index) { *next_index = index; } - sf_open_file_map[index].file_handle = file_handle; + sf_open_file_map[index].file_id = file_id; sf_open_file_map[index++].sf_context_id = subfile_context_id; } @@ -1490,13 +1979,44 @@ done: } /*------------------------------------------------------------------------- - * Function: ioc_open_file + * Function: clear_fid_map_entry + * + * Purpose: Remove the map entry associated with the file->inode. + * This is done at file close. + * + * Return: None + * Errors: Cannot fail. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
+ * + *------------------------------------------------------------------------- + */ +static void +clear_fid_map_entry(uint64_t file_id, int64_t sf_context_id) +{ + if (sf_open_file_map) { + for (int i = 0; i < sf_file_map_size; i++) { + if ((sf_open_file_map[i].file_id == file_id) && + (sf_open_file_map[i].sf_context_id == sf_context_id)) { + sf_open_file_map[i].file_id = UINT64_MAX; + sf_open_file_map[i].sf_context_id = -1; + return; + } + } + } +} /* end clear_fid_map_entry() */ + +/*------------------------------------------------------------------------- + * Function: ioc_open_files * * Purpose: This function is called by an I/O concentrator in order to - * open the subfile it is responsible for. + * open the subfiles it is responsible for. * - * The name of the subfile to be opened is generated based on - * values from either: + * The names of the subfiles to be opened are generated based + * on values from either: * * - The corresponding subfiling configuration file, if one * exists and the HDF5 file isn't being truncated @@ -1504,7 +2024,7 @@ done: * subfiling configuration file doesn't exist or the HDF5 * file is being truncated * - * After the subfile has been opened, a subfiling + * After the subfiles have been opened, a subfiling * configuration file will be created if this is a file * creation operation. If the truncate flag is specified, the * subfiling configuration file will be re-created in order to @@ -1525,210 +2045,113 @@ done: * * Changes: Initial Version/None. 
* - *------------------------------------------------------------------------- - */ -static herr_t -ioc_open_file(int64_t file_context_id, int file_acc_flags) -{ - subfiling_context_t *sf_context = NULL; - mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; - char *filepath = NULL; - char *subfile_dir = NULL; - char *base = NULL; - int fd = -1; - herr_t ret_value = SUCCEED; - - if (NULL == (sf_context = H5_get_subfiling_object(file_context_id))) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, - "couldn't get subfiling object from context ID"); - - /* Only IOC ranks should be here */ - HDassert(sf_context->topology); - HDassert(sf_context->topology->subfile_rank >= 0); - - if (NULL == (filepath = HDcalloc(1, PATH_MAX))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate space for subfile filename"); - - /* Generate the name of the subfile that this IOC rank will open */ - if (generate_subfile_name(sf_context, file_acc_flags, filepath, PATH_MAX, &base, &subfile_dir) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, "couldn't generate name for subfile"); - - if (NULL == (sf_context->sf_filename = HDstrdup(filepath))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile name"); - - /* Attempt to create/open the subfile for this IOC rank */ - if ((fd = HDopen(filepath, file_acc_flags, mode)) < 0) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "failed to open subfile"); - - sf_context->sf_fid = fd; - if (file_acc_flags & O_CREAT) - sf_context->sf_eof = 0; - - /* - * If subfiles were created (rather than simply opened), - * check if we also need to create a config file. 
- */ - if ((file_acc_flags & O_CREAT) && (sf_context->topology->subfile_rank == 0)) { - if (create_config_file(sf_context, base, subfile_dir, (file_acc_flags & O_TRUNC)) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTCREATE, FAIL, - "couldn't create subfiling configuration file"); - } - -done: - if (ret_value < 0) { - if (sf_context) { - HDfree(sf_context->sf_filename); - sf_context->sf_filename = NULL; - - if (sf_context->sf_fid >= 0) { - HDclose(sf_context->sf_fid); - sf_context->sf_fid = -1; - } - } - } - - H5MM_free(base); - H5MM_free(subfile_dir); - HDfree(filepath); - - H5_SUBFILING_FUNC_LEAVE; -} - -/* - * Generate the name of the subfile this IOC rank will open, - * based on available information. - * - * This may include: - * - the subfiling configuration (from a subfiling configuration - * file if one exists, or from the subfiling context object - * otherwise) - * - the base file's name and ID (inode or similar) - * - the IOC's rank value within the set of I/O concentrators - * - an optional filename prefix specified by the user + *------------------------------------------------------------------------- */ static herr_t -generate_subfile_name(subfiling_context_t *sf_context, int file_acc_flags, char *filename_out, - size_t filename_out_len, char **filename_basename_out, char **subfile_dir_out) +ioc_open_files(int64_t file_context_id, int file_acc_flags) { - FILE *config_file = NULL; - char *subfile_dir = NULL; - char *prefix = NULL; - char *base = NULL; - int n_io_concentrators; - int num_digits; - herr_t ret_value = SUCCEED; + subfiling_context_t *sf_context = NULL; + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + char *filepath = NULL; + char *subfile_dir = NULL; + char *base = NULL; + int num_subfiles = 0; + int num_digits = 0; + herr_t ret_value = SUCCEED; - HDassert(sf_context); + if (NULL == (sf_context = H5_get_subfiling_object(file_context_id))) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, + "couldn't get subfiling object 
from context ID"); + + HDassert(sf_context->h5_file_id != UINT64_MAX); HDassert(sf_context->h5_filename); - HDassert(filename_out); - HDassert(filename_basename_out); - HDassert(subfile_dir_out); + HDassert(sf_context->sf_fids); + HDassert(sf_context->sf_num_subfiles > 0); + HDassert(sf_context->sf_num_fids > 0); + HDassert(sf_context->topology); + HDassert(sf_context->topology->ioc_idx >= 0); /* Only IOC ranks should be here */ - *filename_basename_out = NULL; - *subfile_dir_out = NULL; + /* Get the basename of the full HDF5 filename */ + if (H5_basename(sf_context->h5_filename, &base) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get HDF5 file basename"); /* - * Initially use the number of I/O concentrators specified in the - * subfiling context. However, if there's an existing subfiling - * configuration file (and we aren't truncating it) we will use - * the number specified there instead, as that should be the actual - * number that the subfile names were originally generated with. - * The current subfiling context may have a different number of I/O - * concentrators specified; e.g. a simple serial file open for - * reading purposes (think h5dump) might only be using 1 I/O - * concentrator, whereas the file was created with several I/O - * concentrators. + * Get the directory prefix where subfiles will be placed. + * Under normal circumstances, the subfiles are co-located + * with the HDF5 file, but users may specify a different + * directory name. 
*/ - n_io_concentrators = sf_context->topology->n_io_concentrators; - - if (NULL == (prefix = HDmalloc(PATH_MAX))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate space for subfile prefix"); - - /* Under normal operation, we co-locate subfiles with the HDF5 file */ - HDstrncpy(prefix, sf_context->h5_filename, PATH_MAX - 1); - prefix[PATH_MAX - 1] = '\0'; - - if (H5_basename(prefix, &base) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get subfile basename"); - if (sf_context->subfile_prefix) { - /* Note: Users may specify a directory name which is inaccessible - * from where the current is running. In particular, "node-local" - * storage is not uniformly available to all processes. - * We would like to check if the user pathname unavailable and - * if so, we could default to creating the subfiles in the - * current directory. (?) - */ if (NULL == (subfile_dir = H5MM_strdup(sf_context->subfile_prefix))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix"); } else { - if (H5_dirname(prefix, &subfile_dir) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't get subfile prefix"); + if (H5_dirname(sf_context->h5_filename, &subfile_dir) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't get HDF5 file dirname"); } - /* - * Open the file's subfiling configuration file, if it exists and - * we aren't truncating the file. 
- */ - if (0 == (file_acc_flags & O_TRUNC)) { - if (open_config_file(sf_context, base, subfile_dir, "r", &config_file) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, - "couldn't open existing subfiling configuration file"); - } + if (NULL == (filepath = HDmalloc(PATH_MAX))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "couldn't allocate space for subfile filename"); + + num_subfiles = sf_context->sf_num_subfiles; + num_digits = (int)(HDlog10(num_subfiles) + 1); /* - * If a subfiling configuration file exists and we aren't truncating - * it, read the number of I/O concentrators used at file creation time - * in order to generate the correct subfile names. + * For each subfile this IOC rank owns, generate the name + * of the subfile and create/open it */ - if (config_file) { - if (H5_get_num_iocs_from_config_file(config_file, &n_io_concentrators) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, - "couldn't read from subfiling configuration file"); + for (int i = 0; i < sf_context->sf_num_fids; i++) { + int subfile_idx; + + /* Round-robin subfiles among the available IOCs */ + subfile_idx = (i * sf_context->topology->n_io_concentrators) + sf_context->topology->ioc_idx + 1; + + /* + * Generate the name of the subfile. The subfile naming should + * produce files of the following form: + * If we assume the HDF5 file is named ABC.h5, and 20 subfiles + * are used, then the subfiles will have names: + * ABC.h5.subfile__01_of_20, + * ABC.h5.subfile__02_of_20, etc. 
+ * + * and the configuration file will be named: + * ABC.h5.subfile_.config + */ + HDsnprintf(filepath, PATH_MAX, "%s/" H5FD_SUBFILING_FILENAME_TEMPLATE, subfile_dir, base, + sf_context->h5_file_id, num_digits, subfile_idx, num_subfiles); + + if ((sf_context->sf_fids[i] = HDopen(filepath, file_acc_flags, mode)) < 0) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "failed to open subfile"); } + if (file_acc_flags & O_CREAT) + sf_context->sf_eof = 0; + /* - * Generate the name of the subfile. The subfile naming should - * produce files of the following form: - * If we assume the HDF5 file is named ABC.h5, and 20 I/O - * concentrators are used, then the subfiles will have names: - * ABC.h5.subfile__01_of_20, - * ABC.h5.subfile__02_of_20, etc. - * - * and the configuration file will be named: - * ABC.h5.subfile_.config + * If subfiles were created (rather than simply opened), + * check if we also need to create a config file. */ - num_digits = (int)(HDlog10(n_io_concentrators) + 1); - HDsnprintf(filename_out, filename_out_len, "%s/%s" H5FD_SUBFILING_FILENAME_TEMPLATE, subfile_dir, base, - sf_context->h5_file_id, num_digits, sf_context->topology->subfile_rank + 1, - n_io_concentrators); - - *filename_basename_out = base; - *subfile_dir_out = subfile_dir; + if ((file_acc_flags & O_CREAT) && (sf_context->topology->ioc_idx == 0)) { + if (create_config_file(sf_context, base, subfile_dir, (file_acc_flags & O_TRUNC)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTCREATE, FAIL, + "couldn't create subfiling configuration file"); + } done: - if (config_file && (EOF == HDfclose(config_file))) - H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, - "couldn't close subfiling configuration file"); - if (ret_value < 0) { - H5MM_free(subfile_dir); - H5MM_free(base); - - if (*filename_basename_out) { - H5MM_free(*filename_basename_out); - *filename_basename_out = NULL; - } - if (*subfile_dir_out) { - H5MM_free(*subfile_dir_out); - *subfile_dir_out = NULL; + if 
(sf_context) { + for (int i = 0; i < sf_context->sf_num_fids; i++) { + if (sf_context->sf_fids[i] >= 0 && HDclose(sf_context->sf_fids[i]) < 0) + H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "failed to close subfile"); + sf_context->sf_fids[i] = -1; + } } } - HDfree(prefix); + H5MM_free(base); + H5MM_free(subfile_dir); + HDfree(filepath); H5_SUBFILING_FUNC_LEAVE; } @@ -1742,6 +2165,7 @@ done: * * - the stripe size for the file's subfiles * - the number of I/O concentrators used for I/O to the file's subfiles + * - the number of subfiles the logical HDF5 file consists of * - the base HDF5 filename * - the optional directory prefix where the file's subfiles are placed * - the names of each of the file's subfiles @@ -1777,7 +2201,7 @@ create_config_file(subfiling_context_t *sf_context, const char *base_filename, c H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate space for subfiling configuration filename"); - HDsnprintf(config_filename, PATH_MAX, "%s/%s" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir, + HDsnprintf(config_filename, PATH_MAX, "%s/" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir, base_filename, sf_context->h5_file_id); /* Determine whether a subfiling configuration file exists */ @@ -1796,9 +2220,8 @@ create_config_file(subfiling_context_t *sf_context, const char *base_filename, c * O_TRUNC flag was specified. In this case, truncate * the existing config file and create a new one. */ - /* TODO: if truncating, consider removing old stale config files. 
*/ if (!config_file_exists || truncate_if_exists) { - int n_io_concentrators = sf_context->topology->n_io_concentrators; + int n_subfiles = sf_context->sf_num_subfiles; int num_digits; if (NULL == (config_file = HDfopen(config_filename, "w+"))) @@ -1816,7 +2239,13 @@ create_config_file(subfiling_context_t *sf_context, const char *base_filename, c "failed to write to subfiling configuration file"); /* Write the number of I/O concentrators to the configuration file */ - HDsnprintf(line_buf, PATH_MAX, "aggregator_count=%d\n", n_io_concentrators); + HDsnprintf(line_buf, PATH_MAX, "aggregator_count=%d\n", sf_context->topology->n_io_concentrators); + if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, + "failed to write to subfiling configuration file"); + + /* Write the number of subfiles to the configuration file */ + HDsnprintf(line_buf, PATH_MAX, "subfile_count=%d\n", n_subfiles); if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1) H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "failed to write to subfiling configuration file"); @@ -1834,10 +2263,10 @@ create_config_file(subfiling_context_t *sf_context, const char *base_filename, c "failed to write to subfiling configuration file"); /* Write out each subfile name to the configuration file */ - num_digits = (int)(HDlog10(n_io_concentrators) + 1); - for (int k = 0; k < n_io_concentrators; k++) { - HDsnprintf(line_buf, PATH_MAX, "%s" H5FD_SUBFILING_FILENAME_TEMPLATE "\n", base_filename, - sf_context->h5_file_id, num_digits, k + 1, n_io_concentrators); + num_digits = (int)(HDlog10(n_subfiles) + 1); + for (int k = 0; k < n_subfiles; k++) { + HDsnprintf(line_buf, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE "\n", base_filename, + sf_context->h5_file_id, num_digits, k + 1, n_subfiles); if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1) H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, @@ -1873,8 +2302,8 
@@ done: *------------------------------------------------------------------------- */ static herr_t -open_config_file(subfiling_context_t *sf_context, const char *base_filename, const char *subfile_dir, - const char *mode, FILE **config_file_out) +open_config_file(const char *base_filename, const char *subfile_dir, uint64_t file_id, const char *mode, + FILE **config_file_out) { hbool_t config_file_exists = FALSE; FILE *config_file = NULL; @@ -1882,17 +2311,14 @@ open_config_file(subfiling_context_t *sf_context, const char *base_filename, con int ret = 0; herr_t ret_value = SUCCEED; - HDassert(sf_context); HDassert(base_filename); HDassert(subfile_dir); + HDassert(file_id != UINT64_MAX); HDassert(mode); HDassert(config_file_out); *config_file_out = NULL; - if (sf_context->h5_file_id == UINT64_MAX) - H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid HDF5 file ID %" PRIu64, - sf_context->h5_file_id); if (*base_filename == '\0') H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid base HDF5 filename '%s'", base_filename); @@ -1903,8 +2329,8 @@ open_config_file(subfiling_context_t *sf_context, const char *base_filename, con H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate space for subfiling configuration filename"); - HDsnprintf(config_filename, PATH_MAX, "%s/%s" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir, - base_filename, sf_context->h5_file_id); + HDsnprintf(config_filename, PATH_MAX, "%s/" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir, + base_filename, file_id); /* Determine whether a subfiling configuration file exists */ errno = 0; @@ -1938,26 +2364,26 @@ done: } /*------------------------------------------------------------------------- - * Function: H5_get_num_iocs_from_config_file + * Function: H5_get_subfiling_config_from_file * - * Purpose: Reads a Subfiling configuration file to get the number of - * I/O concentrators used for the logical HDF5 file. 
+ * Purpose: Reads a Subfiling configuration file to get the stripe size + * and number of subfiles used for the logical HDF5 file. * * Return: Non-negative on success/Negative on failure * *------------------------------------------------------------------------- */ herr_t -H5_get_num_iocs_from_config_file(FILE *config_file, int *n_io_concentrators) +H5_get_subfiling_config_from_file(FILE *config_file, int64_t *stripe_size, int64_t *num_subfiles) { - char *config_buf = NULL; - char *ioc_substr = NULL; - long config_file_len = 0; - int read_n_io_concs = 0; - herr_t ret_value = SUCCEED; + int64_t read_stripe_size = 0; + int64_t read_num_subfiles = 0; + char *config_buf = NULL; + char *substr = NULL; + long config_file_len = 0; + herr_t ret_value = SUCCEED; HDassert(config_file); - HDassert(n_io_concentrators); if (HDfseek(config_file, 0, SEEK_END) < 0) H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL, @@ -1981,22 +2407,40 @@ H5_get_num_iocs_from_config_file(FILE *config_file, int *n_io_concentrators) config_buf[config_file_len] = '\0'; - if (NULL == (ioc_substr = HDstrstr(config_buf, "aggregator_count"))) - H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, - "malformed subfiling configuration file - no aggregator count entry"); + if (stripe_size) { + if (NULL == (substr = HDstrstr(config_buf, "stripe_size"))) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, + "malformed subfiling configuration file - no stripe size entry"); + + if (EOF == HDsscanf(substr, "stripe_size=%" PRId64, &read_stripe_size)) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, + "couldn't get stripe size from subfiling configuration file"); + + if (read_stripe_size <= 0) + H5_SUBFILING_GOTO_ERROR( + H5E_FILE, H5E_BADVALUE, FAIL, + "invalid stripe size (%" PRId64 ") read from subfiling configuration file", read_stripe_size); + + *stripe_size = read_stripe_size; + } + + if (num_subfiles) { + if (NULL == (substr = HDstrstr(config_buf, "subfile_count"))) + 
H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, + "malformed subfiling configuration file - no subfile count entry"); - if (EOF == HDsscanf(ioc_substr, "aggregator_count=%d", &read_n_io_concs)) - H5_SUBFILING_SYS_GOTO_ERROR( - H5E_FILE, H5E_CANTGET, FAIL, - "couldn't get number of I/O concentrators from subfiling configuration file"); + if (EOF == HDsscanf(substr, "subfile_count=%" PRId64, &read_num_subfiles)) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, + "couldn't get number of subfiles from subfiling configuration file"); - if (read_n_io_concs <= 0) - H5_SUBFILING_GOTO_ERROR( - H5E_FILE, H5E_BADVALUE, FAIL, - "invalid number of I/O concentrators (%d) read from subfiling configuration file", - read_n_io_concs); + if (read_num_subfiles <= 0) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, + "invalid number of subfiles (%" PRId64 + ") read from subfiling configuration file", + read_num_subfiles); - *n_io_concentrators = read_n_io_concs; + *num_subfiles = read_num_subfiles; + } done: HDfree(config_buf); @@ -2005,6 +2449,135 @@ done: } /*------------------------------------------------------------------------- + * Function: H5_resolve_pathname + * + * Purpose: Simple wrapper routine around realpath(3) to fully resolve + * a given filepath. Collective across the specified MPI + * communicator in order to minimize file system contention + * between MPI ranks. + * + * The resolved filepath returned through `resolved_filepath` + * must be freed by the caller with HDfree. 
+ * + * Return Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_resolve_pathname(const char *filepath, MPI_Comm comm, char **resolved_filepath) +{ + hsize_t path_len = HSIZE_UNDEF; + hbool_t bcasted_path_len = FALSE; + hbool_t bcasted_path = FALSE; + char *resolved_path = NULL; + char *file_basename = NULL; + char *file_dirname = NULL; + char *cwd = NULL; + int mpi_rank; + int mpi_size; + int mpi_code; + herr_t ret_value = SUCCEED; + + HDassert(filepath); + HDassert(resolved_filepath); + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + + if (mpi_rank == 0) { + if (NULL == (resolved_path = HDrealpath(filepath, NULL))) { + if (ENOENT == errno) { + if (H5_dirname(filepath, &file_dirname) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get file dirname"); + + /* If filepath is just the filename, set up path using CWD */ + if (!HDstrcmp(file_dirname, ".")) { + if (NULL == (resolved_path = HDmalloc(PATH_MAX))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "can't allocate buffer for filepath"); + if (H5_basename(filepath, &file_basename) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get file basename"); + if (NULL == (cwd = HDmalloc(PATH_MAX))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "can't allocate buffer for CWD"); + + if (NULL == HDgetcwd(cwd, PATH_MAX)) + H5_SUBFILING_GOTO_ERROR( + H5E_VFL, H5E_CANTGET, FAIL, + "can't get current working directory, errno = %d, error message = '%s'", errno, + HDstrerror(errno)); + + HDsnprintf(resolved_path, PATH_MAX, "%s/%s", cwd, file_basename); + } + else { + /* Otherwise, just use what was given as the pathname */ + if (NULL == 
(resolved_path = HDstrdup(filepath))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't copy filename"); + } + } + else + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "can't resolve subfile path, errno = %d, error message = '%s'", errno, + HDstrerror(errno)); + } + + if (resolved_path) { + H5_CHECKED_ASSIGN(path_len, hsize_t, (HDstrlen(resolved_path) + 1), size_t); + } + else + path_len = HSIZE_UNDEF; + } + + /* Broadcast the size of the resolved filepath string to other ranks */ + bcasted_path_len = TRUE; + if (mpi_size > 1) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&path_len, 1, HSIZE_AS_MPI_TYPE, 0, comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + + if (path_len == HSIZE_UNDEF) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't resolve filepath"); + + if (mpi_rank != 0) { + if (NULL == (resolved_path = HDmalloc(path_len))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate file name buffer"); + } + + /* Broadcast the resolved filepath to other ranks */ + bcasted_path = TRUE; + if (mpi_size > 1) { + H5_CHECK_OVERFLOW(path_len, hsize_t, int); + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(resolved_path, (int)path_len, MPI_CHAR, 0, comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + + *resolved_filepath = resolved_path; + +done: + HDfree(cwd); + H5MM_free(file_basename); + H5MM_free(file_dirname); + + if (ret_value < 0) { + if (!bcasted_path_len) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&path_len, 1, HSIZE_AS_MPI_TYPE, 0, comm))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + if (!bcasted_path && (path_len != HSIZE_UNDEF)) { + H5_CHECK_OVERFLOW(path_len, hsize_t, int); + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(resolved_path, (int)path_len, MPI_CHAR, 0, comm))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + + HDfree(resolved_path); + } + + H5_SUBFILING_FUNC_LEAVE; +} + 
+/*------------------------------------------------------------------------- * Function: H5_close_subfiles * * Purpose: This is a simple wrapper function for the internal version @@ -2046,35 +2619,39 @@ done: *------------------------------------------------------------------------- */ herr_t -H5_close_subfiles(int64_t subfiling_context_id) +H5_close_subfiles(int64_t subfiling_context_id, MPI_Comm file_comm) { subfiling_context_t *sf_context = NULL; MPI_Request barrier_req = MPI_REQUEST_NULL; + int mpi_size; int mpi_code; herr_t ret_value = SUCCEED; if (NULL == (sf_context = H5_get_subfiling_object(subfiling_context_id))) H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "couldn't get subfiling object from context ID"); - /* We make the subfile close operation collective. - * Otherwise, there may be a race condition between - * our closing the subfiles and the user application - * moving ahead and possibly re-opening a file. - * - * If we can, we utilize an async barrier which gives - * us the opportunity to reduce the CPU load due to - * MPI spinning while waiting for the barrier to - * complete. This is especially important if there - * is heavy thread utilization due to subfiling - * activities, i.e. the thread pool might be - * extremely busy servicing I/O requests from all - * HDF5 application ranks. - */ -#if MPI_VERSION > 3 || (MPI_VERSION == 3 && MPI_SUBVERSION >= 1) - { + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_comm, &mpi_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + + /* We make the subfile close operation collective. + * Otherwise, there may be a race condition between + * our closing the subfiles and the user application + * moving ahead and possibly re-opening a file. + * + * If we can, we utilize an async barrier which gives + * us the opportunity to reduce the CPU load due to + * MPI spinning while waiting for the barrier to + * complete. 
This is especially important if there + * is heavy thread utilization due to subfiling + * activities, i.e. the thread pool might be + * extremely busy servicing I/O requests from all + * HDF5 application ranks. + */ + if (mpi_size > 1) { +#if H5_CHECK_MPI_VERSION(3, 1) int barrier_complete = 0; - if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(sf_context->sf_barrier_comm, &barrier_req))) + if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(file_comm, &barrier_req))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Ibarrier failed", mpi_code); while (!barrier_complete) { @@ -2084,24 +2661,25 @@ H5_close_subfiles(int64_t subfiling_context_id) if (MPI_SUCCESS != (mpi_code = MPI_Test(&barrier_req, &barrier_complete, MPI_STATUS_IGNORE))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Test failed", mpi_code); } - } #else - if (MPI_SUCCESS != (mpi_code = MPI_Barrier(sf_context->sf_barrier_comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); #endif + } /* The map from file handle to subfiling context can now be cleared */ - if (sf_context->h5_file_handle != NULL) { - clear_fid_map_entry(sf_context->h5_file_handle, sf_context->sf_context_id); + if (sf_context->h5_file_id != UINT64_MAX) { + clear_fid_map_entry(sf_context->h5_file_id, sf_context->sf_context_id); } if (sf_context->topology->rank_is_ioc) { - if (sf_context->sf_fid >= 0) { - errno = 0; - if (HDclose(sf_context->sf_fid) < 0) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "couldn't close subfile"); - - sf_context->sf_fid = -1; + if (sf_context->sf_fids) { + for (int i = 0; i < sf_context->sf_num_fids; i++) { + errno = 0; + if (sf_context->sf_fids[i] >= 0 && HDclose(sf_context->sf_fids[i]) < 0) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "couldn't close subfile"); + sf_context->sf_fids[i] = -1; + } } } @@ -2110,11 +2688,11 @@ H5_close_subfiles(int64_t 
subfiling_context_id) * and opening another file before this file is completely closed * down. */ -#if MPI_VERSION > 3 || (MPI_VERSION == 3 && MPI_SUBVERSION >= 1) - { + if (mpi_size > 1) { +#if H5_CHECK_MPI_VERSION(3, 1) int barrier_complete = 0; - if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(sf_context->sf_barrier_comm, &barrier_req))) + if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(file_comm, &barrier_req))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Ibarrier failed", mpi_code); while (!barrier_complete) { @@ -2124,24 +2702,213 @@ H5_close_subfiles(int64_t subfiling_context_id) if (MPI_SUCCESS != (mpi_code = MPI_Test(&barrier_req, &barrier_complete, MPI_STATUS_IGNORE))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Test failed", mpi_code); } - } #else - if (MPI_SUCCESS != (mpi_code = MPI_Barrier(sf_context->sf_barrier_comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); +#endif + } + +#ifdef H5_SUBFILING_DEBUG + if (sf_context->sf_logfile) { + struct tm *tm = NULL; + time_t cur_time; + + cur_time = time(NULL); + tm = localtime(&cur_time); + + H5_subfiling_log(sf_context->sf_context_id, "\n-- LOGGING FINISH - %s", asctime(tm)); + + HDfclose(sf_context->sf_logfile); + sf_context->sf_logfile = NULL; + } #endif done: - if (sf_context && H5_free_subfiling_object_int(sf_context) < 0) - H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTFREE, FAIL, "couldn't free subfiling context object"); + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: H5_subfiling_set_config_prop + * + * Purpose: Sets the specified Subfiling VFD configuration as a + * property on the given FAPL pointer. The Subfiling VFD uses + * this property to pass its configuration down to the IOC VFD + * without needing each IOC VFD to include it as part of its + * public configuration. 
+ * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_subfiling_set_config_prop(H5P_genplist_t *plist_ptr, const H5FD_subfiling_params_t *vfd_config) +{ + htri_t prop_exists = FAIL; + herr_t ret_value = SUCCEED; + + if (!plist_ptr) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer"); + if (!vfd_config) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling configuration pointer"); + + if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_CONFIG_PROP)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, + "can't check if subfiling configuration property exists in FAPL"); + + if (prop_exists) { + if (H5P_set(plist_ptr, H5FD_SUBFILING_CONFIG_PROP, vfd_config) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, + "can't set subfiling configuration property on FAPL"); + } + else { + union { + const void *const_ptr_to_data; + void *ptr_to_data; + } eliminate_const_warning; + + /* + * Cast away const since H5P_insert doesn't match the signature + * for "value" as H5P_set + */ + eliminate_const_warning.const_ptr_to_data = vfd_config; + + if (H5P_insert(plist_ptr, H5FD_SUBFILING_CONFIG_PROP, sizeof(H5FD_subfiling_params_t), + eliminate_const_warning.ptr_to_data, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL, + "unable to register subfiling configuration property in FAPL"); + } + +done: + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: H5_subfiling_get_config_prop + * + * Purpose: Retrieves the Subfiling VFD configuration from the given + * FAPL pointer. The Subfiling VFD uses this property to pass + * its configuration down to the IOC VFD without needing each + * IOC VFD to include it as part of its public configuration. 
+ * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_subfiling_get_config_prop(H5P_genplist_t *plist_ptr, H5FD_subfiling_params_t *vfd_config) +{ + htri_t prop_exists = FAIL; + herr_t ret_value = SUCCEED; + + if (!plist_ptr) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer"); + if (!vfd_config) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling configuration pointer"); + + if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_CONFIG_PROP)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, + "can't check if subfiling configuration property exists in FAPL"); + + if (prop_exists) { + if (H5P_get(plist_ptr, H5FD_SUBFILING_CONFIG_PROP, vfd_config) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, + "can't get subfiling configuration property from FAPL"); + } + else { + vfd_config->ioc_selection = SELECT_IOC_ONE_PER_NODE; + vfd_config->stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; + vfd_config->stripe_count = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT; + } + +done: + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: H5_subfiling_set_file_id_prop + * + * Purpose: Sets the specified file ID (Inode) value as a property on + * the given FAPL pointer. The Subfiling VFD uses this + * property to pass the HDF5 stub file ID value down to the + * IOC VFD. 
+ * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_subfiling_set_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t file_id) +{ + htri_t prop_exists = FAIL; + herr_t ret_value = SUCCEED; + + if (!plist_ptr) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer"); + if (file_id == UINT64_MAX) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid file ID value"); + + if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, + "can't check if file ID property exists in FAPL"); + + if (prop_exists) { + if (H5P_set(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID, &file_id) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set file ID property on FAPL"); + } + else { + if (H5P_insert(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID, sizeof(uint64_t), &file_id, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL, + "unable to register file ID property in FAPL"); + } + +done: + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: H5_subfiling_get_file_id_prop + * + * Purpose: Retrieves the file ID (Inode) value from the given FAPL + * pointer. The Subfiling VFD uses this property to pass the + * HDF5 stub file ID value down to the IOC VFD. 
+ * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_subfiling_get_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t *file_id) +{ + htri_t prop_exists = FAIL; + herr_t ret_value = SUCCEED; + + if (!plist_ptr) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer"); + if (!file_id) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL file ID pointer"); + if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, + "can't check if file ID property exists in FAPL"); + + if (prop_exists) { + if (H5P_get(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID, file_id) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get file ID property from FAPL"); + } + else + *file_id = UINT64_MAX; + +done: H5_SUBFILING_FUNC_LEAVE; } /*------------------------------------------------------------------------- - * Function: H5_subfile_fhandle_to_context + * Function: H5_subfile_fid_to_context * * Purpose: This is a basic lookup function which returns the subfiling - * context id associated with the specified file handle. + * context id associated with the specified file ID. 
* * Return: Non-negative subfiling context ID if the context exists * Negative on failure or if the subfiling context doesn't @@ -2155,7 +2922,7 @@ done: *------------------------------------------------------------------------- */ int64_t -H5_subfile_fhandle_to_context(void *file_handle) +H5_subfile_fid_to_context(uint64_t file_id) { int64_t ret_value = -1; @@ -2163,14 +2930,107 @@ H5_subfile_fhandle_to_context(void *file_handle) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, -1, "open file map is NULL"); for (int i = 0; i < sf_file_map_size; i++) { - if (sf_open_file_map[i].file_handle == file_handle) { + if (sf_open_file_map[i].file_id == file_id) { return sf_open_file_map[i].sf_context_id; } } done: H5_SUBFILING_FUNC_LEAVE; -} /* end H5_subfile_fhandle_to_context() */ +} /* end H5_subfile_fid_to_context() */ + +/*------------------------------------------------------------------------- + * Function: H5_subfiling_validate_config + * + * Purpose: Checks that the given subfiling configuration parameters + * are valid + * + * Return: Non-negative on success/Negative on failure + *------------------------------------------------------------------------- + */ +herr_t +H5_subfiling_validate_config(const H5FD_subfiling_params_t *subf_config) +{ + H5FD_subfiling_ioc_select_t ioc_sel_type; + herr_t ret_value = SUCCEED; + + if (!subf_config) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL subfiling configuration pointer"); + + /* + * Compare against each IOC selection value directly since + * the enum might be a signed or unsigned type and a comparison + * against < 0 could generate a warning + */ + ioc_sel_type = subf_config->ioc_selection; + if (ioc_sel_type != SELECT_IOC_ONE_PER_NODE && ioc_sel_type != SELECT_IOC_EVERY_NTH_RANK && + ioc_sel_type != SELECT_IOC_WITH_CONFIG && ioc_sel_type != SELECT_IOC_TOTAL) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid IOC selection method"); + + if (subf_config->stripe_size <= 0) + 
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid stripe size"); + + if (subf_config->stripe_count <= 0 && subf_config->stripe_count != H5FD_SUBFILING_DEFAULT_STRIPE_COUNT) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid stripe count"); + +done: + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: H5_subfiling_terminate + * + * Purpose: A cleanup routine to be called by the Subfiling VFD when + * it is terminating. Cleans up internal resources such as the + * context and topology caches. + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_subfiling_terminate(void) +{ + herr_t ret_value = SUCCEED; + + /* Clean up subfiling context and topology caches */ + if (sf_context_cache) { + for (size_t i = 0; i < sf_context_cache_num_entries; i++) { + if (H5_free_subfiling_object_int(sf_context_cache[i]) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, + "couldn't free subfiling context object"); + sf_context_cache[i] = NULL; + } + + sf_context_cache_size = 0; + sf_context_cache_num_entries = 0; + + HDfree(sf_context_cache); + sf_context_cache = NULL; + } + if (sf_topology_cache) { + for (size_t i = 0; i < sf_topology_cache_num_entries; i++) { + if (H5_free_subfiling_topology(sf_topology_cache[i]) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, + "couldn't free subfiling topology object"); + sf_topology_cache[i] = NULL; + } + + sf_topology_cache_size = 0; + sf_topology_cache_num_entries = 0; + + HDfree(sf_topology_cache); + sf_topology_cache = NULL; + } + + /* Clean up the file ID to context object mapping */ + sf_file_map_size = 0; + HDfree(sf_open_file_map); + sf_open_file_map = NULL; + +done: + H5_SUBFILING_FUNC_LEAVE; +} #ifdef H5_SUBFILING_DEBUG void diff --git a/src/H5FDsubfiling/H5subfiling_common.h b/src/H5FDsubfiling/H5subfiling_common.h 
index 6e2965f..ba6dfdc 100644 --- a/src/H5FDsubfiling/H5subfiling_common.h +++ b/src/H5FDsubfiling/H5subfiling_common.h @@ -20,17 +20,49 @@ #include #include "H5private.h" +#include "H5FDprivate.h" #include "H5Iprivate.h" +#include "H5Pprivate.h" #include "H5FDsubfiling.h" #include "H5FDioc.h" +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + /* * Some definitions for debugging the Subfiling feature */ /* #define H5_SUBFILING_DEBUG */ /* + * Some definitions for controlling performance across + * different machines where some types of MPI operations + * may be better optimized than others + */ +/* #define H5_SUBFILING_PREFER_ALLGATHER_TOPOLOGY */ +#ifndef H5_SUBFILING_PREFER_ALLGATHER_TOPOLOGY +#if !H5_CHECK_MPI_VERSION(3, 0) +#error "MPI 3 required for MPI_Comm_split_type" +#endif +#endif + +/* + * Name of the HDF5 FAPL property that the Subfiling VFD + * uses to pass its configuration down to the underlying + * IOC VFD + */ +#define H5FD_SUBFILING_CONFIG_PROP "H5FD_SUBFILING_CONFIG_PROP" + +/* + * Name of the HDF5 FAPL property that the Subfiling VFD + * uses to pass the HDF5 stub file's Inode value to the + * underlying IOC VFD + */ +#define H5FD_SUBFILING_STUB_FILE_ID "H5FD_SUBFILING_STUB_FILE_ID" + +/* * MPI Tags are 32 bits, we treat them as unsigned * to allow the use of the available bits for RPC * selections, i.e. a message from the VFD read or write functions @@ -80,8 +112,10 @@ /* MPI tag values for data communicator */ #define WRITE_INDEP_ACK 0 -#define READ_INDEP_DATA 1 -#define WRITE_TAG_BASE 2 +#define READ_INDEP_ACK 1 +#define READ_INDEP_DATA 2 +#define WRITE_DATA_DONE 3 +#define IO_TAG_BASE 4 /* * Object type definitions for subfiling objects. @@ -112,70 +146,70 @@ typedef enum io_ops { LOGGING_OP = 16 } io_op_t; -/* Every application rank will record their MPI rank - * and hostid as a structure. These eventually get - * communicated to MPI rank zero(0) and sorted before - * being broadcast. 
The resulting sorted vector - * provides a basis for determining which MPI ranks - * will host an IO Concentrator (IOC), e.g. For - * default behavior, we choose the first vector entry - * associated with a "new" hostid. +/* + * Every MPI rank in a file's communicator will + * record their MPI rank for the file communicator + * and their node-local MPI rank for the node's + * communicator. Then the resulting information + * will be broadcast to all MPI ranks and will + * provide a basis for determining which MPI ranks + * will host an I/O concentrator. */ typedef struct { - long rank; - long hostid; + int rank; + int node_local_rank; + int node_local_size; + int node_lead_rank; } layout_t; -/* This typedef defines a fixed process layout which +/* + * This typedef defines a fixed process layout which * can be reused for any number of file open operations */ typedef struct app_layout_t { - long hostid; /* value returned by gethostid() */ - layout_t *layout; /* Vector of {rank,hostid} values */ - int *node_ranks; /* ranks extracted from sorted layout */ - int node_count; /* Total nodes (different hostids) */ - int node_index; /* My node: index into node_ranks */ - int local_peers; /* How may local peers on my node */ - int world_rank; /* My MPI rank */ - int world_size; /* Total number of MPI ranks */ + layout_t *layout; /* Array of (rank, node local rank, node local size) values */ + int *node_ranks; /* Array of lowest MPI rank values on each node */ + int node_count; /* Total number of nodes */ + int world_rank; /* MPI rank in file communicator */ + int world_size; /* Size of file communicator */ + int node_local_rank; /* MPI rank on node */ + int node_local_size; /* Size of node intra-communicator */ } app_layout_t; /* This typedef defines things related to IOC selections */ typedef struct topology { - app_layout_t *app_layout; /* Pointer to our layout struct */ - bool rank_is_ioc; /* Indicates that we host an IOC */ - int subfile_rank; /* Valid only if rank_is_ioc 
*/ - int n_io_concentrators; /* Number of IO concentrators */ - int *io_concentrators; /* Vector of ranks which are IOCs */ - int *subfile_fd; /* file descriptor (if IOC) */ - H5FD_subfiling_ioc_select_t selection_type; /* Cache our IOC selection criteria */ + app_layout_t *app_layout; /* Pointer to our layout struct */ + MPI_Comm app_comm; /* MPI communicator for this topology */ + bool rank_is_ioc; /* Indicates that we host an IOC */ + int ioc_idx; /* Valid only if rank_is_ioc */ + int n_io_concentrators; /* Number of I/O concentrators */ + int *io_concentrators; /* Vector of ranks which are IOCs */ + H5FD_subfiling_ioc_select_t selection_type; /* Cache our IOC selection criteria */ } sf_topology_t; typedef struct { int64_t sf_context_id; /* Generated context ID which embeds the cache index */ - uint64_t h5_file_id; /* GUID (basically the inode value) */ - void *h5_file_handle; /* Low-level handle for the HDF5 stub file */ - int sf_fid; /* value returned by open(file,..) */ - size_t sf_write_count; /* Statistics: write_count */ - size_t sf_read_count; /* Statistics: read_count */ - haddr_t sf_eof; /* File eof */ - int64_t sf_stripe_size; /* Stripe-depth */ - int64_t sf_blocksize_per_stripe; /* Stripe-depth X n_IOCs */ - int64_t sf_base_addr; /* For an IOC, our base address */ - MPI_Comm sf_msg_comm; /* MPI comm used to send RPC msg */ - MPI_Comm sf_data_comm; /* MPI comm used to move data */ - MPI_Comm sf_eof_comm; /* MPI comm used to communicate EOF */ - MPI_Comm sf_barrier_comm; /* MPI comm used for barrier operations */ - MPI_Comm sf_group_comm; /* Not used: for IOC collectives */ - MPI_Comm sf_intercomm; /* Not used: for msgs to all IOC */ - int sf_group_size; /* IOC count (in sf_group_comm) */ - int sf_group_rank; /* IOC rank (in sf_group_comm) */ - int sf_intercomm_root; /* Not used: for IOC comms */ - char *subfile_prefix; /* If subfiles are node-local */ - char *sf_filename; /* A generated subfile name */ - char *h5_filename; /* The user supplied file 
name */ - void *ioc_data; /* Private data for underlying IOC */ - sf_topology_t *topology; /* pointer to our topology */ + uint64_t h5_file_id; /* GUID (basically the inode value) */ + int *sf_fids; /* Array of file IDs for subfiles this rank owns */ + int sf_num_fids; /* Number of subfiles this rank owns */ + int sf_num_subfiles; /* Total number of subfiles for logical HDF5 file */ + size_t sf_write_count; /* Statistics: write_count */ + size_t sf_read_count; /* Statistics: read_count */ + haddr_t sf_eof; /* File eof */ + int64_t sf_stripe_size; /* Stripe-depth */ + int64_t sf_blocksize_per_stripe; /* Stripe-depth X n_IOCs */ + int64_t sf_base_addr; /* For an IOC, our base address */ + MPI_Comm sf_msg_comm; /* MPI comm used to send RPC msg */ + MPI_Comm sf_data_comm; /* MPI comm used to move data */ + MPI_Comm sf_eof_comm; /* MPI comm used to communicate EOF */ + MPI_Comm sf_node_comm; /* MPI comm used for intra-node comms */ + MPI_Comm sf_group_comm; /* Not used: for IOC collectives */ + int sf_group_size; /* IOC count (in sf_group_comm) */ + int sf_group_rank; /* IOC rank (in sf_group_comm) */ + char *subfile_prefix; /* If subfiles are node-local */ + char *h5_filename; /* The user supplied file name */ + void *ioc_data; /* Private data for underlying IOC */ + sf_topology_t *topology; /* Pointer to our topology */ #ifdef H5_SUBFILING_DEBUG char sf_logfile_name[PATH_MAX]; @@ -189,30 +223,45 @@ typedef struct { * an easy gathering of statistics by the IO Concentrator. 
*/ typedef struct { - /* {Datasize, Offset, FileID} */ - int64_t header[3]; /* The basic RPC input plus */ - int tag; /* the supplied OPCODE tag */ - int source; /* Rank of who sent the message */ - int subfile_rank; /* The IOC rank */ - int64_t context_id; /* context to be used to complete */ - double start_time; /* the request, + time of receipt */ - /* from which we calc Time(queued) */ + int64_t header[3]; /* The basic RPC input */ + int tag; /* the supplied OPCODE tag */ + int source; /* Rank of who sent the message */ + int ioc_idx; /* The IOC rank */ + int64_t context_id; /* context to be used to complete */ + double start_time; /* the request, + time of receipt */ + /* from which we calc Time(queued) */ } sf_work_request_t; +/* MPI Datatype used to send/receive an RPC message */ +extern MPI_Datatype H5_subfiling_rpc_msg_type; + #ifdef __cplusplus extern "C" { #endif -H5_DLL herr_t H5_open_subfiles(const char *base_filename, void *h5_file_handle, - H5FD_subfiling_shared_config_t *subfiling_config, int file_acc_flags, +H5_DLL herr_t H5_open_subfiling_stub_file(const char *name, unsigned flags, MPI_Comm file_comm, + H5FD_t **file_ptr, uint64_t *file_id); +H5_DLL herr_t H5_open_subfiles(const char *base_filename, uint64_t file_id, + H5FD_subfiling_params_t *subfiling_config, int file_acc_flags, MPI_Comm file_comm, int64_t *context_id_out); -H5_DLL herr_t H5_close_subfiles(int64_t subfiling_context_id); +H5_DLL herr_t H5_close_subfiles(int64_t subfiling_context_id, MPI_Comm file_comm); -H5_DLL int64_t H5_new_subfiling_object_id(sf_obj_type_t obj_type, int64_t index_val); +H5_DLL int64_t H5_new_subfiling_object_id(sf_obj_type_t obj_type); H5_DLL void *H5_get_subfiling_object(int64_t object_id); -H5_DLL int64_t H5_subfile_fhandle_to_context(void *file_handle); -H5_DLL herr_t H5_free_subfiling_object(int64_t object_id); -H5_DLL herr_t H5_get_num_iocs_from_config_file(FILE *config_file, int *n_io_concentrators); +H5_DLL herr_t H5_get_subfiling_config_from_file(FILE 
*config_file, int64_t *stripe_size, + int64_t *num_subfiles); +H5_DLL herr_t H5_resolve_pathname(const char *filepath, MPI_Comm comm, char **resolved_filepath); + +H5_DLL herr_t H5_subfiling_set_config_prop(H5P_genplist_t *plist_ptr, + const H5FD_subfiling_params_t *vfd_config); +H5_DLL herr_t H5_subfiling_get_config_prop(H5P_genplist_t *plist_ptr, H5FD_subfiling_params_t *vfd_config); +H5_DLL herr_t H5_subfiling_set_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t file_id); +H5_DLL herr_t H5_subfiling_get_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t *file_id); +H5_DLL int64_t H5_subfile_fid_to_context(uint64_t file_id); + +H5_DLL herr_t H5_subfiling_validate_config(const H5FD_subfiling_params_t *subf_config); + +H5_DLL herr_t H5_subfiling_terminate(void); H5_DLL void H5_subfiling_log(int64_t sf_context_id, const char *fmt, ...); diff --git a/testpar/CMakeTests.cmake b/testpar/CMakeTests.cmake index 1a3f409..26968de 100644 --- a/testpar/CMakeTests.cmake +++ b/testpar/CMakeTests.cmake @@ -98,6 +98,14 @@ set (test_par_CLEANFILES t_filters_parallel.h5 MPItest.h5 ShapeSameTest.h5 + test_subfiling_basic_create.h5 + test_subfiling_config_file.h5 + test_subfiling_stripe_sizes.h5 + test_subfiling_read_different_stripe_sizes.h5 + test_subfiling_precreate_rank_0.h5 + test_subfiling_write_many_read_one.h5 + test_subfiling_write_many_read_few.h5 + test_subfiling_h5fuse.h5 ) # Remove any output file left over from previous test run diff --git a/testpar/t_subfiling_vfd.c b/testpar/t_subfiling_vfd.c index c289fac..a214502 100644 --- a/testpar/t_subfiling_vfd.c +++ b/testpar/t_subfiling_vfd.c @@ -12,12 +12,18 @@ /* * HDF5 Subfiling VFD tests + * + * NOTE: these tests currently assume that the default I/O concentrator + * selection strategy for the Subfiling VFD is to use 1 I/O + * concentrator per node. If that changes in the future, some of + * these tests will need updating. 
*/ #include #include "testpar.h" #include "H5srcdir.h" +#include "H5MMprivate.h" #ifdef H5_HAVE_SUBFILING_VFD @@ -26,37 +32,85 @@ #define SUBFILING_TEST_DIR H5FD_SUBFILING_NAME +/* The smallest Subfiling stripe size used for testing */ +#define SUBFILING_MIN_STRIPE_SIZE 128 + #ifndef PATH_MAX #define PATH_MAX 4096 #endif #define ARRAY_SIZE(a) sizeof(a) / sizeof(a[0]) -static MPI_Comm comm = MPI_COMM_WORLD; -static MPI_Info info = MPI_INFO_NULL; +#define CHECK_PASSED() \ + do { \ + int err_result = (nerrors > curr_nerrors); \ + \ + mpi_code_g = MPI_Allreduce(MPI_IN_PLACE, &err_result, 1, MPI_INT, MPI_MAX, comm_g); \ + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Allreduce succeeded"); \ + \ + if (MAINPROCESS) { \ + if (err_result == 0) \ + PASSED(); \ + else \ + H5_FAILED(); \ + } \ + } while (0) + +static MPI_Comm comm_g = MPI_COMM_WORLD; +static MPI_Info info_g = MPI_INFO_NULL; static int mpi_rank; static int mpi_size; +static int mpi_code_g; +static int num_nodes_g; +static int num_iocs_g; + +static MPI_Comm node_local_comm = MPI_COMM_WORLD; +static int node_local_rank; +static int node_local_size; -int nerrors = 0; +static MPI_Comm ioc_comm = MPI_COMM_WORLD; +static int ioc_comm_rank; +static int ioc_comm_size; + +static long long stripe_size_g = -1; +static long ioc_per_node_g = -1; +static int ioc_thread_pool_size_g = -1; + +int nerrors = 0; +int curr_nerrors = 0; /* Function pointer typedef for test functions */ typedef void (*test_func)(void); /* Utility functions */ -static hid_t create_subfiling_ioc_fapl(void); +static hid_t create_subfiling_ioc_fapl(MPI_Comm comm, MPI_Info info, hbool_t custom_config, + H5FD_subfiling_params_t *custom_cfg, int32_t thread_pool_size); /* Test functions */ static void test_create_and_close(void); +static void test_config_file(void); +static void test_stripe_sizes(void); +static void test_read_different_stripe_size(void); +static void test_subfiling_precreate_rank_0(void); +static void test_subfiling_write_many_read_one(void); 
+static void test_subfiling_write_many_read_few(void); +static void test_subfiling_h5fuse(void); static test_func tests[] = { test_create_and_close, + test_config_file, + test_stripe_sizes, + test_read_different_stripe_size, + test_subfiling_precreate_rank_0, + test_subfiling_write_many_read_one, + test_subfiling_write_many_read_few, + test_subfiling_h5fuse, }; /* --------------------------------------------------------------------------- * Function: create_subfiling_ioc_fapl * - * Purpose: Create and populate a subfiling FAPL ID that uses either the - * IOC VFD or sec2 VFD. + * Purpose: Create and populate a subfiling FAPL ID. * * Return: Success: HID of the top-level (subfiling) FAPL, a non-negative * value. @@ -64,18 +118,14 @@ static test_func tests[] = { * --------------------------------------------------------------------------- */ static hid_t -create_subfiling_ioc_fapl(void) +create_subfiling_ioc_fapl(MPI_Comm comm, MPI_Info info, hbool_t custom_config, + H5FD_subfiling_params_t *custom_cfg, int32_t thread_pool_size) { - H5FD_subfiling_config_t *subfiling_conf = NULL; - H5FD_ioc_config_t *ioc_conf = NULL; - hid_t ioc_fapl = H5I_INVALID_HID; - hid_t ret_value = H5I_INVALID_HID; - - if (NULL == (subfiling_conf = HDcalloc(1, sizeof(*subfiling_conf)))) - TEST_ERROR; + H5FD_subfiling_config_t subfiling_conf; + H5FD_ioc_config_t ioc_conf; + hid_t ret_value = H5I_INVALID_HID; - if ((ioc_fapl = H5Pcreate(H5P_FILE_ACCESS)) < 0) - TEST_ERROR; + HDassert(!custom_config || custom_cfg); if ((ret_value = H5Pcreate(H5P_FILE_ACCESS)) < 0) TEST_ERROR; @@ -83,49 +133,41 @@ create_subfiling_ioc_fapl(void) if (H5Pset_mpi_params(ret_value, comm, info) < 0) TEST_ERROR; - /* Get defaults for Subfiling configuration */ - if (H5Pget_fapl_subfiling(ret_value, subfiling_conf) < 0) - TEST_ERROR; - - if (subfiling_conf->require_ioc) { - if (NULL == (ioc_conf = HDcalloc(1, sizeof(*ioc_conf)))) - TEST_ERROR; - - /* Get IOC VFD defaults */ - if (H5Pget_fapl_ioc(ioc_fapl, ioc_conf) < 0) - 
TEST_ERROR; - - if (H5Pset_mpi_params(ioc_fapl, comm, info) < 0) - TEST_ERROR; - - if (H5Pset_fapl_ioc(ioc_fapl, ioc_conf) < 0) + if (!custom_config) { + if (H5Pset_fapl_subfiling(ret_value, NULL) < 0) TEST_ERROR; } else { - if (H5Pset_fapl_sec2(ioc_fapl) < 0) + /* Get defaults for Subfiling configuration */ + if (H5Pget_fapl_subfiling(ret_value, &subfiling_conf) < 0) TEST_ERROR; - } - if (H5Pclose(subfiling_conf->ioc_fapl_id) < 0) - TEST_ERROR; - subfiling_conf->ioc_fapl_id = ioc_fapl; + /* Set custom configuration */ + subfiling_conf.shared_cfg = *custom_cfg; - if (H5Pset_fapl_subfiling(ret_value, subfiling_conf) < 0) - TEST_ERROR; + if (subfiling_conf.require_ioc) { + /* Get IOC VFD defaults */ + if (H5Pget_fapl_ioc(ret_value, &ioc_conf) < 0) + TEST_ERROR; + + /* Set custom configuration */ + ioc_conf.thread_pool_size = thread_pool_size; + + if (H5Pset_fapl_ioc(subfiling_conf.ioc_fapl_id, &ioc_conf) < 0) + TEST_ERROR; + } + else { + if (H5Pset_fapl_sec2(subfiling_conf.ioc_fapl_id) < 0) + TEST_ERROR; + } - HDfree(ioc_conf); - HDfree(subfiling_conf); + if (H5Pset_fapl_subfiling(ret_value, &subfiling_conf) < 0) + TEST_ERROR; + } return ret_value; error: - HDfree(ioc_conf); - HDfree(subfiling_conf); - - if ((H5I_INVALID_HID != ioc_fapl) && (H5Pclose(ioc_fapl) < 0)) { - H5_FAILED(); - AT(); - } if ((H5I_INVALID_HID != ret_value) && (H5Pclose(ret_value) < 0)) { H5_FAILED(); AT(); @@ -138,103 +180,1765 @@ error: * A simple test that creates and closes a file with the * subfiling VFD */ +#define SUBF_FILENAME "test_subfiling_basic_create.h5" static void test_create_and_close(void) { - const char *test_filenames[2]; - hid_t file_id = H5I_INVALID_HID; - hid_t fapl_id = H5I_INVALID_HID; + hid_t file_id = H5I_INVALID_HID; + hid_t fapl_id = H5I_INVALID_HID; + + curr_nerrors = nerrors; if (MAINPROCESS) - HDprintf("File creation and immediate close\n"); + TESTING_2("file creation and immediate close"); - fapl_id = create_subfiling_ioc_fapl(); + /* Get a default Subfiling FAPL 
*/ + fapl_id = create_subfiling_ioc_fapl(comm_g, info_g, FALSE, NULL, 0); VRFY((fapl_id >= 0), "FAPL creation succeeded"); - file_id = H5Fcreate("basic_create.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + file_id = H5Fcreate(SUBF_FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); VRFY((file_id >= 0), "H5Fcreate succeeded"); VRFY((H5Fclose(file_id) >= 0), "File close succeeded"); - test_filenames[0] = "basic_create.h5"; - test_filenames[1] = NULL; - h5_clean_files(test_filenames, fapl_id); + H5E_BEGIN_TRY + { + H5Fdelete(SUBF_FILENAME, fapl_id); + } + H5E_END_TRY; + + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); - return; + CHECK_PASSED(); } +#undef SUBF_FILENAME -int -main(int argc, char **argv) +/* + * Test to check that Subfiling configuration file matches + * what is expected for a given configuration + */ +#define SUBF_FILENAME "test_subfiling_config_file.h5" +static void +test_config_file(void) { - int required = MPI_THREAD_MULTIPLE; - int provided = 0; - int mpi_code; + H5FD_subfiling_params_t cfg; + int64_t stripe_size; + int64_t read_stripe_size; + FILE *config_file; + char *config_filename = NULL; + char *config_buf = NULL; + long config_file_len; + hid_t file_id = H5I_INVALID_HID; + hid_t fapl_id = H5I_INVALID_HID; + int read_stripe_count; + int read_aggr_count; - /* Initialize MPI */ - if (MPI_SUCCESS != MPI_Init_thread(&argc, &argv, required, &provided)) { - HDprintf("MPI_Init_thread failed\n"); - nerrors++; - goto exit; - } + curr_nerrors = nerrors; - if (provided != required) { - HDprintf("MPI doesn't support MPI_Init_thread with MPI_THREAD_MULTIPLE\n"); - nerrors++; - goto exit; - } + if (MAINPROCESS) + TESTING_2("subfiling configuration file format"); - MPI_Comm_size(comm, &mpi_size); - MPI_Comm_rank(comm, &mpi_rank); + /* + * Choose a random Subfiling stripe size between + * the smallest allowed value and 32MiB + */ + stripe_size = (rand() % (H5FD_SUBFILING_DEFAULT_STRIPE_SIZE - SUBFILING_MIN_STRIPE_SIZE + 1)) + + SUBFILING_MIN_STRIPE_SIZE; 
- if (H5dont_atexit() < 0) { - if (MAINPROCESS) - HDprintf("Failed to turn off atexit processing. Continue.\n"); - } + cfg.ioc_selection = SELECT_IOC_ONE_PER_NODE; + cfg.stripe_size = (stripe_size_g > 0) ? stripe_size_g : stripe_size; + cfg.stripe_count = num_iocs_g > 1 ? (num_iocs_g / 2) : 1; - H5open(); + fapl_id = create_subfiling_ioc_fapl(comm_g, info_g, TRUE, &cfg, H5FD_IOC_DEFAULT_THREAD_POOL_SIZE); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); - /* Enable selection I/O using internal temporary workaround */ - H5_use_selection_io_g = TRUE; + file_id = H5Fcreate(SUBF_FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + VRFY((file_id >= 0), "H5Fcreate succeeded"); + + VRFY((H5Fclose(file_id) >= 0), "File close succeeded"); if (MAINPROCESS) { - HDprintf("Testing Subfiling VFD functionality\n"); + h5_stat_t file_info; + char *resolved_path; + char *subfile_dir; + char *subfile_name; + char *tmp_buf; + char *substr; + char scan_format[256]; + int num_digits; + + VRFY((HDstat(SUBF_FILENAME, &file_info) >= 0), "HDstat succeeded"); + + config_filename = HDmalloc(PATH_MAX); + VRFY(config_filename, "HDmalloc succeeded"); + + HDsnprintf(config_filename, PATH_MAX, H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino); + + config_file = HDfopen(config_filename, "r"); + VRFY(config_file, "HDfopen succeeded"); + + HDfree(config_filename); + + VRFY((HDfseek(config_file, 0, SEEK_END) >= 0), "HDfseek succeeded"); + + config_file_len = HDftell(config_file); + VRFY((config_file_len > 0), "HDftell succeeded"); + + VRFY((HDfseek(config_file, 0, SEEK_SET) >= 0), "HDfseek succeeded"); + + config_buf = HDmalloc((size_t)config_file_len + 1); + VRFY(config_buf, "HDmalloc succeeded"); + + VRFY((HDfread(config_buf, (size_t)config_file_len, 1, config_file) == 1), "HDfread succeeded"); + config_buf[config_file_len] = '\0'; + + /* Check the stripe_size field in the configuration file */ + substr = HDstrstr(config_buf, "stripe_size"); + VRFY(substr, 
"HDstrstr succeeded"); + + VRFY((HDsscanf(substr, "stripe_size=%" PRId64, &read_stripe_size) == 1), "HDsscanf succeeded"); + VRFY((read_stripe_size == cfg.stripe_size), "Stripe size comparison succeeded"); + + /* Check the aggregator_count field in the configuration file */ + substr = HDstrstr(config_buf, "aggregator_count"); + VRFY(substr, "HDstrstr succeeded"); + + VRFY((HDsscanf(substr, "aggregator_count=%d", &read_aggr_count) == 1), "HDsscanf succeeded"); + if (cfg.stripe_count < num_iocs_g) + VRFY((read_aggr_count == cfg.stripe_count), "Aggregator count comparison succeeded"); + else + VRFY((read_aggr_count == num_iocs_g), "Aggregator count comparison succeeded"); + + /* Check the subfile_count field in the configuration file */ + substr = HDstrstr(config_buf, "subfile_count"); + VRFY(substr, "HDstrstr succeeded"); + + VRFY((HDsscanf(substr, "subfile_count=%d", &read_stripe_count) == 1), "HDsscanf succeeded"); + VRFY((read_stripe_count == cfg.stripe_count), "Stripe count comparison succeeded"); + + /* Check the hdf5_file and subfile_dir fields in the configuration file */ + resolved_path = HDrealpath(SUBF_FILENAME, NULL); + VRFY(resolved_path, "HDrealpath succeeded"); + + VRFY((H5_dirname(resolved_path, &subfile_dir) >= 0), "H5_dirname succeeded"); + + tmp_buf = HDmalloc(PATH_MAX); + VRFY(tmp_buf, "HDmalloc succeeded"); + + substr = HDstrstr(config_buf, "hdf5_file"); + VRFY(substr, "HDstrstr succeeded"); + + HDsnprintf(scan_format, sizeof(scan_format), "hdf5_file=%%%zus", (size_t)(PATH_MAX - 1)); + VRFY((HDsscanf(substr, scan_format, tmp_buf) == 1), "HDsscanf succeeded"); + + VRFY((HDstrcmp(tmp_buf, resolved_path) == 0), "HDstrcmp succeeded"); + + substr = HDstrstr(config_buf, "subfile_dir"); + VRFY(substr, "HDstrstr succeeded"); + + HDsnprintf(scan_format, sizeof(scan_format), "subfile_dir=%%%zus", (size_t)(PATH_MAX - 1)); + VRFY((HDsscanf(substr, scan_format, tmp_buf) == 1), "HDsscanf succeeded"); + + VRFY((HDstrcmp(tmp_buf, subfile_dir) == 0), "HDstrcmp 
succeeded"); + + HDfree(tmp_buf); + H5MM_free(subfile_dir); + HDfree(resolved_path); + + subfile_name = HDmalloc(PATH_MAX); + VRFY(subfile_name, "HDmalloc succeeded"); + + /* Verify the name of each subfile is in the configuration file */ + num_digits = (int)(HDlog10(cfg.stripe_count) + 1); + for (size_t i = 0; i < (size_t)cfg.stripe_count; i++) { + HDsnprintf(subfile_name, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, (int)i + 1, cfg.stripe_count); + + substr = HDstrstr(config_buf, subfile_name); + VRFY(substr, "HDstrstr succeeded"); + } + + /* Verify that there aren't too many subfiles */ + HDsnprintf(subfile_name, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, (int)cfg.stripe_count + 1, cfg.stripe_count); + substr = HDstrstr(config_buf, subfile_name); + VRFY(substr == NULL, "HDstrstr correctly failed"); + + HDfree(subfile_name); + HDfree(config_buf); + + VRFY((HDfclose(config_file) >= 0), "HDfclose on configuration file succeeded"); } - TestAlarmOn(); + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); - /* Create directories for test-generated .h5 files */ - if ((HDmkdir(SUBFILING_TEST_DIR, (mode_t)0755) < 0) && (errno != EEXIST)) { - HDprintf("couldn't create subfiling testing directory\n"); - nerrors++; - goto exit; + H5E_BEGIN_TRY + { + H5Fdelete(SUBF_FILENAME, fapl_id); } + H5E_END_TRY; - for (size_t i = 0; i < ARRAY_SIZE(tests); i++) { - if (MPI_SUCCESS == (mpi_code = MPI_Barrier(comm))) { - (*tests[i])(); + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); + + CHECK_PASSED(); +} +#undef SUBF_FILENAME + +/* + * Test a few different Subfiling stripe sizes with a fixed + * stripe count + */ +/* TODO: Test collective I/O as well when support is implemented */ +#define SUBF_FILENAME "test_subfiling_stripe_sizes.h5" +#define SUBF_NITER 10 +static void +test_stripe_sizes(void) +{ + H5FD_t *file_ptr = 
NULL; + void *write_buf = NULL; + char *tmp_filename = NULL; + hid_t dxpl_id = H5I_INVALID_HID; + int num_subfiles; + int num_digits; + hid_t fapl_id = H5I_INVALID_HID; + + curr_nerrors = nerrors; + + if (MAINPROCESS) + TESTING_2("random subfiling stripe sizes"); + + tmp_filename = HDmalloc(PATH_MAX); + VRFY(tmp_filename, "HDmalloc succeeded"); + + dxpl_id = H5Pcreate(H5P_DATASET_XFER); + VRFY((dxpl_id >= 0), "DCPL creation succeeded"); + + for (size_t i = 0; i < SUBF_NITER; i++) { + H5FD_subfiling_params_t cfg; + h5_stat_size_t file_size; + const void *c_write_buf; + h5_stat_t file_info; + int64_t file_size64; + int64_t stripe_size; + haddr_t file_end_addr; + haddr_t write_addr; + size_t nbytes; + herr_t write_status; + hid_t file_id; + + /* + * Choose a random Subfiling stripe size between + * the smallest allowed value and the default value + */ + stripe_size = (rand() % (H5FD_SUBFILING_DEFAULT_STRIPE_SIZE - SUBFILING_MIN_STRIPE_SIZE + 1)) + + SUBFILING_MIN_STRIPE_SIZE; + + cfg.ioc_selection = SELECT_IOC_ONE_PER_NODE; + cfg.stripe_size = (stripe_size_g > 0) ? 
stripe_size_g : stripe_size; + cfg.stripe_count = 1; + + /* First, try I/O with a single rank */ + if (MAINPROCESS) { + FILE *subfile_ptr; + + num_subfiles = 1; + num_digits = (int)(HDlog10(num_subfiles) + 1); + + nbytes = (size_t)(cfg.stripe_size * num_subfiles); + + write_buf = HDmalloc(nbytes); + VRFY(write_buf, "HDmalloc succeeded"); + + HDmemset(write_buf, 255, nbytes); + + c_write_buf = write_buf; + + fapl_id = create_subfiling_ioc_fapl(MPI_COMM_SELF, MPI_INFO_NULL, TRUE, &cfg, + H5FD_IOC_DEFAULT_THREAD_POOL_SIZE); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); + + /* Create and close file with H5Fcreate to setup superblock */ + file_id = H5Fcreate(SUBF_FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + VRFY((file_id >= 0), "H5Fcreate succeeded"); + VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded"); + + /* Re-open file through H5FDopen for direct writes */ + file_ptr = H5FDopen(SUBF_FILENAME, H5F_ACC_RDWR, fapl_id, HADDR_UNDEF); + VRFY(file_ptr, "H5FDopen succeeded"); + + /* + * Get the current file size to see where we can safely + * write to in the file without overwriting the superblock + */ + VRFY((HDstat(SUBF_FILENAME, &file_info) >= 0), "HDstat succeeded"); + file_size = (h5_stat_size_t)file_info.st_size; + + H5_CHECK_OVERFLOW(file_size, h5_stat_size_t, haddr_t); + file_end_addr = (haddr_t)file_size; + + /* Set independent I/O on DXPL */ + VRFY((H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_INDEPENDENT) >= 0), "H5Pset_dxpl_mpio succeeded"); + + /* Set EOA for following write call */ + VRFY((H5FDset_eoa(file_ptr, H5FD_MEM_DEFAULT, file_end_addr + nbytes) >= 0), + "H5FDset_eoa succeeded"); + + /* + * Write "number of IOCs" X "stripe size" bytes to the file + * and ensure that we have "number of IOCs" subfiles, each + * with a size of at least "stripe size" bytes. The first + * (few) subfile(s) may be a bit larger due to file metadata. 
+ */ + write_addr = file_end_addr; + write_status = H5FDwrite(file_ptr, H5FD_MEM_DRAW, dxpl_id, write_addr, nbytes, c_write_buf); + VRFY((write_status >= 0), "H5FDwrite succeeded"); + + file_end_addr += nbytes; + + for (int j = 0; j < num_subfiles; j++) { + h5_stat_size_t subfile_size; + h5_stat_t subfile_info; + + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, j + 1, num_subfiles); + + /* Ensure file exists */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr, "HDfopen on subfile succeeded"); + VRFY((HDfclose(subfile_ptr) >= 0), "HDfclose on subfile succeeded"); + + /* Check file size */ + VRFY((HDstat(tmp_filename, &subfile_info) >= 0), "HDstat succeeded"); + subfile_size = (h5_stat_size_t)subfile_info.st_size; + + VRFY((subfile_size >= cfg.stripe_size), "File size verification succeeded"); + } + + /* Verify that there aren't too many subfiles */ + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, num_subfiles + 1, num_subfiles); + + /* Ensure file doesn't exist */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr == NULL, "HDfopen on subfile correctly failed"); + + /* Set EOA for following write call */ + VRFY((H5FDset_eoa(file_ptr, H5FD_MEM_DEFAULT, file_end_addr + nbytes) >= 0), + "H5FDset_eoa succeeded"); + + /* + * Write another round of "number of IOCs" X "stripe size" + * bytes to the file using vector I/O and ensure we have + * "number of IOCs" subfiles, each with a size of at least + * 2 * "stripe size" bytes. The first (few) subfile(s) may + * be a bit larger due to file metadata. 
+ */ + H5FD_mem_t write_type = H5FD_MEM_DRAW; + write_addr = file_end_addr; + write_status = + H5FDwrite_vector(file_ptr, dxpl_id, 1, &write_type, &write_addr, &nbytes, &c_write_buf); + VRFY((write_status >= 0), "H5FDwrite_vector succeeded"); + + for (int j = 0; j < num_subfiles; j++) { + h5_stat_size_t subfile_size; + h5_stat_t subfile_info; + + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, j + 1, num_subfiles); + + /* Ensure file exists */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr, "HDfopen on subfile succeeded"); + VRFY((HDfclose(subfile_ptr) >= 0), "HDfclose on subfile succeeded"); + + /* Check file size */ + VRFY((HDstat(tmp_filename, &subfile_info) >= 0), "HDstat succeeded"); + subfile_size = (h5_stat_size_t)subfile_info.st_size; + + VRFY((subfile_size >= 2 * cfg.stripe_size), "File size verification succeeded"); + } + + /* Verify that there aren't too many subfiles */ + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, num_subfiles + 1, num_subfiles); + + /* Ensure file doesn't exist */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr == NULL, "HDfopen on subfile correctly failed"); + + HDfree(write_buf); + write_buf = NULL; + + VRFY((H5FDclose(file_ptr) >= 0), "H5FDclose succeeded"); + + H5E_BEGIN_TRY + { + H5Fdelete(SUBF_FILENAME, fapl_id); + } + H5E_END_TRY; + + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); } - else { - if (MAINPROCESS) - MESG("MPI_Barrier failed"); - nerrors++; + + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + /* Next, try I/O with all ranks */ + + cfg.stripe_count = num_iocs_g; + + fapl_id = create_subfiling_ioc_fapl(comm_g, info_g, TRUE, &cfg, H5FD_IOC_DEFAULT_THREAD_POOL_SIZE); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); + + /* Create and close file with H5Fcreate to setup 
superblock */ + file_id = H5Fcreate(SUBF_FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + VRFY((file_id >= 0), "H5Fcreate succeeded"); + VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded"); + + /* Re-open file through H5FDopen for direct writes */ + file_ptr = H5FDopen(SUBF_FILENAME, H5F_ACC_RDWR, fapl_id, HADDR_UNDEF); + VRFY(file_ptr, "H5FDopen succeeded"); + + num_subfiles = num_iocs_g; + num_digits = (int)(HDlog10(num_subfiles) + 1); + + nbytes = (size_t)(cfg.stripe_size * num_subfiles); + + write_buf = HDmalloc(nbytes); + VRFY(write_buf, "HDmalloc succeeded"); + + HDmemset(write_buf, 255, nbytes); + + c_write_buf = write_buf; + + /* + * Get the current file size to see where we can safely + * write to in the file without overwriting the superblock + */ + if (MAINPROCESS) { + VRFY((HDstat(SUBF_FILENAME, &file_info) >= 0), "HDstat succeeded"); + file_size = (h5_stat_size_t)file_info.st_size; + + H5_CHECK_OVERFLOW(file_size, h5_stat_size_t, int64_t); + file_size64 = (int64_t)file_size; + } + + if (mpi_size > 1) { + mpi_code_g = MPI_Bcast(&file_size64, 1, MPI_INT64_T, 0, comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Bcast succeeded"); + } + + H5_CHECK_OVERFLOW(file_size64, int64_t, haddr_t); + file_end_addr = (haddr_t)file_size64; + + /* Set independent I/O on DXPL */ + VRFY((H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_INDEPENDENT) >= 0), "H5Pset_dxpl_mpio succeeded"); + + /* Set EOA for following write call */ + VRFY((H5FDset_eoa(file_ptr, H5FD_MEM_DEFAULT, file_end_addr + ((size_t)mpi_size * nbytes)) >= 0), + "H5FDset_eoa succeeded"); + + /* + * Write "number of IOCs" X "stripe size" bytes to the file + * from each rank and ensure that we have "number of IOCs" + * subfiles, each with a size of at least "mpi size" * "stripe size" + * bytes. The first (few) subfile(s) may be a bit larger + * due to file metadata. 
+ */ + write_addr = file_end_addr + ((size_t)mpi_rank * nbytes); + write_status = H5FDwrite(file_ptr, H5FD_MEM_DRAW, dxpl_id, write_addr, nbytes, c_write_buf); + VRFY((write_status >= 0), "H5FDwrite succeeded"); + + file_end_addr += ((size_t)mpi_size * nbytes); + + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + if (MAINPROCESS) { + FILE *subfile_ptr; + + for (int j = 0; j < num_subfiles; j++) { + h5_stat_size_t subfile_size; + h5_stat_t subfile_info; + + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, j + 1, num_subfiles); + + /* Ensure file exists */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr, "HDfopen on subfile succeeded"); + VRFY((HDfclose(subfile_ptr) >= 0), "HDfclose on subfile succeeded"); + + /* Check file size */ + VRFY((HDstat(tmp_filename, &subfile_info) >= 0), "HDstat succeeded"); + subfile_size = (h5_stat_size_t)subfile_info.st_size; + + VRFY((subfile_size >= (mpi_size * cfg.stripe_size)), "File size verification succeeded"); + } + + /* Verify that there aren't too many subfiles */ + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, num_subfiles + 1, num_subfiles); + + /* Ensure file doesn't exist */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr == NULL, "HDfopen on subfile correctly failed"); + } + + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + /* Set EOA for following write call */ + VRFY((H5FDset_eoa(file_ptr, H5FD_MEM_DEFAULT, file_end_addr + ((size_t)mpi_size * nbytes)) >= 0), + "H5FDset_eoa succeeded"); + + /* + * Write another round of "number of IOCs" X "stripe size" + * bytes to the file from each rank using vector I/O and + * ensure we have "number of IOCs" subfiles, each with a + * size of at least 2 * "mpi size" * "stripe size" bytes. 
+ * The first (few) subfile(s) may be a bit larger due to + * file metadata. + */ + H5FD_mem_t write_type = H5FD_MEM_DRAW; + write_addr = file_end_addr + ((size_t)mpi_rank * nbytes); + write_status = + H5FDwrite_vector(file_ptr, dxpl_id, 1, &write_type, &write_addr, &nbytes, &c_write_buf); + VRFY((write_status >= 0), "H5FDwrite_vector succeeded"); + + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + if (MAINPROCESS) { + FILE *subfile_ptr; + + for (int j = 0; j < num_subfiles; j++) { + h5_stat_size_t subfile_size; + h5_stat_t subfile_info; + + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, j + 1, num_subfiles); + + /* Ensure file exists */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr, "HDfopen on subfile succeeded"); + VRFY((HDfclose(subfile_ptr) >= 0), "HDfclose on subfile succeeded"); + + /* Check file size */ + VRFY((HDstat(tmp_filename, &subfile_info) >= 0), "HDstat succeeded"); + subfile_size = (h5_stat_size_t)subfile_info.st_size; + + VRFY((subfile_size >= (2 * mpi_size * cfg.stripe_size)), "File size verification succeeded"); + } + + /* Verify that there aren't too many subfiles */ + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, num_subfiles + 1, num_subfiles); + + /* Ensure file doesn't exist */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr == NULL, "HDfopen on subfile correctly failed"); + } + + VRFY((H5FDclose(file_ptr) >= 0), "H5FDclose succeeded"); + + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + H5E_BEGIN_TRY + { + H5Fdelete(SUBF_FILENAME, fapl_id); } + H5E_END_TRY; + + HDfree(write_buf); + + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); } - if (nerrors) - goto exit; + HDfree(tmp_filename); + + VRFY((H5Pclose(dxpl_id) >= 0), "DXPL close succeeded"); 
+ + CHECK_PASSED(); +} +#undef SUBF_FILENAME +#undef SUBF_NITER + +/* + * Test that opening a file with a different stripe + * size/count than was used when creating the file + * results in the original stripe size/count being + * used. As there is currently no API to check the + * exact values used, we rely on the assumption that + * using a different stripe size/count would result + * in data verification failures. + */ +#define SUBF_FILENAME "test_subfiling_read_different_stripe_sizes.h5" +#define SUBF_HDF5_TYPE H5T_NATIVE_INT +#define SUBF_C_TYPE int +static void +test_read_different_stripe_size(void) +{ + H5FD_subfiling_params_t cfg; + hsize_t start[1]; + hsize_t count[1]; + hsize_t dset_dims[1]; + size_t target_size; + hid_t file_id = H5I_INVALID_HID; + hid_t fapl_id = H5I_INVALID_HID; + hid_t dset_id = H5I_INVALID_HID; + hid_t fspace_id = H5I_INVALID_HID; + char *tmp_filename = NULL; + void *buf = NULL; + + curr_nerrors = nerrors; if (MAINPROCESS) - HDputs("All Subfiling VFD tests passed\n"); + TESTING_2("file re-opening with different stripe size"); -exit: - if (nerrors) { - if (MAINPROCESS) - HDprintf("*** %d TEST ERROR%s OCCURRED ***\n", nerrors, nerrors > 1 ? "S" : ""); + tmp_filename = HDmalloc(PATH_MAX); + VRFY(tmp_filename, "HDmalloc succeeded"); + + /* Use a 1MiB stripe size and a subfile for each IOC */ + cfg.ioc_selection = SELECT_IOC_ONE_PER_NODE; + cfg.stripe_size = (stripe_size_g > 0) ? 
stripe_size_g : 1048576; + cfg.stripe_count = num_iocs_g; + + fapl_id = create_subfiling_ioc_fapl(comm_g, info_g, TRUE, &cfg, H5FD_IOC_DEFAULT_THREAD_POOL_SIZE); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); + + file_id = H5Fcreate(SUBF_FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + VRFY((file_id >= 0), "H5Fcreate succeeded"); + + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); + + target_size = (size_t)cfg.stripe_size; + + /* Nudge stripe size to be multiple of C type size */ + if ((target_size % sizeof(SUBF_C_TYPE)) != 0) + target_size += sizeof(SUBF_C_TYPE) - (target_size % sizeof(SUBF_C_TYPE)); + + target_size *= (size_t)mpi_size; + + VRFY(((target_size % sizeof(SUBF_C_TYPE)) == 0), "target size check succeeded"); + + dset_dims[0] = (hsize_t)(target_size / sizeof(SUBF_C_TYPE)); + + fspace_id = H5Screate_simple(1, dset_dims, NULL); + VRFY((fspace_id >= 0), "H5Screate_simple succeeded"); + + dset_id = H5Dcreate2(file_id, "DSET", SUBF_HDF5_TYPE, fspace_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + VRFY((dset_id >= 0), "Dataset creation succeeded"); + + /* Select hyperslab */ + count[0] = dset_dims[0] / (hsize_t)mpi_size; + start[0] = (hsize_t)mpi_rank * count[0]; + VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, NULL, count, NULL) >= 0), + "H5Sselect_hyperslab succeeded"); + + buf = HDmalloc(count[0] * sizeof(SUBF_C_TYPE)); + VRFY(buf, "HDmalloc succeeded"); + + for (size_t i = 0; i < count[0]; i++) + ((SUBF_C_TYPE *)buf)[i] = (SUBF_C_TYPE)((size_t)mpi_rank + i); + + VRFY((H5Dwrite(dset_id, SUBF_HDF5_TYPE, H5S_BLOCK, fspace_id, H5P_DEFAULT, buf) >= 0), + "Dataset write succeeded"); + + HDfree(buf); + buf = NULL; + + VRFY((H5Sclose(fspace_id) >= 0), "File dataspace close succeeded"); + VRFY((H5Dclose(dset_id) >= 0), "Dataset close succeeded"); + VRFY((H5Fclose(file_id) >= 0), "File close succeeded"); + + /* Ensure all the subfiles are present */ + if (MAINPROCESS) { + h5_stat_t file_info; + FILE *subfile_ptr; + int num_subfiles = 
cfg.stripe_count; + int num_digits = (int)(HDlog10(num_subfiles) + 1); + + VRFY((HDstat(SUBF_FILENAME, &file_info) >= 0), "HDstat succeeded"); + + for (int j = 0; j < num_subfiles; j++) { + h5_stat_size_t subfile_size; + h5_stat_t subfile_info; + + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, j + 1, num_subfiles); + + /* Ensure file exists */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr, "HDfopen on subfile succeeded"); + VRFY((HDfclose(subfile_ptr) >= 0), "HDfclose on subfile succeeded"); + + /* Check file size */ + VRFY((HDstat(tmp_filename, &subfile_info) >= 0), "HDstat succeeded"); + subfile_size = (h5_stat_size_t)subfile_info.st_size; + + VRFY((subfile_size >= cfg.stripe_size), "File size verification succeeded"); + } } - TestAlarmOff(); + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); - H5close(); + /* Add a bit to the stripe size and specify a few more subfiles */ + cfg.stripe_size += (cfg.stripe_size / 2); + cfg.stripe_count *= 2; + + fapl_id = create_subfiling_ioc_fapl(comm_g, info_g, TRUE, &cfg, H5FD_IOC_DEFAULT_THREAD_POOL_SIZE); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); + + file_id = H5Fopen(SUBF_FILENAME, H5F_ACC_RDONLY, fapl_id); + VRFY((file_id >= 0), "H5Fopen succeeded"); + + dset_id = H5Dopen2(file_id, "DSET", H5P_DEFAULT); + VRFY((dset_id >= 0), "Dataset open succeeded"); + + fspace_id = H5Dget_space(dset_id); + VRFY((fspace_id >= 0), "Dataspace retrieval succeeded"); + + /* Select hyperslab */ + count[0] = dset_dims[0] / (hsize_t)mpi_size; + start[0] = (hsize_t)mpi_rank * count[0]; + VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, NULL, count, NULL) >= 0), + "H5Sselect_hyperslab succeeded"); + + buf = HDcalloc(1, count[0] * sizeof(SUBF_C_TYPE)); + VRFY(buf, "HDcalloc succeeded"); + + VRFY((H5Dread(dset_id, SUBF_HDF5_TYPE, H5S_BLOCK, fspace_id, H5P_DEFAULT, buf) >= 0), + "Dataset 
read succeeded"); + + for (size_t i = 0; i < count[0]; i++) { + SUBF_C_TYPE buf_value = ((SUBF_C_TYPE *)buf)[i]; + + VRFY((buf_value == (SUBF_C_TYPE)((size_t)mpi_rank + i)), "data verification succeeded"); + } + + HDfree(buf); + buf = NULL; + + VRFY((H5Sclose(fspace_id) >= 0), "File dataspace close succeeded"); + VRFY((H5Dclose(dset_id) >= 0), "Dataset close succeeded"); + VRFY((H5Fclose(file_id) >= 0), "File close succeeded"); + + /* Ensure only the original subfiles are present */ + if (MAINPROCESS) { + h5_stat_t file_info; + FILE *subfile_ptr; + int num_subfiles = cfg.stripe_count; + /* + * cfg.stripe_count was doubled before the re-open, while the + * filenames below are built with the original count + * (num_subfiles / 2); derive the index field width from that + * same original count so the generated names match the ones + * checked right after file creation + */ + int num_digits = (int)(HDlog10(num_subfiles / 2) + 1); + + VRFY((HDstat(SUBF_FILENAME, &file_info) >= 0), "HDstat succeeded"); + + for (int j = 0; j < num_subfiles; j++) { + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, j + 1, num_subfiles / 2); + + if (j < (num_subfiles / 2)) { + /* Ensure file exists */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr, "HDfopen on subfile succeeded"); + VRFY((HDfclose(subfile_ptr) >= 0), "HDfclose on subfile succeeded"); + } + else { + /* Ensure file doesn't exist */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr == NULL, "HDfopen on subfile correctly failed"); + } + } + } + + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + H5E_BEGIN_TRY + { + H5Fdelete(SUBF_FILENAME, fapl_id); + } + H5E_END_TRY; + + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); + + HDfree(tmp_filename); + + CHECK_PASSED(); +} +#undef SUBF_FILENAME +#undef SUBF_HDF5_TYPE +#undef SUBF_C_TYPE + +/* + * Test that everything works correctly when a file is + * pre-created on rank 0 with a specified target number + * of subfiles and then read back on all ranks.
+ */ +#define SUBF_FILENAME "test_subfiling_precreate_rank_0.h5" +#define SUBF_HDF5_TYPE H5T_NATIVE_INT +#define SUBF_C_TYPE int +static void +test_subfiling_precreate_rank_0(void) +{ + hsize_t start[1]; + hsize_t count[1]; + hsize_t dset_dims[1]; + size_t target_size; + size_t n_elements_per_rank; + hid_t file_id = H5I_INVALID_HID; + hid_t fapl_id = H5I_INVALID_HID; + hid_t dset_id = H5I_INVALID_HID; + hid_t fspace_id = H5I_INVALID_HID; + void *buf = NULL; + + curr_nerrors = nerrors; + + if (MAINPROCESS) + TESTING_2("file pre-creation on rank 0"); + + /* Calculate target size for dataset to stripe it across available IOCs */ + target_size = (stripe_size_g > 0) ? (size_t)stripe_size_g : H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; + + /* Nudge stripe size to be multiple of C type size */ + if ((target_size % sizeof(SUBF_C_TYPE)) != 0) + target_size += sizeof(SUBF_C_TYPE) - (target_size % sizeof(SUBF_C_TYPE)); + + target_size *= (size_t)mpi_size; + + VRFY(((target_size % sizeof(SUBF_C_TYPE)) == 0), "target size check succeeded"); + + if (stripe_size_g > 0) { + VRFY((target_size >= (size_t)stripe_size_g), "target size check succeeded"); + } + else { + VRFY((target_size >= H5FD_SUBFILING_DEFAULT_STRIPE_SIZE), "target size check succeeded"); + } + + dset_dims[0] = (hsize_t)(target_size / sizeof(SUBF_C_TYPE)); + n_elements_per_rank = (dset_dims[0] / (size_t)mpi_size); + + /* Create and populate file on rank 0 only */ + if (MAINPROCESS) { + H5FD_subfiling_params_t cfg; + h5_stat_size_t file_size; + h5_stat_t file_info; + FILE *subfile_ptr; + char *tmp_filename = NULL; + int num_subfiles; + int num_digits; + + /* Create a file consisting of 1 subfile per application I/O concentrator */ + cfg.ioc_selection = SELECT_IOC_ONE_PER_NODE; + cfg.stripe_size = (stripe_size_g > 0) ? 
stripe_size_g : H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; + cfg.stripe_count = num_iocs_g; + + fapl_id = create_subfiling_ioc_fapl(MPI_COMM_SELF, MPI_INFO_NULL, TRUE, &cfg, + H5FD_IOC_DEFAULT_THREAD_POOL_SIZE); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); + + file_id = H5Fcreate(SUBF_FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + VRFY((file_id >= 0), "H5Fcreate succeeded"); + + fspace_id = H5Screate_simple(1, dset_dims, NULL); + VRFY((fspace_id >= 0), "H5Screate_simple succeeded"); + + dset_id = + H5Dcreate2(file_id, "DSET", SUBF_HDF5_TYPE, fspace_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + VRFY((dset_id >= 0), "Dataset creation succeeded"); + + buf = HDmalloc(dset_dims[0] * sizeof(SUBF_C_TYPE)); + VRFY(buf, "HDmalloc succeeded"); + + for (size_t i = 0; i < dset_dims[0]; i++) + ((SUBF_C_TYPE *)buf)[i] = (SUBF_C_TYPE)((i / n_elements_per_rank) + (i % n_elements_per_rank)); + + VRFY((H5Dwrite(dset_id, SUBF_HDF5_TYPE, H5S_BLOCK, fspace_id, H5P_DEFAULT, buf) >= 0), + "Dataset write succeeded"); + + HDfree(buf); + buf = NULL; + + VRFY((H5Sclose(fspace_id) >= 0), "File dataspace close succeeded"); + VRFY((H5Dclose(dset_id) >= 0), "Dataset close succeeded"); + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); + VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded"); + + /* + * Ensure that all the subfiles are present + */ + + num_subfiles = cfg.stripe_count; + num_digits = (int)(HDlog10(num_subfiles) + 1); + + VRFY((HDstat(SUBF_FILENAME, &file_info) >= 0), "HDstat succeeded"); + + tmp_filename = HDmalloc(PATH_MAX); + VRFY(tmp_filename, "HDmalloc succeeded"); + + for (int i = 0; i < num_subfiles; i++) { + h5_stat_t subfile_info; + + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, i + 1, num_subfiles); + + /* Ensure file exists */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr, "HDfopen on subfile succeeded"); + VRFY((HDfclose(subfile_ptr) >= 0), "HDfclose on subfile 
succeeded"); + + /* Check file size */ + VRFY((HDstat(tmp_filename, &subfile_info) >= 0), "HDstat succeeded"); + file_size = (h5_stat_size_t)subfile_info.st_size; + + VRFY((file_size >= cfg.stripe_size), "File size verification succeeded"); + } + + /* Verify that there aren't too many subfiles */ + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, num_subfiles + 1, num_subfiles); + + /* Ensure file doesn't exist */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr == NULL, "HDfopen on subfile correctly failed"); + + HDfree(tmp_filename); + tmp_filename = NULL; + } + + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + /* Open the file on all ranks */ + + fapl_id = create_subfiling_ioc_fapl(comm_g, info_g, FALSE, NULL, 0); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); + + file_id = H5Fopen(SUBF_FILENAME, H5F_ACC_RDONLY, fapl_id); + VRFY((file_id >= 0), "H5Fopen succeeded"); + + dset_id = H5Dopen2(file_id, "DSET", H5P_DEFAULT); + VRFY((dset_id >= 0), "Dataset open succeeded"); + + fspace_id = H5Dget_space(dset_id); + VRFY((fspace_id >= 0), "Dataset dataspace retrieval succeeded"); + + /* Select hyperslab */ + count[0] = n_elements_per_rank; + start[0] = (hsize_t)mpi_rank * count[0]; + VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, NULL, count, NULL) >= 0), + "H5Sselect_hyperslab succeeded"); + + buf = HDcalloc(1, count[0] * sizeof(SUBF_C_TYPE)); + VRFY(buf, "HDcalloc succeeded"); + + VRFY((H5Dread(dset_id, SUBF_HDF5_TYPE, H5S_BLOCK, fspace_id, H5P_DEFAULT, buf) >= 0), + "Dataset read succeeded"); + + for (size_t i = 0; i < n_elements_per_rank; i++) { + SUBF_C_TYPE buf_value = ((SUBF_C_TYPE *)buf)[i]; + + VRFY((buf_value == (SUBF_C_TYPE)((size_t)mpi_rank + i)), "data verification succeeded"); + } + + HDfree(buf); + buf = NULL; + + VRFY((H5Sclose(fspace_id) >= 0), "File dataspace close succeeded"); + 
VRFY((H5Dclose(dset_id) >= 0), "Dataset close succeeded"); + VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded"); + + H5E_BEGIN_TRY + { + H5Fdelete(SUBF_FILENAME, fapl_id); + } + H5E_END_TRY; + + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); + + CHECK_PASSED(); +} +#undef SUBF_FILENAME +#undef SUBF_HDF5_TYPE +#undef SUBF_C_TYPE + +/* + * Test to check that an HDF5 file created with the + * Subfiling VFD can be read back with a single MPI + * rank + */ +#define SUBF_FILENAME "test_subfiling_write_many_read_one.h5" +#define SUBF_HDF5_TYPE H5T_NATIVE_INT +#define SUBF_C_TYPE int +static void +test_subfiling_write_many_read_one(void) +{ + hsize_t start[1]; + hsize_t count[1]; + hsize_t dset_dims[1]; + size_t target_size; + hid_t file_id = H5I_INVALID_HID; + hid_t fapl_id = H5I_INVALID_HID; + hid_t dset_id = H5I_INVALID_HID; + hid_t fspace_id = H5I_INVALID_HID; + void *buf = NULL; + + curr_nerrors = nerrors; + + if (MAINPROCESS) + TESTING_2("reading back file with single MPI rank"); + + /* Get a default Subfiling FAPL */ + fapl_id = create_subfiling_ioc_fapl(comm_g, info_g, FALSE, NULL, 0); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); + + /* Create file on all ranks */ + file_id = H5Fcreate(SUBF_FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + VRFY((file_id >= 0), "H5Fcreate succeeded"); + + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); + + /* Calculate target size for dataset to stripe it across available IOCs */ + target_size = (stripe_size_g > 0) ? 
(size_t)stripe_size_g : H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; + + /* Nudge stripe size to be multiple of C type size */ + if ((target_size % sizeof(SUBF_C_TYPE)) != 0) + target_size += sizeof(SUBF_C_TYPE) - (target_size % sizeof(SUBF_C_TYPE)); + + target_size *= (size_t)mpi_size; + + VRFY(((target_size % sizeof(SUBF_C_TYPE)) == 0), "target size check succeeded"); + + if (stripe_size_g > 0) { + VRFY((target_size >= (size_t)stripe_size_g), "target size check succeeded"); + } + else { + VRFY((target_size >= H5FD_SUBFILING_DEFAULT_STRIPE_SIZE), "target size check succeeded"); + } + + dset_dims[0] = (hsize_t)(target_size / sizeof(SUBF_C_TYPE)); + + fspace_id = H5Screate_simple(1, dset_dims, NULL); + VRFY((fspace_id >= 0), "H5Screate_simple succeeded"); + + dset_id = H5Dcreate2(file_id, "DSET", SUBF_HDF5_TYPE, fspace_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + VRFY((dset_id >= 0), "Dataset creation succeeded"); + + /* Select hyperslab */ + count[0] = dset_dims[0] / (hsize_t)mpi_size; + start[0] = (hsize_t)mpi_rank * count[0]; + VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, NULL, count, NULL) >= 0), + "H5Sselect_hyperslab succeeded"); + + buf = HDmalloc(count[0] * sizeof(SUBF_C_TYPE)); + VRFY(buf, "HDmalloc succeeded"); + + for (size_t i = 0; i < count[0]; i++) + ((SUBF_C_TYPE *)buf)[i] = (SUBF_C_TYPE)((size_t)mpi_rank + i); + + VRFY((H5Dwrite(dset_id, SUBF_HDF5_TYPE, H5S_BLOCK, fspace_id, H5P_DEFAULT, buf) >= 0), + "Dataset write succeeded"); + + HDfree(buf); + buf = NULL; + + VRFY((H5Dclose(dset_id) >= 0), "Dataset close succeeded"); + VRFY((H5Fclose(file_id) >= 0), "File close succeeded"); + + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + if (MAINPROCESS) { + fapl_id = create_subfiling_ioc_fapl(MPI_COMM_SELF, MPI_INFO_NULL, FALSE, NULL, 0); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); + + file_id = H5Fopen(SUBF_FILENAME, H5F_ACC_RDONLY, fapl_id); + VRFY((file_id >= 0), "H5Fopen succeeded"); + + 
dset_id = H5Dopen2(file_id, "DSET", H5P_DEFAULT); + VRFY((dset_id >= 0), "Dataset open succeeded"); + + buf = HDcalloc(1, target_size); + VRFY(buf, "HDcalloc succeeded"); + + VRFY((H5Dread(dset_id, SUBF_HDF5_TYPE, H5S_BLOCK, H5S_ALL, H5P_DEFAULT, buf) >= 0), + "Dataset read succeeded"); + + for (size_t i = 0; i < (size_t)mpi_size; i++) { + for (size_t j = 0; j < count[0]; j++) { + SUBF_C_TYPE buf_value = ((SUBF_C_TYPE *)buf)[(i * count[0]) + j]; + + VRFY((buf_value == (SUBF_C_TYPE)(j + i)), "data verification succeeded"); + } + } + + HDfree(buf); + buf = NULL; + + VRFY((H5Dclose(dset_id) >= 0), "Dataset close succeeded"); + VRFY((H5Fclose(file_id) >= 0), "File close succeeded"); + + H5E_BEGIN_TRY + { + H5Fdelete(SUBF_FILENAME, fapl_id); + } + H5E_END_TRY; + + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); + } + + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + VRFY((H5Sclose(fspace_id) >= 0), "File dataspace close succeeded"); + + CHECK_PASSED(); +} +#undef SUBF_FILENAME +#undef SUBF_HDF5_TYPE +#undef SUBF_C_TYPE + +/* + * Test to check that an HDF5 file created with the + * Subfiling VFD can be read back with less MPI ranks + * than the file was written with + */ +#define SUBF_FILENAME "test_subfiling_write_many_read_few.h5" +#define SUBF_HDF5_TYPE H5T_NATIVE_INT +#define SUBF_C_TYPE int +static void +test_subfiling_write_many_read_few(void) +{ + MPI_Comm sub_comm = MPI_COMM_NULL; + hsize_t start[1]; + hsize_t count[1]; + hsize_t dset_dims[1]; + size_t target_size; + hid_t file_id = H5I_INVALID_HID; + hid_t fapl_id = H5I_INVALID_HID; + hid_t dset_id = H5I_INVALID_HID; + hid_t fspace_id = H5I_INVALID_HID; + void *buf = NULL; + + curr_nerrors = nerrors; + + if (MAINPROCESS) + TESTING_2("reading back file with fewer MPI ranks than written with"); + + /* + * Skip this test for an MPI communicator size of 1, + * as the test wouldn't really be meaningful + */ + if (mpi_size == 1) { + if (MAINPROCESS) + 
SKIPPED(); + return; + } + + /* Get a default Subfiling FAPL */ + fapl_id = create_subfiling_ioc_fapl(comm_g, info_g, FALSE, NULL, 0); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); + + /* Create file on all ranks */ + file_id = H5Fcreate(SUBF_FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + VRFY((file_id >= 0), "H5Fcreate succeeded"); + + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); + + /* Calculate target size for dataset to stripe it across available IOCs */ + target_size = (stripe_size_g > 0) ? (size_t)stripe_size_g : H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; + + /* Nudge stripe size to be multiple of C type size */ + if ((target_size % sizeof(SUBF_C_TYPE)) != 0) + target_size += sizeof(SUBF_C_TYPE) - (target_size % sizeof(SUBF_C_TYPE)); + + target_size *= (size_t)mpi_size; + + VRFY(((target_size % sizeof(SUBF_C_TYPE)) == 0), "target size check succeeded"); + + if (stripe_size_g > 0) { + VRFY((target_size >= (size_t)stripe_size_g), "target size check succeeded"); + } + else { + VRFY((target_size >= H5FD_SUBFILING_DEFAULT_STRIPE_SIZE), "target size check succeeded"); + } + + dset_dims[0] = (hsize_t)(target_size / sizeof(SUBF_C_TYPE)); + + fspace_id = H5Screate_simple(1, dset_dims, NULL); + VRFY((fspace_id >= 0), "H5Screate_simple succeeded"); + + dset_id = H5Dcreate2(file_id, "DSET", SUBF_HDF5_TYPE, fspace_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + VRFY((dset_id >= 0), "Dataset creation succeeded"); + + /* Select hyperslab */ + count[0] = dset_dims[0] / (hsize_t)mpi_size; + start[0] = (hsize_t)mpi_rank * count[0]; + VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, NULL, count, NULL) >= 0), + "H5Sselect_hyperslab succeeded"); + + buf = HDmalloc(count[0] * sizeof(SUBF_C_TYPE)); + VRFY(buf, "HDmalloc succeeded"); + + for (size_t i = 0; i < count[0]; i++) + ((SUBF_C_TYPE *)buf)[i] = (SUBF_C_TYPE)((size_t)mpi_rank + i); + + VRFY((H5Dwrite(dset_id, SUBF_HDF5_TYPE, H5S_BLOCK, fspace_id, H5P_DEFAULT, buf) >= 0), + "Dataset write succeeded"); + + 
HDfree(buf); + buf = NULL; + + VRFY((H5Dclose(dset_id) >= 0), "Dataset close succeeded"); + VRFY((H5Fclose(file_id) >= 0), "File close succeeded"); + + /* Read file back with half the number of MPI ranks */ + int color = (mpi_rank < (mpi_size / 2)); + mpi_code_g = MPI_Comm_split(comm_g, color, mpi_rank, &sub_comm); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Comm_split succeeded"); + + if (color) { + fapl_id = create_subfiling_ioc_fapl(sub_comm, MPI_INFO_NULL, FALSE, NULL, 0); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); + + file_id = H5Fopen(SUBF_FILENAME, H5F_ACC_RDONLY, fapl_id); + VRFY((file_id >= 0), "H5Fopen succeeded"); + + dset_id = H5Dopen2(file_id, "DSET", H5P_DEFAULT); + VRFY((dset_id >= 0), "Dataset open succeeded"); + + buf = HDcalloc(1, target_size); + VRFY(buf, "HDcalloc succeeded"); + + VRFY((H5Dread(dset_id, SUBF_HDF5_TYPE, H5S_BLOCK, H5S_ALL, H5P_DEFAULT, buf) >= 0), + "Dataset read succeeded"); + + for (size_t i = 0; i < (size_t)mpi_size; i++) { + for (size_t j = 0; j < count[0]; j++) { + SUBF_C_TYPE buf_value = ((SUBF_C_TYPE *)buf)[(i * count[0]) + j]; + + VRFY((buf_value == (SUBF_C_TYPE)(j + i)), "data verification succeeded"); + } + } + + HDfree(buf); + buf = NULL; + + VRFY((H5Dclose(dset_id) >= 0), "Dataset close succeeded"); + VRFY((H5Fclose(file_id) >= 0), "File close succeeded"); + + H5E_BEGIN_TRY + { + H5Fdelete(SUBF_FILENAME, fapl_id); + } + H5E_END_TRY; + + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); + } + + mpi_code_g = MPI_Comm_free(&sub_comm); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Comm_free succeeded"); + + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + VRFY((H5Sclose(fspace_id) >= 0), "File dataspace close succeeded"); + + CHECK_PASSED(); +} +#undef SUBF_FILENAME +#undef SUBF_HDF5_TYPE +#undef SUBF_C_TYPE + +/* + * Test that the subfiling file can be read with the + * sec2 driver after being fused back together with + * the h5fuse utility + */ +#define 
SUBF_FILENAME "test_subfiling_h5fuse.h5" +#define SUBF_HDF5_TYPE H5T_NATIVE_INT +#define SUBF_C_TYPE int +static void +test_subfiling_h5fuse(void) +{ + hsize_t start[1]; + hsize_t count[1]; + hsize_t dset_dims[1]; + size_t target_size; + hid_t file_id = H5I_INVALID_HID; + hid_t fapl_id = H5I_INVALID_HID; + hid_t dset_id = H5I_INVALID_HID; + hid_t fspace_id = H5I_INVALID_HID; + void *buf = NULL; + int skip_test = 0; + + curr_nerrors = nerrors; + + if (MAINPROCESS) + TESTING_2("h5fuse utility"); + +#if defined(H5_HAVE_FORK) && defined(H5_HAVE_WAITPID) + + /* + * Check if h5fuse script exists in current directory; + * Skip test if it doesn't + */ + if (MAINPROCESS) { + FILE *h5fuse_script; + + h5fuse_script = HDfopen("h5fuse.sh", "r"); + if (h5fuse_script) + HDfclose(h5fuse_script); + else + skip_test = 1; + } + + if (mpi_size > 1) { + mpi_code_g = MPI_Bcast(&skip_test, 1, MPI_INT, 0, comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Bcast succeeded"); + } + + if (skip_test) { + if (MAINPROCESS) + SKIPPED(); + return; + } + + /* Get a default Subfiling FAPL */ + fapl_id = create_subfiling_ioc_fapl(comm_g, info_g, FALSE, NULL, 0); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); + + /* Create file on all ranks */ + file_id = H5Fcreate(SUBF_FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + VRFY((file_id >= 0), "H5Fcreate succeeded"); + + /* Calculate target size for dataset to stripe it across available IOCs */ + target_size = (stripe_size_g > 0) ? 
(size_t)stripe_size_g : H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; + + /* Nudge stripe size to be multiple of C type size */ + if ((target_size % sizeof(SUBF_C_TYPE)) != 0) + target_size += sizeof(SUBF_C_TYPE) - (target_size % sizeof(SUBF_C_TYPE)); + + target_size *= (size_t)mpi_size; + + VRFY(((target_size % sizeof(SUBF_C_TYPE)) == 0), "target size check succeeded"); + + if (stripe_size_g > 0) { + VRFY((target_size >= (size_t)stripe_size_g), "target size check succeeded"); + } + else { + VRFY((target_size >= H5FD_SUBFILING_DEFAULT_STRIPE_SIZE), "target size check succeeded"); + } + + dset_dims[0] = (hsize_t)(target_size / sizeof(SUBF_C_TYPE)); + + fspace_id = H5Screate_simple(1, dset_dims, NULL); + VRFY((fspace_id >= 0), "H5Screate_simple succeeded"); + + dset_id = H5Dcreate2(file_id, "DSET", SUBF_HDF5_TYPE, fspace_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + VRFY((dset_id >= 0), "Dataset creation succeeded"); + + /* Select hyperslab */ + count[0] = dset_dims[0] / (hsize_t)mpi_size; + start[0] = (hsize_t)mpi_rank * count[0]; + VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, NULL, count, NULL) >= 0), + "H5Sselect_hyperslab succeeded"); + + buf = HDmalloc(count[0] * sizeof(SUBF_C_TYPE)); + VRFY(buf, "HDmalloc succeeded"); + + for (size_t i = 0; i < count[0]; i++) + ((SUBF_C_TYPE *)buf)[i] = (SUBF_C_TYPE)((size_t)mpi_rank + i); + + VRFY((H5Dwrite(dset_id, SUBF_HDF5_TYPE, H5S_BLOCK, fspace_id, H5P_DEFAULT, buf) >= 0), + "Dataset write succeeded"); + + HDfree(buf); + buf = NULL; + + VRFY((H5Sclose(fspace_id) >= 0), "File dataspace close succeeded"); + VRFY((H5Dclose(dset_id) >= 0), "Dataset close succeeded"); + VRFY((H5Fclose(file_id) >= 0), "File close succeeded"); + + if (MAINPROCESS) { + h5_stat_t file_info; + pid_t pid = 0; + pid_t tmppid; + int status; + + pid = HDfork(); + VRFY(pid >= 0, "HDfork succeeded"); + + if (pid == 0) { + char *tmp_filename; + char *args[6]; + + tmp_filename = HDmalloc(PATH_MAX); + VRFY(tmp_filename, "HDmalloc succeeded"); + + 
VRFY((HDstat(SUBF_FILENAME, &file_info) >= 0), "HDstat succeeded"); + + /* Generate name for configuration file */ + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino); + + args[0] = HDstrdup("env"); + args[1] = HDstrdup("sh"); + args[2] = HDstrdup("h5fuse.sh"); + args[3] = HDstrdup("-f"); + args[4] = tmp_filename; + args[5] = NULL; + + /* Call h5fuse script from MPI rank 0 */ + HDexecvp("env", args); + + /* + * execvp only returns on failure. Terminate the child here so + * it cannot fall through and run the verification and MPI calls + * below as a second copy of rank 0; the parent sees the non-zero + * exit status from HDwaitpid and aborts the test. + */ + _exit(EXIT_FAILURE); + } + else { + tmppid = HDwaitpid(pid, &status, 0); + VRFY(tmppid >= 0, "HDwaitpid succeeded"); + + if (WIFEXITED(status)) { + int ret; + + if ((ret = WEXITSTATUS(status)) != 0) { + HDprintf("h5fuse process exited with error code %d\n", ret); + HDfflush(stdout); + MPI_Abort(comm_g, -1); + } + } + else { + HDprintf("h5fuse process terminated abnormally\n"); + HDfflush(stdout); + MPI_Abort(comm_g, -1); + } + } + + /* Verify the size of the fused file */ + VRFY((HDstat(SUBF_FILENAME, &file_info) >= 0), "HDstat succeeded"); + VRFY(((size_t)file_info.st_size >= target_size), "File size verification succeeded"); + + /* Re-open file with sec2 driver and verify the data */ + file_id = H5Fopen(SUBF_FILENAME, H5F_ACC_RDONLY, H5P_DEFAULT); + VRFY((file_id >= 0), "H5Fopen succeeded"); + + dset_id = H5Dopen2(file_id, "DSET", H5P_DEFAULT); + VRFY((dset_id >= 0), "Dataset open succeeded"); + + buf = HDcalloc(1, target_size); + VRFY(buf, "HDcalloc succeeded"); + + VRFY((H5Dread(dset_id, SUBF_HDF5_TYPE, H5S_BLOCK, H5S_ALL, H5P_DEFAULT, buf) >= 0), + "Dataset read succeeded"); + + for (size_t i = 0; i < (size_t)mpi_size; i++) { + for (size_t j = 0; j < count[0]; j++) { + SUBF_C_TYPE buf_value = ((SUBF_C_TYPE *)buf)[(i * count[0]) + j]; + + VRFY((buf_value == (SUBF_C_TYPE)(j + i)), "data verification succeeded"); + } + } + + HDfree(buf); + buf = NULL; + + VRFY((H5Dclose(dset_id) >= 0), "Dataset close succeeded"); + VRFY((H5Fclose(file_id) >= 0), "File close succeeded"); + } + + mpi_code_g = MPI_Barrier(comm_g); +
VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + H5E_BEGIN_TRY + { + H5Fdelete(SUBF_FILENAME, fapl_id); + } + H5E_END_TRY; + + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); + + CHECK_PASSED(); +#else + SKIPPED(); +#endif +} +#undef SUBF_FILENAME +#undef SUBF_HDF5_TYPE +#undef SUBF_C_TYPE + +/* + * Read the Subfiling stripe size, IOCs-per-node and IOC thread + * pool size settings from their environment variables into + * stripe_size_g, ioc_per_node_g and ioc_thread_pool_size_g. + * A variable that is set but out of range or not positive is + * recorded as -1; a variable that is not set leaves its global + * untouched. + */ +static void +parse_subfiling_env_vars(void) +{ + char *env_value; + + if (NULL != (env_value = HDgetenv(H5FD_SUBFILING_STRIPE_SIZE))) { + /* Overflow is only reported through errno, so clear any stale value first */ + errno = 0; + stripe_size_g = HDstrtoll(env_value, NULL, 0); + if ((ERANGE == errno) || (stripe_size_g <= 0)) + stripe_size_g = -1; + } + + if (NULL != (env_value = HDgetenv(H5FD_SUBFILING_IOC_PER_NODE))) { + /* Overflow is only reported through errno, so clear any stale value first */ + errno = 0; + ioc_per_node_g = HDstrtol(env_value, NULL, 0); + if ((ERANGE == errno) || (ioc_per_node_g <= 0)) + ioc_per_node_g = -1; + else if (ioc_per_node_g * num_nodes_g > mpi_size) + /* + * If the number of IOCs per node from the environment + * causes the total number of IOCs to exceed the number + * of MPI ranks available, the Subfiling VFD will simply + * use all of the MPI ranks on a node as IOCs + */ + ioc_per_node_g = node_local_size; + } + + if (NULL != (env_value = HDgetenv(H5FD_IOC_THREAD_POOL_SIZE))) { + ioc_thread_pool_size_g = HDatoi(env_value); + if (ioc_thread_pool_size_g <= 0) + ioc_thread_pool_size_g = -1; + } +} + +int +main(int argc, char **argv) +{ + time_t seed; + int required = MPI_THREAD_MULTIPLE; + int provided = 0; + + HDcompile_assert(SUBFILING_MIN_STRIPE_SIZE <= H5FD_SUBFILING_DEFAULT_STRIPE_SIZE); + + /* Initialize MPI */ + if (MPI_SUCCESS != (mpi_code_g = MPI_Init_thread(&argc, &argv, required, &provided))) { + HDprintf("MPI_Init_thread failed with error code %d\n", mpi_code_g); + nerrors++; + goto exit; + } + + if (MPI_SUCCESS != (mpi_code_g = MPI_Comm_rank(comm_g, &mpi_rank))) { + HDprintf("MPI_Comm_rank failed with error code %d\n", mpi_code_g); + nerrors++; + goto exit; + } + + if (provided != required) { + if (MAINPROCESS) + HDprintf("MPI doesn't support MPI_Init_thread with MPI_THREAD_MULTIPLE\n"); +
nerrors++; + goto exit; + } + + if (MPI_SUCCESS != (mpi_code_g = MPI_Comm_size(comm_g, &mpi_size))) { + if (MAINPROCESS) + HDprintf("MPI_Comm_size failed with error code %d\n", mpi_code_g); + nerrors++; + goto exit; + } + + /* Split communicator according to node-local ranks */ + if (MPI_SUCCESS != (mpi_code_g = MPI_Comm_split_type(comm_g, MPI_COMM_TYPE_SHARED, mpi_rank, + MPI_INFO_NULL, &node_local_comm))) { + if (MAINPROCESS) + HDprintf("MPI_Comm_split_type failed with error code %d\n", mpi_code_g); + nerrors++; + goto exit; + } + if (MPI_SUCCESS != (mpi_code_g = MPI_Comm_size(node_local_comm, &node_local_size))) { + if (MAINPROCESS) + HDprintf("MPI_Comm_size failed with error code %d\n", mpi_code_g); + nerrors++; + goto exit; + } + if (MPI_SUCCESS != (mpi_code_g = MPI_Comm_rank(node_local_comm, &node_local_rank))) { + if (MAINPROCESS) + HDprintf("MPI_Comm_rank failed with error code %d\n", mpi_code_g); + nerrors++; + goto exit; + } + + /* Get the number of nodes being run on */ + num_nodes_g = (node_local_rank == 0) ? 1 : 0; + if (MPI_SUCCESS != + (mpi_code_g = MPI_Allreduce(MPI_IN_PLACE, &num_nodes_g, 1, MPI_INT, MPI_SUM, comm_g))) { + if (MAINPROCESS) + HDprintf("MPI_Allreduce failed with error code %d\n", mpi_code_g); + nerrors++; + goto exit; + } + + /* + * Split communicator according to rank value across nodes. + * If the SELECT_IOC_ONE_PER_NODE IOC selection strategy is + * used, each rank with a node local rank value of 0 will + * be an IOC in the new communicator. 
+ */ + if (MPI_SUCCESS != (mpi_code_g = MPI_Comm_split(comm_g, node_local_rank, mpi_rank, &ioc_comm))) { + if (MAINPROCESS) + HDprintf("MPI_Comm_split failed with error code %d\n", mpi_code_g); + nerrors++; + goto exit; + } + if (MPI_SUCCESS != (mpi_code_g = MPI_Comm_size(ioc_comm, &ioc_comm_size))) { + if (MAINPROCESS) + HDprintf("MPI_Comm_size failed with error code %d\n", mpi_code_g); + nerrors++; + goto exit; + } + if (MPI_SUCCESS != (mpi_code_g = MPI_Comm_rank(ioc_comm, &ioc_comm_rank))) { + if (MAINPROCESS) + HDprintf("MPI_Comm_rank failed with error code %d\n", mpi_code_g); + nerrors++; + goto exit; + } + + if (H5dont_atexit() < 0) { + if (MAINPROCESS) + HDprintf("Failed to turn off atexit processing. Continue.\n"); + } + + H5open(); + + /* Enable selection I/O using internal temporary workaround */ + H5_use_selection_io_g = TRUE; + + if (MAINPROCESS) { + HDprintf("Testing Subfiling VFD functionality\n"); + } + + TestAlarmOn(); + + seed = time(NULL); + srand((unsigned)seed); + + if (MAINPROCESS) + HDprintf("Using seed: %lld\n\n", (long long)seed); + + /* Grab values from environment variables if set */ + parse_subfiling_env_vars(); + + /* + * Assume that we use the "one IOC per node" selection + * strategy by default, with a possibly modified + * number of IOCs per node value + */ + num_iocs_g = (ioc_per_node_g > 0) ? 
(int)ioc_per_node_g * num_nodes_g : num_nodes_g; + if (num_iocs_g > mpi_size) + num_iocs_g = mpi_size; + + for (size_t i = 0; i < ARRAY_SIZE(tests); i++) { + if (MPI_SUCCESS == (mpi_code_g = MPI_Barrier(comm_g))) { + (*tests[i])(); + } + else { + if (MAINPROCESS) + MESG("MPI_Barrier failed"); + nerrors++; + } + } + + if (MAINPROCESS) + HDputs(""); + + /* + * Set any unset Subfiling environment variables and re-run + * the tests as a quick smoke check of whether those are + * working correctly + */ + if (stripe_size_g < 0) { + int64_t stripe_size; + char tmp[64]; + + /* + * Choose a random Subfiling stripe size between + * the smallest allowed value and the default value + */ + stripe_size = (rand() % (H5FD_SUBFILING_DEFAULT_STRIPE_SIZE - SUBFILING_MIN_STRIPE_SIZE + 1)) + + SUBFILING_MIN_STRIPE_SIZE; + + /* + * Each rank seeded rand() from its own time(NULL) value, so + * ranks may have drawn different stripe sizes; make every rank + * use the value chosen on rank 0 so the environment variable + * is identical across the communicator + */ + if (mpi_size > 1) { + if (MPI_SUCCESS != (mpi_code_g = MPI_Bcast(&stripe_size, 1, MPI_INT64_T, 0, comm_g))) { + if (MAINPROCESS) + HDprintf("MPI_Bcast failed with error code %d\n", mpi_code_g); + nerrors++; + goto exit; + } + } + + HDsnprintf(tmp, sizeof(tmp), "%" PRId64, stripe_size); + + if (HDsetenv(H5FD_SUBFILING_STRIPE_SIZE, tmp, 1) < 0) { + if (MAINPROCESS) + HDprintf("HDsetenv failed\n"); + nerrors++; + goto exit; + } + } + if (ioc_per_node_g < 0) { + const char *ioc_per_node_str; + + if (2 * num_nodes_g <= mpi_size) + ioc_per_node_str = "2"; + else + ioc_per_node_str = "1"; + + if (HDsetenv(H5FD_SUBFILING_IOC_PER_NODE, ioc_per_node_str, 1) < 0) { + if (MAINPROCESS) + HDprintf("HDsetenv failed\n"); + nerrors++; + goto exit; + } + } + if (ioc_thread_pool_size_g < 0) { + if (HDsetenv(H5FD_IOC_THREAD_POOL_SIZE, "2", 1) < 0) { + if (MAINPROCESS) + HDprintf("HDsetenv failed\n"); + nerrors++; + goto exit; + } + } + + /* Grab values from environment variables */ + parse_subfiling_env_vars(); + + /* + * Assume that we use the "one IOC per node" selection + * strategy by default, with a possibly modified + * number of IOCs per node value + */ + num_iocs_g = (ioc_per_node_g > 0) ?
(int)ioc_per_node_g * num_nodes_g : num_nodes_g; + if (num_iocs_g > mpi_size) + num_iocs_g = mpi_size; + + if (MAINPROCESS) { + HDprintf("Re-running tests with environment variables set\n"); + } + + for (size_t i = 0; i < ARRAY_SIZE(tests); i++) { + if (MPI_SUCCESS == (mpi_code_g = MPI_Barrier(comm_g))) { + (*tests[i])(); + } + else { + if (MAINPROCESS) + MESG("MPI_Barrier failed"); + nerrors++; + } + } + + if (MAINPROCESS) + HDputs(""); + + if (nerrors) + goto exit; + + if (MAINPROCESS) + HDputs("All Subfiling VFD tests passed\n"); + +exit: + if (nerrors) { + if (MAINPROCESS) + HDprintf("*** %d TEST ERROR%s OCCURRED ***\n", nerrors, nerrors > 1 ? "S" : ""); + } + + TestAlarmOff(); + + H5close(); + + if (MPI_COMM_WORLD != ioc_comm) + MPI_Comm_free(&ioc_comm); + if (MPI_COMM_WORLD != node_local_comm) + MPI_Comm_free(&node_local_comm); MPI_Finalize(); diff --git a/testpar/t_vfd.c b/testpar/t_vfd.c index b65812a..512aa5b 100644 --- a/testpar/t_vfd.c +++ b/testpar/t_vfd.c @@ -327,7 +327,7 @@ setup_vfd_test_file(int file_name_id, char *file_name, int mpi_size, H5FD_mpio_x #ifdef H5_HAVE_SUBFILING_VFD else if (HDstrcmp(vfd_name, H5FD_SUBFILING_NAME) == 0) { - H5FD_subfiling_shared_config_t shared_conf = { + H5FD_subfiling_params_t shared_conf = { /* ioc_selection = */ SELECT_IOC_ONE_PER_NODE, /* stripe_size = */ (INTS_PER_RANK / 2), /* stripe_count = */ 0, /* will over write */ @@ -342,9 +342,7 @@ setup_vfd_test_file(int file_name_id, char *file_name, int mpi_size, H5FD_mpio_x H5FD_ioc_config_t ioc_config = { /* magic = */ H5FD_IOC_FAPL_MAGIC, /* version = */ H5FD_IOC_CURR_FAPL_VERSION, - /* under_fapl_id = */ H5P_DEFAULT, /* thread_pool_size = */ H5FD_IOC_DEFAULT_THREAD_POOL_SIZE, - /* subf_config = */ shared_conf, }; hid_t ioc_fapl = H5I_INVALID_HID; diff --git a/utils/subfiling_vfd/CMakeLists.txt b/utils/subfiling_vfd/CMakeLists.txt index da5e44b..3acdc6b 100644 --- a/utils/subfiling_vfd/CMakeLists.txt +++ b/utils/subfiling_vfd/CMakeLists.txt @@ -1,10 +1,20 @@ 
cmake_minimum_required (VERSION 3.18) project (HDF5_UTILS_SUBFILINGVFD C) -configure_file (${HDF5_UTILS_SUBFILINGVFD_SOURCE_DIR}/h5fuse.sh.in ${HDF5_BINARY_DIR}/h5fuse.sh @ONLY) +configure_file (${HDF5_UTILS_SUBFILINGVFD_SOURCE_DIR}/h5fuse.sh.in ${HDF5_UTILS_SUBFILINGVFD_BINARY_DIR}/h5fuse.sh @ONLY) + +# Copy h5fuse.sh to testpar directory for subfiling tests +if (HDF5_ENABLE_PARALLEL AND HDF5_TEST_PARALLEL) + file ( + COPY + ${HDF5_UTILS_SUBFILINGVFD_BINARY_DIR}/h5fuse.sh + DESTINATION + ${HDF5_TEST_PAR_BINARY_DIR} + ) +endif () install ( - FILES ${HDF5_BINARY_DIR}/h5fuse.sh + FILES ${HDF5_UTILS_SUBFILINGVFD_BINARY_DIR}/h5fuse.sh DESTINATION ${HDF5_INSTALL_BIN_DIR} PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE COMPONENT utilsapplications diff --git a/utils/subfiling_vfd/h5fuse.sh.in b/utils/subfiling_vfd/h5fuse.sh.in index b526b0b..dafaab9 100755 --- a/utils/subfiling_vfd/h5fuse.sh.in +++ b/utils/subfiling_vfd/h5fuse.sh.in @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Copyright by The HDF Group. # All rights reserved. @@ -49,7 +49,7 @@ while getopts ":h:f:" option; do h) # display Help usage exit;; - f) # subfiling configureation file + f) # subfiling configuration file file_config=$OPTARG;; \?) # Invalid option echo -e "$RED ERROR: Invalid option ${BLD}-${OPTARG}${RED} $NC" -- cgit v0.12