-rw-r--r--  MANIFEST                    1
-rw-r--r--  release_docs/RELEASE.txt   25
-rw-r--r--  src/H5Fsuper.c            133
-rw-r--r--  testpar/CMakeLists.txt      1
-rw-r--r--  testpar/Makefile.am         2
-rw-r--r--  testpar/t_pread.c         904
6 files changed, 1022 insertions, 44 deletions
diff --git a/MANIFEST b/MANIFEST
index f194f72..e0e30c7 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -1244,6 +1244,7 @@
./testpar/t_ph5basic.c
./testpar/t_pflush1.c
./testpar/t_pflush2.c
+./testpar/t_pread.c
./testpar/t_prop.c
./testpar/t_shapesame.c
./testpar/t_pshutdown.c
diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt
index db946a0..db3e4e0 100644
--- a/release_docs/RELEASE.txt
+++ b/release_docs/RELEASE.txt
@@ -62,6 +62,31 @@ New Features
Parallel Library:
-----------------
+ - Optimize parallel open/location of the HDF5 super-block
+
+ Previous releases of PHDF5 required all parallel ranks to
+ search for the HDF5 superblock signature when opening the
+ file. Because this search is effectively a synchronous
+ operation, file opens involving a large number of processes
+ can slow down due to filesystem contention.
+
+ As a first step in improving the startup/file-open performance,
+ we allow MPI rank 0 of the associated MPI communicator to locate
+ the base offset of the super-block and then broadcast that result
+ to the remaining ranks in the parallel group. Note that this
+ approach is used ONLY during file opens which employ the HDF5
+ MPIO file driver, i.e. when H5Pset_fapl_mpio() has been called.
+
+ HDF5 parallel file operations which do not employ multiple
+ ranks, e.g. those specifying MPI_COMM_SELF (whose
+ MPI_Comm_size == 1) rather than MPI_COMM_WORLD, are not
+ affected by this optimization. Conversely, parallel file
+ operations on subgroups of MPI_COMM_WORLD may run in parallel,
+ with each subgroup operating as an independent collection
+ of processes.
+
+ (RAW – 2017/10/10, HDFFV-10294)
+
+
- Large MPI-IO transfers
Previous releases of PHDF5 would fail when attempting to
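
The core of the change to H5Fsuper.c below is compact enough to sketch in
isolation. The following is a minimal, illustrative standalone version, not
the library code: locate_signature() is a simplified stand-in for HDF5's
internal H5FD_locate_signature(), uint64_t stands in for haddr_t, and error
handling is reduced to plain return codes.

    #include <mpi.h>
    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    /* Simplified stand-in for H5FD_locate_signature(): scan the
     * power-of-2 offsets 0, 512, 1024, ... for the 8-byte HDF5
     * file signature and return its offset in *addr. */
    static int
    locate_signature(const char *filename, uint64_t *addr)
    {
        static const unsigned char sig[8] = "\211HDF\r\n\032\n";
        unsigned char buf[8];
        uint64_t off = 0;
        FILE *fp = fopen(filename, "rb");

        if (fp == NULL)
            return -1;
        for (;;) {
            if (fseek(fp, (long)off, SEEK_SET) != 0 ||
                fread(buf, 1, sizeof(buf), fp) != sizeof(buf)) {
                fclose(fp);
                return -1;            /* reached EOF without a match */
            }
            if (memcmp(buf, sig, sizeof(sig)) == 0) {
                *addr = off;
                fclose(fp);
                return 0;
            }
            off = (off == 0) ? 512 : off * 2;
        }
    }

    /* The optimization: rank 0 performs the search, and everyone
     * else receives the answer via a single broadcast rather than
     * scanning the file themselves. */
    static int
    locate_signature_collectively(MPI_Comm comm, const char *filename,
                                  uint64_t *super_addr)
    {
        int mpi_rank, mpi_size, ret = 0;

        MPI_Comm_rank(comm, &mpi_rank);
        MPI_Comm_size(comm, &mpi_size);

        if (mpi_size > 1) {
            if (mpi_rank == 0)
                ret = locate_signature(filename, super_addr);
            MPI_Bcast(super_addr, (int)sizeof(*super_addr), MPI_BYTE,
                      0, comm);
        }
        else {
            /* Single-rank opens (e.g. MPI_COMM_SELF) search as before. */
            ret = locate_signature(filename, super_addr);
        }
        return ret;
    }
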
diff --git a/src/H5Fsuper.c b/src/H5Fsuper.c
index 942a7ed..0c6f9cd 100644
--- a/src/H5Fsuper.c
+++ b/src/H5Fsuper.c
@@ -21,15 +21,15 @@
/***********/
/* Headers */
/***********/
-#include "H5private.h" /* Generic Functions */
+#include "H5private.h" /* Generic Functions */
#include "H5ACprivate.h" /* Metadata cache */
-#include "H5Eprivate.h" /* Error handling */
+#include "H5Eprivate.h" /* Error handling */
#include "H5Fpkg.h" /* File access */
-#include "H5FDprivate.h" /* File drivers */
-#include "H5Iprivate.h" /* IDs */
+#include "H5FDprivate.h" /* File drivers */
+#include "H5Iprivate.h" /* IDs */
#include "H5MFprivate.h" /* File memory management */
-#include "H5MMprivate.h" /* Memory management */
-#include "H5Pprivate.h" /* Property lists */
+#include "H5MMprivate.h" /* Memory management */
+#include "H5Pprivate.h" /* Property lists */
#include "H5SMprivate.h" /* Shared Object Header Messages */
@@ -158,7 +158,7 @@ H5F_super_ext_open(H5F_t *f, haddr_t ext_addr, H5O_loc_t *ext_ptr)
/* Open the superblock extension object header */
if(H5O_open(ext_ptr) < 0)
- HGOTO_ERROR(H5E_OHDR, H5E_CANTOPENOBJ, FAIL, "unable to open superblock extension")
+ HGOTO_ERROR(H5E_OHDR, H5E_CANTOPENOBJ, FAIL, "unable to open superblock extension")
done:
FUNC_LEAVE_NOAPI(ret_value)
@@ -224,12 +224,12 @@ done:
/*-------------------------------------------------------------------------
* Function: H5F__update_super_ext_driver_msg
*
- * Purpose: Update the superblock extension file driver info message if
- * we are using a V 2 superblock. Observe that the function
- * is a NO-OP if the file driver info message does not exist.
+ * Purpose: Update the superblock extension file driver info message if
+ * we are using a V 2 superblock. Observe that the function
+ * is a NO-OP if the file driver info message does not exist.
* This is necessary, as the function is called whenever the
- * EOA is updated, and were it to create the file driver info
- * message, it would find itself in an infinite recursion.
+ * EOA is updated, and were it to create the file driver info
+ * message, it would find itself in an infinite recursion.
*
* Return: Success: SUCCEED
* Failure: FAIL
@@ -267,7 +267,7 @@ H5F__update_super_ext_driver_msg(H5F_t *f, hid_t dxpl_id)
/* Check for driver info */
H5_CHECKED_ASSIGN(driver_size, size_t, H5FD_sb_size(f->shared->lf), hsize_t);
- /* Nothing to do unless there is both driver info and
+ /* Nothing to do unless there is both driver info and
* the driver info superblock extension message has
* already been created.
*/
@@ -330,9 +330,13 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
unsigned sblock_flags = H5AC__NO_FLAGS_SET; /* flags used in superblock unprotect call */
haddr_t super_addr; /* Absolute address of superblock */
haddr_t eof; /* End of file address */
- unsigned rw_flags; /* Read/write permissions for file */
- hbool_t skip_eof_check = FALSE; /* Whether to skip checking the EOF value */
+ unsigned rw_flags; /* Read/write permissions for file */
+ hbool_t skip_eof_check = FALSE; /* Whether to skip checking the EOF value */
herr_t ret_value = SUCCEED; /* Return value */
+#ifdef H5_HAVE_PARALLEL
+ int mpi_rank = 0, mpi_size = 1;
+ int mpi_result;
+#endif /* H5_HAVE_PARALLEL */
FUNC_ENTER_PACKAGE_TAG(meta_dxpl_id, H5AC__SUPERBLOCK_TAG, FAIL)
@@ -354,8 +358,51 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "can't get property list")
/* Find the superblock */
- if(H5FD_locate_signature(&fdio_info, &super_addr) < 0)
- HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
+#ifdef H5_HAVE_PARALLEL
+ if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
+
+ if((mpi_rank = H5F_mpi_get_rank(f)) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "Can't get MPI rank")
+
+ if((mpi_size = H5F_mpi_get_size(f)) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't retrieve MPI communicator size")
+ }
+
+ /* If we are an MPI application with at least two processes, the
+ * following superblock signature location optimization is applicable.
+ *
+ * Note: For parallel applications which don't set up to use the
+ * HDF5 MPIO driver, we will arrive here with mpi_size == 1.
+ * This occurs because of the variable initialization (above) and the
+ * fact that we have skipped actually calling MPI functions to determine
+ * our MPI rank and size.
+ */
+ if ( mpi_size > 1 ) {
+ MPI_Comm this_comm = MPI_COMM_NULL;
+
+ if ( mpi_rank == 0 ) {
+ if (H5FD_locate_signature(&fdio_info, &super_addr) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
+ }
+ HDassert(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI));
+
+ if ( MPI_COMM_NULL == (this_comm = H5F_mpi_get_comm(f)) )
+ HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get MPI communicator")
+
+ if ( MPI_SUCCESS !=
+ (mpi_result = MPI_Bcast(&super_addr, sizeof(super_addr), MPI_BYTE, 0, this_comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
+ }
+ else {
+ /* Locate the signature as per the serial library */
+#endif /* H5_HAVE_PARALLEL */
+
+ if (H5FD_locate_signature(&fdio_info, &super_addr) < 0)
+ HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "unable to locate file signature")
+
+#ifdef H5_HAVE_PARALLEL
+ }
+#endif /* H5_HAVE_PARALLEL */
if(HADDR_UNDEF == super_addr)
HGOTO_ERROR(H5E_FILE, H5E_NOTHDF5, FAIL, "file signature not found")
@@ -406,12 +453,12 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
HGOTO_ERROR(H5E_FILE, H5E_CANTPROTECT, FAIL, "unable to load superblock")
if(H5F_INTENT(f) & H5F_ACC_SWMR_WRITE)
- if(sblock->super_vers < HDF5_SUPERBLOCK_VERSION_3)
- HGOTO_ERROR(H5E_FILE, H5E_CANTPROTECT, FAIL, "invalid superblock version for SWMR_WRITE")
+ if(sblock->super_vers < HDF5_SUPERBLOCK_VERSION_3)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTPROTECT, FAIL, "invalid superblock version for SWMR_WRITE")
/* Enable all latest version support when file has v3 superblock */
if(sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_3)
- f->shared->latest_flags |= H5F_LATEST_ALL_FLAGS;
+ f->shared->latest_flags |= H5F_LATEST_ALL_FLAGS;
/* Pin the superblock in the cache */
if(H5AC_pin_protected_entry(sblock) < 0)
@@ -511,15 +558,15 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
* been flushed to disk by the SWMR writer process.
*/
if(H5F_INTENT(f) & H5F_ACC_SWMR_READ) {
- /*
- * When the file is opened for SWMR read access, skip the check if:
- * --the file is already marked for SWMR writing and
- * --the file has version 3 superblock for SWMR support
- */
- if((sblock->status_flags & H5F_SUPER_SWMR_WRITE_ACCESS) &&
+ /*
+ * When the file is opened for SWMR read access, skip the check if:
+ * --the file is already marked for SWMR writing and
+ * --the file has version 3 superblock for SWMR support
+ */
+ if((sblock->status_flags & H5F_SUPER_SWMR_WRITE_ACCESS) &&
(sblock->status_flags & H5F_SUPER_WRITE_ACCESS) &&
sblock->super_vers >= HDF5_SUPERBLOCK_VERSION_3)
- skip_eof_check = TRUE;
+ skip_eof_check = TRUE;
} /* end if */
if(!skip_eof_check && initial_read) {
if(HADDR_UNDEF == (eof = H5FD_get_eof(f->shared->lf, H5FD_MEM_DEFAULT)))
@@ -593,7 +640,7 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
H5O_loc_t ext_loc; /* "Object location" for superblock extension */
H5O_btreek_t btreek; /* v1 B-tree 'K' value message from superblock extension */
H5O_drvinfo_t drvinfo; /* Driver info message from superblock extension */
- size_t u; /* Local index variable */
+ size_t u; /* Local index variable */
htri_t status; /* Status for message existing */
/* Sanity check - superblock extension should only be defined for
@@ -614,7 +661,7 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
} /* end if */
/* Open the superblock extension */
- if(H5F_super_ext_open(f, sblock->ext_addr, &ext_loc) < 0)
+ if(H5F_super_ext_open(f, sblock->ext_addr, &ext_loc) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTOPENOBJ, FAIL, "unable to open file's superblock extension")
/* Check for the extension having a 'driver info' message */
@@ -637,8 +684,8 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
/* Reset driver info message */
H5O_msg_reset(H5O_DRVINFO_ID, &drvinfo);
- HDassert(FALSE == f->shared->drvinfo_sb_msg_exists);
- f->shared->drvinfo_sb_msg_exists = TRUE;
+ HDassert(FALSE == f->shared->drvinfo_sb_msg_exists);
+ f->shared->drvinfo_sb_msg_exists = TRUE;
} /* end else */
} /* end if */
@@ -764,37 +811,37 @@ H5F__super_read(H5F_t *f, hid_t meta_dxpl_id, hid_t raw_dxpl_id, hbool_t initial
} /* end if not marked "unknown" */
} /* end if */
- /* Check for the extension having a 'metadata cache image' message */
+ /* Check for the extension having a 'metadata cache image' message */
if((status = H5O_msg_exists(&ext_loc, H5O_MDCI_MSG_ID, meta_dxpl_id)) < 0)
HGOTO_ERROR(H5E_FILE, H5E_EXISTS, FAIL, "unable to read object header")
if(status) {
- hbool_t rw = ((rw_flags & H5AC__READ_ONLY_FLAG) == 0);
- H5O_mdci_t mdci_msg;
+ hbool_t rw = ((rw_flags & H5AC__READ_ONLY_FLAG) == 0);
+ H5O_mdci_t mdci_msg;
- /* if the metadata cache image superblock extension message exists,
+ /* if the metadata cache image superblock extension message exists,
* read its contents and pass the data on to the metadata cache.
* Given this data, the cache will load and decode the metadata
- * cache image block, decoded it and load its contents into the
- * the cache on the test protect call.
+ * cache image block, decode it, and load its contents into
+ * the cache on the test protect call.
*
* Further, if the file is opened R/W, the metadata cache will
- * delete the metadata cache image superblock extension and free
- * the cache image block. Don't do this now as f->shared
- * is not fully setup, which complicates matters.
+ * delete the metadata cache image superblock extension and free
+ * the cache image block. Don't do this now as f->shared
+ * is not fully set up, which complicates matters.
*/
/* Retrieve the 'metadata cache image message' structure */
- if(NULL == H5O_msg_read(&ext_loc, H5O_MDCI_MSG_ID, &mdci_msg, meta_dxpl_id))
+ if(NULL == H5O_msg_read(&ext_loc, H5O_MDCI_MSG_ID, &mdci_msg, meta_dxpl_id))
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "unable to get metadata cache image message")
/* Indicate to the cache that there's an image to load on first protect call */
if(H5AC_load_cache_image_on_next_protect(f, mdci_msg.addr, mdci_msg.size, rw) < 0)
- HGOTO_ERROR(H5E_FILE, H5E_CANTLOAD, FAIL, "call to H5AC_load_cache_image_on_next_protect failed");
+ HGOTO_ERROR(H5E_FILE, H5E_CANTLOAD, FAIL, "call to H5AC_load_cache_image_on_next_protect failed");
} /* end if */
/* Close superblock extension */
if(H5F_super_ext_close(f, &ext_loc, meta_dxpl_id, FALSE) < 0)
- HGOTO_ERROR(H5E_FILE, H5E_CANTCLOSEOBJ, FAIL, "unable to close file's superblock extension")
+ HGOTO_ERROR(H5E_FILE, H5E_CANTCLOSEOBJ, FAIL, "unable to close file's superblock extension")
} /* end if */
/* Update the driver info if VFD indicated to do so */
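
From the application side, nothing new is required to opt in: any file open
that uses the MPIO file driver takes the rank-0-search-plus-broadcast path
above. A minimal usage sketch (error checking omitted; "data.h5" is a
placeholder file name):

    hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS);

    /* Selecting the MPIO driver is what enables the optimized open */
    H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);

    hid_t file_id = H5Fopen("data.h5", H5F_ACC_RDONLY, fapl_id);

    /* ... collective reads ... */

    H5Fclose(file_id);
    H5Pclose(fapl_id);
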
diff --git a/testpar/CMakeLists.txt b/testpar/CMakeLists.txt
index 39d23a9..0c9f70e 100644
--- a/testpar/CMakeLists.txt
+++ b/testpar/CMakeLists.txt
@@ -47,6 +47,7 @@ set (H5P_TESTS
t_cache
t_pflush1
t_pflush2
+ t_pread
t_pshutdown
t_prestart
t_init_term
diff --git a/testpar/Makefile.am b/testpar/Makefile.am
index b0fe0cd..1f15830 100644
--- a/testpar/Makefile.am
+++ b/testpar/Makefile.am
@@ -23,7 +23,7 @@ AM_CPPFLAGS+=-I$(top_srcdir)/src -I$(top_srcdir)/test
# Test programs. These are our main targets.
#
-TEST_PROG_PARA=t_mpi t_bigio testphdf5 t_cache t_cache_image t_pflush1 t_pflush2 t_pshutdown t_prestart t_init_term t_shapesame t_filters_parallel
+TEST_PROG_PARA=t_mpi t_bigio testphdf5 t_cache t_cache_image t_pflush1 t_pflush2 t_pread t_pshutdown t_prestart t_init_term t_shapesame t_filters_parallel
check_PROGRAMS = $(TEST_PROG_PARA)
diff --git a/testpar/t_pread.c b/testpar/t_pread.c
new file mode 100644
index 0000000..7f23b9b
--- /dev/null
+++ b/testpar/t_pread.c
@@ -0,0 +1,904 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*
+ * Collective file open optimization tests
+ *
+ */
+
+#include "h5test.h"
+#include "testpar.h"
+
+/* The collection of files is included below to aid
+ * an external "cleanup" process if required.
+ *
+ * Note that the code below relies on the ordering of this array
+ * since each set of three is used by the tests either to construct
+ * or to read and validate.
+ */
+#define NFILENAME 9
+const char *FILENAMES[NFILENAME + 1]={"t_pread_data_file",
+ "reloc_t_pread_data_file",
+ "prefix_file",
+ "t_pread_group_0_file",
+ "reloc_t_pread_group_0_file",
+ "prefix_file_0",
+ "t_pread_group_1_file",
+ "reloc_t_pread_group_1_file",
+ "prefix_file_1",
+ NULL};
+#define FILENAME_BUF_SIZE 1024
+
+#define COUNT 1000
+
+hbool_t pass = true;
+static const char *random_hdf5_text =
+"Now is the time for all first-time-users of HDF5 to read their \
+manual or go thru the tutorials!\n\
+While you\'re at it, now is also the time to read up on MPI-IO.";
+
+static const char *hitchhiker_quote =
+"A common mistake that people make when trying to design something\n\
+completely foolproof is to underestimate the ingenuity of complete\n\
+fools.\n";
+
+static int generate_test_file(MPI_Comm comm, int mpi_rank, int group);
+static int test_parallel_read(MPI_Comm comm, int mpi_rank, int group);
+
+
+/*-------------------------------------------------------------------------
+ * Function: generate_test_file
+ *
+ * Purpose: This function is called to produce an HDF5 data file
+ * whose superblock is relocated to a non-zero offset by
+ * utilizing the 'h5jam' utility to write random text
+ * at the start of the file. Unlike simple concatenation
+ * of files, h5jam is used to place the superblock on a
+ * power-of-2 boundary.
+ *
+ * Since data will be read back and validated, we generate
+ * data in a predictable manner rather than randomly.
+ * For now, we simply use the global mpi_rank of the writing
+ * process as a starting component for the data generation.
+ * Subsequent writes are increments from the initial start
+ * value.
+ *
+ * In the overall scheme of running the test, we'll call
+ * this function twice: first as a collection of all MPI
+ * processes and then a second time with the processes split
+ * more or less in half. Each sub group will operate
+ * collectively on their assigned file. This split into
+ * subgroups validates that parallel groups can successfully
+ * open and read data independently of the other parallel
+ * operations taking place.
+ *
+ * Return: Success: 0
+ *
+ * Failure: 1
+ *
+ * Programmer: Richard Warren
+ * 10/1/17
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+generate_test_file( MPI_Comm comm, int mpi_rank, int group_id )
+{
+ FILE *header = NULL;
+ const char *fcn_name = "generate_test_file()";
+ const char *failure_mssg = NULL;
+ const char *group_filename = NULL;
+ char data_filename[FILENAME_BUF_SIZE];
+ char reloc_data_filename[FILENAME_BUF_SIZE];
+ char prolog_filename[FILENAME_BUF_SIZE];
+ int file_index;
+ int group_size;
+ int group_rank;
+ int local_failure = 0;
+ int global_failures = 0;
+ hsize_t count = COUNT;
+ hsize_t i;
+ hsize_t offset;
+ hsize_t dims[1] = {0};
+ hid_t file_id = -1;
+ hid_t memspace = -1;
+ hid_t filespace = -1;
+ hid_t fapl_id = -1;
+ hid_t dxpl_id = -1;
+ hid_t dset_id = -1;
+ float nextValue;
+ float *data_slice = NULL;
+
+ pass = true;
+
+ HDassert(comm != MPI_COMM_NULL);
+
+ if ( (MPI_Comm_rank(comm, &group_rank)) != MPI_SUCCESS) {
+ pass = FALSE;
+ failure_mssg = "generate_test_file: MPI_Comm_rank failed.\n";
+ }
+
+ if ( (MPI_Comm_size(comm, &group_size)) != MPI_SUCCESS) {
+ pass = FALSE;
+ failure_mssg = "generate_test_file: MPI_Comm_size failed.\n";
+ }
+
+ if ( mpi_rank == 0 ) {
+
+ HDfprintf(stdout, "Constructing test files...");
+ }
+
+ /* Setup the file names
+ * The test-specific filenames are stored as consecutive
+ * array entries in the global 'FILENAMES' array above.
+ * Here, we simply decide on the starting index for
+ * file construction. The reading portion of the test
+ * will have a similar setup process...
+ */
+ if ( pass ) {
+ if ( comm == MPI_COMM_WORLD ) { /* Test 1 */
+ file_index = 0;
+ }
+ else if ( group_id == 0 ) { /* Test 2 group 0 */
+ file_index = 3;
+ }
+ else { /* Test 2 group 1 */
+ file_index = 6;
+ }
+
+ /* The 'group_filename' is just a temp variable and
+ * is used to call into the h5_fixname function. No
+ * need to worry that we reassign it for each file!
+ */
+ HDassert((group_filename = FILENAMES[file_index]));
+
+ /* Assign the 'data_filename' */
+ if ( h5_fixname(group_filename, H5P_DEFAULT, data_filename,
+ sizeof(data_filename)) == NULL ) {
+ pass = FALSE;
+ failure_mssg = "h5_fixname(0) failed.\n";
+ }
+ }
+
+ if ( pass ) {
+
+ HDassert( (group_filename = FILENAMES[file_index+1]) );
+
+ /* Assign the 'reloc_data_filename' */
+ if ( h5_fixname(group_filename, H5P_DEFAULT, reloc_data_filename,
+ sizeof(reloc_data_filename)) == NULL ) {
+
+ pass = FALSE;
+ failure_mssg = "h5_fixname(1) failed.\n";
+ }
+ }
+
+ if ( pass ) {
+
+ HDassert( (group_filename = FILENAMES[file_index+2]) );
+
+ /* Assign the 'prolog_filename' */
+ if ( h5_fixname(group_filename, H5P_DEFAULT, prolog_filename,
+ sizeof(prolog_filename)) == NULL ) {
+ pass = FALSE;
+ failure_mssg = "h5_fixname(2) failed.\n";
+ }
+ }
+
+ /* setup data to write */
+ if ( pass ) {
+ if ( (data_slice = (float *)HDmalloc(COUNT * sizeof(float))) == NULL ) {
+ pass = FALSE;
+ failure_mssg = "malloc of data_slice failed.\n";
+ }
+ }
+
+ if ( pass ) {
+ nextValue = (float)(mpi_rank * COUNT);
+
+ for(i=0; i<COUNT; i++) {
+ data_slice[i] = nextValue;
+ nextValue += 1;
+ }
+ }
+
+ /* setup FAPL */
+ if ( pass ) {
+ if ( (fapl_id = H5Pcreate(H5P_FILE_ACCESS)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Pcreate(H5P_FILE_ACCESS) failed.\n";
+ }
+ }
+
+ if ( pass ) {
+ if ( (H5Pset_fapl_mpio(fapl_id, comm, MPI_INFO_NULL)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Pset_fapl_mpio() failed\n";
+ }
+ }
+
+ /* create the data file */
+ if ( pass ) {
+ if ( (file_id = H5Fcreate(data_filename, H5F_ACC_TRUNC,
+ H5P_DEFAULT, fapl_id)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Fcreate() failed.\n";
+ }
+ }
+
+ /* create and write the dataset */
+ if ( pass ) {
+ if ( (dxpl_id = H5Pcreate(H5P_DATASET_XFER)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Pcreate(H5P_DATASET_XFER) failed.\n";
+ }
+ }
+
+ if ( pass ) {
+ if ( (H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Pset_dxpl_mpio() failed.\n";
+ }
+ }
+
+ if ( pass ) {
+ dims[0] = COUNT;
+ if ( (memspace = H5Screate_simple(1, dims, NULL)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Screate_simple(1, dims, NULL) failed (1).\n";
+ }
+ }
+
+ if ( pass ) {
+ dims[0] *= (hsize_t)group_size;
+ if ( (filespace = H5Screate_simple(1, dims, NULL)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Screate_simple(1, dims, NULL) failed (2).\n";
+ }
+ }
+
+ if ( pass ) {
+ offset = (hsize_t)group_rank * (hsize_t)COUNT;
+ if ( (H5Sselect_hyperslab(filespace, H5S_SELECT_SET, &offset,
+ NULL, &count, NULL)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Sselect_hyperslab() failed.\n";
+ }
+ }
+
+ if ( pass ) {
+ if ( (dset_id = H5Dcreate2(file_id, "dataset0", H5T_NATIVE_FLOAT,
+ filespace, H5P_DEFAULT, H5P_DEFAULT,
+ H5P_DEFAULT)) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Dcreate2() failed.\n";
+ }
+ }
+
+ if ( pass ) {
+ if ( (H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace,
+ filespace, dxpl_id, data_slice)) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Dwrite() failed.\n";
+ }
+ }
+
+ /* close file, etc. */
+ if ( pass || (dset_id != -1)) {
+ if ( H5Dclose(dset_id) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Dclose(dset_id) failed.\n";
+ }
+ }
+
+ if ( pass || (memspace != -1) ) {
+ if ( H5Sclose(memspace) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Sclose(memspace) failed.\n";
+ }
+ }
+
+ if ( pass || (filespace != -1) ) {
+ if ( H5Sclose(filespace) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Sclose(filespace) failed.\n";
+ }
+ }
+
+ if ( pass || (file_id != -1) ) {
+ if ( H5Fclose(file_id) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Fclose(file_id) failed.\n";
+ }
+ }
+
+ if ( pass || (dxpl_id != -1) ) {
+ if ( H5Pclose(dxpl_id) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Pclose(dxpl_id) failed.\n";
+ }
+ }
+
+ if ( pass || (fapl_id != -1) ) {
+ if ( H5Pclose(fapl_id) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Pclose(fapl_id) failed.\n";
+ }
+ }
+
+ /* Add a userblock to the head of the datafile.
+ * We will use this for a functional test of the
+ * file open optimization. This superblock
+ * relocation is done by the rank 0 process associated
+ * with the communicator being used. For test 1, we
+ * utilize MPI_COMM_WORLD, so group_rank 0 is the
+ * same as mpi_rank 0. For test 2 which utilizes
+ * two groups resulting from an MPI_Comm_split, we
+ * will have parallel groups and hence two
+ * group_rank(0) processes. Each parallel group
+ * will create a unique file with different text
+ * headers and different data.
+ *
+ * We also delete files that are no longer needed.
+ */
+ if ( group_rank == 0 ) {
+
+ const char *text_to_write;
+ size_t bytes_to_write;
+
+ if (group_id == 0)
+ text_to_write = random_hdf5_text;
+ else
+ text_to_write = hitchhiker_quote;
+
+ bytes_to_write = strlen(text_to_write);
+
+ if ( pass ) {
+ if ( (header = HDfopen(prolog_filename, "w+")) == NULL ) {
+ pass = FALSE;
+ failure_mssg = "HDfopen(prolog_filename, \"w+\") failed.\n";
+ }
+ }
+
+ if ( pass ) {
+
+ if ( HDfwrite(text_to_write, 1, bytes_to_write, header) !=
+ bytes_to_write ) {
+ pass = FALSE;
+ failure_mssg = "Unable to write header file.\n";
+ }
+ }
+
+ if ( pass || (header != NULL) ) {
+ if ( HDfclose(header) != 0 ) {
+ pass = FALSE;
+ failure_mssg = "HDfclose() failed.\n";
+ }
+ }
+
+ if ( pass ) {
+ char cmd[256];
+
+ HDsprintf(cmd, "../tools/src/h5jam/h5jam -i %s -u %s -o %s",
+ data_filename, prolog_filename, reloc_data_filename);
+
+ if ( system(cmd) != 0 ) {
+ pass = FALSE;
+ failure_mssg = "invocation of h5jam failed.\n";
+ }
+ }
+
+ HDremove(prolog_filename);
+ HDremove(data_filename);
+ }
+
+ /* collect results from other processes.
+ * Only overwrite the failure message if no previous error
+ * has been detected
+ */
+ local_failure = ( pass ? 0 : 1 );
+
+ /* This is a global all reduce (NOT group specific) */
+ if ( MPI_Allreduce(&local_failure, &global_failures, 1,
+ MPI_INT, MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS ) {
+ if ( pass ) {
+ pass = FALSE;
+ failure_mssg = "MPI_Allreduce() failed.\n";
+ }
+ } else if ( ( pass ) && ( global_failures > 0 ) ) {
+ pass = FALSE;
+ failure_mssg = "One or more processes report failure.\n";
+ }
+
+ /* report results */
+ if ( mpi_rank == 0 ) {
+ if ( pass ) {
+ HDfprintf(stdout, "Done.\n");
+ } else {
+ HDfprintf(stdout, "FAILED.\n");
+ HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n",
+ fcn_name, failure_mssg);
+ }
+ }
+
+ /* free data_slice if it has been allocated */
+ if ( data_slice != NULL ) {
+ HDfree(data_slice);
+ data_slice = NULL;
+ }
+
+ return(! pass);
+
+} /* generate_test_file() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: test_parallel_read
+ *
+ * Purpose: This actually tests the superblock optimization
+ * and covers the two primary cases we're interested in.
+ * 1). That HDF5 files can be opened in parallel by
+ * the rank 0 process and that the superblock
+ * offset is correctly broadcast to the other
+ * parallel file readers.
+ * 2). That a parallel application can correctly
+ * handle reading multiple files by using
+ * subgroups of MPI_COMM_WORLD and that each
+ * subgroup operates as described in (1) to
+ * collectively read the data.
+ *
+ * The global MPI rank is used for reading and
+ * writing the process-specific data in the
+ * dataset. We do this rather simplistically, i.e.
+ * rank 0: writes/reads 0-999
+ * rank 1: writes/reads 1000-1999
+ * rank 2: writes/reads 2000-2999
+ * ...
+ *
+ * Return: Success: 0
+ *
+ * Failure: 1
+ *
+ * Programmer: Richard Warren
+ * 10/1/17
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+test_parallel_read(MPI_Comm comm, int mpi_rank, int group_id)
+{
+ const char *failure_mssg;
+ const char *fcn_name = "test_parallel_read()";
+ const char *group_filename = NULL;
+ char reloc_data_filename[FILENAME_BUF_SIZE];
+ int local_failure = 0;
+ int global_failures = 0;
+ int group_size;
+ int group_rank;
+ hid_t fapl_id = -1;
+ hid_t file_id = -1;
+ hid_t dset_id = -1;
+ hid_t memspace = -1;
+ hid_t filespace = -1;
+ hsize_t i;
+ hsize_t offset;
+ hsize_t count = COUNT;
+ hsize_t dims[1] = {0};
+ float nextValue;
+ float *data_slice = NULL;
+
+ pass = TRUE;
+
+ HDassert(comm != MPI_COMM_NULL);
+
+ if ( (MPI_Comm_rank(comm, &group_rank)) != MPI_SUCCESS) {
+ pass = FALSE;
+ failure_mssg = "test_parallel_read: MPI_Comm_rank failed.\n";
+ }
+
+ if ( (MPI_Comm_size(comm, &group_size)) != MPI_SUCCESS) {
+ pass = FALSE;
+ failure_mssg = "test_parallel_read: MPI_Comm_size failed.\n";
+ }
+
+ if ( mpi_rank == 0 ) {
+ if ( comm == MPI_COMM_WORLD ) {
+ TESTING("parallel file open test 1");
+ }
+ else {
+ TESTING("parallel file open test 2");
+ }
+ }
+
+ /* allocate space for the data_slice array */
+ if ( pass ) {
+ if ( (data_slice = (float *)HDmalloc(COUNT * sizeof(float))) == NULL ) {
+ pass = FALSE;
+ failure_mssg = "malloc of data_slice failed.\n";
+ }
+ }
+
+
+ /* Select the file name to read
+ * Please see the comments in the 'generate_test_file' function
+ * for more details...
+ */
+ if ( pass ) {
+
+ if ( comm == MPI_COMM_WORLD ) /* test 1 */
+ group_filename = FILENAMES[1];
+ else if ( group_id == 0 ) /* test 2 group 0 */
+ group_filename = FILENAMES[4];
+ else /* test 2 group 1 */
+ group_filename = FILENAMES[7];
+
+ HDassert(group_filename);
+ if ( h5_fixname(group_filename, H5P_DEFAULT, reloc_data_filename,
+ sizeof(reloc_data_filename)) == NULL ) {
+
+ pass = FALSE;
+ failure_mssg = "h5_fixname(1) failed.\n";
+ }
+ }
+
+ /* setup FAPL */
+ if ( pass ) {
+ if ( (fapl_id = H5Pcreate(H5P_FILE_ACCESS)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Pcreate(H5P_FILE_ACCESS) failed.\n";
+ }
+ }
+
+ if ( pass ) {
+ if ( (H5Pset_fapl_mpio(fapl_id, comm, MPI_INFO_NULL)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Pset_fapl_mpio() failed\n";
+ }
+ }
+
+ /* open the file -- should have user block, exercising the optimization */
+ if ( pass ) {
+ if ( (file_id = H5Fopen(reloc_data_filename,
+ H5F_ACC_RDONLY, fapl_id)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Fopen() failed\n";
+ }
+ }
+
+ /* open the data set */
+ if ( pass ) {
+ if ( (dset_id = H5Dopen2(file_id, "dataset0", H5P_DEFAULT)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Dopen2() failed\n";
+ }
+ }
+
+ /* setup memspace */
+ if ( pass ) {
+ dims[0] = count;
+ if ( (memspace = H5Screate_simple(1, dims, NULL)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Screate_simple(1, dims, NULL) failed\n";
+ }
+ }
+
+ /* setup filespace */
+ if ( pass ) {
+ if ( (filespace = H5Dget_space(dset_id)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Dget_space(dataset) failed\n";
+ }
+ }
+
+ if ( pass ) {
+ offset = (hsize_t)group_rank * count;
+ if ( (H5Sselect_hyperslab(filespace, H5S_SELECT_SET,
+ &offset, NULL, &count, NULL)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Sselect_hyperslab() failed\n";
+ }
+ }
+
+ /* read this process's section of the data */
+ if ( pass ) {
+ if ( (H5Dread(dset_id, H5T_NATIVE_FLOAT, memspace,
+ filespace, H5P_DEFAULT, data_slice)) < 0 ) {
+ pass = FALSE;
+ failure_mssg = "H5Dread() failed\n";
+ }
+ }
+
+ /* verify the data */
+ if ( pass ) {
+ nextValue = (float)((hsize_t)mpi_rank * count);
+ i = 0;
+ while ( ( pass ) && ( i < count ) ) {
+ /* what we really want is data_slice[i] != nextValue --
+ * the following is a circumlocution to shut up
+ * the compiler.
+ */
+ if ( ( data_slice[i] > nextValue ) ||
+ ( data_slice[i] < nextValue ) ) {
+ pass = FALSE;
+ failure_mssg = "Unexpected dset contents.\n";
+ }
+ nextValue += 1;
+ i++;
+ }
+ }
+
+ /* close file, etc. */
+ if ( pass || (dset_id != -1) ) {
+ if ( H5Dclose(dset_id) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Dclose(dset_id) failed.\n";
+ }
+ }
+
+ if ( pass || (memspace != -1) ) {
+ if ( H5Sclose(memspace) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Sclose(memspace) failed.\n";
+ }
+ }
+
+ if ( pass || (filespace != -1) ) {
+ if ( H5Sclose(filespace) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Sclose(filespace) failed.\n";
+ }
+ }
+
+ if ( pass || (file_id != -1) ) {
+ if ( H5Fclose(file_id) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Fclose(file_id) failed.\n";
+ }
+ }
+
+ if ( pass || (fapl_id != -1) ) {
+ if ( H5Pclose(fapl_id) < 0 ) {
+ pass = false;
+ failure_mssg = "H5Pclose(fapl_id) failed.\n";
+ }
+ }
+
+ /* collect results from other processes.
+ * Only overwrite the failure message if no previous error
+ * has been detected
+ */
+ local_failure = ( pass ? 0 : 1 );
+
+ if ( MPI_Allreduce( &local_failure, &global_failures, 1,
+ MPI_INT, MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS ) {
+ if ( pass ) {
+ pass = FALSE;
+ failure_mssg = "MPI_Allreduce() failed.\n";
+ }
+ } else if ( ( pass ) && ( global_failures > 0 ) ) {
+ pass = FALSE;
+ failure_mssg = "One or more processes report failure.\n";
+ }
+
+ /* report results and finish cleanup */
+ if ( group_rank == 0 ) {
+ if ( pass ) {
+ PASSED();
+ } else {
+ H5_FAILED();
+ HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n",
+ fcn_name, failure_mssg);
+ }
+
+ HDremove(reloc_data_filename);
+ }
+
+ /* free data_slice if it has been allocated */
+ if ( data_slice != NULL ) {
+ HDfree(data_slice);
+ data_slice = NULL;
+ }
+
+
+ return( ! pass );
+
+} /* test_parallel_read() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: main
+ *
+ * Purpose: To implement a parallel test which validates whether the
+ * new superblock lookup functionality is working correctly.
+ *
+ * The test consists of creating two separate HDF5 files
+ * in which random text is inserted at the start of each
+ * file using the 'h5jam' utility. This forces the
+ * HDF5 file superblock to a non-zero offset.
+ * Having created the two independent files, we create two
+ * non-overlapping MPI groups, each of which is then tasked
+ * with the opening and validation of the data contained
+ * therein.
+ *
+ * WARNING: This test shells out to the 'h5jam' utility
+ * via system(), and therefore will not run on Windows.
+ *
+ * Return: Success: 0
+ *
+ * Failure: 1
+ *
+ * Programmer: Richard Warren
+ * 10/1/17
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+
+int
+main( int argc, char **argv)
+{
+ int nerrs = 0;
+ int which_group = 0;
+ int mpi_rank;
+ int mpi_size;
+ int split_size;
+ MPI_Comm group_comm = MPI_COMM_NULL; /* so an early "goto finish" skips MPI_Comm_free() */
+
+ if ( (MPI_Init(&argc, &argv)) != MPI_SUCCESS) {
+ HDfprintf(stderr, "FATAL: Unable to initialize MPI\n");
+ HDexit(EXIT_FAILURE);
+ }
+
+ if ( (MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank)) != MPI_SUCCESS) {
+ HDfprintf(stderr, "FATAL: MPI_Comm_rank returned an error\n");
+ HDexit(EXIT_FAILURE);
+ }
+
+ if ( (MPI_Comm_size(MPI_COMM_WORLD, &mpi_size)) != MPI_SUCCESS) {
+ HDfprintf(stderr, "FATAL: MPI_Comm_size returned an error\n");
+ HDexit(EXIT_FAILURE);
+ }
+
+ H5open();
+
+ if ( mpi_rank == 0 ) {
+ HDfprintf(stdout, "========================================\n");
+ HDfprintf(stdout, "Collective file open optimization tests\n");
+ HDfprintf(stdout, " mpi_size = %d\n", mpi_size);
+ HDfprintf(stdout, "========================================\n");
+ }
+
+ if ( mpi_size < 4 ) {
+
+ if ( mpi_rank == 0 ) {
+
+ HDprintf(" Need at least 4 processes. Exiting.\n");
+ }
+ goto finish;
+ }
+
+ /* ------ Create two (2) MPI groups ------
+ *
+ * We split MPI_COMM_WORLD into two more or less equal-sized
+ * groups. The resulting communicators will be used to generate
+ * two HDF5 files which in turn will be opened in parallel and the
+ * contents verified in the second read test below.
+ */
+ split_size = mpi_size / 2;
+ which_group = (mpi_rank < split_size ? 0 : 1);
+
+ if ( (MPI_Comm_split(MPI_COMM_WORLD,
+ which_group,
+ 0,
+ &group_comm)) != MPI_SUCCESS) {
+
+ HDfprintf(stderr, "FATAL: MPI_Comm_split returned an error\n");
+ HDexit(EXIT_FAILURE);
+ }
+
+ /* ------ Generate all files ------ */
+
+ /* We generate the file used for test 1 */
+ nerrs += generate_test_file( MPI_COMM_WORLD, mpi_rank, which_group );
+
+ if ( nerrs > 0 ) {
+ if ( mpi_rank == 0 ) {
+ HDprintf(" Test(1) file construction failed -- skipping tests.\n");
+ }
+ goto finish;
+ }
+
+ /* We generate the file used for test 2 */
+ nerrs += generate_test_file( group_comm, mpi_rank, which_group );
+
+ if ( nerrs > 0 ) {
+ if ( mpi_rank == 0 ) {
+ HDprintf(" Test(2) file construction failed -- skipping tests.\n");
+ }
+ goto finish;
+ }
+
+ /* Now read the generated test file (still using MPI_COMM_WORLD) */
+ nerrs += test_parallel_read( MPI_COMM_WORLD, mpi_rank, which_group);
+
+ if ( nerrs > 0 ) {
+ if ( mpi_rank == 0 ) {
+ HDprintf(" Parallel read test(1) failed -- skipping tests.\n");
+ }
+ goto finish;
+ }
+
+ /* Update the user on our progress so far. */
+ if ( mpi_rank == 0 ) {
+ HDprintf(" Test 1 of 2 succeeded\n");
+ HDprintf(" -- Starting multi-group parallel read test.\n");
+ }
+
+ /* run the 2nd set of tests */
+ nerrs += test_parallel_read(group_comm, mpi_rank, which_group);
+
+ if ( nerrs > 0 ) {
+ if ( mpi_rank == 0 ) {
+ HDprintf(" Multi-group read test(2) failed\n");
+ }
+ goto finish;
+ }
+
+ if ( mpi_rank == 0 ) {
+ HDprintf(" Test 2 of 2 succeeded\n");
+ }
+
+finish:
+
+ if ((group_comm != MPI_COMM_NULL) &&
+ (MPI_Comm_free(&group_comm)) != MPI_SUCCESS) {
+ HDfprintf(stderr, "MPI_Comm_free failed!\n");
+ }
+
+
+ /* make sure all processes are finished before final report, cleanup
+ * and exit.
+ */
+ MPI_Barrier(MPI_COMM_WORLD);
+
+ if ( mpi_rank == 0 ) { /* only process 0 reports */
+ const char *header = "Collective file open optimization tests";
+
+ HDfprintf(stdout, "===================================\n");
+ if ( nerrs > 0 ) {
+
+ HDfprintf(stdout, "***%s detected %d failures***\n", header, nerrs);
+ }
+ else {
+ HDfprintf(stdout, "%s finished with no failures\n", header);
+ }
+ HDfprintf(stdout, "===================================\n");
+ }
+
+ /* close HDF5 library */
+ if (H5close() != SUCCEED) {
+ HDfprintf(stdout, "H5close() failed. (Ignoring)\n");
+ }
+
+ /* MPI_Finalize must be called AFTER H5close which may use MPI calls */
+ MPI_Finalize();
+
+ /* cannot just return (nerrs) because the exit code is limited to 1 byte */
+ return((nerrs > 0) ? EXIT_FAILURE : EXIT_SUCCESS );
+
+} /* main() */
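
Since main() exits early when mpi_size < 4, the test needs at least four MPI
ranks for either test path to run, e.g. with an mpiexec-style launcher:
mpiexec -n 4 ./t_pread. The hard-coded "../tools/src/h5jam/h5jam" command
above also appears to assume the test is launched from the testpar directory
of a build tree, so that the h5jam tool is reachable at that relative path.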