summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRichard Warren <Richard.Warren@hdfgroup.org>2020-09-25 11:45:15 (GMT)
committerRichard Warren <Richard.Warren@hdfgroup.org>2020-09-25 11:45:15 (GMT)
commit5dd85abb4124f337eea52ef33d0c7e3fd67ed92d (patch)
tree7146b025f19d5d5bb5f85428f2cf6d6d91ceca19
parentd612a249afe1eb4799ccbf725d8309452446dee7 (diff)
downloadhdf5-5dd85abb4124f337eea52ef33d0c7e3fd67ed92d.zip
hdf5-5dd85abb4124f337eea52ef33d0c7e3fd67ed92d.tar.gz
hdf5-5dd85abb4124f337eea52ef33d0c7e3fd67ed92d.tar.bz2
Create a new branch for the September Subfiling deliverable
-rwxr-xr-xbin/trace3
-rw-r--r--src/CMakeLists.txt2
-rw-r--r--src/H5FDfamily.c514
-rw-r--r--src/H5FDsplitter.c4
-rw-r--r--src/H5FDsubfile.c475
-rw-r--r--src/H5FDsubfile.h0
-rw-r--r--src/H5FDsubfile_mpi.c3839
-rw-r--r--src/H5FDsubfile_private.h299
-rw-r--r--src/H5FDsubfile_public.h4
-rw-r--r--src/H5FDsubfile_threads.c392
-rw-r--r--src/H5FDsubfiling.c992
-rw-r--r--src/H5FDsubfiling.h20
-rw-r--r--src/Makefile.am2
-rw-r--r--test/CMakeLists.txt1
-rw-r--r--test/Makefile.am6
-rw-r--r--test/tselect.c6
-rw-r--r--test/vfd.c364
-rw-r--r--testpar/CMakeLists.txt3
-rw-r--r--testpar/t_bigio.c23
-rw-r--r--testpar/t_subfile_openclose.c34
-rw-r--r--testpar/t_subfile_readwrite.c93
-rw-r--r--tools/lib/h5diff.c7
-rw-r--r--tools/lib/h5tools_utils.c1
-rw-r--r--tools/lib/h5tools_utils.h1
-rw-r--r--tools/lib/h5trav.c17
25 files changed, 4815 insertions, 2287 deletions
diff --git a/bin/trace b/bin/trace
index fd0248e..ab84153 100755
--- a/bin/trace
+++ b/bin/trace
@@ -67,6 +67,7 @@ $Source = "";
"hid_t" => "i",
"int" => "Is",
"int32_t" => "Is",
+ "int64_t" => "IL",
"unsigned" => "Iu",
"unsigned int" => "Iu",
"uint32_t" => "Iu",
@@ -161,6 +162,7 @@ $Source = "";
"H5FD_hdfs_fapl_t" => "x",
"H5FD_file_image_callbacks_t" => "x",
"H5FD_mirror_fapl_t" => "x",
+ "H5FD_subfiling_fapl_t" => "x",
"H5G_iterate_t" => "x",
"H5G_info_t" => "x",
"H5I_free_t" => "x",
@@ -201,6 +203,7 @@ $Source = "";
"H5VL_request_notify_t" => "x",
"H5Z_func_t" => "x",
"H5Z_filter_func_t" => "x",
+ "sf_ioc_selection_t" => "x",
"va_list" => "x",
"size_t" => "z",
"H5Z_SO_scale_type_t" => "Za",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index dc97db3..1fe0bce 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -243,6 +243,7 @@ set (H5FD_SOURCES
${HDF5_SRC_DIR}/H5FDstdio.c
${HDF5_SRC_DIR}/H5FDtest.c
${HDF5_SRC_DIR}/H5FDwindows.c
+ ${HDF5_SRC_DIR}/H5FDsubfiling.c
${HDF5_SRC_DIR}/H5FDsubfile.c
${HDF5_SRC_DIR}/H5FDsubfile_threads.c
${HDF5_SRC_DIR}/H5FDsubfile_mpi.c
@@ -265,6 +266,7 @@ set (H5FD_HDRS
${HDF5_SRC_DIR}/H5FDsplitter.h
${HDF5_SRC_DIR}/H5FDstdio.h
${HDF5_SRC_DIR}/H5FDwindows.h
+ ${HDF5_SRC_DIR}/H5FDsubfiling.h
${HDF5_SRC_DIR}/H5FDsubfile_public.h
${HDF5_SRC_DIR}/mercury/mercury_thread.h
${HDF5_SRC_DIR}/mercury/mercury_thread_mutex.h
diff --git a/src/H5FDfamily.c b/src/H5FDfamily.c
index 2537d86..e7e2e8b 100644
--- a/src/H5FDfamily.c
+++ b/src/H5FDfamily.c
@@ -12,38 +12,38 @@
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
- * Programmer: Robb Matzke
- * Monday, November 10, 1997
- *
- * Purpose: Implements a family of files that acts as a single hdf5
- * file. The purpose is to be able to split a huge file on a
- * 64-bit platform, transfer all the <2GB members to a 32-bit
- * platform, and then access the entire huge file on the 32-bit
- * platform.
- *
- * All family members are logically the same size although their
- * physical sizes may vary. The logical member size is
- * determined by looking at the physical size of the first member
- * when the file is opened. When creating a file family, the
- * first member is created with a predefined physical size
- * (actually, this happens when the file family is flushed, and
- * can be quite time consuming on file systems that don't
- * implement holes, like nfs).
+ * Programmer: Robb Matzke
+ * Monday, November 10, 1997
+ *
+ * Purpose: Implements a family of files that acts as a single hdf5
+ * file. The purpose is to be able to split a huge file on a
+ * 64-bit platform, transfer all the <2GB members to a 32-bit
+ * platform, and then access the entire huge file on the 32-bit
+ * platform.
+ *
+ * All family members are logically the same size although their
+ * physical sizes may vary. The logical member size is
+ * determined by looking at the physical size of the first member
+ * when the file is opened. When creating a file family, the
+ * first member is created with a predefined physical size
+ * (actually, this happens when the file family is flushed, and
+ * can be quite time consuming on file systems that don't
+ * implement holes, like nfs).
*
*/
#include "H5FDdrvr_module.h" /* This source code file is part of the H5FD driver module */
-#include "H5private.h" /* Generic Functions */
-#include "H5CXprivate.h" /* API Contexts */
-#include "H5Eprivate.h" /* Error handling */
-#include "H5Fprivate.h" /* File access */
-#include "H5FDprivate.h" /* File drivers */
-#include "H5FDfamily.h" /* Family file driver */
-#include "H5Iprivate.h" /* IDs */
-#include "H5MMprivate.h" /* Memory management */
-#include "H5Pprivate.h" /* Property lists */
+#include "H5private.h" /* Generic Functions */
+#include "H5CXprivate.h" /* API Contexts */
+#include "H5Eprivate.h" /* Error handling */
+#include "H5Fprivate.h" /* File access */
+#include "H5FDprivate.h" /* File drivers */
+#include "H5FDfamily.h" /* Family file driver */
+#include "H5Iprivate.h" /* IDs */
+#include "H5MMprivate.h" /* Memory management */
+#include "H5Pprivate.h" /* Property lists */
/* The size of the member name buffers */
#define H5FD_FAM_MEMB_NAME_BUF_SIZE 4096
@@ -53,29 +53,29 @@ static hid_t H5FD_FAMILY_g = 0;
/* The description of a file belonging to this driver. */
typedef struct H5FD_family_t {
- H5FD_t pub; /*public stuff, must be first */
- hid_t memb_fapl_id; /*file access property list for members */
- hsize_t memb_size; /*actual size of each member file */
- hsize_t pmem_size; /*member size passed in from property */
- unsigned nmembs; /*number of family members */
- unsigned amembs; /*number of member slots allocated */
- H5FD_t **memb; /*dynamic array of member pointers */
- haddr_t eoa; /*end of allocated addresses */
- char *name; /*name generator printf format */
- unsigned flags; /*flags for opening additional members */
+ H5FD_t pub; /* public stuff, must be first */
+ hid_t memb_fapl_id; /* file access property list for members */
+ hsize_t memb_size; /* actual size of each member file */
+ hsize_t pmem_size; /* member size passed in from property */
+ unsigned nmembs; /* number of family members */
+ unsigned amembs; /* number of member slots allocated */
+ H5FD_t **memb; /* dynamic array of member pointers */
+ haddr_t eoa; /* end of allocated addresses */
+ char *name; /* name generator printf format */
+ unsigned flags; /* flags for opening additional members */
/* Information from properties set by 'h5repart' tool */
- hsize_t mem_newsize; /*new member size passed in as private
- * property. It's used only by h5repart */
- hbool_t repart_members; /* Whether to mark the superblock dirty
- * when it is loaded, so that the family
- * member sizes can be re-encoded */
+ hsize_t mem_newsize; /* new member size passed in as private
+ * property. It's used only by h5repart */
+ hbool_t repart_members; /* Whether to mark the superblock dirty
+ * when it is loaded, so that the family
+ * member sizes can be re-encoded */
} H5FD_family_t;
/* Driver-specific file access properties */
typedef struct H5FD_family_fapl_t {
- hsize_t memb_size; /*size of each member */
- hid_t memb_fapl_id; /*file access property list of each memb*/
+ hsize_t memb_size; /*size of each member */
+ hid_t memb_fapl_id; /*file access property list of each memb*/
} H5FD_family_fapl_t;
/* Callback prototypes */
@@ -85,11 +85,11 @@ static void *H5FD__family_fapl_copy(const void *_old_fa);
static herr_t H5FD__family_fapl_free(void *_fa);
static hsize_t H5FD__family_sb_size(H5FD_t *_file);
static herr_t H5FD__family_sb_encode(H5FD_t *_file, char *name/*out*/,
- unsigned char *buf/*out*/);
+ unsigned char *buf/*out*/);
static herr_t H5FD__family_sb_decode(H5FD_t *_file, const char *name,
const unsigned char *buf);
static H5FD_t *H5FD__family_open(const char *name, unsigned flags,
- hid_t fapl_id, haddr_t maxaddr);
+ hid_t fapl_id, haddr_t maxaddr);
static herr_t H5FD__family_close(H5FD_t *_file);
static int H5FD__family_cmp(const H5FD_t *_f1, const H5FD_t *_f2);
static herr_t H5FD__family_query(const H5FD_t *_f1, unsigned long *flags);
@@ -98,9 +98,9 @@ static herr_t H5FD__family_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t eoa);
static haddr_t H5FD__family_get_eof(const H5FD_t *_file, H5FD_mem_t type);
static herr_t H5FD__family_get_handle(H5FD_t *_file, hid_t fapl, void** file_handle);
static herr_t H5FD__family_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
- size_t size, void *_buf/*out*/);
+ size_t size, void *_buf/*out*/);
static herr_t H5FD__family_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
- size_t size, const void *_buf);
+ size_t size, const void *_buf);
static herr_t H5FD__family_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
static herr_t H5FD__family_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
static herr_t H5FD__family_lock(H5FD_t *_file, hbool_t rw);
@@ -108,40 +108,40 @@ static herr_t H5FD__family_unlock(H5FD_t *_file);
/* The class struct */
static const H5FD_class_t H5FD_family_g = {
- "family", /* name */
- HADDR_MAX, /* maxaddr */
- H5F_CLOSE_WEAK, /* fc_degree */
- H5FD__family_term, /* terminate */
- H5FD__family_sb_size, /* sb_size */
- H5FD__family_sb_encode, /* sb_encode */
- H5FD__family_sb_decode, /* sb_decode */
- sizeof(H5FD_family_fapl_t), /* fapl_size */
- H5FD__family_fapl_get, /* fapl_get */
- H5FD__family_fapl_copy, /* fapl_copy */
- H5FD__family_fapl_free, /* fapl_free */
- 0, /* dxpl_size */
- NULL, /* dxpl_copy */
- NULL, /* dxpl_free */
- H5FD__family_open, /* open */
- H5FD__family_close, /* close */
- H5FD__family_cmp, /* cmp */
- H5FD__family_query, /* query */
- NULL, /* get_type_map */
- NULL, /* alloc */
- NULL, /* free */
- H5FD__family_get_eoa, /* get_eoa */
- H5FD__family_set_eoa, /* set_eoa */
- H5FD__family_get_eof, /* get_eof */
- H5FD__family_get_handle, /* get_handle */
- H5FD__family_read, /* read */
- H5FD__family_write, /* write */
- H5FD__family_flush, /* flush */
- NULL, /* read_vector */
- NULL, /* write_vector */
- H5FD__family_truncate, /* truncate */
- H5FD__family_lock, /* lock */
- H5FD__family_unlock, /* unlock */
- H5FD_FLMAP_DICHOTOMY /* fl_map */
+ "family", /* name */
+ HADDR_MAX, /* maxaddr */
+ H5F_CLOSE_WEAK, /* fc_degree */
+ H5FD__family_term, /* terminate */
+ H5FD__family_sb_size, /* sb_size */
+ H5FD__family_sb_encode, /* sb_encode */
+ H5FD__family_sb_decode, /* sb_decode */
+ sizeof(H5FD_family_fapl_t), /* fapl_size */
+ H5FD__family_fapl_get, /* fapl_get */
+ H5FD__family_fapl_copy, /* fapl_copy */
+ H5FD__family_fapl_free, /* fapl_free */
+ 0, /* dxpl_size */
+ NULL, /* dxpl_copy */
+ NULL, /* dxpl_free */
+ H5FD__family_open, /* open */
+ H5FD__family_close, /* close */
+ H5FD__family_cmp, /* cmp */
+ H5FD__family_query, /* query */
+ NULL, /* get_type_map */
+ NULL, /* alloc */
+ NULL, /* free */
+ H5FD__family_get_eoa, /* get_eoa */
+ H5FD__family_set_eoa, /* set_eoa */
+ H5FD__family_get_eof, /* get_eof */
+ H5FD__family_get_handle, /* get_handle */
+ H5FD__family_read, /* read */
+ H5FD__family_write, /* write */
+ NULL, /* read_vector */
+ NULL, /* write_vector */
+ H5FD__family_flush, /* flush */
+ H5FD__family_truncate, /* truncate */
+ H5FD__family_lock, /* lock */
+ H5FD__family_unlock, /* unlock */
+ H5FD_FLMAP_DICHOTOMY /* fl_map */
};
@@ -205,9 +205,9 @@ done:
/*---------------------------------------------------------------------------
- * Function: H5FD__family_term
+ * Function: H5FD__family_term
*
- * Purpose: Shut down the VFD
+ * Purpose: Shut down the VFD
*
* Returns: Non-negative on success or negative on failure
*
@@ -229,19 +229,19 @@ H5FD__family_term(void)
/*-------------------------------------------------------------------------
- * Function: H5Pset_fapl_family
+ * Function: H5Pset_fapl_family
*
- * Purpose: Sets the file access property list FAPL_ID to use the family
- * driver. The MEMB_SIZE is the size in bytes of each file
- * member (used only when creating a new file) and the
- * MEMB_FAPL_ID is a file access property list to be used for
- * each family member.
+ * Purpose: Sets the file access property list FAPL_ID to use the family
+ * driver. The MEMB_SIZE is the size in bytes of each file
+ * member (used only when creating a new file) and the
+ * MEMB_FAPL_ID is a file access property list to be used for
+ * each family member.
*
- * Return: Success: Non-negative
+ * Return: Success: Non-negative
*
- * Failure: Negative
+ * Failure: Negative
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
@@ -250,7 +250,7 @@ herr_t
H5Pset_fapl_family(hid_t fapl_id, hsize_t msize, hid_t memb_fapl_id)
{
herr_t ret_value;
- H5FD_family_fapl_t fa={0, -1};
+ H5FD_family_fapl_t fa={0, -1};
H5P_genplist_t *plist; /* Property list pointer */
FUNC_ENTER_API(FAIL)
@@ -279,16 +279,16 @@ done:
/*-------------------------------------------------------------------------
- * Function: H5Pget_fapl_family
+ * Function: H5Pget_fapl_family
*
- * Purpose: Returns information about the family file access property
- * list though the function arguments.
+ * Purpose: Returns information about the family file access property
+ * list through the function arguments.
*
- * Return: Success: Non-negative
+ * Return: Success: Non-negative
*
- * Failure: Negative
+ * Failure: Negative
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
@@ -297,7 +297,7 @@ herr_t
H5Pget_fapl_family(hid_t fapl_id, hsize_t *msize/*out*/, hid_t *memb_fapl_id/*out*/)
{
H5P_genplist_t *plist; /* Property list pointer */
- const H5FD_family_fapl_t *fa;
+ const H5FD_family_fapl_t *fa;
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_API(FAIL)
@@ -323,16 +323,16 @@ done:
/*-------------------------------------------------------------------------
- * Function: H5FD__family_fapl_get
+ * Function: H5FD__family_fapl_get
*
- * Purpose: Gets a file access property list which could be used to
- * create an identical file.
+ * Purpose: Gets a file access property list which could be used to
+ * create an identical file.
*
- * Return: Success: Ptr to new file access property list.
+ * Return: Success: Ptr to new file access property list.
*
- * Failure: NULL
+ * Failure: NULL
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Friday, August 13, 1999
*
*-------------------------------------------------------------------------
@@ -340,8 +340,8 @@ done:
static void *
H5FD__family_fapl_get(H5FD_t *_file)
{
- H5FD_family_t *file = (H5FD_family_t*)_file;
- H5FD_family_fapl_t *fa = NULL;
+ H5FD_family_t *file = (H5FD_family_t*)_file;
+ H5FD_family_fapl_t *fa = NULL;
H5P_genplist_t *plist; /* Property list pointer */
void *ret_value = NULL; /* Return value */
@@ -368,15 +368,15 @@ done:
/*-------------------------------------------------------------------------
- * Function: H5FD__family_fapl_copy
+ * Function: H5FD__family_fapl_copy
*
- * Purpose: Copies the family-specific file access properties.
+ * Purpose: Copies the family-specific file access properties.
*
- * Return: Success: Ptr to a new property list
+ * Return: Success: Ptr to a new property list
*
- * Failure: NULL
+ * Failure: NULL
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
@@ -421,15 +421,15 @@ done:
/*-------------------------------------------------------------------------
- * Function: H5FD__family_fapl_free
+ * Function: H5FD__family_fapl_free
*
- * Purpose: Frees the family-specific file access properties.
+ * Purpose: Frees the family-specific file access properties.
*
- * Return: Success: 0
+ * Return: Success: 0
*
- * Failure: -1
+ * Failure: -1
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
@@ -437,7 +437,7 @@ done:
static herr_t
H5FD__family_fapl_free(void *_fa)
{
- H5FD_family_fapl_t *fa = (H5FD_family_fapl_t*)_fa;
+ H5FD_family_fapl_t *fa = (H5FD_family_fapl_t*)_fa;
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_STATIC
@@ -452,16 +452,16 @@ done:
/*-------------------------------------------------------------------------
- * Function: H5FD__family_sb_size
+ * Function: H5FD__family_sb_size
*
- * Purpose: Returns the size of the private information to be stored in
- * the superblock.
+ * Purpose: Returns the size of the private information to be stored in
+ * the superblock.
*
- * Return: Success: The super block driver data size.
+ * Return: Success: The super block driver data size.
*
- * Failure: never fails
+ * Failure: never fails
*
- * Programmer: Raymond Lu
+ * Programmer: Raymond Lu
* Tuesday, May 10, 2005
*
*-------------------------------------------------------------------------
@@ -478,19 +478,19 @@ H5FD__family_sb_size(H5FD_t H5_ATTR_UNUSED *_file)
/*-------------------------------------------------------------------------
- * Function: H5FD__family_sb_encode
+ * Function: H5FD__family_sb_encode
*
- * Purpose: Encode driver information for the superblock. The NAME
- * argument is a nine-byte buffer which will be initialized with
- * an eight-character name/version number and null termination.
+ * Purpose: Encode driver information for the superblock. The NAME
+ * argument is a nine-byte buffer which will be initialized with
+ * an eight-character name/version number and null termination.
*
- * The encoding is the member file size and name template.
+ * The encoding is the member file size and name template.
*
- * Return: Success: 0
+ * Return: Success: 0
*
- * Failure: -1
+ * Failure: -1
*
- * Programmer: Raymond Lu
+ * Programmer: Raymond Lu
* Tuesday, May 10, 2005
*
*-------------------------------------------------------------------------
@@ -498,7 +498,7 @@ H5FD__family_sb_size(H5FD_t H5_ATTR_UNUSED *_file)
static herr_t
H5FD__family_sb_encode(H5FD_t *_file, char *name/*out*/, unsigned char *buf/*out*/)
{
- H5FD_family_t *file = (H5FD_family_t*)_file;
+ H5FD_family_t *file = (H5FD_family_t*)_file;
FUNC_ENTER_STATIC_NOERR
@@ -521,19 +521,19 @@ H5FD__family_sb_encode(H5FD_t *_file, char *name/*out*/, unsigned char *buf/*out
/*-------------------------------------------------------------------------
- * Function: H5FD__family_sb_decode
+ * Function: H5FD__family_sb_decode
*
- * Purpose: This function has 2 separate purpose. One is to decodes the
+ * Purpose: This function has two separate purposes. One is to decode the
* superblock information for this driver. The NAME argument is
* the eight-character (plus null termination) name stored in i
* the file. The FILE argument is updated according to the
* information in the superblock.
*
- * Return: Success: 0
+ * Return: Success: 0
*
- * Failure: -1
+ * Failure: -1
*
- * Programmer: Raymond Lu
+ * Programmer: Raymond Lu
* Tuesday, May 10, 2005
*
*-------------------------------------------------------------------------
@@ -541,7 +541,7 @@ H5FD__family_sb_encode(H5FD_t *_file, char *name/*out*/, unsigned char *buf/*out
static herr_t
H5FD__family_sb_decode(H5FD_t *_file, const char H5_ATTR_UNUSED *name, const unsigned char *buf)
{
- H5FD_family_t *file = (H5FD_family_t*)_file;
+ H5FD_family_t *file = (H5FD_family_t*)_file;
uint64_t msize;
herr_t ret_value = SUCCEED; /* Return value */
@@ -576,17 +576,17 @@ done:
/*-------------------------------------------------------------------------
- * Function: H5FD__family_open
+ * Function: H5FD__family_open
*
- * Purpose: Creates and/or opens a family of files as an HDF5 file.
+ * Purpose: Creates and/or opens a family of files as an HDF5 file.
*
- * Return: Success: A pointer to a new file dat structure. The
- * public fields will be initialized by the
- * caller, which is always H5FD_open().
+ * Return: Success: A pointer to a new file data structure. The
+ * public fields will be initialized by the
+ * caller, which is always H5FD_open().
*
- * Failure: NULL
+ * Failure: NULL
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
@@ -600,13 +600,13 @@ done:
H5_GCC_DIAG_OFF(format-nonliteral)
static H5FD_t *
H5FD__family_open(const char *name, unsigned flags, hid_t fapl_id,
- haddr_t maxaddr)
+ haddr_t maxaddr)
{
- H5FD_family_t *file = NULL;
- char *memb_name = NULL, *temp = NULL;
- hsize_t eof = HADDR_UNDEF;
- unsigned t_flags = flags & ~H5F_ACC_CREAT;
- H5FD_t *ret_value = NULL;
+ H5FD_family_t *file = NULL;
+ char *memb_name = NULL, *temp = NULL;
+ hsize_t eof = HADDR_UNDEF;
+ unsigned t_flags = flags & ~H5F_ACC_CREAT;
+ H5FD_t *ret_value = NULL;
FUNC_ENTER_STATIC
@@ -752,17 +752,17 @@ H5_GCC_DIAG_ON(format-nonliteral)
/*-------------------------------------------------------------------------
- * Function: H5FD__family_close
+ * Function: H5FD__family_close
*
- * Purpose: Closes a family of files.
+ * Purpose: Closes a family of files.
*
- * Return: Success: Non-negative
+ * Return: Success: Non-negative
*
- * Failure: Negative with as many members closed as
- * possible. The only subsequent operation
- * permitted on the file is a close operation.
+ * Failure: Negative with as many members closed as
+ * possible. The only subsequent operation
+ * permitted on the file is a close operation.
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
@@ -771,8 +771,8 @@ static herr_t
H5FD__family_close(H5FD_t *_file)
{
H5FD_family_t *file = (H5FD_family_t*)_file;
- unsigned nerrors = 0; /* Number of errors while closing member files */
- unsigned u; /* Local index variable */
+ unsigned nerrors = 0; /* Number of errors while closing member files */
+ unsigned u; /* Local index variable */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_STATIC
@@ -804,17 +804,17 @@ H5FD__family_close(H5FD_t *_file)
/*-------------------------------------------------------------------------
- * Function: H5FD__family_cmp
+ * Function: H5FD__family_cmp
*
- * Purpose: Compares two file families to see if they are the same. It
- * does this by comparing the first member of the two families.
+ * Purpose: Compares two file families to see if they are the same. It
+ * does this by comparing the first member of the two families.
*
- * Return: Success: like strcmp()
+ * Return: Success: like strcmp()
*
- * Failure: never fails (arguments were checked by the
- * caller).
+ * Failure: never fails (arguments were checked by the
+ * caller).
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
@@ -822,8 +822,8 @@ H5FD__family_close(H5FD_t *_file)
static int
H5FD__family_cmp(const H5FD_t *_f1, const H5FD_t *_f2)
{
- const H5FD_family_t *f1 = (const H5FD_family_t*)_f1;
- const H5FD_family_t *f2 = (const H5FD_family_t*)_f2;
+ const H5FD_family_t *f1 = (const H5FD_family_t*)_f1;
+ const H5FD_family_t *f2 = (const H5FD_family_t*)_f2;
int ret_value = 0;
FUNC_ENTER_STATIC_NOERR
@@ -838,15 +838,15 @@ H5FD__family_cmp(const H5FD_t *_f1, const H5FD_t *_f2)
/*-------------------------------------------------------------------------
- * Function: H5FD__family_query
+ * Function: H5FD__family_query
*
- * Purpose: Set the flags that this VFL driver is capable of supporting.
+ * Purpose: Set the flags that this VFL driver is capable of supporting.
* (listed in H5FDpublic.h)
*
- * Return: Success: non-negative
- * Failure: negative
+ * Return: Success: non-negative
+ * Failure: negative
*
- * Programmer: Quincey Koziol
+ * Programmer: Quincey Koziol
* Friday, August 25, 2000
*
*-------------------------------------------------------------------------
@@ -854,7 +854,7 @@ H5FD__family_cmp(const H5FD_t *_f1, const H5FD_t *_f2)
static herr_t
H5FD__family_query(const H5FD_t * _file, unsigned long *flags /* out */)
{
- const H5FD_family_t *file = (const H5FD_family_t*)_file; /* Family VFD info */
+ const H5FD_family_t *file = (const H5FD_family_t*)_file; /* Family VFD info */
FUNC_ENTER_STATIC_NOERR
@@ -876,17 +876,17 @@ H5FD__family_query(const H5FD_t * _file, unsigned long *flags /* out */)
/*-------------------------------------------------------------------------
- * Function: H5FD__family_get_eoa
+ * Function: H5FD__family_get_eoa
*
- * Purpose: Returns the end-of-address marker for the file. The EOA
- * marker is the first address past the last byte allocated in
- * the format address space.
+ * Purpose: Returns the end-of-address marker for the file. The EOA
+ * marker is the first address past the last byte allocated in
+ * the format address space.
*
- * Return: Success: The end-of-address-marker
+ * Return: Success: The end-of-address-marker
*
- * Failure: HADDR_UNDEF
+ * Failure: HADDR_UNDEF
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
@@ -894,7 +894,7 @@ H5FD__family_query(const H5FD_t * _file, unsigned long *flags /* out */)
static haddr_t
H5FD__family_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
{
- const H5FD_family_t *file = (const H5FD_family_t*)_file;
+ const H5FD_family_t *file = (const H5FD_family_t*)_file;
FUNC_ENTER_STATIC_NOERR
@@ -903,15 +903,15 @@ H5FD__family_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
/*-------------------------------------------------------------------------
- * Function: H5FD__family_set_eoa
+ * Function: H5FD__family_set_eoa
*
- * Purpose: Set the end-of-address marker for the file.
+ * Purpose: Set the end-of-address marker for the file.
*
- * Return: Success: 0
+ * Return: Success: 0
*
- * Failure: -1
+ * Failure: -1
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
@@ -926,10 +926,10 @@ H5_GCC_DIAG_OFF(format-nonliteral)
static herr_t
H5FD__family_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t abs_eoa)
{
- H5FD_family_t *file = (H5FD_family_t*)_file;
- haddr_t addr = abs_eoa;
- char *memb_name = NULL;
- unsigned u; /* Local index variable */
+ H5FD_family_t *file = (H5FD_family_t*)_file;
+ haddr_t addr = abs_eoa;
+ char *memb_name = NULL;
+ unsigned u; /* Local index variable */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_STATIC
@@ -993,18 +993,18 @@ H5_GCC_DIAG_ON(format-nonliteral)
/*-------------------------------------------------------------------------
- * Function: H5FD__family_get_eof
+ * Function: H5FD__family_get_eof
*
- * Purpose: Returns the end-of-file marker, which is the greater of
- * either the total family size or the current EOA marker.
+ * Purpose: Returns the end-of-file marker, which is the greater of
+ * either the total family size or the current EOA marker.
*
- * Return: Success: End of file address, the first address past
- * the end of the family of files or the current
- * EOA, whichever is larger.
+ * Return: Success: End of file address, the first address past
+ * the end of the family of files or the current
+ * EOA, whichever is larger.
*
- * Failure: HADDR_UNDEF
+ * Failure: HADDR_UNDEF
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
@@ -1012,9 +1012,9 @@ H5_GCC_DIAG_ON(format-nonliteral)
static haddr_t
H5FD__family_get_eof(const H5FD_t *_file, H5FD_mem_t type)
{
- const H5FD_family_t *file = (const H5FD_family_t*)_file;
- haddr_t eof=0;
- int i; /* Local index variable */
+ const H5FD_family_t *file = (const H5FD_family_t*)_file;
+ haddr_t eof=0;
+ int i; /* Local index variable */
haddr_t ret_value = HADDR_UNDEF; /* Return value */
FUNC_ENTER_STATIC_NOERR
@@ -1089,32 +1089,32 @@ done:
/*-------------------------------------------------------------------------
- * Function: H5FD__family_read
+ * Function: H5FD__family_read
*
- * Purpose: Reads SIZE bytes of data from FILE beginning at address ADDR
- * into buffer BUF according to data transfer properties in
- * DXPL_ID.
+ * Purpose: Reads SIZE bytes of data from FILE beginning at address ADDR
+ * into buffer BUF according to data transfer properties in
+ * DXPL_ID.
*
- * Return: Success: Zero. Result is stored in caller-supplied
- * buffer BUF.
+ * Return: Success: Zero. Result is stored in caller-supplied
+ * buffer BUF.
*
- * Failure: -1, contents of buffer BUF are undefined.
+ * Failure: -1, contents of buffer BUF are undefined.
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__family_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
- void *_buf/*out*/)
+ void *_buf/*out*/)
{
- H5FD_family_t *file = (H5FD_family_t*)_file;
- unsigned char *buf = (unsigned char*)_buf;
- haddr_t sub;
- size_t req;
+ H5FD_family_t *file = (H5FD_family_t*)_file;
+ unsigned char *buf = (unsigned char*)_buf;
+ haddr_t sub;
+ size_t req;
hsize_t tempreq;
- unsigned u; /* Local index variable */
+ unsigned u; /* Local index variable */
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value=SUCCEED; /* Return value */
@@ -1133,12 +1133,12 @@ H5FD__family_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, s
sub = addr % file->memb_size;
- /* This check is for mainly for IA32 architecture whose size_t's size
- * is 4 bytes, to prevent overflow when user application is trying to
- * write files bigger than 4GB. */
+ /* This check is mainly for the IA32 architecture, whose size_t's size
+ * is 4 bytes, to prevent overflow when user application is trying to
+ * write files bigger than 4GB. */
tempreq = file->memb_size-sub;
- if(tempreq > SIZET_MAX)
- tempreq = SIZET_MAX;
+ if(tempreq > SIZET_MAX)
+ tempreq = SIZET_MAX;
req = MIN(size, (size_t)tempreq);
HDassert(u<file->nmembs);
@@ -1157,31 +1157,31 @@ done:
/*-------------------------------------------------------------------------
- * Function: H5FD__family_write
+ * Function: H5FD__family_write
*
- * Purpose: Writes SIZE bytes of data to FILE beginning at address ADDR
- * from buffer BUF according to data transfer properties in
- * DXPL_ID.
+ * Purpose: Writes SIZE bytes of data to FILE beginning at address ADDR
+ * from buffer BUF according to data transfer properties in
+ * DXPL_ID.
*
- * Return: Success: Zero
+ * Return: Success: Zero
*
- * Failure: -1
+ * Failure: -1
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__family_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
- const void *_buf)
+ const void *_buf)
{
- H5FD_family_t *file = (H5FD_family_t*)_file;
- const unsigned char *buf = (const unsigned char*)_buf;
- haddr_t sub;
- size_t req;
+ H5FD_family_t *file = (H5FD_family_t*)_file;
+ const unsigned char *buf = (const unsigned char*)_buf;
+ haddr_t sub;
+ size_t req;
hsize_t tempreq;
- unsigned u; /* Local index variable */
+ unsigned u; /* Local index variable */
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value = SUCCEED; /* Return value */
@@ -1204,8 +1204,8 @@ H5FD__family_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
* is 4 bytes, to prevent overflow when user application is trying to
* write files bigger than 4GB. */
tempreq = file->memb_size-sub;
- if(tempreq > SIZET_MAX)
- tempreq = SIZET_MAX;
+ if(tempreq > SIZET_MAX)
+ tempreq = SIZET_MAX;
req = MIN(size, (size_t)tempreq);
HDassert(u<file->nmembs);
@@ -1224,14 +1224,14 @@ done:
/*-------------------------------------------------------------------------
- * Function: H5FD__family_flush
+ * Function: H5FD__family_flush
*
- * Purpose: Flushes all family members.
+ * Purpose: Flushes all family members.
*
- * Return: Success: 0
- * Failure: -1, as many files flushed as possible.
+ * Return: Success: 0
+ * Failure: -1, as many files flushed as possible.
*
- * Programmer: Robb Matzke
+ * Programmer: Robb Matzke
* Wednesday, August 4, 1999
*
*-------------------------------------------------------------------------
@@ -1239,8 +1239,8 @@ done:
static herr_t
H5FD__family_flush(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t closing)
{
- H5FD_family_t *file = (H5FD_family_t*)_file;
- unsigned u, nerrors = 0;
+ H5FD_family_t *file = (H5FD_family_t*)_file;
+ unsigned u, nerrors = 0;
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_STATIC
@@ -1258,15 +1258,15 @@ done:
/*-------------------------------------------------------------------------
- * Function: H5FD__family_truncate
+ * Function: H5FD__family_truncate
*
- * Purpose: Truncates all family members.
+ * Purpose: Truncates all family members.
*
- * Return: Success: 0
+ * Return: Success: 0
*
- * Failure: -1, as many files truncated as possible.
+ * Failure: -1, as many files truncated as possible.
*
- * Programmer: Quincey Koziol
+ * Programmer: Quincey Koziol
* Saturday, February 23, 2008
*
*-------------------------------------------------------------------------
@@ -1274,9 +1274,9 @@ done:
static herr_t
H5FD__family_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t closing)
{
- H5FD_family_t *file = (H5FD_family_t*)_file;
- unsigned u, nerrors = 0;
- herr_t ret_value = SUCCEED; /* Return value */
+ H5FD_family_t *file = (H5FD_family_t*)_file;
+ unsigned u, nerrors = 0;
+ herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_STATIC
@@ -1311,7 +1311,7 @@ H5FD__family_lock(H5FD_t *_file, hbool_t rw)
{
H5FD_family_t *file = (H5FD_family_t *)_file; /* VFD file struct */
unsigned u; /* Local index variable */
- herr_t ret_value = SUCCEED; /* Return value */
+ herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_STATIC
@@ -1354,8 +1354,8 @@ done:
static herr_t
H5FD__family_unlock(H5FD_t *_file)
{
- H5FD_family_t *file = (H5FD_family_t *)_file; /* VFD file struct */
- unsigned u; /* Local index variable */
+ H5FD_family_t *file = (H5FD_family_t *)_file; /* VFD file struct */
+ unsigned u; /* Local index variable */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_STATIC
diff --git a/src/H5FDsplitter.c b/src/H5FDsplitter.c
index 7bf0de8..eac2dbe 100644
--- a/src/H5FDsplitter.c
+++ b/src/H5FDsplitter.c
@@ -156,8 +156,8 @@ static const H5FD_class_t H5FD_splitter_g = {
H5FD__splitter_get_handle, /* get_handle */
H5FD__splitter_read, /* read */
H5FD__splitter_write, /* write */
- NULL, /* read_vector */
- NULL, /* write_vector */
+ NULL, /* read_vector */
+ NULL, /* write_vector */
H5FD__splitter_flush, /* flush */
H5FD__splitter_truncate, /* truncate */
H5FD__splitter_lock, /* lock */
diff --git a/src/H5FDsubfile.c b/src/H5FDsubfile.c
index a467533..2b3d44b 100644
--- a/src/H5FDsubfile.c
+++ b/src/H5FDsubfile.c
@@ -1,273 +1,334 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-#include "H5FDsubfile_public.h"
+/*
+ * Programmer: Richard Warren <Richard.Warren@hdfgroup.org>
+ * Wednesday, July 1, 2020
+ *
+ * Purpose: This is part of a parallel subfiling I/O driver.
+ *
+ */
-#ifdef H5_HAVE_PARALLEL
+#include "H5FDsubfile_public.h"
/***********/
/* Headers */
/***********/
-#include "H5private.h" /* Generic Functions */
-#include "H5CXprivate.h" /* API Contexts */
-#include "H5Dprivate.h" /* Datasets */
-#include "H5Eprivate.h" /* Error handling */
-#include "H5Ipublic.h" /* IDs */
-#include "H5Iprivate.h" /* IDs */
-#include "H5MMprivate.h" /* Memory management */
-#include "H5Pprivate.h" /* Property lists */
-
-/*
+#include "H5CXprivate.h" /* API Contexts */
+#include "H5Dprivate.h" /* Datasets */
+#include "H5Eprivate.h" /* Error handling */
+#include "H5Iprivate.h" /* IDs */
+#include "H5Ipublic.h" /* IDs */
+#include "H5MMprivate.h" /* Memory management */
+#include "H5Pprivate.h" /* Property lists */
+#include "H5private.h" /* Generic Functions */
+
+/*
=========================================
Private functions
-========================================
+=========================================
*/
-static size_t sf_topology_limit = 4;
-static size_t sf_topology_entries = 0;
-static sf_topology_t **sf_topology_cache = NULL;
+/* Modifiable via environment variable */
+static sf_ioc_selection_t sf_default_ioc_selection = SELECT_IOC_ONE_PER_NODE;
-static size_t sf_context_limit = 4;
-static size_t sf_context_entries = 0;
-static subfiling_context_t **sf_context_cache = NULL;
-static hid_t context_id = H5I_INVALID_HID;
-static hid_t topology_id = H5I_INVALID_HID;
+/*
+-----------------------------------------------------------------------------------
+sf_topology_limit -- How many different topologies can be recorded (default =
+4) sf_topology_entries -- The number of topologies that are currently recorded.
+sf_topology_cache -- Storage for the known topologies
+-----------------------------------------------------------------------------------
+*/
+static size_t sf_topology_limit = 4;
+static sf_topology_t *sf_topology_cache = NULL;
+/*
+--------------------------------------------------------------------------
+sf_context_limit -- How many contexts can be recorded (default = 4)
+sf_context_entries -- The number of contexts that are currently recorded.
+sf_context_cache -- Storage for contexts
+--------------------------------------------------------------------------
+*/
+static size_t sf_context_limit = 16;
+static subfiling_context_t *sf_context_cache = NULL;
-static int64_t record_subfiling_object(SF_OBJ_TYPE type, void *obj)
+/*
+-------------------------------------------------------------------------
+ Programmer: Richard Warren <Richard.Warren@hdfgroup.org>
+ Purpose: Return a pointer to the requested storage object.
+ There are only 2 object types: TOPOLOGY or CONTEXT
+ structures. An object_id contains the object type
+ in upper 32 bits and an index value in the lower 32 bits.
+ Storage for an object is allocated as required.
+
+ Topologies are static, i.e. for any one IO Concentrator
+ allocation strategy, the results should always be the
+ same.
+ FIXME: The one exception to this being the 1 IOC per
+ N MPI ranks. The value of N can be changed on a per-file
+ basis, so we need address that at some point.
+
+ Contexts are 1 per open file. If only one file is open
+ at a time, then we will only use a single context cache
+ entry.
+ Errors: returns NULL if input SF_OBJ_TYPE is unrecognized or
+ a memory allocation error.
+
+ Revision History -- Initial implementation
+-------------------------------------------------------------------------
+*/
+void *
+get_subfiling_object(int64_t object_id)
{
- size_t index;
- int64_t obj_reference;
- uint64_t tag;
- switch(type) {
- case SF_TOPOLOGY: {
- if (sf_topology_cache == NULL) {
- sf_topology_cache = (sf_topology_t **)
- calloc(sf_topology_limit, sizeof(sf_topology_t *));
- }
- assert(sf_topology_cache != NULL);
- index = sf_topology_entries++;
- tag = SF_TOPOLOGY;
- obj_reference = (int64_t)((tag << 32) | index);
- sf_topology_cache[index] = obj;
- return obj_reference;
- break;
- }
- case SF_CONTEXT: {
- if (sf_context_cache == NULL) {
- sf_context_cache = (subfiling_context_t **)
- calloc(sf_context_limit, sizeof(subfiling_context_t *));
- }
- assert(sf_context_cache != NULL);
- index = sf_context_entries++;
- tag = SF_CONTEXT;
- obj_reference = (int64_t)((tag << 32) | index);
- sf_context_cache[index] = (subfiling_context_t *)obj;
- return obj_reference;
- break;
- }
- default:
- puts("UNKNOWN Subfiling object type");
- }
-
- return -1;
+ int obj_type = (int) ((object_id >> 32) & 0x0FFFF);
+ /* We don't require a large indexing space
+ * 16 bits should be enough..
+ */
+ size_t index = (object_id & 0x0FFFF);
+ if (obj_type == SF_TOPOLOGY) {
+ if (sf_topology_cache == NULL) {
+ sf_topology_cache = (sf_topology_t *) calloc(
+ sf_topology_limit, sizeof(sf_topology_t));
+ assert(sf_topology_cache != NULL);
+ }
+ if (index < sf_topology_limit) {
+ return (void *) &sf_topology_cache[index];
+ } else {
+ puts("Illegal toplogy object index");
+ }
+ } else if (obj_type == SF_CONTEXT) {
+ if (sf_context_cache == NULL) {
+ sf_context_cache = (subfiling_context_t *) calloc(
+ sf_context_limit, sizeof(subfiling_context_t));
+ assert(sf_context_cache != NULL);
+ }
+ if (index == sf_context_limit) {
+ sf_context_limit *= 2;
+ sf_context_cache = (subfiling_context_t *) realloc(sf_context_cache,
+ sf_context_limit * sizeof(subfiling_context_t));
+ assert(sf_context_cache != NULL);
+ } else {
+ return (void *) &sf_context_cache[index];
+ }
+ } else {
+ printf(
+ "get_subfiling_object: UNKNOWN Subfiling object type id = 0x%lx\n",
+ object_id);
+ }
+ return NULL;
}
-/*
-=========================================
+/*
+======================================================
Public vars (for subfiling) and functions
-========================================
+We probably need a function to set and clear this
+======================================================
*/
-
int sf_verbose_flag = 0;
/*
-=========================================
+======================================================
File functions
-=========================================
The pread and pwrite posix functions are described as
-being thread safe. We include mutex locks and unlocks
-to work around any potential threading conflicts...
-Those however, are compiled according #ifdef
+being thread safe.
+======================================================
*/
-
-int sf_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank)
+int
+sf_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size,
+ int subfile_rank)
{
- int ret = 0;
+ int ret = 0;
ssize_t bytes_read;
- ssize_t bytes_remaining = (ssize_t)data_size;
- char *this_buffer = data_buffer;
+ ssize_t bytes_remaining = (ssize_t) data_size;
+ char * this_buffer = data_buffer;
+
+ while (bytes_remaining) {
+ if ((bytes_read = (ssize_t) pread(
+ fd, this_buffer, (size_t) bytes_remaining, file_offset)) < 0) {
- while(bytes_remaining) {
- if ((bytes_read = (ssize_t)pread(fd, this_buffer, (size_t)bytes_remaining, file_offset)) < 0) {
perror("pread failed!");
+ printf("[ioc(%d) %s] pread(fd, buf, bytes_remaining=%ld, "
+ "file_offset =%ld)\n",
+ subfile_rank, __func__, bytes_remaining, file_offset);
fflush(stdout);
- }
- else if (bytes_read > 0) {
- if (sf_verbose_flag) {
- printf("[ioc(%d) %s] read %ld bytes of %ld requested\n",
- subfile_rank, __func__,
- bytes_read, bytes_remaining);
- }
+ return -1;
+ } else if (bytes_read > 0) {
bytes_remaining -= bytes_read;
this_buffer += bytes_read;
file_offset += bytes_read;
- }
- else {
- printf("[ioc(%d) %s] ERROR! read of 0 bytes == eof!\n", subfile_rank, __func__ );
+ } else {
+ printf("[ioc(%d) %s] ERROR! read of 0 bytes == eof!\n",
+ subfile_rank, __func__);
fflush(stdout);
- break;
+ return -2;
}
}
return ret;
}
-int sf_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank)
+int
+sf_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size,
+ int subfile_rank)
{
- int ret = 0;
- char *this_data = (char *)data_buffer;
+ int ret = 0;
+ char * this_data = (char *) data_buffer;
ssize_t bytes_remaining = (ssize_t) data_size;
ssize_t written = 0;
- while(bytes_remaining) {
- if ((written = pwrite(fd, this_data, (size_t)bytes_remaining, file_offset)) < 0) {
+ while (bytes_remaining) {
+ if ((written = pwrite(
+ fd, this_data, (size_t) bytes_remaining, file_offset)) < 0) {
perror("pwrite failed!");
+ printf("[ioc(%d) %s] pwrite(fd, data, bytes_remaining=%ld, "
+ "file_offset =%ld)\n",
+ subfile_rank, __func__, bytes_remaining, file_offset);
fflush(stdout);
- break;
- }
- else {
- if (sf_verbose_flag) {
- printf("[ioc(%d) %s] wrote %ld bytes of %ld requested\n",
- subfile_rank, __func__,
- written, bytes_remaining);
- }
+ return -1;
+ } else {
bytes_remaining -= written;
this_data += written;
file_offset += written;
}
}
+ /* We don't usually use this for each file write. We usually do the file
+ * flush as part of file close operation.
+ */
#ifdef SUBFILE_REQUIRE_FLUSH
fdatasync(fd);
#endif
-
return ret;
}
-
-
-
-void * get_subfiling_object(int64_t object_id)
+/*
+-------------------------------------------------------------------------
+ Programmer: Richard Warren <Richard.Warren@hdfgroup.org>
+ Purpose: Return a character string which represents either the
+ default selection method: SELECT_IOC_ONE_PER_NODE; or
+ if the user has selected a method via the environment
+ variable (H5_IOC_SELECTION_CRITERIA), we return that
+ along with any optional qualifier with for that method.
+
+ Errors: None.
+
+ Revision History -- Initial implementation
+-------------------------------------------------------------------------
+*/
+char *
+get_ioc_selection_criteria(sf_ioc_selection_t *selection)
{
- int obj_type = (int)((object_id >> 32) & 0x0FFFF);
- /* We don't require a large indexing space
- * 16 bits should be enough..
- */
- size_t index = (object_id & 0x0FFFF);
- if (obj_type == SF_TOPOLOGY) {
- if (index < sf_context_entries) {
- return (void *)sf_topology_cache[index];
- }
- else {
- puts("Illegal object index");
- }
- }
- else if (obj_type == SF_CONTEXT) {
- if (index < sf_context_entries) {
- return (void *)sf_context_cache[index];
- }
- else {
- puts("Illegal object index");
- }
- }
- else {
- puts("UNKNOWN Subfiling object type");
- }
- return NULL;
+ char *optValue = NULL;
+ char *envValue = HDgetenv("H5_IOC_SELECTION_CRITERIA");
+
+ /* For non-default options, the environment variable
+ * should have the following form: integer:[integer|string]
+ * In particular, EveryNthRank == 1:64 or every 64 ranks assign an IOC
+ * or WithConfig == 2:/<full_path_to_config_file>
+ */
+ if (envValue && (optValue = strchr(envValue, ':'))) {
+ *optValue++ = 0;
+ }
+ if (envValue) {
+ int checkValue = atoi(envValue);
+ if ((checkValue < 0) || (checkValue >= ioc_selection_options)) {
+ *selection = sf_default_ioc_selection;
+ return NULL;
+ } else {
+ *selection = (sf_ioc_selection_t) checkValue;
+ return optValue;
+ }
+ }
+ *selection = sf_default_ioc_selection;
+ return NULL;
}
+/*
+-------------------------------------------------------------------------
+ Programmer: Richard Warren <Richard.Warren@hdfgroup.org>
+ Purpose: Called as part of a file open operation, we initialize a
+ subfiling context which includes the application topology
+ along with other relevant info such as the MPI objects
+ (communicators) for communicating with IO concentrators.
+ We also identify which MPI ranks will have IOC threads
+ started on them.
+
+ We return a context ID via the 'sf_context' variable.
+
+ Errors: returns an error if we detect any initialization errors,
+ including malloc failures or any resource allocation
+ problems.
+
+ Revision History -- Initial implementation
+-------------------------------------------------------------------------
+*/
herr_t
-H5FDsubfiling_init(void)
+H5FDsubfiling_init(sf_ioc_selection_t ioc_select_method, char *ioc_select_option,
+ int64_t *sf_context)
{
- herr_t ret_value = SUCCEED;
- int ioc_count;
- int world_rank, world_size;
- sf_topology_t *thisApp = NULL;
- subfiling_context_t *newContext = NULL;
-
- FUNC_ENTER_API(FAIL)
- H5TRACE0("e","");
-
- if (MPI_Comm_size(MPI_COMM_WORLD, &world_size) != MPI_SUCCESS) {
- puts("MPI_Comm_size returned an error");
- ret_value = FAIL;
- goto done;
- }
- if (MPI_Comm_rank(MPI_COMM_WORLD, &world_rank) != MPI_SUCCESS) {
- puts("MPI_Comm_rank returned an error");
- ret_value = FAIL;
- goto done;
- }
- if ((ioc_count = H5FD__determine_ioc_count (world_size, world_rank, &thisApp)) > 0) {
- topology_id = (hid_t)record_subfiling_object(SF_TOPOLOGY, thisApp);
- }
- if (topology_id < 0) {
- puts("Unable to register subfiling topology!");
- ret_value = FAIL;
- goto done;
- }
- if (H5FD__init_subfile_context(&newContext, ioc_count, world_size, world_rank, thisApp->rank_is_ioc) != SUCCEED) {
- puts("Unable to initialize a subfiling context!");
- ret_value = FAIL;
- goto done;
- }
- context_id = (hid_t)record_subfiling_object(SF_CONTEXT, newContext);
- if (context_id < 0) {
- ret_value = FAIL;
- puts("Unable to register subfiling context!");
- }
-
-done:
- FUNC_LEAVE_API(ret_value)
+ herr_t ret_value = SUCCEED;
+ int ioc_count;
+ int world_rank, world_size;
+ sf_topology_t * thisApp = NULL;
+ int active_file_maps = active_map_entries();
+ int64_t tag = SF_CONTEXT;
+ int64_t context_id = ((tag << 32) | active_file_maps);
+ subfiling_context_t *newContext =
+ (subfiling_context_t *) get_subfiling_object(context_id);
+
+ FUNC_ENTER_API(FAIL)
+ H5TRACE3("e", "x*s*IL", ioc_select_method, ioc_select_option, sf_context);
+
+ if (MPI_Comm_size(MPI_COMM_WORLD, &world_size) != MPI_SUCCESS) {
+ puts("MPI_Comm_size returned an error");
+ ret_value = FAIL;
+ goto done;
+ }
+ if (MPI_Comm_rank(MPI_COMM_WORLD, &world_rank) != MPI_SUCCESS) {
+ puts("MPI_Comm_rank returned an error");
+ ret_value = FAIL;
+ goto done;
+ }
- return ret_value;
-}
+ if ((ioc_count = H5FD__determine_ioc_count(world_size, world_rank,
+ ioc_select_method, ioc_select_option, &thisApp)) <= 0) {
+ puts("Unable to register subfiling topology!");
+ ret_value = FAIL;
+ goto done;
+ }
-herr_t
-H5FDsubfiling_finalize(void)
-{
- herr_t ret_value = SUCCEED; /* Return value */
- sf_topology_t *thisApp = NULL;
-
- FUNC_ENTER_API(FAIL)
- H5TRACE0("e","");
-
- /* Shutdown the IO Concentrator threads */
-
- if (topology_id != H5I_INVALID_HID) {
- thisApp = get_subfiling_object(topology_id);
- }
-
- if (thisApp && thisApp->rank_is_ioc) {
- begin_thread_exclusive();
- sf_shutdown_flag = 1;
- end_thread_exclusive();
-
- usleep(100);
-
- wait_for_thread_main();
- }
-
- MPI_Barrier(MPI_COMM_WORLD);
-
- delete_subfiling_context(context_id);
+ newContext->sf_context_id = context_id;
+
+ if (H5FD__init_subfile_context(
+ thisApp, ioc_count, world_rank, newContext) != SUCCEED) {
+ puts("Unable to initialize a subfiling context!");
+ ret_value = FAIL;
+ goto done;
+ }
+
+ if (newContext->topology->rank_is_ioc) {
+ int status = initialize_ioc_threads(newContext);
+ if (status)
+ goto done;
+ }
+
+ if (context_id < 0) {
+ ret_value = FAIL;
+
+ goto done;
+ }
+ *sf_context = context_id;
- FUNC_LEAVE_API(ret_value)
done:
- return ret_value;
-}
+ FUNC_LEAVE_API(ret_value)
-hid_t
-get_subfiling_context(void)
-{
- return context_id;
+ return ret_value;
}
-
-#endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5FDsubfile.h b/src/H5FDsubfile.h
deleted file mode 100644
index e69de29..0000000
--- a/src/H5FDsubfile.h
+++ /dev/null
diff --git a/src/H5FDsubfile_mpi.c b/src/H5FDsubfile_mpi.c
index fda4928..57add71 100644
--- a/src/H5FDsubfile_mpi.c
+++ b/src/H5FDsubfile_mpi.c
@@ -1,52 +1,40 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
#include "H5FDsubfile_private.h"
-static int *io_concentrator = NULL;
-static int n_io_concentrators = -1;
static int sf_world_rank = -1;
static int sf_world_size = -1;
-static int subfile_fid = -1;
-static int64_t sf_stripe_size = -1;
-static int64_t sf_blocksize_per_stripe = 0;
-
-static MPI_Datatype H5FD__create_f_l_mpi_type(subfiling_context_t *context,
- int64_t target_write_bytes,
- int64_t first_write,
- int64_t last_write,
- int ioc_depth);
-static MPI_Datatype H5FD__create_first_mpi_type(subfiling_context_t *context,
- int64_t offset,
- int64_t target_write_bytes,
- int64_t first_write,
- int ioc_depth);
-static MPI_Datatype H5FD__create_final_mpi_type(subfiling_context_t *context,
- int64_t target_write_bytes,
- int64_t last_write,
- int ioc_depth);
-static MPI_Datatype H5FD__create_mpi_uniform_type(subfiling_context_t *context,
- int64_t offset,
- int64_t target_write_bytes,
- int ioc_depth);
-
-static int * request_count_per_rank = NULL;
+static int sf_open_file_count = 0;
+static int sf_close_file_count = 0;
+static int sf_ops_after_first_close = 0;
+
+static int *request_count_per_rank = NULL;
atomic_int sf_workinprogress = 0;
atomic_int sf_work_pending = 0;
atomic_int sf_file_close_count = 0;
atomic_int sf_file_refcount = 0;
+atomic_int sf_ioc_fini_refcount = 0;
-#ifdef DEBUG_TRACING
+#ifndef NDEBUG
FILE *sf_logfile = NULL;
#endif
-MPI_Comm sf_msg_comm = MPI_COMM_NULL; /* Messages IN */
-MPI_Comm sf_data_comm = MPI_COMM_NULL; /* Messages OUT */
-
int sf_shutdown_flag = 0;
const char *sf_subfile_prefix = ".";
-
#define MAX_WORK_PER_RANK 2
/*
@@ -55,261 +43,148 @@ Private functions
=========================================
*/
-static int _determine_subfile_rank(int myrank)
-{
- if (io_concentrator) {
- int i;
- for(i=0; i< n_io_concentrators; i++) {
- if (io_concentrator[i] == myrank)
- return i;
- }
- }
- return -1;
-}
-
-static int is_io_concentrator(int rank)
-{
- int index = _determine_subfile_rank(rank);
- if (index < 0) return 0;
- return 1; /* true */
-}
-
-
-
-static void init_io_vars(int64_t stripe_size, int64_t blocksize_per_stripe,
- int64_t file_offset, int64_t data_extent,
- int64_t *first_io, int64_t *first_io_offset, int64_t *last_io,
- int *starting_ioc, int *final_ioc, int *starting_row, int *final_row)
-{
- int64_t total_stripe_width = stripe_size * n_io_concentrators;
- int64_t starting_offset = file_offset % stripe_size;
- int64_t final_offset = (file_offset + data_extent -1);
- int64_t last_io_check = (starting_offset + data_extent) % stripe_size;
- *starting_row = (int)(file_offset / total_stripe_width);
- *final_row = (int)(final_offset / total_stripe_width);
-
- /* Maybe update how many bytes in the entire IOC collection */
- if (blocksize_per_stripe == 0)
- sf_blocksize_per_stripe = total_stripe_width;
-
- *starting_ioc = (int)((file_offset / stripe_size) % n_io_concentrators);
- *final_ioc = (int)((final_offset / stripe_size) % n_io_concentrators);
- *first_io_offset = starting_offset;
- *first_io = ((stripe_size - starting_offset) >= data_extent ? data_extent : (stripe_size - starting_offset));
- /* Check for just a single IO op */
- if (*first_io == data_extent) *last_io = 0;
- else *last_io = (last_io_check > 0 ? last_io_check : stripe_size);
-}
-
-static int init__indep_io(subfiling_context_t *sf_context,
- int64_t **source_data_offset, int64_t **sf_datasize,
- int64_t **sf_offset, MPI_Datatype **sf_dtype,
- int64_t offset, int64_t elements, int dtype_extent)
-{
- int64_t data_extent = elements * dtype_extent;
- int64_t first_io=0, last_io=0, first_io_offset=0;
-
- int64_t *data_offset = *source_data_offset;
- int64_t *ioc_datasize = *sf_datasize;
- int64_t *ioc_offset = *sf_offset;
- MPI_Datatype *ioc_type = *sf_dtype;
- int k, ioc_start, ioc_last, ioc_depth, starting_row, final_row;
- sf_stripe_size = sf_context->sf_stripe_size;
- sf_blocksize_per_stripe = sf_context->sf_blocksize_per_stripe;
-
- init_io_vars(sf_stripe_size, sf_blocksize_per_stripe, offset, data_extent,
- &first_io, &first_io_offset, &last_io,
- &ioc_start, &ioc_last, &starting_row, &final_row);
-
- if (sf_verbose_flag) {
- printf("[%d] offset=%ld,data_extent=%ld,sf_stripe_size=%ld,n_io_concentrators=%d,"
- "first_io=%ld,first_io_offset=%ld,last_io=%ld,ioc_start=%d,ioc_last=%d\n",
- sf_world_rank, offset,data_extent,sf_stripe_size,n_io_concentrators,
- first_io,first_io_offset,last_io,ioc_start,ioc_last);
- fflush(stdout);
- }
-
- if (data_offset == NULL) {
- data_offset = (int64_t *)calloc((size_t)n_io_concentrators, sizeof(int64_t));
- assert(data_offset != NULL);
- *source_data_offset = data_offset;
- }
-
- if (ioc_datasize == NULL) {
- ioc_datasize = (int64_t *)calloc((size_t)n_io_concentrators, sizeof(int64_t));
- assert(ioc_datasize != NULL);
- *sf_datasize = ioc_datasize;
- }
-
- if (ioc_offset == NULL) {
- ioc_offset = (int64_t *)calloc((size_t)n_io_concentrators, sizeof(int64_t));
- assert(ioc_offset != NULL);
- *sf_offset = ioc_offset;
- }
-
- if (ioc_type == NULL) {
- ioc_type = (MPI_Datatype *)calloc((size_t)n_io_concentrators, sizeof(MPI_Datatype));
- assert(ioc_type != NULL);
- *sf_dtype = ioc_type;
- }
-
- for(k=0; k < n_io_concentrators; k++) {
- ioc_datasize[k] = 0;
- ioc_offset[k] = 0;
- /* Free previously used datatypes */
- if (ioc_type[k] &&
- (ioc_type[k] != MPI_DATATYPE_NULL) &&
- (ioc_type[k] != MPI_BYTE))
- MPI_Type_free(&ioc_type[k]);
- else ioc_type[k] = MPI_DATATYPE_NULL;
- }
-
- if (data_extent) {
- int next_index = ioc_start;
- int64_t target_bytes;
- int64_t total_bytes_remaining = data_extent;
- int64_t row_base = starting_row * sf_stripe_size;
- int64_t subfile_offset = row_base + first_io_offset;
- int64_t source_offset = 0;
- int64_t remaining_bytes_in_row = ((n_io_concentrators - ioc_start) * sf_stripe_size) - first_io_offset;
-
- ioc_depth = (final_row - starting_row) +1;
- if ((ioc_start > ioc_last) && (data_extent > remaining_bytes_in_row)) ioc_depth--;
-
- while(total_bytes_remaining > 0) {
- target_bytes = 0;
- if (next_index == ioc_start) {
- target_bytes = first_io;
- }
- if (next_index == ioc_last) {
- target_bytes += last_io;
- ioc_depth--;
- }
- if (ioc_depth) {
- if (next_index == ioc_start)
- target_bytes += (sf_stripe_size * (ioc_depth -1));
- else target_bytes += (sf_stripe_size * ioc_depth);
- }
-
- data_offset[next_index] = source_offset;
- ioc_datasize[next_index] += target_bytes;
- ioc_offset[next_index] += subfile_offset;
- total_bytes_remaining -= target_bytes;
- /*
- * With the exception of the very 1st IO, all additional
- * IO operations start on a slice_boundary (and this is
- * consistent across the collection of IOCs).
- */
-
- subfile_offset = row_base;
-
- /*
- * Possibly Create an MPI datatype for each MPI_Send operation.
- * If the length allows writing into a single stripe on
- * a single IOC, then we can use the MPI_BYTE datatype.
- */
-
-
- if (next_index == ioc_start) { /* First target */
- if (next_index == ioc_last) {
- ioc_type[next_index] =
- H5FD__create_f_l_mpi_type(sf_context, target_bytes,
- first_io, last_io, ioc_depth+1);
- } else {
- ioc_type[next_index] =
- H5FD__create_first_mpi_type(sf_context, ioc_offset[next_index],
- target_bytes, first_io, ioc_depth);
- }
- source_offset += first_io;
- }
- else {
- if (next_index == ioc_last) {
- ioc_type[next_index] =
- H5FD__create_final_mpi_type(sf_context,
- target_bytes, last_io, ioc_depth+1);
- } else {
- ioc_type[next_index] =
- H5FD__create_mpi_uniform_type(sf_context,ioc_offset[next_index],
- target_bytes, ioc_depth);
- }
- source_offset += sf_stripe_size;
- }
-
- if (++next_index == n_io_concentrators) {
- next_index = 0;
- row_base += sf_stripe_size;
- subfile_offset = row_base;
- }
- }
- }
- return 0;
-}
-
+/*
+ * ---------------------------------------------------
+ * Topology discovery related functions for choosing
+ * IO Concentrator (IOC) ranks.
+ * Currently, the default approach for assigning an IOC
+ * is select the lowest MPI rank on each node.
+ *
+ * The approach collectively generates N tuples
+ * consisting of the MPI rank and hostid. This
+ * collection is then sorted by hostid and scanned
+ * to identify the IOC ranks.
+ *
+ * As time permits, addition assignment methods will
+ * be implemented, e.g. 1-per-Nranks or via a config
+ * option. Additional selection methodologies can
+ * be included as users get more experience using the
+ * subfiling implementation.
+ * ---------------------------------------------------
+ */
-static int compare_hostid(const void *h1, const void *h2)
+/*-------------------------------------------------------------------------
+ * Function: compare_hostid
+ *
+ * Purpose: qsort sorting function.
+ * Compares tuples of 'layout_t'. The sorting is based on
+ * the long hostid values.
+ *
+ * Return: result of: (hostid1 > hostid2)
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+compare_hostid(const void *h1, const void *h2)
{
- const layout_t *host1 = (const layout_t *)h1;
- const layout_t *host2 = (const layout_t *)h2;
+ const layout_t *host1 = (const layout_t *) h1;
+ const layout_t *host2 = (const layout_t *) h2;
return (host1->hostid > host2->hostid);
}
-
-static void gather_topology_info(sf_topology_t *info)
+/*-------------------------------------------------------------------------
+ * Function: gather_topology_info
+ *
+ * Purpose: Collectively generate a sorted collection of hostid+mpi_rank
+ * tuples. The result is returned in the 'topology' field
+ * of the sf_topology_t structure.
+ *
+ * Return: Sorted array of hostid/mpi_rank tuples.
+ * Errors: MPI_Abort if memory cannot be allocated.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static void
+gather_topology_info(sf_topology_t *info)
{
- sf_world_size = info->world_size;
- sf_world_rank = info->world_rank;
+ sf_world_size = info->world_size;
+ sf_world_rank = info->world_rank;
- if (info->topology)
- return;
+ if (info->layout)
+ return;
if (sf_world_size) {
- long hostid = gethostid();
- layout_t my_hostinfo;
- layout_t *topology = (layout_t *)calloc((size_t)sf_world_size+1, sizeof(layout_t));
- if (topology == NULL) {
+ long hostid = gethostid();
+ layout_t my_hostinfo;
+ layout_t *layout =
+ (layout_t *) calloc((size_t) sf_world_size + 1, sizeof(layout_t));
+ if (layout == NULL) {
perror("calloc failure!");
MPI_Abort(MPI_COMM_WORLD, 1);
}
- info->hostid = hostid;
- info->topology = topology;
- my_hostinfo.rank = sf_world_rank;
- my_hostinfo.hostid = hostid;
- info->topology[sf_world_rank] = my_hostinfo;
- if (sf_world_size > 1) {
- if (MPI_Allgather(&my_hostinfo, 2, MPI_LONG,
- info->topology, 2, MPI_LONG,
- MPI_COMM_WORLD) == MPI_SUCCESS) {
- qsort(info->topology, (size_t)sf_world_size, sizeof(layout_t), compare_hostid);
- }
- }
+ info->hostid = hostid;
+ info->layout = layout;
+ my_hostinfo.rank = sf_world_rank;
+ my_hostinfo.hostid = hostid;
+ info->layout[sf_world_rank] = my_hostinfo;
+ if (sf_world_size > 1) {
+ if (MPI_Allgather(&my_hostinfo, 2, MPI_LONG, info->layout, 2,
+ MPI_LONG, MPI_COMM_WORLD) == MPI_SUCCESS) {
+ qsort(info->layout, (size_t) sf_world_size, sizeof(layout_t),
+ compare_hostid);
+ }
+ }
}
}
-static int count_nodes(sf_topology_t *info)
+/*-------------------------------------------------------------------------
+ * Function: count_nodes
+ *
+ * Purpose: Initializes the sorted collection of hostid+mpi_rank
+ * tuples. After initialization, the collection is scanned
+ * to determine the number of unique hostid entries. This
+ * value will determine the number of actual IO concentrators
+ * that available to the application. A side effect is to
+ * identify the 'node_index' of the current process.
+ *
+ * Return: The number of unique hostid's (nodes).
+ * Errors: MPI_Abort if memory cannot be allocated.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+count_nodes(sf_topology_t *info)
{
- int k, node_count, hostid_index = -1;
+ int k, node_count, hostid_index = -1;
long nextid;
- assert(info != NULL);
- if (info->topology == NULL)
- gather_topology_info (info);
+ assert(info != NULL);
+ if (info->layout == NULL)
+ gather_topology_info(info);
- nextid = info->topology[0].hostid;
- info->node_ranks = (int *)calloc((size_t)(info->world_size+1), sizeof(int));
- assert(info->node_ranks != NULL);
+ nextid = info->layout[0].hostid;
+ info->node_ranks =
+ (int *) calloc((size_t)(info->world_size + 1), sizeof(int));
+ assert(info->node_ranks != NULL);
if (nextid == info->hostid)
- hostid_index = 0;
+ hostid_index = 0;
node_count = 1;
- /* Recall that the topology array has been sorted! */
- for (k=1; k < info->world_size; k++) {
- if (info->topology[k].hostid != nextid) {
- nextid = info->topology[k].hostid;
+ /* Recall that the topology array has been sorted! */
+ for (k = 1; k < info->world_size; k++) {
+ if (info->layout[k].hostid != nextid) {
+ nextid = info->layout[k].hostid;
if (hostid_index < 0) {
- if (nextid == info->hostid) hostid_index = k;
+ if (nextid == info->hostid)
+ hostid_index = k;
}
- /* Record the index of new hostid */
+ /* Record the index of new hostid */
info->node_ranks[node_count++] = k;
}
}
@@ -321,524 +196,1028 @@ static int count_nodes(sf_topology_t *info)
return info->node_count = node_count;
}
+/*-------------------------------------------------------------------------
+ * Function: H5FD__determine_ioc_count
+ *
+ * Purpose: Once a sorted collection of hostid/mpi_rank tuples has been
+ * created and the number of unique hostids (nodes) has
+ * been determined, we may modify this "default" value for
+ * the number of IO Concentrators for this application.
+ *
+ * The default of one(1) IO concentrator per node can be
+ * changed (principally for testing) by environment variable.
+ * if IOC_COUNT_PER_NODE is defined, then that integer value
+ * is utilized as a multiplier to modify the set of
+ * IO Concentrator ranks.
+ *
+ * The cached results will be replicated within the
+ * subfiling_context_t structure and is utilized as a map from
+ * io concentrator rank to MPI communicator rank for message
+ * sends and receives.
+ *
+ * Return: The number of IO Concentrator ranks. We also cache
+ * the MPI ranks in the 'io_concentrator' vector variable.
+ * The length of this vector is cached as 'n_io_concentrators'.
+ * Errors: MPI_Abort if memory cannot be allocated.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: - Initial Version/None.
+ * - Updated the API to allow a variety of methods for
+ * determining the number and MPI ranks that will have
+ * IO Concentrators. The default approach will define
+ * a single IOC per node.
+ *
+ *-------------------------------------------------------------------------
+ */
int
-H5FD__determine_ioc_count(int world_size, int world_rank, sf_topology_t **thisapp)
+H5FD__determine_ioc_count(int world_size, int world_rank,
+ sf_ioc_selection_t ioc_select_method, char *ioc_select_option,
+ sf_topology_t **thisapp)
{
- static int ioc_count = 0;
- if (!ioc_count) {
- int k, node;
- int node_index;
- int iocs_per_node = 1;
- char *envValue = NULL;
- sf_topology_t *app_topology = (sf_topology_t *)malloc(sizeof(sf_topology_t));
- assert(app_topology != NULL);
- memset(app_topology, 0, sizeof(sf_topology_t));
- app_topology->world_size = world_size;
- app_topology->world_rank = world_rank;
-
- io_concentrator = (int *)malloc(((size_t)world_size * sizeof(int)));
- assert(io_concentrator != NULL);
- ioc_count = count_nodes (app_topology);
- /* FIXME: This should ONLY be used for testing!
- * For production, we should probably limit the
- * number to a single IOC per node...
- * (based on performance numbers)
- */
- if ((envValue = getenv("IOC_COUNT_PER_NODE")) != NULL) {
- int value_check = atoi(envValue);
- if (value_check > 0) {
- iocs_per_node = value_check;
- }
- }
-
- /* 'node_ranks' contain the index of the first instance of a hostid
- * in the sorted sf_topology array. Our own index is 'node_index'.
- */
- node_index = app_topology->node_index;
- app_topology->local_peers = app_topology->node_ranks[node_index+1] -
- app_topology->node_ranks[node_index];
- if (app_topology->topology[node_index].rank == world_rank) {
- app_topology->rank_is_ioc = true;
- app_topology->subfile_rank = node_index;
- }
- /* FIXME: This should ONLY be used for testing!
- * NOTE: The app_topology->local_peers is ONLY valid
- * for the current NODE. There is no guarantee that
- * the application layout defines a uniform number of
- * MPI ranks per node...
- * Because this is only for testing purposes (at this time)
- * we can live with the assumption that if we define the
- * IOC_COUNT_PER_NODE environment variable, then each
- * node will have *at-least* that many MPI ranks assigned.
- * See above!
- */
- else if ((app_topology->local_peers > 1) && (iocs_per_node > 1)) {
- if (iocs_per_node > app_topology->local_peers)
- iocs_per_node = app_topology->local_peers;
- for(k=1; k< iocs_per_node; k++) {
- if (app_topology->topology[node_index + k].rank == world_rank) {
- app_topology->rank_is_ioc = true;
- app_topology->subfile_rank = node_index + k;
- break;
- }
- }
- }
- /* More hacks for testing */
- if (io_concentrator) {
- int n_iocs = 0;
- for(node = 0; node < ioc_count; node++) {
- for (k=0; k < iocs_per_node; k++) {
- node_index = app_topology->node_ranks[node];
- io_concentrator[n_iocs++] = (int)(
- app_topology->topology[node_index + k].rank);
- }
- }
- ioc_count = n_io_concentrators = n_iocs;
+ static int ioc_count = 0;
+ static int64_t topology_id = 0;
+ static sf_ioc_selection_t ioc_selection = ioc_selection_options;
+ sf_topology_t * app_topology = NULL;
+
+ assert(thisapp != NULL);
+
+ if (!ioc_count || (ioc_selection != ioc_select_method)) {
+ int k, node;
+ int node_index;
+ int iocs_per_node = 1;
+ char * envValue = NULL;
+ int * io_concentrator = NULL;
+ int index = (int) ioc_select_method;
+ int64_t tag = (int64_t) SF_TOPOLOGY;
+ topology_id = (int64_t)((tag << 32) | index);
+
+ app_topology = (sf_topology_t *) get_subfiling_object(topology_id);
+ assert(app_topology != NULL);
+ app_topology->world_size = world_size;
+ app_topology->world_rank = world_rank;
+ if (app_topology->io_concentrator == NULL) {
+ app_topology->io_concentrator = io_concentrator =
+ (int *) malloc(((size_t) world_size * sizeof(int)));
}
+ assert(io_concentrator != NULL);
+ app_topology->selection_type = ioc_selection = ioc_select_method;
+
+ if (ioc_select_method == SELECT_IOC_ONE_PER_NODE) {
+ ioc_count = count_nodes(app_topology);
+ /* FIXME: This should ONLY be used for testing!
+ * For production, we should probably limit the
+ * number to a single IOC per node...
+ * (based on performance numbers)
+ */
+ if ((envValue = getenv("IOC_COUNT_PER_NODE")) != NULL) {
+ int value_check = atoi(envValue);
+ if (value_check > 0) {
+ iocs_per_node = value_check;
+ }
+ }
- if (ioc_count > 0) {
- *thisapp = app_topology;
- }
+ /* 'node_ranks' contain the index of the first instance of a hostid
+ * in the sorted sf_topology array. Our own index is 'node_index'.
+ */
+ node_index = app_topology->node_index;
+ app_topology->local_peers =
+ app_topology->node_ranks[node_index + 1] -
+ app_topology->node_ranks[node_index];
+ if (app_topology->layout[node_index].rank == world_rank) {
+ app_topology->rank_is_ioc = true;
+ app_topology->subfile_rank = node_index;
+ }
+ /* FIXME: This should ONLY be used for testing!
+ * NOTE: The app_topology->local_peers is ONLY valid
+ * for the current NODE. There is no guarantee that
+ * the application layout defines a uniform number of
+ * MPI ranks per node...
+ * Because this is only for testing purposes (at this time)
+ * we can live with the assumption that if we define the
+ * IOC_COUNT_PER_NODE environment variable, then each
+ * node will have *at-least* that many MPI ranks assigned.
+ * See above!
+ */
+ else if ((app_topology->local_peers > 1) && (iocs_per_node > 1)) {
+ if (iocs_per_node > app_topology->local_peers)
+ iocs_per_node = app_topology->local_peers;
+ for (k = 1; k < iocs_per_node; k++) {
+ if (app_topology->layout[node_index + k].rank ==
+ world_rank) {
+ app_topology->rank_is_ioc = true;
+ app_topology->subfile_rank = node_index + k;
+ break;
+ }
+ }
+ }
+ /* More hacks for testing */
+ if (io_concentrator) {
+ int n_iocs = 0;
+ for (node = 0; node < ioc_count; node++) {
+ for (k = 0; k < iocs_per_node; k++) {
+ node_index = app_topology->node_ranks[node];
+ io_concentrator[n_iocs++] =
+ (int) (app_topology->layout[node_index + k].rank);
+ }
+ }
+ ioc_count = n_iocs;
+ }
+
+ if (ioc_count > 0) {
+ app_topology->n_io_concentrators = ioc_count;
+ *thisapp = app_topology;
+ // topology_id = (hid_t)record_subfiling_object(SF_TOPOLOGY,
+ // app_topology);
+ }
+ } else {
+ if (world_rank == 0) {
+ printf("[%d - %s] IOC_selection(%d) with option(%s) is not "
+ "supported\n",
+ world_rank, __func__, (int) ioc_select_method,
+ ioc_select_option);
+ }
+ }
+ } else {
+ app_topology = (sf_topology_t *) get_subfiling_object(topology_id);
+ *thisapp = app_topology;
}
return ioc_count;
}
-int
-H5FD__init_subfile_context(subfiling_context_t **newContext, int n_iocs, int world_size, int world_rank, bool rank_is_ioc)
-{
- int status;
- subfiling_context_t *next = (subfiling_context_t *) malloc(sizeof(subfiling_context_t));
- assert(next != NULL);
- memset(next,0, sizeof(subfiling_context_t));
-
- if (io_concentrator == NULL) {
- goto err_exit;
- }
- else {
- int k;
- char *envValue = NULL;
- int ioc_leader = io_concentrator[0];
- int app_leader = 0;
- *newContext = next;
- next->sf_stripe_size = DEFAULT_STRIPE_SIZE;
- if ((envValue = getenv("IOC_STRIPE_SIZE")) != NULL) {
- long value_check = atol(envValue);
- if (value_check > 0) {
- next->sf_stripe_size = (int64_t)value_check;
- }
- }
- if ((envValue = getenv("SUBFILE_PREFIX")) != NULL) {
- char temp[PATH_MAX];
- sprintf(temp,"%s", envValue);
- next->subfile_prefix = strdup(temp);
- sf_subfile_prefix = strdup(temp);
- }
-
- next->sf_blocksize_per_stripe = next->sf_stripe_size * n_iocs;
- status = MPI_Comm_dup(MPI_COMM_WORLD, &next->sf_msg_comm);
- if (status != MPI_SUCCESS) goto err_exit;
- status = MPI_Comm_set_errhandler(next->sf_msg_comm, MPI_ERRORS_RETURN);
- if (status != MPI_SUCCESS) goto err_exit;
- status = MPI_Comm_dup(MPI_COMM_WORLD, &next->sf_data_comm);
- if (status != MPI_SUCCESS) goto err_exit;
- status = MPI_Comm_set_errhandler(next->sf_data_comm, MPI_ERRORS_RETURN);
- if (status != MPI_SUCCESS) goto err_exit;
-
- k = 0;
- while(is_io_concentrator(k))
- k++;
- app_leader = k;
-
- /* Do this now rather than having the ioc thread
- * update the value
- */
- if (rank_is_ioc) {
- sf_stripe_size = next->sf_stripe_size;
- }
-
- if (sf_verbose_flag && (world_rank == 0)) {
- printf("app_leader = %d and ioc_leader = %d\n", app_leader, ioc_leader);
- }
-
- if (n_iocs > 1) {
- status = MPI_Comm_split(MPI_COMM_WORLD, rank_is_ioc, world_rank, &next->sf_group_comm);
- if (status != MPI_SUCCESS) goto err_exit;
- status = MPI_Comm_size(next->sf_group_comm, &next->sf_group_size);
- if (status != MPI_SUCCESS) goto err_exit;
- status = MPI_Comm_rank(next->sf_group_comm, &next->sf_group_rank);
- if (status != MPI_SUCCESS) goto err_exit;
- /*
- * There may be additional functionality we need for the IOCs...
- * If so, then can probably initialize those things here!
- */
- }
- else {
- next->sf_group_comm = MPI_COMM_NULL;
- }
-
- if (rank_is_ioc) {
- status = initialize_ioc_threads(next);
- if (status) goto err_exit;
- }
- }
- return 0;
-
-err_exit:
- if (next) {
- free(next);
- }
- return -1;
-}
-
-
-/*
----------------------------------------------------------------------------------
- The data that we're sending to receiving from an IO concentrator (IOC) contains
- the initial collection of bytes. The length of this initial segment is 'first_write'.
- Note that the terminology isn't significant. We are describing an IO operation in
- terms of an MPI datatype which will either gather data from a source buffer
- to send to an IOC or will be used to unpack data from an IOC into a user buffer.
- Subsequent IO operations which are related to the current File IO will begin on
- sf_stripe_size boundaries.
----------------------------------------------------------------------------------
-*/
-
-static MPI_Datatype H5FD__create_first_mpi_type(
- subfiling_context_t *context, int64_t offset,
- int64_t target_write_bytes, int64_t first_write, int ioc_depth)
+/* ===================================================================== */
+/* MPI_Datatype Creation functions.
+ * These are categorized by usage patterns, i.e. when data is sent to or
+ * received from an IOC, the initial data offset provided by the user
+ * may or may NOT start on a stripe boundary. Because of this, the initial
+ * data segment to the selected IOC will often be less than 'stripe_size'
+ * in length. The purpose of these Datatype creation functions is to
+ * enable the gathering of all data from this client to the IOC target
+ * into a single MPI message. The MPI datatype will then be utilized by
+ * the sending function to pack data into a contiguous block of memory
+ * which enables the IOC to write to disk in an effective manner.
+ * ===================================================================== */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD__create_first_mpi_type
+ *
+ * Purpose: Return an appropriate MPI datatype to represent the initial
+ * IO operation when reading or writing data to or from an IO
+ * Concentrator (IOC).
+ *
+ * If the 'first_io' is sufficient to complete the IO to the
+ * IOC, then the returned MPI datatype will simply be MPI_BYTE.
+ * For all other non-zero length IO operations, we create a
+ * derived MPI datatype using MPI_Type_indexed. The 'ioc_depth'
+ * input will define the number of blocks/disps pairs that are
+ * required to represent the desired IO operation.
+ *
+ * Return: The MPI_Datatype that will be used to send or receive data.
+ * Errors: MPI_DATATYPE_NULL if for any reason, the MPI_Datatype creation
+ * fails.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static MPI_Datatype
+H5FD__create_first_mpi_type(subfiling_context_t *context, int ioc_depth,
+ int64_t offset, int64_t target_write_bytes, int64_t first_io)
{
MPI_Datatype newType = MPI_DATATYPE_NULL;
- int64_t stripe_size = context->sf_stripe_size;
- int64_t offset_in_stripe = offset % sf_stripe_size;
- int64_t depth_in_bytes = sf_stripe_size * ioc_depth;
- int64_t next_offset = context->sf_blocksize_per_stripe - offset_in_stripe;
- int64_t total_bytes = first_write;
-
- assert(ioc_depth > 0);
- if (stripe_size >= depth_in_bytes)
- return MPI_BYTE;
-
- if (depth_in_bytes) {
- int k;
- int temp_blocks[64];
- int temp_disps[64];
+ int64_t stripe_size = context->sf_stripe_size;
+ int64_t blocksize_per_stripe = context->sf_blocksize_per_stripe;
+ int64_t offset_in_stripe = offset % stripe_size;
+ int64_t next_offset = blocksize_per_stripe - offset_in_stripe;
+ int64_t total_bytes = first_io;
+
+ if (first_io == target_write_bytes) {
+ if (first_io > 0) {
+ return MPI_BYTE;
+ }
+ }
+ if (first_io) {
+ int k;
+ int temp_blocks[64];
+ int temp_disps[64];
int *blocks = temp_blocks;
int *disps = temp_disps;
if (ioc_depth > 64) {
- blocks = (int *)calloc((size_t)ioc_depth, sizeof(int));
- disps = (int *)calloc((size_t)ioc_depth, sizeof(int));
- }
- blocks[0] = (int)first_write;
+ blocks = (int *) calloc((size_t) ioc_depth, sizeof(int));
+ if (blocks == NULL) {
+ perror("calloc");
+ return newType;
+ }
+ disps = (int *) calloc((size_t) ioc_depth, sizeof(int));
+ if (disps == NULL) {
+ perror("calloc");
+ return newType;
+ }
+ }
+ blocks[0] = (int) first_io;
disps[0] = (int) 0;
- for(k=1; k < ioc_depth; k++) {
- disps[k] = (int)next_offset;
- blocks[k] = (int)stripe_size;
- total_bytes += stripe_size;
+ for (k = 1; k <= ioc_depth; k++) {
+ disps[k] = (int) next_offset;
+ blocks[k] = (int) stripe_size;
+ total_bytes += stripe_size;
next_offset += context->sf_blocksize_per_stripe;
}
- if (total_bytes != target_write_bytes) {
- printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n",
- __func__, total_bytes, target_write_bytes);
- }
-
- if (MPI_Type_indexed(ioc_depth, blocks, disps, MPI_BYTE, &newType) != MPI_SUCCESS) {
+ if (total_bytes != target_write_bytes) {
+ printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n",
+ __func__, total_bytes, target_write_bytes);
+ }
+
+ if (MPI_Type_indexed(k, blocks, disps, MPI_BYTE, &newType) !=
+ MPI_SUCCESS) {
perror("MPI_Type_indexed failed!");
- return MPI_DATATYPE_NULL;
+ return newType;
}
MPI_Type_commit(&newType);
+ if (1) {
+ int type_size;
+ MPI_Type_size(newType, &type_size);
+ if (type_size != target_write_bytes) {
+ printf("%s: type_size=%d should be: %ld\n", __func__, type_size,
+ target_write_bytes);
+ }
+ }
if (ioc_depth > 64) {
- if (blocks != temp_blocks) {
- free(blocks);
- blocks = NULL;
- }
- if (disps != temp_disps) {
- free(disps);
- disps = NULL;
- }
+ if (blocks != temp_blocks) {
+ free(blocks);
+ blocks = NULL;
+ }
+ if (disps != temp_disps) {
+ free(disps);
+ disps = NULL;
+ }
}
}
return newType;
}
-/*
----------------------------------------------------------------------------------
- The data that we're sending to an IO concentrator (IOC) contains the final
- collection of bytes. Other than that detail, this is pretty much like the
- typical' case... All chunks sizes are the identical (execpt for the very
- last chunk) and all will start at relative stripe offset of 0. More precisely,
- the start offset is a multiple of the subfiling "stripe_size".
- We can utilize MPI_Type_indexed to represent the new type.
----------------------------------------------------------------------------------
-*/
-static MPI_Datatype H5FD__create_final_mpi_type(subfiling_context_t *context, int64_t target_write_bytes, int64_t last_write, int ioc_depth)
+/*-------------------------------------------------------------------------
+ * Function: H5FD__create_final_mpi_type
+ *
+ * Purpose: Return an appropriate MPI datatype to represent the final
+ * IO operation when reading or writing data to or from an IO
+ * Concentrator (IOC).
+ *
+ * The data that we're sending to an IO concentrator (IOC)
+ * contains the final collection of bytes. Other than that detail,
+ * this is pretty much like the typical' IO case, i.e. all block
+ * sizes are identical (execpt for the very last block).
+ *Furthermore, they all start at relative stripe offset of 0, in other words on
+ *a 'stripe_size' boundary.
+ *
+ * Return: The MPI_Datatype that will be used to send or receive data.
+ * Errors: MPI_DATATYPE_NULL if for any reason, the MPI_Datatype creation
+ * fails.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static MPI_Datatype
+H5FD__create_final_mpi_type(subfiling_context_t *context, int ioc_depth,
+ int64_t target_write_bytes, int64_t last_write)
{
MPI_Datatype newType = MPI_DATATYPE_NULL;
- int64_t stripe_size = context->sf_stripe_size;
- int64_t depth_in_bytes = (stripe_size * ioc_depth) + last_write;
- int64_t total_bytes = last_write;
-
- assert(ioc_depth > 0);
+ int64_t stripe_size = context->sf_stripe_size;
+ int64_t depth_in_bytes = (stripe_size * ioc_depth) + last_write;
+ int64_t total_bytes = last_write;
- if (depth_in_bytes <= stripe_size)
- return MPI_BYTE;
+ if (depth_in_bytes == target_write_bytes) {
+ if (depth_in_bytes > 0) {
+ return MPI_BYTE;
+ }
+ }
if (depth_in_bytes) {
- int k;
- int temp_blocks[64];
- int temp_disps[64];
+ int k;
+ int temp_blocks[64];
+ int temp_disps[64];
int *blocks = temp_blocks;
int *disps = temp_disps;
if (ioc_depth > 64) {
- blocks = (int *)calloc((size_t)ioc_depth, sizeof(int));
- disps = (int *)calloc((size_t)ioc_depth, sizeof(int));
- }
-
- for(k=0; k < ioc_depth; k++) {
- disps[k] = (int)(k * context->sf_blocksize_per_stripe);
- blocks[k] = (int)stripe_size;
- total_bytes += stripe_size;
- }
- blocks[k-1] = (int)last_write;
- if (total_bytes != target_write_bytes) {
- printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n",
- __func__, total_bytes, target_write_bytes);
- }
+ blocks = (int *) calloc((size_t) ioc_depth, sizeof(int));
+ if (blocks == NULL) {
+ perror("calloc");
+ return newType;
+ }
+ disps = (int *) calloc((size_t) ioc_depth, sizeof(int));
+ if (disps == NULL) {
+ perror("calloc");
+ return newType;
+ }
+ }
+
+ for (k = 0; k < ioc_depth; k++) {
+ disps[k] = (int) (k * context->sf_blocksize_per_stripe);
+ blocks[k] = (int) stripe_size;
+ total_bytes += stripe_size;
+ }
+ blocks[k - 1] = (int) last_write;
+ if (total_bytes != target_write_bytes) {
+ printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n",
+ __func__, total_bytes, target_write_bytes);
+ }
- if (MPI_Type_indexed(ioc_depth, blocks, disps, MPI_BYTE, &newType) != MPI_SUCCESS) {
+ if (MPI_Type_indexed(ioc_depth, blocks, disps, MPI_BYTE, &newType) !=
+ MPI_SUCCESS) {
return MPI_DATATYPE_NULL;
}
MPI_Type_commit(&newType);
if (ioc_depth > 64) {
- if (blocks != temp_blocks) {
- free(blocks);
- blocks = NULL;
- }
- if (disps != temp_disps) {
- free(disps);
- disps = NULL;
- }
+ if (blocks != temp_blocks) {
+ free(blocks);
+ blocks = NULL;
+ }
+ if (disps != temp_disps) {
+ free(disps);
+ disps = NULL;
+ }
}
}
return newType;
}
-/*
----------------------------------------------------------------------------------
- Special case where the current IOC has both the first and final write chunks.
- This implmentation is a merge of the first_mpi_type and final_mpi_type
- functions.
----------------------------------------------------------------------------------
-*/
-static MPI_Datatype H5FD__create_f_l_mpi_type(subfiling_context_t *context,
- int64_t target_write_bytes,
- int64_t first_write,
- int64_t last_write, int ioc_depth)
+/*-------------------------------------------------------------------------
+ * Function: H5FD__create_f_l_mpi_type
+ *
+ * Purpose: Return an appropriate MPI datatype which includes both the
+ * first and final IO data segments.
+ *
+ * A special case where the current IOC has both the first and
+ * final write blocks. This function is basically a merge of
+ * the first_mpi_type and final_mpi_type functions.
+ *
+ * Return: The MPI_Datatype that will be used to send or receive data.
+ * Errors: MPI_DATATYPE_NULL if for any reason, the MPI_Datatype creation
+ * fails.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static MPI_Datatype
+H5FD__create_f_l_mpi_type(subfiling_context_t *context, int ioc_depth,
+ int64_t offset, int64_t target_write_bytes, int64_t first_write,
+ int64_t last_write)
{
MPI_Datatype newType = MPI_DATATYPE_NULL;
- int64_t stripe_size = context->sf_stripe_size;
- int64_t depth_in_bytes = stripe_size * ioc_depth;
- int64_t offset_in_stripe = stripe_size - first_write;
- int64_t next_offset = context->sf_blocksize_per_stripe - offset_in_stripe;
- int64_t total_bytes = first_write + last_write;
-
- assert(ioc_depth > 0);
- if (last_write == 0) {
- newType = MPI_BYTE;
- }
- else if (depth_in_bytes) {
- int k;
- int temp_blocks[64];
- int temp_disps[64];
+ int64_t stripe_size = context->sf_stripe_size;
+ int64_t blocksize_per_stripe = context->sf_blocksize_per_stripe;
+ int64_t offset_in_stripe = offset % stripe_size;
+ int64_t next_offset = blocksize_per_stripe - offset_in_stripe;
+ int64_t total_bytes = first_write + last_write;
+
+    /* We might actually check that the 'target_write_bytes'
+ * input variable exceeds 2Gb. If so, then we should
+ * always create a derived type.
+ */
+ if ((total_bytes == target_write_bytes) &&
+ (context->topology->n_io_concentrators == 1)) {
+ return MPI_BYTE;
+ } else if (first_write) {
+ int k;
+ int temp_blocks[64];
+ int temp_disps[64];
int *blocks = temp_blocks;
int *disps = temp_disps;
-#if 0
- /* Depth in bytes might be incorrect... How? */
- if (total_bytes < target_write_bytes) {
- int64_t remaining = target_write_bytes - total_bytes;
- ioc_depth = (remaining / stripe_size) +1;
- }
-#endif
if (ioc_depth > 64) {
- blocks = (int *)calloc((size_t)ioc_depth, sizeof(int));
- disps = (int *)calloc((size_t)ioc_depth, sizeof(int));
- }
+ blocks = (int *) calloc((size_t) ioc_depth, sizeof(int));
+ if (blocks == NULL) {
+ perror("calloc");
+ return newType;
+ }
+ disps = (int *) calloc((size_t) ioc_depth, sizeof(int));
+ if (disps == NULL) {
+ perror("calloc");
+ return newType;
+ }
+ }
- blocks[0] = (int)first_write;
+ blocks[0] = (int) first_write;
disps[0] = 0;
- for(k=1; k < ioc_depth; k++) {
- blocks[k] = (int)stripe_size;
- disps[k] = (int)next_offset;
- next_offset += context->sf_blocksize_per_stripe;
- }
- blocks[k-1] = (int)last_write;
- if (ioc_depth > 2) total_bytes += (int64_t)((ioc_depth - 2) * stripe_size);
+ for (k = 1; k < ioc_depth; k++) {
+ blocks[k] = (int) stripe_size;
+ disps[k] = (int) next_offset;
+ next_offset += context->sf_blocksize_per_stripe;
+ total_bytes += stripe_size;
+ }
+ if (k == 1) {
+ disps[k] = (int) next_offset;
+ }
+ blocks[k] = (int) last_write;
- if (total_bytes != target_write_bytes) {
- printf("[%d] Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n",
- sf_world_rank, __func__, total_bytes, target_write_bytes);
- }
+ if (total_bytes != target_write_bytes) {
+ printf("[%d] Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n",
+ sf_world_rank, __func__, total_bytes, target_write_bytes);
+ }
- if (MPI_Type_indexed(ioc_depth, blocks, disps, MPI_BYTE, &newType) != MPI_SUCCESS) {
+ if (MPI_Type_indexed(k + 1, blocks, disps, MPI_BYTE, &newType) !=
+ MPI_SUCCESS) {
perror("MPI_Type_indexed failed!");
return MPI_DATATYPE_NULL;
}
MPI_Type_commit(&newType);
if (ioc_depth > 64) {
- if (blocks != temp_blocks) {
- free(blocks);
- blocks = NULL;
- }
- if (disps != temp_disps) {
- free(disps);
- disps = NULL;
- }
+ if (blocks != temp_blocks) {
+ free(blocks);
+ blocks = NULL;
+ }
+ if (disps != temp_disps) {
+ free(disps);
+ disps = NULL;
+ }
}
}
return newType;
}
-/*
----------------------------------------------------------------------------------
- This is the 'typical' case in which the IOC has neither the first chunck nor
- the last. All chunks sizes are the identical and start at offset = 0.
- We utilize MPI_Type_indexed to represent the new type.
----------------------------------------------------------------------------------
-*/
-MPI_Datatype H5FD__create_mpi_uniform_type(subfiling_context_t *context,
- int64_t offset,
- int64_t target_write_bytes, int ioc_depth)
+/*-------------------------------------------------------------------------
+ * Function: H5FD__create_mpi_uniform_type
+ *
+ * Purpose: Return an appropriate MPI datatype to represent the typical
+ * IO operation when reading or writing data to or from an IO
+ * Concentrator (IOC).
+ *
+ * Each data segment is of 'stripe_size' length and will be
+ * separated from a previous or following segment by
+ * 'sf_blocksize_per_stripe' bytes of data.
+ *
+ * Return: The MPI_Datatype that will be used to send or receive data.
+ * Errors: MPI_DATATYPE_NULL if for any reason, the MPI_Datatype creation
+ * fails.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static MPI_Datatype
+H5FD__create_mpi_uniform_type(
+ subfiling_context_t *context, int ioc_depth, int64_t target_write_bytes)
{
- /* Maintain some state between function calls allow reuse of the new datatypes... */
- static MPI_Datatype uniformType = MPI_DATATYPE_NULL;
- static int64_t depth_in_bytes = 0;
-
MPI_Datatype newType = MPI_DATATYPE_NULL;
- int64_t stripe_size = context->sf_stripe_size;
- int64_t offset_in_stripe = offset % stripe_size;
- int64_t check_depth = stripe_size * ioc_depth;
- int64_t total_bytes = 0;
-
- assert(offset_in_stripe == 0);
- assert(ioc_depth > 0);
+ int64_t stripe_size = context->sf_stripe_size;
+ int64_t check_depth = stripe_size * ioc_depth;
+ int64_t total_bytes = 0;
- if (check_depth == stripe_size)
- return MPI_BYTE;
+ if (check_depth == stripe_size) {
+ if (target_write_bytes > 0) {
+ return MPI_BYTE;
+ }
+ }
- if (depth_in_bytes) {
- if (depth_in_bytes != check_depth) {
- MPI_Type_free(&uniformType);
- depth_in_bytes = 0;
- }
- }
- if (!depth_in_bytes) {
- int k;
- int temp_blocks[64];
- int temp_disps[64];
+ if (target_write_bytes) {
+ int k;
+ int temp_blocks[64];
+ int temp_disps[64];
int *blocks = temp_blocks;
int *disps = temp_disps;
if (ioc_depth > 64) {
- blocks = (int *)calloc((size_t)ioc_depth, sizeof(int));
- disps = (int *)calloc((size_t)ioc_depth, sizeof(int));
- }
- for(k=0; k < ioc_depth; k++) {
- disps[k] = (int)(k * context->sf_blocksize_per_stripe);
- blocks[k] = (int)(stripe_size);
- total_bytes += stripe_size;
- }
-
- if (total_bytes != target_write_bytes) {
- printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n",
- __func__, total_bytes, target_write_bytes);
- }
-
- if (MPI_Type_indexed(ioc_depth, blocks, disps, MPI_BYTE, &uniformType) != MPI_SUCCESS) {
+ blocks = (int *) calloc((size_t) ioc_depth, sizeof(int));
+ if (blocks == NULL) {
+ perror("calloc");
+ return newType;
+ }
+ disps = (int *) calloc((size_t) ioc_depth, sizeof(int));
+ if (disps == NULL) {
+ perror("calloc");
+ return newType;
+ }
+ }
+ for (k = 0; k < ioc_depth; k++) {
+ disps[k] = (int) (k * context->sf_blocksize_per_stripe);
+ blocks[k] = (int) (stripe_size);
+ total_bytes += stripe_size;
+ }
+
+ if (total_bytes != target_write_bytes) {
+ printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n",
+ __func__, total_bytes, target_write_bytes);
+ }
+
+ if (MPI_Type_indexed(ioc_depth, blocks, disps, MPI_BYTE, &newType) !=
+ MPI_SUCCESS) {
perror("MPI_Type_indexed failed!");
return MPI_DATATYPE_NULL;
}
- MPI_Type_commit(&uniformType);
+ MPI_Type_commit(&newType);
+ if (1) {
+ int type_size;
+ MPI_Type_size(newType, &type_size);
+ if (type_size != target_write_bytes) {
+ printf("%s: type_size=%d should be: %ld\n", __func__, type_size,
+ target_write_bytes);
+ }
+ }
+
if (ioc_depth > 64) {
- if (blocks != temp_blocks) {
- free(blocks);
- blocks = NULL;
- }
- if (disps != temp_disps) {
- free(disps);
- disps = NULL;
- }
+ if (blocks != temp_blocks) {
+ free(blocks);
+ blocks = NULL;
+ }
+ if (disps != temp_disps) {
+ free(disps);
+ disps = NULL;
+ }
}
- depth_in_bytes = check_depth;
}
- MPI_Type_dup(uniformType, &newType);
return newType;
}
-
-int sf_read_independent(hid_t context_id, int64_t offset, int64_t elements, int dtype_extent, void *data)
+static file_map_to_context_t *sf_open_file_map = NULL;
+static int sf_file_map_size = 0;
+#define DEFAULT_MAP_ENTRIES 8
+
+/*-------------------------------------------------------------------------
+ * Function: record_fid_to_subfile
+ *
+ * Purpose: Every opened HDF5 file will have (if utilizing subfiling)
+ * a subfiling context associated with it. It is important that
+ * the HDF5 file index is a constant rather than utilizing a
+ * posix file handle since files can be opened multiple times
+ * and with each file open, a new file handle will be assigned.
+ * Note that in such a case, the actual filesystem id will be
+ * retained.
+ *
+ * We utilize that filesystem id (ino_t inode) so that
+ * irrespective of what process opens a common file, the
+ * subfiling system will generate a consistent context for this
+ * file across all parallel ranks.
+ *
+ * This function simply records the filesystem handle to
+ * subfiling context mapping.
+ *
+ * Return: SUCCEED or FAIL.
+ * Errors: FAILs ONLY if storage for the mapping entry cannot
+ * be allocated.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+record_fid_to_subfile(hid_t fid, hid_t subfile_context_id, int *next_index)
{
- static int *acks = NULL;
- static int *indices = NULL;
- static MPI_Request *ackreqs = NULL;
- static MPI_Request *reqs = NULL;
- static MPI_Status *stats = NULL;
- static int64_t *source_data_offset = NULL;
- static int64_t *ioc_read_datasize = NULL;
- static int64_t *ioc_read_offset = NULL;
- static MPI_Datatype *ioc_read_type = NULL;
-
- subfiling_context_t *sf_context = get_subfiling_object(context_id);
- int i, ioc, n_waiting = 0, status = 0;
-
- assert(sf_context != NULL);
-
- if (acks == NULL) {
- if ((acks = (int *)calloc((size_t)n_io_concentrators*2, sizeof(int))) == NULL) {
+ herr_t status = SUCCEED;
+ int index;
+ if (sf_file_map_size == 0) {
+ int i;
+ sf_open_file_map = (file_map_to_context_t *) malloc(
+ (size_t) DEFAULT_MAP_ENTRIES * sizeof(file_map_to_context_t));
+ if (sf_open_file_map == NULL) {
perror("calloc");
- return -1;
+ return FAIL;
+ }
+ sf_file_map_size = DEFAULT_MAP_ENTRIES;
+ for (i = 0; i < sf_file_map_size; i++) {
+ sf_open_file_map[i].h5_file_id = H5I_INVALID_HID;
}
- else indices = &acks[n_io_concentrators];
}
- if (reqs == NULL) {
- if ((reqs = (MPI_Request *)calloc((size_t)n_io_concentrators, sizeof(MPI_Request))) == NULL) {
- perror("calloc");
- return -1;
+ for (index = 0; index < sf_file_map_size; index++) {
+ if (sf_open_file_map[index].h5_file_id == H5I_INVALID_HID) {
+ sf_open_file_map[index].h5_file_id = fid;
+ sf_open_file_map[index].sf_context_id = subfile_context_id;
+ if (next_index) {
+ *next_index = index;
+ }
+ return status;
}
}
- if (ackreqs == NULL) {
- if ((ackreqs = (MPI_Request *)calloc((size_t)n_io_concentrators, sizeof(MPI_Request))) == NULL) {
- perror("calloc");
- return -1;
+ if (index == sf_file_map_size) {
+ int i;
+ sf_open_file_map = reallocarray(sf_open_file_map,
+ (size_t)(sf_file_map_size * 2), sizeof(file_map_to_context_t));
+ if (sf_open_file_map == NULL) {
+ perror("realloc");
+ return FAIL;
+ }
+ sf_file_map_size *= 2;
+ for (i = index; i < sf_file_map_size; i++) {
+ sf_open_file_map[i].h5_file_id = H5I_INVALID_HID;
}
+
+ if (next_index) {
+ *next_index = index;
+ }
+
+ sf_open_file_map[index].h5_file_id = fid;
+ sf_open_file_map[index++].sf_context_id = subfile_context_id;
}
- if (stats == NULL) {
- if ((stats = (MPI_Status *)calloc((size_t)n_io_concentrators, sizeof(MPI_Status))) == NULL) {
- perror("calloc");
- return -1;
+ return status;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: fid_map_to_context
+ *
+ * Purpose: This is a basic lookup function which returns the subfiling
+ * context id associated with the specified file->inode.
+ *
+ * Return: The Subfiling context ID if it exists.
+ * Errors: H5I_INVALID_HID if the inode to context map is not found.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+hid_t
+fid_map_to_context(hid_t sf_fid)
+{
+ if (sf_open_file_map) {
+ int i;
+ for (i = 0; i < sf_file_map_size; i++) {
+ if (sf_open_file_map[i].h5_file_id == sf_fid) {
+ return sf_open_file_map[i].sf_context_id;
+ }
}
}
+ return H5I_INVALID_HID;
+}
- if (init__indep_io(sf_context, &source_data_offset, &ioc_read_datasize, &ioc_read_offset,
- &ioc_read_type, offset, elements, dtype_extent) < 0) {
- return -1;
- }
+/*-------------------------------------------------------------------------
+ * Function: clear_fid_map_entry
+ *
+ * Purpose: Remove the map entry associated with the file->inode.
+ * This is done at file close.
+ *
+ * Return: None
+ * Errors: Cannot fail.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+void
+clear_fid_map_entry(hid_t sf_fid)
+{
+ if (sf_open_file_map) {
+ int i;
+ for (i = 0; i < sf_file_map_size; i++) {
+ if (sf_open_file_map[i].h5_file_id == sf_fid) {
+ sf_open_file_map[i].h5_file_id = H5I_INVALID_HID;
+ sf_open_file_map[i].sf_context_id = H5I_INVALID_HID;
+ return;
+ }
+ }
+ }
+}
- if (sf_verbose_flag) {
- for(ioc=0; ioc < n_io_concentrators; ioc++) {
- int64_t sourceOffset = source_data_offset[ioc];
- printf("[%d %s]: read_source[ioc(%d), sourceOffset=%ld, datasize=%ld, foffset=%ld]\n",
- sf_world_rank, __func__, ioc, sourceOffset, ioc_read_datasize[ioc], ioc_read_offset[ioc] );
- }
- }
+/*-------------------------------------------------------------------------
+ * Function: active_map_entries
+ *
+ * Purpose: Count the number of entries that have valid h5_file_id
+ * values.
+ *
+ * Return: The number of active map entries (can be zero).
+ * Errors: Cannot fail.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+active_map_entries(void)
+{
+ int i, map_entries = 0;
+ for (i = 0; i < sf_file_map_size; i++) {
+ if (sf_open_file_map[i].h5_file_id != H5I_INVALID_HID) {
+ map_entries++;
+ }
+ }
+ return map_entries;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: init__indep_io
+ *
+ * Purpose: Utility function to initialize the set of IO transactions
+ * used to communicate with IO concentrators for read and write
+ * IO operations.
+ *
+ * Return: A filled set of vectors (1 entry per IO concentrator) which
+ * fully describe the IO transactions for read and writes.
+ * At most, every IO concentrator will have a descriptor which
+ * identifies the local memory offset, the virtual FILE offset,
+ * and the total length of the IO which will be sent to or
+ * received from the individual IOCs.
+ *
+ * For IO operations which involve a subset of IO concentrators,
+ * the vector entries for the unused IOCs will have lengths of
+ * zero and MPI NULL datatypes.
+ *
+ * Errors: Cannot fail.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+init__indep_io(subfiling_context_t *sf_context, int64_t *sf_source_data_offset,
+ int64_t *sf_datasize, int64_t *sf_offset, MPI_Datatype *sf_dtype,
+ int64_t offset, int64_t elements, int dtype_extent)
+{
+
+ int container_count = sf_context->topology->n_io_concentrators;
+ int64_t stripe_size = sf_context->sf_stripe_size;
+ int64_t data_size = elements * dtype_extent;
+ int64_t start_id = offset / stripe_size;
+ int64_t offset_in_stripe = offset % stripe_size;
+ int64_t start_length = MIN(data_size, (stripe_size - offset_in_stripe));
+ int64_t start_row = start_id / container_count;
+ int64_t ioc_start = start_id % container_count;
+
+ int64_t final_offset = offset + data_size;
+ int64_t final_id = final_offset / stripe_size;
+ int64_t final_length =
+ (start_length == data_size ? 0 : final_offset % stripe_size);
+ int64_t ioc_final = final_id % container_count;
+ int64_t container_bytes, total_bytes = 0;
+ int64_t source_offset = 0;
+
+ int row_id_start = (int) (start_id - ioc_start);
+ int row_id_final = (int) (final_id - ioc_final);
+ int i, k, depth = ((row_id_final - row_id_start) / container_count) + 1;
+ int container_id = (int) start_id;
+ int64_t row_offset = (int64_t)(start_row * stripe_size);
+
+ for (i = 0, k = (int) ioc_start; i < container_count; i++) {
+ int container_depth = depth;
+ hbool_t is_first = false, is_last = false;
+ container_bytes = 0;
+ sf_datasize[k] = container_bytes;
+ if (total_bytes < data_size) {
+ if (k == ioc_start) {
+ is_first = true;
+ container_bytes = start_length;
+ container_depth--; /* Account for the start_length */
+ if (ioc_final < ioc_start) {
+ container_depth--;
+ depth--;
+ }
+ }
+ if (k == ioc_final) {
+ is_last = true;
+ container_bytes += final_length;
+ if (container_depth)
+ container_depth--; /* Account for the final_length */
+ if (depth)
+ depth--;
+ }
+ container_bytes += container_depth * stripe_size;
+ total_bytes += container_bytes;
+ }
+
+ sf_source_data_offset[k] = source_offset;
+ sf_datasize[k] = container_bytes;
+ sf_offset[k] = row_offset + offset_in_stripe;
+
+ if (container_count == 1) {
+ sf_dtype[k] = MPI_BYTE;
+ } else {
+ /* Fill the IO datatypes */
+ if (is_first) {
+ if (is_last) { /* First + Last */
+ sf_dtype[k] = H5FD__create_f_l_mpi_type(sf_context,
+ container_depth + 1, sf_offset[k], container_bytes,
+ start_length, final_length);
+ } else { /* First ONLY */
+ sf_dtype[k] =
+ H5FD__create_first_mpi_type(sf_context, container_depth,
+ sf_offset[k], container_bytes, start_length);
+ }
+ source_offset += start_length;
+ offset_in_stripe = 0;
+ } else if (is_last) { /* Last ONLY */
+ source_offset += stripe_size;
+ sf_dtype[k] = H5FD__create_final_mpi_type(
+ sf_context, container_depth, container_bytes, final_length);
+ } else { /* Everything else (uniform) */
+ source_offset += stripe_size;
+ sf_dtype[k] = H5FD__create_mpi_uniform_type(
+ sf_context, container_depth, container_bytes);
+ }
+ }
+ k++;
+ container_id++;
+
+ if (k == container_count) {
+ k = 0;
+ depth = ((row_id_final - container_id) / container_count) + 1;
+ row_offset += stripe_size;
+ }
+ }
+ if (total_bytes != data_size) {
+ printf("Error: total_bytes != data_size\n");
+ }
+
+ return 0;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD__init_subfile_context
+ *
+ * Purpose: Called as part of the HDF5 file + subfiling opening.
+ * This initializes the subfiling context and associates
+ * this context with the specific HDF5 file.
+ *
+ * Return:      Success (0) or Failure (-1)
+ * Errors: If MPI operations fail for some reason.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+int
+H5FD__init_subfile_context(sf_topology_t *thisApp, int n_iocs, int world_rank,
+ subfiling_context_t *newContext)
+{
+ static MPI_Comm sf_msg_comm = MPI_COMM_NULL;
+ static MPI_Comm sf_data_comm = MPI_COMM_NULL;
+
+ assert(newContext != NULL);
+
+ if (newContext->topology != thisApp) {
+ int status;
+ char *envValue = NULL;
+
+ newContext->topology = thisApp;
+ newContext->sf_msg_comm = sf_msg_comm;
+ newContext->sf_data_comm = sf_data_comm;
+ newContext->sf_group_comm = MPI_COMM_NULL;
+ newContext->sf_intercomm = MPI_COMM_NULL;
+ newContext->sf_stripe_size = DEFAULT_STRIPE_SIZE;
+ newContext->sf_write_count = 0;
+ newContext->sf_read_count = 0;
+ newContext->sf_eof = 0;
+ if ((envValue = getenv("IOC_STRIPE_SIZE")) != NULL) {
+ long value_check = atol(envValue);
+ if (value_check > 0) {
+ newContext->sf_stripe_size = (int64_t) value_check;
+ }
+ }
+ if ((envValue = getenv("SUBFILE_PREFIX")) != NULL) {
+ char temp[PATH_MAX];
+ sprintf(temp, "%s", envValue);
+ newContext->subfile_prefix = strdup(temp);
+ sf_subfile_prefix = strdup(temp);
+ }
+
+ newContext->sf_blocksize_per_stripe =
+ newContext->sf_stripe_size * n_iocs;
+ if (sf_msg_comm == MPI_COMM_NULL) {
+ status = MPI_Comm_dup(MPI_COMM_WORLD, &newContext->sf_msg_comm);
+ if (status != MPI_SUCCESS)
+ goto err_exit;
+ status = MPI_Comm_set_errhandler(
+ newContext->sf_msg_comm, MPI_ERRORS_RETURN);
+ if (status != MPI_SUCCESS)
+ goto err_exit;
+ sf_msg_comm = newContext->sf_msg_comm;
+ }
+ if (sf_data_comm == MPI_COMM_NULL) {
+ status = MPI_Comm_dup(MPI_COMM_WORLD, &newContext->sf_data_comm);
+ if (status != MPI_SUCCESS)
+ goto err_exit;
+ status = MPI_Comm_set_errhandler(
+ newContext->sf_data_comm, MPI_ERRORS_RETURN);
+ if (status != MPI_SUCCESS)
+ goto err_exit;
+ sf_data_comm = newContext->sf_data_comm;
+ }
+ if (n_iocs > 1) {
+ status = MPI_Comm_split(MPI_COMM_WORLD, thisApp->rank_is_ioc,
+ world_rank, &newContext->sf_group_comm);
+ if (status != MPI_SUCCESS)
+ goto err_exit;
+ status = MPI_Comm_size(
+ newContext->sf_group_comm, &newContext->sf_group_size);
+ if (status != MPI_SUCCESS)
+ goto err_exit;
+ status = MPI_Comm_rank(
+ newContext->sf_group_comm, &newContext->sf_group_rank);
+ if (status != MPI_SUCCESS)
+ goto err_exit;
+ /*
+ * There may be additional functionality we need for the IOCs...
+ * If so, then can probably initialize those things here!
+ */
+ } else {
+ newContext->sf_group_comm = MPI_COMM_SELF;
+ newContext->sf_group_size = 1;
+ newContext->sf_group_rank = 0;
+ }
+ }
+ return 0;
+
+err_exit:
+ return -1;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: Internal read__independent.
+ *
+ * Purpose: The IO operations can be striped across a selection of
+ * IO concentrators. The read and write independent calls
+ * compute the group of 1 or more IOCs and further create
+ * derived MPI datatypes when required by the size of the
+ * contiguous read or write requests.
+ *
+ * IOC(0) contains the logical data storage for file offset
+ * zero and all offsets that reside within modulo range of
+ * the subfiling stripe_size.
+ *
+ *              We cycle through all 'n_io_concentrators' and send a
+ *              descriptor to each IOC that has a non-zero sized IO
+ *              request to fulfill.
+ *
+ * Sending descriptors to an IOC usually gets an ACK or
+ * NACK in response. For the read operations, we post
+ * asynch READs to receive the file data and wait until
+ * all pending operations have completed.
+ *
+ * Return:      Success (0) or Failure (non-zero)
+ * Errors: If MPI operations fail for some reason.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+static int
+read__independent(int n_io_concentrators, hid_t context_id, int64_t offset,
+ int64_t elements, int dtype_extent, void *data)
+{
+ int i, ioc, n_waiting = 0, status = 0;
+ int * io_concentrator = NULL;
+ int indices[n_io_concentrators];
+ MPI_Request reqs[n_io_concentrators];
+ MPI_Status stats[n_io_concentrators];
+ int64_t source_data_offset[n_io_concentrators];
+ int64_t ioc_read_datasize[n_io_concentrators];
+ int64_t ioc_read_offset[n_io_concentrators];
+ MPI_Datatype ioc_read_type[n_io_concentrators];
+
+ subfiling_context_t *sf_context = get_subfiling_object(context_id);
+ assert(sf_context != NULL);
+
+ /* Note that the sf_write_count is only tracked by an IOC rank */
+ if (sf_context->sf_write_count && (sf_context->sf_fid > 0)) {
+ fdatasync(sf_context->sf_fid);
+ }
+
+ io_concentrator = sf_context->topology->io_concentrator;
+ if (init__indep_io(sf_context, source_data_offset, ioc_read_datasize,
+ ioc_read_offset, ioc_read_type, offset, elements,
+ dtype_extent) < 0) {
+ return -1;
+ }
/* Prepare the IOCs with a message which indicates the length
- * and file offset for the actual data to be provided.
+ * and file offset for the actual data to be provided.
*/
- for(ioc=0; ioc < n_io_concentrators; ioc++) {
- int64_t msg[2] = {ioc_read_datasize[ioc], ioc_read_offset[ioc]};
- char *sourceData = (char *)data;
+ for (ioc = 0; ioc < n_io_concentrators; ioc++) {
+ int64_t msg[3] = {ioc_read_datasize[ioc], ioc_read_offset[ioc],
+ sf_context->sf_context_id};
+ char * sourceData = (char *) data;
int64_t sourceOffset = source_data_offset[ioc];
-
+ int packsize = 0;
+ // printf("[%d] %s: context_id = 0x%lx\n", sf_world_rank, __func__,
+ // sf_context->sf_context_id);
/* We may not require data from this IOC...
* or we may read the data directly from the file!
* Check the size to verify!
@@ -848,27 +1227,46 @@ int sf_read_independent(hid_t context_id, int64_t offset, int64_t elements, int
continue;
}
- if (sf_verbose_flag ) {
- printf("[%d %s] Requesting %ld read bytes from IOC(%d): sourceOffset=%ld\n",
- sf_world_rank, __func__, msg[0], io_concentrator[ioc], sourceOffset );
+#ifndef NDEBUG
+ if (sf_verbose_flag) {
+#if 0
+ if (sf_logfile) {
+ fprintf(sf_logfile,
+ "[%d %s] Requesting %ld read bytes from IOC(%d): "
+ "sourceOffset=%ld subfile_offset=%ld\n",
+ sf_world_rank, __func__, msg[0], io_concentrator[ioc],
+ sourceOffset, msg[1]);
+ }
+#else
+ fprintf(stdout,
+ "[%d %s] Requesting %ld read bytes from IOC(%d): "
+ "sourceOffset=%ld subfile_offset=%ld\n",
+ sf_world_rank, __func__, msg[0], io_concentrator[ioc],
+ sourceOffset, msg[1]);
+ fflush(stdout);
+#endif
}
+#endif
- status = MPI_Ssend(msg, 2, MPI_INT64_T, io_concentrator[ioc], READ_INDEP, sf_context->sf_msg_comm);
+ status = MPI_Ssend(msg, 3, MPI_INT64_T, io_concentrator[ioc],
+ READ_INDEP, sf_context->sf_msg_comm);
if (status != MPI_SUCCESS) {
printf("[%d] MPI_Send failure!", sf_world_rank);
return status;
- }
- else {
+ } else {
if (ioc_read_type[ioc] == MPI_BYTE) {
int bytes = (int) ioc_read_datasize[ioc];
- status = MPI_Irecv(&sourceData[sourceOffset], bytes, ioc_read_type[ioc], io_concentrator[ioc],
- READ_INDEP_DATA, sf_context->sf_data_comm, &reqs[ioc]);
+ status = MPI_Irecv(&sourceData[sourceOffset], bytes,
+ ioc_read_type[ioc], io_concentrator[ioc], READ_INDEP_DATA,
+ sf_context->sf_data_comm, &reqs[ioc]);
} else {
- status = MPI_Irecv(&sourceData[sourceOffset], 1, ioc_read_type[ioc], io_concentrator[ioc],
- READ_INDEP_DATA, sf_context->sf_data_comm, &reqs[ioc]);
+ MPI_Pack_size(1, ioc_read_type[ioc], MPI_COMM_WORLD, &packsize);
+ status = MPI_Irecv(&sourceData[sourceOffset], 1,
+ ioc_read_type[ioc], io_concentrator[ioc], READ_INDEP_DATA,
+ sf_context->sf_data_comm, &reqs[ioc]);
}
if (status != MPI_SUCCESS) {
- int length = 256;
+ int length = 256;
char error_string[length];
MPI_Error_string(status, error_string, &length);
printf("(%s) MPI_Irecv error: %s\n", __func__, error_string);
@@ -876,419 +1274,1013 @@ int sf_read_independent(hid_t context_id, int64_t offset, int64_t elements, int
}
n_waiting++;
}
-
}
- /* We've queued all of the Async READs, now we just need to
+ /* We've queued all of the Async READs, now we just need to
* complete them in any order...
*/
- while(n_waiting) {
+ while (n_waiting) {
int ready = 0;
status = MPI_Waitsome(n_io_concentrators, reqs, &ready, indices, stats);
if (status != MPI_SUCCESS) {
- int len;
- char estring[MPI_MAX_ERROR_STRING];
- MPI_Error_string(status, estring, &len);
- printf("[%d %s] MPI_ERROR! MPI_Waitsome returned an error(%s)\n",
- sf_world_rank, __func__, estring );
- fflush(stdout);
+ int length = 256;
+ char error_string[length];
+ MPI_Error_string(status, error_string, &length);
+ printf("(%s) MPI_Waitsome error: %s\n", __func__, error_string);
+ for (i = 0; i < n_waiting; i++) {
+ printf(
+ "stats[%d].SOURCE=%d, stats.TAG=%d, stats.MPI_ERROR=%d\n",
+ i, stats[i].MPI_SOURCE, stats[i].MPI_TAG,
+ stats[i].MPI_ERROR);
+ fflush(stdout);
+ }
+ return status;
}
- for(i=0; i < ready; i++) {
- ioc = io_concentrator[indices[i]];
+ for (i = 0; i < ready; i++) {
+#ifndef NDEBUG
if (sf_verbose_flag) {
- printf("[%d] READ bytes(%ld) of data from ioc_concentrator %d complete\n",
- sf_world_rank, ioc_read_datasize[indices[i]] , ioc);
- fflush(stdout);
+#if 0
+ if (sf_logfile) {
+ fprintf(sf_logfile,
+ "[%d] READ bytes(%ld) of data from ioc_concentrator %d "
+ "complete\n",
+ sf_world_rank, ioc_read_datasize[indices[i]],
+ indices[i]);
+ }
+#else
+ fprintf(stdout,
+ "[%d] READ bytes(%ld) of data from ioc_concentrator %d "
+ "complete\n",
+ sf_world_rank, ioc_read_datasize[indices[i]],
+ indices[i]);
+ fflush(stdout);
+#endif
+ }
+#endif
+ if (ioc_read_type[indices[i]] != MPI_BYTE) {
+ MPI_Type_free(&ioc_read_type[indices[i]]);
}
- n_waiting--;
+ n_waiting--;
}
}
return status;
}
+/*-------------------------------------------------------------------------
+ * Function: Public/Client sf_read_independent
+ *
+ * Purpose: A public function which wraps the Internal version
+ * and allows the addition of the additional 'n_io_concentrator'
+ * argument. This is important as it allows me to skip
+ * memory allocation functions since storage for the various
+ * vector variables is on the call stack...
+ *
+ * Return: The integer status returned by the Internal read_independent
+ * function. Successful operations will return 0.
+ * Errors: An MPI related error value.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+sf_read_independent(hid_t sf_fid, int64_t offset, int64_t elements,
+ int dtype_extent, void *data)
+{
+ hid_t sf_context_id = fid_map_to_context(sf_fid);
+ subfiling_context_t *sf_context = get_subfiling_object(sf_context_id);
+ assert(sf_context != NULL);
+ return read__independent(sf_context->topology->n_io_concentrators,
+ sf_context_id, offset, elements, dtype_extent, data);
+}
-int sf_write_independent(hid_t context_id, int64_t offset, int64_t elements, int dtype_extent, void *data)
+/*-------------------------------------------------------------------------
+ * Function: Public/Client sf_read_vector
+ *
+ * Purpose: Another read__independent wrapper. In this instance
+ * we simply loop over then collection of vector entries
+ * and call the sf__read_independent function.
+ *
+ * Return: The integer status returned by the Internal read_independent
+ * function. Successful operations will return 0.
+ * Errors: An MPI related error value.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+sf_read_vector(hid_t h5_fid, hssize_t count, haddr_t addrs[], hsize_t sizes[],
+ void *bufs[] /* data_out */)
{
- static int *acks = NULL;
- static int *indices = NULL;
- static MPI_Request *reqs = NULL;
- static MPI_Request *completed = NULL;
- static MPI_Status *stats = NULL;
- static int64_t *source_data_offset = NULL;
- static int64_t *ioc_write_datasize = NULL;
- static int64_t *ioc_write_offset = NULL;
- static MPI_Datatype *ioc_write_type = NULL;
-
- subfiling_context_t *sf_context = get_subfiling_object(context_id);
- int i, target, ioc, n_waiting = 0, status = 0;
- int awaiting_completion = 0;
- int errors = 0;
- if (acks == NULL) {
- if ((acks = (int *)calloc((size_t)n_io_concentrators*2, sizeof(int))) == NULL) {
- perror("calloc");
- return -1;
- }
- else indices = &acks[n_io_concentrators];
- }
- if (reqs == NULL) {
- if ((reqs = (MPI_Request *)calloc((size_t)n_io_concentrators, sizeof(MPI_Request))) == NULL) {
- perror("calloc");
- return -1;
- }
- }
- if (completed == NULL) {
- if ((completed = (MPI_Request *)calloc((size_t)n_io_concentrators, sizeof(MPI_Request))) == NULL) {
- perror("calloc");
- return -1;
+ hssize_t k;
+ herr_t ret_value = SUCCEED;
+ hid_t sf_context_id = fid_map_to_context(h5_fid);
+ subfiling_context_t *sf_context = get_subfiling_object(sf_context_id);
+
+ assert(sf_context != NULL);
+
+ /* Unfortunately, we cannot know whether an incoming vector represents
+ * (as a whole) a contiguous block of data. Certainly each vector entry
+ * is a contiguous block of data. There is a temptation of course to
+ * attempt to merge multiple vector instances into a single MPI write
+ * by utilizing MPI datatypes. At this time we don't attempt to
+ * consolidate multiple vector entries and are thus forced to loop
+ * over the vector, sending one a vector entry at a time.
+ */
+ for (k = 0; k < (int32_t) count; k++) {
+ if (read__independent(sf_context->topology->n_io_concentrators,
+ sf_context_id, (int64_t) addrs[k], (int64_t) sizes[k], 1,
+ bufs[k]) != 0) {
+ printf("%s - encountered an internal error!\n", __func__);
+ goto errors;
}
}
- if (stats == NULL) {
- if ((stats = (MPI_Status *)calloc((size_t)n_io_concentrators, sizeof(MPI_Status))) == NULL) {
- perror("calloc");
- return -1;
- }
+ return ret_value;
+
+errors:
+ return FAIL;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: Internal write__independent.
+ *
+ * Purpose: The IO operations can be striped across a selection of
+ * IO concentrators. The read and write independent calls
+ * compute the group of 1 or more IOCs and further create
+ * derived MPI datatypes when required by the size of the
+ * contiguous read or write requests.
+ *
+ * IOC(0) contains the logical data storage for file offset
+ * zero and all offsets that reside within modulo range of
+ * the subfiling stripe_size.
+ *
+ *              We cycle through all 'n_io_concentrators' and send a
+ *              descriptor to each IOC that has a non-zero sized IO
+ *              request to fulfill.
+ *
+ * Sending descriptors to an IOC usually gets an ACK or
+ * NACK in response. For the write operations, we post
+ * asynch READs to receive ACKs from IOC ranks that have
+ * allocated memory receive the data to write to the
+ * subfile. Upon receiving an ACK, we send the actual
+ * user data to the IOC.
+ *
+ * Return:      Success (0) or Failure (non-zero)
+ * Errors: If MPI operations fail for some reason.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+static int
+write__independent(int n_io_concentrators, hid_t context_id, int64_t offset,
+ int64_t elements, int dtype_extent, const void *data)
+{
+ int * io_concentrator = NULL;
+ int acks[n_io_concentrators];
+ int indices[n_io_concentrators];
+ MPI_Request reqs[n_io_concentrators];
+ MPI_Status stats[n_io_concentrators];
+ int64_t source_data_offset[n_io_concentrators];
+ int64_t ioc_write_datasize[n_io_concentrators];
+ int64_t ioc_write_offset[n_io_concentrators];
+ MPI_Datatype ioc_write_type[n_io_concentrators];
+
+ subfiling_context_t *sf_context = get_subfiling_object(context_id);
+ int i, target, ioc, n_waiting = 0, status = 0;
+ int errors = 0;
+
+ io_concentrator = sf_context->topology->io_concentrator;
+
+ if (sf_context->topology->rank_is_ioc) {
+ sf_context->sf_write_count++;
}
- if (init__indep_io(sf_context, &source_data_offset, &ioc_write_datasize, &ioc_write_offset,
- &ioc_write_type, offset, elements, dtype_extent) < 0) {
+ /* The following function will initialize the collection of IO transfer
+ * parameters, i.e. local memory (source) offsets, target file offsets,
+ * target data sizes (in bytes), and a MPI Datatype for each of the
+ * IO concentrator transactions.
+ *
+ * For small transfers, at least 1 IOC instance will have valid info.
+ * For larger transfers, it is likely that the full set of
+ * n_io_concentrators will be utilized. If the total transaction size is
+ * less than n_io_concentrators X stripe_size, then the MPI datatype should
+     * probably be MPI_BYTE. Larger transactions will create MPI derived
+ * datatypes to span the entire logical collection of stripes. Said
+ * differently, the largest IO requests will require a stripe depth greater
+ * than one.
+ */
+ if (init__indep_io(sf_context, source_data_offset, ioc_write_datasize,
+ ioc_write_offset, ioc_write_type, offset, elements,
+ dtype_extent) < 0) {
return -1;
- }
-
- if (sf_verbose_flag) {
- for(ioc=0; ioc < n_io_concentrators; ioc++) {
- int64_t sourceOffset = source_data_offset[ioc];
- printf("[%d %s]: write_dest[ioc(%d), sourceOffset=%ld, datasize=%ld, foffset=%ld]\n",
- sf_world_rank, __func__, ioc, sourceOffset,
- ioc_write_datasize[ioc], ioc_write_offset[ioc] );
- }
- }
+ }
/* Prepare the IOCs with a message which indicates the length
* of the actual data to be written. We also provide the file
* offset so that when the IOC recieves the data (in whatever order)
* they can lseek to the correct offset and write the data.
+ *
+ * NOTE: we use 'pwrite' which provides the seek functionality
+ * as part of the API.
*/
- for(target=0; target < n_io_concentrators; target++) {
+ for (target = 0; target < n_io_concentrators; target++) {
int64_t sourceOffset;
- int64_t msg[2] = {0,};
- char *sourceData = (char *)data;
- ioc = (sf_world_rank + target) % n_io_concentrators;
-
- sourceOffset = source_data_offset[ioc];
- msg[0] = ioc_write_datasize[ioc];
- msg[1] = ioc_write_offset[ioc];
+ int64_t msg[3] = {
+ 0,
+ };
+ const char *sourceData = (const char *) data;
+ ioc = (sf_world_rank + target) % n_io_concentrators;
+
+ sourceOffset = source_data_offset[ioc];
+ msg[0] = ioc_write_datasize[ioc];
+ msg[1] = ioc_write_offset[ioc];
+ msg[2] = sf_context->sf_context_id;
acks[ioc] = 0;
reqs[ioc] = MPI_REQUEST_NULL;
if (ioc_write_datasize[ioc] == 0) {
- if (sf_verbose_flag) {
- printf("[%d %s] skipping ioc(%d) send datasize = %ld\n",
- sf_world_rank,__func__, ioc, ioc_write_datasize[ioc]);
- fflush(stdout);
- }
continue;
}
- if ( sf_verbose_flag ) {
- printf("[%d] Datatype(%x) Sending to ioc(%d) %ld bytes of data with file_offset=%ld\n",
- sf_world_rank, ioc_write_type[ioc], ioc, ioc_write_datasize[ioc], ioc_write_offset[ioc]);
- fflush(stdout);
- }
+
+#ifndef NDEBUG
+ if (sf_verbose_flag)
+ {
+#if 0
+ if (sf_logfile) {
+ fprintf(sf_logfile,
+ "[%d %s]: write_dest[ioc(%d), "
+ "sourceOffset=%ld, datasize=%ld, foffset=%ld]\n",
+ sf_world_rank, __func__, ioc, sourceOffset,
+ ioc_write_datasize[ioc], ioc_write_offset[ioc]);
+ }
+#else
+ fprintf(stdout,
+ "[%d %s]: write_dest[ioc(%d), "
+ "sourceOffset=%ld, datasize=%ld, foffset=%ld]\n",
+ sf_world_rank, __func__, ioc, sourceOffset,
+ ioc_write_datasize[ioc], ioc_write_offset[ioc]);
+ fflush(stdout);
+#endif
+ }
+#endif
+
/* Send the Message HEADER which indicates the requested IO operation
* (via the message TAG) along with the data size and file offset.
*/
- status = MPI_Ssend(msg, 2, MPI_INT64_T, io_concentrator[ioc],
- WRITE_INDEP, sf_context->sf_msg_comm);
+ status = MPI_Ssend(msg, 3, MPI_INT64_T, io_concentrator[ioc],
+ WRITE_INDEP, sf_context->sf_msg_comm);
if (status != MPI_SUCCESS) {
- int len;
+ int len;
char estring[MPI_MAX_ERROR_STRING];
MPI_Error_string(status, estring, &len);
- printf("[%d] ERROR! MPI_Send of %ld bytes to %d returned an error(%s)\n",
- sf_world_rank, sizeof(msg), io_concentrator[ioc], estring );
+ printf("[%d] ERROR! MPI_Send of %ld bytes to %d returned an "
+ "error(%s)\n",
+ sf_world_rank, sizeof(msg), io_concentrator[ioc], estring);
fflush(stdout);
+ break; /* If unable to send to an IOC, we can call it quits... */
}
- status = MPI_Recv(&acks[ioc], 1, MPI_INT, io_concentrator[ioc], WRITE_INDEP_ACK,
- sf_context->sf_data_comm, &stats[ioc]);
+ /* Wait for memory to be allocated on the target IOC so that we can
+ * start sending user data to this IOC.
+ * FIXME: We could possibly use Irecv for handling ACKs. This could
+ * potentially allow some additional overlap of posting IO requests
+ * to the collection of IO Concentrators.
+ */
+ status = MPI_Recv(&acks[ioc], 1, MPI_INT, io_concentrator[ioc],
+ WRITE_INDEP_ACK, sf_context->sf_data_comm, &stats[ioc]);
if (status == MPI_SUCCESS) {
+#ifndef NDEBUG
if (sf_verbose_flag) {
- printf("[%d] received ack(%d) from ioc(%d)\n",sf_world_rank, acks[ioc], ioc);
- fflush(stdout);
+ if (sf_logfile) {
+ fprintf(sf_logfile, "[%d] received ack(%d) from ioc(%d)\n",
+ sf_world_rank, acks[ioc], ioc);
+ }
}
+#endif
+ /* No errors, start sending data to the IOC.
+ * If the data transfer is small enough, we don't utilize a
+ * derived MPI type, i.e. we use MPI_BYTE.
+ */
if (acks[ioc] > 0) {
- if (ioc_write_type[ioc] == MPI_BYTE) {
- int datasize = (int)(ioc_write_datasize[ioc] & INT32_MASK);
+ if (ioc_write_type[ioc] == MPI_BYTE) {
+ int datasize = (int) (ioc_write_datasize[ioc] & INT32_MASK);
status = MPI_Issend(&sourceData[sourceOffset], datasize,
- MPI_BYTE, io_concentrator[ioc], WRITE_INDEP_DATA,
- sf_context->sf_data_comm,&reqs[ioc]);
- }
- else {
- status = MPI_Issend(&sourceData[sourceOffset], 1, ioc_write_type[ioc],
- io_concentrator[ioc], WRITE_INDEP_DATA,
- sf_context->sf_data_comm,&reqs[ioc]);
+ MPI_BYTE, io_concentrator[ioc], WRITE_INDEP_DATA,
+ sf_context->sf_data_comm, &reqs[ioc]);
+ } else {
+ status = MPI_Issend(&sourceData[sourceOffset], 1,
+ ioc_write_type[ioc], io_concentrator[ioc],
+ WRITE_INDEP_DATA, sf_context->sf_data_comm, &reqs[ioc]);
}
- /* Queued another Isend which need to be completed (below) */
n_waiting++;
}
} else {
errors++;
puts("ACK error!");
fflush(stdout);
+ break;
}
+
+ /* Check the status of our MPI_Issend... */
if (status != MPI_SUCCESS) {
errors++;
printf("[%d] ERROR! Unable to Send data to ioc(%d)\n",
- sf_world_rank, ioc);
+ sf_world_rank, ioc);
fflush(stdout);
+ break;
}
}
- while(n_waiting) {
+ /* Wait for the Issends to complete (in any order) */
+ while (n_waiting) {
int ready = 0;
status = MPI_Waitsome(n_io_concentrators, reqs, &ready, indices, stats);
if (status != MPI_SUCCESS) {
- int len;
+ int len;
char estring[MPI_MAX_ERROR_STRING];
MPI_Error_string(status, estring, &len);
printf("[%d %s] MPI_ERROR! MPI_Waitsome returned an error(%s)\n",
- sf_world_rank, __func__, estring );
+ sf_world_rank, __func__, estring);
fflush(stdout);
errors++;
}
- for(i=0; i < ready; i++) {
+ for (i = 0; i < ready; i++) {
/* One of the Issend calls has completed
- * Wait for another ACK to indicate that the data as been written
- * to the subfile.
+ * If we used a derived type to send data, then should free
+ * that datatype instance.
*/
+ if (ioc_write_type[indices[i]] != MPI_BYTE) {
+ MPI_Type_free(&ioc_write_type[indices[i]]);
+ }
+ n_waiting--;
+ }
+ }
+ if (errors)
+ return -1;
+ return status;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: Public/Client sf_write_independent
+ *
+ * Purpose: A public function which wraps the Internal version
+ * and allows the addition of the additional 'n_io_concentrator'
+ * argument. This is important as it allows me to skip
+ * memory allocation functions since storage for the various
+ * vector variables is on the call stack...
+ *
+ * Return:      The integer status returned by the Internal write__independent
+ * function. Successful operations will return 0.
+ * Errors: An MPI related error value.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+sf_write_independent(hid_t sf_fid, int64_t offset, int64_t elements,
+    int dtype_extent, const void *data)
+{
+    /* Resolve the HDF5 file id to the subfiling context id that was
+     * recorded when the subfiles were opened. */
+    hid_t sf_context_id = fid_map_to_context(sf_fid);
+    subfiling_context_t *sf_context = get_subfiling_object(sf_context_id);
+
+    /* A NULL context indicates the fid was never mapped (or the mapping
+     * was already cleared); treat that as a programming error. */
+    assert(sf_context != NULL);
+    /* Delegate to the internal implementation, supplying the number of
+     * IO concentrators from the cached topology so the callee can size
+     * its per-IOC vectors on the call stack. */
+    return write__independent(sf_context->topology->n_io_concentrators,
+        sf_context_id, offset, elements, dtype_extent, data);
+}
+
+/*-------------------------------------------------------------------------
+ * Function: Public/Client sf_write_vector
+ *
+ * Purpose: Another write__independent wrapper. As with the
+ * sf_read_vector function, we simply loop over the vector
+ * elements and call the underlying write_independent function.
+ *
+ * Return:      The integer status returned by the Internal write__independent
+ * function. Successful operations will return 0.
+ * Errors: An MPI related error value.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+sf_write_vector(hid_t h5_fid, hssize_t count, haddr_t addrs[], hsize_t sizes[],
+    void *bufs[] /* data_in */)
+{
+    hssize_t k;
+    herr_t ret_value = SUCCEED;
+    /* Resolve the HDF5 file id to its subfiling context (established at
+     * subfile open time). */
+    hid_t sf_context_id = fid_map_to_context(h5_fid);
+    subfiling_context_t *sf_context = get_subfiling_object(sf_context_id);
+
+    assert(sf_context != NULL);
+
+    /*
+     * Call the underlying write function for each vector element.
+     * NOTE: the dtype_extent argument is fixed at 1 (byte-oriented IO),
+     * so each sizes[k] is interpreted as a byte count.
+     */
+    for (k = 0; k < count; k++) {
+        if (write__independent(sf_context->topology->n_io_concentrators,
+                sf_context_id, (int64_t) addrs[k], (int64_t) sizes[k], 1,
+                bufs[k]) < 0) {
+            printf("%s - encountered an internal error!\n", __func__);
+            goto errors;
+        }
+    }
+    /* All vector elements were written successfully */
+    return ret_value;
+
+errors:
+    return FAIL;
+}
+
+int
+sf_truncate(hid_t h5_fid, haddr_t H5_ATTR_PARALLEL_UNUSED addr)
+{
+ hid_t sf_context_id = fid_map_to_context(h5_fid);
+ subfiling_context_t *sf_context = get_subfiling_object(sf_context_id);
+
+ assert(sf_context != NULL);
+
#if 0
- acks[indices[i]] = 0;
- MPI_Irecv(&acks[indices[i]], 1, MPI_INT, io_concentrator[indices[i]], COMPLETED, sf_context->sf_data_comm, &completed[indices[i]]);
- awaiting_completion++;
+ if (sf_context->topology->n_io_concentrators > 1) {
+ if (MPI_Allreduce(&addr_in, &addr_max, 1, MPI_INT64_T, MPI_MAX, sf_context->sf_data_comm) != MPI_SUCCESS) {
+ addr_max = (int64_t)addr;
+ }
+ }
+ if (sf_context->topology->rank_is_ioc) {
+ int container_count = sf_context->topology->n_io_concentrators;
+ int64_t stripe_size = sf_context->sf_stripe_size;
+ int64_t addr_max_stripe_id = addr_max / stripe_size;
+ int64_t offset_in_stripe = addr_max % stripe_size;
+ int max_row = (int)(addr_max_stripe_id / container_count);
+ int addr_max_ioc = (int)(addr_max_stripe_id % container_count);
+ /*
+ * Subfiling storage can be thought of as a 2D array in which each row
+ * contains N columns (containers). The containers have a fixed width
+ * so that number of bytes in any "row" is (# of containers) X stripe_size.
+ *
+ * Given any offset, we can identify the 'row' of the specified offset
+ * as well as the offset within row and thus the specific container and
+ * actual offset within that container.
+ */
+ int64_t row_start = max_row * stripe_size;
+ int64_t container_addr_max = row_start + stripe_size;
+ if (sf_context->topology->subfile_rank == addr_max_ioc) {
+ container_addr_max = row_start + offset_in_stripe;
+ }
+ else if (sf_context->topology->subfile_rank < addr_max_ioc) {
+ container_addr_max = row_start + stripe_size;
+ }
+ if(-1 == HDftruncate(sf_context->sf_fid, (HDoff_t)container_addr_max)) {
+ puts("truncate failed!");
+ return -1;
+ }
+ }
#endif
- n_waiting--;
+ return 0;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: Internal close__subfiles
+ *
+ * Purpose:     When closing an HDF5 file, we need to close any associated
+ * subfiles as well. This function cycles through all known
+ * IO Concentrators to send a file CLOSE_OP command.
+ *
+ * This function is collective across all MPI ranks which
+ * have opened HDF5 file which associated with the provided
+ * sf_context. Once the request has been issued by all
+ * ranks, the subfile at each IOC will be closed and an
+ * completion ACK will be received.
+ *
+ * Once the subfiles are closed, we initiate a teardown of
+ * the IOC and associated thread_pool threads.
+ *
+ * Return:      Success (0) or Failure (non-zero)
+ * Errors: If MPI operations fail for some reason.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+static int
+close__subfiles(
+ subfiling_context_t *sf_context, int n_io_concentrators, hid_t fid)
+{
+ int i, status;
+ int global_errors = 0, errors = 0;
+ int n_waiting = 0;
+ int indices[n_io_concentrators];
+ int ioc_acks[n_io_concentrators];
+ MPI_Request reqs[n_io_concentrators];
+ int * io_concentrator = sf_context->topology->io_concentrator;
+
+ /* The map from fid to context can now be cleared */
+ clear_fid_map_entry(fid);
+
+ for (i = 0; i < n_io_concentrators; i++) {
+ int64_t msg[3] = {0, 0, sf_context->sf_context_id};
+ status = MPI_Ssend(msg, 3, MPI_INT64_T, io_concentrator[i], CLOSE_OP,
+ sf_context->sf_msg_comm);
+ if (status == MPI_SUCCESS) {
+ status = MPI_Irecv(&ioc_acks[i], 1, MPI_INT, io_concentrator[i],
+ COMPLETED, sf_context->sf_data_comm, &reqs[i]);
}
+ if (status != MPI_SUCCESS) {
+ printf("[%d] MPI close_subfiles failure!", sf_world_rank);
+ errors++;
+ } else
+ n_waiting++;
}
- while(awaiting_completion) {
+
+ while (n_waiting) {
int ready = 0;
- status = MPI_Waitsome(n_io_concentrators, completed, &ready, indices, stats);
+ status = MPI_Waitsome(
+ n_io_concentrators, reqs, &ready, indices, MPI_STATUSES_IGNORE);
if (status != MPI_SUCCESS) {
- int len;
+ int len;
char estring[MPI_MAX_ERROR_STRING];
MPI_Error_string(status, estring, &len);
printf("[%d %s] MPI_ERROR! MPI_Waitsome returned an error(%s)\n",
- sf_world_rank, __func__, estring );
+ sf_world_rank, __func__, estring);
fflush(stdout);
errors++;
}
-
- for(i=0; i < ready; i++) {
- /* One of the Issend calls has completed
- * Wait for another ACK to indicate that the data as been written
- * to the subfile.
- */
- acks[indices[i]] = 0;
- awaiting_completion--;
+ for (i = 0; i < ready; i++) {
+ n_waiting--;
}
}
- if (errors) return -1;
- return status;
+ if (sf_context->topology->rank_is_ioc) {
+ finalize_ioc_threads();
+ wait_for_thread_main();
+ }
+
+ status = MPI_Allreduce(
+ &errors, &global_errors, 1, MPI_INT, MPI_SUM, sf_context->sf_data_comm);
+
+ if (status != MPI_SUCCESS) {
+ global_errors++;
+ }
+ return global_errors;
}
-int sf_close_subfiles(hid_t context_id)
+/*-------------------------------------------------------------------------
+ * Function: Public/Client sf_close_subfiles
+ *
+ * Purpose: This is a simple wrapper function for the internal version
+ * which actually manages all subfile closing via commands
+ * to the set of IO Concentrators.
+ *
+ * Return:      Success (0) or Failure (non-zero)
+ * Errors: If MPI operations fail for some reason.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+int
+sf_close_subfiles(hid_t fid)
{
- int i, status;
- int errors = 0;
- int n_waiting = 0;
- int indices[n_io_concentrators];
- int ioc_acks[n_io_concentrators];
- MPI_Request reqs[n_io_concentrators];
- subfiling_context_t *sf_context = get_subfiling_object(context_id);
-
- for (i=0; i < n_io_concentrators; i++) {
- int64_t msg[2] = {0, 0};
- status = MPI_Ssend(msg, 2, MPI_INT64_T, io_concentrator[i], CLOSE_OP, sf_context->sf_msg_comm);
- if (status == MPI_SUCCESS) {
- status = MPI_Irecv(&ioc_acks[i], 1, MPI_INT, io_concentrator[i], COMPLETED, sf_context->sf_data_comm, &reqs[i]);
- }
- if (status != MPI_SUCCESS) {
- printf("[%d] MPI close_subfiles failure!", sf_world_rank);
- errors++;
- }
- else n_waiting++;
- }
- while(n_waiting) {
- int ready = 0;
- status = MPI_Waitsome(n_io_concentrators, reqs, &ready, indices, MPI_STATUSES_IGNORE);
- if (status != MPI_SUCCESS) {
- int len;
- char estring[MPI_MAX_ERROR_STRING];
- MPI_Error_string(status, estring, &len);
- printf("[%d %s] MPI_ERROR! MPI_Waitsome returned an error(%s)\n",
- sf_world_rank, __func__, estring );
- fflush(stdout);
- errors++;
- }
- for(i=0; i < ready; i++) {
- n_waiting--;
- }
- }
- return errors;
+ hid_t context_id = fid_map_to_context(fid);
+ subfiling_context_t *sf_context = get_subfiling_object(context_id);
+ assert(sf_context != NULL);
+ return close__subfiles(
+ sf_context, sf_context->topology->n_io_concentrators, fid);
}
-int sf_open_subfiles(hid_t context_id, char *prefix, int flags)
+/*-------------------------------------------------------------------------
+ * Function: Internal open__subfiles
+ *
+ * Purpose:     While we cannot know a priori whether an HDF client will
+ * need to access data across the entirety of a file, e.g.
+ * an individual MPI rank may read or write only small
+ * segments of the entire file space; this function sends
+ * a file OPEN_OP to every IO concentrator.
+ *
+ * Prior to opening any subfiles, the H5FDopen will have
+ * created an HDF5 file with the user specified naming.
+ * A path prefix will be selected and is available as
+ * an input argument.
+ *
+ * The opened HDF5 file handle will contain device and
+ * inode values, these being constant for all processes
+ * opening the shared file. The inode value is utilized
+ * as a key value and is associated with the sf_context
+ *              which we receive as one of the input arguments.
+ *
+ * IO Concentrator threads will be initialized on MPI ranks
+ *              which have been identified via application topology
+ * discovery. The number and mapping of IOC to MPI_rank
+ * is part of the sf_context->topology structure.
+ *
+ * Return:      Success (0) or Failure (non-zero)
+ * Errors: If MPI operations fail for some reason.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+
+static int
+open__subfiles(subfiling_context_t *sf_context, int n_io_concentrators,
+ hid_t fid, char *prefix, int flags)
{
- int i, status;
- int n_waiting = 0;
- int indices[n_io_concentrators];
- int ioc_acks[n_io_concentrators];
- MPI_Request reqs[n_io_concentrators];
- subfiling_context_t *sf_context = get_subfiling_object(context_id);
-
- if ((sf_context->subfile_prefix != NULL) && (prefix != NULL)) {
- if (strcmp(sf_context->subfile_prefix, prefix) != 0) {
- sf_context->subfile_prefix = strdup(prefix);
- }
- }
+ int i, ret, status, n_waiting = 0;
+ int * io_concentrator = NULL;
+ int indices[n_io_concentrators];
+ int ioc_acks[n_io_concentrators];
+ MPI_Request reqs[n_io_concentrators];
+
+ assert(sf_context != NULL);
+
+ if (prefix) {
+ if (sf_context->subfile_prefix) {
+ if (strcmp(sf_context->subfile_prefix, prefix) != 0) {
+ sf_context->subfile_prefix = strdup(prefix);
+ }
+ } else {
+ sf_context->subfile_prefix = strdup(prefix);
+ }
+ sf_subfile_prefix = sf_context->subfile_prefix;
+ }
+
+ /*
+ * Save the HDF5 file id (fid) to subfile context mapping.
+ * There shouldn't be any issue, but check the status and
+ * return if there was a problem.
+ */
+ ret = record_fid_to_subfile(fid, sf_context->sf_context_id, NULL);
+ if (ret != SUCCEED) {
+ printf("[%d - %s] Error mapping hdf5 file to a subfiling context\n",
+ sf_context->topology->world_rank, __func__);
+ return -1;
+ }
+
+ /* We already know the number of IO concentrators, but
+ * grab the mapping of IO concentrator to MPI ranks for our
+ * messaging loop.
+ */
+ io_concentrator = sf_context->topology->io_concentrator;
+
+ for (i = 0; i < n_io_concentrators; i++) {
+ int64_t msg[3] = {flags, fid, sf_context->sf_context_id};
+
+#ifndef NDEBUG
+ if (sf_verbose_flag) {
+ if (sf_logfile) {
+ fprintf(sf_logfile, "[%d] file open request (flags = %0lx)\n",
+ sf_world_rank, msg[0]);
+ }
+ }
+#endif
+ /* Send the open_op message to an IOC */
+ status = MPI_Ssend(msg, 3, MPI_INT64_T, io_concentrator[i], OPEN_OP,
+ sf_context->sf_msg_comm);
+
+ /* Check for errors */
+ if (status == MPI_SUCCESS) {
+ /* And post a receive for the open file ACK */
+ status = MPI_Irecv(&ioc_acks[i], 1, MPI_INT, io_concentrator[i],
+ COMPLETED, sf_context->sf_data_comm, &reqs[i]);
+ }
- for (i=0; i < n_io_concentrators; i++) {
- int64_t msg[2] = {flags, 0};
- if (sf_verbose_flag) {
- printf("[%d] file open request (flags = %0lx)\n", sf_world_rank, msg[0]);
- }
- status = MPI_Ssend(msg, 2, MPI_INT64_T, io_concentrator[i], OPEN_OP, sf_context->sf_msg_comm);
- if (status == MPI_SUCCESS) {
- status = MPI_Irecv(&ioc_acks[i], 1, MPI_INT, io_concentrator[i], COMPLETED, sf_context->sf_data_comm, &reqs[i]);
- }
if (status != MPI_SUCCESS) {
printf("[%d] MPI close_subfiles failure!", sf_world_rank);
- }
- else n_waiting++;
- }
- while(n_waiting) {
- int ready = 0;
- status = MPI_Waitsome(n_io_concentrators, reqs, &ready, indices, MPI_STATUSES_IGNORE);
- if (status != MPI_SUCCESS) {
- int len;
+ } else
+ n_waiting++;
+ } /* END - for loop */
+
+ /* Wait for all (n_waiting) ACK messages to be received */
+ while (n_waiting) {
+ int ready = 0;
+ status = MPI_Waitsome(
+ n_io_concentrators, reqs, &ready, indices, MPI_STATUSES_IGNORE);
+ if (status != MPI_SUCCESS) {
+ int len;
char estring[MPI_MAX_ERROR_STRING];
MPI_Error_string(status, estring, &len);
printf("[%d %s] MPI_ERROR! MPI_Waitsome returned an error(%s)\n",
- sf_world_rank, __func__, estring );
+ sf_world_rank, __func__, estring);
fflush(stdout);
- }
- for(i=0; i < ready; i++) {
- n_waiting--;
- }
- }
+ }
+
+ for (i = 0; i < ready; i++) {
+ n_waiting--;
+ }
+ } /* END - while */
- return 0;
+ return 0;
}
-
+
+/*-------------------------------------------------------------------------
+ * Function: Public/Client open_subfiles
+ *
+ * Purpose: Wrapper for the internal 'open__subfiles' function
+ * Similar to the other public wrapper functions, we
+ * discover (via the sf_context) the number of io concentrators
+ * and pass that to the internal function so that vector
+ * storage arrays can be stack based rather than explicitly
+ * allocated and freed.
+ *
+ *              The Internal function is responsible for sending all IOC
+ * instances, the (sub)file open requests.
+ *
+ * Prior to calling the internal open function, we initialize
+ * a new subfiling context that contains topology info and
+ * new MPI communicators that facilitate messaging between
+ * HDF5 clients and the IOCs.
+ *
+ * Return:      Success (0) or Failure (non-zero)
+ * Errors: If MPI operations fail for some reason.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
int
-ioc_main(subfiling_context_t *context)
+sf_open_subfiles(hid_t fid, char *filename, char *prefix, int flags)
{
- int subfile_rank;
- int flag, ret;
- int max_work_depth;
- int my_shutdown_flag = 0;
- MPI_Status status, msg_status;
- sf_work_request_t *incoming_requests = NULL;
- useconds_t delay = 20;
-
- assert(context != NULL);
- subfile_rank = context->sf_group_rank;
- if (request_count_per_rank == NULL) {
- request_count_per_rank = (int *)calloc((size_t)sf_world_size, sizeof(int));
- assert(request_count_per_rank != NULL);
- }
+ int status;
+ int64_t context_id = -1;
+ subfiling_context_t *sf_context = NULL;
+ sf_ioc_selection_t ioc_selection;
+ char *option_arg = get_ioc_selection_criteria(&ioc_selection);
+
+ status = H5FDsubfiling_init(ioc_selection, option_arg, &context_id);
+ if (status != SUCCEED) {
+ puts("H5FDsubfiling_init failed!");
+ return -1;
+ }
+#if 0
+ printf("[%d %s]\n", sf_world_rank, __func__);
+#endif
+
+ sf_context = get_subfiling_object(context_id);
+ assert(sf_context != NULL);
+
+ sf_context->sf_context_id = context_id;
+ sf_context->h5_file_id = fid;
+ sf_context->filename = strdup(filename);
+ sf_shutdown_flag = 0;
+
+ return open__subfiles(sf_context, sf_context->topology->n_io_concentrators,
+ fid, prefix, flags);
+}
- max_work_depth = MAX(8, sf_world_size * MAX_WORK_PER_RANK);
- incoming_requests = (sf_work_request_t *)calloc((size_t)(max_work_depth +1), sizeof(sf_work_request_t));
- assert(incoming_requests != NULL);
+/*-------------------------------------------------------------------------
+ * Function: Public/Client set_verbose_flag
+ *
+ * Purpose: For debugging purposes, I allow a verbose setting to
+ *              have printing of relevant information into an IOC specific
+ * file that is opened as a result of enabling the flag
+ * and closed when the verbose setting is disabled.
+ *
+ * Return: None
+ * Errors: None
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+void
+set_verbose_flag(int subfile_rank, int new_value)
+{
+#ifndef NDEBUG
+ sf_verbose_flag = (int) (new_value & 0x0FF);
-#ifdef DEBUG_TRACING
- char logname[64];
- sprintf(logname,"ioc_%d.log", subfile_rank);
- sf_logfile = fopen(logname, "w+");
+ if (sf_verbose_flag) {
+ char logname[64];
+ sprintf(logname, "ioc_%d.log", subfile_rank);
+ sf_logfile = fopen(logname, "w+");
+ } else if (sf_logfile) {
+ fclose(sf_logfile);
+ sf_logfile = NULL;
+ }
#endif
- /* Initialize atomic vars */
- atomic_init(&sf_workinprogress, 0);
- atomic_init(&sf_work_pending, 0);
- atomic_init(&sf_file_close_count, 0);
- atomic_init(&sf_file_refcount, 0);
-
- sf_msg_comm = context->sf_msg_comm; /* Messages IN */
- sf_data_comm = context->sf_data_comm; /* Messages OUT */
-
- while(!sf_shutdown_flag || sf_work_pending) {
+ return;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: Public/IOC ioc_main
+ *
+ * Purpose: This is the principal function run by the IO Concentrator
+ * main thread. It remains within a loop until allowed to
+ * exit by means of setting the 'sf_shutdown_flag'. This
+ *              is usually accomplished as part of the file close operation.
+ *
+ * The function implements an asynchronous polling approach
+ * for incoming messages. These messages can be thought of
+ * as a primitive RPC which utilizes MPI TAGs to code and
+ * implement the desired subfiling functionality.
+ *
+ *              As each incoming message is received, it gets added to
+ * a queue for processing by a thread_pool thread.
+ * The message handlers are dispatched via the
+ * "handle_work_request" ftn (see H5FDsubfile_thread.c)
+
+ * Subfiling is effectively a software RAID-0 implementation
+ * where having multiple IO Concentrators and independent
+ * subfiles is equated to the multiple disks and a true
+ *              hardware-based RAID implementation.
+ *
+ * IO Concentrators are ordered according to their MPI rank.
+ * In the simplest interpretation, IOC(0) will always contain
+ * the initial bytes of the logical disk image. Byte 0 of
+ * IOC(1) will contain the byte written to the logical disk
+ * offset "stripe_size" X IOC(number).
+ *
+ * Example: If the stripe size is defined to be 256K, then
+ * byte 0 of subfile(1) is at logical offset 262144 of the
+ * file. Similarly, byte 0 of subfile(2) represents the
+ * logical file offset = 524288. For logical files larger
+ * than 'N' X stripe_size, we simply "wrap around" back to
+ * subfile(0). The following shows the mapping of 30
+ * logical blocks of data over 3 subfiles:
+ * +--------+--------+--------+--------+--------+--------+
+ * | blk(0 )| blk(1) | blk(2 )| blk(3 )| blk(4 )| blk(5 )|
+ * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) |
+ * +--------+--------+--------+--------+--------+--------+
+ * | blk(6 )| blk(7) | blk(8 )| blk(9 )| blk(10)| blk(11)|
+ * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) |
+ * +--------+--------+--------+--------+--------+--------+
+ * | blk(12)| blk(13)| blk(14)| blk(15)| blk(16)| blk(17)|
+ * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) |
+ * +--------+--------+--------+--------+--------+--------+
+ * | blk(18)| blk(19)| blk(20)| blk(21)| blk(22)| blk(23)|
+ * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) |
+ * +--------+--------+--------+--------+--------+--------+
+ * | blk(24)| blk(25)| blk(26)| blk(27)| blk(28)| blk(29)|
+ * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) |
+ * +--------+--------+--------+--------+--------+--------+
+ *
+ * Return: None
+ * Errors: None
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+int
+ioc_main(int64_t context_id)
+{
+ int subfile_rank;
+ int flag, ret;
+ int max_work_depth;
+ MPI_Status status, msg_status;
+ sf_work_request_t * incoming_requests = NULL;
+ useconds_t delay = 20;
+ subfiling_context_t *context = get_subfiling_object(context_id);
+
+ assert(context != NULL);
+ /* We can't have opened any files at this point.. */
+ context->sf_fid = -1;
+
+ subfile_rank = context->sf_group_rank;
+ if (request_count_per_rank == NULL) {
+ request_count_per_rank =
+ (int *) calloc((size_t) sf_world_size, sizeof(int));
+ assert(request_count_per_rank != NULL);
+ }
+
+ max_work_depth = MAX(8, sf_world_size * MAX_WORK_PER_RANK);
+ incoming_requests = (sf_work_request_t *) calloc(
+ (size_t)(max_work_depth + 1), sizeof(sf_work_request_t));
+
+ /* Validate that the allocation succeeded */
+ assert(incoming_requests != NULL);
+
+ /* Initialize atomic vars */
+ atomic_init(&sf_workinprogress, 0);
+ atomic_init(&sf_work_pending, 0);
+ atomic_init(&sf_file_close_count, 0);
+ atomic_init(&sf_file_refcount, 0);
+ atomic_init(&sf_ioc_fini_refcount, 0);
+
+ sf_open_file_count = 0;
+ sf_close_file_count = 0;
+ sf_ops_after_first_close = 0;
+
+#if 0
+ printf("Starting IOC! mpi_rank=%d\n", sf_world_rank);
+ fflush(stdout);
+#endif
+
+ while (!sf_shutdown_flag || sf_work_pending) {
flag = 0;
- ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, context->sf_msg_comm, &flag, &status);
+ ret = MPI_Iprobe(
+ MPI_ANY_SOURCE, MPI_ANY_TAG, context->sf_msg_comm, &flag, &status);
if ((ret == MPI_SUCCESS) && (flag != 0)) {
sf_work_request_t *msg = NULL;
- int count;
- int request_size = (int)sizeof(sf_work_request_t);
- int source = status.MPI_SOURCE;
- int tag = status.MPI_TAG;
+ int count;
+ int request_size = (int) sizeof(sf_work_request_t);
+ int source = status.MPI_SOURCE;
+ int tag = status.MPI_TAG;
MPI_Get_count(&status, MPI_BYTE, &count);
if (count > request_size) {
- msg = (sf_work_request_t *) malloc((size_t)count);
- ret = MPI_Recv(msg,count,MPI_BYTE, source, tag, context->sf_msg_comm, &msg_status);
- }
- else {
- ret = MPI_Recv(&incoming_requests[sf_workinprogress],count, MPI_BYTE,
- source, tag, context->sf_msg_comm, &msg_status);
+ msg = (sf_work_request_t *) malloc((size_t) count);
+ ret = MPI_Recv(msg, count, MPI_BYTE, source, tag,
+ context->sf_msg_comm, &msg_status);
+ } else {
+ ret = MPI_Recv(&incoming_requests[sf_workinprogress], count,
+ MPI_BYTE, source, tag, context->sf_msg_comm, &msg_status);
}
if (ret == MPI_SUCCESS) {
-#ifdef DEBUG_TRACING
- printf("[[ioc(%d) msg from %d tag=%x, datasize=%ld, foffset=%ld]]\n", subfile_rank, source, tag,
- incoming_requests[sf_workinprogress].header[0],
- incoming_requests[sf_workinprogress].header[1]);
- fflush(stdout);
+#if 0
+ if (tag == OPEN_OP) {
+ sf_open_file_count++;
+ printf("source=%d: sf_open_file_count = %d\n", source, sf_open_file_count);
+ fflush(stdout);
+ }
+ else if (tag == CLOSE_OP) {
+ sf_close_file_count++;
+ printf("source=%d: sf_close_file_count = %d\n", source, sf_close_file_count);
+ fflush(stdout);
+ }
+ else {
+ printf("ioc(0): tag=%d\n", tag);
+ fflush(stdout);
+ if (sf_close_file_count) {
+ sf_ops_after_first_close++;
+ if (sf_close_file_count == sf_world_size) {
+ printf("op=%d from source(%d) after file close! sf_open_file_count=%d\n", tag, source, sf_open_file_count);
+ fflush(stdout);
+ }
+ }
+ }
#endif
if (msg) {
- msg->tag = tag;
msg->source = source;
- msg->subfile_rank = subfile_rank;
+ msg->subfile_rank = subfile_rank;
+ msg->context_id = context->sf_context_id;
tpool_add_work(msg);
- }
- else {
- int index = atomic_load(&sf_workinprogress);
- incoming_requests[sf_workinprogress].tag = tag;
- incoming_requests[sf_workinprogress].source = source;
- incoming_requests[sf_workinprogress].subfile_rank = subfile_rank;
- tpool_add_work(&incoming_requests[sf_workinprogress]);
- if (index == max_work_depth -1) {
- atomic_init(&sf_workinprogress, 0);
- }
- else {
- atomic_fetch_add(&sf_workinprogress, 1); // atomic
- }
+ } else {
+ int index = atomic_load(&sf_workinprogress);
+ incoming_requests[index].tag = tag;
+ incoming_requests[index].source = source;
+ incoming_requests[index].subfile_rank = subfile_rank;
+ tpool_add_work(&incoming_requests[index]);
+ if (index == max_work_depth - 1) {
+ atomic_init(&sf_workinprogress, 0);
+ } else {
+ atomic_fetch_add(&sf_workinprogress, 1); // atomic
+ }
}
}
+ } else {
+ usleep(delay);
}
- else {
- begin_thread_exclusive();
- my_shutdown_flag = sf_shutdown_flag;
- end_thread_exclusive();
- usleep(delay);
- }
}
-#ifdef DEBUG_TRACING
- fclose(sf_logfile);
+#ifndef NDEBUG
+ if (sf_logfile) {
+ fclose(sf_logfile);
+ sf_logfile = NULL;
+ }
#endif
- if (incoming_requests) {
- free(incoming_requests);
- }
+ if (incoming_requests) {
+ free(incoming_requests);
+ }
- return 0;
+ /* Reset the shutdown flag */
+ sf_shutdown_flag = 0;
+
+ return 0;
}
/*
@@ -1297,23 +2289,36 @@ Private helper functions
=========================================
*/
-static int send_ack__(int target, int subfile_rank, int tag, MPI_Comm comm)
+static int
+send_ack__(int target, int subfile_rank, int tag, MPI_Comm comm)
{
int ack = 1;
int ret = MPI_Send(&ack, 1, MPI_INT, target, tag, comm);
+#ifndef NDEBUG
if (sf_verbose_flag) {
- printf("[ioc(%d): Sending ACK to MPI_rank(%d)\n", subfile_rank, target);
+ if (sf_logfile) {
+ fprintf(sf_logfile, "[ioc(%d): Sending ACK to MPI_rank(%d)\n",
+ subfile_rank, target);
+ }
}
+#endif
return ret;
}
-static int send_nack__(int target, int subfile_rank, int tag, MPI_Comm comm)
+static int
+send_nack__(int target, int subfile_rank, int tag, MPI_Comm comm)
{
int nack = 0;
int ret = MPI_Send(&nack, 1, MPI_INT, target, tag, comm);
+
+#ifndef NDEBUG
if (sf_verbose_flag) {
- printf("[ioc(%d): Sending NACK to MPI_rank(%d)\n", subfile_rank, target);
+ if (sf_logfile) {
+ fprintf(sf_logfile, "[ioc(%d): Sending NACK to MPI_rank(%d)\n",
+ subfile_rank, target);
+ }
}
+#endif
return ret;
}
@@ -1324,30 +2329,103 @@ from the thread pool threads...
=========================================
*/
-int queue_write_coll(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm)
+/*-------------------------------------------------------------------------
+ * Function: Public/IOC queue_write_coll
+ *
+ * Purpose: Collective write function (NOT currently implemented)
+ *
+ * Return: 0 (this function is not currently implemented
+ * and always reports success).
+ * Errors: None.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+queue_write_coll(sf_work_request_t H5_ATTR_PARALLEL_UNUSED *msg,
+ int H5_ATTR_PARALLEL_UNUSED subfile_rank,
+ int H5_ATTR_PARALLEL_UNUSED source, MPI_Comm H5_ATTR_PARALLEL_UNUSED comm)
{
- return 0;
+ return 0;
}
-int queue_read_coll(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm)
+/*-------------------------------------------------------------------------
+ * Function: Public/IOC queue_read_coll
+ *
+ * Purpose: Collective read function (NOT currently implemented)
+ *
+ * Return: 0 (this function is not currently implemented
+ * and always reports success).
+ * Errors: None.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+queue_read_coll(sf_work_request_t H5_ATTR_PARALLEL_UNUSED *msg,
+ int H5_ATTR_PARALLEL_UNUSED subfile_rank,
+ int H5_ATTR_PARALLEL_UNUSED source, MPI_Comm H5_ATTR_PARALLEL_UNUSED comm)
{
- return 0;
+ return 0;
}
-int queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm)
+/*-------------------------------------------------------------------------
+ * Function: Public/IOC queue_write_indep
+ *
+ * Purpose: Implement the IOC independent write function. The
+ * function is invoked as a result of the IOC receiving the
+ * "header"/RPC. What remains is to allocate memory for the
+ * data sent by the client and then write the data to our
+ * subfile. We utilize pwrite for the actual file writing.
+ * File flushing is done at file close.
+ *
+ * Return: The integer status of the write operation.
+ * Successful operations will return 0.
+ * Errors: -1 or an MPI related error value.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+queue_write_indep(
+ sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm)
{
- char *recv_buffer = NULL;
- int ret = MPI_SUCCESS;
- MPI_Status msg_status;
- int64_t data_size = msg->header[0];
- int64_t file_offset = msg->header[1];
- int fd;
+ int fd;
+ char * recv_buffer = NULL;
+ int ret = MPI_SUCCESS;
+ MPI_Status msg_status;
+ int64_t data_size = msg->header[0];
+ int64_t file_offset = msg->header[1];
+ int64_t file_context_id = msg->header[2];
+ subfiling_context_t *sf_context = get_subfiling_object(file_context_id);
+ assert(sf_context != NULL);
+
+ /* flag that we've attempted to write data to the file */
+ sf_context->sf_write_count++;
+
+#ifndef NDEBUG
if (sf_verbose_flag) {
- printf("[ioc(%d) %s]: msg from %d: datasize=%ld\toffset=%ld\n", subfile_rank, __func__, source, data_size, file_offset );
- fflush(stdout);
+ if (sf_logfile) {
+ fprintf(sf_logfile,
+ "[ioc(%d) %s]: msg from %d: datasize=%ld\toffset=%ld\n",
+ subfile_rank, __func__, source, data_size, file_offset);
+ }
}
+#endif
if (recv_buffer == NULL) {
- if ((recv_buffer = (char *)malloc((size_t)data_size)) == NULL) {
+ if ((recv_buffer = (char *) malloc((size_t) data_size)) == NULL) {
perror("malloc");
send_nack__(source, subfile_rank, WRITE_INDEP_ACK, comm);
return -1;
@@ -1356,287 +2434,678 @@ int queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_
send_ack__(source, subfile_rank, WRITE_INDEP_ACK, comm);
- ret = MPI_Recv(recv_buffer, (int)data_size, MPI_BYTE, source, WRITE_INDEP_DATA, comm, &msg_status );
+ ret = MPI_Recv(recv_buffer, (int) data_size, MPI_BYTE, source,
+ WRITE_INDEP_DATA, comm, &msg_status);
+
+#ifndef NDEBUG
+ if (sf_verbose_flag) {
+ if (sf_logfile) {
+ fprintf(sf_logfile,
+ "[ioc(%d) %s] MPI_Recv(%ld bytes, from = %d) status = %d\n",
+ subfile_rank, __func__, data_size, source, ret);
+ }
+ }
+#endif
+
if (ret != MPI_SUCCESS) {
- int len;
+ int len;
char estring[MPI_MAX_ERROR_STRING];
MPI_Error_string(ret, estring, &len);
- printf("[ioc(%d) %s] MPI_ERROR(%d)! MPI_Recv of %ld bytes from %d returned an error(%s)\n",
- subfile_rank, __func__, msg_status.MPI_ERROR, data_size, source, estring );
+ printf("[ioc(%d) %s] MPI_ERROR(%d)! MPI_Recv of %ld bytes from %d "
+ "returned an error(%s)\n",
+ subfile_rank, __func__, msg_status.MPI_ERROR, data_size, source,
+ estring);
fflush(stdout);
return ret;
- } else if(sf_verbose_flag) {
- printf("[ioc(%d) %s] MPI_Recv success. Writing %ld bytes from rank %d to disk\n",
- subfile_rank, __func__, data_size, source);
- fflush(stdout);
}
- if ((fd = subfile_fid) < 0) {
- printf("[ioc(%d)] WARNING: %s called while subfile_fid = %d (closed)\n", subfile_rank, __func__, subfile_fid);
- fflush(stdout);
- }
- else if (sf_write_data(fd, file_offset, recv_buffer, data_size, subfile_rank ) < 0) {
- free(recv_buffer);
- recv_buffer = NULL;
- printf("[ioc(%d) %s] sf_write_data returned an error!\n", subfile_rank, __func__);
+ fd = sf_context->sf_fid;
+
+ if (fd < 0) {
+ printf("[ioc(%d)] WARNING: %s called while subfile_fid = %d (closed)\n",
+ subfile_rank, __func__, fd);
+ fflush(stdout);
+ } else if (sf_write_data(
+ fd, file_offset, recv_buffer, data_size, subfile_rank) < 0) {
+ free(recv_buffer);
+ recv_buffer = NULL;
+ printf("[ioc(%d) %s] sf_write_data returned an error!\n", subfile_rank,
+ __func__);
fflush(stdout);
return -1;
}
- /* Done... */
- // send_ack__(source, subfile_rank, COMPLETED, comm);
+ /* Done... */
if (recv_buffer) {
- free(recv_buffer);
- }
- return 0;
+ free(recv_buffer);
+ }
+ return 0;
}
-int queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm)
+/*-------------------------------------------------------------------------
+ * Function: Public/IOC queue_read_indep
+ *
+ * Purpose: Implement the IOC independent read function. The
+ * function is invoked as a result of the IOC receiving the
+ * "header"/RPC. What remains is to allocate memory for
+ * reading the data and then to send this to the client.
+ * We utilize pread for the actual file reading.
+ *
+ * Return: The integer status returned by the Internal read_independent
+ * function. Successful operations will return 0.
+ * Errors: An MPI related error value.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+queue_read_indep(
+ sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm)
{
- char *send_buffer = NULL;
- int ret = MPI_SUCCESS;
- int64_t data_size = msg->header[0];
- int64_t file_offset = msg->header[1];
+ int fd;
+ char * send_buffer = NULL;
+ int ret = MPI_SUCCESS;
+ int64_t data_size = msg->header[0];
+ int64_t file_offset = msg->header[1];
+ int64_t file_context_id = msg->header[2];
+ subfiling_context_t *sf_context = get_subfiling_object(file_context_id);
+ assert(sf_context != NULL);
+
+ sf_context->sf_read_count++;
+
+ fd = sf_context->sf_fid;
+
+ if (fd < 0) {
+ printf("[ioc(%d) %s] subfile(%d) file descriptor not valid\n",
+ subfile_rank, __func__, fd);
+ return -1;
+ }
+ /* If there were writes to this file, we should flush the file cache
+ * before attempting to read the contents.
+ */
+ if (sf_context->sf_write_count) {
+ sf_context->sf_write_count = 0;
+ fdatasync(fd);
+ }
+#ifndef NDEBUG
if (sf_verbose_flag) {
- printf("[ioc(%d) %s] msg from %d: datasize=%ld\toffset=%ld\n", subfile_rank, __func__, source, data_size, file_offset );
- fflush(stdout);
+ if (sf_logfile) {
+ fprintf(sf_logfile,
+ "[ioc(%d) %s] msg from %d: datasize=%ld\toffset=%ld\n",
+ subfile_rank, __func__, source, data_size, file_offset);
+ }
}
- if ((send_buffer = (char *)malloc((size_t)data_size)) == NULL) {
+#endif
+ if ((send_buffer = (char *) malloc((size_t) data_size)) == NULL) {
perror("malloc");
return -1;
}
- if (sf_read_data(subfile_fid, file_offset, send_buffer, data_size, subfile_rank) < 0) {
- printf("[%d] %s - sf_read_data returned an error!\n", subfile_rank, __func__);
+ if (sf_read_data(fd, file_offset, send_buffer, data_size, subfile_rank) <
+ 0) {
+ printf("[%d] %s - sf_read_data for source(%d) returned an error! "
+ "read_count=%ld\n",
+ subfile_rank, __func__, source, sf_context->sf_read_count);
fflush(stdout);
return -1;
}
- ret = MPI_Send(send_buffer, (int)data_size, MPI_BYTE, source, READ_INDEP_DATA, comm);
+ ret = MPI_Send(
+ send_buffer, (int) data_size, MPI_BYTE, source, READ_INDEP_DATA, comm);
if (ret != MPI_SUCCESS) {
- int len;
+ int len;
char estring[MPI_MAX_ERROR_STRING];
MPI_Error_string(ret, estring, &len);
- printf("[ioc(%d)] ERROR! MPI_Send of %ld bytes to %d returned an error(%s)\n",subfile_rank, data_size, source, estring );
+ printf("[ioc(%d)] ERROR! MPI_Send of %ld bytes to %d returned an "
+ "error(%s)\n",
+ subfile_rank, data_size, source, estring);
fflush(stdout);
return ret;
}
+#ifndef NDEBUG
+ if (sf_verbose_flag) {
+ if (sf_logfile) {
+ fprintf(sf_logfile, "[ioc(%d)] MPI_Send to source(%d) completed\n",
+ subfile_rank, source);
+ }
+ }
+#endif
if (send_buffer) {
- free(send_buffer);
- send_buffer = NULL;
- }
+ free(send_buffer);
+ send_buffer = NULL;
+ }
- return 0;
+ return 0;
}
-
-int queue_file_open(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm)
+/*-------------------------------------------------------------------------
+ * Function: Public/IOC queue_file_open
+ *
+ * Purpose: Implement the IOC file open function. The
+ * function is invoked as a result of the IOC receiving the
+ * "header"/RPC. What remains is open the subfile if it
+ * isn't already open. This can happen if this function
+ * was invoked by another client process.
+ *
+ * Return: The integer status returned by the Internal read_independent
+ * function. Successful operations will return 0.
+ * Errors: An MPI related error value.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+queue_file_open(
+ sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm)
{
- int ret, req_count, errors=0;
- int ref_count;
- int flags = (int)(msg->header[0] & 0x0ffffffff);
- atomic_fetch_add(&sf_file_refcount, 1); // atomic
- ref_count = atomic_load(&sf_file_refcount);
- if (sf_verbose_flag) {
- printf("[ioc(%d) %s] file open flags = %0x, source=%d\n", subfile_rank, __func__, flags, source);
- fflush(stdout);
- }
-
- errors = subfiling_open_file(sf_subfile_prefix, subfile_rank, flags);
+ int ret, errors = 0;
+ int flags = (int) (msg->header[0] & 0x0ffffffff);
+ // int open_count;
+ atomic_fetch_add(&sf_file_refcount, 1); // atomic
+#ifndef NDEBUG
+ if (sf_verbose_flag) {
+ if (sf_logfile) {
+ fprintf(sf_logfile,
+ "[ioc(%d) %s] file open flags = %0x, source=%d\n", subfile_rank,
+ __func__, flags, source);
+ }
+ }
+#endif
+#if 0
+ printf("[ioc(%d) %s]\n", subfile_rank, __func__);
+ fflush(stdout);
+#endif
+ errors = subfiling_open_file(msg, sf_subfile_prefix, subfile_rank, flags);
+ // open_count = atomic_load(&sf_file_refcount);
- req_count = COMPLETED;
- ret = MPI_Send(&req_count, 1, MPI_INT, source, COMPLETED, comm);
+#if 1
+ ret = MPI_Send(&errors, 1, MPI_INT, source, COMPLETED, comm);
if (ret != MPI_SUCCESS) {
- errors++;
- }
- if (errors) {
- printf("[ioc(%d) %s] Error opening file\n", subfile_rank, __func__);
+ printf("[ioc(%d)] MPI_Send FILE_OPEN, COMPLETED to source(%d) FAILED\n",
+ subfile_rank, source);
fflush(stdout);
+ errors++;
}
+#else
+ if (open_count == sf_world_size) {
+ int i, k = (sf_world_rank +1);
+ for (i=0; i < sf_world_size; i++, k++) {
+ source = k % sf_world_size;
+ ret = MPI_Send(&errors, 1, MPI_INT, source, COMPLETED, comm);
+ if (ret != MPI_SUCCESS) {
+ printf("[ioc(%d)] MPI_Send FILE_OPEN, COMPLETED to source(%d) FAILED\n",
+ subfile_rank, source);
+ fflush(stdout);
+ errors++;
+ }
+ }
+ }
+#endif
+ if (errors) {
+#ifndef NDEBUG
+ if (sf_verbose_flag) {
+ if (sf_logfile) {
+ fprintf(sf_logfile, "[ioc(%d) %s] Error opening file\n",
+ subfile_rank, __func__);
+ }
+ }
+#endif
+ }
return errors;
}
-/*
+/*
* The decrement is somewhat of misnomer, i.e. we check the number of file open
* requests to the number of file close requests. When those values match, the
- * actual file gets closed via the callback_ftn. The effects a weak collective
- * on the file close operation. File opens on the other hand, can occur in
- * any random order and no collective semanitics are enforced.
+ * actual file gets closed via the callback_ftn. This effects a weak
+ * collective on the file close operation. File opens (*) on the other hand,
+ * can occur in any random order and no collective semantics are enforced.
+ *
+ * (*) Note that on the original file open, there are collective operations
+ * which take place to generate the MPI communications descriptors.
*/
-int decrement_file_ref_counts( int subfile_rank, int source, MPI_Comm comm, file_close_cb callback_ftn)
+int
+decrement_file_ref_counts(sf_work_request_t *msg, int subfile_rank,
+ int H5_ATTR_PARALLEL_UNUSED source, MPI_Comm comm,
+ file_close_cb callback_ftn)
{
- int close_count, open_count;
- atomic_fetch_add(&sf_file_close_count, 1); // atomic
- close_count = atomic_load(&sf_file_close_count);
- open_count = atomic_load(&sf_file_refcount);
-
- if (close_count == sf_world_size) {
- atomic_store(&sf_file_refcount, 0);
- atomic_store(&sf_file_close_count, 0); /* Complete the reset to zeros */
- while (!tpool_is_empty) {
- usleep(10);
- }
- if (callback_ftn(subfile_rank, comm) < 0) {
- printf("[ioc(%d) %s] callback_ftn returned an error\n", subfile_rank, __func__ );
+ int close_count, errors = 0;
+
+ atomic_fetch_add(&sf_file_close_count, 1); // atomic
+ close_count = atomic_load(&sf_file_close_count);
+
+ if (close_count == sf_world_size) {
+ int64_t file_context_id = msg->header[2];
+ subfiling_context_t *sf_context = get_subfiling_object(file_context_id);
+ assert(sf_context != NULL);
+
+ atomic_store(&sf_file_refcount, 0);
+ atomic_store(&sf_file_close_count, 0); /* Complete the reset to zeros */
+
+ /* Wait until any queued work has finished */
+ while (!tpool_is_empty()) {
+ usleep(20);
+ }
+
+ if (callback_ftn(subfile_rank, &sf_context->sf_fid, comm) < 0) {
+ printf("[ioc(%d) %s] callback_ftn returned an error\n",
+ subfile_rank, __func__);
fflush(stdout);
+ errors++;
+ } else {
+ sf_context->sf_fid = -1; /* reset the actual file descriptor */
}
}
- return 0;
+ return errors;
}
-/* Note: This function should be called ONLY when all clients
- * have called the CLOSE_OP on this IO Concentrator.
- * The IOC API maintains a reference count on subfiles
- * so that once that count is decremented to zero, the
- * decrement_file_ref_counts function will call here.
+/*-------------------------------------------------------------------------
+ * Function: Public/IOC subfiling_close_file
+ *
+ * Purpose: This function should be called ONLY when all clients
+ * have called the CLOSE_OP on this IO Concentrator.
+ * The IOC API maintains a reference count on subfiles
+ * so that once that count is decremented to zero, the
+ * decrement_file_ref_counts function will call here.
+ *
+ * Return: The integer status returned by the Internal read_independent
+ * function. Successful operations will return 0.
+ * Errors: An MPI related error value.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
*/
-int subfiling_close_file(int subfile_rank, MPI_Comm comm)
+int
+subfiling_close_file(int subfile_rank, int *fid, MPI_Comm comm)
{
- int ret, source = 0;
- int errors = 0, flag = COMPLETED;
+ int errors = 0;
+ int subfile_fid = *fid;
-#if 0
- printf("[ioc(%d) %s] subfile_fid = %d\n", subfile_rank, __func__, subfile_fid);
- fflush(stdout);
-#endif
if (subfile_fid >= 0) {
+ if (fdatasync(subfile_fid) < 0) {
+ perror("fdatasync");
+ printf("fdatasync(%d)\n", subfile_fid);
+ errors++;
+ }
+ }
-#if 0
+ errors += subfiling_shutdown(subfile_rank, fid, comm);
+
+ if (errors) {
+ printf("[ioc(%d) %s] Errors detected!\n", subfile_rank, __func__);
+ fflush(stdout);
+ }
+
+ return errors;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: Public/IOC subfiling_shutdown
+ *
+ * Purpose: This function gets called ONLY when all clients have
+ * invoked the file CLOSE_OP, which in turn decrements the
+ * file reference count maintained within the subfiling
+ * context. As a result, the subfiling_close_file call is
+ * invoked, forcing a file sync/flush and then calling
+ * function to close the local subfile and notify the
+ * clients with the close ACK to allow them to continue
+ * beyond the HDF5 file close function.
+ *
+ * Return: The integer status returned by the Internal read_independent
+ * function. Successful operations will return 0.
+ * Errors: An MPI related error value.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+subfiling_shutdown(int subfile_rank, int *fid, MPI_Comm comm)
+{
+ int ret, source = 0;
+ int subfile_fid = *fid;
+ int errors = 0, flag = COMPLETED;
+ if (subfile_fid >= 0) {
if (close(subfile_fid) < 0) {
- perror("subfiling_close_file");
- }
- subfile_fid = -1;
-#else
- fdatasync(subfile_fid);
-#endif
+ perror("subfiling_close_file");
+ printf("subfile_fid = %d\n", subfile_fid);
+ errors++;
+ }
+ *fid = -1;
}
+
+ /* Shutdown the main IOC thread */
+ sf_shutdown_flag = 1;
+ /* Allow ioc_main to exit.*/
+ usleep(40);
+
/* Notify all ranks */
for (source = 0; source < sf_world_size; source++) {
- /* Don't release our local MPI process until all
- * other ranks are released.
- */
- if (source == sf_world_rank) {
- continue;
- }
+ /* Don't release our local MPI process until all
+ * other ranks are released.
+ */
+ if (source == sf_world_rank) {
+ continue;
+ }
ret = MPI_Send(&flag, 1, MPI_INT, source, COMPLETED, comm);
- if (ret != MPI_SUCCESS) errors++;
+ if (ret != MPI_SUCCESS)
+ errors++;
}
- /* Release the local MPI process */
- ret = MPI_Send(&flag, 1, MPI_INT, sf_world_rank, COMPLETED, comm);
- if (ret != MPI_SUCCESS) errors++;
+ /* Release the local MPI process */
+ ret = MPI_Send(&flag, 1, MPI_INT, sf_world_rank, COMPLETED, comm);
+ if (ret != MPI_SUCCESS)
+ errors++;
if (errors) {
- printf("[ioc(%d) %s] Errors sending file close replies\n", subfile_rank, __func__);
+ printf("[ioc(%d) %s] Errors sending ioc_fini replies\n", subfile_rank,
+ __func__);
fflush(stdout);
}
return errors;
}
-int subfiling_open_file(const char *prefix, int subfile_rank, int flags)
+/*-------------------------------------------------------------------------
+ * Function: Public/IOC increment_ioc_fini_counts
+ *
+ * Purpose: UNUSED. Was originally implemented to manage the shutdown
+ * of IO Concentrators. The subfiling design changed to
+ * create IOC instances as part of FILE opens and shutdowns
+ * as part of file closing.
+ *
+ * Return: The integer status returned by the Internal read_independent
+ * function. Successful operations will return 0.
+ * Errors: An MPI related error value.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+increment_ioc_fini_counts(sf_work_request_t *msg, int subfile_rank,
+ int H5_ATTR_PARALLEL_UNUSED source, MPI_Comm comm,
+ file_close_cb callback_ftn)
{
- int errors = 0;
- /* Only the real IOCs open the subfiles
- * Once a file is opened, all additional file open requests
- * can return immediately.
- */
- if (subfile_rank >= 0) {
- char filepath[PATH_MAX];
- char config[PATH_MAX];
-
-
- if (subfile_fid < 0) {
- const char *dotconfig = ".subfile_config";
- mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
- if (prefix) {
- mkdir(prefix, S_IRWXU);
- sprintf(filepath, "%s/node_local_temp_%d_of_%d",
- prefix, subfile_rank, n_io_concentrators);
- sprintf(config, "%s/%s", prefix, dotconfig);
- }
- else {
- sprintf(filepath, "node_local_temp_%d_of_%d",
- subfile_rank,n_io_concentrators);
- strcpy(config, dotconfig);
- }
-
- begin_thread_exclusive();
+ int close_count, errors = 0;
+ atomic_fetch_add(&sf_ioc_fini_refcount, 1); // atomic
+ close_count = atomic_load(&sf_ioc_fini_refcount);
+
+ if (close_count == sf_world_size) {
+ int64_t file_context_id = msg->header[2];
+ subfiling_context_t *sf_context = get_subfiling_object(file_context_id);
+ assert(sf_context != NULL);
+ if (callback_ftn(subfile_rank, &sf_context->sf_fid, comm) < 0) {
+ printf("[ioc(%d) %s] callback_ftn returned an error\n",
+ subfile_rank, __func__);
+ fflush(stdout);
+ }
+ }
+ return errors;
+}
- if ((subfile_fid = open(filepath, flags, mode)) < 0) {
- perror("subfile open");
- end_thread_exclusive();
- errors++;
- goto done;
- }
+/*-------------------------------------------------------------------------
+ * Function: Public/IOC subfiling_open_file
+ *
+ * Purpose: This function gets called when a client invokes a OPEN_OP.
+ * The HDF5 file opening protocol actually attempts to open
+ * a file; first without any truncate other flags which would
+ * modify the file state if it already exists. A file close
+ * and then the second file open using the user supplied open
+ * flags is invoked. The OPEN_OP provides the user flags as
+ * part of the RPC message. The file prefix info isn't
+ * transmitted as part of the RPC since it is available as
+ * part of the client context which can be utilized by the
+ * IOC thread. We access the sf_context by reading the
+ * cache of contexts at the index provided with the RPC msg.
+ *
+ * Return: The integer status returned by the Internal read_independent
+ * function. Successful operations will return 0.
+ * Errors: An MPI related error value.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+subfiling_open_file(
+ sf_work_request_t *msg, const char *prefix, int subfile_rank, int flags)
+{
+ int errors = 0;
- end_thread_exclusive();
-
- if (flags & O_CREAT) {
- size_t bufsize = PATH_MAX + 16;
- FILE *f = NULL;
- char linebuf[bufsize];
- /* If a config file already exists, AND
- * the user wants to truncate subfiles (if they exist),
- * then we should also truncate an existing config file.
- */
- if (access(config, flags) == 0) {
- truncate(config, 0);
- }
- f = fopen(config, "w+");
- if (f != NULL) {
- int k;
- char *underscore = strrchr(filepath,'_');
- *underscore=0;
- strcpy(config, filepath);
- *underscore='_';
- sprintf(linebuf,"stripe_size=%ld\n", sf_stripe_size);
- fwrite(linebuf, strlen(linebuf), 1, f);
- sprintf(linebuf,"aggregator_count=%d\n",n_io_concentrators);
- fwrite(linebuf, strlen(linebuf), 1, f);
-
- for(k=0; k < n_io_concentrators; k++) {
- snprintf(linebuf,bufsize,"%s_%d:%d\n",config, k, io_concentrator[k]);
- fwrite(linebuf, strlen(linebuf), 1, f);
- }
+ /* Only the real IOCs open the subfiles
+ * Once a file is opened, all additional file open requests
+ * can return immediately.
+ */
+ if (subfile_rank >= 0) {
+ char filepath[PATH_MAX];
+ char config[PATH_MAX];
+ int subfile_fid;
+ int64_t h5_file_id = msg->header[1];
+ int64_t file_context_id = msg->header[2];
+ subfiling_context_t *sf_context = get_subfiling_object(file_context_id);
+ assert(sf_context != NULL);
+
+ begin_thread_exclusive();
+
+ if (sf_context->sf_fid < 0) {
+ int n_io_concentrators = sf_context->topology->n_io_concentrators;
+ int *io_concentrator = sf_context->topology->io_concentrator;
+ const char *dotconfig = ".subfile_config";
+ mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+ if (prefix) {
+ mkdir(prefix, S_IRWXU);
+ sprintf(filepath, "%s/%ld_node_local_temp_%d_of_%d", prefix,
+ h5_file_id, subfile_rank, n_io_concentrators);
+ sprintf(config, "%s/%ld%s", prefix, h5_file_id, dotconfig);
+ } else {
+ sprintf(filepath, "%ld_node_local_temp_%d_of_%d", h5_file_id,
+ subfile_rank, n_io_concentrators);
+ strcpy(config, dotconfig);
+ }
- fclose(f);
- }
- else {
- perror("fopen(config)");
- errors++;
- goto done;
- }
- }
- if (sf_verbose_flag) {
- printf("[ioc:%d] Opened subfile %s\n", subfile_rank, filepath);
- }
- }
+ if ((subfile_fid = open(filepath, flags, mode)) < 0) {
+ end_thread_exclusive();
+ errors++;
+ goto done;
+ } else {
+ sf_context->sf_fid = subfile_fid;
+ }
+ if (flags & O_CREAT) {
+ int64_t new_context = SF_CONTEXT;
+ int64_t objtype = (new_context << 32);
+ int context_id = (int) msg->context_id;
+ size_t bufsize = PATH_MAX + 16;
+ FILE * f = NULL;
+ char linebuf[bufsize];
+ int64_t thisId = (int64_t)(objtype | context_id);
+ subfiling_context_t *context =
+ (subfiling_context_t *) get_subfiling_object(thisId);
+ /* If a config file already exists, AND
+ * the user wants to truncate subfiles (if they exist),
+ * then we should also truncate an existing config file.
+ */
+ if (access(config, flags) == 0) {
+ truncate(config, 0);
+ }
+ f = fopen(config, "w+");
+ if (f != NULL) {
+ int k;
+ sprintf(
+ linebuf, "stripe_size=%ld\n", context->sf_stripe_size);
+ fwrite(linebuf, strlen(linebuf), 1, f);
+ sprintf(
+ linebuf, "aggregator_count=%d\n", n_io_concentrators);
+ fwrite(linebuf, strlen(linebuf), 1, f);
+ sprintf(linebuf,"hdf5_file=%s\n", context->filename);
+ fwrite(linebuf, strlen(linebuf), 1, f);
+
+ for (k = 0; k < n_io_concentrators; k++) {
+ if (prefix)
+ sprintf(linebuf, "%s/%ld_node_local_temp_%d_of_%d:%d", prefix,
+ h5_file_id, subfile_rank, n_io_concentrators, io_concentrator[k]);
+ else
+ sprintf(linebuf, "%ld_node_local_temp_%d_of_%d:%d", h5_file_id,
+ subfile_rank, n_io_concentrators, io_concentrator[k]);
+
+ fwrite(linebuf, strlen(linebuf), 1, f);
+ }
+
+ fclose(f);
+ } else {
+ perror("fopen(config)");
+ errors++;
+ goto done;
+ }
+ }
+#ifndef NDEBUG
+ if (sf_verbose_flag) {
+ if (sf_logfile) {
+ fprintf(sf_logfile, "[ioc:%d] Opened subfile %s\n",
+ subfile_rank, filepath);
+ }
+ }
+#endif
+ }
+ end_thread_exclusive();
}
done:
- return errors;
+ return errors;
}
-
+/*-------------------------------------------------------------------------
+ * Function: UTILITY FUNCTIONS:
+ * delete_subfiling_context - removes a context entry in the
+ * object cache. Free communicators
+ * and zero other structure fields.
+ *
+ * sf_get_mpi_rank - (not used) retrieves the MPI rank of the
+ * calling process. Was used when pairing
+ * the subfiling VFD with the SUBFILING VFD.
+ *
+ * sf_get_mpi_size - (not used) retrieves the MPI size of the
+ * communicator associated with the open
+ * file.
+ *
+ * sf_get_group_com - (not used) retrieves the MPI Comm object
+ * associated with the open file/sf_context.
+ *
+ * sf_subfile_set_logging - (not used) informs one or all IOC
+ * instances to set the verbose/logging flag
+ * to the value provided by the user.
+ *
+ * Return: none
+ * Errors: none
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
void
delete_subfiling_context(hid_t context_id)
{
- subfiling_context_t *sf_context = get_subfiling_object(context_id);
- if (sf_context) {
- MPI_Comm_free(&sf_context->sf_msg_comm);
- MPI_Comm_free(&sf_context->sf_data_comm);
- sf_msg_comm = MPI_COMM_NULL;
- sf_data_comm = MPI_COMM_NULL;
- if (n_io_concentrators > 1) {
- MPI_Comm_free(&sf_context->sf_group_comm);
- MPI_Comm_free(&sf_context->sf_intercomm);
- }
- free(sf_context);
- }
+ subfiling_context_t *sf_context = get_subfiling_object(context_id);
+ if (sf_context) {
+ if (sf_context->topology->n_io_concentrators > 1) {
+ if (sf_context->sf_group_comm != MPI_COMM_NULL) {
+ MPI_Comm_free(&sf_context->sf_group_comm);
+ }
+ if (sf_context->sf_intercomm != MPI_COMM_NULL) {
+ MPI_Comm_free(&sf_context->sf_intercomm);
+ }
+ }
+ free(sf_context);
+ }
- usleep(100);
- return;
+ return;
+}
+
+int
+sf_get_mpi_rank(hid_t fid, int *rank)
+{
+ hid_t context_id = fid_map_to_context(fid);
+ subfiling_context_t *sf_context = get_subfiling_object(context_id);
+ assert(sf_context != NULL);
+ assert(rank != NULL);
+ *rank = sf_context->sf_group_rank;
+ return 0;
+}
+
+int
+sf_get_mpi_size(hid_t fid, int *size)
+{
+ hid_t context_id = fid_map_to_context(fid);
+ subfiling_context_t *sf_context = get_subfiling_object(context_id);
+ assert(sf_context != NULL);
+ assert(size != NULL);
+ *size = sf_context->sf_group_size;
+ return 0;
+}
+
+int
+sf_get_group_comm(hid_t fid, MPI_Comm *comm)
+{
+ hid_t context_id = fid_map_to_context(fid);
+ subfiling_context_t *sf_context = get_subfiling_object(context_id);
+ assert(sf_context != NULL);
+ assert(comm != NULL);
+ *comm = sf_context->sf_group_comm;
+ return 0;
+}
+
+int
+sf_subfile_set_logging(hid_t sf_fid, int ioc_rank, int flag)
+{
+ int ioc;
+ int status = 0;
+ hid_t context_id = fid_map_to_context(sf_fid);
+ subfiling_context_t *sf_context = get_subfiling_object(context_id);
+ int n_io_concentrators;
+ int * io_concentrator = NULL;
+ int64_t lflag = (int64_t)(flag & 0xFF);
+ int64_t msg[3];
+
+ assert(sf_context != NULL);
+
+ msg[0] = lflag;
+ msg[1] = 0;
+ msg[2] = sf_context->sf_context_id;
+
+ n_io_concentrators = sf_context->topology->n_io_concentrators;
+ io_concentrator = sf_context->topology->io_concentrator;
+
+ for (ioc = 0; ioc < n_io_concentrators; ioc++) {
+ if ((flag < 0) || (flag == ioc_rank)) {
+ status = MPI_Ssend(msg, 3, MPI_INT64_T, io_concentrator[ioc],
+ LOGGING_OP, sf_context->sf_msg_comm);
+ }
+ }
+ return status;
}
diff --git a/src/H5FDsubfile_private.h b/src/H5FDsubfile_private.h
index 0088c13..db991f9 100644
--- a/src/H5FDsubfile_private.h
+++ b/src/H5FDsubfile_private.h
@@ -4,9 +4,9 @@
#include <assert.h>
#include <stdatomic.h>
-#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>
+#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
@@ -14,87 +14,108 @@
/**************/
/* H5 Headers */
/**************/
-#include "H5private.h" /* Generic Functions */
-#include "H5CXprivate.h" /* API Contexts */
-#include "H5Dprivate.h" /* Datasets */
-#include "H5Eprivate.h" /* Error handling */
+#include "H5CXprivate.h" /* API Contexts */
+#include "H5Dprivate.h" /* Datasets */
+#include "H5Eprivate.h" /* Error handling */
+#include "H5Iprivate.h" /* IDs */
#include "H5Ipublic.h"
-#include "H5Iprivate.h" /* IDs */
-#include "H5MMprivate.h" /* Memory management */
-#include "H5Pprivate.h" /* Property lists */
-
+#include "H5MMprivate.h" /* Memory management */
+#include "H5Pprivate.h" /* Property lists */
+#include "H5private.h" /* Generic Functions */
#include "mpi.h"
#ifndef _H5FDsubfile_private_H
-#define _H5FDsubfile_private_H
+# define _H5FDsubfile_private_H
-typedef int (*file_close_cb)(int,MPI_Comm);
+typedef int (*file_close_cb)(int, int *, MPI_Comm);
-typedef struct {
- int64_t sf_stripe_size;
- int64_t sf_blocksize_per_stripe;
- MPI_Comm sf_msg_comm;
- MPI_Comm sf_data_comm;
- MPI_Comm sf_group_comm;
- MPI_Comm sf_intercomm;
- int sf_group_size;
- int sf_group_rank;
- int sf_intercomm_root;
- char *subfile_prefix;
-} subfiling_context_t;
-
-typedef struct {
- /* {Datasize, Offset} */
- int64_t header[2];
- int tag;
- int source;
- int subfile_rank;
-} sf_work_request_t;
+typedef enum io_ops {
+ READ_OP = 1,
+ WRITE_OP = 2,
+ OPEN_OP = 3,
+ CLOSE_OP = 4,
+ FINI_OP = 8,
+ LOGGING_OP = 16
+} io_op_t;
+typedef enum {
+ SF_BADID = (-1),
+ SF_TOPOLOGY = 1,
+ SF_CONTEXT = 2,
+ SF_NTYPES /* number of subfiling object types, MUST BE LAST */
+} sf_obj_type_t;
+typedef enum {
+ SELECT_IOC_ONE_PER_NODE = 0, /* Default */
+ SELECT_IOC_EVERY_NTH_RANK,
+ SELECT_IOC_WITH_CONFIG,
+ ioc_selection_options
+} sf_ioc_selection_t;
typedef struct {
- long rank;
- long hostid;
+ long rank;
+ long hostid;
} layout_t;
-typedef struct {
- long hostid;
- layout_t *topology;
- int *node_ranks;
- int node_count;
- int node_index;
- int local_peers;
- int subfile_rank;
- int world_rank;
- int world_size;
- bool rank_is_ioc;
+typedef struct topology {
+ long hostid;
+ layout_t * layout;
+ int * node_ranks;
+ int node_count;
+ int node_index;
+ int local_peers;
+ int subfile_rank;
+ int world_rank;
+ int world_size;
+ bool rank_is_ioc;
+ int n_io_concentrators;
+ int * io_concentrator;
+ sf_ioc_selection_t selection_type;
} sf_topology_t;
-#define K(n) ((n)*1024)
-#define DEFAULT_STRIPE_SIZE K(256) /* (1024*1024) */
-#define MAX_DEPTH 256
+typedef struct {
+ hid_t sf_context_id;
+ hid_t h5_file_id;
+ int sf_fid;
+ size_t sf_write_count;
+ size_t sf_read_count;
+ size_t sf_eof;
+ /* Copy of the HDF5 File 'serial' number */
+ unsigned long fileno;
+ int64_t sf_stripe_size;
+ int64_t sf_blocksize_per_stripe;
+ MPI_Comm sf_msg_comm;
+ MPI_Comm sf_data_comm;
+ MPI_Comm sf_group_comm;
+ MPI_Comm sf_intercomm;
+ int sf_group_size;
+ int sf_group_rank;
+ int sf_intercomm_root;
+ char * subfile_prefix;
+ char * filename;
+ sf_topology_t *topology;
+} subfiling_context_t;
-typedef enum io_ops {
- READ_OP = 1,
- WRITE_OP = 2,
- OPEN_OP = 3,
- CLOSE_OP = 4,
- INCR_OP = 8,
- DECR_OP = 16,
-} io_op_t;
-
-typedef enum {
- SF_BADID = (-1),
- SF_TOPOLOGY = 1,
- SF_CONTEXT,
- SF_NTYPES /* number of subfiling object types, MUST BE LAST */
-} SF_OBJ_TYPE;
-
+typedef struct {
+ /* {Datasize, Offset, FileID} */
+ int64_t header[3];
+ int tag;
+ int source;
+ int subfile_rank;
+ hid_t context_id;
+} sf_work_request_t;
+typedef struct {
+ hid_t h5_file_id;
+ hid_t sf_context_id;
+} file_map_to_context_t;
+
+# define K(n) ((n) *1024)
+# define DEFAULT_STRIPE_SIZE K(256) /* (1024*1024) */
+# define MAX_DEPTH 1024
-/* MPI Tags are 32 bits, we treat them as unsigned
+/* MPI Tags are 32 bits, we treat them as unsigned
* to allow the use of the available bits for RPC
* selections:
* 0000
@@ -108,85 +129,113 @@ typedef enum {
* 1010 COLLECTIVE_WRITE
* 1011 /////////
* 1100 COLLECTIVE_CLOSE
- *
+ *
* 31 28 24 20 16 12 8 4 0|
* +-------+-------+-------+-------+-------+-------+-------+-------+
* | | | ACKS | OP |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- *
+ *
*/
/* Bit 3 SET indicates collectives */
-#define COLL_FUNC (0x1 << 3)
+# define COLL_FUNC (0x1 << 3)
-#define ACK_PART (0x0acc << 8)
-#define DATA_PART (0xd8da << 8)
-#define READY (0xfeed << 8)
-#define COMPLETED (0xfed1 << 8)
+# define ACK_PART (0x0acc << 8)
+# define DATA_PART (0xd8da << 8)
+# define READY (0xfeed << 8)
+# define COMPLETED (0xfed1 << 8)
-#define READ_INDEP (READ_OP)
-#define READ_COLL (COLL_FUNC | READ_OP)
-#define WRITE_INDEP (WRITE_OP)
-#define WRITE_COLL (COLL_FUNC | WRITE_OP)
+# define READ_INDEP (READ_OP)
+# define READ_COLL (COLL_FUNC | READ_OP)
+# define WRITE_INDEP (WRITE_OP)
+# define WRITE_COLL (COLL_FUNC | WRITE_OP)
-#define WRITE_INDEP_ACK (ACK_PART | WRITE_OP)
-#define WRITE_INDEP_DATA (DATA_PART | WRITE_OP)
+# define WRITE_INDEP_ACK (ACK_PART | WRITE_OP)
+# define WRITE_INDEP_DATA (DATA_PART | WRITE_OP)
-#define READ_INDEP_DATA (DATA_PART | READ_OP)
+# define READ_INDEP_DATA (DATA_PART | READ_OP)
+# define SET_LOGGING (LOGGING_OP)
-#define INT32_MASK 0x07FFFFFFFFFFFFFFF
+# define INT32_MASK 0x07FFFFFFFFFFFFFFF
-extern int sf_verbose_flag;
extern int sf_shutdown_flag;
extern atomic_int sf_workinprogress;
extern atomic_int sf_work_pending;
extern atomic_int sf_file_close_count;
extern atomic_int sf_file_refcount;
-
-/*
--------------
-Messages IN
--------------
-*/
-extern MPI_Comm sf_msg_comm;
-
-/*
--------------
-Messages OUT
--------------
-*/
-extern MPI_Comm sf_data_comm;
-
-
-
-H5_DLL int H5FD__determine_ioc_count(int world_size, int world_rank, sf_topology_t **thisapp);
-H5_DLL int H5FD__init_subfile_context(subfiling_context_t **newContext, int n_iocs, int world_size, int world_rank, bool rank_is_ioc);
-H5_DLL void * get_subfiling_object(int64_t object_id);
-H5_DLL hid_t get_subfiling_context(void);
-H5_DLL int initialize_ioc_threads(subfiling_context_t *sf_context);
-H5_DLL int tpool_add_work(sf_work_request_t *);
-H5_DLL bool tpool_is_empty(void);
-H5_DLL int ioc_main(subfiling_context_t *context);
-H5_DLL int queue_write_coll(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm);
-H5_DLL int queue_read_coll(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm);
-H5_DLL int queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm);
-H5_DLL int queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm);
-H5_DLL int subfiling_close_file(int subfile_rank, MPI_Comm comm);
-H5_DLL int subfiling_open_file(const char *prefix, int subfile_rank, MPI_Comm comm);
-H5_DLL int queue_file_open(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm);
-H5_DLL int decrement_file_ref_counts( int subfile_rank, int source, MPI_Comm comm, file_close_cb callback_ftn);
-H5_DLL int sf_open_subfiles(hid_t context_id, char *prefix, int flags);
-H5_DLL int sf_close_subfiles(hid_t context_id);
-H5_DLL int sf_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank);
-H5_DLL int sf_read_independent(hid_t context_id, int64_t offset, int64_t elements, int dtype_extent, void *data);
-H5_DLL int sf_write_independent(hid_t context_id, int64_t offset, int64_t elements, int dtype_extent, void *data);
-H5_DLL int sf_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank);
-H5_DLL void delete_subfiling_context(hid_t context_id);
-H5_DLL void finalize_ioc_threads(void);
-H5_DLL int begin_thread_exclusive(void);
-H5_DLL int end_thread_exclusive(void);
-H5_DLL int wait_for_thread_main(void);
-H5_DLL int finalize_subfile_close(void);
+extern int sf_verbose_flag;
+
+# ifndef NDEBUG
+extern FILE *sf_logfile;
+# endif
+
+# ifdef __cplusplus
+extern "C" {
+# endif
+
+/* clang-format off */
+H5_DLL herr_t H5FDsubfiling_init(sf_ioc_selection_t ioc_select_method, char *ioc_select_option, int64_t *context);
+H5_DLL herr_t H5FDsubfiling_finalize(int64_t subfile_id);
+H5_DLL int H5FD__determine_ioc_count(int world_size, int world_rank,
+ sf_ioc_selection_t ioc_select_method, char *ioc_select_option, sf_topology_t **thisapp);
+H5_DLL int H5FD__init_subfile_context(sf_topology_t *thisApp, int n_iocs, int world_rank,
+ subfiling_context_t *newContext);
+H5_DLL int64_t record_subfiling_object(sf_obj_type_t type, void *obj);
+H5_DLL void * get_subfiling_object(int64_t object_id);
+H5_DLL herr_t sf_free_context(subfiling_context_t **sf_context);
+H5_DLL int initialize_ioc_threads(subfiling_context_t *sf_context);
+H5_DLL int tpool_add_work(sf_work_request_t *);
+H5_DLL bool tpool_is_empty(void);
+H5_DLL int ioc_main(int64_t context_id);
+H5_DLL int queue_write_coll( sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm);
+H5_DLL int queue_read_coll( sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm);
+H5_DLL int queue_write_indep( sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm);
+H5_DLL int queue_read_indep( sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm);
+H5_DLL int subfiling_close_file(int subfile_rank, int *subfile_fid, MPI_Comm comm);
+H5_DLL int subfiling_shutdown(int subfile_rank, int *subfile_fid, MPI_Comm comm);
+H5_DLL int subfiling_open_file( sf_work_request_t *msg, const char *prefix, int subfile_rank, int flags);
+H5_DLL int queue_file_open( sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm);
+H5_DLL int decrement_file_ref_counts(sf_work_request_t *msg, int subfile_rank, int source,
+ MPI_Comm comm, file_close_cb callback_ftn);
+H5_DLL int increment_ioc_fini_counts(sf_work_request_t *msg, int subfile_rank, int source,
+ MPI_Comm comm, file_close_cb callback_ftn);
+H5_DLL int sf_open_subfiles(hid_t context_id, char *filename, char *prefix, int flags);
+H5_DLL int sf_close_subfiles(hid_t context_id);
+H5_DLL int sf_notify_shutdown(hid_t context_id);
+H5_DLL int sf_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size,
+ int subfile_rank);
+H5_DLL int sf_read_independent(hid_t sf_fid, int64_t offset, int64_t elements,
+ int dtype_extent, void *data);
+H5_DLL int sf_write_independent(hid_t sf_fid, int64_t offset, int64_t elements,
+ int dtype_extent, const void *data);
+H5_DLL int sf_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size,
+ int subfile_rank);
+H5_DLL herr_t sf_read_vector(hid_t h5_fid, hssize_t count, haddr_t addrs[], hsize_t sizes[],
+ void *bufs[] /* in */);
+H5_DLL herr_t sf_write_vector(hid_t h5_fid, hssize_t count, haddr_t addrs[], hsize_t sizes[],
+ void *bufs[] /* in */);
+H5_DLL int sf_truncate(hid_t h5_fid, haddr_t addr);
+H5_DLL void delete_subfiling_context(hid_t context_id);
+H5_DLL void finalize_ioc_threads(void);
+H5_DLL int begin_thread_exclusive(void);
+H5_DLL int end_thread_exclusive(void);
+H5_DLL int wait_for_thread_main(void);
+H5_DLL int finalize_subfile_close(void);
+H5_DLL char * get_ioc_selection_criteria(sf_ioc_selection_t *selection_criteria);
+H5_DLL int active_map_entries(void);
+H5_DLL void clear_fid_map_entry(hid_t sf_fid);
+H5_DLL hid_t fid_map_to_context(hid_t sf_fid);
+H5_DLL void set_verbose_flag(int subfile_rank, int new_value);
+H5_DLL int sf_get_mpi_rank(hid_t fid, int *rank);
+H5_DLL int sf_get_mpi_size(hid_t fid, int *size);
+H5_DLL int sf_get_group_comm(hid_t fid, MPI_Comm *comm);
+H5_DLL int sf_subfile_set_logging(hid_t sf_fid, int ioc_rank, int flag);
+
+/* clang-format on */
+
+# ifdef __cplusplus
+}
+# endif
#endif
diff --git a/src/H5FDsubfile_public.h b/src/H5FDsubfile_public.h
index 6e4e23c..32a2785 100644
--- a/src/H5FDsubfile_public.h
+++ b/src/H5FDsubfile_public.h
@@ -3,9 +3,5 @@
#include "H5FDsubfile_private.h"
-herr_t H5FDsubfiling_init(void);
-herr_t H5FDsubfiling_finalize(void);
-
-
#endif /* _H5FDsubfile_public_H */
diff --git a/src/H5FDsubfile_threads.c b/src/H5FDsubfile_threads.c
index fb99930..fa957a5 100644
--- a/src/H5FDsubfile_threads.c
+++ b/src/H5FDsubfile_threads.c
@@ -1,161 +1,371 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
#include "H5FDsubfile_private.h"
-#include "mercury/mercury_util_config.h"
-#include "mercury/mercury_log.h"
+/*
+ * NOTES:
+ * Rather than re-create the code for creating and managing a thread pool,
+ * I'm utilizing a reasonably well tested implementation from the mercury
+ * project. At some point, we should revisit this decision or possibly
+ * directly link against the mercury library. This would make sense if
+ * we move away from using MPI as the messaging infrastructure and instead
+ * use mercury for that purpose...
+ */
+
#include "mercury/mercury_log.c"
-#include "mercury/mercury_util_error.c"
+#include "mercury/mercury_log.h"
#include "mercury/mercury_thread.c"
-#include "mercury/mercury_thread_mutex.c"
-#include "mercury/mercury_thread_condition.h"
#include "mercury/mercury_thread_condition.c"
+#include "mercury/mercury_thread_condition.h"
+#include "mercury/mercury_thread_mutex.c"
#include "mercury/mercury_thread_pool.c"
#include "mercury/mercury_thread_spin.c"
+#include "mercury/mercury_util_config.h"
+#include "mercury/mercury_util_error.c"
static hg_thread_mutex_t ioc_mutex = PTHREAD_MUTEX_INITIALIZER;
static hg_thread_mutex_t ioc_thread_mutex = PTHREAD_MUTEX_INITIALIZER;
static hg_thread_pool_t *ioc_thread_pool = NULL;
-static hg_thread_t ioc_thread;
+static hg_thread_t ioc_thread;
#ifndef HG_TEST_NUM_THREADS_DEFAULT
-#define HG_TEST_NUM_THREADS_DEFAULT 4
+# define HG_TEST_NUM_THREADS_DEFAULT 4
#endif
-#define POOL_CONCURRENT_MAX 64
+#define POOL_CONCURRENT_MAX 256
static struct hg_thread_work pool_request[POOL_CONCURRENT_MAX];
+/*-------------------------------------------------------------------------
+ * Function: local ioc_thread_main
+ *
+ * Purpose: An IO Concentrator instance is initialized with the
+ * specified subfiling context.
+ *
+ * Return: The IO concentrator thread executes as long as the HDF5
+ * file associated with this context is open. At file close,
+ * the thread will return from 'ioc_main' and the thread
+ * exit status will be checked by the main program.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
static HG_THREAD_RETURN_TYPE
ioc_thread_main(void *arg)
{
+ int64_t * context_id = (int64_t *) arg;
hg_thread_ret_t thread_ret = (hg_thread_ret_t) 0;
- /* Pass along the subfiling_context_t */
- ioc_main(arg);
+ /* Pass along the subfiling_context_t */
+ ioc_main(context_id[0]);
- // hg_thread_exit(thread_ret);
+ /* Upon exit, we can free the input arg */
+ free(arg);
return thread_ret;
}
+/*-------------------------------------------------------------------------
+ * Function: initialize_ioc_threads
+ *
+ * Purpose: The principal entry point to initialize the execution
+ * context for an IO Concentrator (IOC). The main thread
+ * is responsible for receiving IO requests from each
+ * HDF5 "client" and distributing those to helper threads
+ * for actual processing. We initialize a fixed number
+ * of helper threads by creating a thread_pool.
+ *
+ * Return: SUCCESS (0) or FAIL (-1) if any errors are detected
+ * for the multi-threaded initialization.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
int
initialize_ioc_threads(subfiling_context_t *sf_context)
{
- int status;
- status = hg_thread_mutex_init(&ioc_mutex);
- if (status) {
- puts("hg_thread_mutex_init failed");
- goto err_exit;
- }
- status = hg_thread_mutex_init(&ioc_thread_mutex);
- if (status) {
- puts("hg_thread_mutex_init failed");
- goto err_exit;
- }
-
- status = hg_thread_pool_init(HG_TEST_NUM_THREADS_DEFAULT, &ioc_thread_pool);
- if (status) {
- puts("hg_thread_pool_init failed");
- goto err_exit;
- }
- status = hg_thread_create(&ioc_thread, ioc_thread_main, sf_context);
- if (status) {
- puts("hg_thread_create failed");
- goto err_exit;
- }
- return 0;
+ int status;
+ int64_t *context_id = (int64_t *) malloc(sizeof(int64_t));
+ assert(context_id != NULL);
+ /* Initialize the main IOC thread input argument.
+ * Each IOC request will utilize this context_id which is
+ * consistent across all MPI ranks, to ensure that requests
+ * involving reference counting are correctly using the
+ * correct file contexts.
+ */
+ context_id[0] = sf_context->sf_context_id;
+
+ /* Initialize a couple of mutex variables that are used
+ * during IO concentrator operations to serialize
+ * access to key objects, e.g. reference counting.
+ */
+ status = hg_thread_mutex_init(&ioc_mutex);
+ if (status) {
+ puts("hg_thread_mutex_init failed");
+ goto err_exit;
+ }
+ status = hg_thread_mutex_init(&ioc_thread_mutex);
+ if (status) {
+ puts("hg_thread_mutex_init failed");
+ goto err_exit;
+ }
+
+ /* Initialize a thread pool for the IO Concentrator to use */
+ status = hg_thread_pool_init(HG_TEST_NUM_THREADS_DEFAULT, &ioc_thread_pool);
+ if (status) {
+ puts("hg_thread_pool_init failed");
+ goto err_exit;
+ }
+
+ /* Arguments to hg_thread_create are:
+ * 1. A pointer to reference the created thread.
+ * 2. User function pointer for the new thread to execute.
+ * 3. Pointer to the input argument that gets passed along to the user
+ * function.
+ */
+ status = hg_thread_create(&ioc_thread, ioc_thread_main, context_id);
+ if (status) {
+ puts("hg_thread_create failed");
+ goto err_exit;
+ }
+ return 0;
err_exit:
- return -1;
+ return -1;
}
-
+/*-------------------------------------------------------------------------
+ * Function: finalize_ioc_threads
+ *
+ * Purpose: Normally we shouldn't have any IOC threads running when the
+ * program exits. If we do, this destructor function gets
+ * called to clean up.
+ *
+ * Return: None
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
void __attribute__((destructor)) finalize_ioc_threads(void)
{
- if (ioc_thread_pool != NULL) {
- hg_thread_pool_destroy(ioc_thread_pool);
- ioc_thread_pool = NULL;
- }
+ if (ioc_thread_pool != NULL) {
+ hg_thread_pool_destroy(ioc_thread_pool);
+ ioc_thread_pool = NULL;
+ }
}
-
+/*-------------------------------------------------------------------------
+ * Function: local: handle_work_request
+ *
+ * Purpose: Handle a work request from the thread pool work queue.
+ * We dispatch the specific function as indicated by the
+ * TAG that has been added to the work request by the
+ * IOC main thread (which is just a copy of the MPI tag
+ * associated with the RPC message) and provide the subfiling
+ * context associated with the HDF5 file.
+ *
+ * Any status associated with the function processing is
+ * returned directly to the client via ACK or NACK messages.
+ *
+ * Return: (none) Doesn't fail.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
static HG_THREAD_RETURN_TYPE
handle_work_request(void *arg)
{
- hg_thread_ret_t ret = 0;
- sf_work_request_t *msg = (sf_work_request_t *)arg;
- int status = 0;
-
- atomic_fetch_add(&sf_work_pending, 1); // atomic
- switch(msg->tag) {
- case WRITE_COLL:
- status = queue_write_coll( msg, msg->subfile_rank, msg->source, sf_data_comm);
+ int status = 0;
+ hg_thread_ret_t ret = 0;
+ sf_work_request_t * msg = (sf_work_request_t *) arg;
+ int64_t file_context_id = msg->header[2];
+ subfiling_context_t *sf_context = get_subfiling_object(file_context_id);
+ assert(sf_context != NULL);
+
+ atomic_fetch_add(&sf_work_pending, 1); // atomic
+ switch (msg->tag) {
+ case WRITE_COLL:
+ status = queue_write_coll(
+ msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm);
break;
- case READ_COLL:
- status = queue_read_coll( msg, msg->subfile_rank, msg->source, sf_data_comm);
+ case READ_COLL:
+ status = queue_read_coll(
+ msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm);
break;
- case WRITE_INDEP:
- status = queue_write_indep( msg, msg->subfile_rank, msg->source, sf_data_comm);
+ case WRITE_INDEP:
+ status = queue_write_indep(
+ msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm);
break;
- case READ_INDEP:
- status = queue_read_indep( msg, msg->subfile_rank, msg->source, sf_data_comm);
+ case READ_INDEP:
+ status = queue_read_indep(
+ msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm);
break;
- case CLOSE_OP:
- status = decrement_file_ref_counts( msg->subfile_rank, msg->source, sf_data_comm,
- subfiling_close_file);
+ case CLOSE_OP:
+ status = decrement_file_ref_counts(msg, msg->subfile_rank,
+ msg->source, sf_context->sf_data_comm, subfiling_close_file);
break;
- case OPEN_OP:
- status = queue_file_open( msg, msg->subfile_rank, msg->source, sf_data_comm);
+ case OPEN_OP:
+ status = queue_file_open(
+ msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm);
break;
-
- default:
- printf("[ioc(%d)] received message tag(%x)from rank %d\n", msg->subfile_rank, msg->tag, msg->source);
+ case FINI_OP:
+ status = increment_ioc_fini_counts(msg, msg->subfile_rank,
+ msg->source, sf_context->sf_data_comm, subfiling_shutdown);
+ break;
+ default:
+ printf("[ioc(%d)] received message tag(%x)from rank %d\n",
+ msg->subfile_rank, msg->tag, msg->source);
status = -1;
break;
}
-
- atomic_fetch_sub(&sf_work_pending, 1); // atomic
+
+ atomic_fetch_sub(&sf_work_pending, 1); // atomic
if (status < 0) {
- printf("[ioc(%d) %s]: Error encounted processing request(%x) from rank(%d\n",
- msg->subfile_rank, __func__, msg->tag, msg->source);
- fflush(stdout);
+ printf("[ioc(%d) %s]: Error encounted processing request(%x) from "
+ "rank(%d)\n",
+ msg->subfile_rank, __func__, msg->tag, msg->source);
+ fflush(stdout);
}
- return ret;
+ return ret;
}
-int tpool_add_work(sf_work_request_t *work)
+/*-------------------------------------------------------------------------
+ * Function: tpool_add_work
+ *
+ * Purpose: Initiate the handoff of client request processing to a
+ * thread in the thread pool. A work request is created and
+ * added to the thread pool work queue for asynchronous processing.
+ *
+ * Return: 0 (this function always succeeds).
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+tpool_add_work(sf_work_request_t *work)
{
- static int work_index = 0;
- hg_thread_mutex_lock(&ioc_mutex);
- if (work_index == POOL_CONCURRENT_MAX)
- work_index = 0;
- pool_request[work_index].func = handle_work_request;
- pool_request[work_index].args = work;
- hg_thread_pool_post(ioc_thread_pool, &pool_request[work_index++]);
- hg_thread_mutex_unlock(&ioc_mutex);
- return 0;
+ static int work_index = 0;
+ hg_thread_mutex_lock(&ioc_mutex);
+ if (work_index == POOL_CONCURRENT_MAX)
+ work_index = 0;
+ pool_request[work_index].func = handle_work_request;
+ pool_request[work_index].args = work;
+ hg_thread_pool_post(ioc_thread_pool, &pool_request[work_index++]);
+ hg_thread_mutex_unlock(&ioc_mutex);
+ return 0;
}
-bool tpool_is_empty(void)
+/*-------------------------------------------------------------------------
+ * Function: tpool_is_empty
+ *
+ * Purpose: Utility function to indicate to the caller whether there
+ * is any remaining work in the thread pool queue.
+ *
+ * Return: TRUE or FALSE to indicate whether the work queue is empty.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+bool
+tpool_is_empty(void)
{
- return HG_QUEUE_IS_EMPTY(&ioc_thread_pool->queue);
+ return HG_QUEUE_IS_EMPTY(&ioc_thread_pool->queue);
}
-int begin_thread_exclusive(void)
+/*-------------------------------------------------------------------------
+ * Function: begin_thread_exclusive
+ *
+ * Purpose: Mutex lock to restrict access to code or variables.
+ *
+ * Return: integer result of mutex_lock request.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+begin_thread_exclusive(void)
{
return hg_thread_mutex_lock(&ioc_thread_mutex);
}
-int end_thread_exclusive(void)
+/*-------------------------------------------------------------------------
+ * Function: end_thread_exclusive
+ *
+ * Purpose: Mutex unlock. Should only be called by the current holder
+ * of the locked mutex.
+ *
+ * Return: result of mutex_unlock operation.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+end_thread_exclusive(void)
{
return hg_thread_mutex_unlock(&ioc_thread_mutex);
}
-int wait_for_thread_main(void)
+/*-------------------------------------------------------------------------
+ * Function: wait_for_thread_main
+ *
+ * Purpose: Perform a thread_join on the IOC main thread.
+ *
+ * Return: SUCCESS (0) or FAIL (-1) if the thread_join
+ * does not succeed.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+wait_for_thread_main(void)
{
- if (hg_thread_join(ioc_thread) == 0)
- puts("thread_join succeeded");
- else {
- puts("thread_join failed");
- return -1;
- }
- return 0;
+ if (hg_thread_join(ioc_thread) != 0) {
+ return -1;
+ }
+ return 0;
}
diff --git a/src/H5FDsubfiling.c b/src/H5FDsubfiling.c
index 5626ed1..7fbfdc7 100644
--- a/src/H5FDsubfiling.c
+++ b/src/H5FDsubfiling.c
@@ -23,23 +23,30 @@
* application to the same file).
*/
+#define H5S_FRIEND /*suppress error about including H5Spkg */
#include "H5FDdrvr_module.h" /* This source code file is part of the H5FD driver module */
-
#include "H5private.h" /* Generic Functions */
+#include "H5Dprivate.h" /* Dataset stuff */
#include "H5Eprivate.h" /* Error handling */
#include "H5Fprivate.h" /* File access */
+#include "H5CXprivate.h" /* API contexts, etc. */
#include "H5FDprivate.h" /* File drivers */
#include "H5FDsubfiling.h" /* Subfiling file driver */
#include "H5FLprivate.h" /* Free Lists */
#include "H5Iprivate.h" /* IDs */
#include "H5MMprivate.h" /* Memory management */
#include "H5Pprivate.h" /* Property lists */
+#include "H5Spkg.h" /* For selections and creation of subfiling vectors */
/* The driver identification number, initialized at runtime */
static hid_t H5FD_SUBFILING_g = 0;
-
+/* These are used for the creation of read or write vectors */
+static hssize_t sf_vlen = -1;
+static hsize_t *sf_offsets = NULL;
+static hsize_t *sf_sizes = NULL;
+static void **sf_bufs = NULL;
/* The description of a file belonging to this driver. The 'eoa' and 'eof'
@@ -104,6 +111,11 @@ typedef struct H5FD_subfiling_t {
haddr_t pos; /* current file I/O position */
H5FD_file_op_t op; /* last operation */
char filename[H5FD_MAX_FILENAME_LEN]; /* Copy of file name from open operation */
+ MPI_Info info;
+ MPI_Comm comm;
+ int mpi_size;
+ int mpi_rank;
+
#ifndef H5_HAVE_WIN32_API
/* On most systems the combination of device and i-node number uniquely
* identify a file. Note that Cygwin, MinGW and other Windows POSIX
@@ -186,14 +198,28 @@ static herr_t H5FD_subfiling_read(H5FD_t *_file, H5FD_mem_t type,
hid_t fapl_id, haddr_t addr, size_t size, void *buf);
static herr_t H5FD_subfiling_write(H5FD_t *_file, H5FD_mem_t type,
hid_t fapl_id, haddr_t addr, size_t size, const void *buf);
+
+static herr_t H5FD__subfiling_read_vector(H5FD_t *file, hid_t dxpl_id,
+ uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
+ void *bufs[] /* out */);
+static herr_t H5FD__subfiling_write_vector(H5FD_t *file, hid_t dxpl_id,
+ uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
+ void *bufs[] /* in */);
+
static herr_t H5FD_subfiling_truncate(H5FD_t *_file, hid_t dxpl_id,
hbool_t closing);
static herr_t H5FD_subfiling_lock(H5FD_t *_file, hbool_t rw);
static herr_t H5FD_subfiling_unlock(H5FD_t *_file);
static herr_t H5FD_subfiling_validate_config(const H5FD_subfiling_fapl_t * fa);
+static int H5FD_subfiling_mpi_rank(const H5FD_t *_file);
+static int H5FD_subfiling_mpi_size(const H5FD_t *_file);
+static MPI_Comm H5FD_subfiling_communicator(const H5FD_t *_file);
+static herr_t H5FD_subfiling_get_info(H5FD_t *_file, void **mpi_info);
+
-static const H5FD_class_t H5FD_subfiling_g = {
+static const H5FD_class_mpi_t H5FD_subfiling_g = {
+ {
"subfiling", /* name */
MAXADDR, /* maxaddr */
H5F_CLOSE_WEAK, /* fc_degree */
@@ -221,13 +247,18 @@ static const H5FD_class_t H5FD_subfiling_g = {
H5FD_subfiling_get_handle, /* get_handle */
H5FD_subfiling_read, /* read */
H5FD_subfiling_write, /* write */
- NULL, /* read_vector */
- NULL, /* write_vector */
+ H5FD__subfiling_read_vector, /* read_vector */
+ H5FD__subfiling_write_vector, /* write_vector */
NULL, /* flush */
H5FD_subfiling_truncate, /* truncate */
H5FD_subfiling_lock, /* lock */
H5FD_subfiling_unlock, /* unlock */
H5FD_FLMAP_DICHOTOMY /* fl_map */
+ },
+ H5FD_subfiling_mpi_rank,
+ H5FD_subfiling_mpi_size,
+ H5FD_subfiling_communicator,
+ H5FD_subfiling_get_info
};
/* Declare a free list to manage the H5FD_subfiling_t struct */
@@ -605,19 +636,23 @@ H5FD_subfiling_fapl_free(void *_fa)
*-------------------------------------------------------------------------
*/
static H5FD_t *
-H5FD_subfiling_open(const char *name, unsigned flags, hid_t fapl_id,
- haddr_t maxaddr)
+H5FD_subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr)
{
H5FD_subfiling_t *file = NULL; /* subfiling VFD info */
- int fd = -1; /* File descriptor */
- int o_flags; /* Flags for open() call */
+ int fd = -1; /* File descriptor */
+ int o_flags; /* Flags for open() call */
+ int mpi_enabled = 0;
+ int mpi_provides = -1;
+ int my_rank;
#ifdef H5_HAVE_WIN32_API
struct _BY_HANDLE_FILE_INFORMATION fileinfo;
#endif
h5_stat_t sb;
H5FD_subfiling_fapl_t fa;
H5FD_t *ret_value = NULL; /* Return value */
-
+ char *dir_path = NULL;
+ char file_prefix[H5FD_MAX_FILENAME_LEN];
+ hid_t h5_file_id;
FUNC_ENTER_NOAPI_NOINIT
/* Sanity check on file offsets */
@@ -644,14 +679,34 @@ H5FD_subfiling_open(const char *name, unsigned flags, hid_t fapl_id,
HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, NULL, "can't get property list")
}
+ if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) {
+ MPI_Query_thread(&mpi_provides);
+ MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+ }
+
/* Open the file */
if((fd = HDopen(name, o_flags, H5_POSIX_CREATE_MODE_RW)) < 0) {
int myerrno = errno;
- HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file: name = '%s', errno = %d, error message = '%s', flags = %x, o_flags = %x", name, myerrno, HDstrerror(myerrno), flags, (unsigned)o_flags);
+ HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL,
+ "unable to open file: name = '%s', errno = %d, error message = '%s', flags = %x, o_flags = %x",
+ name, myerrno, HDstrerror(myerrno), flags, (unsigned)o_flags);
} /* end if */
- if(HDfstat(fd, &sb) < 0)
- HSYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, NULL, "unable to fstat file")
+ /* Avoid doing an additional file stat on every MPI rank.
+ * By default the file open will stat the directory...
+ * We can be a bit more efficient by having rank 0 broadcast
+ * the stat buffer.
+ */
+#if 0
+ if (mpi_enabled && (my_rank == 0)) {
+ int sb_size = sizeof(sb);
+
+ MPI_Bcast(&sb, sb_size, MPI_BYTE, 0, MPI_COMM_WORLD);
+ }
+#else
+ if(HDfstat(fd, &sb) < 0)
+ HSYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, NULL, "unable to fstat file")
+#endif
/* Create the new file struct */
if(NULL == (file = H5FL_CALLOC(H5FD_subfiling_t)))
@@ -701,6 +756,32 @@ H5FD_subfiling_open(const char *name, unsigned flags, hid_t fapl_id,
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get property of changing family to single")
} /* end if */
+ /* Use a FILE_id which is consistent/constant across multiple MPI ranks */
+ h5_file_id = (hid_t)file->inode;
+ dir_path = strrchr(file->filename,'/');
+ if (dir_path) {
+ *dir_path = '\0';
+ strcpy(file_prefix, file->filename);
+ *dir_path = '/';
+ dir_path = file_prefix;
+ }
+
+    /* Only open the subfiles when MPI is initialized AND provides
+     * MPI_THREAD_MULTIPLE (the subfiling layer makes MPI calls from
+     * service threads).
+     */
+    if (mpi_enabled && (mpi_provides == MPI_THREAD_MULTIPLE)) {
+        if (sf_open_subfiles(h5_file_id, file->filename, dir_path, o_flags) < 0)
+            HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't open subfiling files")
+
+        /* Cache the MPI state consumed by the mpi_rank/mpi_size/
+         * communicator/get_info callbacks.
+         * NOTE(review): this setup was previously unreachable -- it
+         * followed HGOTO_ERROR() inside the error branch, leaving
+         * file->comm/mpi_rank/mpi_size/info uninitialized on success.
+         */
+        MPI_Comm_dup(MPI_COMM_WORLD, &file->comm);
+        MPI_Comm_rank(MPI_COMM_WORLD, &file->mpi_rank);
+        MPI_Comm_size(MPI_COMM_WORLD, &file->mpi_size);
+        file->info = MPI_INFO_NULL;
+    }
+    else {
+        /* MPI isn't available, so neither is subfiling...
+         * It would be advantageous to replace parallel
+         * subfiling with a serial fallback here.
+         */
+    }
/* Set return value */
ret_value = (H5FD_t*)file;
@@ -734,13 +815,20 @@ H5FD_subfiling_close(H5FD_t *_file)
{
H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file;
herr_t ret_value = SUCCEED; /* Return value */
+ int mpi_enabled = 0;
FUNC_ENTER_NOAPI_NOINIT
/* Sanity check */
HDassert(file);
-    /* Close the underlying file */
+    if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) {
+        /* Close the subfiles before closing the stub HDF5 file */
+        hid_t h5_fid = (hid_t)file->inode;
+        if (mpi_enabled && (sf_close_subfiles(h5_fid) < 0))
+            HSYS_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "sf_close_subfiles returned an error")
+    }
+ /* Close the underlying HDF file */
if(HDclose(file->fd) < 0)
HSYS_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close file")
@@ -822,7 +910,7 @@ done:
static herr_t
H5FD_subfiling_query(const H5FD_t *_file, unsigned long *flags /* out */)
{
- const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file; /* subfiling VFD info */
+ const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file; /* subfiling VFD info */
FUNC_ENTER_NOAPI_NOINIT_NOERR
@@ -841,8 +929,9 @@ H5FD_subfiling_query(const H5FD_t *_file, unsigned long *flags /* out */)
*flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
*flags |= H5FD_FEAT_POSIX_COMPAT_HANDLE; /* get_handle callback returns a POSIX file descriptor */
*flags |= H5FD_FEAT_SUPPORTS_SWMR_IO; /* VFD supports the single-writer/multiple-readers (SWMR) pattern */
- *flags |= H5FD_FEAT_DEFAULT_VFD_COMPATIBLE; /* VFD creates a file which can be opened with the default VFD */
-
+#if 0
+ *flags |= H5FD_FEAT_HAS_MPI; /* FIXME:: for experimentation only... */
+#endif
/* Check for flags that are set by h5repart */
if(file && file->fam_to_single)
*flags |= H5FD_FEAT_IGNORE_DRVRINFO; /* Ignore the driver info when file is opened (which eliminates it) */
@@ -869,7 +958,7 @@ H5FD_subfiling_query(const H5FD_t *_file, unsigned long *flags /* out */)
static haddr_t
H5FD_subfiling_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
{
- const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file;
+ const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file;
FUNC_ENTER_NOAPI_NOINIT_NOERR
@@ -894,7 +983,7 @@ H5FD_subfiling_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
static herr_t
H5FD_subfiling_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t addr)
{
- H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file;
+ H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file;
FUNC_ENTER_NOAPI_NOINIT_NOERR
@@ -982,8 +1071,9 @@ H5FD_subfiling_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size, void *buf /*out*/)
{
H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file;
- HDoff_t offset = (HDoff_t)addr;
herr_t ret_value = SUCCEED; /* Return value */
+ hbool_t addrs_cooked = FALSE;
+ int mpi_enabled = 0;
FUNC_ENTER_NOAPI_NOINIT
@@ -996,62 +1086,19 @@ H5FD_subfiling_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
if(REGION_OVERFLOW(addr, size))
HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow, addr = %llu", (unsigned long long)addr)
-#ifndef H5_HAVE_PREADWRITE
- /* Seek to the correct location (if we don't have pread) */
- if(addr != file->pos || OP_READ != file->op) {
- if(HDlseek(file->fd, (HDoff_t)addr, SEEK_SET) < 0)
- HSYS_GOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position")
- }
-#endif /* H5_HAVE_PREADWRITE */
-
- /* Read data, being careful of interrupted system calls, partial results,
- * and the end of the file.
- */
- while(size > 0) {
-
- h5_posix_io_t bytes_in = 0; /* # of bytes to read */
- h5_posix_io_ret_t bytes_read = -1; /* # of bytes actually read */
-
- /* Trying to read more bytes than the return type can handle is
- * undefined behavior in POSIX.
- */
- if(size > H5_POSIX_MAX_IO_BYTES)
- bytes_in = H5_POSIX_MAX_IO_BYTES;
- else
- bytes_in = (h5_posix_io_t)size;
-
- do {
-#ifdef H5_HAVE_PREADWRITE
- bytes_read = HDpread(file->fd, buf, bytes_in, offset);
- if(bytes_read > 0)
- offset += bytes_read;
-#else
- bytes_read = HDread(file->fd, buf, bytes_in);
-#endif /* H5_HAVE_PREADWRITE */
- } while(-1 == bytes_read && EINTR == errno);
-
- if(-1 == bytes_read) { /* error */
- int myerrno = errno;
- time_t mytime = HDtime(NULL);
+ addr += _file->base_addr;
+ addrs_cooked = TRUE; /* Follow the example of read_vector (see H5FDint.c) */
- offset = HDlseek(file->fd, (HDoff_t)0, SEEK_CUR);
-
- HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed: time = %s, filename = '%s', file descriptor = %d, errno = %d, error message = '%s', buf = %p, total read size = %llu, bytes this sub-read = %llu, bytes actually read = %llu, offset = %llu", HDctime(&mytime), file->filename, file->fd, myerrno, HDstrerror(myerrno), buf, (unsigned long long)size, (unsigned long long)bytes_in, (unsigned long long)bytes_read, (unsigned long long)offset);
- } /* end if */
-
- if(0 == bytes_read) {
- /* end of file but not end of format address space */
- HDmemset(buf, 0, size);
- break;
- } /* end if */
-
- HDassert(bytes_read >= 0);
- HDassert((size_t)bytes_read <= size);
+ if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) {
+ hid_t h5_fid = (hid_t)file->inode;
+ if (mpi_enabled && (sf_read_independent(h5_fid, (int64_t)addr, (int64_t)size, 1, buf) < 0))
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "subfile read failed")
+ }
+ addr += (haddr_t)size;
- size -= (size_t)bytes_read;
- addr += (haddr_t)bytes_read;
- buf = (char *)buf + bytes_read;
- } /* end while */
+ if ( addrs_cooked ) {
+ addr -= _file->base_addr;
+ }
/* Update current position */
file->pos = addr;
@@ -1087,8 +1134,9 @@ H5FD_subfiling_write(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size, const void *buf)
{
H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file;
- HDoff_t offset = (HDoff_t)addr;
herr_t ret_value = SUCCEED; /* Return value */
+ hbool_t addrs_cooked = FALSE;
+ int mpi_enabled = 0;
FUNC_ENTER_NOAPI_NOINIT
@@ -1101,56 +1149,20 @@ H5FD_subfiling_write(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
if(REGION_OVERFLOW(addr, size))
HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow, addr = %llu, size = %llu", (unsigned long long)addr, (unsigned long long)size)
-#ifndef H5_HAVE_PREADWRITE
- /* Seek to the correct location (if we don't have pwrite) */
- if(addr != file->pos || OP_WRITE != file->op) {
- if(HDlseek(file->fd, (HDoff_t)addr, SEEK_SET) < 0)
- HSYS_GOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position")
- }
-#endif /* H5_HAVE_PREADWRITE */
+ addr += _file->base_addr;
+ addrs_cooked = TRUE; /* Follow the example of read_vector (see H5FDint.c) */
- /* Write the data, being careful of interrupted system calls and partial
- * results
- */
- while(size > 0) {
-
- h5_posix_io_t bytes_in = 0; /* # of bytes to write */
- h5_posix_io_ret_t bytes_wrote = -1; /* # of bytes written */
-
- /* Trying to write more bytes than the return type can handle is
- * undefined behavior in POSIX.
- */
- if(size > H5_POSIX_MAX_IO_BYTES)
- bytes_in = H5_POSIX_MAX_IO_BYTES;
- else
- bytes_in = (h5_posix_io_t)size;
-
- do {
-#ifdef H5_HAVE_PREADWRITE
- bytes_wrote = HDpwrite(file->fd, buf, bytes_in, offset);
- if(bytes_wrote > 0)
- offset += bytes_wrote;
-#else
- bytes_wrote = HDwrite(file->fd, buf, bytes_in);
-#endif /* H5_HAVE_PREADWRITE */
- } while(-1 == bytes_wrote && EINTR == errno);
-
- if(-1 == bytes_wrote) { /* error */
- int myerrno = errno;
- time_t mytime = HDtime(NULL);
-
- offset = HDlseek(file->fd, (HDoff_t)0, SEEK_CUR);
-
- HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed: time = %s, filename = '%s', file descriptor = %d, errno = %d, error message = '%s', buf = %p, total write size = %llu, bytes this sub-write = %llu, bytes actually written = %llu, offset = %llu", HDctime(&mytime), file->filename, file->fd, myerrno, HDstrerror(myerrno), buf, (unsigned long long)size, (unsigned long long)bytes_in, (unsigned long long)bytes_wrote, (unsigned long long)offset);
- } /* end if */
+    if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) {
+        hid_t h5_fid = (hid_t)file->inode;
+        /* Delegate the write to the subfiling layer */
+        if (mpi_enabled && (sf_write_independent(h5_fid, (int64_t)addr, (int64_t)size, 1, buf) < 0))
+            HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "subfile write failed")
+    }
- HDassert(bytes_wrote > 0);
- HDassert((size_t)bytes_wrote <= size);
+ addr += (haddr_t)size; /* Point to the end of the current IO */
- size -= (size_t)bytes_wrote;
- addr += (haddr_t)bytes_wrote;
- buf = (const char *)buf + bytes_wrote;
- } /* end while */
+ if ( addrs_cooked ) {
+ addr -= _file->base_addr;
+ }
/* Update current position and eof */
file->pos = addr;
@@ -1168,12 +1180,165 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_subfiling_write() */
+
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__subfiling_read_vector (internal function)
+ *
+ * Purpose:     Perform count reads from the specified file at the offsets
+ *              provided in the addrs array, with the lengths and memory
+ *              types provided in the sizes and types arrays.  Data read
+ *              is returned in the buffers provided in the bufs array.
+ *
+ *              All reads are done according to the data transfer property
+ *              list dxpl_id (which may be the constant H5P_DEFAULT).
+ *
+ * Return:      Success:    SUCCEED
+ *                          All reads have completed successfully, and
+ *                          the results have been read into the supplied
+ *                          buffers.
+ *
+ *              Failure:    FAIL
+ *                          The contents of supplied buffers are undefined.
+ *
+ * Programmer:  JRM -- 6/10/20
+ *
+ * Changes:     None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__subfiling_read_vector(H5FD_t *_file, hid_t dxpl_id, uint32_t count,
+    H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
+    void *bufs[] /* out */)
+{
+    H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file;
+    herr_t ret_value = SUCCEED;    /* Return value */
+    hid_t h5_fid;
+
+    FUNC_ENTER_STATIC
+
+    /* Check arguments
+     * RAW - Do we really need to check arguments once again?
+     * These have already been checked in H5FD_subfiling_read_vector (see below)!
+     */
+    if(!file)
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL")
+
+    if((!types) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "types parameter can't be NULL if count is positive")
+
+    if((!addrs) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addrs parameter can't be NULL if count is positive")
+
+    if((!sizes) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sizes parameter can't be NULL if count is positive")
+
+    if((!bufs) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs parameter can't be NULL if count is positive")
+
+    /* Get the default dataset transfer property list if the user didn't provide one */
+    if(H5P_DEFAULT == dxpl_id) {
+        dxpl_id = H5P_DATASET_XFER_DEFAULT;
+    } else {
+        if(TRUE != H5P_isa_class(dxpl_id, H5P_DATASET_XFER))
+            HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list")
+    }
+
+    /* Set DXPL for operation */
+    H5CX_set_dxpl(dxpl_id);
+    h5_fid = (hid_t)file->inode;
+    /* Was H5E_WRITEERROR / "write request" -- this is the READ path */
+    if(sf_read_vector(h5_fid, count, (hsize_t *)addrs, (hsize_t *)sizes, bufs) != SUCCEED)
+        HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "file vector read request failed")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+}
+
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__subfiling_write_vector (internal function)
+ *
+ * Purpose:     Perform count writes to the specified file at the offsets
+ *              provided in the addrs array.  Lengths and memory
+ *              types provided in the sizes and types arrays.  Data to be
+ *              written is referenced by the bufs array.
+ *
+ *              All writes are done according to the data transfer property
+ *              list dxpl_id (which may be the constant H5P_DEFAULT).
+ *
+ * Return:      Success:    SUCCEED
+ *                          All writes have completed successfully.
+ *
+ *              Failure:    FAIL
+ *                          An internal error was encountered, e.g the
+ *                          input arguments are not valid, or the actual
+ *                          subfiling writes have failed for some reason.
+ *
+ * Programmer:  JRM -- 6/10/20
+ *
+ * Changes:     None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__subfiling_write_vector(H5FD_t *_file, hid_t dxpl_id, uint32_t count,
+    H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
+    void *bufs[] /* in */)
+{
+    H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file;
+    herr_t ret_value = SUCCEED;    /* Return value */
+    hid_t h5_fid;
+
+    FUNC_ENTER_STATIC
+
+    HDassert(file != NULL); /* sanity check */
+
+    /* Check arguments
+     * RAW - Do we really need to check arguments once again?
+     * These have already been checked in H5FD_subfiling_write_vector (see below)!
+     */
+    if(!file)
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL")
+
+    if((!types) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "types parameter can't be NULL if count is positive")
+
+    if((!addrs) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addrs parameter can't be NULL if count is positive")
+
+    if((!sizes) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sizes parameter can't be NULL if count is positive")
+
+    if((!bufs) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs parameter can't be NULL if count is positive")
+
+    /* Get the default dataset transfer property list if the user didn't provide one */
+    if(H5P_DEFAULT == dxpl_id) {
+        dxpl_id = H5P_DATASET_XFER_DEFAULT;
+    } else {
+        if(TRUE != H5P_isa_class(dxpl_id, H5P_DATASET_XFER))
+            HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list")
+    }
+
+    /* Set DXPL for operation */
+    H5CX_set_dxpl(dxpl_id);
+    h5_fid = (hid_t)file->inode;
+    /* NOTE(review): bufs holds input data; consider const void *bufs[]
+     * -- verify against the H5FD vector-I/O callback signature first.
+     */
+    if(sf_write_vector(h5_fid, count, (hsize_t *)addrs, (hsize_t *)sizes, bufs) != SUCCEED)
+        HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "file vector write request failed")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5FD__subfiling_write_vector() */
+
+
/*-------------------------------------------------------------------------
* Function: H5FD_subfiling_truncate
*
- * Purpose: Makes sure that the true file size is the same (or larger)
- * than the end-of-address.
+ * Purpose: Makes sure that the true file size is the same as
+ * the end-of-allocation.
*
* Return: SUCCEED/FAIL
*
@@ -1183,11 +1348,12 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD_subfiling_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id,
+H5FD_subfiling_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id,
hbool_t H5_ATTR_UNUSED closing)
{
H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file;
herr_t ret_value = SUCCEED; /* Return value */
+ int mpi_enabled = 0;
FUNC_ENTER_NOAPI_NOINIT
@@ -1195,6 +1361,11 @@ H5FD_subfiling_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id,
/* Extend the file to make sure it's large enough */
if(!H5F_addr_eq(file->eoa, file->eof)) {
+ if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) {
+ hid_t h5_fid = (hid_t)file->inode;
+ if (mpi_enabled && (sf_truncate(h5_fid, file->eof) < 0))
+ HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to extend file properly")
+ }
#ifdef H5_HAVE_WIN32_API
LARGE_INTEGER li; /* 64-bit (union) integer for SetFilePointer() call */
DWORD dwPtrLow; /* Low-order pointer bits from SetFilePointer()
@@ -1243,9 +1414,9 @@ done:
* Function: H5FD_subfiling_lock
*
* Purpose: To place an advisory lock on a file.
- * The lock type to apply depends on the parameter "rw":
- * TRUE--opens for write: an exclusive lock
- * FALSE--opens for read: a shared lock
+ * The lock type to apply depends on the parameter "rw":
+ * TRUE--opens for write: an exclusive lock
+ * FALSE--opens for read: a shared lock
*
* Return: SUCCEED/FAIL
*
@@ -1312,3 +1483,568 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_subfiling_unlock() */
+herr_t
+H5FD__get_file_ino(const char *name, uint64_t *st_ino)
+{
+    herr_t ret_value = SUCCEED;        /* Return value */
+    h5_stat_t sb;
+
+    FUNC_ENTER_PACKAGE
+
+    /* Resolve the file's inode number; the subfiling layer uses it as
+     * a rank-invariant file identifier.
+     */
+    if(HDstat(name, &sb) < 0)
+        HSYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, FAIL, "unable to stat file")
+
+    *st_ino = sb.st_ino;
+done:
+
+    FUNC_LEAVE_NOAPI(ret_value)
+}
+
+/* Build a single-entry I/O vector covering 'elements' contiguous
+ * elements of size 'type_extent' starting at file address 'addrBase'.
+ * A negative *vlen signals that the caller has not yet allocated the
+ * offsets/blocklens/bufs arrays; they are allocated here and published
+ * back through the out-pointers.  Returns 0 on success, -1 on failure.
+ */
+static
+herr_t create_simple_vector( hid_t file_space_id, void *memDataBuf, haddr_t addrBase, hssize_t elements, size_t type_extent, hssize_t *vlen, hsize_t **_offsets, hsize_t **_blocklens, void ***_bufs )
+{
+    int n_dims = H5Sget_simple_extent_ndims(file_space_id);
+    hsize_t *offsets = *_offsets;
+    hsize_t *blocklens = *_blocklens;
+    void **bufs = *_bufs;
+    void *nextBuf = memDataBuf;
+
+    assert(vlen);
+    assert(_offsets);
+    assert(_blocklens);
+    assert(_bufs);
+
+    if (n_dims > 0) {
+        hsize_t simple_dims[n_dims];
+        hsize_t stride[n_dims];
+        if (H5Sget_simple_extent_dims(file_space_id, simple_dims, stride) < 0) {
+            puts("H5Sget_simple_extent_dims returned an error");
+            return -1;
+        }
+
+        if (*vlen < 0) {
+            /* Element types corrected: was sizeof(haddr_t) for an
+             * hsize_t array and sizeof(void **) for a void * slot.
+             */
+            offsets = (hsize_t *)malloc(sizeof(hsize_t));
+            assert(offsets);
+
+            blocklens = (hsize_t *)malloc(sizeof(hsize_t));
+            assert(blocklens);
+
+            bufs = (void **)malloc(sizeof(void *));
+            assert(bufs);
+
+            *_offsets = offsets;
+            *_blocklens = blocklens;
+            *_bufs = bufs;
+        }
+        bufs[0] = nextBuf;
+        offsets[0] = addrBase;
+        blocklens[0] = (hsize_t)((hsize_t)elements * type_extent);
+
+        *vlen = 1;
+        return 0;
+    }
+    return -1;
+}
+
+
+/* Build an I/O vector (offsets, blocklens, bufs) from the regular
+ * hyperslab selection in file_space_id, scaling selection coordinates
+ * by 'type_extent' and biasing offsets by 'addrBase'.  The three
+ * arrays are grown (realloc) when *vlen is smaller than the block
+ * count and republished through the out-pointers.
+ * Returns SUCCEED/0 on success, -1 on failure.
+ */
+static
+herr_t create_vector_from_hyperslab( hid_t file_space_id, void *memDataBuf, haddr_t addrBase, size_t type_extent, hssize_t *vlen, hsize_t **_offsets, hsize_t **_blocklens, void ***_bufs )
+{
+    herr_t ret_value = SUCCEED;
+    hssize_t k, n_blocks = H5Sget_select_hyper_nblocks(file_space_id);
+
+    /* char * so the pointer arithmetic below is standard C
+     * (arithmetic on void * is a GNU extension).
+     */
+    char *nextBuf = (char *)memDataBuf;
+
+    hsize_t stride[H5S_MAX_RANK];
+    hsize_t count[H5S_MAX_RANK];
+
+    hsize_t *strides = stride;
+    hsize_t *counts = count;
+
+    hsize_t *offsets = *_offsets;
+    hsize_t *blocklens = *_blocklens;
+    void **bufs = *_bufs;
+
+    assert(vlen);
+    assert(_offsets);
+    assert(_blocklens);
+    assert(_bufs);
+    assert(n_blocks > 0);
+
+    if (n_blocks > H5S_MAX_RANK) {
+        /* Allocate temps for the H5Sget_regular_hyperslab function call */
+        if ((strides = (hsize_t *)malloc((size_t)n_blocks * sizeof(hsize_t))) == NULL) {
+            perror("unable to allocate storage for vector creation");
+            return -1;
+        }
+        if ((counts = (hsize_t *)malloc((size_t)n_blocks * sizeof(hsize_t))) == NULL) {
+            perror("unable to allocate storage for vector creation");
+            free(strides);      /* was leaked on this error path */
+            return -1;
+        }
+    }
+
+    /* Allocate storage for the vector elements */
+    if (*vlen < n_blocks) {
+        if (offsets) {
+            offsets = (hsize_t *)realloc(offsets, ((size_t)n_blocks * sizeof(hsize_t)));
+        } else {
+            offsets = (hsize_t *)malloc(((size_t)n_blocks * sizeof(hsize_t)));
+        }
+        assert(offsets);
+        if (blocklens) {
+            blocklens = (hsize_t *)realloc(blocklens, ((size_t)n_blocks * sizeof(hsize_t)));
+        } else {
+            blocklens = (hsize_t *)malloc(((size_t)n_blocks * sizeof(hsize_t)));
+        }
+        assert(blocklens);
+        if (bufs) {
+            bufs = (void **)realloc(bufs, ((size_t)n_blocks * sizeof(void *)));
+        } else {
+            bufs = (void **)malloc(((size_t)n_blocks * sizeof(void *)));
+        }
+        assert(bufs);
+        *vlen = n_blocks;
+    }
+    /* Fill vector elements */
+    if ((ret_value = H5Sget_regular_hyperslab(file_space_id, offsets, strides, counts, blocklens)) < 0) {
+        puts("H5Sget_regular_hyperslab failed");
+        if (strides != stride)
+            free(strides);      /* were leaked on this error path */
+        if (counts != count)
+            free(counts);
+        return -1;
+    }
+
+    for(k=0; k < n_blocks; k++) {
+        bufs[k] = nextBuf;
+        offsets[k] *= type_extent;
+        offsets[k] += addrBase;
+        blocklens[k] *= type_extent;
+        /* NOTE(review): advancing the memory buffer by stride*extent
+         * assumes the in-memory layout mirrors the file selection's
+         * strides; confirm this shouldn't be blocklens[k] instead.
+         */
+        nextBuf += (strides[k] * type_extent);
+    }
+    if (strides != stride)
+        free(strides);
+    if (counts != count)
+        free(counts);
+
+    *_offsets = offsets;
+    *_blocklens = blocklens;
+    *_bufs = bufs;
+
+    return ret_value;
+}
+
+
+/* Compare mem_dims[] and file_dims[] element-wise.
+ *
+ * On return, *diff_index is the index of the first mismatching
+ * dimension, or ndims when every dimension matches; callers must
+ * distinguish the two cases by comparing *diff_index against ndims.
+ *
+ * NOTE(review): both the mismatch path and the all-match path return
+ * 0 (SUCCEED), so callers that test `check_dims(...) < 0` can never
+ * observe a failure -- confirm that is intended.
+ */
+static
+herr_t check_dims(int ndims, hsize_t *mem_dims, hsize_t *file_dims, int *diff_index)
+{
+    int i;
+    herr_t ret_value = SUCCEED;
+    for(i=0; i < ndims; i++) {
+        if (mem_dims[i] != file_dims[i]) {
+            *diff_index = i;
+            return 0;
+        }
+    }
+    /* ndims +1 == no differences */
+    *diff_index = i;
+    return ret_value;
+}
+
+/* Stub: intended to compute this rank's base file offset from the
+ * validated memory/file dataspaces, but currently ignores every
+ * argument and always returns 0.  Its only call site is inside a
+ * disabled (#if 0) branch of H5FD__dataset_write_contiguous.
+ */
+static
+haddr_t get_data_offset(int mpi_rank, int mpi_size, size_t dtype_extent, const H5S_t *mem_space, const H5S_t *file_space)
+{
+    haddr_t this_base = 0;
+    return this_base;
+}
+
+
+
+/* Compute this rank's base offset into the file dataspace, assuming
+ * the file extent is the memory extent replicated mpi_size times
+ * along one dimension (the first mismatching dimension, or dim 0 in
+ * the 1-D case).  Returns 0 when that assumption doesn't hold or the
+ * space is not simple.
+ *
+ * NOTE(review): the returned base appears to be in *elements*, not
+ * bytes (mem_dims[i] * mpi_rank, never scaled by the datatype
+ * extent), yet callers add it directly to byte addresses -- confirm
+ * whether a dtype_extent multiplication is missing.
+ */
+static
+haddr_t get_base_offset(int mpi_rank, int mpi_size, hid_t mem_space_id, hid_t file_space_id)
+{
+    haddr_t this_base = 0;
+    int n_dims;
+    int is_simple = H5Sis_simple(file_space_id);
+    /* The 'is_simple' variable is actually a tri value type:
+     *  -1 == failed
+     *   0 == NOT_SIMPLE
+     *   1 == SIMPLE
+     */
+    if (is_simple > 0) {
+        n_dims = H5Sget_simple_extent_ndims(mem_space_id);
+        if (n_dims > 0) {
+            hsize_t mem_stride[n_dims];
+            hsize_t mem_dims[n_dims];
+            hsize_t file_stride[n_dims];
+            hsize_t file_dims[n_dims];
+            hsize_t total_size;
+            if (H5Sget_simple_extent_dims(mem_space_id, mem_dims, mem_stride) < 0)
+                puts("H5Sget_simple_extent_dims returned an error");
+            if (H5Sget_simple_extent_dims(file_space_id, file_dims, file_stride) < 0)
+                puts("H5Sget_simple_extent_dims returned an error");
+
+            if (n_dims == 1) {
+                if ((total_size = mem_dims[0] * (hsize_t)mpi_size) == file_dims[0]) {
+                    this_base = (mem_dims[0] * (hsize_t)mpi_rank);
+                }
+            }
+            else {
+                int diff_index = -1;
+                if (check_dims(n_dims, mem_dims, file_dims, &diff_index) < 0)
+                    puts("check_dims returned an error");
+                /* NOTE(review): when all dimensions match, check_dims
+                 * sets diff_index == n_dims, so mem_dims[diff_index]
+                 * below reads one past the end of the array -- verify
+                 * and guard this case.
+                 */
+                if ((total_size = mem_dims[diff_index] * (hsize_t)mpi_size) == file_dims[diff_index]) {
+                    this_base = (mem_dims[diff_index] * (hsize_t)mpi_rank);
+                }
+            }
+        }
+    }
+
+    return this_base;
+}
+
+
+
+herr_t
+H5FD__dataset_write_contiguous(hid_t h5_file_id, haddr_t dataset_baseAddr, size_t dtype_extent,
+ int mpi_rank, int mpi_size, void *_dset, hid_t mem_type_id,
+ hid_t mem_space_id, hid_t file_space_id, hid_t plist_id, const void *buf)
+{
+ H5D_t *dset = (H5D_t *)_dset;
+ herr_t ret_value = SUCCEED; /* Return value */
+ hssize_t num_elem_file = -1, num_elem_mem = -1;
+ H5S_sel_type sel_type;
+ hsize_t mem_nelem, file_nelem;
+ const H5S_t *mem_space;
+ const H5S_t *file_space;
+
+ FUNC_ENTER_PACKAGE
+
+ if((num_elem_file = H5Sget_select_npoints(file_space_id)) < 0)
+ puts("can't get number of points in file selection");
+ if((num_elem_mem = H5Sget_select_npoints(mem_space_id)) < 0)
+ puts("can't get number of points in memory selection");
+
+ if(num_elem_file != num_elem_mem)
+ puts("number of elements selected in file and memory dataspaces is different");
+
+ if (H5S_get_validated_dataspace(mem_space_id, &mem_space) < 0) {
+ puts("could not get a validated dataspace from mem_space_id");
+ }
+ else mem_nelem = mem_space->extent.nelem;
+ if (H5S_get_validated_dataspace(file_space_id, &file_space) < 0) {
+ puts("could not get a validated dataspace from file_space_id");
+ }
+ else file_nelem = file_space->extent.nelem;
+
+ if (num_elem_file > 0) {
+ sel_type = H5Sget_select_type(file_space_id);
+ switch (sel_type) {
+ case H5S_SEL_NONE:
+ // printf("[%d] H5S_SEL_NONE\n", mpi_rank);
+ break;
+ case H5S_SEL_POINTS:
+ {
+ haddr_t rank_baseAddr;
+ rank_baseAddr = get_base_offset(mpi_rank, mpi_size, mem_space_id, file_space_id);
+ rank_baseAddr += dataset_baseAddr;
+ // printf("[%d] H5S_SEL_POINTS - num_elem_file: %lld: UNSUPPORTED (for now)\n", mpi_rank, num_elem_file);
+ ret_value = -1;
+ goto done;
+
+ break;
+ }
+ case H5S_SEL_HYPERSLABS:
+ {
+ int status;
+ haddr_t rank_baseAddr;
+#if 0
+ rank_baseAddr = get_data_offset(mpi_rank, mpi_size, dtype_extent, mem_space, file_space);
+
+#else
+ rank_baseAddr = get_base_offset(mpi_rank, mpi_size, mem_space_id, file_space_id);
+ rank_baseAddr += dataset_baseAddr;
+#endif
+ // printf("[%d] H5S_SEL_HYPERSLABS, file_offset = %lld\n", mpi_rank, rank_baseAddr );
+ if ((status = H5Sis_regular_hyperslab(file_space_id)) < 0) {
+ puts("H5Sis_regular_hyperslab returned an error");
+ ret_value = -1;
+ goto done;
+ }
+ if (status > 0) {
+ hssize_t previous_vlen = sf_vlen;
+ if ((mem_space->extent.rank == 1)) {
+ if (sf_offsets == NULL)
+ sf_offsets = (hsize_t *)malloc(sizeof(hsize_t));
+ if (sf_sizes == NULL)
+ sf_sizes = (hsize_t *)malloc(sizeof(hsize_t));
+ if (sf_bufs == NULL)
+ sf_bufs = (void **)malloc(sizeof(void *));
+ sf_vlen = 1;
+ assert(sf_offsets);
+ assert(sf_sizes);
+ assert(sf_bufs);
+
+ sf_offsets[0] = rank_baseAddr;
+ sf_sizes[0] = num_elem_mem * dtype_extent;
+ sf_bufs[0] = buf;
+ }
+ else if (create_vector_from_hyperslab(file_space_id, buf, rank_baseAddr, dtype_extent,
+ &sf_vlen, &sf_offsets, &sf_sizes, &sf_bufs) < 0) {
+ puts("Unable to create vectors");
+ ret_value = -1;
+ goto done;
+ }
+ ret_value = sf_write_vector(h5_file_id, sf_vlen, sf_offsets, sf_sizes, sf_bufs);
+
+ /* Possibly restore the sf_vlen value to accurately reflect the malloc sizes */
+ if (sf_vlen < previous_vlen)
+ sf_vlen = previous_vlen;
+ }
+ break;
+ }
+ case H5S_SEL_ALL:
+ {
+ int status;
+ haddr_t rank_baseAddr;
+ rank_baseAddr = get_base_offset(mpi_rank, mpi_size, mem_space_id, file_space_id);
+ rank_baseAddr += dataset_baseAddr;
+ // printf("[%d] H5S_SEL_ALL\n", mpi_rank);
+ status = H5Sis_simple(file_space_id);
+            if (status > 0) {
+                if (create_simple_vector(file_space_id, buf, rank_baseAddr, num_elem_mem,
+                        dtype_extent, &sf_vlen, &sf_offsets, &sf_sizes, &sf_bufs) < 0) {
+                    puts("Unable to create simple vectors");
+                    ret_value = -1; /* was missing: failure previously returned SUCCEED */
+                    goto done;
+                }
+                ret_value = sf_write_vector(h5_file_id, sf_vlen, sf_offsets, sf_sizes, sf_bufs);
+            }
+ break;
+ }
+ default:
+ printf("[%d] UNSUPPORTED selection type\n", mpi_rank);
+ ret_value = -1;
+ } /* END switch (sel_type) */
+
+ } /* if (num_elem_file > 0) */
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+}
+
+herr_t
+H5FD__dataset_read_contiguous(hid_t h5_file_id, haddr_t dataset_baseAddr, size_t dtype_extent,
+ int mpi_rank, int mpi_size, void *_dset, hid_t mem_type_id,
+ hid_t mem_space_id, hid_t file_space_id, hid_t plist_id, void *buf)
+{
+ H5FD_t *dset = (H5FD_t *)_dset;
+ herr_t ret_value = SUCCEED; /* Return value */
+ hssize_t num_elem_file = -1, num_elem_mem = -1;
+ H5S_sel_type sel_type;
+
+ FUNC_ENTER_PACKAGE
+ if((num_elem_file = H5Sget_select_npoints(file_space_id)) < 0)
+ puts("can't get number of points in file selection");
+ if((num_elem_mem = H5Sget_select_npoints(mem_space_id)) < 0)
+ puts("can't get number of points in memory selection");
+
+ if(num_elem_file != num_elem_mem)
+ puts("number of elements selected in file and memory dataspaces is different");
+
+ if (num_elem_file > 0) {
+ sel_type = H5Sget_select_type(file_space_id);
+ switch (sel_type) {
+ case H5S_SEL_NONE:
+ // printf("[%d] H5S_SEL_NONE\n", mpi_rank);
+ break;
+ case H5S_SEL_POINTS:
+ {
+ int status;
+ haddr_t rank_baseAddr;
+ rank_baseAddr = get_base_offset(mpi_rank, mpi_size, mem_space_id, file_space_id);
+ rank_baseAddr += dataset_baseAddr;
+ // printf("[%d] H5S_SEL_POINTS - num_elem_file: %lld: UNSUPPORTED (for now)\n", mpi_rank, num_elem_file);
+ ret_value = -1;
+ goto done;
+
+ break;
+ }
+ case H5S_SEL_HYPERSLABS:
+ {
+ int status;
+ haddr_t rank_baseAddr;
+ const H5S_t *mem_space;
+ const H5S_t *file_space;
+ rank_baseAddr = get_base_offset(mpi_rank, mpi_size, mem_space_id, file_space_id);
+ rank_baseAddr += dataset_baseAddr;
+ if (H5S_get_validated_dataspace(mem_space_id, &mem_space) < 0) {
+ puts("could not get a validated dataspace from mem_space_id");
+ }
+ if (H5S_get_validated_dataspace(file_space_id, &file_space) < 0) {
+ puts("could not get a validated dataspace from file_space_id");
+ }
+
+ // printf("[%d] H5S_SEL_HYPERSLABS, file_offset = %lld\n", mpi_rank, rank_baseAddr );
+ if ((status = H5Sis_regular_hyperslab(file_space_id)) < 0) {
+ puts("H5Sis_regular_hyperslab returned an error");
+ ret_value = -1;
+ goto done;
+ }
+ if (status > 0) {
+ hssize_t previous_vlen = sf_vlen;
+ if (mem_space->extent.rank == 1) {
+ if (sf_offsets == NULL)
+ sf_offsets = (hsize_t *)malloc(sizeof(hsize_t));
+ if (sf_sizes == NULL)
+ sf_sizes = (hsize_t *)malloc(sizeof(hsize_t));
+ if (sf_bufs == NULL)
+ sf_bufs = (void **)malloc(sizeof(void *));
+ sf_vlen = 1;
+ assert(sf_offsets);
+ assert(sf_sizes);
+ assert(sf_bufs);
+
+ sf_offsets[0] = rank_baseAddr;
+ sf_sizes[0] = num_elem_mem * dtype_extent;
+ sf_bufs[0] = buf;
+ }
+ else if (create_vector_from_hyperslab(file_space_id, buf, rank_baseAddr, dtype_extent,
+ &sf_vlen, &sf_offsets, &sf_sizes, &sf_bufs) < 0) {
+ puts("Unable to create vectors");
+ ret_value = -1;
+ goto done;
+ }
+ ret_value = sf_read_vector(h5_file_id, sf_vlen, sf_offsets, sf_sizes, sf_bufs);
+
+ /* Possibly restore the sf_vlen value to accurately reflect the malloc sizes */
+ if (sf_vlen < previous_vlen)
+ sf_vlen = previous_vlen;
+ }
+ break;
+ }
+ case H5S_SEL_ALL:
+ {
+ int status;
+ haddr_t rank_baseAddr;
+ rank_baseAddr = get_base_offset(mpi_rank, mpi_size, mem_space_id, file_space_id);
+ rank_baseAddr += dataset_baseAddr;
+ // printf("[%d] H5S_SEL_ALL\n", mpi_rank);
+ status = H5Sis_simple(file_space_id);
+            if (status > 0) {
+                if (create_simple_vector(file_space_id, buf, rank_baseAddr, num_elem_mem,
+                        dtype_extent, &sf_vlen, &sf_offsets, &sf_sizes, &sf_bufs) < 0) {
+                    puts("Unable to create simple vectors");
+                    ret_value = -1; /* was missing: failure previously returned SUCCEED */
+                    goto done;
+                }
+                ret_value = sf_read_vector(h5_file_id, sf_vlen, sf_offsets, sf_sizes, sf_bufs);
+            }
+ break;
+ }
+ default:
+ printf("[%d] UNSUPPORTED selection type\n", mpi_rank);
+ ret_value = -1;
+ } /* END switch (sel_type) */
+
+ } /* if (num_elem_file > 0) */
+
+done:
+
+ FUNC_LEAVE_NOAPI(ret_value)
+
+}
+
+/* Return this process's rank in the file's communicator (cached in
+ * file->mpi_rank at open time).
+ */
+static int
+H5FD_subfiling_mpi_rank(const H5FD_t *_file)
+{
+    const H5FD_subfiling_t *file = (const H5FD_subfiling_t*)_file;
+
+    FUNC_ENTER_STATIC_NOERR
+
+    /* Sanity checks */
+    HDassert(file);
+
+    FUNC_LEAVE_NOAPI(file->mpi_rank)
+} /* end H5FD_subfiling_mpi_rank() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_subfiling_mpi_size
+ *
+ * Purpose: Returns the number of MPI processes
+ *
+ * Return: Success: non-negative
+ * Failure: negative
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, May 16, 2002
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+H5FD_subfiling_mpi_size(const H5FD_t *_file)
+{
+ const H5FD_subfiling_t *file = (const H5FD_subfiling_t*)_file;
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Sanity checks */
+ HDassert(file);
+
+ FUNC_LEAVE_NOAPI(file->mpi_size)
+} /* end H5FD__subfiling_mpi_size() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_subfiling_communicator
+ *
+ * Purpose: Returns the MPI communicator for the file.
+ *
+ * Return: Success: The communicator
+ * Failure: Can't fail
+ *
+ * Programmer: Robb Matzke
+ * Monday, August 9, 1999
+ *
+ *-------------------------------------------------------------------------
+ */
+static MPI_Comm
+H5FD_subfiling_communicator(const H5FD_t *_file)
+{
+ const H5FD_subfiling_t *file = (const H5FD_subfiling_t*)_file;
+
+ FUNC_ENTER_STATIC_NOERR
+
+ /* Sanity checks */
+ HDassert(file);
+
+ FUNC_LEAVE_NOAPI(file->comm)
+} /* end H5FD__subfiling_communicator() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_subfiling_get_info
+ *
+ * Purpose: Returns the file info of SUBFILING file driver.
+ *
+ * Returns: Non-negative if succeed or negative if fails.
+ *
+ * Programmer: John Mainzer
+ * April 4, 2017
+ *
+ *-------------------------------------------------------------------------
+*/
+static herr_t
+H5FD_subfiling_get_info(H5FD_t *_file, void **mpi_info)
+{
+    H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file;
+    herr_t ret_value = SUCCEED;
+
+    FUNC_ENTER_STATIC
+
+    if(!mpi_info)
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "mpi info not valid")
+
+    /* Hand back a pointer to the driver's cached MPI_Info object */
+    *mpi_info = &(file->info);
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* H5FD_subfiling_get_info() */
+
diff --git a/src/H5FDsubfiling.h b/src/H5FDsubfiling.h
index b4b3f1c..8113354 100644
--- a/src/H5FDsubfiling.h
+++ b/src/H5FDsubfiling.h
@@ -66,9 +66,25 @@ extern "C" {
#endif
H5_DLL hid_t H5FD_subfiling_init(void);
-H5_DLL herr_t H5Pget_fapl_subfiling(hid_t fapl_id,
- H5FD_subfiling_fapl_t *fa_out);
+H5_DLL herr_t H5Pget_fapl_subfiling(hid_t fapl_id, H5FD_subfiling_fapl_t *fa_out);
H5_DLL herr_t H5Pset_fapl_subfiling(hid_t fapl_id, H5FD_subfiling_fapl_t *fa);
+H5_DLL herr_t H5FD__get_file_ino(const char *name, uint64_t *st_ino);
+H5_DLL herr_t H5FD__dataset_write_contiguous(hid_t h5_file_id, haddr_t dataset_baseAddr, size_t dtype_extent,
+ int mpi_rank, int mpi_size, void *_dset, hid_t mem_type_id, hid_t mem_space_id,
+ hid_t file_space_id, hid_t plist_id, const void *buf);
+H5_DLL herr_t H5FD__dataset_read_contiguous(hid_t h5_file_id, haddr_t dataset_baseAddr, size_t dtype_extent,
+ int mpi_rank, int mpi_size, void *_dset, hid_t mem_type_id, hid_t mem_space_id,
+ hid_t file_space_id, hid_t plist_id, void *buf);
+
+/* Copied from:: H5FDsubfile_private.h */
+H5_DLL int sf_open_subfiles(hid_t context_id, char *filename, char *prefix, int flags);
+H5_DLL int sf_close_subfiles(hid_t context_id);
+H5_DLL int sf_read_independent(hid_t sf_fid, int64_t offset, int64_t elements, int dtype_extent, void *data);
+H5_DLL int sf_write_independent(hid_t sf_fid, int64_t offset, int64_t elements, int dtype_extent, const void *data);
+H5_DLL herr_t sf_read_vector(hid_t h5_fid, hssize_t count, hsize_t addrs[], hsize_t sizes[], void *bufs[] /* out */);
+H5_DLL herr_t sf_write_vector(hid_t h5_fid, hssize_t count, hsize_t addrs[], hsize_t sizes[], void *bufs[] /* in */);
+H5_DLL int sf_truncate(hid_t h5_fid, haddr_t addr);
+
#ifdef __cplusplus
}
diff --git a/src/Makefile.am b/src/Makefile.am
index e1d8591..995af7b 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -207,7 +207,7 @@ $(top_srcdir)/src/H5overflow.h: $(top_srcdir)/src/H5overflow.txt
trace: $(libhdf5_la_SOURCES)
@for dep in $? dummy; do \
- if test $$dep != "dummy" -a -n "$(PERL)"; then \
+ if test $$dep != "dummy" -a -n "$(PERL)"; then \
case "$$dep" in \
*.c) \
$(TRACE) $$dep; \
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 311d753..7da92cd 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -145,6 +145,7 @@ if (BUILD_SHARED_LIBS)
#-----------------------------------------------------------------------------
set (VOL_PLUGIN_LIBS
null_vol_connector
+ h5subfiling_vol
)
foreach (vol_lib ${VOL_PLUGIN_LIBS})
diff --git a/test/Makefile.am b/test/Makefile.am
index 7ebeae7..8e6a900 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -116,7 +116,8 @@ if HAVE_SHARED_CONDITIONAL
# The libh5test library provides common support code for the tests.
# The filter_plugin* libraries are for use in filter_plugin.c.
# Build them as shared libraries if that option was enabled in configure.
- noinst_LTLIBRARIES=libh5test.la libfilter_plugin1_dsets.la libfilter_plugin2_dsets.la libfilter_plugin3_dsets.la libfilter_plugin4_groups.la libnull_vol_connector.la
+ noinst_LTLIBRARIES=libh5test.la libfilter_plugin1_dsets.la libfilter_plugin2_dsets.la libfilter_plugin3_dsets.la libfilter_plugin4_groups.la \
+ libnull_vol_connector.la libh5subfiling_vol.la
libfilter_plugin1_dsets_la_SOURCES=filter_plugin1_dsets.c
libfilter_plugin2_dsets_la_SOURCES=filter_plugin2_dsets.c
libfilter_plugin3_dsets_la_SOURCES=filter_plugin3_dsets.c
@@ -133,7 +134,8 @@ if HAVE_SHARED_CONDITIONAL
# null_vol_connector is used for testing basic VOL plugin functionality.
libnull_vol_connector_la_SOURCES=null_vol_connector.c
libnull_vol_connector_la_LDFLAGS=$(AM_LDFLAGS) -avoid-version -module -shared -export-dynamic -rpath /nowhere
-
+ libh5subfiling_vol_la_SOURCES=h5subfiling_vol.c
+ libh5subfiling_vol_la_LDFLAGS=$(AM_LDFLAGS) -avoid-version -module -shared -export-dynamic -rpath /nowhere
else
# The libh5test library provides common support code for the tests.
noinst_LTLIBRARIES=libh5test.la
diff --git a/test/tselect.c b/test/tselect.c
index c98db5d..305d660 100644
--- a/test/tselect.c
+++ b/test/tselect.c
@@ -14915,8 +14915,8 @@ test_sel_iter(void)
{
hid_t sid; /* Dataspace ID */
hid_t iter_id; /* Dataspace selection iterator ID */
- hsize_t dims1[] = {6, 12}; /* 2-D Dataspace dimensions */
- hsize_t coord1[POINT1_NPOINTS][2]; /* Coordinates for point selection */
+ hsize_t dims1[] = {6, 12}; /* 2-D Dataspace dimensions */
+ hsize_t coord1[POINT1_NPOINTS][2]; /* Coordinates for point selection */
hsize_t start[2]; /* Hyperslab start */
hsize_t stride[2]; /* Hyperslab stride */
hsize_t count[2]; /* Hyperslab block count */
@@ -14928,7 +14928,7 @@ test_sel_iter(void)
H5S_sel_type sel_type; /* Selection type */
unsigned sel_share; /* Whether to share selection with dataspace */
unsigned sel_iter_flags; /* Flags for selection iterator creation */
- herr_t ret; /* Generic return value */
+ herr_t ret; /* Generic return value */
/* Output message about test being performed */
MESSAGE(6, ("Testing Dataspace Selection Iterators\n"));
diff --git a/test/vfd.c b/test/vfd.c
index 8b59341..4fe229d 100644
--- a/test/vfd.c
+++ b/test/vfd.c
@@ -12,7 +12,7 @@
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
- * Programmer: Raymond Lu
+ * Programmer: Raymond Lu <slu@ncsa.uiuc.edu>
* Tuesday, Sept 24, 2002
*
* Purpose: Tests the basic features of Virtual File Drivers
@@ -2258,51 +2258,46 @@ static int
compare_splitter_config_info(hid_t fapl_id, H5FD_splitter_vfd_config_t *info)
{
int ret_value = 0;
- H5FD_splitter_vfd_config_t *fetched_info = NULL;
-
- if (NULL == (fetched_info = HDcalloc(1, sizeof(H5FD_splitter_vfd_config_t))))
- SPLITTER_TEST_FAULT("memory allocation for fetched_info struct failed");
+ H5FD_splitter_vfd_config_t fetched_info;
- fetched_info->magic = H5FD_SPLITTER_MAGIC;
- fetched_info->version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION;
- fetched_info->rw_fapl_id = H5I_INVALID_HID;
- fetched_info->wo_fapl_id = H5I_INVALID_HID;
+ fetched_info.magic = H5FD_SPLITTER_MAGIC;
+ fetched_info.version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION;
+ fetched_info.rw_fapl_id = H5I_INVALID_HID;
+ fetched_info.wo_fapl_id = H5I_INVALID_HID;
- if (H5Pget_fapl_splitter(fapl_id, fetched_info) < 0) {
- SPLITTER_TEST_FAULT("can't get splitter info");
+ if (H5Pget_fapl_splitter(fapl_id, &fetched_info) < 0) {
+ SPLITTER_TEST_FAULT("can't get splitter info\n");
}
if (info->rw_fapl_id == H5P_DEFAULT) {
- if (H5Pget_driver(fetched_info->rw_fapl_id) != H5Pget_driver(H5P_FILE_ACCESS_DEFAULT)) {
+ if (H5Pget_driver(fetched_info.rw_fapl_id) != H5Pget_driver(H5P_FILE_ACCESS_DEFAULT)) {
SPLITTER_TEST_FAULT("Read-Write driver mismatch (default)\n");
}
}
else {
- if (H5Pget_driver(fetched_info->rw_fapl_id) != H5Pget_driver(info->rw_fapl_id)) {
+ if (H5Pget_driver(fetched_info.rw_fapl_id) != H5Pget_driver(info->rw_fapl_id)) {
SPLITTER_TEST_FAULT("Read-Write driver mismatch\n");
}
}
if (info->wo_fapl_id == H5P_DEFAULT) {
- if (H5Pget_driver(fetched_info->wo_fapl_id) != H5Pget_driver(H5P_FILE_ACCESS_DEFAULT)) {
+ if (H5Pget_driver(fetched_info.wo_fapl_id) != H5Pget_driver(H5P_FILE_ACCESS_DEFAULT)) {
SPLITTER_TEST_FAULT("Write-Only driver mismatch (default)\n");
}
}
else {
- if (H5Pget_driver(fetched_info->wo_fapl_id) != H5Pget_driver(info->wo_fapl_id)) {
+ if (H5Pget_driver(fetched_info.wo_fapl_id) != H5Pget_driver(info->wo_fapl_id)) {
SPLITTER_TEST_FAULT("Write-Only driver mismatch\n");
}
}
- if ( (HDstrlen(info->wo_path) != HDstrlen(fetched_info->wo_path)) ||
- HDstrncmp(info->wo_path, fetched_info->wo_path, H5FD_SPLITTER_PATH_MAX))
+ if ( (HDstrlen(info->wo_path) != HDstrlen(fetched_info.wo_path)) ||
+ HDstrncmp(info->wo_path, fetched_info.wo_path, H5FD_SPLITTER_PATH_MAX))
{
- HDfprintf(stderr, "MISMATCH: '%s' :: '%s'\n", info->wo_path, fetched_info->wo_path);
+ HDfprintf(stderr, "MISMATCH: '%s' :: '%s'\n", info->wo_path, fetched_info.wo_path);
HEXPRINT(H5FD_SPLITTER_PATH_MAX, info->wo_path);
- HEXPRINT(H5FD_SPLITTER_PATH_MAX, fetched_info->wo_path);
+ HEXPRINT(H5FD_SPLITTER_PATH_MAX, fetched_info.wo_path);
SPLITTER_TEST_FAULT("Write-Only file path mismatch\n");
}
done:
- HDfree(fetched_info);
-
return ret_value;
} /* end compare_splitter_config_info() */
@@ -2336,42 +2331,37 @@ run_splitter_test(const struct splitter_dataset_def *data,
hid_t space_id = H5I_INVALID_HID;
hid_t fapl_id_out = H5I_INVALID_HID;
hid_t fapl_id_cpy = H5I_INVALID_HID;
- H5FD_splitter_vfd_config_t *vfd_config = NULL;
- char *filename_rw = NULL;
+ H5FD_splitter_vfd_config_t vfd_config;
+ char filename_rw[H5FD_SPLITTER_PATH_MAX + 1];
FILE *logfile = NULL;
int ret_value = 0;
- if (NULL == (vfd_config = HDcalloc(1, sizeof(H5FD_splitter_vfd_config_t))))
- SPLITTER_TEST_FAULT("memory allocation for vfd_config struct failed");
- if (NULL == (filename_rw = HDcalloc(H5FD_SPLITTER_PATH_MAX + 1, sizeof(char))))
- SPLITTER_TEST_FAULT("memory allocation for filename_rw string failed");
+ vfd_config.magic = H5FD_SPLITTER_MAGIC;
+ vfd_config.version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION;
+ vfd_config.ignore_wo_errs = ignore_wo_errors;
+ vfd_config.rw_fapl_id = sub_fapl_ids[0];
+ vfd_config.wo_fapl_id = sub_fapl_ids[1];
- vfd_config->magic = H5FD_SPLITTER_MAGIC;
- vfd_config->version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION;
- vfd_config->ignore_wo_errs = ignore_wo_errors;
- vfd_config->rw_fapl_id = sub_fapl_ids[0];
- vfd_config->wo_fapl_id = sub_fapl_ids[1];
-
- if (splitter_prepare_file_paths(vfd_config, filename_rw) < 0) {
+ if (splitter_prepare_file_paths(&vfd_config, filename_rw) < 0) {
SPLITTER_TEST_FAULT("can't prepare file paths\n");
}
if (provide_logfile_path == FALSE) {
- vfd_config->log_file_path[0] = '\0'; /* reset as empty string */
+ *vfd_config.log_file_path = '\0'; /* reset as empty string */
}
/* Create a new fapl to use the SPLITTER file driver */
if ((fapl_id = H5Pcreate(H5P_FILE_ACCESS)) == H5I_INVALID_HID) {
SPLITTER_TEST_FAULT("can't create FAPL ID\n");
}
- if (H5Pset_fapl_splitter(fapl_id, vfd_config) < 0) {
+ if (H5Pset_fapl_splitter(fapl_id, &vfd_config) < 0) {
SPLITTER_TEST_FAULT("can't set splitter FAPL\n");
}
if (H5Pget_driver(fapl_id) != H5FD_SPLITTER) {
SPLITTER_TEST_FAULT("set FAPL not SPLITTER\n");
}
- if (compare_splitter_config_info(fapl_id, vfd_config) < 0) {
+ if (compare_splitter_config_info(fapl_id, &vfd_config) < 0) {
SPLITTER_TEST_FAULT("information mismatch\n");
}
@@ -2384,7 +2374,7 @@ run_splitter_test(const struct splitter_dataset_def *data,
if (H5I_INVALID_HID == fapl_id_cpy) {
SPLITTER_TEST_FAULT("can't copy FAPL\n");
}
- if (compare_splitter_config_info(fapl_id_cpy, vfd_config) < 0) {
+ if (compare_splitter_config_info(fapl_id_cpy, &vfd_config) < 0) {
SPLITTER_TEST_FAULT("information mismatch\n");
}
if (H5Pclose(fapl_id_cpy) < 0) {
@@ -2411,7 +2401,7 @@ run_splitter_test(const struct splitter_dataset_def *data,
if (H5Pget_driver(fapl_id_out) != H5FD_SPLITTER) {
SPLITTER_TEST_FAULT("wrong file FAPL driver\n");
}
- if (compare_splitter_config_info(fapl_id_out, vfd_config) < 0) {
+ if (compare_splitter_config_info(fapl_id_out, &vfd_config) < 0) {
SPLITTER_TEST_FAULT("information mismatch\n");
}
if (H5Pclose(fapl_id_out) < 0) {
@@ -2449,12 +2439,12 @@ run_splitter_test(const struct splitter_dataset_def *data,
}
/* Verify that the R/W and W/O files are identical */
- if (h5_compare_file_bytes(filename_rw, vfd_config->wo_path) < 0) {
+ if (h5_compare_file_bytes(filename_rw, vfd_config.wo_path) < 0) {
SPLITTER_TEST_FAULT("files are not byte-for-byte equivalent\n");
}
- /* Verify existence of logfile if appropriate */
- logfile = fopen(vfd_config->log_file_path, "r");
+ /* Verify existence of logfile iff appropriate */
+ logfile = fopen(vfd_config.log_file_path, "r");
if ( (TRUE == provide_logfile_path && NULL == logfile) ||
(FALSE == provide_logfile_path && NULL != logfile) )
{
@@ -2464,22 +2454,19 @@ run_splitter_test(const struct splitter_dataset_def *data,
done:
if (ret_value < 0) {
H5E_BEGIN_TRY {
- H5Dclose(dset_id);
- H5Sclose(space_id);
- H5Pclose(fapl_id_out);
- H5Pclose(fapl_id_cpy);
- H5Pclose(fapl_id);
- H5Fclose(file_id);
+ (void)H5Dclose(dset_id);
+ (void)H5Sclose(space_id);
+ (void)H5Pclose(fapl_id_out);
+ (void)H5Pclose(fapl_id_cpy);
+ (void)H5Pclose(fapl_id);
+ (void)H5Fclose(file_id);
} H5E_END_TRY;
}
-
- if (logfile != NULL)
+ if (logfile != NULL) {
fclose(logfile);
-
- HDfree(vfd_config);
- HDfree(filename_rw);
-
+ }
return ret_value;
+
} /* end run_splitter_test() */
@@ -2501,28 +2488,25 @@ done:
static int
driver_is_splitter_compatible(hid_t fapl_id)
{
- H5FD_splitter_vfd_config_t *vfd_config = NULL;
+ H5FD_splitter_vfd_config_t vfd_config;
hid_t split_fapl_id = H5I_INVALID_HID;
herr_t ret = SUCCEED;
int ret_value = 0;
- if (NULL == (vfd_config = HDcalloc(1, sizeof(H5FD_splitter_vfd_config_t)))) {
- FAIL_PUTS_ERROR("memory allocation for vfd_config struct failed");
- }
-
- if(H5I_INVALID_HID == (split_fapl_id = H5Pcreate(H5P_FILE_ACCESS))) {
+ split_fapl_id = H5Pcreate(H5P_FILE_ACCESS);
+ if (H5I_INVALID_HID == split_fapl_id) {
FAIL_PUTS_ERROR("Can't create contained FAPL");
}
- vfd_config->magic = H5FD_SPLITTER_MAGIC;
- vfd_config->version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION;
- vfd_config->ignore_wo_errs = FALSE;
- vfd_config->rw_fapl_id = H5P_DEFAULT;
- vfd_config->wo_fapl_id = fapl_id;
- HDstrncpy(vfd_config->wo_path, "nonesuch", H5FD_SPLITTER_PATH_MAX);
- vfd_config->log_file_path[0] = '\0';
+ vfd_config.magic = H5FD_SPLITTER_MAGIC;
+ vfd_config.version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION;
+ vfd_config.ignore_wo_errs = FALSE;
+ vfd_config.rw_fapl_id = H5P_DEFAULT;
+ vfd_config.wo_fapl_id = fapl_id;
+ HDstrncpy(vfd_config.wo_path, "nonesuch", H5FD_SPLITTER_PATH_MAX);
+ *vfd_config.log_file_path = '\0';
H5E_BEGIN_TRY {
- ret = H5Pset_fapl_splitter(split_fapl_id, vfd_config);
+ ret = H5Pset_fapl_splitter(split_fapl_id, &vfd_config);
} H5E_END_TRY;
if (SUCCEED == ret) {
ret_value = -1;
@@ -2533,17 +2517,12 @@ driver_is_splitter_compatible(hid_t fapl_id)
}
split_fapl_id = H5I_INVALID_HID;
- HDfree(vfd_config);
-
return ret_value;
error:
H5E_BEGIN_TRY {
- H5Pclose(split_fapl_id);
+ (void)H5Pclose(split_fapl_id);
} H5E_END_TRY;
-
- HDfree(vfd_config);
-
return -1;
} /* end driver_is_splitter_compatible() */
@@ -2566,24 +2545,19 @@ splitter_RO_test(
const struct splitter_dataset_def *data,
hid_t child_fapl_id)
{
- char *filename_rw = NULL;
- H5FD_splitter_vfd_config_t *vfd_config = NULL;
+ char filename_rw[H5FD_SPLITTER_PATH_MAX + 1];
+ H5FD_splitter_vfd_config_t vfd_config;
hid_t fapl_id = H5I_INVALID_HID;
- hid_t file_id = H5I_INVALID_HID;
int ret_value = 0;
+ hid_t file_id = H5I_INVALID_HID;
- if (NULL == (vfd_config = HDcalloc(1, sizeof(H5FD_splitter_vfd_config_t))))
- SPLITTER_TEST_FAULT("memory allocation for vfd_config struct failed");
- if (NULL == (filename_rw = HDcalloc(H5FD_SPLITTER_PATH_MAX + 1, sizeof(char))))
- SPLITTER_TEST_FAULT("memory allocation for filename_rw string failed");
-
- vfd_config->magic = H5FD_SPLITTER_MAGIC;
- vfd_config->version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION;
- vfd_config->ignore_wo_errs = FALSE;
- vfd_config->rw_fapl_id = child_fapl_id;
- vfd_config->wo_fapl_id = child_fapl_id;
+ vfd_config.magic = H5FD_SPLITTER_MAGIC;
+ vfd_config.version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION;
+ vfd_config.ignore_wo_errs = FALSE;
+ vfd_config.rw_fapl_id = child_fapl_id;
+ vfd_config.wo_fapl_id = child_fapl_id;
- if (splitter_prepare_file_paths(vfd_config, filename_rw) < 0) {
+ if (splitter_prepare_file_paths(&vfd_config, filename_rw) < 0) {
SPLITTER_TEST_FAULT("can't prepare splitter file paths\n");
}
@@ -2592,7 +2566,7 @@ splitter_RO_test(
if (H5I_INVALID_HID == fapl_id) {
SPLITTER_TEST_FAULT("can't create FAPL ID\n");
}
- if (H5Pset_fapl_splitter(fapl_id, vfd_config) < 0) {
+ if (H5Pset_fapl_splitter(fapl_id, &vfd_config) < 0) {
SPLITTER_TEST_FAULT("can't set splitter FAPL\n");
}
if (H5Pget_driver(fapl_id) != H5FD_SPLITTER) {
@@ -2614,7 +2588,7 @@ splitter_RO_test(
* Should fail.
*/
- if (splitter_create_single_file_at(vfd_config->wo_path, vfd_config->wo_fapl_id, data) < 0) {
+ if (splitter_create_single_file_at(vfd_config.wo_path, vfd_config.wo_fapl_id, data) < 0) {
SPLITTER_TEST_FAULT("can't write W/O file\n");
}
H5E_BEGIN_TRY {
@@ -2623,13 +2597,13 @@ splitter_RO_test(
if (file_id >= 0) {
SPLITTER_TEST_FAULT("R/O open with extant W/O file unexpectedly successful\n");
}
- HDremove(vfd_config->wo_path);
+ HDremove(vfd_config.wo_path);
/* Attempt R/O open when only R/W file exists
* Should fail.
*/
- if (splitter_create_single_file_at(filename_rw, vfd_config->rw_fapl_id, data) < 0) {
+ if (splitter_create_single_file_at(filename_rw, vfd_config.rw_fapl_id, data) < 0) {
SPLITTER_TEST_FAULT("can't create R/W file\n");
}
H5E_BEGIN_TRY {
@@ -2642,7 +2616,7 @@ splitter_RO_test(
/* Attempt R/O open when both R/W and W/O files exist
*/
- if (splitter_create_single_file_at(vfd_config->wo_path, vfd_config->wo_fapl_id, data) < 0) {
+ if (splitter_create_single_file_at(vfd_config.wo_path, vfd_config.wo_fapl_id, data) < 0) {
SPLITTER_TEST_FAULT("can't create W/O file\n");
}
file_id = H5Fopen(filename_rw, H5F_ACC_RDONLY, fapl_id);
@@ -2668,14 +2642,10 @@ splitter_RO_test(
done:
if (ret_value < 0) {
H5E_BEGIN_TRY {
- H5Pclose(fapl_id);
- H5Fclose(file_id);
+ (void)H5Pclose(fapl_id);
+ (void)H5Fclose(file_id);
} H5E_END_TRY;
- }
-
- HDfree(vfd_config);
- HDfree(filename_rw);
-
+ } /* end if error */
return ret_value;
} /* end splitter_RO_test() */
@@ -2814,9 +2784,9 @@ splitter_create_single_file_at(
done:
if (ret_value < 0) {
H5E_BEGIN_TRY {
- H5Dclose(dset_id);
- H5Sclose(space_id);
- H5Fclose(file_id);
+ (void)H5Dclose(dset_id);
+ (void)H5Sclose(space_id);
+ (void)H5Fclose(file_id);
} H5E_END_TRY;
} /* end if error */
return ret_value;
@@ -2877,7 +2847,7 @@ splitter_compare_expected_data(hid_t file_id,
done:
if (ret_value < 0) {
H5E_BEGIN_TRY {
- H5Dclose(dset_id);
+ (void)H5Dclose(dset_id);
} H5E_END_TRY;
}
return ret_value;
@@ -2910,9 +2880,8 @@ done:
static int
splitter_tentative_open_test(hid_t child_fapl_id)
{
- const char *filename_tmp = "splitter_tmp.h5";
- char *filename_rw = NULL;
- H5FD_splitter_vfd_config_t *vfd_config = NULL;
+ char filename_rw[H5FD_SPLITTER_PATH_MAX + 1];
+ H5FD_splitter_vfd_config_t vfd_config;
hid_t fapl_id = H5I_INVALID_HID;
hid_t file_id = H5I_INVALID_HID;
int buf[SPLITTER_SIZE][SPLITTER_SIZE]; /* for comparison */
@@ -2922,11 +2891,6 @@ splitter_tentative_open_test(hid_t child_fapl_id)
struct splitter_dataset_def data; /* for comparison */
int ret_value = 0;
- if (NULL == (vfd_config = HDcalloc(1, sizeof(H5FD_splitter_vfd_config_t))))
- SPLITTER_TEST_FAULT("memory allocation for vfd_config struct failed");
- if (NULL == (filename_rw = HDcalloc(H5FD_SPLITTER_PATH_MAX + 1, sizeof(char))))
- SPLITTER_TEST_FAULT("memory allocation for filename_rw string failed");
-
/* pre-fill data buffer to write */
for (i=0; i < SPLITTER_SIZE; i++) {
for (j=0; j < SPLITTER_SIZE; j++) {
@@ -2941,13 +2905,13 @@ splitter_tentative_open_test(hid_t child_fapl_id)
data.n_dims = 2;
data.dset_name = SPLITTER_DATASET_NAME;
- vfd_config->magic = H5FD_SPLITTER_MAGIC;
- vfd_config->version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION;
- vfd_config->ignore_wo_errs = FALSE;
- vfd_config->rw_fapl_id = child_fapl_id;
- vfd_config->wo_fapl_id = child_fapl_id;
+ vfd_config.magic = H5FD_SPLITTER_MAGIC;
+ vfd_config.version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION;
+ vfd_config.ignore_wo_errs = FALSE;
+ vfd_config.rw_fapl_id = child_fapl_id;
+ vfd_config.wo_fapl_id = child_fapl_id;
- if (splitter_prepare_file_paths(vfd_config, filename_rw) < 0) {
+ if (splitter_prepare_file_paths(&vfd_config, filename_rw) < 0) {
SPLITTER_TEST_FAULT("can't prepare splitter file paths\n");
}
@@ -2955,23 +2919,14 @@ splitter_tentative_open_test(hid_t child_fapl_id)
if ((fapl_id = H5Pcreate(H5P_FILE_ACCESS)) == H5I_INVALID_HID) {
SPLITTER_TEST_FAULT("can't create FAPL ID\n");
}
- if (H5Pset_fapl_splitter(fapl_id, vfd_config) < 0) {
+ if (H5Pset_fapl_splitter(fapl_id, &vfd_config) < 0) {
SPLITTER_TEST_FAULT("can't set splitter FAPL\n");
}
if (H5Pget_driver(fapl_id) != H5FD_SPLITTER) {
SPLITTER_TEST_FAULT("set FAPL not SPLITTER\n");
}
- /* Create instance of file on disk.
- * Will be copied verbatim as needed, to avoid issues where differences in
- * the creation time would befoul comparisons.
- */
- if (splitter_create_single_file_at(filename_tmp, child_fapl_id, &data) < 0) {
- SPLITTER_TEST_FAULT("can't write W/O file\n");
- }
-
- /*
- * H5Fopen() with RDWR access.
+ /* H5Fopen() with RDWR access.
* Neither file exist already
* Should fail.
*/
@@ -2985,18 +2940,17 @@ splitter_tentative_open_test(hid_t child_fapl_id)
if (file_exists(filename_rw, child_fapl_id)) {
SPLITTER_TEST_FAULT("R/W file unexpectedly created\n");
}
- if (file_exists(vfd_config->wo_path, child_fapl_id)) {
+ if (file_exists(vfd_config.wo_path, child_fapl_id)) {
SPLITTER_TEST_FAULT("W/O file unexpectedly created\n");
}
- /*
- * H5Fopen() with RDWR access.
- * Only W/O file present.
+ /* H5Fopen() with RDWR access.
+ * W/O file exists already.
* Should fail.
*/
- if (h5_duplicate_file_by_bytes(filename_tmp, vfd_config->wo_path) < 0) {
- SPLITTER_TEST_FAULT("Can't create W/O file copy.\n");
+ if (splitter_create_single_file_at(vfd_config.wo_path, child_fapl_id, &data) < 0) {
+ SPLITTER_TEST_FAULT("can't write W/O file\n");
}
H5E_BEGIN_TRY {
file_id = H5Fopen(filename_rw, H5F_ACC_RDWR, fapl_id);
@@ -3007,22 +2961,21 @@ splitter_tentative_open_test(hid_t child_fapl_id)
if (file_exists(filename_rw, child_fapl_id)) {
SPLITTER_TEST_FAULT("R/W file unexpectedly created\n");
}
- if (!file_exists(vfd_config->wo_path, child_fapl_id)) {
+ if (!file_exists(vfd_config.wo_path, child_fapl_id)) {
SPLITTER_TEST_FAULT("W/O file mysteriously disappeared\n");
}
- HDremove(vfd_config->wo_path);
- if (file_exists(vfd_config->wo_path, child_fapl_id)) {
+ HDremove(vfd_config.wo_path);
+ if (file_exists(vfd_config.wo_path, child_fapl_id)) {
SPLITTER_TEST_FAULT("failed to remove W/O file\n");
}
- /*
- * H5Fopen() with RDWR access.
- * Only R/W file present.
+ /* H5Fopen() with RDWR access.
+ * R/W file exists already.
* Should fail.
*/
- if (h5_duplicate_file_by_bytes(filename_tmp, filename_rw) < 0) {
- SPLITTER_TEST_FAULT("Can't create R/W file copy.\n");
+ if (splitter_create_single_file_at(filename_rw, child_fapl_id, &data) < 0) {
+ SPLITTER_TEST_FAULT("can't write R/W file\n");
}
H5E_BEGIN_TRY {
file_id = H5Fopen(filename_rw, H5F_ACC_RDWR, fapl_id);
@@ -3033,17 +2986,16 @@ splitter_tentative_open_test(hid_t child_fapl_id)
if (!file_exists(filename_rw, child_fapl_id)) {
SPLITTER_TEST_FAULT("R/W file mysteriously disappeared\n");
}
- if (file_exists(vfd_config->wo_path, child_fapl_id)) {
+ if (file_exists(vfd_config.wo_path, child_fapl_id)) {
SPLITTER_TEST_FAULT("W/O file unexpectedly created\n");
}
- /*
- * H5Fopen() with RDWR access.
- * Both files present.
+ /* H5Fopen() with RDWR access.
+ * Both files already exist.
*/
- if (h5_duplicate_file_by_bytes(filename_tmp, vfd_config->wo_path) < 0) {
- SPLITTER_TEST_FAULT("Can't create W/O file copy.\n");
+ if (splitter_create_single_file_at(vfd_config.wo_path, child_fapl_id, &data) < 0) {
+ SPLITTER_TEST_FAULT("can't write W/O file\n");
}
file_id = H5Fopen(filename_rw, H5F_ACC_RDWR, fapl_id);
if (file_id == H5I_INVALID_HID) {
@@ -3056,13 +3008,15 @@ splitter_tentative_open_test(hid_t child_fapl_id)
if (!file_exists(filename_rw, child_fapl_id)) {
SPLITTER_TEST_FAULT("R/W file mysteriously disappared\n");
}
- if (!file_exists(vfd_config->wo_path, child_fapl_id)) {
+ if (!file_exists(vfd_config.wo_path, child_fapl_id)) {
SPLITTER_TEST_FAULT("W/O file mysteriously disappeared\n");
}
+ if (h5_compare_file_bytes(filename_rw, vfd_config.wo_path) < 0) {
+ SPLITTER_TEST_FAULT("files are not byte-for-byte equivalent\n");
+ }
- /*
- * H5Fcreate() with TRUNC access.
- * Both files present.
+ /* H5Fcreate() with TRUNC access.
+ * Both files already exist.
*/
file_id = H5Fcreate(filename_rw, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
@@ -3076,24 +3030,24 @@ splitter_tentative_open_test(hid_t child_fapl_id)
if (!file_exists(filename_rw, child_fapl_id)) {
SPLITTER_TEST_FAULT("R/W file mysteriously disappared\n");
}
- if (!file_exists(vfd_config->wo_path, child_fapl_id)) {
+ if (!file_exists(vfd_config.wo_path, child_fapl_id)) {
SPLITTER_TEST_FAULT("W/O file mysteriously disappeared\n");
}
- if (h5_compare_file_bytes(filename_rw, vfd_config->wo_path) < 0) {
+ if (h5_compare_file_bytes(filename_rw, vfd_config.wo_path) < 0) {
SPLITTER_TEST_FAULT("files are not byte-for-byte equivalent\n");
}
- HDremove(filename_rw);
- HDremove(vfd_config->wo_path);
- /*
- * H5Fcreate() with TRUNC access.
+ /* H5Fcreate() with TRUNC access.
* R/W already exists.
*/
- if (h5_duplicate_file_by_bytes(filename_tmp, filename_rw) < 0) {
- SPLITTER_TEST_FAULT("Can't create R/W file copy.\n");
+ HDremove(filename_rw);
+ HDremove(vfd_config.wo_path);
+ if (splitter_create_single_file_at(filename_rw, child_fapl_id, &data) < 0) {
+ SPLITTER_TEST_FAULT("can't write R/W file\n");
}
- if (file_exists(vfd_config->wo_path, child_fapl_id)) {
+
+ if (file_exists(vfd_config.wo_path, child_fapl_id)) {
SPLITTER_TEST_FAULT("failed to remove W/O file\n");
}
file_id = H5Fcreate(filename_rw, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
@@ -3107,23 +3061,23 @@ splitter_tentative_open_test(hid_t child_fapl_id)
if (!file_exists(filename_rw, child_fapl_id)) {
SPLITTER_TEST_FAULT("R/W file mysteriously disappared\n");
}
- if (!file_exists(vfd_config->wo_path, child_fapl_id)) {
+ if (!file_exists(vfd_config.wo_path, child_fapl_id)) {
SPLITTER_TEST_FAULT("W/O file mysteriously disappeared\n");
}
- if (h5_compare_file_bytes(filename_rw, vfd_config->wo_path) < 0) {
+ if (h5_compare_file_bytes(filename_rw, vfd_config.wo_path) < 0) {
SPLITTER_TEST_FAULT("files are not byte-for-byte equivalent\n");
}
- HDremove(filename_rw);
- HDremove(vfd_config->wo_path);
- /*
- * H5Fcreate() with TRUNC access.
- * Only W/O present.
+ /* H5Fcreate() with TRUNC access.
+ * W/O already exists.
*/
- if (h5_duplicate_file_by_bytes(filename_tmp, vfd_config->wo_path) < 0) {
- SPLITTER_TEST_FAULT("Can't create W/O file copy.\n");
+ HDremove(filename_rw);
+ HDremove(vfd_config.wo_path);
+ if (splitter_create_single_file_at(vfd_config.wo_path, child_fapl_id, &data) < 0) {
+ SPLITTER_TEST_FAULT("can't write W/O file\n");
}
+
if (file_exists(filename_rw, child_fapl_id)) {
SPLITTER_TEST_FAULT("failed to remove R/W file\n");
}
@@ -3138,19 +3092,16 @@ splitter_tentative_open_test(hid_t child_fapl_id)
if (!file_exists(filename_rw, child_fapl_id)) {
SPLITTER_TEST_FAULT("R/W file mysteriously disappared\n");
}
- if (!file_exists(vfd_config->wo_path, child_fapl_id)) {
+ if (!file_exists(vfd_config.wo_path, child_fapl_id)) {
SPLITTER_TEST_FAULT("W/O file mysteriously disappeared\n");
}
- if (h5_compare_file_bytes(filename_rw, vfd_config->wo_path) < 0) {
+ if (h5_compare_file_bytes(filename_rw, vfd_config.wo_path) < 0) {
SPLITTER_TEST_FAULT("files are not byte-for-byte equivalent\n");
}
- HDremove(filename_rw);
- HDremove(vfd_config->wo_path);
/* H5Fcreate with both files absent is tested elsewhere */
- /*
- * Cleanup
+ /* Cleanup
*/
if (H5Pclose(fapl_id) < 0) {
@@ -3160,14 +3111,10 @@ splitter_tentative_open_test(hid_t child_fapl_id)
done:
if (ret_value < 0) {
H5E_BEGIN_TRY {
- H5Pclose(fapl_id);
- H5Fclose(file_id);
+ (void)H5Pclose(fapl_id);
+ (void)H5Fclose(file_id);
} H5E_END_TRY;
- }
-
- HDfree(vfd_config);
- HDfree(filename_rw);
-
+ } /* end if error */
return ret_value;
} /* end splitter_tentative_open_test() */
@@ -3204,7 +3151,7 @@ file_exists(const char *filename, hid_t fapl_id)
error:
H5E_BEGIN_TRY {
- H5Fclose(file_id);
+ (void)H5Fclose(file_id);
} H5E_END_TRY;
return ret_value;
} /* end file_exists() */
@@ -3278,6 +3225,7 @@ test_splitter(void)
TEST_ERROR;
}
+
/* Test file creation, utilizing different child FAPLs (default vs.
* specified), logfile, and Write Channel error ignoring behavior.
*/
@@ -3303,6 +3251,7 @@ test_splitter(void)
/* TODO: SWMR open? */
/* Concurrent opens with both drivers using the Splitter */
+
if (H5Pclose(child_fapl_id) == FAIL) {
TEST_ERROR;
}
@@ -3311,9 +3260,9 @@ test_splitter(void)
return 0;
error:
- if (child_fapl_id != H5I_INVALID_HID)
- H5Pclose(child_fapl_id);
-
+ if (child_fapl_id != H5I_INVALID_HID) {
+ (void)H5Pclose(child_fapl_id);
+ }
return -1;
} /* end test_splitter() */
@@ -4088,10 +4037,26 @@ test_subfiling(void)
hid_t driver_id = -1; /* ID for this VFD */
unsigned long driver_flags = 0; /* VFD feature flags */
char filename[1024]; /* filename */
- void *os_file_handle = NULL; /* OS file handle */
- hsize_t file_size; /* file size */
H5FD_subfiling_fapl_t fa_in = {H5FD_CURR_SUBFILING_FAPL_T_VERSION};
H5FD_subfiling_fapl_t fa_out;
+ int require_mpi_finalize = 0;
+
+
+#if defined(HAVE_SERIAL_SUBFILING)
+ void *os_file_handle = NULL; /* OS file handle */
+ hsize_t file_size; /* file size */
+#else
+ int mpi_enabled; /* if MPI_Init has been called */
+ if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) {
+ if (!mpi_enabled) {
+ int mpi_provides, require = MPI_THREAD_MULTIPLE;
+ if ((MPI_Init_thread(NULL, NULL, require, &mpi_provides)) != MPI_SUCCESS) {
+ TEST_ERROR;
+ }
+ require_mpi_finalize++;
+ }
+ }
+#endif
TESTING("subfiling file driver");
@@ -4124,7 +4089,6 @@ test_subfiling(void)
if(!(driver_flags & H5FD_FEAT_AGGREGATE_SMALLDATA)) TEST_ERROR
if(!(driver_flags & H5FD_FEAT_POSIX_COMPAT_HANDLE)) TEST_ERROR
if(!(driver_flags & H5FD_FEAT_SUPPORTS_SWMR_IO)) TEST_ERROR
- if(!(driver_flags & H5FD_FEAT_DEFAULT_VFD_COMPATIBLE)) TEST_ERROR
/* Check for extra flags not accounted for above */
if(driver_flags != (H5FD_FEAT_AGGREGATE_METADATA
@@ -4132,8 +4096,8 @@ test_subfiling(void)
| H5FD_FEAT_DATA_SIEVE
| H5FD_FEAT_AGGREGATE_SMALLDATA
| H5FD_FEAT_POSIX_COMPAT_HANDLE
- | H5FD_FEAT_SUPPORTS_SWMR_IO
- | H5FD_FEAT_DEFAULT_VFD_COMPATIBLE))
+ | H5FD_FEAT_SUPPORTS_SWMR_IO))
+
TEST_ERROR
if((fid = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id)) < 0)
@@ -4158,6 +4122,7 @@ test_subfiling(void)
if(H5Pclose(fapl_id_out) < 0)
TEST_ERROR;
+#if defined(HAVE_SERIAL_SUBFILING)
/* Check that we can get an operating-system-specific handle from
* the library.
*
@@ -4180,6 +4145,7 @@ test_subfiling(void)
if(file_size < 1 * KB || file_size > 4 * KB)
FAIL_PUTS_ERROR("suspicious file size obtained from H5Fget_filesize");
+#endif
/* Close and delete the file */
if(H5Fclose(fid) < 0)
@@ -4191,6 +4157,9 @@ test_subfiling(void)
if(H5Pclose(fapl_id) < 0)
TEST_ERROR;
+ if (require_mpi_finalize)
+ MPI_Finalize();
+
PASSED();
return 0;
@@ -4213,7 +4182,8 @@ error:
*
* Purpose: Tests the basic features of Virtual File Drivers
*
- * Return: EXIT_SUCCESS/EXIT_FAILURE
+ * Return: Success: 0
+ * Failure: 1
*
*-------------------------------------------------------------------------
*/
@@ -4249,11 +4219,11 @@ main(void)
if(nerrors) {
HDprintf("***** %d Virtual File Driver TEST%s FAILED! *****\n",
nerrors, nerrors > 1 ? "S" : "");
- return EXIT_FAILURE;
- }
+ return 1;
+ } /* end if */
HDprintf("All Virtual File Driver tests passed.\n");
- return EXIT_SUCCESS;
+ return 0;
} /* end main() */
diff --git a/testpar/CMakeLists.txt b/testpar/CMakeLists.txt
index c95e01f..67dcd00 100644
--- a/testpar/CMakeLists.txt
+++ b/testpar/CMakeLists.txt
@@ -74,9 +74,10 @@ set (H5P_TESTS
t_init_term
t_shapesame
t_filters_parallel
- t_2Gio
t_subfile_openclose
t_subfile_readwrite
+ t_subfile_bench
+# t_subfile_bench_hdf
)
foreach (h5_testp ${H5P_TESTS})
diff --git a/testpar/t_bigio.c b/testpar/t_bigio.c
index f86852a..26ee15f 100644
--- a/testpar/t_bigio.c
+++ b/testpar/t_bigio.c
@@ -48,6 +48,9 @@ static int mpi_size_g, mpi_rank_g;
hsize_t space_dim1 = SPACE_DIM1 * 256; // 4096
hsize_t space_dim2 = SPACE_DIM2;
+extern void
+set_verbose_flag(int subfile_rank, int new_value);
+
static void coll_chunktest(const char* filename, int chunk_factor, int select_factor,
int api_option, int file_selection, int mem_selection, int mode);
@@ -494,7 +497,6 @@ dataset_big_write(void)
size_t num_points;
B_DATATYPE * wdata;
-
/* allocate memory for data buffer */
wdata = (B_DATATYPE *)HDmalloc(bigcount*sizeof(B_DATATYPE));
VRFY_G((wdata != NULL), "wdata malloc succeeded");
@@ -516,7 +518,7 @@ dataset_big_write(void)
/* Each process takes a slabs of rows. */
if (mpi_rank_g == 0)
HDprintf("\nTesting Dataset1 write by ROW\n");
- /* Create a large dataset */
+ /* Create a large dataset - global dims as follows:: */
dims[0] = bigcount;
dims[1] = (hsize_t)mpi_size_g;
@@ -528,6 +530,7 @@ dataset_big_write(void)
block[0] = dims[0]/(hsize_t)mpi_size_g;
block[1] = dims[1];
+ printf("[%d] block[0] = %lld block[1] = %lld\n", mpi_rank_g, block[0], block[1]);
stride[0] = block[0];
stride[1] = block[1];
count[0] = 1;
@@ -776,6 +779,7 @@ dataset_big_write(void)
VRFY_G((ret >= 0), "H5Dclose1 succeeded");
HDfree(wdata);
+
H5Fclose(fid);
}
@@ -1922,6 +1926,8 @@ do_express_test(int world_mpi_rank)
int main(int argc, char **argv)
{
int ExpressMode = 0;
+ int mpi_provides, require = MPI_THREAD_MULTIPLE;
+
hsize_t newsize = 1048576;
/* Set the bigio processing limit to be 'newsize' bytes */
hsize_t oldsize = H5_mpi_set_bigio_count(newsize);
@@ -1934,8 +1940,10 @@ int main(int argc, char **argv)
if (newsize != oldsize) {
bigcount = newsize * 2;
}
-
- MPI_Init(&argc, &argv);
+ if ( (MPI_Init_thread(&argc, &argv, require, &mpi_provides)) != MPI_SUCCESS) {
+ HDfprintf(stderr, "FATAL: Unable to initialize MPI\n");
+ HDexit(EXIT_FAILURE);
+ }
MPI_Comm_size(MPI_COMM_WORLD,&mpi_size_g);
MPI_Comm_rank(MPI_COMM_WORLD,&mpi_rank_g);
@@ -1945,7 +1953,7 @@ int main(int argc, char **argv)
* calls. By then, MPI calls may not work.
*/
if (H5dont_atexit() < 0){
- HDprintf("Failed to turn off atexit processing. Continue.\n");
+ HDprintf("Failed to turn off atexit processing. Continue.\n");
};
/* set alarm. */
@@ -1953,6 +1961,8 @@ int main(int argc, char **argv)
ExpressMode = do_express_test(mpi_rank_g);
+ set_verbose_flag(0, 1);
+
dataset_big_write();
MPI_Barrier(MPI_COMM_WORLD);
@@ -1976,9 +1986,10 @@ int main(int argc, char **argv)
/* turn off alarm */
ALARM_OFF;
+#if 0
if (mpi_rank_g == 0)
HDremove(FILENAME[0]);
-
+#endif
/* close HDF5 library */
H5close();
diff --git a/testpar/t_subfile_openclose.c b/testpar/t_subfile_openclose.c
index fe39f2c..8ccf3c7 100644
--- a/testpar/t_subfile_openclose.c
+++ b/testpar/t_subfile_openclose.c
@@ -1,6 +1,11 @@
#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
#include "hdf5.h"
-#include "H5FDsubfile_public.h"
#include "mpi.h"
@@ -11,7 +16,9 @@ main(int argc, char **argv)
int i, mpi_size, mpi_rank;
int loop_count = 20;
int mpi_provides, require = MPI_THREAD_MULTIPLE;
- hid_t subfile_id = -1;
+ hid_t subfile_id = 1;
+ const char *h5_filename = "unused.h5";
+ FILE *h5file;
MPI_Init_thread(&argc, &argv, require, &mpi_provides);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
@@ -25,21 +32,22 @@ main(int argc, char **argv)
}
H5open();
-
- if (H5FDsubfiling_init() == SUCCEED) {
- subfile_id = get_subfiling_context();
- printf("[%d] subfile_id = %lx\n", mpi_rank, subfile_id);
- }
- else if (mpi_rank == 0) {
- puts("Error: Unable to initialize subfiling!");
- }
-
+
+ h5file = fopen(h5_filename, "w+");
for(i=0; i < loop_count; i++) {
- sf_open_subfiles(subfile_id, NULL, O_CREAT|O_TRUNC|O_RDWR);
+ if (mpi_rank == 0) {
+ printf("loop_count(%d)\n", i);
+ fflush(stdout);
+ }
+ sf_open_subfiles(subfile_id, h5_filename, NULL, O_CREAT|O_TRUNC|O_RDWR);
sf_close_subfiles(subfile_id);
}
- H5FDsubfiling_finalize();
+ if (h5file) {
+ fclose(h5file);
+ if (mpi_rank == 0)
+ unlink(h5_filename);
+ }
MPI_Barrier(MPI_COMM_WORLD);
diff --git a/testpar/t_subfile_readwrite.c b/testpar/t_subfile_readwrite.c
index b4c798a..34f3281 100644
--- a/testpar/t_subfile_readwrite.c
+++ b/testpar/t_subfile_readwrite.c
@@ -1,6 +1,12 @@
#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+
#include "hdf5.h"
-#include "H5FDsubfile_public.h"
#include "mpi.h"
@@ -13,46 +19,59 @@
int mpi_size = -1;
int mpi_rank = -1;
-static int test_subfile_op(int op_type, hid_t subfile_id, int64_t offset, int64_t local_elements, void *local_data)
+static int test_subfile_op(int op_type, hid_t subfile_id, char *prefix, int64_t offset, int64_t local_elements, void *local_data, int reporter)
{
int i, flags = O_RDWR;
int errors = 0;
int loop_count = 20;
- int64_t local_data_size = local_elements * sizeof(int);
+ int64_t local_data_size = local_elements * (int64_t)sizeof(int);
int64_t total_data_size = 0;
+ const char *h5_filename = "unused.h5";
+ FILE *h5file;
int (*subfile_ftn)(hid_t context_id,int64_t offset, int64_t elements, int dtype_extent, void *data) = sf_read_independent;
double m_startTime, m_endTime;
- double this_time, max_time, min_time, total_time, avg_time;
- double bw;
+ double this_time, avg_time, max_time = 0.0, min_time = 0.0, total_time = 0.0;
+ double bw = 0.0;
const char *OPERATION = "READ";
+
if (op_type == WRITE_OP) {
flags = O_CREAT|O_TRUNC|O_RDWR;
- subfile_ftn = sf_write_independent;
+ subfile_ftn = (int (*)(long int, long int, long int, int, void *))sf_write_independent;
OPERATION = "WRITE";
}
+ h5file = fopen(h5_filename, "w+");
+
for(i=0; i < loop_count; i++) {
- m_startTime = MPI_Wtime();
- if (sf_open_subfiles(subfile_id, NULL, flags) < 0) {
+ // if (mpi_rank == 0) set_verbose_flag(0, 1);
+
+ if (sf_open_subfiles(subfile_id, h5_filename, prefix, flags) < 0) {
puts("sf_open_subfiles returned an error!");
errors++;
goto done;
}
+
+ m_startTime = MPI_Wtime();
+
if (subfile_ftn(subfile_id, offset, local_elements, sizeof(int), local_data) < 0) {
puts("subfile_ftn returned an error!");
errors++;
goto done;
}
+ m_endTime = MPI_Wtime();
+
if (sf_close_subfiles(subfile_id) < 0) {
puts("sf_close_subfiles returned an error!");
errors++;
goto done;
}
- m_endTime = MPI_Wtime();
this_time = m_endTime - m_startTime;
+
+ // if (mpi_rank == 0) set_verbose_flag(0, 0);
+
if (i == 0) {
min_time = this_time;
max_time = this_time;
@@ -65,11 +84,17 @@ static int test_subfile_op(int op_type, hid_t subfile_id, int64_t offset, int64_
}
total_time += this_time;
}
+ if (h5file) {
+ fclose(h5file);
+ if (mpi_rank == 0)
+ unlink(h5_filename);
+ }
+
total_data_size = local_data_size * mpi_size;
avg_time = total_time / (double) loop_count;
bw = ((double)total_data_size)/ avg_time / (1024.0 * 1024.0);
- if (mpi_rank == 0) {
+ if (mpi_rank == reporter) {
printf("%s Perf: %lf BW/[MBs] %ld Bytes AvgTime[sec] %lf\n", OPERATION, bw, total_data_size, avg_time);
fflush(stdout);
}
@@ -82,16 +107,14 @@ int
main(int argc, char **argv)
{
int errors = 0;
- int mpi_provides, require = MPI_THREAD_MULTIPLE;
- hid_t subfile_id = -1;
- double m_startTime, m_endTime;
- double this_time, max_time, min_time, total_time, avg_time;
- double bw;
+ int proc, mpi_provides, require = MPI_THREAD_MULTIPLE;
+ hid_t subfile_id = 1;
int64_t local_elements = DATA_SIZE;
int64_t local_data_size = 0;
int64_t offset = 0;
int *local_data = NULL;
int *verify_data = NULL;
+ char *prefix = NULL;
MPI_Init_thread(&argc, &argv, require, &mpi_provides);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
@@ -103,14 +126,17 @@ main(int argc, char **argv)
local_elements = check_value;
}
}
+ if (argc > 2) {
+ prefix = strdup(argv[2]);
+ }
H5open();
- local_data_size = local_elements * sizeof(int);
+ local_data_size = local_elements * (int64_t)sizeof(int);
local_data = (int *)malloc((size_t)local_data_size);
if (local_data) {
- int k, base = local_elements * mpi_rank;
- offset = local_data_size * mpi_rank;
+ int k, base = (int)local_elements * mpi_rank;
+ offset = local_data_size * (int64_t)mpi_rank;
for (k=0; k < local_elements; k++) {
local_data[k] = k + base;
}
@@ -128,30 +154,21 @@ main(int argc, char **argv)
goto done;
}
- if (H5FDsubfiling_init() == SUCCEED) {
- subfile_id = get_subfiling_context();
- printf("[%d] subfile_id = %lx\n", mpi_rank, subfile_id);
- }
- else if (mpi_rank == 0) {
- puts("Error: Unable to initialize subfiling!");
- errors++;
- goto done;
- }
-
- if (test_subfile_op( WRITE_OP, subfile_id, offset, local_elements, local_data)) {
- puts("Subfile writing test returned an error!");
- errors++;
- goto done;
- }
- if (test_subfile_op( READ_OP, subfile_id, offset, local_elements, verify_data)) {
- puts("Subfile reading test returned an error!");
- errors++;
- goto done;
+ for(proc=0; proc < 10; proc++) {
+ if (test_subfile_op( WRITE_OP, subfile_id, prefix, offset, local_elements, local_data, proc)) {
+ puts("Subfile writing test returned an error!");
+ errors++;
+ goto done;
+ }
+ if (test_subfile_op( READ_OP, subfile_id, prefix, offset, local_elements, verify_data, proc)) {
+ puts("Subfile reading test returned an error!");
+ errors++;
+ goto done;
+ }
}
done:
- H5FDsubfiling_finalize();
MPI_Barrier(MPI_COMM_WORLD);
if (local_data) {
diff --git a/tools/lib/h5diff.c b/tools/lib/h5diff.c
index 8324714..87a3b11 100644
--- a/tools/lib/h5diff.c
+++ b/tools/lib/h5diff.c
@@ -986,7 +986,7 @@ h5diff(const char *fname1, const char *fname2, const char *objname1, const char
H5TOOLS_DEBUG("groups traversed - errstat:%d", opts->err_stat);
#ifdef H5_HAVE_PARALLEL
- if(g_Parallel && !g_CollectInfoOnly) {
+ if(g_Parallel) {
int i;
if((HDstrlen(fname1) > MAX_FILENAME) || (HDstrlen(fname2) > MAX_FILENAME)) {
@@ -1001,11 +1001,6 @@ h5diff(const char *fname1, const char *fname2, const char *objname1, const char
for(i = 1; i < g_nTasks; i++)
MPI_Send(filenames, (MAX_FILENAME * 2), MPI_CHAR, i, MPI_TAG_PARALLEL, MPI_COMM_WORLD);
} /* end if */
- else if (g_CollectInfoOnly) {
- build_match_list (obj1fullname, info1_lp, obj2fullname, info2_lp, &match_list, opts);
-
- }
-
#endif
H5TOOLS_DEBUG("build_match_list next - errstat:%d", opts->err_stat);
diff --git a/tools/lib/h5tools_utils.c b/tools/lib/h5tools_utils.c
index 63b3041..6167dd9 100644
--- a/tools/lib/h5tools_utils.c
+++ b/tools/lib/h5tools_utils.c
@@ -48,7 +48,6 @@ hsize_t H5TOOLS_BUFSIZE = ( 32 * 1024 * 1024); /* 32 MB */
/* ``parallel_print'' variables */
unsigned char g_Parallel = 0; /*0 for serial, 1 for parallel */
-unsigned char g_CollectInfoOnly = 0;
char outBuff[OUTBUFF_SIZE];
unsigned outBuffOffset;
FILE* overflow_file = NULL;
diff --git a/tools/lib/h5tools_utils.h b/tools/lib/h5tools_utils.h
index 2cd2eae..07069cc 100644
--- a/tools/lib/h5tools_utils.h
+++ b/tools/lib/h5tools_utils.h
@@ -32,7 +32,6 @@ extern "C" {
H5TOOLS_DLLVAR int g_nTasks;
H5TOOLS_DLLVAR unsigned char g_Parallel;
-H5TOOLS_DLLVAR unsigned char g_CollectInfoOnly;
H5TOOLS_DLLVAR char outBuff[];
H5TOOLS_DLLVAR unsigned outBuffOffset;
H5TOOLS_DLLVAR FILE *overflow_file;
diff --git a/tools/lib/h5trav.c b/tools/lib/h5trav.c
index a9b5b75..dc7e27d 100644
--- a/tools/lib/h5trav.c
+++ b/tools/lib/h5trav.c
@@ -15,9 +15,6 @@
#include "h5trav.h"
#include "h5tools.h"
#include "H5private.h"
-#ifdef H5_HAVE_PARALLEL
-#include "h5tools_utils.h"
-#endif
/*-------------------------------------------------------------------------
* local typedefs
@@ -182,10 +179,8 @@ static herr_t
traverse_cb(hid_t loc_id, const char *path, const H5L_info2_t *linfo,
void *_udata)
{
- herr_t ret_value = SUCCEED;
trav_ud_traverse_t *udata = (trav_ud_traverse_t *)_udata; /* User data */
char *new_name = NULL;
-
const char *full_name;
const char *already_visited = NULL; /* Whether the link/object was already visited */
@@ -206,18 +201,6 @@ traverse_cb(hid_t loc_id, const char *path, const H5L_info2_t *linfo,
else
full_name = path;
-#ifdef H5_HAVE_PARALLEL
- if(linfo->type == H5L_TYPE_EXTERNAL) {
- h5tool_link_info_t lnk_info;
- if ((ret_value = H5tools_get_symlink_info(loc_id, path, &lnk_info, FALSE)) < 0) {
- puts("H5tools_get_symlink_info failed!");
- }
- else if (ret_value == 0) {
- puts("Dangling link?");
- }
- printf("Visiting external link: %s\n", path);
- }
-#endif
/* Perform the correct action for different types of links */
if(linfo->type == H5L_TYPE_HARD) {
H5O_info2_t oinfo;