From 5dd85abb4124f337eea52ef33d0c7e3fd67ed92d Mon Sep 17 00:00:00 2001 From: Richard Warren Date: Fri, 25 Sep 2020 07:45:15 -0400 Subject: Create a new branch for the September Subfiling deliverable --- bin/trace | 3 + src/CMakeLists.txt | 2 + src/H5FDfamily.c | 514 +++--- src/H5FDsplitter.c | 4 +- src/H5FDsubfile.c | 475 ++--- src/H5FDsubfile.h | 0 src/H5FDsubfile_mpi.c | 3841 ++++++++++++++++++++++++++++------------- src/H5FDsubfile_private.h | 299 ++-- src/H5FDsubfile_public.h | 4 - src/H5FDsubfile_threads.c | 392 ++++- src/H5FDsubfiling.c | 992 +++++++++-- src/H5FDsubfiling.h | 20 +- src/Makefile.am | 2 +- test/CMakeLists.txt | 1 + test/Makefile.am | 6 +- test/tselect.c | 6 +- test/vfd.c | 364 ++-- testpar/CMakeLists.txt | 3 +- testpar/t_bigio.c | 23 +- testpar/t_subfile_openclose.c | 34 +- testpar/t_subfile_readwrite.c | 93 +- tools/lib/h5diff.c | 7 +- tools/lib/h5tools_utils.c | 1 - tools/lib/h5tools_utils.h | 1 - tools/lib/h5trav.c | 17 - 25 files changed, 4816 insertions(+), 2288 deletions(-) delete mode 100644 src/H5FDsubfile.h diff --git a/bin/trace b/bin/trace index fd0248e..ab84153 100755 --- a/bin/trace +++ b/bin/trace @@ -67,6 +67,7 @@ $Source = ""; "hid_t" => "i", "int" => "Is", "int32_t" => "Is", + "int64_t" => "IL", "unsigned" => "Iu", "unsigned int" => "Iu", "uint32_t" => "Iu", @@ -161,6 +162,7 @@ $Source = ""; "H5FD_hdfs_fapl_t" => "x", "H5FD_file_image_callbacks_t" => "x", "H5FD_mirror_fapl_t" => "x", + "H5FD_subfiling_fapl_t" => "x", "H5G_iterate_t" => "x", "H5G_info_t" => "x", "H5I_free_t" => "x", @@ -201,6 +203,7 @@ $Source = ""; "H5VL_request_notify_t" => "x", "H5Z_func_t" => "x", "H5Z_filter_func_t" => "x", + "sf_ioc_selection_t" => "x", "va_list" => "x", "size_t" => "z", "H5Z_SO_scale_type_t" => "Za", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dc97db3..1fe0bce 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -243,6 +243,7 @@ set (H5FD_SOURCES ${HDF5_SRC_DIR}/H5FDstdio.c ${HDF5_SRC_DIR}/H5FDtest.c ${HDF5_SRC_DIR}/H5FDwindows.c + ${HDF5_SRC_DIR}/H5FDsubfiling.c ${HDF5_SRC_DIR}/H5FDsubfile.c ${HDF5_SRC_DIR}/H5FDsubfile_threads.c ${HDF5_SRC_DIR}/H5FDsubfile_mpi.c @@ -265,6 +266,7 @@ set (H5FD_HDRS ${HDF5_SRC_DIR}/H5FDsplitter.h ${HDF5_SRC_DIR}/H5FDstdio.h ${HDF5_SRC_DIR}/H5FDwindows.h + ${HDF5_SRC_DIR}/H5FDsubfiling.h ${HDF5_SRC_DIR}/H5FDsubfile_public.h ${HDF5_SRC_DIR}/mercury/mercury_thread.h ${HDF5_SRC_DIR}/mercury/mercury_thread_mutex.h diff --git a/src/H5FDfamily.c b/src/H5FDfamily.c index 2537d86..e7e2e8b 100644 --- a/src/H5FDfamily.c +++ b/src/H5FDfamily.c @@ -12,38 +12,38 @@ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ /* - * Programmer: Robb Matzke - * Monday, November 10, 1997 - * - * Purpose: Implements a family of files that acts as a single hdf5 - * file. The purpose is to be able to split a huge file on a - * 64-bit platform, transfer all the <2GB members to a 32-bit - * platform, and then access the entire huge file on the 32-bit - * platform. - * - * All family members are logically the same size although their - * physical sizes may vary. The logical member size is - * determined by looking at the physical size of the first member - * when the file is opened. When creating a file family, the - * first member is created with a predefined physical size - * (actually, this happens when the file family is flushed, and - * can be quite time consuming on file systems that don't - * implement holes, like nfs). 
+ * Programmer: Robb Matzke + * Monday, November 10, 1997 + * + * Purpose: Implements a family of files that acts as a single hdf5 + * file. The purpose is to be able to split a huge file on a + * 64-bit platform, transfer all the <2GB members to a 32-bit + * platform, and then access the entire huge file on the 32-bit + * platform. + * + * All family members are logically the same size although their + * physical sizes may vary. The logical member size is + * determined by looking at the physical size of the first member + * when the file is opened. When creating a file family, the + * first member is created with a predefined physical size + * (actually, this happens when the file family is flushed, and + * can be quite time consuming on file systems that don't + * implement holes, like nfs). * */ #include "H5FDdrvr_module.h" /* This source code file is part of the H5FD driver module */ -#include "H5private.h" /* Generic Functions */ -#include "H5CXprivate.h" /* API Contexts */ -#include "H5Eprivate.h" /* Error handling */ -#include "H5Fprivate.h" /* File access */ -#include "H5FDprivate.h" /* File drivers */ -#include "H5FDfamily.h" /* Family file driver */ -#include "H5Iprivate.h" /* IDs */ -#include "H5MMprivate.h" /* Memory management */ -#include "H5Pprivate.h" /* Property lists */ +#include "H5private.h" /* Generic Functions */ +#include "H5CXprivate.h" /* API Contexts */ +#include "H5Eprivate.h" /* Error handling */ +#include "H5Fprivate.h" /* File access */ +#include "H5FDprivate.h" /* File drivers */ +#include "H5FDfamily.h" /* Family file driver */ +#include "H5Iprivate.h" /* IDs */ +#include "H5MMprivate.h" /* Memory management */ +#include "H5Pprivate.h" /* Property lists */ /* The size of the member name buffers */ #define H5FD_FAM_MEMB_NAME_BUF_SIZE 4096 @@ -53,29 +53,29 @@ static hid_t H5FD_FAMILY_g = 0; /* The description of a file belonging to this driver. */ typedef struct H5FD_family_t { - H5FD_t pub; /*public stuff, must be first */ - hid_t memb_fapl_id; /*file access property list for members */ - hsize_t memb_size; /*actual size of each member file */ - hsize_t pmem_size; /*member size passed in from property */ - unsigned nmembs; /*number of family members */ - unsigned amembs; /*number of member slots allocated */ - H5FD_t **memb; /*dynamic array of member pointers */ - haddr_t eoa; /*end of allocated addresses */ - char *name; /*name generator printf format */ - unsigned flags; /*flags for opening additional members */ + H5FD_t pub; /* public stuff, must be first */ + hid_t memb_fapl_id; /* file access property list for members */ + hsize_t memb_size; /* actual size of each member file */ + hsize_t pmem_size; /* member size passed in from property */ + unsigned nmembs; /* number of family members */ + unsigned amembs; /* number of member slots allocated */ + H5FD_t **memb; /* dynamic array of member pointers */ + haddr_t eoa; /* end of allocated addresses */ + char *name; /* name generator printf format */ + unsigned flags; /* flags for opening additional members */ /* Information from properties set by 'h5repart' tool */ - hsize_t mem_newsize; /*new member size passed in as private - * property. It's used only by h5repart */ - hbool_t repart_members; /* Whether to mark the superblock dirty - * when it is loaded, so that the family - * member sizes can be re-encoded */ + hsize_t mem_newsize; /* new member size passed in as private + * property. 
It's used only by h5repart */ + hbool_t repart_members; /* Whether to mark the superblock dirty + * when it is loaded, so that the family + * member sizes can be re-encoded */ } H5FD_family_t; /* Driver-specific file access properties */ typedef struct H5FD_family_fapl_t { - hsize_t memb_size; /*size of each member */ - hid_t memb_fapl_id; /*file access property list of each memb*/ + hsize_t memb_size; /*size of each member */ + hid_t memb_fapl_id; /*file access property list of each memb*/ } H5FD_family_fapl_t; /* Callback prototypes */ @@ -85,11 +85,11 @@ static void *H5FD__family_fapl_copy(const void *_old_fa); static herr_t H5FD__family_fapl_free(void *_fa); static hsize_t H5FD__family_sb_size(H5FD_t *_file); static herr_t H5FD__family_sb_encode(H5FD_t *_file, char *name/*out*/, - unsigned char *buf/*out*/); + unsigned char *buf/*out*/); static herr_t H5FD__family_sb_decode(H5FD_t *_file, const char *name, const unsigned char *buf); static H5FD_t *H5FD__family_open(const char *name, unsigned flags, - hid_t fapl_id, haddr_t maxaddr); + hid_t fapl_id, haddr_t maxaddr); static herr_t H5FD__family_close(H5FD_t *_file); static int H5FD__family_cmp(const H5FD_t *_f1, const H5FD_t *_f2); static herr_t H5FD__family_query(const H5FD_t *_f1, unsigned long *flags); @@ -98,9 +98,9 @@ static herr_t H5FD__family_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t eoa); static haddr_t H5FD__family_get_eof(const H5FD_t *_file, H5FD_mem_t type); static herr_t H5FD__family_get_handle(H5FD_t *_file, hid_t fapl, void** file_handle); static herr_t H5FD__family_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, - size_t size, void *_buf/*out*/); + size_t size, void *_buf/*out*/); static herr_t H5FD__family_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, - size_t size, const void *_buf); + size_t size, const void *_buf); static herr_t H5FD__family_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); static herr_t H5FD__family_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); static herr_t H5FD__family_lock(H5FD_t *_file, hbool_t rw); @@ -108,40 +108,40 @@ static herr_t H5FD__family_unlock(H5FD_t *_file); /* The class struct */ static const H5FD_class_t H5FD_family_g = { - "family", /* name */ - HADDR_MAX, /* maxaddr */ - H5F_CLOSE_WEAK, /* fc_degree */ - H5FD__family_term, /* terminate */ - H5FD__family_sb_size, /* sb_size */ - H5FD__family_sb_encode, /* sb_encode */ - H5FD__family_sb_decode, /* sb_decode */ - sizeof(H5FD_family_fapl_t), /* fapl_size */ - H5FD__family_fapl_get, /* fapl_get */ - H5FD__family_fapl_copy, /* fapl_copy */ - H5FD__family_fapl_free, /* fapl_free */ - 0, /* dxpl_size */ - NULL, /* dxpl_copy */ - NULL, /* dxpl_free */ - H5FD__family_open, /* open */ - H5FD__family_close, /* close */ - H5FD__family_cmp, /* cmp */ - H5FD__family_query, /* query */ - NULL, /* get_type_map */ - NULL, /* alloc */ - NULL, /* free */ - H5FD__family_get_eoa, /* get_eoa */ - H5FD__family_set_eoa, /* set_eoa */ - H5FD__family_get_eof, /* get_eof */ - H5FD__family_get_handle, /* get_handle */ - H5FD__family_read, /* read */ - H5FD__family_write, /* write */ - H5FD__family_flush, /* flush */ - NULL, /* read_vector */ - NULL, /* write_vector */ - H5FD__family_truncate, /* truncate */ - H5FD__family_lock, /* lock */ - H5FD__family_unlock, /* unlock */ - H5FD_FLMAP_DICHOTOMY /* fl_map */ + "family", /* name */ + HADDR_MAX, /* maxaddr */ + H5F_CLOSE_WEAK, /* fc_degree */ + H5FD__family_term, /* terminate */ + H5FD__family_sb_size, /* sb_size */ + H5FD__family_sb_encode, /* 
sb_encode */ + H5FD__family_sb_decode, /* sb_decode */ + sizeof(H5FD_family_fapl_t), /* fapl_size */ + H5FD__family_fapl_get, /* fapl_get */ + H5FD__family_fapl_copy, /* fapl_copy */ + H5FD__family_fapl_free, /* fapl_free */ + 0, /* dxpl_size */ + NULL, /* dxpl_copy */ + NULL, /* dxpl_free */ + H5FD__family_open, /* open */ + H5FD__family_close, /* close */ + H5FD__family_cmp, /* cmp */ + H5FD__family_query, /* query */ + NULL, /* get_type_map */ + NULL, /* alloc */ + NULL, /* free */ + H5FD__family_get_eoa, /* get_eoa */ + H5FD__family_set_eoa, /* set_eoa */ + H5FD__family_get_eof, /* get_eof */ + H5FD__family_get_handle, /* get_handle */ + H5FD__family_read, /* read */ + H5FD__family_write, /* write */ + NULL, /* read_vector */ + NULL, /* write_vector */ + H5FD__family_flush, /* flush */ + H5FD__family_truncate, /* truncate */ + H5FD__family_lock, /* lock */ + H5FD__family_unlock, /* unlock */ + H5FD_FLMAP_DICHOTOMY /* fl_map */ }; @@ -205,9 +205,9 @@ done: /*--------------------------------------------------------------------------- - * Function: H5FD__family_term + * Function: H5FD__family_term * - * Purpose: Shut down the VFD + * Purpose: Shut down the VFD * * Returns: Non-negative on success or negative on failure * @@ -229,19 +229,19 @@ H5FD__family_term(void) /*------------------------------------------------------------------------- - * Function: H5Pset_fapl_family + * Function: H5Pset_fapl_family * - * Purpose: Sets the file access property list FAPL_ID to use the family - * driver. The MEMB_SIZE is the size in bytes of each file - * member (used only when creating a new file) and the - * MEMB_FAPL_ID is a file access property list to be used for - * each family member. + * Purpose: Sets the file access property list FAPL_ID to use the family + * driver. The MEMB_SIZE is the size in bytes of each file + * member (used only when creating a new file) and the + * MEMB_FAPL_ID is a file access property list to be used for + * each family member. * - * Return: Success: Non-negative + * Return: Success: Non-negative * - * Failure: Negative + * Failure: Negative * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- @@ -250,7 +250,7 @@ herr_t H5Pset_fapl_family(hid_t fapl_id, hsize_t msize, hid_t memb_fapl_id) { herr_t ret_value; - H5FD_family_fapl_t fa={0, -1}; + H5FD_family_fapl_t fa={0, -1}; H5P_genplist_t *plist; /* Property list pointer */ FUNC_ENTER_API(FAIL) @@ -279,16 +279,16 @@ done: /*------------------------------------------------------------------------- - * Function: H5Pget_fapl_family + * Function: H5Pget_fapl_family * - * Purpose: Returns information about the family file access property - * list though the function arguments. + * Purpose: Returns information about the family file access property + * list though the function arguments. 
* - * Return: Success: Non-negative + * Return: Success: Non-negative * - * Failure: Negative + * Failure: Negative * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- @@ -297,7 +297,7 @@ herr_t H5Pget_fapl_family(hid_t fapl_id, hsize_t *msize/*out*/, hid_t *memb_fapl_id/*out*/) { H5P_genplist_t *plist; /* Property list pointer */ - const H5FD_family_fapl_t *fa; + const H5FD_family_fapl_t *fa; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_API(FAIL) @@ -323,16 +323,16 @@ done: /*------------------------------------------------------------------------- - * Function: H5FD__family_fapl_get + * Function: H5FD__family_fapl_get * - * Purpose: Gets a file access property list which could be used to - * create an identical file. + * Purpose: Gets a file access property list which could be used to + * create an identical file. * - * Return: Success: Ptr to new file access property list. + * Return: Success: Ptr to new file access property list. * - * Failure: NULL + * Failure: NULL * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Friday, August 13, 1999 * *------------------------------------------------------------------------- @@ -340,8 +340,8 @@ done: static void * H5FD__family_fapl_get(H5FD_t *_file) { - H5FD_family_t *file = (H5FD_family_t*)_file; - H5FD_family_fapl_t *fa = NULL; + H5FD_family_t *file = (H5FD_family_t*)_file; + H5FD_family_fapl_t *fa = NULL; H5P_genplist_t *plist; /* Property list pointer */ void *ret_value = NULL; /* Return value */ @@ -368,15 +368,15 @@ done: /*------------------------------------------------------------------------- - * Function: H5FD__family_fapl_copy + * Function: H5FD__family_fapl_copy * - * Purpose: Copies the family-specific file access properties. + * Purpose: Copies the family-specific file access properties. * - * Return: Success: Ptr to a new property list + * Return: Success: Ptr to a new property list * - * Failure: NULL + * Failure: NULL * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- @@ -421,15 +421,15 @@ done: /*------------------------------------------------------------------------- - * Function: H5FD__family_fapl_free + * Function: H5FD__family_fapl_free * - * Purpose: Frees the family-specific file access properties. + * Purpose: Frees the family-specific file access properties. * - * Return: Success: 0 + * Return: Success: 0 * - * Failure: -1 + * Failure: -1 * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- @@ -437,7 +437,7 @@ done: static herr_t H5FD__family_fapl_free(void *_fa) { - H5FD_family_fapl_t *fa = (H5FD_family_fapl_t*)_fa; + H5FD_family_fapl_t *fa = (H5FD_family_fapl_t*)_fa; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_STATIC @@ -452,16 +452,16 @@ done: /*------------------------------------------------------------------------- - * Function: H5FD__family_sb_size + * Function: H5FD__family_sb_size * - * Purpose: Returns the size of the private information to be stored in - * the superblock. + * Purpose: Returns the size of the private information to be stored in + * the superblock. * - * Return: Success: The super block driver data size. + * Return: Success: The super block driver data size. 
* - * Failure: never fails + * Failure: never fails * - * Programmer: Raymond Lu + * Programmer: Raymond Lu * Tuesday, May 10, 2005 * *------------------------------------------------------------------------- @@ -478,19 +478,19 @@ H5FD__family_sb_size(H5FD_t H5_ATTR_UNUSED *_file) /*------------------------------------------------------------------------- - * Function: H5FD__family_sb_encode + * Function: H5FD__family_sb_encode * - * Purpose: Encode driver information for the superblock. The NAME - * argument is a nine-byte buffer which will be initialized with - * an eight-character name/version number and null termination. + * Purpose: Encode driver information for the superblock. The NAME + * argument is a nine-byte buffer which will be initialized with + * an eight-character name/version number and null termination. * - * The encoding is the member file size and name template. + * The encoding is the member file size and name template. * - * Return: Success: 0 + * Return: Success: 0 * - * Failure: -1 + * Failure: -1 * - * Programmer: Raymond Lu + * Programmer: Raymond Lu * Tuesday, May 10, 2005 * *------------------------------------------------------------------------- @@ -498,7 +498,7 @@ H5FD__family_sb_size(H5FD_t H5_ATTR_UNUSED *_file) static herr_t H5FD__family_sb_encode(H5FD_t *_file, char *name/*out*/, unsigned char *buf/*out*/) { - H5FD_family_t *file = (H5FD_family_t*)_file; + H5FD_family_t *file = (H5FD_family_t*)_file; FUNC_ENTER_STATIC_NOERR @@ -521,19 +521,19 @@ H5FD__family_sb_encode(H5FD_t *_file, char *name/*out*/, unsigned char *buf/*out /*------------------------------------------------------------------------- - * Function: H5FD__family_sb_decode + * Function: H5FD__family_sb_decode * - * Purpose: This function has 2 separate purpose. One is to decodes the + * Purpose: This function has 2 separate purpose. One is to decodes the * superblock information for this driver. The NAME argument is * the eight-character (plus null termination) name stored in i * the file. The FILE argument is updated according to the * information in the superblock. * - * Return: Success: 0 + * Return: Success: 0 * - * Failure: -1 + * Failure: -1 * - * Programmer: Raymond Lu + * Programmer: Raymond Lu * Tuesday, May 10, 2005 * *------------------------------------------------------------------------- @@ -541,7 +541,7 @@ H5FD__family_sb_encode(H5FD_t *_file, char *name/*out*/, unsigned char *buf/*out static herr_t H5FD__family_sb_decode(H5FD_t *_file, const char H5_ATTR_UNUSED *name, const unsigned char *buf) { - H5FD_family_t *file = (H5FD_family_t*)_file; + H5FD_family_t *file = (H5FD_family_t*)_file; uint64_t msize; herr_t ret_value = SUCCEED; /* Return value */ @@ -576,17 +576,17 @@ done: /*------------------------------------------------------------------------- - * Function: H5FD__family_open + * Function: H5FD__family_open * - * Purpose: Creates and/or opens a family of files as an HDF5 file. + * Purpose: Creates and/or opens a family of files as an HDF5 file. * - * Return: Success: A pointer to a new file dat structure. The - * public fields will be initialized by the - * caller, which is always H5FD_open(). + * Return: Success: A pointer to a new file dat structure. The + * public fields will be initialized by the + * caller, which is always H5FD_open(). 
* - * Failure: NULL + * Failure: NULL * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- @@ -600,13 +600,13 @@ done: H5_GCC_DIAG_OFF(format-nonliteral) static H5FD_t * H5FD__family_open(const char *name, unsigned flags, hid_t fapl_id, - haddr_t maxaddr) + haddr_t maxaddr) { - H5FD_family_t *file = NULL; - char *memb_name = NULL, *temp = NULL; - hsize_t eof = HADDR_UNDEF; - unsigned t_flags = flags & ~H5F_ACC_CREAT; - H5FD_t *ret_value = NULL; + H5FD_family_t *file = NULL; + char *memb_name = NULL, *temp = NULL; + hsize_t eof = HADDR_UNDEF; + unsigned t_flags = flags & ~H5F_ACC_CREAT; + H5FD_t *ret_value = NULL; FUNC_ENTER_STATIC @@ -752,17 +752,17 @@ H5_GCC_DIAG_ON(format-nonliteral) /*------------------------------------------------------------------------- - * Function: H5FD__family_close + * Function: H5FD__family_close * - * Purpose: Closes a family of files. + * Purpose: Closes a family of files. * - * Return: Success: Non-negative + * Return: Success: Non-negative * - * Failure: Negative with as many members closed as - * possible. The only subsequent operation - * permitted on the file is a close operation. + * Failure: Negative with as many members closed as + * possible. The only subsequent operation + * permitted on the file is a close operation. * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- @@ -771,8 +771,8 @@ static herr_t H5FD__family_close(H5FD_t *_file) { H5FD_family_t *file = (H5FD_family_t*)_file; - unsigned nerrors = 0; /* Number of errors while closing member files */ - unsigned u; /* Local index variable */ + unsigned nerrors = 0; /* Number of errors while closing member files */ + unsigned u; /* Local index variable */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_STATIC @@ -804,17 +804,17 @@ H5FD__family_close(H5FD_t *_file) /*------------------------------------------------------------------------- - * Function: H5FD__family_cmp + * Function: H5FD__family_cmp * - * Purpose: Compares two file families to see if they are the same. It - * does this by comparing the first member of the two families. + * Purpose: Compares two file families to see if they are the same. It + * does this by comparing the first member of the two families. * - * Return: Success: like strcmp() + * Return: Success: like strcmp() * - * Failure: never fails (arguments were checked by the - * caller). + * Failure: never fails (arguments were checked by the + * caller). * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- @@ -822,8 +822,8 @@ H5FD__family_close(H5FD_t *_file) static int H5FD__family_cmp(const H5FD_t *_f1, const H5FD_t *_f2) { - const H5FD_family_t *f1 = (const H5FD_family_t*)_f1; - const H5FD_family_t *f2 = (const H5FD_family_t*)_f2; + const H5FD_family_t *f1 = (const H5FD_family_t*)_f1; + const H5FD_family_t *f2 = (const H5FD_family_t*)_f2; int ret_value = 0; FUNC_ENTER_STATIC_NOERR @@ -838,15 +838,15 @@ H5FD__family_cmp(const H5FD_t *_f1, const H5FD_t *_f2) /*------------------------------------------------------------------------- - * Function: H5FD__family_query + * Function: H5FD__family_query * - * Purpose: Set the flags that this VFL driver is capable of supporting. 
+ * Purpose: Set the flags that this VFL driver is capable of supporting. * (listed in H5FDpublic.h) * - * Return: Success: non-negative - * Failure: negative + * Return: Success: non-negative + * Failure: negative * - * Programmer: Quincey Koziol + * Programmer: Quincey Koziol * Friday, August 25, 2000 * *------------------------------------------------------------------------- @@ -854,7 +854,7 @@ H5FD__family_cmp(const H5FD_t *_f1, const H5FD_t *_f2) static herr_t H5FD__family_query(const H5FD_t * _file, unsigned long *flags /* out */) { - const H5FD_family_t *file = (const H5FD_family_t*)_file; /* Family VFD info */ + const H5FD_family_t *file = (const H5FD_family_t*)_file; /* Family VFD info */ FUNC_ENTER_STATIC_NOERR @@ -876,17 +876,17 @@ H5FD__family_query(const H5FD_t * _file, unsigned long *flags /* out */) /*------------------------------------------------------------------------- - * Function: H5FD__family_get_eoa + * Function: H5FD__family_get_eoa * - * Purpose: Returns the end-of-address marker for the file. The EOA - * marker is the first address past the last byte allocated in - * the format address space. + * Purpose: Returns the end-of-address marker for the file. The EOA + * marker is the first address past the last byte allocated in + * the format address space. * - * Return: Success: The end-of-address-marker + * Return: Success: The end-of-address-marker * - * Failure: HADDR_UNDEF + * Failure: HADDR_UNDEF * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- @@ -894,7 +894,7 @@ H5FD__family_query(const H5FD_t * _file, unsigned long *flags /* out */) static haddr_t H5FD__family_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type) { - const H5FD_family_t *file = (const H5FD_family_t*)_file; + const H5FD_family_t *file = (const H5FD_family_t*)_file; FUNC_ENTER_STATIC_NOERR @@ -903,15 +903,15 @@ H5FD__family_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type) /*------------------------------------------------------------------------- - * Function: H5FD__family_set_eoa + * Function: H5FD__family_set_eoa * - * Purpose: Set the end-of-address marker for the file. + * Purpose: Set the end-of-address marker for the file. * - * Return: Success: 0 + * Return: Success: 0 * - * Failure: -1 + * Failure: -1 * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- @@ -926,10 +926,10 @@ H5_GCC_DIAG_OFF(format-nonliteral) static herr_t H5FD__family_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t abs_eoa) { - H5FD_family_t *file = (H5FD_family_t*)_file; - haddr_t addr = abs_eoa; - char *memb_name = NULL; - unsigned u; /* Local index variable */ + H5FD_family_t *file = (H5FD_family_t*)_file; + haddr_t addr = abs_eoa; + char *memb_name = NULL; + unsigned u; /* Local index variable */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_STATIC @@ -993,18 +993,18 @@ H5_GCC_DIAG_ON(format-nonliteral) /*------------------------------------------------------------------------- - * Function: H5FD__family_get_eof + * Function: H5FD__family_get_eof * - * Purpose: Returns the end-of-file marker, which is the greater of - * either the total family size or the current EOA marker. + * Purpose: Returns the end-of-file marker, which is the greater of + * either the total family size or the current EOA marker. 
* - * Return: Success: End of file address, the first address past - * the end of the family of files or the current - * EOA, whichever is larger. + * Return: Success: End of file address, the first address past + * the end of the family of files or the current + * EOA, whichever is larger. * - * Failure: HADDR_UNDEF + * Failure: HADDR_UNDEF * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- @@ -1012,9 +1012,9 @@ H5_GCC_DIAG_ON(format-nonliteral) static haddr_t H5FD__family_get_eof(const H5FD_t *_file, H5FD_mem_t type) { - const H5FD_family_t *file = (const H5FD_family_t*)_file; - haddr_t eof=0; - int i; /* Local index variable */ + const H5FD_family_t *file = (const H5FD_family_t*)_file; + haddr_t eof=0; + int i; /* Local index variable */ haddr_t ret_value = HADDR_UNDEF; /* Return value */ FUNC_ENTER_STATIC_NOERR @@ -1089,32 +1089,32 @@ done: /*------------------------------------------------------------------------- - * Function: H5FD__family_read + * Function: H5FD__family_read * - * Purpose: Reads SIZE bytes of data from FILE beginning at address ADDR - * into buffer BUF according to data transfer properties in - * DXPL_ID. + * Purpose: Reads SIZE bytes of data from FILE beginning at address ADDR + * into buffer BUF according to data transfer properties in + * DXPL_ID. * - * Return: Success: Zero. Result is stored in caller-supplied - * buffer BUF. + * Return: Success: Zero. Result is stored in caller-supplied + * buffer BUF. * - * Failure: -1, contents of buffer BUF are undefined. + * Failure: -1, contents of buffer BUF are undefined. * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- */ static herr_t H5FD__family_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, - void *_buf/*out*/) + void *_buf/*out*/) { - H5FD_family_t *file = (H5FD_family_t*)_file; - unsigned char *buf = (unsigned char*)_buf; - haddr_t sub; - size_t req; + H5FD_family_t *file = (H5FD_family_t*)_file; + unsigned char *buf = (unsigned char*)_buf; + haddr_t sub; + size_t req; hsize_t tempreq; - unsigned u; /* Local index variable */ + unsigned u; /* Local index variable */ H5P_genplist_t *plist; /* Property list pointer */ herr_t ret_value=SUCCEED; /* Return value */ @@ -1133,12 +1133,12 @@ H5FD__family_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, s sub = addr % file->memb_size; - /* This check is for mainly for IA32 architecture whose size_t's size - * is 4 bytes, to prevent overflow when user application is trying to - * write files bigger than 4GB. */ + /* This check is for mainly for IA32 architecture whose size_t's size + * is 4 bytes, to prevent overflow when user application is trying to + * write files bigger than 4GB. */ tempreq = file->memb_size-sub; - if(tempreq > SIZET_MAX) - tempreq = SIZET_MAX; + if(tempreq > SIZET_MAX) + tempreq = SIZET_MAX; req = MIN(size, (size_t)tempreq); HDassert(unmembs); @@ -1157,31 +1157,31 @@ done: /*------------------------------------------------------------------------- - * Function: H5FD__family_write + * Function: H5FD__family_write * - * Purpose: Writes SIZE bytes of data to FILE beginning at address ADDR - * from buffer BUF according to data transfer properties in - * DXPL_ID. 
+ * Purpose: Writes SIZE bytes of data to FILE beginning at address ADDR + * from buffer BUF according to data transfer properties in + * DXPL_ID. * - * Return: Success: Zero + * Return: Success: Zero * - * Failure: -1 + * Failure: -1 * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- */ static herr_t H5FD__family_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, - const void *_buf) + const void *_buf) { - H5FD_family_t *file = (H5FD_family_t*)_file; - const unsigned char *buf = (const unsigned char*)_buf; - haddr_t sub; - size_t req; + H5FD_family_t *file = (H5FD_family_t*)_file; + const unsigned char *buf = (const unsigned char*)_buf; + haddr_t sub; + size_t req; hsize_t tempreq; - unsigned u; /* Local index variable */ + unsigned u; /* Local index variable */ H5P_genplist_t *plist; /* Property list pointer */ herr_t ret_value = SUCCEED; /* Return value */ @@ -1204,8 +1204,8 @@ H5FD__family_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, * is 4 bytes, to prevent overflow when user application is trying to * write files bigger than 4GB. */ tempreq = file->memb_size-sub; - if(tempreq > SIZET_MAX) - tempreq = SIZET_MAX; + if(tempreq > SIZET_MAX) + tempreq = SIZET_MAX; req = MIN(size, (size_t)tempreq); HDassert(unmembs); @@ -1224,14 +1224,14 @@ done: /*------------------------------------------------------------------------- - * Function: H5FD__family_flush + * Function: H5FD__family_flush * - * Purpose: Flushes all family members. + * Purpose: Flushes all family members. * - * Return: Success: 0 - * Failure: -1, as many files flushed as possible. + * Return: Success: 0 + * Failure: -1, as many files flushed as possible. * - * Programmer: Robb Matzke + * Programmer: Robb Matzke * Wednesday, August 4, 1999 * *------------------------------------------------------------------------- @@ -1239,8 +1239,8 @@ done: static herr_t H5FD__family_flush(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t closing) { - H5FD_family_t *file = (H5FD_family_t*)_file; - unsigned u, nerrors = 0; + H5FD_family_t *file = (H5FD_family_t*)_file; + unsigned u, nerrors = 0; herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_STATIC @@ -1258,15 +1258,15 @@ done: /*------------------------------------------------------------------------- - * Function: H5FD__family_truncate + * Function: H5FD__family_truncate * - * Purpose: Truncates all family members. + * Purpose: Truncates all family members. * - * Return: Success: 0 + * Return: Success: 0 * - * Failure: -1, as many files truncated as possible. + * Failure: -1, as many files truncated as possible. 
* - * Programmer: Quincey Koziol + * Programmer: Quincey Koziol * Saturday, February 23, 2008 * *------------------------------------------------------------------------- @@ -1274,9 +1274,9 @@ done: static herr_t H5FD__family_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t closing) { - H5FD_family_t *file = (H5FD_family_t*)_file; - unsigned u, nerrors = 0; - herr_t ret_value = SUCCEED; /* Return value */ + H5FD_family_t *file = (H5FD_family_t*)_file; + unsigned u, nerrors = 0; + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_STATIC @@ -1311,7 +1311,7 @@ H5FD__family_lock(H5FD_t *_file, hbool_t rw) { H5FD_family_t *file = (H5FD_family_t *)_file; /* VFD file struct */ unsigned u; /* Local index variable */ - herr_t ret_value = SUCCEED; /* Return value */ + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_STATIC @@ -1354,8 +1354,8 @@ done: static herr_t H5FD__family_unlock(H5FD_t *_file) { - H5FD_family_t *file = (H5FD_family_t *)_file; /* VFD file struct */ - unsigned u; /* Local index variable */ + H5FD_family_t *file = (H5FD_family_t *)_file; /* VFD file struct */ + unsigned u; /* Local index variable */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_STATIC diff --git a/src/H5FDsplitter.c b/src/H5FDsplitter.c index 7bf0de8..eac2dbe 100644 --- a/src/H5FDsplitter.c +++ b/src/H5FDsplitter.c @@ -156,8 +156,8 @@ static const H5FD_class_t H5FD_splitter_g = { H5FD__splitter_get_handle, /* get_handle */ H5FD__splitter_read, /* read */ H5FD__splitter_write, /* write */ - NULL, /* read_vector */ - NULL, /* write_vector */ + NULL, /* read_vector */ + NULL, /* write_vector */ H5FD__splitter_flush, /* flush */ H5FD__splitter_truncate, /* truncate */ H5FD__splitter_lock, /* lock */ diff --git a/src/H5FDsubfile.c b/src/H5FDsubfile.c index a467533..2b3d44b 100644 --- a/src/H5FDsubfile.c +++ b/src/H5FDsubfile.c @@ -1,273 +1,334 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * Copyright by the Board of Trustees of the University of Illinois. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -#include "H5FDsubfile_public.h" +/* + * Programmer: Richard Warren + * Wednesday, July 1, 2020 + * + * Purpose: This is part of a parallel subfiling I/O driver. 
+ * + */ -#ifdef H5_HAVE_PARALLEL +#include "H5FDsubfile_public.h" /***********/ /* Headers */ /***********/ -#include "H5private.h" /* Generic Functions */ -#include "H5CXprivate.h" /* API Contexts */ -#include "H5Dprivate.h" /* Datasets */ -#include "H5Eprivate.h" /* Error handling */ -#include "H5Ipublic.h" /* IDs */ -#include "H5Iprivate.h" /* IDs */ -#include "H5MMprivate.h" /* Memory management */ -#include "H5Pprivate.h" /* Property lists */ - -/* +#include "H5CXprivate.h" /* API Contexts */ +#include "H5Dprivate.h" /* Datasets */ +#include "H5Eprivate.h" /* Error handling */ +#include "H5Iprivate.h" /* IDs */ +#include "H5Ipublic.h" /* IDs */ +#include "H5MMprivate.h" /* Memory management */ +#include "H5Pprivate.h" /* Property lists */ +#include "H5private.h" /* Generic Functions */ + +/* ========================================= Private functions -======================================== +========================================= */ -static size_t sf_topology_limit = 4; -static size_t sf_topology_entries = 0; -static sf_topology_t **sf_topology_cache = NULL; +/* Modifiable via environment variable */ +static sf_ioc_selection_t sf_default_ioc_selection = SELECT_IOC_ONE_PER_NODE; -static size_t sf_context_limit = 4; -static size_t sf_context_entries = 0; -static subfiling_context_t **sf_context_cache = NULL; -static hid_t context_id = H5I_INVALID_HID; -static hid_t topology_id = H5I_INVALID_HID; +/* +----------------------------------------------------------------------------------- +sf_topology_limit -- How many different topologies can be recorded (default = +4) sf_topology_entries -- The number of topologies that are currently recorded. +sf_topology_cache -- Storage for the known topologies +----------------------------------------------------------------------------------- +*/ +static size_t sf_topology_limit = 4; +static sf_topology_t *sf_topology_cache = NULL; +/* +-------------------------------------------------------------------------- +sf_context_limit -- How many contexts can be recorded (default = 4) +sf_context_entries -- The number of contexts that are currently recorded. +sf_context_cache -- Storage for contexts +-------------------------------------------------------------------------- +*/ +static size_t sf_context_limit = 16; +static subfiling_context_t *sf_context_cache = NULL; -static int64_t record_subfiling_object(SF_OBJ_TYPE type, void *obj) +/* +------------------------------------------------------------------------- + Programmer: Richard Warren + Purpose: Return a pointer to the requested storage object. + There are only 2 object types: TOPOLOGY or CONTEXT + structures. An object_id contains the object type + in upper 32 bits and an index value in the lower 32 bits. + Storage for an object is allocated as required. + + Topologies are static, i.e. for any one IO Concentrator + allocation strategy, the results should always be the + same. + FIXME: The one exception to this being the 1 IOC per + N MPI ranks. The value of N can be changed on a per-file + basis, so we need address that at some point. + + Contexts are 1 per open file. If only one file is open + at a time, then we will only use a single context cache + entry. + Errors: returns NULL if input SF_OBJ_TYPE is unrecognized or + a memory allocation error. 
+ + Revision History -- Initial implementation +------------------------------------------------------------------------- +*/ +void * +get_subfiling_object(int64_t object_id) { - size_t index; - int64_t obj_reference; - uint64_t tag; - switch(type) { - case SF_TOPOLOGY: { - if (sf_topology_cache == NULL) { - sf_topology_cache = (sf_topology_t **) - calloc(sf_topology_limit, sizeof(sf_topology_t *)); - } - assert(sf_topology_cache != NULL); - index = sf_topology_entries++; - tag = SF_TOPOLOGY; - obj_reference = (int64_t)((tag << 32) | index); - sf_topology_cache[index] = obj; - return obj_reference; - break; - } - case SF_CONTEXT: { - if (sf_context_cache == NULL) { - sf_context_cache = (subfiling_context_t **) - calloc(sf_context_limit, sizeof(subfiling_context_t *)); - } - assert(sf_context_cache != NULL); - index = sf_context_entries++; - tag = SF_CONTEXT; - obj_reference = (int64_t)((tag << 32) | index); - sf_context_cache[index] = (subfiling_context_t *)obj; - return obj_reference; - break; - } - default: - puts("UNKNOWN Subfiling object type"); - } - - return -1; + int obj_type = (int) ((object_id >> 32) & 0x0FFFF); + /* We don't require a large indexing space + * 16 bits should be enough.. + */ + size_t index = (object_id & 0x0FFFF); + if (obj_type == SF_TOPOLOGY) { + if (sf_topology_cache == NULL) { + sf_topology_cache = (sf_topology_t *) calloc( + sf_topology_limit, sizeof(sf_topology_t)); + assert(sf_topology_cache != NULL); + } + if (index < sf_topology_limit) { + return (void *) &sf_topology_cache[index]; + } else { + puts("Illegal toplogy object index"); + } + } else if (obj_type == SF_CONTEXT) { + if (sf_context_cache == NULL) { + sf_context_cache = (subfiling_context_t *) calloc( + sf_context_limit, sizeof(subfiling_context_t)); + assert(sf_context_cache != NULL); + } + if (index == sf_context_limit) { + sf_context_limit *= 2; + sf_context_cache = (subfiling_context_t *) realloc(sf_context_cache, + sf_context_limit * sizeof(subfiling_context_t)); + assert(sf_context_cache != NULL); + } else { + return (void *) &sf_context_cache[index]; + } + } else { + printf( + "get_subfiling_object: UNKNOWN Subfiling object type id = 0x%lx\n", + object_id); + } + return NULL; } -/* -========================================= +/* +====================================================== Public vars (for subfiling) and functions -======================================== +We probably need a function to set and clear this +====================================================== */ - int sf_verbose_flag = 0; /* -========================================= +====================================================== File functions -========================================= The pread and pwrite posix functions are described as -being thread safe. We include mutex locks and unlocks -to work around any potential threading conflicts... -Those however, are compiled according #ifdef +being thread safe. 
+====================================================== */ - -int sf_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank) +int +sf_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, + int subfile_rank) { - int ret = 0; + int ret = 0; ssize_t bytes_read; - ssize_t bytes_remaining = (ssize_t)data_size; - char *this_buffer = data_buffer; + ssize_t bytes_remaining = (ssize_t) data_size; + char * this_buffer = data_buffer; + + while (bytes_remaining) { + if ((bytes_read = (ssize_t) pread( + fd, this_buffer, (size_t) bytes_remaining, file_offset)) < 0) { - while(bytes_remaining) { - if ((bytes_read = (ssize_t)pread(fd, this_buffer, (size_t)bytes_remaining, file_offset)) < 0) { perror("pread failed!"); + printf("[ioc(%d) %s] pread(fd, buf, bytes_remaining=%ld, " + "file_offset =%ld)\n", + subfile_rank, __func__, bytes_remaining, file_offset); fflush(stdout); - } - else if (bytes_read > 0) { - if (sf_verbose_flag) { - printf("[ioc(%d) %s] read %ld bytes of %ld requested\n", - subfile_rank, __func__, - bytes_read, bytes_remaining); - } + return -1; + } else if (bytes_read > 0) { bytes_remaining -= bytes_read; this_buffer += bytes_read; file_offset += bytes_read; - } - else { - printf("[ioc(%d) %s] ERROR! read of 0 bytes == eof!\n", subfile_rank, __func__ ); + } else { + printf("[ioc(%d) %s] ERROR! read of 0 bytes == eof!\n", + subfile_rank, __func__); fflush(stdout); - break; + return -2; } } return ret; } -int sf_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank) +int +sf_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, + int subfile_rank) { - int ret = 0; - char *this_data = (char *)data_buffer; + int ret = 0; + char * this_data = (char *) data_buffer; ssize_t bytes_remaining = (ssize_t) data_size; ssize_t written = 0; - while(bytes_remaining) { - if ((written = pwrite(fd, this_data, (size_t)bytes_remaining, file_offset)) < 0) { + while (bytes_remaining) { + if ((written = pwrite( + fd, this_data, (size_t) bytes_remaining, file_offset)) < 0) { perror("pwrite failed!"); + printf("[ioc(%d) %s] pwrite(fd, data, bytes_remaining=%ld, " + "file_offset =%ld)\n", + subfile_rank, __func__, bytes_remaining, file_offset); fflush(stdout); - break; - } - else { - if (sf_verbose_flag) { - printf("[ioc(%d) %s] wrote %ld bytes of %ld requested\n", - subfile_rank, __func__, - written, bytes_remaining); - } + return -1; + } else { bytes_remaining -= written; this_data += written; file_offset += written; } } + /* We don't usually use this for each file write. We usually do the file + * flush as part of file close operation. + */ #ifdef SUBFILE_REQUIRE_FLUSH fdatasync(fd); #endif - return ret; } - - - -void * get_subfiling_object(int64_t object_id) +/* +------------------------------------------------------------------------- + Programmer: Richard Warren + Purpose: Return a character string which represents either the + default selection method: SELECT_IOC_ONE_PER_NODE; or + if the user has selected a method via the environment + variable (H5_IOC_SELECTION_CRITERIA), we return that + along with any optional qualifier with for that method. + + Errors: None. + + Revision History -- Initial implementation +------------------------------------------------------------------------- +*/ +char * +get_ioc_selection_criteria(sf_ioc_selection_t *selection) { - int obj_type = (int)((object_id >> 32) & 0x0FFFF); - /* We don't require a large indexing space - * 16 bits should be enough.. 
- */ - size_t index = (object_id & 0x0FFFF); - if (obj_type == SF_TOPOLOGY) { - if (index < sf_context_entries) { - return (void *)sf_topology_cache[index]; - } - else { - puts("Illegal object index"); - } - } - else if (obj_type == SF_CONTEXT) { - if (index < sf_context_entries) { - return (void *)sf_context_cache[index]; - } - else { - puts("Illegal object index"); - } - } - else { - puts("UNKNOWN Subfiling object type"); - } - return NULL; + char *optValue = NULL; + char *envValue = HDgetenv("H5_IOC_SELECTION_CRITERIA"); + + /* For non-default options, the environment variable + * should have the following form: integer:[integer|string] + * In particular, EveryNthRank == 1:64 or every 64 ranks assign an IOC + * or WithConfig == 2:/ + */ + if (envValue && (optValue = strchr(envValue, ':'))) { + *optValue++ = 0; + } + if (envValue) { + int checkValue = atoi(envValue); + if ((checkValue < 0) || (checkValue >= ioc_selection_options)) { + *selection = sf_default_ioc_selection; + return NULL; + } else { + *selection = (sf_ioc_selection_t) checkValue; + return optValue; + } + } + *selection = sf_default_ioc_selection; + return NULL; } +/* +------------------------------------------------------------------------- + Programmer: Richard Warren + Purpose: Called as part of a file open operation, we initialize a + subfiling context which includes the application topology + along with other relevant info such as the MPI objects + (communicators) for communicating with IO concentrators. + We also identify which MPI ranks will have IOC threads + started on them. + + We return a context ID via the 'sf_context' variable. + + Errors: returns an error if we detect any initialization errors, + including malloc failures or any resource allocation + problems. + + Revision History -- Initial implementation +------------------------------------------------------------------------- +*/ herr_t -H5FDsubfiling_init(void) +H5FDsubfiling_init(sf_ioc_selection_t ioc_select_method, char *ioc_select_option, + int64_t *sf_context) { - herr_t ret_value = SUCCEED; - int ioc_count; - int world_rank, world_size; - sf_topology_t *thisApp = NULL; - subfiling_context_t *newContext = NULL; - - FUNC_ENTER_API(FAIL) - H5TRACE0("e",""); - - if (MPI_Comm_size(MPI_COMM_WORLD, &world_size) != MPI_SUCCESS) { - puts("MPI_Comm_size returned an error"); - ret_value = FAIL; - goto done; - } - if (MPI_Comm_rank(MPI_COMM_WORLD, &world_rank) != MPI_SUCCESS) { - puts("MPI_Comm_rank returned an error"); - ret_value = FAIL; - goto done; - } - if ((ioc_count = H5FD__determine_ioc_count (world_size, world_rank, &thisApp)) > 0) { - topology_id = (hid_t)record_subfiling_object(SF_TOPOLOGY, thisApp); - } - if (topology_id < 0) { - puts("Unable to register subfiling topology!"); - ret_value = FAIL; - goto done; - } - if (H5FD__init_subfile_context(&newContext, ioc_count, world_size, world_rank, thisApp->rank_is_ioc) != SUCCEED) { - puts("Unable to initialize a subfiling context!"); - ret_value = FAIL; - goto done; - } - context_id = (hid_t)record_subfiling_object(SF_CONTEXT, newContext); - if (context_id < 0) { - ret_value = FAIL; - puts("Unable to register subfiling context!"); - } - -done: - FUNC_LEAVE_API(ret_value) + herr_t ret_value = SUCCEED; + int ioc_count; + int world_rank, world_size; + sf_topology_t * thisApp = NULL; + int active_file_maps = active_map_entries(); + int64_t tag = SF_CONTEXT; + int64_t context_id = ((tag << 32) | active_file_maps); + subfiling_context_t *newContext = + (subfiling_context_t *) 
get_subfiling_object(context_id); + + FUNC_ENTER_API(FAIL) + H5TRACE3("e", "x*s*IL", ioc_select_method, ioc_select_option, sf_context); + + if (MPI_Comm_size(MPI_COMM_WORLD, &world_size) != MPI_SUCCESS) { + puts("MPI_Comm_size returned an error"); + ret_value = FAIL; + goto done; + } + if (MPI_Comm_rank(MPI_COMM_WORLD, &world_rank) != MPI_SUCCESS) { + puts("MPI_Comm_rank returned an error"); + ret_value = FAIL; + goto done; + } - return ret_value; -} + if ((ioc_count = H5FD__determine_ioc_count(world_size, world_rank, + ioc_select_method, ioc_select_option, &thisApp)) <= 0) { + puts("Unable to register subfiling topology!"); + ret_value = FAIL; + goto done; + } -herr_t -H5FDsubfiling_finalize(void) -{ - herr_t ret_value = SUCCEED; /* Return value */ - sf_topology_t *thisApp = NULL; - - FUNC_ENTER_API(FAIL) - H5TRACE0("e",""); - - /* Shutdown the IO Concentrator threads */ - - if (topology_id != H5I_INVALID_HID) { - thisApp = get_subfiling_object(topology_id); - } - - if (thisApp && thisApp->rank_is_ioc) { - begin_thread_exclusive(); - sf_shutdown_flag = 1; - end_thread_exclusive(); - - usleep(100); - - wait_for_thread_main(); - } - - MPI_Barrier(MPI_COMM_WORLD); - - delete_subfiling_context(context_id); + newContext->sf_context_id = context_id; + + if (H5FD__init_subfile_context( + thisApp, ioc_count, world_rank, newContext) != SUCCEED) { + puts("Unable to initialize a subfiling context!"); + ret_value = FAIL; + goto done; + } + + if (newContext->topology->rank_is_ioc) { + int status = initialize_ioc_threads(newContext); + if (status) + goto done; + } + + if (context_id < 0) { + ret_value = FAIL; + + goto done; + } + *sf_context = context_id; - FUNC_LEAVE_API(ret_value) done: - return ret_value; -} + FUNC_LEAVE_API(ret_value) -hid_t -get_subfiling_context(void) -{ - return context_id; + return ret_value; } - -#endif /* H5_HAVE_PARALLEL */ diff --git a/src/H5FDsubfile.h b/src/H5FDsubfile.h deleted file mode 100644 index e69de29..0000000 diff --git a/src/H5FDsubfile_mpi.c b/src/H5FDsubfile_mpi.c index fda4928..57add71 100644 --- a/src/H5FDsubfile_mpi.c +++ b/src/H5FDsubfile_mpi.c @@ -1,52 +1,40 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * Copyright by the Board of Trustees of the University of Illinois. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #include "H5FDsubfile_private.h" -static int *io_concentrator = NULL; -static int n_io_concentrators = -1; static int sf_world_rank = -1; static int sf_world_size = -1; -static int subfile_fid = -1; -static int64_t sf_stripe_size = -1; -static int64_t sf_blocksize_per_stripe = 0; - -static MPI_Datatype H5FD__create_f_l_mpi_type(subfiling_context_t *context, - int64_t target_write_bytes, - int64_t first_write, - int64_t last_write, - int ioc_depth); -static MPI_Datatype H5FD__create_first_mpi_type(subfiling_context_t *context, - int64_t offset, - int64_t target_write_bytes, - int64_t first_write, - int ioc_depth); -static MPI_Datatype H5FD__create_final_mpi_type(subfiling_context_t *context, - int64_t target_write_bytes, - int64_t last_write, - int ioc_depth); -static MPI_Datatype H5FD__create_mpi_uniform_type(subfiling_context_t *context, - int64_t offset, - int64_t target_write_bytes, - int ioc_depth); - -static int * request_count_per_rank = NULL; +static int sf_open_file_count = 0; +static int sf_close_file_count = 0; +static int sf_ops_after_first_close = 0; + +static int *request_count_per_rank = NULL; atomic_int sf_workinprogress = 0; atomic_int sf_work_pending = 0; atomic_int sf_file_close_count = 0; atomic_int sf_file_refcount = 0; +atomic_int sf_ioc_fini_refcount = 0; -#ifdef DEBUG_TRACING +#ifndef NDEBUG FILE *sf_logfile = NULL; #endif -MPI_Comm sf_msg_comm = MPI_COMM_NULL; /* Messages IN */ -MPI_Comm sf_data_comm = MPI_COMM_NULL; /* Messages OUT */ - int sf_shutdown_flag = 0; const char *sf_subfile_prefix = "."; - #define MAX_WORK_PER_RANK 2 /* @@ -55,261 +43,148 @@ Private functions ========================================= */ -static int _determine_subfile_rank(int myrank) -{ - if (io_concentrator) { - int i; - for(i=0; i< n_io_concentrators; i++) { - if (io_concentrator[i] == myrank) - return i; - } - } - return -1; -} - -static int is_io_concentrator(int rank) -{ - int index = _determine_subfile_rank(rank); - if (index < 0) return 0; - return 1; /* true */ -} - - - -static void init_io_vars(int64_t stripe_size, int64_t blocksize_per_stripe, - int64_t file_offset, int64_t data_extent, - int64_t *first_io, int64_t *first_io_offset, int64_t *last_io, - int *starting_ioc, int *final_ioc, int *starting_row, int *final_row) -{ - int64_t total_stripe_width = stripe_size * n_io_concentrators; - int64_t starting_offset = file_offset % stripe_size; - int64_t final_offset = (file_offset + data_extent -1); - int64_t last_io_check = (starting_offset + data_extent) % stripe_size; - *starting_row = (int)(file_offset / total_stripe_width); - *final_row = (int)(final_offset / total_stripe_width); - - /* Maybe update how many bytes in the entire IOC collection */ - if (blocksize_per_stripe == 0) - sf_blocksize_per_stripe = total_stripe_width; - - *starting_ioc = (int)((file_offset / stripe_size) % n_io_concentrators); - *final_ioc = (int)((final_offset / stripe_size) % n_io_concentrators); - *first_io_offset = starting_offset; - *first_io = ((stripe_size - starting_offset) >= data_extent ? data_extent : (stripe_size - starting_offset)); - /* Check for just a single IO op */ - if (*first_io == data_extent) *last_io = 0; - else *last_io = (last_io_check > 0 ? 
last_io_check : stripe_size); -} - -static int init__indep_io(subfiling_context_t *sf_context, - int64_t **source_data_offset, int64_t **sf_datasize, - int64_t **sf_offset, MPI_Datatype **sf_dtype, - int64_t offset, int64_t elements, int dtype_extent) -{ - int64_t data_extent = elements * dtype_extent; - int64_t first_io=0, last_io=0, first_io_offset=0; - - int64_t *data_offset = *source_data_offset; - int64_t *ioc_datasize = *sf_datasize; - int64_t *ioc_offset = *sf_offset; - MPI_Datatype *ioc_type = *sf_dtype; - int k, ioc_start, ioc_last, ioc_depth, starting_row, final_row; - sf_stripe_size = sf_context->sf_stripe_size; - sf_blocksize_per_stripe = sf_context->sf_blocksize_per_stripe; - - init_io_vars(sf_stripe_size, sf_blocksize_per_stripe, offset, data_extent, - &first_io, &first_io_offset, &last_io, - &ioc_start, &ioc_last, &starting_row, &final_row); - - if (sf_verbose_flag) { - printf("[%d] offset=%ld,data_extent=%ld,sf_stripe_size=%ld,n_io_concentrators=%d," - "first_io=%ld,first_io_offset=%ld,last_io=%ld,ioc_start=%d,ioc_last=%d\n", - sf_world_rank, offset,data_extent,sf_stripe_size,n_io_concentrators, - first_io,first_io_offset,last_io,ioc_start,ioc_last); - fflush(stdout); - } - - if (data_offset == NULL) { - data_offset = (int64_t *)calloc((size_t)n_io_concentrators, sizeof(int64_t)); - assert(data_offset != NULL); - *source_data_offset = data_offset; - } - - if (ioc_datasize == NULL) { - ioc_datasize = (int64_t *)calloc((size_t)n_io_concentrators, sizeof(int64_t)); - assert(ioc_datasize != NULL); - *sf_datasize = ioc_datasize; - } - - if (ioc_offset == NULL) { - ioc_offset = (int64_t *)calloc((size_t)n_io_concentrators, sizeof(int64_t)); - assert(ioc_offset != NULL); - *sf_offset = ioc_offset; - } - - if (ioc_type == NULL) { - ioc_type = (MPI_Datatype *)calloc((size_t)n_io_concentrators, sizeof(MPI_Datatype)); - assert(ioc_type != NULL); - *sf_dtype = ioc_type; - } - - for(k=0; k < n_io_concentrators; k++) { - ioc_datasize[k] = 0; - ioc_offset[k] = 0; - /* Free previously used datatypes */ - if (ioc_type[k] && - (ioc_type[k] != MPI_DATATYPE_NULL) && - (ioc_type[k] != MPI_BYTE)) - MPI_Type_free(&ioc_type[k]); - else ioc_type[k] = MPI_DATATYPE_NULL; - } - - if (data_extent) { - int next_index = ioc_start; - int64_t target_bytes; - int64_t total_bytes_remaining = data_extent; - int64_t row_base = starting_row * sf_stripe_size; - int64_t subfile_offset = row_base + first_io_offset; - int64_t source_offset = 0; - int64_t remaining_bytes_in_row = ((n_io_concentrators - ioc_start) * sf_stripe_size) - first_io_offset; - - ioc_depth = (final_row - starting_row) +1; - if ((ioc_start > ioc_last) && (data_extent > remaining_bytes_in_row)) ioc_depth--; - - while(total_bytes_remaining > 0) { - target_bytes = 0; - if (next_index == ioc_start) { - target_bytes = first_io; - } - if (next_index == ioc_last) { - target_bytes += last_io; - ioc_depth--; - } - if (ioc_depth) { - if (next_index == ioc_start) - target_bytes += (sf_stripe_size * (ioc_depth -1)); - else target_bytes += (sf_stripe_size * ioc_depth); - } - - data_offset[next_index] = source_offset; - ioc_datasize[next_index] += target_bytes; - ioc_offset[next_index] += subfile_offset; - total_bytes_remaining -= target_bytes; - /* - * With the exception of the very 1st IO, all additional - * IO operations start on a slice_boundary (and this is - * consistent across the collection of IOCs). - */ - - subfile_offset = row_base; - - /* - * Possibly Create an MPI datatype for each MPI_Send operation. 
- * If the length allows writing into a single stripe on - * a single IOC, then we can use the MPI_BYTE datatype. - */ - - - if (next_index == ioc_start) { /* First target */ - if (next_index == ioc_last) { - ioc_type[next_index] = - H5FD__create_f_l_mpi_type(sf_context, target_bytes, - first_io, last_io, ioc_depth+1); - } else { - ioc_type[next_index] = - H5FD__create_first_mpi_type(sf_context, ioc_offset[next_index], - target_bytes, first_io, ioc_depth); - } - source_offset += first_io; - } - else { - if (next_index == ioc_last) { - ioc_type[next_index] = - H5FD__create_final_mpi_type(sf_context, - target_bytes, last_io, ioc_depth+1); - } else { - ioc_type[next_index] = - H5FD__create_mpi_uniform_type(sf_context,ioc_offset[next_index], - target_bytes, ioc_depth); - } - source_offset += sf_stripe_size; - } - - if (++next_index == n_io_concentrators) { - next_index = 0; - row_base += sf_stripe_size; - subfile_offset = row_base; - } - } - } - return 0; -} - +/* + * --------------------------------------------------- + * Topology discovery related functions for choosing + * IO Concentrator (IOC) ranks. + * Currently, the default approach for assigning an IOC + * is select the lowest MPI rank on each node. + * + * The approach collectively generates N tuples + * consisting of the MPI rank and hostid. This + * collection is then sorted by hostid and scanned + * to identify the IOC ranks. + * + * As time permits, addition assignment methods will + * be implemented, e.g. 1-per-Nranks or via a config + * option. Additional selection methodologies can + * be included as users get more experience using the + * subfiling implementation. + * --------------------------------------------------- + */ -static int compare_hostid(const void *h1, const void *h2) +/*------------------------------------------------------------------------- + * Function: compare_hostid + * + * Purpose: qsort sorting function. + * Compares tuples of 'layout_t'. The sorting is based on + * the long hostid values. + * + * Return: result of: (hostid1 > hostid2) + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +static int +compare_hostid(const void *h1, const void *h2) { - const layout_t *host1 = (const layout_t *)h1; - const layout_t *host2 = (const layout_t *)h2; + const layout_t *host1 = (const layout_t *) h1; + const layout_t *host2 = (const layout_t *) h2; return (host1->hostid > host2->hostid); } - -static void gather_topology_info(sf_topology_t *info) +/*------------------------------------------------------------------------- + * Function: gather_topology_info + * + * Purpose: Collectively generate a sorted collection of hostid+mpi_rank + * tuples. The result is returned in the 'topology' field + * of the sf_topology_t structure. + * + * Return: Sorted array of hostid/mpi_rank tuples. + * Errors: MPI_Abort if memory cannot be allocated. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
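To make the tuple-gathering step concrete, here is a minimal standalone sketch of the same Allgather-then-qsort pattern: every rank contributes a (rank, hostid) pair, and sorting the gathered set leaves ranks that share a node adjacent to one another. As in the patch, the tuple is assumed to be equivalent to two longs so it can travel as 2 x MPI_LONG; the names (node_tuple_t, cmp_hostid) are illustrative only, and the comparator returns a three-way result rather than the boolean form used above.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

typedef struct { long rank; long hostid; } node_tuple_t;

/* Three-way comparison keeps qsort correct for less-than, equal, and greater-than */
static int cmp_hostid(const void *a, const void *b)
{
    long h1 = ((const node_tuple_t *) a)->hostid;
    long h2 = ((const node_tuple_t *) b)->hostid;
    return (h1 > h2) - (h1 < h2);
}

int main(int argc, char **argv)
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    node_tuple_t  me  = { rank, gethostid() };
    node_tuple_t *all = calloc((size_t) size, sizeof(node_tuple_t));
    if (all == NULL)
        MPI_Abort(MPI_COMM_WORLD, 1);

    /* Every rank contributes (rank, hostid); every rank receives the full set */
    MPI_Allgather(&me, 2, MPI_LONG, all, 2, MPI_LONG, MPI_COMM_WORLD);
    qsort(all, (size_t) size, sizeof(node_tuple_t), cmp_hostid);

    if (rank == 0)   /* after sorting, ranks that share a hostid are adjacent */
        printf("lowest tuple: rank=%ld hostid=%ld\n", all[0].rank, all[0].hostid);

    free(all);
    MPI_Finalize();
    return 0;
}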
+ * + *------------------------------------------------------------------------- + */ +static void +gather_topology_info(sf_topology_t *info) { - sf_world_size = info->world_size; - sf_world_rank = info->world_rank; + sf_world_size = info->world_size; + sf_world_rank = info->world_rank; - if (info->topology) - return; + if (info->layout) + return; if (sf_world_size) { - long hostid = gethostid(); - layout_t my_hostinfo; - layout_t *topology = (layout_t *)calloc((size_t)sf_world_size+1, sizeof(layout_t)); - if (topology == NULL) { + long hostid = gethostid(); + layout_t my_hostinfo; + layout_t *layout = + (layout_t *) calloc((size_t) sf_world_size + 1, sizeof(layout_t)); + if (layout == NULL) { perror("calloc failure!"); MPI_Abort(MPI_COMM_WORLD, 1); } - info->hostid = hostid; - info->topology = topology; - my_hostinfo.rank = sf_world_rank; - my_hostinfo.hostid = hostid; - info->topology[sf_world_rank] = my_hostinfo; - if (sf_world_size > 1) { - if (MPI_Allgather(&my_hostinfo, 2, MPI_LONG, - info->topology, 2, MPI_LONG, - MPI_COMM_WORLD) == MPI_SUCCESS) { - qsort(info->topology, (size_t)sf_world_size, sizeof(layout_t), compare_hostid); - } - } + info->hostid = hostid; + info->layout = layout; + my_hostinfo.rank = sf_world_rank; + my_hostinfo.hostid = hostid; + info->layout[sf_world_rank] = my_hostinfo; + if (sf_world_size > 1) { + if (MPI_Allgather(&my_hostinfo, 2, MPI_LONG, info->layout, 2, + MPI_LONG, MPI_COMM_WORLD) == MPI_SUCCESS) { + qsort(info->layout, (size_t) sf_world_size, sizeof(layout_t), + compare_hostid); + } + } } } -static int count_nodes(sf_topology_t *info) +/*------------------------------------------------------------------------- + * Function: count_nodes + * + * Purpose: Initializes the sorted collection of hostid+mpi_rank + * tuples. After initialization, the collection is scanned + * to determine the number of unique hostid entries. This + * value will determine the number of actual IO concentrators + * that available to the application. A side effect is to + * identify the 'node_index' of the current process. + * + * Return: The number of unique hostid's (nodes). + * Errors: MPI_Abort if memory cannot be allocated. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +static int +count_nodes(sf_topology_t *info) { - int k, node_count, hostid_index = -1; + int k, node_count, hostid_index = -1; long nextid; - assert(info != NULL); - if (info->topology == NULL) - gather_topology_info (info); + assert(info != NULL); + if (info->layout == NULL) + gather_topology_info(info); - nextid = info->topology[0].hostid; - info->node_ranks = (int *)calloc((size_t)(info->world_size+1), sizeof(int)); - assert(info->node_ranks != NULL); + nextid = info->layout[0].hostid; + info->node_ranks = + (int *) calloc((size_t)(info->world_size + 1), sizeof(int)); + assert(info->node_ranks != NULL); if (nextid == info->hostid) - hostid_index = 0; + hostid_index = 0; node_count = 1; - /* Recall that the topology array has been sorted! */ - for (k=1; k < info->world_size; k++) { - if (info->topology[k].hostid != nextid) { - nextid = info->topology[k].hostid; + /* Recall that the topology array has been sorted! 
*/ + for (k = 1; k < info->world_size; k++) { + if (info->layout[k].hostid != nextid) { + nextid = info->layout[k].hostid; if (hostid_index < 0) { - if (nextid == info->hostid) hostid_index = k; + if (nextid == info->hostid) + hostid_index = k; } - /* Record the index of new hostid */ + /* Record the index of new hostid */ info->node_ranks[node_count++] = k; } } @@ -321,524 +196,1028 @@ static int count_nodes(sf_topology_t *info) return info->node_count = node_count; } +/*------------------------------------------------------------------------- + * Function: H5FD__determine_ioc_count + * + * Purpose: Once a sorted collection of hostid/mpi_rank tuples has been + * created and the number of unique hostids (nodes) has + * been determined, we may modify this "default" value for + * the number of IO Concentrators for this application. + * + * The default of one(1) IO concentrator per node can be + * changed (principally for testing) by environment variable. + * if IOC_COUNT_PER_NODE is defined, then that integer value + * is utilized as a mulitiplier to modify the set of + * IO Concentrator ranks. + * + * The cached results will be replicated within the + * subfiling_context_t structure and is utilized as a map from + * io concentrator rank to MPI communicator rank for message + * sends and receives. + * + * Return: The number of IO Concentrator ranks. We also cache + * the MPI ranks in the 'io_concentrator' vector variable. + * The length of this vector is cached as 'n_io_concentrators'. + * Errors: MPI_Abort if memory cannot be allocated. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: - Initial Version/None. + * - Updated the API to allow a variety of methods for + * determining the number and MPI ranks that will have + * IO Concentrators. The default approach will define + * a single IOC per node. + * + *------------------------------------------------------------------------- + */ int -H5FD__determine_ioc_count(int world_size, int world_rank, sf_topology_t **thisapp) +H5FD__determine_ioc_count(int world_size, int world_rank, + sf_ioc_selection_t ioc_select_method, char *ioc_select_option, + sf_topology_t **thisapp) { - static int ioc_count = 0; - if (!ioc_count) { - int k, node; - int node_index; - int iocs_per_node = 1; - char *envValue = NULL; - sf_topology_t *app_topology = (sf_topology_t *)malloc(sizeof(sf_topology_t)); - assert(app_topology != NULL); - memset(app_topology, 0, sizeof(sf_topology_t)); - app_topology->world_size = world_size; - app_topology->world_rank = world_rank; - - io_concentrator = (int *)malloc(((size_t)world_size * sizeof(int))); - assert(io_concentrator != NULL); - ioc_count = count_nodes (app_topology); - /* FIXME: This should ONLY be used for testing! - * For production, we should probably limit the - * number to a single IOC per node... - * (based on performance numbers) - */ - if ((envValue = getenv("IOC_COUNT_PER_NODE")) != NULL) { - int value_check = atoi(envValue); - if (value_check > 0) { - iocs_per_node = value_check; - } - } - - /* 'node_ranks' contain the index of the first instance of a hostid - * in the sorted sf_topology array. Our own index is 'node_index'. - */ - node_index = app_topology->node_index; - app_topology->local_peers = app_topology->node_ranks[node_index+1] - - app_topology->node_ranks[node_index]; - if (app_topology->topology[node_index].rank == world_rank) { - app_topology->rank_is_ioc = true; - app_topology->subfile_rank = node_index; - } - /* FIXME: This should ONLY be used for testing! 
- * NOTE: The app_topology->local_peers is ONLY valid - * for the current NODE. There is no guarantee that - * the application layout defines a uniform number of - * MPI ranks per node... - * Because this is only for testing purposes (at this time) - * we can live with the assumption that if we define the - * IOC_COUNT_PER_NODE environment variable, then each - * node will have *at-least* that many MPI ranks assigned. - * See above! - */ - else if ((app_topology->local_peers > 1) && (iocs_per_node > 1)) { - if (iocs_per_node > app_topology->local_peers) - iocs_per_node = app_topology->local_peers; - for(k=1; k< iocs_per_node; k++) { - if (app_topology->topology[node_index + k].rank == world_rank) { - app_topology->rank_is_ioc = true; - app_topology->subfile_rank = node_index + k; - break; - } - } - } - /* More hacks for testing */ - if (io_concentrator) { - int n_iocs = 0; - for(node = 0; node < ioc_count; node++) { - for (k=0; k < iocs_per_node; k++) { - node_index = app_topology->node_ranks[node]; - io_concentrator[n_iocs++] = (int)( - app_topology->topology[node_index + k].rank); - } - } - ioc_count = n_io_concentrators = n_iocs; + static int ioc_count = 0; + static int64_t topology_id = 0; + static sf_ioc_selection_t ioc_selection = ioc_selection_options; + sf_topology_t * app_topology = NULL; + + assert(thisapp != NULL); + + if (!ioc_count || (ioc_selection != ioc_select_method)) { + int k, node; + int node_index; + int iocs_per_node = 1; + char * envValue = NULL; + int * io_concentrator = NULL; + int index = (int) ioc_select_method; + int64_t tag = (int64_t) SF_TOPOLOGY; + topology_id = (int64_t)((tag << 32) | index); + + app_topology = (sf_topology_t *) get_subfiling_object(topology_id); + assert(app_topology != NULL); + app_topology->world_size = world_size; + app_topology->world_rank = world_rank; + if (app_topology->io_concentrator == NULL) { + app_topology->io_concentrator = io_concentrator = + (int *) malloc(((size_t) world_size * sizeof(int))); } + assert(io_concentrator != NULL); + app_topology->selection_type = ioc_selection = ioc_select_method; + + if (ioc_select_method == SELECT_IOC_ONE_PER_NODE) { + ioc_count = count_nodes(app_topology); + /* FIXME: This should ONLY be used for testing! + * For production, we should probably limit the + * number to a single IOC per node... + * (based on performance numbers) + */ + if ((envValue = getenv("IOC_COUNT_PER_NODE")) != NULL) { + int value_check = atoi(envValue); + if (value_check > 0) { + iocs_per_node = value_check; + } + } - if (ioc_count > 0) { - *thisapp = app_topology; - } + /* 'node_ranks' contain the index of the first instance of a hostid + * in the sorted sf_topology array. Our own index is 'node_index'. + */ + node_index = app_topology->node_index; + app_topology->local_peers = + app_topology->node_ranks[node_index + 1] - + app_topology->node_ranks[node_index]; + if (app_topology->layout[node_index].rank == world_rank) { + app_topology->rank_is_ioc = true; + app_topology->subfile_rank = node_index; + } + /* FIXME: This should ONLY be used for testing! + * NOTE: The app_topology->local_peers is ONLY valid + * for the current NODE. There is no guarantee that + * the application layout defines a uniform number of + * MPI ranks per node... + * Because this is only for testing purposes (at this time) + * we can live with the assumption that if we define the + * IOC_COUNT_PER_NODE environment variable, then each + * node will have *at-least* that many MPI ranks assigned. + * See above! 
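The selection loop sketched in the comments above reduces to the following: given the sorted layout and the node_ranks index of each node's first entry, the first iocs_per_node ranks of every node are appended to the io_concentrator map. This is an illustrative standalone function, not the patch's code, and it assumes the caller has already clamped iocs_per_node to the number of ranks actually present on each node.

typedef struct { long rank; long hostid; } node_tuple_t;

/* Fill io_concentrator[] with the MPI ranks that will act as IOCs and return
 * how many were assigned (the length of the IOC index -> MPI rank map). */
static int
assign_iocs(const node_tuple_t *layout, const int *node_ranks, int node_count,
            int iocs_per_node, int *io_concentrator)
{
    int n_iocs = 0;
    for (int node = 0; node < node_count; node++) {
        int first = node_ranks[node];   /* first layout entry for this node */
        for (int k = 0; k < iocs_per_node; k++)
            io_concentrator[n_iocs++] = (int) layout[first + k].rank;
    }
    return n_iocs;
}

With iocs_per_node left at 1 this simply picks the first listed rank on each node, which matches the default one-IOC-per-node policy described above.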
+ */ + else if ((app_topology->local_peers > 1) && (iocs_per_node > 1)) { + if (iocs_per_node > app_topology->local_peers) + iocs_per_node = app_topology->local_peers; + for (k = 1; k < iocs_per_node; k++) { + if (app_topology->layout[node_index + k].rank == + world_rank) { + app_topology->rank_is_ioc = true; + app_topology->subfile_rank = node_index + k; + break; + } + } + } + /* More hacks for testing */ + if (io_concentrator) { + int n_iocs = 0; + for (node = 0; node < ioc_count; node++) { + for (k = 0; k < iocs_per_node; k++) { + node_index = app_topology->node_ranks[node]; + io_concentrator[n_iocs++] = + (int) (app_topology->layout[node_index + k].rank); + } + } + ioc_count = n_iocs; + } + + if (ioc_count > 0) { + app_topology->n_io_concentrators = ioc_count; + *thisapp = app_topology; + // topology_id = (hid_t)record_subfiling_object(SF_TOPOLOGY, + // app_topology); + } + } else { + if (world_rank == 0) { + printf("[%d - %s] IOC_selection(%d) with option(%s) is not " + "supported\n", + world_rank, __func__, (int) ioc_select_method, + ioc_select_option); + } + } + } else { + app_topology = (sf_topology_t *) get_subfiling_object(topology_id); + *thisapp = app_topology; } return ioc_count; } -int -H5FD__init_subfile_context(subfiling_context_t **newContext, int n_iocs, int world_size, int world_rank, bool rank_is_ioc) -{ - int status; - subfiling_context_t *next = (subfiling_context_t *) malloc(sizeof(subfiling_context_t)); - assert(next != NULL); - memset(next,0, sizeof(subfiling_context_t)); - - if (io_concentrator == NULL) { - goto err_exit; - } - else { - int k; - char *envValue = NULL; - int ioc_leader = io_concentrator[0]; - int app_leader = 0; - *newContext = next; - next->sf_stripe_size = DEFAULT_STRIPE_SIZE; - if ((envValue = getenv("IOC_STRIPE_SIZE")) != NULL) { - long value_check = atol(envValue); - if (value_check > 0) { - next->sf_stripe_size = (int64_t)value_check; - } - } - if ((envValue = getenv("SUBFILE_PREFIX")) != NULL) { - char temp[PATH_MAX]; - sprintf(temp,"%s", envValue); - next->subfile_prefix = strdup(temp); - sf_subfile_prefix = strdup(temp); - } - - next->sf_blocksize_per_stripe = next->sf_stripe_size * n_iocs; - status = MPI_Comm_dup(MPI_COMM_WORLD, &next->sf_msg_comm); - if (status != MPI_SUCCESS) goto err_exit; - status = MPI_Comm_set_errhandler(next->sf_msg_comm, MPI_ERRORS_RETURN); - if (status != MPI_SUCCESS) goto err_exit; - status = MPI_Comm_dup(MPI_COMM_WORLD, &next->sf_data_comm); - if (status != MPI_SUCCESS) goto err_exit; - status = MPI_Comm_set_errhandler(next->sf_data_comm, MPI_ERRORS_RETURN); - if (status != MPI_SUCCESS) goto err_exit; - - k = 0; - while(is_io_concentrator(k)) - k++; - app_leader = k; - - /* Do this now rather than having the ioc thread - * update the value - */ - if (rank_is_ioc) { - sf_stripe_size = next->sf_stripe_size; - } - - if (sf_verbose_flag && (world_rank == 0)) { - printf("app_leader = %d and ioc_leader = %d\n", app_leader, ioc_leader); - } - - if (n_iocs > 1) { - status = MPI_Comm_split(MPI_COMM_WORLD, rank_is_ioc, world_rank, &next->sf_group_comm); - if (status != MPI_SUCCESS) goto err_exit; - status = MPI_Comm_size(next->sf_group_comm, &next->sf_group_size); - if (status != MPI_SUCCESS) goto err_exit; - status = MPI_Comm_rank(next->sf_group_comm, &next->sf_group_rank); - if (status != MPI_SUCCESS) goto err_exit; - /* - * There may be additional functionality we need for the IOCs... - * If so, then can probably initialize those things here! 
- */ - } - else { - next->sf_group_comm = MPI_COMM_NULL; - } - - if (rank_is_ioc) { - status = initialize_ioc_threads(next); - if (status) goto err_exit; - } - } - return 0; - -err_exit: - if (next) { - free(next); - } - return -1; -} - - -/* ---------------------------------------------------------------------------------- - The data that we're sending to receiving from an IO concentrator (IOC) contains - the initial collection of bytes. The length of this initial segment is 'first_write'. - Note that the terminology isn't significant. We are describing an IO operation in - terms of an MPI datatype which will either gather data from a source buffer - to send to an IOC or will be used to unpack data from an IOC into a user buffer. - Subsequent IO operations which are related to the current File IO will begin on - sf_stripe_size boundaries. ---------------------------------------------------------------------------------- -*/ - -static MPI_Datatype H5FD__create_first_mpi_type( - subfiling_context_t *context, int64_t offset, - int64_t target_write_bytes, int64_t first_write, int ioc_depth) +/* ===================================================================== */ +/* MPI_Datatype Creation functions. + * These are catagorized by usage paterns, i.e. when data is sent to or + * received from and IOC, the initial data offset provided by the user + * may or may NOT start on a stripe boundary. Because this, the initial + * data segment to the selected IOC will often be less than 'stripe_size' + * in length. The purpose of these Datatype creation functions is to + * enable the gathering of all data from this client to the IOC target + * into a single MPI message. The MPI datatype will the be utilized by + * the sending function to pack data into a contiguous block of memory + * which enables the IOC to write to disk in an effective manner. + * ===================================================================== */ + +/*------------------------------------------------------------------------- + * Function: H5FD__create_first_mpi_type + * + * Purpose: Return an appropriate MPI datatype to represent the initial + * IO operation when reading or writing data to or from an IO + * Concentrator (IOC). + * + * If the 'first_io' is sufficient to complete the IO to the + * IOC, then the returned MPI datatype will simply be MPI_BYTE. + * For all other non-zero length IO operations, we create a + * derived MPI datatype using MPI_Type_indexed. The 'ioc_depth' + * input will define the number of blocks/disps pairs that are + * required to represent the desired IO operation. + * + * Return: The MPI_Datatype that will be used to send or receive data. + * Errors: MPI_Type_NULL if for any reason, the MPI_Datatype creation + * fails. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
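As a concrete illustration of the datatype just described, the sketch below builds a "first segment" type with MPI_Type_indexed: one short leading block that runs from the requested offset to the stripe boundary, followed by full stripes spaced blocksize_per_stripe bytes apart so the other IOCs' stripes are skipped. The helper name and the assumption that every count fits in an int are illustrative; this is a sketch of the technique, not the patch's helper.

#include <mpi.h>
#include <stdint.h>
#include <stdlib.h>

static MPI_Datatype
make_first_segment_type(int64_t first_io, int64_t stripe_size,
    int64_t blocksize_per_stripe, int64_t offset_in_stripe, int ioc_depth)
{
    MPI_Datatype dt      = MPI_DATATYPE_NULL;
    int          nblocks = ioc_depth + 1;   /* leading partial block + full stripes */
    int64_t      next    = blocksize_per_stripe - offset_in_stripe;
    int *blocks = malloc((size_t) nblocks * sizeof(int));
    int *disps  = malloc((size_t) nblocks * sizeof(int));

    if (blocks == NULL || disps == NULL) {
        free(blocks);
        free(disps);
        return dt;
    }
    blocks[0] = (int) first_io;             /* partial block up to the stripe edge */
    disps[0]  = 0;
    for (int k = 1; k < nblocks; k++) {
        blocks[k] = (int) stripe_size;      /* full stripes from here on */
        disps[k]  = (int) next;
        next     += blocksize_per_stripe;   /* hop over the other IOCs' stripes */
    }
    if (MPI_Type_indexed(nblocks, blocks, disps, MPI_BYTE, &dt) == MPI_SUCCESS)
        MPI_Type_commit(&dt);
    free(blocks);
    free(disps);
    return dt;
}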
+ * + *------------------------------------------------------------------------- + */ +static MPI_Datatype +H5FD__create_first_mpi_type(subfiling_context_t *context, int ioc_depth, + int64_t offset, int64_t target_write_bytes, int64_t first_io) { MPI_Datatype newType = MPI_DATATYPE_NULL; - int64_t stripe_size = context->sf_stripe_size; - int64_t offset_in_stripe = offset % sf_stripe_size; - int64_t depth_in_bytes = sf_stripe_size * ioc_depth; - int64_t next_offset = context->sf_blocksize_per_stripe - offset_in_stripe; - int64_t total_bytes = first_write; - - assert(ioc_depth > 0); - if (stripe_size >= depth_in_bytes) - return MPI_BYTE; - - if (depth_in_bytes) { - int k; - int temp_blocks[64]; - int temp_disps[64]; + int64_t stripe_size = context->sf_stripe_size; + int64_t blocksize_per_stripe = context->sf_blocksize_per_stripe; + int64_t offset_in_stripe = offset % stripe_size; + int64_t next_offset = blocksize_per_stripe - offset_in_stripe; + int64_t total_bytes = first_io; + + if (first_io == target_write_bytes) { + if (first_io > 0) { + return MPI_BYTE; + } + } + if (first_io) { + int k; + int temp_blocks[64]; + int temp_disps[64]; int *blocks = temp_blocks; int *disps = temp_disps; if (ioc_depth > 64) { - blocks = (int *)calloc((size_t)ioc_depth, sizeof(int)); - disps = (int *)calloc((size_t)ioc_depth, sizeof(int)); - } - blocks[0] = (int)first_write; + blocks = (int *) calloc((size_t) ioc_depth, sizeof(int)); + if (blocks == NULL) { + perror("calloc"); + return newType; + } + disps = (int *) calloc((size_t) ioc_depth, sizeof(int)); + if (disps == NULL) { + perror("calloc"); + return newType; + } + } + blocks[0] = (int) first_io; disps[0] = (int) 0; - for(k=1; k < ioc_depth; k++) { - disps[k] = (int)next_offset; - blocks[k] = (int)stripe_size; - total_bytes += stripe_size; + for (k = 1; k <= ioc_depth; k++) { + disps[k] = (int) next_offset; + blocks[k] = (int) stripe_size; + total_bytes += stripe_size; next_offset += context->sf_blocksize_per_stripe; } - if (total_bytes != target_write_bytes) { - printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n", - __func__, total_bytes, target_write_bytes); - } - - if (MPI_Type_indexed(ioc_depth, blocks, disps, MPI_BYTE, &newType) != MPI_SUCCESS) { + if (total_bytes != target_write_bytes) { + printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n", + __func__, total_bytes, target_write_bytes); + } + + if (MPI_Type_indexed(k, blocks, disps, MPI_BYTE, &newType) != + MPI_SUCCESS) { perror("MPI_Type_indexed failed!"); - return MPI_DATATYPE_NULL; + return newType; } MPI_Type_commit(&newType); + if (1) { + int type_size; + MPI_Type_size(newType, &type_size); + if (type_size != target_write_bytes) { + printf("%s: type_size=%d should be: %ld\n", __func__, type_size, + target_write_bytes); + } + } if (ioc_depth > 64) { - if (blocks != temp_blocks) { - free(blocks); - blocks = NULL; - } - if (disps != temp_disps) { - free(disps); - disps = NULL; - } + if (blocks != temp_blocks) { + free(blocks); + blocks = NULL; + } + if (disps != temp_disps) { + free(disps); + disps = NULL; + } } } return newType; } -/* ---------------------------------------------------------------------------------- - The data that we're sending to an IO concentrator (IOC) contains the final - collection of bytes. Other than that detail, this is pretty much like the - typical' case... All chunks sizes are the identical (execpt for the very - last chunk) and all will start at relative stripe offset of 0. 
More precisely, - the start offset is a multiple of the subfiling "stripe_size". - We can utilize MPI_Type_indexed to represent the new type. ---------------------------------------------------------------------------------- -*/ -static MPI_Datatype H5FD__create_final_mpi_type(subfiling_context_t *context, int64_t target_write_bytes, int64_t last_write, int ioc_depth) +/*------------------------------------------------------------------------- + * Function: H5FD__create_final_mpi_type + * + * Purpose: Return an appropriate MPI datatype to represent the final + * IO operation when reading or writing data to or from an IO + * Concentrator (IOC). + * + * The data that we're sending to an IO concentrator (IOC) + * contains the final collection of bytes. Other than that detail, + * this is pretty much like the typical' IO case, i.e. all block + * sizes are identical (execpt for the very last block). + *Furthermore, they all start at relative stripe offset of 0, in other words on + *a 'stripe_size' boundary. + * + * Return: The MPI_Datatype that will be used to send or receive data. + * Errors: MPI_Type_NULL if for any reason, the MPI_Datatype creation + * fails. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +static MPI_Datatype +H5FD__create_final_mpi_type(subfiling_context_t *context, int ioc_depth, + int64_t target_write_bytes, int64_t last_write) { MPI_Datatype newType = MPI_DATATYPE_NULL; - int64_t stripe_size = context->sf_stripe_size; - int64_t depth_in_bytes = (stripe_size * ioc_depth) + last_write; - int64_t total_bytes = last_write; - - assert(ioc_depth > 0); + int64_t stripe_size = context->sf_stripe_size; + int64_t depth_in_bytes = (stripe_size * ioc_depth) + last_write; + int64_t total_bytes = last_write; - if (depth_in_bytes <= stripe_size) - return MPI_BYTE; + if (depth_in_bytes == target_write_bytes) { + if (depth_in_bytes > 0) { + return MPI_BYTE; + } + } if (depth_in_bytes) { - int k; - int temp_blocks[64]; - int temp_disps[64]; + int k; + int temp_blocks[64]; + int temp_disps[64]; int *blocks = temp_blocks; int *disps = temp_disps; if (ioc_depth > 64) { - blocks = (int *)calloc((size_t)ioc_depth, sizeof(int)); - disps = (int *)calloc((size_t)ioc_depth, sizeof(int)); - } - - for(k=0; k < ioc_depth; k++) { - disps[k] = (int)(k * context->sf_blocksize_per_stripe); - blocks[k] = (int)stripe_size; - total_bytes += stripe_size; - } - blocks[k-1] = (int)last_write; - if (total_bytes != target_write_bytes) { - printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n", - __func__, total_bytes, target_write_bytes); - } + blocks = (int *) calloc((size_t) ioc_depth, sizeof(int)); + if (blocks == NULL) { + perror("calloc"); + return newType; + } + disps = (int *) calloc((size_t) ioc_depth, sizeof(int)); + if (disps == NULL) { + perror("calloc"); + return newType; + } + } + + for (k = 0; k < ioc_depth; k++) { + disps[k] = (int) (k * context->sf_blocksize_per_stripe); + blocks[k] = (int) stripe_size; + total_bytes += stripe_size; + } + blocks[k - 1] = (int) last_write; + if (total_bytes != target_write_bytes) { + printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n", + __func__, total_bytes, target_write_bytes); + } - if (MPI_Type_indexed(ioc_depth, blocks, disps, MPI_BYTE, &newType) != MPI_SUCCESS) { + if (MPI_Type_indexed(ioc_depth, blocks, disps, MPI_BYTE, &newType) != + MPI_SUCCESS) { return MPI_DATATYPE_NULL; } MPI_Type_commit(&newType); if (ioc_depth 
> 64) { - if (blocks != temp_blocks) { - free(blocks); - blocks = NULL; - } - if (disps != temp_disps) { - free(disps); - disps = NULL; - } + if (blocks != temp_blocks) { + free(blocks); + blocks = NULL; + } + if (disps != temp_disps) { + free(disps); + disps = NULL; + } } } return newType; } -/* ---------------------------------------------------------------------------------- - Special case where the current IOC has both the first and final write chunks. - This implmentation is a merge of the first_mpi_type and final_mpi_type - functions. ---------------------------------------------------------------------------------- -*/ -static MPI_Datatype H5FD__create_f_l_mpi_type(subfiling_context_t *context, - int64_t target_write_bytes, - int64_t first_write, - int64_t last_write, int ioc_depth) +/*------------------------------------------------------------------------- + * Function: H5FD__create_f_l_mpi_type + * + * Purpose: Return an appropriate MPI datatype which includes both the + * first and final IO data segments. + * + * A special case where the current IOC has both the first and + * final write blocks. This function is basically a merge of + * the first_mpi_type and final_mpi_type functions. + * + * Return: The MPI_Datatype that will be used to send or receive data. + * Errors: MPI_Type_NULL if for any reason, the MPI_Datatype creation + * fails. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +static MPI_Datatype +H5FD__create_f_l_mpi_type(subfiling_context_t *context, int ioc_depth, + int64_t offset, int64_t target_write_bytes, int64_t first_write, + int64_t last_write) { MPI_Datatype newType = MPI_DATATYPE_NULL; - int64_t stripe_size = context->sf_stripe_size; - int64_t depth_in_bytes = stripe_size * ioc_depth; - int64_t offset_in_stripe = stripe_size - first_write; - int64_t next_offset = context->sf_blocksize_per_stripe - offset_in_stripe; - int64_t total_bytes = first_write + last_write; - - assert(ioc_depth > 0); - if (last_write == 0) { - newType = MPI_BYTE; - } - else if (depth_in_bytes) { - int k; - int temp_blocks[64]; - int temp_disps[64]; + int64_t stripe_size = context->sf_stripe_size; + int64_t blocksize_per_stripe = context->sf_blocksize_per_stripe; + int64_t offset_in_stripe = offset % stripe_size; + int64_t next_offset = blocksize_per_stripe - offset_in_stripe; + int64_t total_bytes = first_write + last_write; + + /* We might actaully check that the 'target_write_bytes' + * input variable exceeds 2Gb. If so, then we should + * always create a derived type. + */ + if ((total_bytes == target_write_bytes) && + (context->topology->n_io_concentrators == 1)) { + return MPI_BYTE; + } else if (first_write) { + int k; + int temp_blocks[64]; + int temp_disps[64]; int *blocks = temp_blocks; int *disps = temp_disps; -#if 0 - /* Depth in bytes might be incorrect... How? 
*/ - if (total_bytes < target_write_bytes) { - int64_t remaining = target_write_bytes - total_bytes; - ioc_depth = (remaining / stripe_size) +1; - } -#endif if (ioc_depth > 64) { - blocks = (int *)calloc((size_t)ioc_depth, sizeof(int)); - disps = (int *)calloc((size_t)ioc_depth, sizeof(int)); - } + blocks = (int *) calloc((size_t) ioc_depth, sizeof(int)); + if (blocks == NULL) { + perror("calloc"); + return newType; + } + disps = (int *) calloc((size_t) ioc_depth, sizeof(int)); + if (disps == NULL) { + perror("calloc"); + return newType; + } + } - blocks[0] = (int)first_write; + blocks[0] = (int) first_write; disps[0] = 0; - for(k=1; k < ioc_depth; k++) { - blocks[k] = (int)stripe_size; - disps[k] = (int)next_offset; - next_offset += context->sf_blocksize_per_stripe; - } - blocks[k-1] = (int)last_write; - if (ioc_depth > 2) total_bytes += (int64_t)((ioc_depth - 2) * stripe_size); + for (k = 1; k < ioc_depth; k++) { + blocks[k] = (int) stripe_size; + disps[k] = (int) next_offset; + next_offset += context->sf_blocksize_per_stripe; + total_bytes += stripe_size; + } + if (k == 1) { + disps[k] = (int) next_offset; + } + blocks[k] = (int) last_write; - if (total_bytes != target_write_bytes) { - printf("[%d] Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n", - sf_world_rank, __func__, total_bytes, target_write_bytes); - } + if (total_bytes != target_write_bytes) { + printf("[%d] Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n", + sf_world_rank, __func__, total_bytes, target_write_bytes); + } - if (MPI_Type_indexed(ioc_depth, blocks, disps, MPI_BYTE, &newType) != MPI_SUCCESS) { + if (MPI_Type_indexed(k + 1, blocks, disps, MPI_BYTE, &newType) != + MPI_SUCCESS) { perror("MPI_Type_indexed failed!"); return MPI_DATATYPE_NULL; } MPI_Type_commit(&newType); if (ioc_depth > 64) { - if (blocks != temp_blocks) { - free(blocks); - blocks = NULL; - } - if (disps != temp_disps) { - free(disps); - disps = NULL; - } + if (blocks != temp_blocks) { + free(blocks); + blocks = NULL; + } + if (disps != temp_disps) { + free(disps); + disps = NULL; + } } } return newType; } -/* ---------------------------------------------------------------------------------- - This is the 'typical' case in which the IOC has neither the first chunck nor - the last. All chunks sizes are the identical and start at offset = 0. - We utilize MPI_Type_indexed to represent the new type. ---------------------------------------------------------------------------------- -*/ -MPI_Datatype H5FD__create_mpi_uniform_type(subfiling_context_t *context, - int64_t offset, - int64_t target_write_bytes, int ioc_depth) +/*------------------------------------------------------------------------- + * Function: H5FD__create_mpi_uniform_type + * + * Purpose: Return an appropriate MPI datatype to represent the typical + * IO operation when reading or writing data to or from an IO + * Concentrator (IOC). + * + * Each data segment is of 'stripe_size' length and will be + * seperated from a previous or following segment by + * 'sf_blocksize_per_stripe' bytes of data. + * + * Return: The MPI_Datatype that will be used to send or receive data. + * Errors: MPI_Type_NULL if for any reason, the MPI_Datatype creation + * fails. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
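Because the uniform case is perfectly regular (ioc_depth blocks of stripe_size bytes, each separated by sf_blocksize_per_stripe bytes), the same layout can also be expressed with MPI_Type_vector. The patch itself builds the equivalent layout with MPI_Type_indexed, so the sketch below is only an alternative illustration, and it assumes the block and stride sizes fit in an int.

#include <mpi.h>
#include <stdint.h>

static MPI_Datatype
make_uniform_stripe_type(int ioc_depth, int64_t stripe_size,
    int64_t blocksize_per_stripe)
{
    MPI_Datatype dt = MPI_DATATYPE_NULL;

    /* count = ioc_depth blocks, each stripe_size bytes long, with a stride of
     * blocksize_per_stripe bytes (both expressed in units of MPI_BYTE) */
    if (MPI_Type_vector(ioc_depth, (int) stripe_size, (int) blocksize_per_stripe,
            MPI_BYTE, &dt) == MPI_SUCCESS)
        MPI_Type_commit(&dt);
    return dt;
}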
+ * + *------------------------------------------------------------------------- + */ +static MPI_Datatype +H5FD__create_mpi_uniform_type( + subfiling_context_t *context, int ioc_depth, int64_t target_write_bytes) { - /* Maintain some state between function calls allow reuse of the new datatypes... */ - static MPI_Datatype uniformType = MPI_DATATYPE_NULL; - static int64_t depth_in_bytes = 0; - MPI_Datatype newType = MPI_DATATYPE_NULL; - int64_t stripe_size = context->sf_stripe_size; - int64_t offset_in_stripe = offset % stripe_size; - int64_t check_depth = stripe_size * ioc_depth; - int64_t total_bytes = 0; + int64_t stripe_size = context->sf_stripe_size; + int64_t check_depth = stripe_size * ioc_depth; + int64_t total_bytes = 0; - assert(offset_in_stripe == 0); - assert(ioc_depth > 0); - - if (check_depth == stripe_size) - return MPI_BYTE; + if (check_depth == stripe_size) { + if (target_write_bytes > 0) { + return MPI_BYTE; + } + } - if (depth_in_bytes) { - if (depth_in_bytes != check_depth) { - MPI_Type_free(&uniformType); - depth_in_bytes = 0; - } - } - if (!depth_in_bytes) { - int k; - int temp_blocks[64]; - int temp_disps[64]; + if (target_write_bytes) { + int k; + int temp_blocks[64]; + int temp_disps[64]; int *blocks = temp_blocks; int *disps = temp_disps; if (ioc_depth > 64) { - blocks = (int *)calloc((size_t)ioc_depth, sizeof(int)); - disps = (int *)calloc((size_t)ioc_depth, sizeof(int)); - } - for(k=0; k < ioc_depth; k++) { - disps[k] = (int)(k * context->sf_blocksize_per_stripe); - blocks[k] = (int)(stripe_size); - total_bytes += stripe_size; - } - - if (total_bytes != target_write_bytes) { - printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n", - __func__, total_bytes, target_write_bytes); - } - - if (MPI_Type_indexed(ioc_depth, blocks, disps, MPI_BYTE, &uniformType) != MPI_SUCCESS) { + blocks = (int *) calloc((size_t) ioc_depth, sizeof(int)); + if (blocks == NULL) { + perror("calloc"); + return newType; + } + disps = (int *) calloc((size_t) ioc_depth, sizeof(int)); + if (disps == NULL) { + perror("calloc"); + return newType; + } + } + for (k = 0; k < ioc_depth; k++) { + disps[k] = (int) (k * context->sf_blocksize_per_stripe); + blocks[k] = (int) (stripe_size); + total_bytes += stripe_size; + } + + if (total_bytes != target_write_bytes) { + printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n", + __func__, total_bytes, target_write_bytes); + } + + if (MPI_Type_indexed(ioc_depth, blocks, disps, MPI_BYTE, &newType) != + MPI_SUCCESS) { perror("MPI_Type_indexed failed!"); return MPI_DATATYPE_NULL; } - MPI_Type_commit(&uniformType); + MPI_Type_commit(&newType); + if (1) { + int type_size; + MPI_Type_size(newType, &type_size); + if (type_size != target_write_bytes) { + printf("%s: type_size=%d should be: %ld\n", __func__, type_size, + target_write_bytes); + } + } + if (ioc_depth > 64) { - if (blocks != temp_blocks) { - free(blocks); - blocks = NULL; - } - if (disps != temp_disps) { - free(disps); - disps = NULL; - } + if (blocks != temp_blocks) { + free(blocks); + blocks = NULL; + } + if (disps != temp_disps) { + free(disps); + disps = NULL; + } } - depth_in_bytes = check_depth; } - MPI_Type_dup(uniformType, &newType); return newType; } - -int sf_read_independent(hid_t context_id, int64_t offset, int64_t elements, int dtype_extent, void *data) +static file_map_to_context_t *sf_open_file_map = NULL; +static int sf_file_map_size = 0; +#define DEFAULT_MAP_ENTRIES 8 + +/*------------------------------------------------------------------------- + * Function: 
record_fid_to_subfile + * + * Purpose: Every opened HDF5 file will have (if utilizing subfiling) + * a subfiling context associated with it. It is important that + * the HDF5 file index is a constant rather than utilizing a + * posix file handle since files can be opened multiple times + * and with each file open, a new file handle will be assigned. + * Note that in such a case, the actual filesystem id will be + * retained. + * + * We utilize that filesystem id (ino_t inode) so that + * irrespective of what process opens a common file, the + * subfiling system will generate a consistent context for this + * file across all parallel ranks. + * + * This function simply records the filesystem handle to + * subfiling context mapping. + * + * Return: SUCCEED or FAIL. + * Errors: FAILs ONLY if storage for the mapping entry cannot + * be allocated. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +static herr_t +record_fid_to_subfile(hid_t fid, hid_t subfile_context_id, int *next_index) { - static int *acks = NULL; - static int *indices = NULL; - static MPI_Request *ackreqs = NULL; - static MPI_Request *reqs = NULL; - static MPI_Status *stats = NULL; - static int64_t *source_data_offset = NULL; - static int64_t *ioc_read_datasize = NULL; - static int64_t *ioc_read_offset = NULL; - static MPI_Datatype *ioc_read_type = NULL; - - subfiling_context_t *sf_context = get_subfiling_object(context_id); - int i, ioc, n_waiting = 0, status = 0; - - assert(sf_context != NULL); - - if (acks == NULL) { - if ((acks = (int *)calloc((size_t)n_io_concentrators*2, sizeof(int))) == NULL) { + herr_t status = SUCCEED; + int index; + if (sf_file_map_size == 0) { + int i; + sf_open_file_map = (file_map_to_context_t *) malloc( + (size_t) DEFAULT_MAP_ENTRIES * sizeof(file_map_to_context_t)); + if (sf_open_file_map == NULL) { perror("calloc"); - return -1; + return FAIL; + } + sf_file_map_size = DEFAULT_MAP_ENTRIES; + for (i = 0; i < sf_file_map_size; i++) { + sf_open_file_map[i].h5_file_id = H5I_INVALID_HID; } - else indices = &acks[n_io_concentrators]; } - if (reqs == NULL) { - if ((reqs = (MPI_Request *)calloc((size_t)n_io_concentrators, sizeof(MPI_Request))) == NULL) { - perror("calloc"); - return -1; + for (index = 0; index < sf_file_map_size; index++) { + if (sf_open_file_map[index].h5_file_id == H5I_INVALID_HID) { + sf_open_file_map[index].h5_file_id = fid; + sf_open_file_map[index].sf_context_id = subfile_context_id; + if (next_index) { + *next_index = index; + } + return status; } } - if (ackreqs == NULL) { - if ((ackreqs = (MPI_Request *)calloc((size_t)n_io_concentrators, sizeof(MPI_Request))) == NULL) { - perror("calloc"); - return -1; + if (index == sf_file_map_size) { + int i; + sf_open_file_map = reallocarray(sf_open_file_map, + (size_t)(sf_file_map_size * 2), sizeof(file_map_to_context_t)); + if (sf_open_file_map == NULL) { + perror("realloc"); + return FAIL; } + sf_file_map_size *= 2; + for (i = index; i < sf_file_map_size; i++) { + sf_open_file_map[i].h5_file_id = H5I_INVALID_HID; + } + + if (next_index) { + *next_index = index; + } + + sf_open_file_map[index].h5_file_id = fid; + sf_open_file_map[index++].sf_context_id = subfile_context_id; } - if (stats == NULL) { - if ((stats = (MPI_Status *)calloc((size_t)n_io_concentrators, sizeof(MPI_Status))) == NULL) { - perror("calloc"); - return -1; + return status; +} + 
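The bookkeeping above boils down to a small slot table keyed by the HDF5 file id, with an invalid sentinel marking free entries. The self-contained sketch below mirrors the same life cycle (record at open, lookup on every IO, clear at close) using plain int64_t stand-ins for hid_t and H5I_INVALID_HID; it is illustrative only and omits the table growth that record_fid_to_subfile performs when the table fills.

#include <stdint.h>

#define MAP_INVALID  (-1)
#define MAP_ENTRIES  8

typedef struct { int64_t file_id; int64_t context_id; } map_entry_t;
static map_entry_t file_map[MAP_ENTRIES];

static void map_init(void)                        /* mark every slot as free */
{
    for (int i = 0; i < MAP_ENTRIES; i++)
        file_map[i].file_id = file_map[i].context_id = MAP_INVALID;
}

static int map_record(int64_t fid, int64_t ctx)   /* at file open */
{
    for (int i = 0; i < MAP_ENTRIES; i++)
        if (file_map[i].file_id == MAP_INVALID) {
            file_map[i].file_id    = fid;
            file_map[i].context_id = ctx;
            return 0;
        }
    return -1;   /* a real implementation grows the table here */
}

static int64_t map_lookup(int64_t fid)            /* on every read/write */
{
    for (int i = 0; i < MAP_ENTRIES; i++)
        if (file_map[i].file_id == fid)
            return file_map[i].context_id;
    return MAP_INVALID;
}

static void map_clear(int64_t fid)                /* at file close */
{
    for (int i = 0; i < MAP_ENTRIES; i++)
        if (file_map[i].file_id == fid)
            file_map[i].file_id = file_map[i].context_id = MAP_INVALID;
}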
+/*------------------------------------------------------------------------- + * Function: fid_map_to_context + * + * Purpose: This is a basic lookup function which returns the subfiling + * context id associated with the specified file->inode. + * + * Return: The Subfiling context ID if it exists. + * Errors: H5I_INVALID_HID if the inode to context map is not found. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +hid_t +fid_map_to_context(hid_t sf_fid) +{ + if (sf_open_file_map) { + int i; + for (i = 0; i < sf_file_map_size; i++) { + if (sf_open_file_map[i].h5_file_id == sf_fid) { + return sf_open_file_map[i].sf_context_id; + } } } + return H5I_INVALID_HID; +} - if (init__indep_io(sf_context, &source_data_offset, &ioc_read_datasize, &ioc_read_offset, - &ioc_read_type, offset, elements, dtype_extent) < 0) { - return -1; - } +/*------------------------------------------------------------------------- + * Function: clear_fid_map_entry + * + * Purpose: Remove the map entry associated with the file->inode. + * This is done at file close. + * + * Return: None + * Errors: Cannot fail. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +void +clear_fid_map_entry(hid_t sf_fid) +{ + if (sf_open_file_map) { + int i; + for (i = 0; i < sf_file_map_size; i++) { + if (sf_open_file_map[i].h5_file_id == sf_fid) { + sf_open_file_map[i].h5_file_id = H5I_INVALID_HID; + sf_open_file_map[i].sf_context_id = H5I_INVALID_HID; + return; + } + } + } +} - if (sf_verbose_flag) { - for(ioc=0; ioc < n_io_concentrators; ioc++) { - int64_t sourceOffset = source_data_offset[ioc]; - printf("[%d %s]: read_source[ioc(%d), sourceOffset=%ld, datasize=%ld, foffset=%ld]\n", - sf_world_rank, __func__, ioc, sourceOffset, ioc_read_datasize[ioc], ioc_read_offset[ioc] ); - } - } +/*------------------------------------------------------------------------- + * Function: active_map_entries + * + * Purpose: Count the number of entries that have valid h5_file_id + * values. + * + * Return: The number of active map entries (can be zero). + * Errors: Cannot fail. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +active_map_entries(void) +{ + int i, map_entries = 0; + for (i = 0; i < sf_file_map_size; i++) { + if (sf_open_file_map[i].h5_file_id != H5I_INVALID_HID) { + map_entries++; + } + } + return map_entries; +} + +/*------------------------------------------------------------------------- + * Function: init__indep_io + * + * Purpose: Utility function to initialize the set of IO transactions + * used to communicate with IO concentrators for read and write + * IO operations. + * + * Return: A filled set of vectors (1 entry per IO concentrator) which + * fully describe the IO transactions for read and writes. + * At most, every IO concentrator will have a descriptor which + * identifies the local memory offset, the virtual FILE offset, + * and the total length of the IO which will be sent to or + * received from the individual IOCs. + * + * For IO operations which involve a subset of IO concentrators, + * the vector entries for the unused IOCs will have lengths of + * zero and MPI NULL datatypes. + * + * Errors: Cannot fail. 
+ * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +static int +init__indep_io(subfiling_context_t *sf_context, int64_t *sf_source_data_offset, + int64_t *sf_datasize, int64_t *sf_offset, MPI_Datatype *sf_dtype, + int64_t offset, int64_t elements, int dtype_extent) +{ + + int container_count = sf_context->topology->n_io_concentrators; + int64_t stripe_size = sf_context->sf_stripe_size; + int64_t data_size = elements * dtype_extent; + + int64_t start_id = offset / stripe_size; + int64_t offset_in_stripe = offset % stripe_size; + int64_t start_length = MIN(data_size, (stripe_size - offset_in_stripe)); + int64_t start_row = start_id / container_count; + int64_t ioc_start = start_id % container_count; + + int64_t final_offset = offset + data_size; + int64_t final_id = final_offset / stripe_size; + int64_t final_length = + (start_length == data_size ? 0 : final_offset % stripe_size); + int64_t ioc_final = final_id % container_count; + int64_t container_bytes, total_bytes = 0; + int64_t source_offset = 0; + + int row_id_start = (int) (start_id - ioc_start); + int row_id_final = (int) (final_id - ioc_final); + int i, k, depth = ((row_id_final - row_id_start) / container_count) + 1; + int container_id = (int) start_id; + int64_t row_offset = (int64_t)(start_row * stripe_size); + + for (i = 0, k = (int) ioc_start; i < container_count; i++) { + int container_depth = depth; + hbool_t is_first = false, is_last = false; + container_bytes = 0; + sf_datasize[k] = container_bytes; + if (total_bytes < data_size) { + if (k == ioc_start) { + is_first = true; + container_bytes = start_length; + container_depth--; /* Account for the start_length */ + if (ioc_final < ioc_start) { + container_depth--; + depth--; + } + } + if (k == ioc_final) { + is_last = true; + container_bytes += final_length; + if (container_depth) + container_depth--; /* Account for the final_length */ + if (depth) + depth--; + } + container_bytes += container_depth * stripe_size; + total_bytes += container_bytes; + } + + sf_source_data_offset[k] = source_offset; + sf_datasize[k] = container_bytes; + sf_offset[k] = row_offset + offset_in_stripe; + + if (container_count == 1) { + sf_dtype[k] = MPI_BYTE; + } else { + /* Fill the IO datatypes */ + if (is_first) { + if (is_last) { /* First + Last */ + sf_dtype[k] = H5FD__create_f_l_mpi_type(sf_context, + container_depth + 1, sf_offset[k], container_bytes, + start_length, final_length); + } else { /* First ONLY */ + sf_dtype[k] = + H5FD__create_first_mpi_type(sf_context, container_depth, + sf_offset[k], container_bytes, start_length); + } + source_offset += start_length; + offset_in_stripe = 0; + } else if (is_last) { /* Last ONLY */ + source_offset += stripe_size; + sf_dtype[k] = H5FD__create_final_mpi_type( + sf_context, container_depth, container_bytes, final_length); + } else { /* Everything else (uniform) */ + source_offset += stripe_size; + sf_dtype[k] = H5FD__create_mpi_uniform_type( + sf_context, container_depth, container_bytes); + } + } + k++; + container_id++; + + if (k == container_count) { + k = 0; + depth = ((row_id_final - container_id) / container_count) + 1; + row_offset += stripe_size; + } + } + if (total_bytes != data_size) { + printf("Error: total_bytes != data_size\n"); + } + + return 0; +} + +/*------------------------------------------------------------------------- + * Function: H5FD__init_subfile_context + * + * Purpose: Called as part of the HDF5 
file + subfiling opening. + * This initializes the subfiling context and associates + * this context with the specific HDF5 file. + * + * Return: Success (0) or Faiure (-1) + * Errors: If MPI operations fail for some reason. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + *------------------------------------------------------------------------- + */ +int +H5FD__init_subfile_context(sf_topology_t *thisApp, int n_iocs, int world_rank, + subfiling_context_t *newContext) +{ + static MPI_Comm sf_msg_comm = MPI_COMM_NULL; + static MPI_Comm sf_data_comm = MPI_COMM_NULL; + + assert(newContext != NULL); + + if (newContext->topology != thisApp) { + int status; + char *envValue = NULL; + + newContext->topology = thisApp; + newContext->sf_msg_comm = sf_msg_comm; + newContext->sf_data_comm = sf_data_comm; + newContext->sf_group_comm = MPI_COMM_NULL; + newContext->sf_intercomm = MPI_COMM_NULL; + newContext->sf_stripe_size = DEFAULT_STRIPE_SIZE; + newContext->sf_write_count = 0; + newContext->sf_read_count = 0; + newContext->sf_eof = 0; + if ((envValue = getenv("IOC_STRIPE_SIZE")) != NULL) { + long value_check = atol(envValue); + if (value_check > 0) { + newContext->sf_stripe_size = (int64_t) value_check; + } + } + if ((envValue = getenv("SUBFILE_PREFIX")) != NULL) { + char temp[PATH_MAX]; + sprintf(temp, "%s", envValue); + newContext->subfile_prefix = strdup(temp); + sf_subfile_prefix = strdup(temp); + } + newContext->sf_blocksize_per_stripe = + newContext->sf_stripe_size * n_iocs; + if (sf_msg_comm == MPI_COMM_NULL) { + status = MPI_Comm_dup(MPI_COMM_WORLD, &newContext->sf_msg_comm); + if (status != MPI_SUCCESS) + goto err_exit; + status = MPI_Comm_set_errhandler( + newContext->sf_msg_comm, MPI_ERRORS_RETURN); + if (status != MPI_SUCCESS) + goto err_exit; + sf_msg_comm = newContext->sf_msg_comm; + } + if (sf_data_comm == MPI_COMM_NULL) { + status = MPI_Comm_dup(MPI_COMM_WORLD, &newContext->sf_data_comm); + if (status != MPI_SUCCESS) + goto err_exit; + status = MPI_Comm_set_errhandler( + newContext->sf_data_comm, MPI_ERRORS_RETURN); + if (status != MPI_SUCCESS) + goto err_exit; + sf_data_comm = newContext->sf_data_comm; + } + if (n_iocs > 1) { + status = MPI_Comm_split(MPI_COMM_WORLD, thisApp->rank_is_ioc, + world_rank, &newContext->sf_group_comm); + if (status != MPI_SUCCESS) + goto err_exit; + status = MPI_Comm_size( + newContext->sf_group_comm, &newContext->sf_group_size); + if (status != MPI_SUCCESS) + goto err_exit; + status = MPI_Comm_rank( + newContext->sf_group_comm, &newContext->sf_group_rank); + if (status != MPI_SUCCESS) + goto err_exit; + /* + * There may be additional functionality we need for the IOCs... + * If so, then can probably initialize those things here! + */ + } else { + newContext->sf_group_comm = MPI_COMM_SELF; + newContext->sf_group_size = 1; + newContext->sf_group_rank = 0; + } + } + return 0; + +err_exit: + return -1; +} + +/*------------------------------------------------------------------------- + * Function: Internal read__independent. + * + * Purpose: The IO operations can be striped across a selection of + * IO concentrators. The read and write independent calls + * compute the group of 1 or more IOCs and further create + * derived MPI datatypes when required by the size of the + * contiguous read or write requests. + * + * IOC(0) contains the logical data storage for file offset + * zero and all offsets that reside within modulo range of + * the subfiling stripe_size. 
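The striping rule described above (IOC(0) owns file offset zero and every offset that maps back to it modulo the stripe layout) is easiest to see with numbers. The standalone example below applies the same arithmetic init__indep_io uses to a made-up offset and transfer size; every value here is illustrative, including the 4 KiB stripe size.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const int     n_iocs      = 4;
    const int64_t stripe_size = 4096;    /* illustrative, not the driver default */
    const int64_t offset      = 10000;   /* virtual file offset of the IO request */
    const int64_t data_size   = 12000;   /* bytes to transfer */

    int64_t start_id         = offset / stripe_size;            /* stripe 2 */
    int64_t offset_in_stripe = offset % stripe_size;             /* 1808 bytes in */
    int64_t start_length     = stripe_size - offset_in_stripe;   /* 2288 bytes to the stripe edge */
    if (start_length > data_size)
        start_length = data_size;
    int64_t ioc_start        = start_id % n_iocs;                /* first segment goes to IOC 2 */

    int64_t final_offset     = offset + data_size;
    int64_t final_id         = final_offset / stripe_size;       /* ends in stripe 5 */
    int64_t ioc_final        = final_id % n_iocs;                /* trailing segment lands on IOC 1 */

    printf("first segment: IOC %ld gets %ld bytes, %ld bytes into stripe %ld\n",
        (long) ioc_start, (long) start_length, (long) offset_in_stripe, (long) start_id);
    printf("final segment: stripe %ld on IOC %ld\n", (long) final_id, (long) ioc_final);
    return 0;
}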
+ * + * We cycle through all 'n_io_conentrators' and send a + * descriptor to each IOC that has a non-zero sized IO + * request to fullfill. + * + * Sending descriptors to an IOC usually gets an ACK or + * NACK in response. For the read operations, we post + * asynch READs to receive the file data and wait until + * all pending operations have completed. + * + * Return: Success (0) or Faiure (non-zero) + * Errors: If MPI operations fail for some reason. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + *------------------------------------------------------------------------- + */ +static int +read__independent(int n_io_concentrators, hid_t context_id, int64_t offset, + int64_t elements, int dtype_extent, void *data) +{ + int i, ioc, n_waiting = 0, status = 0; + int * io_concentrator = NULL; + int indices[n_io_concentrators]; + MPI_Request reqs[n_io_concentrators]; + MPI_Status stats[n_io_concentrators]; + int64_t source_data_offset[n_io_concentrators]; + int64_t ioc_read_datasize[n_io_concentrators]; + int64_t ioc_read_offset[n_io_concentrators]; + MPI_Datatype ioc_read_type[n_io_concentrators]; + + subfiling_context_t *sf_context = get_subfiling_object(context_id); + assert(sf_context != NULL); + + /* Note that the sf_write_count is only tracked by an IOC rank */ + if (sf_context->sf_write_count && (sf_context->sf_fid > 0)) { + fdatasync(sf_context->sf_fid); + } + + io_concentrator = sf_context->topology->io_concentrator; + if (init__indep_io(sf_context, source_data_offset, ioc_read_datasize, + ioc_read_offset, ioc_read_type, offset, elements, + dtype_extent) < 0) { + return -1; + } /* Prepare the IOCs with a message which indicates the length - * and file offset for the actual data to be provided. + * and file offset for the actual data to be provided. */ - for(ioc=0; ioc < n_io_concentrators; ioc++) { - int64_t msg[2] = {ioc_read_datasize[ioc], ioc_read_offset[ioc]}; - char *sourceData = (char *)data; + for (ioc = 0; ioc < n_io_concentrators; ioc++) { + int64_t msg[3] = {ioc_read_datasize[ioc], ioc_read_offset[ioc], + sf_context->sf_context_id}; + char * sourceData = (char *) data; int64_t sourceOffset = source_data_offset[ioc]; - + int packsize = 0; + // printf("[%d] %s: context_id = 0x%lx\n", sf_world_rank, __func__, + // sf_context->sf_context_id); /* We may not require data from this IOC... * or we may read the data directly from the file! * Check the size to verify! 
@@ -848,27 +1227,46 @@ int sf_read_independent(hid_t context_id, int64_t offset, int64_t elements, int continue; } - if (sf_verbose_flag ) { - printf("[%d %s] Requesting %ld read bytes from IOC(%d): sourceOffset=%ld\n", - sf_world_rank, __func__, msg[0], io_concentrator[ioc], sourceOffset ); +#ifndef NDEBUG + if (sf_verbose_flag) { +#if 0 + if (sf_logfile) { + fprintf(sf_logfile, + "[%d %s] Requesting %ld read bytes from IOC(%d): " + "sourceOffset=%ld subfile_offset=%ld\n", + sf_world_rank, __func__, msg[0], io_concentrator[ioc], + sourceOffset, msg[1]); + } +#else + fprintf(stdout, + "[%d %s] Requesting %ld read bytes from IOC(%d): " + "sourceOffset=%ld subfile_offset=%ld\n", + sf_world_rank, __func__, msg[0], io_concentrator[ioc], + sourceOffset, msg[1]); + fflush(stdout); +#endif } +#endif - status = MPI_Ssend(msg, 2, MPI_INT64_T, io_concentrator[ioc], READ_INDEP, sf_context->sf_msg_comm); + status = MPI_Ssend(msg, 3, MPI_INT64_T, io_concentrator[ioc], + READ_INDEP, sf_context->sf_msg_comm); if (status != MPI_SUCCESS) { printf("[%d] MPI_Send failure!", sf_world_rank); return status; - } - else { + } else { if (ioc_read_type[ioc] == MPI_BYTE) { int bytes = (int) ioc_read_datasize[ioc]; - status = MPI_Irecv(&sourceData[sourceOffset], bytes, ioc_read_type[ioc], io_concentrator[ioc], - READ_INDEP_DATA, sf_context->sf_data_comm, &reqs[ioc]); + status = MPI_Irecv(&sourceData[sourceOffset], bytes, + ioc_read_type[ioc], io_concentrator[ioc], READ_INDEP_DATA, + sf_context->sf_data_comm, &reqs[ioc]); } else { - status = MPI_Irecv(&sourceData[sourceOffset], 1, ioc_read_type[ioc], io_concentrator[ioc], - READ_INDEP_DATA, sf_context->sf_data_comm, &reqs[ioc]); + MPI_Pack_size(1, ioc_read_type[ioc], MPI_COMM_WORLD, &packsize); + status = MPI_Irecv(&sourceData[sourceOffset], 1, + ioc_read_type[ioc], io_concentrator[ioc], READ_INDEP_DATA, + sf_context->sf_data_comm, &reqs[ioc]); } if (status != MPI_SUCCESS) { - int length = 256; + int length = 256; char error_string[length]; MPI_Error_string(status, error_string, &length); printf("(%s) MPI_Irecv error: %s\n", __func__, error_string); @@ -876,419 +1274,1013 @@ int sf_read_independent(hid_t context_id, int64_t offset, int64_t elements, int } n_waiting++; } - } - /* We've queued all of the Async READs, now we just need to + /* We've queued all of the Async READs, now we just need to * complete them in any order... */ - while(n_waiting) { + while (n_waiting) { int ready = 0; status = MPI_Waitsome(n_io_concentrators, reqs, &ready, indices, stats); if (status != MPI_SUCCESS) { - int len; - char estring[MPI_MAX_ERROR_STRING]; - MPI_Error_string(status, estring, &len); - printf("[%d %s] MPI_ERROR! 
MPI_Waitsome returned an error(%s)\n", - sf_world_rank, __func__, estring ); - fflush(stdout); + int length = 256; + char error_string[length]; + MPI_Error_string(status, error_string, &length); + printf("(%s) MPI_Waitsome error: %s\n", __func__, error_string); + for (i = 0; i < n_waiting; i++) { + printf( + "stats[%d].SOURCE=%d, stats.TAG=%d, stats.MPI_ERROR=%d\n", + i, stats[i].MPI_SOURCE, stats[i].MPI_TAG, + stats[i].MPI_ERROR); + fflush(stdout); + } + return status; } - for(i=0; i < ready; i++) { - ioc = io_concentrator[indices[i]]; + for (i = 0; i < ready; i++) { +#ifndef NDEBUG if (sf_verbose_flag) { - printf("[%d] READ bytes(%ld) of data from ioc_concentrator %d complete\n", - sf_world_rank, ioc_read_datasize[indices[i]] , ioc); - fflush(stdout); +#if 0 + if (sf_logfile) { + fprintf(sf_logfile, + "[%d] READ bytes(%ld) of data from ioc_concentrator %d " + "complete\n", + sf_world_rank, ioc_read_datasize[indices[i]], + indices[i]); + } +#else + fprintf(stdout, + "[%d] READ bytes(%ld) of data from ioc_concentrator %d " + "complete\n", + sf_world_rank, ioc_read_datasize[indices[i]], + indices[i]); + fflush(stdout); +#endif } - n_waiting--; +#endif + if (ioc_read_type[indices[i]] != MPI_BYTE) { + MPI_Type_free(&ioc_read_type[indices[i]]); + } + n_waiting--; } } return status; } +/*------------------------------------------------------------------------- + * Function: Public/Client sf_read_independent + * + * Purpose: A public function which wraps the Internal version + * and allows the addition of the additional 'n_io_concentrator' + * argument. This is important as it allows me to skip + * memory allocation functions since storage for the various + * vector variables is on the call stack... + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +sf_read_independent(hid_t sf_fid, int64_t offset, int64_t elements, + int dtype_extent, void *data) +{ + hid_t sf_context_id = fid_map_to_context(sf_fid); + subfiling_context_t *sf_context = get_subfiling_object(sf_context_id); + assert(sf_context != NULL); + return read__independent(sf_context->topology->n_io_concentrators, + sf_context_id, offset, elements, dtype_extent, data); +} -int sf_write_independent(hid_t context_id, int64_t offset, int64_t elements, int dtype_extent, void *data) +/*------------------------------------------------------------------------- + * Function: Public/Client sf_read_vector + * + * Purpose: Another read__independent wrapper. In this instance + * we simply loop over then collection of vector entries + * and call the sf__read_independent function. + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
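The vector wrapper defined just below forwards each (address, size, buffer) triple to the independent read path, one entry at a time. Here is a hedged usage sketch with made-up offsets and sizes; the prototype is repeated locally only because the private header that declares it is not part of this hunk.

#include "hdf5.h"

extern herr_t sf_read_vector(hid_t h5_fid, hssize_t count, haddr_t addrs[],
                             hsize_t sizes[], void *bufs[]);

static herr_t
read_three_chunks(hid_t h5_fid, char *buf0, char *buf1, char *buf2)
{
    haddr_t addrs[3] = { 0, 1048576, 4194304 };   /* virtual file offsets */
    hsize_t sizes[3] = { 4096, 8192, 65536 };     /* bytes per entry */
    void   *bufs[3]  = { buf0, buf1, buf2 };

    /* Returns SUCCEED only if every entry's independent read succeeds */
    return sf_read_vector(h5_fid, 3, addrs, sizes, bufs);
}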
+ * + *------------------------------------------------------------------------- + */ +herr_t +sf_read_vector(hid_t h5_fid, hssize_t count, haddr_t addrs[], hsize_t sizes[], + void *bufs[] /* data_out */) { - static int *acks = NULL; - static int *indices = NULL; - static MPI_Request *reqs = NULL; - static MPI_Request *completed = NULL; - static MPI_Status *stats = NULL; - static int64_t *source_data_offset = NULL; - static int64_t *ioc_write_datasize = NULL; - static int64_t *ioc_write_offset = NULL; - static MPI_Datatype *ioc_write_type = NULL; - - subfiling_context_t *sf_context = get_subfiling_object(context_id); - int i, target, ioc, n_waiting = 0, status = 0; - int awaiting_completion = 0; - int errors = 0; - if (acks == NULL) { - if ((acks = (int *)calloc((size_t)n_io_concentrators*2, sizeof(int))) == NULL) { - perror("calloc"); - return -1; - } - else indices = &acks[n_io_concentrators]; - } - if (reqs == NULL) { - if ((reqs = (MPI_Request *)calloc((size_t)n_io_concentrators, sizeof(MPI_Request))) == NULL) { - perror("calloc"); - return -1; - } - } - if (completed == NULL) { - if ((completed = (MPI_Request *)calloc((size_t)n_io_concentrators, sizeof(MPI_Request))) == NULL) { - perror("calloc"); - return -1; + hssize_t k; + herr_t ret_value = SUCCEED; + hid_t sf_context_id = fid_map_to_context(h5_fid); + subfiling_context_t *sf_context = get_subfiling_object(sf_context_id); + + assert(sf_context != NULL); + + /* Unfortunately, we cannot know whether an incoming vector represents + * (as a whole) a contiguous block of data. Certainly each vector entry + * is a contiguous block of data. There is a temptation of course to + * attempt to merge multiple vector instances into a single MPI write + * by utilizing MPI datatypes. At this time we don't attempt to + * consolidate multiple vector entries and are thus forced to loop + * over the vector, sending one a vector entry at a time. + */ + for (k = 0; k < (int32_t) count; k++) { + if (read__independent(sf_context->topology->n_io_concentrators, + sf_context_id, (int64_t) addrs[k], (int64_t) sizes[k], 1, + bufs[k]) != 0) { + printf("%s - encountered an internal error!\n", __func__); + goto errors; } } - if (stats == NULL) { - if ((stats = (MPI_Status *)calloc((size_t)n_io_concentrators, sizeof(MPI_Status))) == NULL) { - perror("calloc"); - return -1; - } + return ret_value; + +errors: + return FAIL; +} + +/*------------------------------------------------------------------------- + * Function: Internal write__independent. + * + * Purpose: The IO operations can be striped across a selection of + * IO concentrators. The read and write independent calls + * compute the group of 1 or more IOCs and further create + * derived MPI datatypes when required by the size of the + * contiguous read or write requests. + * + * IOC(0) contains the logical data storage for file offset + * zero and all offsets that reside within modulo range of + * the subfiling stripe_size. + * + * We cycle through all 'n_io_conentrators' and send a + * descriptor to each IOC that has a non-zero sized IO + * request to fullfill. + * + * Sending descriptors to an IOC usually gets an ACK or + * NACK in response. For the write operations, we post + * asynch READs to receive ACKs from IOC ranks that have + * allocated memory receive the data to write to the + * subfile. Upon receiving an ACK, we send the actual + * user data to the IOC. + * + * Return: Success (0) or Faiure (non-zero) + * Errors: If MPI operations fail for some reason. 
+ * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + *------------------------------------------------------------------------- + */ +static int +write__independent(int n_io_concentrators, hid_t context_id, int64_t offset, + int64_t elements, int dtype_extent, const void *data) +{ + int * io_concentrator = NULL; + int acks[n_io_concentrators]; + int indices[n_io_concentrators]; + MPI_Request reqs[n_io_concentrators]; + MPI_Status stats[n_io_concentrators]; + int64_t source_data_offset[n_io_concentrators]; + int64_t ioc_write_datasize[n_io_concentrators]; + int64_t ioc_write_offset[n_io_concentrators]; + MPI_Datatype ioc_write_type[n_io_concentrators]; + + subfiling_context_t *sf_context = get_subfiling_object(context_id); + int i, target, ioc, n_waiting = 0, status = 0; + int errors = 0; + + io_concentrator = sf_context->topology->io_concentrator; + + if (sf_context->topology->rank_is_ioc) { + sf_context->sf_write_count++; } - if (init__indep_io(sf_context, &source_data_offset, &ioc_write_datasize, &ioc_write_offset, - &ioc_write_type, offset, elements, dtype_extent) < 0) { + /* The following function will initialize the collection of IO transfer + * parameters, i.e. local memory (source) offsets, target file offsets, + * target data sizes (in bytes), and a MPI Datatype for each of the + * IO concentrator transactions. + * + * For small transfers, at least 1 IOC instance will have valid info. + * For larger transfers, it is likely that the full set of + * n_io_concentrators will be utilized. If the total transaction size is + * less than n_io_concentrators X stripe_size, then the MPI datatype should + * probably be MPI_BYTE. Larger tranactions will create MPI derived + * datatypes to span the entire logical collection of stripes. Said + * differently, the largest IO requests will require a stripe depth greater + * than one. + */ + if (init__indep_io(sf_context, source_data_offset, ioc_write_datasize, + ioc_write_offset, ioc_write_type, offset, elements, + dtype_extent) < 0) { return -1; - } - - if (sf_verbose_flag) { - for(ioc=0; ioc < n_io_concentrators; ioc++) { - int64_t sourceOffset = source_data_offset[ioc]; - printf("[%d %s]: write_dest[ioc(%d), sourceOffset=%ld, datasize=%ld, foffset=%ld]\n", - sf_world_rank, __func__, ioc, sourceOffset, - ioc_write_datasize[ioc], ioc_write_offset[ioc] ); - } - } + } /* Prepare the IOCs with a message which indicates the length * of the actual data to be written. We also provide the file * offset so that when the IOC recieves the data (in whatever order) * they can lseek to the correct offset and write the data. + * + * NOTE: we use 'pwrite' which provides the seek functionality + * as part of the API. 
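+     *
+     * The header message sent to each IOC below is three int64_t values:
+     *   msg[0] = ioc_write_datasize[ioc]   -- bytes this IOC will write
+     *   msg[1] = ioc_write_offset[ioc]     -- target offset within the subfile
+     *   msg[2] = sf_context->sf_context_id -- identifies the subfiling context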
*/ - for(target=0; target < n_io_concentrators; target++) { + for (target = 0; target < n_io_concentrators; target++) { int64_t sourceOffset; - int64_t msg[2] = {0,}; - char *sourceData = (char *)data; - ioc = (sf_world_rank + target) % n_io_concentrators; - - sourceOffset = source_data_offset[ioc]; - msg[0] = ioc_write_datasize[ioc]; - msg[1] = ioc_write_offset[ioc]; + int64_t msg[3] = { + 0, + }; + const char *sourceData = (const char *) data; + ioc = (sf_world_rank + target) % n_io_concentrators; + + sourceOffset = source_data_offset[ioc]; + msg[0] = ioc_write_datasize[ioc]; + msg[1] = ioc_write_offset[ioc]; + msg[2] = sf_context->sf_context_id; acks[ioc] = 0; reqs[ioc] = MPI_REQUEST_NULL; if (ioc_write_datasize[ioc] == 0) { - if (sf_verbose_flag) { - printf("[%d %s] skipping ioc(%d) send datasize = %ld\n", - sf_world_rank,__func__, ioc, ioc_write_datasize[ioc]); - fflush(stdout); - } continue; } - if ( sf_verbose_flag ) { - printf("[%d] Datatype(%x) Sending to ioc(%d) %ld bytes of data with file_offset=%ld\n", - sf_world_rank, ioc_write_type[ioc], ioc, ioc_write_datasize[ioc], ioc_write_offset[ioc]); - fflush(stdout); - } + +#ifndef NDEBUG + if (sf_verbose_flag) + { +#if 0 + if (sf_logfile) { + fprintf(sf_logfile, + "[%d %s]: write_dest[ioc(%d), " + "sourceOffset=%ld, datasize=%ld, foffset=%ld]\n", + sf_world_rank, __func__, ioc, sourceOffset, + ioc_write_datasize[ioc], ioc_write_offset[ioc]); + } +#else + fprintf(stdout, + "[%d %s]: write_dest[ioc(%d), " + "sourceOffset=%ld, datasize=%ld, foffset=%ld]\n", + sf_world_rank, __func__, ioc, sourceOffset, + ioc_write_datasize[ioc], ioc_write_offset[ioc]); + fflush(stdout); +#endif + } +#endif + /* Send the Message HEADER which indicates the requested IO operation * (via the message TAG) along with the data size and file offset. */ - status = MPI_Ssend(msg, 2, MPI_INT64_T, io_concentrator[ioc], - WRITE_INDEP, sf_context->sf_msg_comm); + status = MPI_Ssend(msg, 3, MPI_INT64_T, io_concentrator[ioc], + WRITE_INDEP, sf_context->sf_msg_comm); if (status != MPI_SUCCESS) { - int len; + int len; char estring[MPI_MAX_ERROR_STRING]; MPI_Error_string(status, estring, &len); - printf("[%d] ERROR! MPI_Send of %ld bytes to %d returned an error(%s)\n", - sf_world_rank, sizeof(msg), io_concentrator[ioc], estring ); + printf("[%d] ERROR! MPI_Send of %ld bytes to %d returned an " + "error(%s)\n", + sf_world_rank, sizeof(msg), io_concentrator[ioc], estring); fflush(stdout); + break; /* If unable to send to an IOC, we can call it quits... */ } - status = MPI_Recv(&acks[ioc], 1, MPI_INT, io_concentrator[ioc], WRITE_INDEP_ACK, - sf_context->sf_data_comm, &stats[ioc]); + /* Wait for memory to be allocated on the target IOC so that we can + * start sending user data to this IOC. + * FIXME: We could possibly use Irecv for handling ACKs. This could + * potentially allow some additional overlap of posting IO requests + * to the collection of IO Concentrators. + */ + status = MPI_Recv(&acks[ioc], 1, MPI_INT, io_concentrator[ioc], + WRITE_INDEP_ACK, sf_context->sf_data_comm, &stats[ioc]); if (status == MPI_SUCCESS) { +#ifndef NDEBUG if (sf_verbose_flag) { - printf("[%d] received ack(%d) from ioc(%d)\n",sf_world_rank, acks[ioc], ioc); - fflush(stdout); + if (sf_logfile) { + fprintf(sf_logfile, "[%d] received ack(%d) from ioc(%d)\n", + sf_world_rank, acks[ioc], ioc); + } } +#endif + /* No errors, start sending data to the IOC. + * If the data transfer is small enough, we don't utilize a + * derived MPI type, i.e. we use MPI_BYTE. 
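+             * Larger transfers use the derived datatype built by
+             * init__indep_io, which spans this IOC's multiple stripes,
+             * so a single Issend with count 1 covers the whole request.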
+ */ if (acks[ioc] > 0) { - if (ioc_write_type[ioc] == MPI_BYTE) { - int datasize = (int)(ioc_write_datasize[ioc] & INT32_MASK); + if (ioc_write_type[ioc] == MPI_BYTE) { + int datasize = (int) (ioc_write_datasize[ioc] & INT32_MASK); status = MPI_Issend(&sourceData[sourceOffset], datasize, - MPI_BYTE, io_concentrator[ioc], WRITE_INDEP_DATA, - sf_context->sf_data_comm,&reqs[ioc]); - } - else { - status = MPI_Issend(&sourceData[sourceOffset], 1, ioc_write_type[ioc], - io_concentrator[ioc], WRITE_INDEP_DATA, - sf_context->sf_data_comm,&reqs[ioc]); + MPI_BYTE, io_concentrator[ioc], WRITE_INDEP_DATA, + sf_context->sf_data_comm, &reqs[ioc]); + } else { + status = MPI_Issend(&sourceData[sourceOffset], 1, + ioc_write_type[ioc], io_concentrator[ioc], + WRITE_INDEP_DATA, sf_context->sf_data_comm, &reqs[ioc]); } - /* Queued another Isend which need to be completed (below) */ n_waiting++; } } else { errors++; puts("ACK error!"); fflush(stdout); + break; } + + /* Check the status of our MPI_Issend... */ if (status != MPI_SUCCESS) { errors++; printf("[%d] ERROR! Unable to Send data to ioc(%d)\n", - sf_world_rank, ioc); + sf_world_rank, ioc); fflush(stdout); + break; } } - while(n_waiting) { + /* Wait for the Issends to complete (in any order) */ + while (n_waiting) { int ready = 0; status = MPI_Waitsome(n_io_concentrators, reqs, &ready, indices, stats); if (status != MPI_SUCCESS) { - int len; + int len; char estring[MPI_MAX_ERROR_STRING]; MPI_Error_string(status, estring, &len); printf("[%d %s] MPI_ERROR! MPI_Waitsome returned an error(%s)\n", - sf_world_rank, __func__, estring ); + sf_world_rank, __func__, estring); fflush(stdout); errors++; } - for(i=0; i < ready; i++) { + for (i = 0; i < ready; i++) { /* One of the Issend calls has completed - * Wait for another ACK to indicate that the data as been written - * to the subfile. + * If we used a derived type to send data, then should free + * that datatype instance. */ + if (ioc_write_type[indices[i]] != MPI_BYTE) { + MPI_Type_free(&ioc_write_type[indices[i]]); + } + n_waiting--; + } + } + if (errors) + return -1; + return status; +} + +/*------------------------------------------------------------------------- + * Function: Public/Client sf_write_independent + * + * Purpose: A public function which wraps the Internal version + * and allows the addition of the additional 'n_io_concentrator' + * argument. This is important as it allows me to skip + * memory allocation functions since storage for the various + * vector variables is on the call stack... + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +sf_write_independent(hid_t sf_fid, int64_t offset, int64_t elements, + int dtype_extent, const void *data) +{ + hid_t sf_context_id = fid_map_to_context(sf_fid); + subfiling_context_t *sf_context = get_subfiling_object(sf_context_id); + + assert(sf_context != NULL); + return write__independent(sf_context->topology->n_io_concentrators, + sf_context_id, offset, elements, dtype_extent, data); +} + +/*------------------------------------------------------------------------- + * Function: Public/Client sf_write_vector + * + * Purpose: Another write__independent wrapper. 
As with the + * sf_read_vector function, we simply loop over the vector + * elements and call the underlying write_independent function. + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +herr_t +sf_write_vector(hid_t h5_fid, hssize_t count, haddr_t addrs[], hsize_t sizes[], + void *bufs[] /* data_in */) +{ + hssize_t k; + herr_t ret_value = SUCCEED; + hid_t sf_context_id = fid_map_to_context(h5_fid); + subfiling_context_t *sf_context = get_subfiling_object(sf_context_id); + + assert(sf_context != NULL); + + /* + * Call the underlying write function for each vector element. + */ + for (k = 0; k < count; k++) { + if (write__independent(sf_context->topology->n_io_concentrators, + sf_context_id, (int64_t) addrs[k], (int64_t) sizes[k], 1, + bufs[k]) < 0) { + printf("%s - encountered an internal error!\n", __func__); + goto errors; + } + } + return ret_value; + +errors: + return FAIL; +} + +int +sf_truncate(hid_t h5_fid, haddr_t H5_ATTR_PARALLEL_UNUSED addr) +{ + hid_t sf_context_id = fid_map_to_context(h5_fid); + subfiling_context_t *sf_context = get_subfiling_object(sf_context_id); + + assert(sf_context != NULL); + #if 0 - acks[indices[i]] = 0; - MPI_Irecv(&acks[indices[i]], 1, MPI_INT, io_concentrator[indices[i]], COMPLETED, sf_context->sf_data_comm, &completed[indices[i]]); - awaiting_completion++; + if (sf_context->topology->n_io_concentrators > 1) { + if (MPI_Allreduce(&addr_in, &addr_max, 1, MPI_INT64_T, MPI_MAX, sf_context->sf_data_comm) != MPI_SUCCESS) { + addr_max = (int64_t)addr; + } + } + if (sf_context->topology->rank_is_ioc) { + int container_count = sf_context->topology->n_io_concentrators; + int64_t stripe_size = sf_context->sf_stripe_size; + int64_t addr_max_stripe_id = addr_max / stripe_size; + int64_t offset_in_stripe = addr_max % stripe_size; + int max_row = (int)(addr_max_stripe_id / container_count); + int addr_max_ioc = (int)(addr_max_stripe_id % container_count); + /* + * Subfiling storage can be thought of as a 2D array in which each row + * contains N columns (containers). The containers have a fixed width + * so that number of bytes in any "row" is (# of containers) X stripe_size. + * + * Given any offset, we can identify the 'row' of the specified offset + * as well as the offset within row and thus the specific container and + * actual offset within that container. + */ + int64_t row_start = max_row * stripe_size; + int64_t container_addr_max = row_start + stripe_size; + if (sf_context->topology->subfile_rank == addr_max_ioc) { + container_addr_max = row_start + offset_in_stripe; + } + else if (sf_context->topology->subfile_rank < addr_max_ioc) { + container_addr_max = row_start + stripe_size; + } + if(-1 == HDftruncate(sf_context->sf_fid, (HDoff_t)container_addr_max)) { + puts("truncate failed!"); + return -1; + } + } #endif - n_waiting--; + return 0; +} + +/*------------------------------------------------------------------------- + * Function: Internal close__subfiles + * + * Purpose: When closing and HDF5 file, we need to close any associated + * subfiles as well. This function cycles through all known + * IO Concentrators to send a file CLOSE_OP command. 
+ * + * This function is collective across all MPI ranks which + * have opened HDF5 file which associated with the provided + * sf_context. Once the request has been issued by all + * ranks, the subfile at each IOC will be closed and an + * completion ACK will be received. + * + * Once the subfiles are closed, we initiate a teardown of + * the IOC and associated thread_pool threads. + * + * Return: Success (0) or Faiure (non-zero) + * Errors: If MPI operations fail for some reason. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + *------------------------------------------------------------------------- + */ +static int +close__subfiles( + subfiling_context_t *sf_context, int n_io_concentrators, hid_t fid) +{ + int i, status; + int global_errors = 0, errors = 0; + int n_waiting = 0; + int indices[n_io_concentrators]; + int ioc_acks[n_io_concentrators]; + MPI_Request reqs[n_io_concentrators]; + int * io_concentrator = sf_context->topology->io_concentrator; + + /* The map from fid to context can now be cleared */ + clear_fid_map_entry(fid); + + for (i = 0; i < n_io_concentrators; i++) { + int64_t msg[3] = {0, 0, sf_context->sf_context_id}; + status = MPI_Ssend(msg, 3, MPI_INT64_T, io_concentrator[i], CLOSE_OP, + sf_context->sf_msg_comm); + if (status == MPI_SUCCESS) { + status = MPI_Irecv(&ioc_acks[i], 1, MPI_INT, io_concentrator[i], + COMPLETED, sf_context->sf_data_comm, &reqs[i]); } + if (status != MPI_SUCCESS) { + printf("[%d] MPI close_subfiles failure!", sf_world_rank); + errors++; + } else + n_waiting++; } - while(awaiting_completion) { + + while (n_waiting) { int ready = 0; - status = MPI_Waitsome(n_io_concentrators, completed, &ready, indices, stats); + status = MPI_Waitsome( + n_io_concentrators, reqs, &ready, indices, MPI_STATUSES_IGNORE); if (status != MPI_SUCCESS) { - int len; + int len; char estring[MPI_MAX_ERROR_STRING]; MPI_Error_string(status, estring, &len); printf("[%d %s] MPI_ERROR! MPI_Waitsome returned an error(%s)\n", - sf_world_rank, __func__, estring ); + sf_world_rank, __func__, estring); fflush(stdout); errors++; } - - for(i=0; i < ready; i++) { - /* One of the Issend calls has completed - * Wait for another ACK to indicate that the data as been written - * to the subfile. - */ - acks[indices[i]] = 0; - awaiting_completion--; + for (i = 0; i < ready; i++) { + n_waiting--; } } - if (errors) return -1; - return status; + if (sf_context->topology->rank_is_ioc) { + finalize_ioc_threads(); + wait_for_thread_main(); + } + + status = MPI_Allreduce( + &errors, &global_errors, 1, MPI_INT, MPI_SUM, sf_context->sf_data_comm); + + if (status != MPI_SUCCESS) { + global_errors++; + } + return global_errors; } -int sf_close_subfiles(hid_t context_id) +/*------------------------------------------------------------------------- + * Function: Public/Client sf_close_subfiles + * + * Purpose: This is a simple wrapper function for the internal version + * which actually manages all subfile closing via commands + * to the set of IO Concentrators. + * + * Return: Success (0) or Faiure (non-zero) + * Errors: If MPI operations fail for some reason. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
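+ *
+ * Usage sketch (illustrative only; h5_fid is a hypothetical file id
+ * previously opened with the subfiling VFD):
+ *
+ *     if (sf_close_subfiles(h5_fid) != 0)
+ *         printf("one or more IOCs reported errors during close\n");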
+ *------------------------------------------------------------------------- + */ +int +sf_close_subfiles(hid_t fid) { - int i, status; - int errors = 0; - int n_waiting = 0; - int indices[n_io_concentrators]; - int ioc_acks[n_io_concentrators]; - MPI_Request reqs[n_io_concentrators]; - subfiling_context_t *sf_context = get_subfiling_object(context_id); - - for (i=0; i < n_io_concentrators; i++) { - int64_t msg[2] = {0, 0}; - status = MPI_Ssend(msg, 2, MPI_INT64_T, io_concentrator[i], CLOSE_OP, sf_context->sf_msg_comm); - if (status == MPI_SUCCESS) { - status = MPI_Irecv(&ioc_acks[i], 1, MPI_INT, io_concentrator[i], COMPLETED, sf_context->sf_data_comm, &reqs[i]); - } - if (status != MPI_SUCCESS) { - printf("[%d] MPI close_subfiles failure!", sf_world_rank); - errors++; - } - else n_waiting++; - } - while(n_waiting) { - int ready = 0; - status = MPI_Waitsome(n_io_concentrators, reqs, &ready, indices, MPI_STATUSES_IGNORE); - if (status != MPI_SUCCESS) { - int len; - char estring[MPI_MAX_ERROR_STRING]; - MPI_Error_string(status, estring, &len); - printf("[%d %s] MPI_ERROR! MPI_Waitsome returned an error(%s)\n", - sf_world_rank, __func__, estring ); - fflush(stdout); - errors++; - } - for(i=0; i < ready; i++) { - n_waiting--; - } - } - return errors; + hid_t context_id = fid_map_to_context(fid); + subfiling_context_t *sf_context = get_subfiling_object(context_id); + assert(sf_context != NULL); + return close__subfiles( + sf_context, sf_context->topology->n_io_concentrators, fid); } -int sf_open_subfiles(hid_t context_id, char *prefix, int flags) +/*------------------------------------------------------------------------- + * Function: Internal open__subfiles + * + * Purpose: While we cannot know a priori, whether an HDF client will + * need to access data across the entirety of a file, e.g. + * an individual MPI rank may read or write only small + * segments of the entire file space; this function sends + * a file OPEN_OP to every IO concentrator. + * + * Prior to opening any subfiles, the H5FDopen will have + * created an HDF5 file with the user specified naming. + * A path prefix will be selected and is available as + * an input argument. + * + * The opened HDF5 file handle will contain device and + * inode values, these being constant for all processes + * opening the shared file. The inode value is utilized + * as a key value and is associated with the sf_context + * which we recieve as one of the input arguments. + * + * IO Concentrator threads will be initialized on MPI ranks + * which have been identified via application toplogy + * discovery. The number and mapping of IOC to MPI_rank + * is part of the sf_context->topology structure. + * + * Return: Success (0) or Faiure (non-zero) + * Errors: If MPI operations fail for some reason. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
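+ *
+ * The OPEN_OP request sent to each IOC (see the loop below) carries
+ * three int64_t values:
+ *   msg[0] = flags                     -- user supplied open flags
+ *   msg[1] = fid                       -- HDF5 file id, used in subfile names
+ *   msg[2] = sf_context->sf_context_id -- subfiling context for this file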
+ *------------------------------------------------------------------------- + */ + +static int +open__subfiles(subfiling_context_t *sf_context, int n_io_concentrators, + hid_t fid, char *prefix, int flags) { - int i, status; - int n_waiting = 0; - int indices[n_io_concentrators]; - int ioc_acks[n_io_concentrators]; - MPI_Request reqs[n_io_concentrators]; - subfiling_context_t *sf_context = get_subfiling_object(context_id); - - if ((sf_context->subfile_prefix != NULL) && (prefix != NULL)) { - if (strcmp(sf_context->subfile_prefix, prefix) != 0) { - sf_context->subfile_prefix = strdup(prefix); - } - } + int i, ret, status, n_waiting = 0; + int * io_concentrator = NULL; + int indices[n_io_concentrators]; + int ioc_acks[n_io_concentrators]; + MPI_Request reqs[n_io_concentrators]; + + assert(sf_context != NULL); + + if (prefix) { + if (sf_context->subfile_prefix) { + if (strcmp(sf_context->subfile_prefix, prefix) != 0) { + sf_context->subfile_prefix = strdup(prefix); + } + } else { + sf_context->subfile_prefix = strdup(prefix); + } + sf_subfile_prefix = sf_context->subfile_prefix; + } - for (i=0; i < n_io_concentrators; i++) { - int64_t msg[2] = {flags, 0}; - if (sf_verbose_flag) { - printf("[%d] file open request (flags = %0lx)\n", sf_world_rank, msg[0]); - } - status = MPI_Ssend(msg, 2, MPI_INT64_T, io_concentrator[i], OPEN_OP, sf_context->sf_msg_comm); - if (status == MPI_SUCCESS) { - status = MPI_Irecv(&ioc_acks[i], 1, MPI_INT, io_concentrator[i], COMPLETED, sf_context->sf_data_comm, &reqs[i]); - } + /* + * Save the HDF5 file id (fid) to subfile context mapping. + * There shouldn't be any issue, but check the status and + * return if there was a problem. + */ + ret = record_fid_to_subfile(fid, sf_context->sf_context_id, NULL); + if (ret != SUCCEED) { + printf("[%d - %s] Error mapping hdf5 file to a subfiling context\n", + sf_context->topology->world_rank, __func__); + return -1; + } + + /* We already know the number of IO concentrators, but + * grab the mapping of IO concentrator to MPI ranks for our + * messaging loop. + */ + io_concentrator = sf_context->topology->io_concentrator; + + for (i = 0; i < n_io_concentrators; i++) { + int64_t msg[3] = {flags, fid, sf_context->sf_context_id}; + +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, "[%d] file open request (flags = %0lx)\n", + sf_world_rank, msg[0]); + } + } +#endif + /* Send the open_op message to an IOC */ + status = MPI_Ssend(msg, 3, MPI_INT64_T, io_concentrator[i], OPEN_OP, + sf_context->sf_msg_comm); + + /* Check for errors */ + if (status == MPI_SUCCESS) { + /* And post a receive for the open file ACK */ + status = MPI_Irecv(&ioc_acks[i], 1, MPI_INT, io_concentrator[i], + COMPLETED, sf_context->sf_data_comm, &reqs[i]); + } + + if (status != MPI_SUCCESS) { + printf("[%d] MPI close_subfiles failure!", sf_world_rank); + } else + n_waiting++; + } /* END - for loop */ + + /* Wait for all (n_waiting) ACK messages to be received */ + while (n_waiting) { + int ready = 0; + status = MPI_Waitsome( + n_io_concentrators, reqs, &ready, indices, MPI_STATUSES_IGNORE); if (status != MPI_SUCCESS) { - printf("[%d] MPI close_subfiles failure!", sf_world_rank); - } - else n_waiting++; - } - while(n_waiting) { - int ready = 0; - status = MPI_Waitsome(n_io_concentrators, reqs, &ready, indices, MPI_STATUSES_IGNORE); - if (status != MPI_SUCCESS) { - int len; + int len; char estring[MPI_MAX_ERROR_STRING]; MPI_Error_string(status, estring, &len); printf("[%d %s] MPI_ERROR! 
MPI_Waitsome returned an error(%s)\n", - sf_world_rank, __func__, estring ); + sf_world_rank, __func__, estring); fflush(stdout); - } - for(i=0; i < ready; i++) { - n_waiting--; - } - } + } - return 0; + for (i = 0; i < ready; i++) { + n_waiting--; + } + } /* END - while */ + + return 0; } - + +/*------------------------------------------------------------------------- + * Function: Public/Client open_subfiles + * + * Purpose: Wrapper for the internal 'open__subfiles' function + * Similar to the other public wrapper functions, we + * discover (via the sf_context) the number of io concentrators + * and pass that to the internal function so that vector + * storage arrays can be stack based rather than explicitly + * allocated and freed. + * + * The Internal function is resposible for sending all IOC + * instances, the (sub)file open requests. + * + * Prior to calling the internal open function, we initialize + * a new subfiling context that contains topology info and + * new MPI communicators that facilitate messaging between + * HDF5 clients and the IOCs. + * + * Return: Success (0) or Faiure (non-zero) + * Errors: If MPI operations fail for some reason. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + *------------------------------------------------------------------------- + */ int -ioc_main(subfiling_context_t *context) +sf_open_subfiles(hid_t fid, char *filename, char *prefix, int flags) { - int subfile_rank; - int flag, ret; - int max_work_depth; - int my_shutdown_flag = 0; - MPI_Status status, msg_status; - sf_work_request_t *incoming_requests = NULL; - useconds_t delay = 20; - - assert(context != NULL); - subfile_rank = context->sf_group_rank; - if (request_count_per_rank == NULL) { - request_count_per_rank = (int *)calloc((size_t)sf_world_size, sizeof(int)); - assert(request_count_per_rank != NULL); - } + int status; + int64_t context_id = -1; + subfiling_context_t *sf_context = NULL; + sf_ioc_selection_t ioc_selection; + char *option_arg = get_ioc_selection_criteria(&ioc_selection); + + status = H5FDsubfiling_init(ioc_selection, option_arg, &context_id); + if (status != SUCCEED) { + puts("H5FDsubfiling_init failed!"); + return -1; + } +#if 0 + printf("[%d %s]\n", sf_world_rank, __func__); +#endif + + sf_context = get_subfiling_object(context_id); + assert(sf_context != NULL); + + sf_context->sf_context_id = context_id; + sf_context->h5_file_id = fid; + sf_context->filename = strdup(filename); + sf_shutdown_flag = 0; + + return open__subfiles(sf_context, sf_context->topology->n_io_concentrators, + fid, prefix, flags); +} + +/*------------------------------------------------------------------------- + * Function: Public/Client set_verbose_flag + * + * Purpose: For debugging purposes, I allow a verbose setting to + * have printing of relevent information into an IOC specific + * file that is opened as a result of enabling the flag + * and closed when the verbose setting is disabled. + * + * Return: None + * Errors: None + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
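+ *
+ * Usage sketch (illustrative only): enabling the flag opens a per-IOC
+ * log file, disabling it closes that log again.
+ *
+ *     set_verbose_flag(subfile_rank, 1);   (opens ioc_<rank>.log)
+ *     ... verbose IOC activity ...
+ *     set_verbose_flag(subfile_rank, 0);   (closes the log file)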
+ *------------------------------------------------------------------------- + */ +void +set_verbose_flag(int subfile_rank, int new_value) +{ +#ifndef NDEBUG + sf_verbose_flag = (int) (new_value & 0x0FF); + + if (sf_verbose_flag) { + char logname[64]; + sprintf(logname, "ioc_%d.log", subfile_rank); + sf_logfile = fopen(logname, "w+"); + } else if (sf_logfile) { + fclose(sf_logfile); + sf_logfile = NULL; + } +#endif + return; +} - max_work_depth = MAX(8, sf_world_size * MAX_WORK_PER_RANK); - incoming_requests = (sf_work_request_t *)calloc((size_t)(max_work_depth +1), sizeof(sf_work_request_t)); - assert(incoming_requests != NULL); +/*------------------------------------------------------------------------- + * Function: Public/IOC ioc_main + * + * Purpose: This is the principal function run by the IO Concentrator + * main thread. It remains within a loop until allowed to + * exit by means of setting the 'sf_shutdown_flag'. This + * usually accomplished as part of the file close operation. + * + * The function implements an asynchronous polling approach + * for incoming messages. These messages can be thought of + * as a primitive RPC which utilizes MPI TAGs to code and + * implement the desired subfiling functionality. + * + * As each incoming message is received, it get added to + * a queue for processing by a thread_pool thread. + * The message handlers are dispatched via the + * "handle_work_request" ftn (see H5FDsubfile_thread.c) + + * Subfiling is effectively a software RAID-0 implementation + * where having multiple IO Concentrators and independent + * subfiles is equated to the multiple disks and a true + * hardware base RAID implementation. + * + * IO Concentrators are ordered according to their MPI rank. + * In the simplest interpretation, IOC(0) will always contain + * the initial bytes of the logical disk image. Byte 0 of + * IOC(1) will contain the byte written to the logical disk + * offset "stripe_size" X IOC(number). + * + * Example: If the stripe size is defined to be 256K, then + * byte 0 of subfile(1) is at logical offset 262144 of the + * file. Similarly, byte 0 of subfile(2) represents the + * logical file offset = 524288. For logical files larger + * than 'N' X stripe_size, we simply "wrap around" back to + * subfile(0). The following shows the mapping of 30 + * logical blocks of data over 3 subfiles: + * +--------+--------+--------+--------+--------+--------+ + * | blk(0 )| blk(1) | blk(2 )| blk(3 )| blk(4 )| blk(5 )| + * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) | + * +--------+--------+--------+--------+--------+--------+ + * | blk(6 )| blk(7) | blk(8 )| blk(9 )| blk(10)| blk(11)| + * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) | + * +--------+--------+--------+--------+--------+--------+ + * | blk(12)| blk(13)| blk(14)| blk(15)| blk(16)| blk(17)| + * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) | + * +--------+--------+--------+--------+--------+--------+ + * | blk(18)| blk(19)| blk(20)| blk(21)| blk(22)| blk(23)| + * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) | + * +--------+--------+--------+--------+--------+--------+ + * | blk(24)| blk(25)| blk(26)| blk(27)| blk(28)| blk(29)| + * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) | + * +--------+--------+--------+--------+--------+--------+ + * + * Return: None + * Errors: None + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
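+ *
+ * The block-to-IOC mapping sketched above can be summarized as
+ * (a simplified illustration of the striping arithmetic):
+ *
+ *     stripe_id         = logical_offset / stripe_size
+ *     ioc_index         = stripe_id % n_io_concentrators
+ *     offset_in_subfile = (stripe_id / n_io_concentrators) * stripe_size
+ *                         + (logical_offset % stripe_size)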
+ *------------------------------------------------------------------------- + */ +int +ioc_main(int64_t context_id) +{ + int subfile_rank; + int flag, ret; + int max_work_depth; + MPI_Status status, msg_status; + sf_work_request_t * incoming_requests = NULL; + useconds_t delay = 20; + subfiling_context_t *context = get_subfiling_object(context_id); + + assert(context != NULL); + /* We can't have opened any files at this point.. */ + context->sf_fid = -1; + + subfile_rank = context->sf_group_rank; + if (request_count_per_rank == NULL) { + request_count_per_rank = + (int *) calloc((size_t) sf_world_size, sizeof(int)); + assert(request_count_per_rank != NULL); + } + + max_work_depth = MAX(8, sf_world_size * MAX_WORK_PER_RANK); + incoming_requests = (sf_work_request_t *) calloc( + (size_t)(max_work_depth + 1), sizeof(sf_work_request_t)); + + /* Validate that the allocation succeeded */ + assert(incoming_requests != NULL); + + /* Initialize atomic vars */ + atomic_init(&sf_workinprogress, 0); + atomic_init(&sf_work_pending, 0); + atomic_init(&sf_file_close_count, 0); + atomic_init(&sf_file_refcount, 0); + atomic_init(&sf_ioc_fini_refcount, 0); + + sf_open_file_count = 0; + sf_close_file_count = 0; + sf_ops_after_first_close = 0; -#ifdef DEBUG_TRACING - char logname[64]; - sprintf(logname,"ioc_%d.log", subfile_rank); - sf_logfile = fopen(logname, "w+"); +#if 0 + printf("Starting IOC! mpi_rank=%d\n", sf_world_rank); + fflush(stdout); #endif - /* Initialize atomic vars */ - atomic_init(&sf_workinprogress, 0); - atomic_init(&sf_work_pending, 0); - atomic_init(&sf_file_close_count, 0); - atomic_init(&sf_file_refcount, 0); - - sf_msg_comm = context->sf_msg_comm; /* Messages IN */ - sf_data_comm = context->sf_data_comm; /* Messages OUT */ - - while(!sf_shutdown_flag || sf_work_pending) { + + while (!sf_shutdown_flag || sf_work_pending) { flag = 0; - ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, context->sf_msg_comm, &flag, &status); + ret = MPI_Iprobe( + MPI_ANY_SOURCE, MPI_ANY_TAG, context->sf_msg_comm, &flag, &status); if ((ret == MPI_SUCCESS) && (flag != 0)) { sf_work_request_t *msg = NULL; - int count; - int request_size = (int)sizeof(sf_work_request_t); - int source = status.MPI_SOURCE; - int tag = status.MPI_TAG; + int count; + int request_size = (int) sizeof(sf_work_request_t); + int source = status.MPI_SOURCE; + int tag = status.MPI_TAG; MPI_Get_count(&status, MPI_BYTE, &count); if (count > request_size) { - msg = (sf_work_request_t *) malloc((size_t)count); - ret = MPI_Recv(msg,count,MPI_BYTE, source, tag, context->sf_msg_comm, &msg_status); - } - else { - ret = MPI_Recv(&incoming_requests[sf_workinprogress],count, MPI_BYTE, - source, tag, context->sf_msg_comm, &msg_status); + msg = (sf_work_request_t *) malloc((size_t) count); + ret = MPI_Recv(msg, count, MPI_BYTE, source, tag, + context->sf_msg_comm, &msg_status); + } else { + ret = MPI_Recv(&incoming_requests[sf_workinprogress], count, + MPI_BYTE, source, tag, context->sf_msg_comm, &msg_status); } if (ret == MPI_SUCCESS) { -#ifdef DEBUG_TRACING - printf("[[ioc(%d) msg from %d tag=%x, datasize=%ld, foffset=%ld]]\n", subfile_rank, source, tag, - incoming_requests[sf_workinprogress].header[0], - incoming_requests[sf_workinprogress].header[1]); - fflush(stdout); +#if 0 + if (tag == OPEN_OP) { + sf_open_file_count++; + printf("source=%d: sf_open_file_count = %d\n", source, sf_open_file_count); + fflush(stdout); + } + else if (tag == CLOSE_OP) { + sf_close_file_count++; + printf("source=%d: sf_close_file_count = %d\n", source, 
sf_close_file_count); + fflush(stdout); + } + else { + printf("ioc(0): tag=%d\n", tag); + fflush(stdout); + if (sf_close_file_count) { + sf_ops_after_first_close++; + if (sf_close_file_count == sf_world_size) { + printf("op=%d from source(%d) after file close! sf_open_file_count=%d\n", tag, source, sf_open_file_count); + fflush(stdout); + } + } + } #endif if (msg) { - msg->tag = tag; msg->source = source; - msg->subfile_rank = subfile_rank; + msg->subfile_rank = subfile_rank; + msg->context_id = context->sf_context_id; tpool_add_work(msg); - } - else { - int index = atomic_load(&sf_workinprogress); - incoming_requests[sf_workinprogress].tag = tag; - incoming_requests[sf_workinprogress].source = source; - incoming_requests[sf_workinprogress].subfile_rank = subfile_rank; - tpool_add_work(&incoming_requests[sf_workinprogress]); - if (index == max_work_depth -1) { - atomic_init(&sf_workinprogress, 0); - } - else { - atomic_fetch_add(&sf_workinprogress, 1); // atomic - } + } else { + int index = atomic_load(&sf_workinprogress); + incoming_requests[index].tag = tag; + incoming_requests[index].source = source; + incoming_requests[index].subfile_rank = subfile_rank; + tpool_add_work(&incoming_requests[index]); + if (index == max_work_depth - 1) { + atomic_init(&sf_workinprogress, 0); + } else { + atomic_fetch_add(&sf_workinprogress, 1); // atomic + } } } + } else { + usleep(delay); } - else { - begin_thread_exclusive(); - my_shutdown_flag = sf_shutdown_flag; - end_thread_exclusive(); - usleep(delay); - } } -#ifdef DEBUG_TRACING - fclose(sf_logfile); +#ifndef NDEBUG + if (sf_logfile) { + fclose(sf_logfile); + sf_logfile = NULL; + } #endif - if (incoming_requests) { - free(incoming_requests); - } + if (incoming_requests) { + free(incoming_requests); + } + + /* Reset the shutdown flag */ + sf_shutdown_flag = 0; - return 0; + return 0; } /* @@ -1297,23 +2289,36 @@ Private helper functions ========================================= */ -static int send_ack__(int target, int subfile_rank, int tag, MPI_Comm comm) +static int +send_ack__(int target, int subfile_rank, int tag, MPI_Comm comm) { int ack = 1; int ret = MPI_Send(&ack, 1, MPI_INT, target, tag, comm); +#ifndef NDEBUG if (sf_verbose_flag) { - printf("[ioc(%d): Sending ACK to MPI_rank(%d)\n", subfile_rank, target); + if (sf_logfile) { + fprintf(sf_logfile, "[ioc(%d): Sending ACK to MPI_rank(%d)\n", + subfile_rank, target); + } } +#endif return ret; } -static int send_nack__(int target, int subfile_rank, int tag, MPI_Comm comm) +static int +send_nack__(int target, int subfile_rank, int tag, MPI_Comm comm) { int nack = 0; int ret = MPI_Send(&nack, 1, MPI_INT, target, tag, comm); + +#ifndef NDEBUG if (sf_verbose_flag) { - printf("[ioc(%d): Sending NACK to MPI_rank(%d)\n", subfile_rank, target); + if (sf_logfile) { + fprintf(sf_logfile, "[ioc(%d): Sending NACK to MPI_rank(%d)\n", + subfile_rank, target); + } } +#endif return ret; } @@ -1324,30 +2329,103 @@ from the thread pool threads... ========================================= */ -int queue_write_coll(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) +/*------------------------------------------------------------------------- + * Function: Public/IOC queue_write_coll + * + * Purpose: Collective write function (NOT currently implemented) + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. 
+ * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +queue_write_coll(sf_work_request_t H5_ATTR_PARALLEL_UNUSED *msg, + int H5_ATTR_PARALLEL_UNUSED subfile_rank, + int H5_ATTR_PARALLEL_UNUSED source, MPI_Comm H5_ATTR_PARALLEL_UNUSED comm) { - return 0; + return 0; } -int queue_read_coll(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) +/*------------------------------------------------------------------------- + * Function: Public/IOC queue_read_coll + * + * Purpose: Collective read function (NOT currently implemented) + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +queue_read_coll(sf_work_request_t H5_ATTR_PARALLEL_UNUSED *msg, + int H5_ATTR_PARALLEL_UNUSED subfile_rank, + int H5_ATTR_PARALLEL_UNUSED source, MPI_Comm H5_ATTR_PARALLEL_UNUSED comm) { - return 0; + return 0; } -int queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) +/*------------------------------------------------------------------------- + * Function: Public/IOC queue_write_indep + * + * Purpose: Implement the IOC independent write function. The + * function is invoked as a result of the IOC receiving the + * "header"/RPC. What remains is to allocate memory for the + * data sent by the client and then write the data to our + * subfile. We utilize pwrite for the actual file writing. + * File flushing is done at file close. + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
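+ *
+ * The exchange with the client, in outline (matching the code below):
+ *   1. header arrives: msg->header = {data_size, file_offset, context_id}
+ *   2. allocate a receive buffer; send_ack__ on success, send_nack__ on failure
+ *   3. MPI_Recv the WRITE_INDEP_DATA payload from the client
+ *   4. sf_write_data (pwrite) stores the buffer at file_offset in the subfile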
+ * + *------------------------------------------------------------------------- + */ +int +queue_write_indep( + sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) { - char *recv_buffer = NULL; - int ret = MPI_SUCCESS; - MPI_Status msg_status; - int64_t data_size = msg->header[0]; - int64_t file_offset = msg->header[1]; - int fd; + int fd; + char * recv_buffer = NULL; + int ret = MPI_SUCCESS; + MPI_Status msg_status; + int64_t data_size = msg->header[0]; + int64_t file_offset = msg->header[1]; + int64_t file_context_id = msg->header[2]; + subfiling_context_t *sf_context = get_subfiling_object(file_context_id); + assert(sf_context != NULL); + + /* flag that we've attempted to write data to the file */ + sf_context->sf_write_count++; + +#ifndef NDEBUG if (sf_verbose_flag) { - printf("[ioc(%d) %s]: msg from %d: datasize=%ld\toffset=%ld\n", subfile_rank, __func__, source, data_size, file_offset ); - fflush(stdout); + if (sf_logfile) { + fprintf(sf_logfile, + "[ioc(%d) %s]: msg from %d: datasize=%ld\toffset=%ld\n", + subfile_rank, __func__, source, data_size, file_offset); + } } +#endif if (recv_buffer == NULL) { - if ((recv_buffer = (char *)malloc((size_t)data_size)) == NULL) { + if ((recv_buffer = (char *) malloc((size_t) data_size)) == NULL) { perror("malloc"); send_nack__(source, subfile_rank, WRITE_INDEP_ACK, comm); return -1; @@ -1356,287 +2434,678 @@ int queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_ send_ack__(source, subfile_rank, WRITE_INDEP_ACK, comm); - ret = MPI_Recv(recv_buffer, (int)data_size, MPI_BYTE, source, WRITE_INDEP_DATA, comm, &msg_status ); + ret = MPI_Recv(recv_buffer, (int) data_size, MPI_BYTE, source, + WRITE_INDEP_DATA, comm, &msg_status); + +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, + "[ioc(%d) %s] MPI_Recv(%ld bytes, from = %d) status = %d\n", + subfile_rank, __func__, data_size, source, ret); + } + } +#endif + if (ret != MPI_SUCCESS) { - int len; + int len; char estring[MPI_MAX_ERROR_STRING]; MPI_Error_string(ret, estring, &len); - printf("[ioc(%d) %s] MPI_ERROR(%d)! MPI_Recv of %ld bytes from %d returned an error(%s)\n", - subfile_rank, __func__, msg_status.MPI_ERROR, data_size, source, estring ); + printf("[ioc(%d) %s] MPI_ERROR(%d)! MPI_Recv of %ld bytes from %d " + "returned an error(%s)\n", + subfile_rank, __func__, msg_status.MPI_ERROR, data_size, source, + estring); fflush(stdout); return ret; - } else if(sf_verbose_flag) { - printf("[ioc(%d) %s] MPI_Recv success. Writing %ld bytes from rank %d to disk\n", - subfile_rank, __func__, data_size, source); - fflush(stdout); } - if ((fd = subfile_fid) < 0) { - printf("[ioc(%d)] WARNING: %s called while subfile_fid = %d (closed)\n", subfile_rank, __func__, subfile_fid); - fflush(stdout); - } - else if (sf_write_data(fd, file_offset, recv_buffer, data_size, subfile_rank ) < 0) { - free(recv_buffer); - recv_buffer = NULL; - printf("[ioc(%d) %s] sf_write_data returned an error!\n", subfile_rank, __func__); + fd = sf_context->sf_fid; + + if (fd < 0) { + printf("[ioc(%d)] WARNING: %s called while subfile_fid = %d (closed)\n", + subfile_rank, __func__, fd); + fflush(stdout); + } else if (sf_write_data( + fd, file_offset, recv_buffer, data_size, subfile_rank) < 0) { + free(recv_buffer); + recv_buffer = NULL; + printf("[ioc(%d) %s] sf_write_data returned an error!\n", subfile_rank, + __func__); fflush(stdout); return -1; } - /* Done... */ - // send_ack__(source, subfile_rank, COMPLETED, comm); + /* Done... 
*/ if (recv_buffer) { - free(recv_buffer); - } - return 0; + free(recv_buffer); + } + return 0; } -int queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) +/*------------------------------------------------------------------------- + * Function: Public/IOC queue_read_indep + * + * Purpose: Implement the IOC independent read function. The + * function is invoked as a result of the IOC receiving the + * "header"/RPC. What remains is to allocate memory for + * reading the data and then to send this to the client. + * We utilize pread for the actual file reading. + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +queue_read_indep( + sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) { - char *send_buffer = NULL; - int ret = MPI_SUCCESS; - int64_t data_size = msg->header[0]; - int64_t file_offset = msg->header[1]; + int fd; + char * send_buffer = NULL; + int ret = MPI_SUCCESS; + int64_t data_size = msg->header[0]; + int64_t file_offset = msg->header[1]; + int64_t file_context_id = msg->header[2]; + subfiling_context_t *sf_context = get_subfiling_object(file_context_id); + assert(sf_context != NULL); + + sf_context->sf_read_count++; + + fd = sf_context->sf_fid; + + if (fd < 0) { + printf("[ioc(%d) %s] subfile(%d) file descriptor not valid\n", + subfile_rank, __func__, fd); + return -1; + } + /* If there were writes to this file, we should flush the file cache + * before attempting to read the contents. + */ + if (sf_context->sf_write_count) { + sf_context->sf_write_count = 0; + fdatasync(fd); + } +#ifndef NDEBUG if (sf_verbose_flag) { - printf("[ioc(%d) %s] msg from %d: datasize=%ld\toffset=%ld\n", subfile_rank, __func__, source, data_size, file_offset ); - fflush(stdout); + if (sf_logfile) { + fprintf(sf_logfile, + "[ioc(%d) %s] msg from %d: datasize=%ld\toffset=%ld\n", + subfile_rank, __func__, source, data_size, file_offset); + } } - if ((send_buffer = (char *)malloc((size_t)data_size)) == NULL) { +#endif + if ((send_buffer = (char *) malloc((size_t) data_size)) == NULL) { perror("malloc"); return -1; } - if (sf_read_data(subfile_fid, file_offset, send_buffer, data_size, subfile_rank) < 0) { - printf("[%d] %s - sf_read_data returned an error!\n", subfile_rank, __func__); + if (sf_read_data(fd, file_offset, send_buffer, data_size, subfile_rank) < + 0) { + printf("[%d] %s - sf_read_data for source(%d) returned an error! " + "read_count=%ld\n", + subfile_rank, __func__, source, sf_context->sf_read_count); fflush(stdout); return -1; } - ret = MPI_Send(send_buffer, (int)data_size, MPI_BYTE, source, READ_INDEP_DATA, comm); + ret = MPI_Send( + send_buffer, (int) data_size, MPI_BYTE, source, READ_INDEP_DATA, comm); if (ret != MPI_SUCCESS) { - int len; + int len; char estring[MPI_MAX_ERROR_STRING]; MPI_Error_string(ret, estring, &len); - printf("[ioc(%d)] ERROR! MPI_Send of %ld bytes to %d returned an error(%s)\n",subfile_rank, data_size, source, estring ); + printf("[ioc(%d)] ERROR! 
MPI_Send of %ld bytes to %d returned an " + "error(%s)\n", + subfile_rank, data_size, source, estring); fflush(stdout); return ret; } +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, "[ioc(%d)] MPI_Send to source(%d) completed\n", + subfile_rank, source); + } + } +#endif if (send_buffer) { - free(send_buffer); - send_buffer = NULL; - } + free(send_buffer); + send_buffer = NULL; + } - return 0; + return 0; } - -int queue_file_open(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) +/*------------------------------------------------------------------------- + * Function: Public/IOC queue_file_open + * + * Purpose: Implement the IOC file open function. The + * function is invoked as a result of the IOC receiving the + * "header"/RPC. What remains is open the subfile if it + * isn't already open. This can happen if this function + * was invoked by another client process. + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +queue_file_open( + sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) { - int ret, req_count, errors=0; - int ref_count; - int flags = (int)(msg->header[0] & 0x0ffffffff); - atomic_fetch_add(&sf_file_refcount, 1); // atomic - ref_count = atomic_load(&sf_file_refcount); - if (sf_verbose_flag) { - printf("[ioc(%d) %s] file open flags = %0x, source=%d\n", subfile_rank, __func__, flags, source); - fflush(stdout); - } - - errors = subfiling_open_file(sf_subfile_prefix, subfile_rank, flags); + int ret, errors = 0; + int flags = (int) (msg->header[0] & 0x0ffffffff); + // int open_count; + atomic_fetch_add(&sf_file_refcount, 1); // atomic +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, + "[ioc(%d) %s] file open flags = %0x, source=%d\n", subfile_rank, + __func__, flags, source); + } + } +#endif +#if 0 + printf("[ioc(%d) %s]\n", subfile_rank, __func__); + fflush(stdout); +#endif + errors = subfiling_open_file(msg, sf_subfile_prefix, subfile_rank, flags); + // open_count = atomic_load(&sf_file_refcount); - req_count = COMPLETED; - ret = MPI_Send(&req_count, 1, MPI_INT, source, COMPLETED, comm); +#if 1 + ret = MPI_Send(&errors, 1, MPI_INT, source, COMPLETED, comm); if (ret != MPI_SUCCESS) { - errors++; - } - if (errors) { - printf("[ioc(%d) %s] Error opening file\n", subfile_rank, __func__); + printf("[ioc(%d)] MPI_Send FILE_OPEN, COMPLETED to source(%d) FAILED\n", + subfile_rank, source); fflush(stdout); + errors++; } +#else + if (open_count == sf_world_size) { + int i, k = (sf_world_rank +1); + for (i=0; i < sf_world_size; i++, k++) { + source = k % sf_world_size; + ret = MPI_Send(&errors, 1, MPI_INT, source, COMPLETED, comm); + if (ret != MPI_SUCCESS) { + printf("[ioc(%d)] MPI_Send FILE_OPEN, COMPLETED to source(%d) FAILED\n", + subfile_rank, source); + fflush(stdout); + errors++; + } + } + } +#endif + if (errors) { +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, "[ioc(%d) %s] Error opening file\n", + subfile_rank, __func__); + } + } +#endif + } return errors; } -/* +/* * The decrement is somewhat of misnomer, i.e. we check the number of file open * requests to the number of file close requests. 
When those values match, the - * actual file gets closed via the callback_ftn. The effects a weak collective - * on the file close operation. File opens on the other hand, can occur in - * any random order and no collective semanitics are enforced. + * actual file gets closed via the callback_ftn. This effects a weak + * collective on the file close operation. File opens (*) on the other hand, + * can occur in any random order and no collective semanitics are enforced. + * + * (*) Note that on the original file open, there are collective operations + * which take place to generate the MPI communications descriptors. */ -int decrement_file_ref_counts( int subfile_rank, int source, MPI_Comm comm, file_close_cb callback_ftn) +int +decrement_file_ref_counts(sf_work_request_t *msg, int subfile_rank, + int H5_ATTR_PARALLEL_UNUSED source, MPI_Comm comm, + file_close_cb callback_ftn) { - int close_count, open_count; - atomic_fetch_add(&sf_file_close_count, 1); // atomic - close_count = atomic_load(&sf_file_close_count); - open_count = atomic_load(&sf_file_refcount); - - if (close_count == sf_world_size) { - atomic_store(&sf_file_refcount, 0); - atomic_store(&sf_file_close_count, 0); /* Complete the reset to zeros */ - while (!tpool_is_empty) { - usleep(10); - } - if (callback_ftn(subfile_rank, comm) < 0) { - printf("[ioc(%d) %s] callback_ftn returned an error\n", subfile_rank, __func__ ); + int close_count, errors = 0; + + atomic_fetch_add(&sf_file_close_count, 1); // atomic + close_count = atomic_load(&sf_file_close_count); + + if (close_count == sf_world_size) { + int64_t file_context_id = msg->header[2]; + subfiling_context_t *sf_context = get_subfiling_object(file_context_id); + assert(sf_context != NULL); + + atomic_store(&sf_file_refcount, 0); + atomic_store(&sf_file_close_count, 0); /* Complete the reset to zeros */ + + /* Wait until any queued work has finished */ + while (!tpool_is_empty()) { + usleep(20); + } + + if (callback_ftn(subfile_rank, &sf_context->sf_fid, comm) < 0) { + printf("[ioc(%d) %s] callback_ftn returned an error\n", + subfile_rank, __func__); fflush(stdout); + errors++; + } else { + sf_context->sf_fid = -1; /* reset the actual file descriptor */ } } - return 0; + return errors; } -/* Note: This function should be called ONLY when all clients - * have called the CLOSE_OP on this IO Concentrator. - * The IOC API maintains a reference count on subfiles - * so that once that count is decremented to zero, the - * decrement_file_ref_counts function will call here. +/*------------------------------------------------------------------------- + * Function: Public/IOC subfiling_close_file + * + * Purpose: This function should be called ONLY when all clients + * have called the CLOSE_OP on this IO Concentrator. + * The IOC API maintains a reference count on subfiles + * so that once that count is decremented to zero, the + * decrement_file_ref_counts function will call here. + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
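+ *
+ * In outline (matching the code below): fdatasync the subfile if it is
+ * still open, then call subfiling_shutdown, which closes the descriptor,
+ * sets sf_shutdown_flag, and sends the COMPLETED acknowledgements that
+ * release the waiting client ranks.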
+ * + *------------------------------------------------------------------------- */ -int subfiling_close_file(int subfile_rank, MPI_Comm comm) +int +subfiling_close_file(int subfile_rank, int *fid, MPI_Comm comm) { - int ret, source = 0; - int errors = 0, flag = COMPLETED; + int errors = 0; + int subfile_fid = *fid; -#if 0 - printf("[ioc(%d) %s] subfile_fid = %d\n", subfile_rank, __func__, subfile_fid); - fflush(stdout); -#endif if (subfile_fid >= 0) { + if (fdatasync(subfile_fid) < 0) { + perror("fdatasync"); + printf("fdatasync(%d)\n", subfile_fid); + errors++; + } + } -#if 0 + errors += subfiling_shutdown(subfile_rank, fid, comm); + + if (errors) { + printf("[ioc(%d) %s] Errors detected!\n", subfile_rank, __func__); + fflush(stdout); + } + + return errors; +} + +/*------------------------------------------------------------------------- + * Function: Public/IOC subfiling_shutdown + * + * Purpose: This function gets called ONLY when all clients have + * invoked the file CLOSE_OP, which in turn decrements the + * file reference count maintained within the subfiling + * context. As a result, the subfiling_close_file call is + * invoked, forcing a file sync/flush and then calling + * function to close the local subfile and notify the + * clients with the close ACK to allow them to continue + * beyond the HDF5 file close function. + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +subfiling_shutdown(int subfile_rank, int *fid, MPI_Comm comm) +{ + int ret, source = 0; + int subfile_fid = *fid; + int errors = 0, flag = COMPLETED; + if (subfile_fid >= 0) { if (close(subfile_fid) < 0) { - perror("subfiling_close_file"); - } - subfile_fid = -1; -#else - fdatasync(subfile_fid); -#endif + perror("subfiling_close_file"); + printf("subfile_fid = %d\n", subfile_fid); + errors++; + } + *fid = -1; } + + /* Shutdown the main IOC thread */ + sf_shutdown_flag = 1; + /* Allow ioc_main to exit.*/ + usleep(40); + /* Notify all ranks */ for (source = 0; source < sf_world_size; source++) { - /* Don't release our local MPI process until all - * other ranks are released. - */ - if (source == sf_world_rank) { - continue; - } + /* Don't release our local MPI process until all + * other ranks are released. + */ + if (source == sf_world_rank) { + continue; + } ret = MPI_Send(&flag, 1, MPI_INT, source, COMPLETED, comm); - if (ret != MPI_SUCCESS) errors++; + if (ret != MPI_SUCCESS) + errors++; } - /* Release the local MPI process */ - ret = MPI_Send(&flag, 1, MPI_INT, sf_world_rank, COMPLETED, comm); - if (ret != MPI_SUCCESS) errors++; + /* Release the local MPI process */ + ret = MPI_Send(&flag, 1, MPI_INT, sf_world_rank, COMPLETED, comm); + if (ret != MPI_SUCCESS) + errors++; if (errors) { - printf("[ioc(%d) %s] Errors sending file close replies\n", subfile_rank, __func__); + printf("[ioc(%d) %s] Errors sending ioc_fini replies\n", subfile_rank, + __func__); fflush(stdout); } return errors; } -int subfiling_open_file(const char *prefix, int subfile_rank, int flags) +/*------------------------------------------------------------------------- + * Function: Public/IOC increment_ioc_fini_counts + * + * Purpose: UNUSED. Was originally implemented to manage the shutdown + * of IO Concentrators. 
The subfiling design changed to + * create IOC instances as part of FILE opens and shutdowns + * as part of file closing. + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +increment_ioc_fini_counts(sf_work_request_t *msg, int subfile_rank, + int H5_ATTR_PARALLEL_UNUSED source, MPI_Comm comm, + file_close_cb callback_ftn) { - int errors = 0; - /* Only the real IOCs open the subfiles - * Once a file is opened, all additional file open requests - * can return immediately. - */ - if (subfile_rank >= 0) { - char filepath[PATH_MAX]; - char config[PATH_MAX]; - - - if (subfile_fid < 0) { - const char *dotconfig = ".subfile_config"; - mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; - if (prefix) { - mkdir(prefix, S_IRWXU); - sprintf(filepath, "%s/node_local_temp_%d_of_%d", - prefix, subfile_rank, n_io_concentrators); - sprintf(config, "%s/%s", prefix, dotconfig); - } - else { - sprintf(filepath, "node_local_temp_%d_of_%d", - subfile_rank,n_io_concentrators); - strcpy(config, dotconfig); - } - - begin_thread_exclusive(); + int close_count, errors = 0; + atomic_fetch_add(&sf_ioc_fini_refcount, 1); // atomic + close_count = atomic_load(&sf_ioc_fini_refcount); + + if (close_count == sf_world_size) { + int64_t file_context_id = msg->header[2]; + subfiling_context_t *sf_context = get_subfiling_object(file_context_id); + assert(sf_context != NULL); + if (callback_ftn(subfile_rank, &sf_context->sf_fid, comm) < 0) { + printf("[ioc(%d) %s] callback_ftn returned an error\n", + subfile_rank, __func__); + fflush(stdout); + } + } + return errors; +} - if ((subfile_fid = open(filepath, flags, mode)) < 0) { - perror("subfile open"); - end_thread_exclusive(); - errors++; - goto done; - } +/*------------------------------------------------------------------------- + * Function: Public/IOC subfiling_open_file + * + * Purpose: This function gets called when a client invokes a OPEN_OP. + * The HDF5 file opening protocol actually attempts to open + * a file; first without any truncate other flags which would + * modify the file state if it already exists. A file close + * and then the second file open using the user supplied open + * flags is invoked. The OPEN_OP provides the user flags as + * part of the RPC message. The file prefix info doesn't + * transmited as part of the RPC since it is available as + * part of the client context which can be utilized by the + * IOC thread. We access the sf_context by reading the + * cache of contexts at the index provided with the RPC msg. + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +subfiling_open_file( + sf_work_request_t *msg, const char *prefix, int subfile_rank, int flags) +{ + int errors = 0; - end_thread_exclusive(); - - if (flags & O_CREAT) { - size_t bufsize = PATH_MAX + 16; - FILE *f = NULL; - char linebuf[bufsize]; - /* If a config file already exists, AND - * the user wants to truncate subfiles (if they exist), - * then we should also truncate an existing config file. 
- */ - if (access(config, flags) == 0) { - truncate(config, 0); - } - f = fopen(config, "w+"); - if (f != NULL) { - int k; - char *underscore = strrchr(filepath,'_'); - *underscore=0; - strcpy(config, filepath); - *underscore='_'; - sprintf(linebuf,"stripe_size=%ld\n", sf_stripe_size); - fwrite(linebuf, strlen(linebuf), 1, f); - sprintf(linebuf,"aggregator_count=%d\n",n_io_concentrators); - fwrite(linebuf, strlen(linebuf), 1, f); - - for(k=0; k < n_io_concentrators; k++) { - snprintf(linebuf,bufsize,"%s_%d:%d\n",config, k, io_concentrator[k]); - fwrite(linebuf, strlen(linebuf), 1, f); - } + /* Only the real IOCs open the subfiles + * Once a file is opened, all additional file open requests + * can return immediately. + */ + if (subfile_rank >= 0) { + char filepath[PATH_MAX]; + char config[PATH_MAX]; + int subfile_fid; + int64_t h5_file_id = msg->header[1]; + int64_t file_context_id = msg->header[2]; + subfiling_context_t *sf_context = get_subfiling_object(file_context_id); + assert(sf_context != NULL); + + begin_thread_exclusive(); + + if (sf_context->sf_fid < 0) { + int n_io_concentrators = sf_context->topology->n_io_concentrators; + int *io_concentrator = sf_context->topology->io_concentrator; + const char *dotconfig = ".subfile_config"; + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + if (prefix) { + mkdir(prefix, S_IRWXU); + sprintf(filepath, "%s/%ld_node_local_temp_%d_of_%d", prefix, + h5_file_id, subfile_rank, n_io_concentrators); + sprintf(config, "%s/%ld%s", prefix, h5_file_id, dotconfig); + } else { + sprintf(filepath, "%ld_node_local_temp_%d_of_%d", h5_file_id, + subfile_rank, n_io_concentrators); + strcpy(config, dotconfig); + } - fclose(f); - } - else { - perror("fopen(config)"); - errors++; - goto done; - } - } - if (sf_verbose_flag) { - printf("[ioc:%d] Opened subfile %s\n", subfile_rank, filepath); - } - } + if ((subfile_fid = open(filepath, flags, mode)) < 0) { + end_thread_exclusive(); + errors++; + goto done; + } else { + sf_context->sf_fid = subfile_fid; + } + if (flags & O_CREAT) { + int64_t new_context = SF_CONTEXT; + int64_t objtype = (new_context << 32); + int context_id = (int) msg->context_id; + size_t bufsize = PATH_MAX + 16; + FILE * f = NULL; + char linebuf[bufsize]; + int64_t thisId = (int64_t)(objtype | context_id); + subfiling_context_t *context = + (subfiling_context_t *) get_subfiling_object(thisId); + /* If a config file already exists, AND + * the user wants to truncate subfiles (if they exist), + * then we should also truncate an existing config file. 
+ */ + if (access(config, flags) == 0) { + truncate(config, 0); + } + f = fopen(config, "w+"); + if (f != NULL) { + int k; + sprintf( + linebuf, "stripe_size=%ld\n", context->sf_stripe_size); + fwrite(linebuf, strlen(linebuf), 1, f); + sprintf( + linebuf, "aggregator_count=%d\n", n_io_concentrators); + fwrite(linebuf, strlen(linebuf), 1, f); + sprintf(linebuf,"hdf5_file=%s\n", context->filename); + fwrite(linebuf, strlen(linebuf), 1, f); + + for (k = 0; k < n_io_concentrators; k++) { + if (prefix) + sprintf(linebuf, "%s/%ld_node_local_temp_%d_of_%d:%d", prefix, + h5_file_id, subfile_rank, n_io_concentrators, io_concentrator[k]); + else + sprintf(linebuf, "%ld_node_local_temp_%d_of_%d:%d", h5_file_id, + subfile_rank, n_io_concentrators, io_concentrator[k]); + + fwrite(linebuf, strlen(linebuf), 1, f); + } + + fclose(f); + } else { + perror("fopen(config)"); + errors++; + goto done; + } + } +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, "[ioc:%d] Opened subfile %s\n", + subfile_rank, filepath); + } + } +#endif + } + end_thread_exclusive(); } done: - return errors; + return errors; } - +/*------------------------------------------------------------------------- + * Function: UTILITY FUNCTIONS: + * delete_subfiling_context - removes a context entry in the + * object cache. Free communicators + * and zero other structure fields. + * + * sf_get_mpi_rank - (not used) retrieves the MPI rank of the + * calling process. Was used when pairing + * the subfiling VFD with the SUBFILING VFD. + * + * sf_get_mpi_size - (not used) retrieves the MPI size of the + * communicator associated with the open + * file. + * + * sf_get_group_com - (not used) retrieves the MPI Comm object + * associated with the open file/sf_context. + * + * sf_subfile_set_logging - (not used) informs one or all IOC + * instances to set the verbose/logging flag + * to the value provided by the user. + * + * Return: none + * Errors: none + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
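/*
 * For reference only (not part of this patch): with the defaults defined in
 * H5FDsubfile_private.h, the ".subfile_config" file written by the fwrite
 * loop above comes out roughly as follows -- here assuming an HDF5 file
 * inode of 1234567, two IO Concentrators, and the default 256 KiB stripe
 * (DEFAULT_STRIPE_SIZE):
 *
 *     stripe_size=262144
 *     aggregator_count=2
 *     hdf5_file=/path/to/file.h5
 *     1234567_node_local_temp_0_of_2:0
 *     1234567_node_local_temp_0_of_2:16
 *
 * The per-subfile entries follow the "%ld_node_local_temp_%d_of_%d:%d"
 * format used above, where the trailing integer is the MPI world rank of the
 * owning IO Concentrator (0 and 16 are made-up values).
 */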
+ * + *------------------------------------------------------------------------- + */ void delete_subfiling_context(hid_t context_id) { - subfiling_context_t *sf_context = get_subfiling_object(context_id); - if (sf_context) { - MPI_Comm_free(&sf_context->sf_msg_comm); - MPI_Comm_free(&sf_context->sf_data_comm); - sf_msg_comm = MPI_COMM_NULL; - sf_data_comm = MPI_COMM_NULL; - if (n_io_concentrators > 1) { - MPI_Comm_free(&sf_context->sf_group_comm); - MPI_Comm_free(&sf_context->sf_intercomm); - } - free(sf_context); - } + subfiling_context_t *sf_context = get_subfiling_object(context_id); + if (sf_context) { + if (sf_context->topology->n_io_concentrators > 1) { + if (sf_context->sf_group_comm != MPI_COMM_NULL) { + MPI_Comm_free(&sf_context->sf_group_comm); + } + if (sf_context->sf_intercomm != MPI_COMM_NULL) { + MPI_Comm_free(&sf_context->sf_intercomm); + } + } + free(sf_context); + } + + return; +} + +int +sf_get_mpi_rank(hid_t fid, int *rank) +{ + hid_t context_id = fid_map_to_context(fid); + subfiling_context_t *sf_context = get_subfiling_object(context_id); + assert(sf_context != NULL); + assert(rank != NULL); + *rank = sf_context->sf_group_rank; + return 0; +} + +int +sf_get_mpi_size(hid_t fid, int *size) +{ + hid_t context_id = fid_map_to_context(fid); + subfiling_context_t *sf_context = get_subfiling_object(context_id); + assert(sf_context != NULL); + assert(size != NULL); + *size = sf_context->sf_group_size; + return 0; +} + +int +sf_get_group_comm(hid_t fid, MPI_Comm *comm) +{ + hid_t context_id = fid_map_to_context(fid); + subfiling_context_t *sf_context = get_subfiling_object(context_id); + assert(sf_context != NULL); + assert(comm != NULL); + *comm = sf_context->sf_group_comm; + return 0; +} - usleep(100); - return; +int +sf_subfile_set_logging(hid_t sf_fid, int ioc_rank, int flag) +{ + int ioc; + int status = 0; + hid_t context_id = fid_map_to_context(sf_fid); + subfiling_context_t *sf_context = get_subfiling_object(context_id); + int n_io_concentrators; + int * io_concentrator = NULL; + int64_t lflag = (int64_t)(flag & 0xFF); + int64_t msg[3]; + + assert(sf_context != NULL); + + msg[0] = lflag; + msg[1] = 0; + msg[2] = sf_context->sf_context_id; + + n_io_concentrators = sf_context->topology->n_io_concentrators; + io_concentrator = sf_context->topology->io_concentrator; + + for (ioc = 0; ioc < n_io_concentrators; ioc++) { + if ((flag < 0) || (flag == ioc_rank)) { + status = MPI_Ssend(msg, 3, MPI_INT64_T, io_concentrator[ioc], + LOGGING_OP, sf_context->sf_msg_comm); + } + } + return status; } diff --git a/src/H5FDsubfile_private.h b/src/H5FDsubfile_private.h index 0088c13..db991f9 100644 --- a/src/H5FDsubfile_private.h +++ b/src/H5FDsubfile_private.h @@ -4,9 +4,9 @@ #include #include -#include #include #include +#include #include #include #include @@ -14,87 +14,108 @@ /**************/ /* H5 Headers */ /**************/ -#include "H5private.h" /* Generic Functions */ -#include "H5CXprivate.h" /* API Contexts */ -#include "H5Dprivate.h" /* Datasets */ -#include "H5Eprivate.h" /* Error handling */ +#include "H5CXprivate.h" /* API Contexts */ +#include "H5Dprivate.h" /* Datasets */ +#include "H5Eprivate.h" /* Error handling */ +#include "H5Iprivate.h" /* IDs */ #include "H5Ipublic.h" -#include "H5Iprivate.h" /* IDs */ -#include "H5MMprivate.h" /* Memory management */ -#include "H5Pprivate.h" /* Property lists */ - +#include "H5MMprivate.h" /* Memory management */ +#include "H5Pprivate.h" /* Property lists */ +#include "H5private.h" /* Generic Functions */ #include "mpi.h" #ifndef 
_H5FDsubfile_private_H -#define _H5FDsubfile_private_H +# define _H5FDsubfile_private_H -typedef int (*file_close_cb)(int,MPI_Comm); +typedef int (*file_close_cb)(int, int *, MPI_Comm); -typedef struct { - int64_t sf_stripe_size; - int64_t sf_blocksize_per_stripe; - MPI_Comm sf_msg_comm; - MPI_Comm sf_data_comm; - MPI_Comm sf_group_comm; - MPI_Comm sf_intercomm; - int sf_group_size; - int sf_group_rank; - int sf_intercomm_root; - char *subfile_prefix; -} subfiling_context_t; - -typedef struct { - /* {Datasize, Offset} */ - int64_t header[2]; - int tag; - int source; - int subfile_rank; -} sf_work_request_t; +typedef enum io_ops { + READ_OP = 1, + WRITE_OP = 2, + OPEN_OP = 3, + CLOSE_OP = 4, + FINI_OP = 8, + LOGGING_OP = 16 +} io_op_t; +typedef enum { + SF_BADID = (-1), + SF_TOPOLOGY = 1, + SF_CONTEXT = 2, + SF_NTYPES /* number of subfiling object types, MUST BE LAST */ +} sf_obj_type_t; +typedef enum { + SELECT_IOC_ONE_PER_NODE = 0, /* Default */ + SELECT_IOC_EVERY_NTH_RANK, + SELECT_IOC_WITH_CONFIG, + ioc_selection_options +} sf_ioc_selection_t; typedef struct { - long rank; - long hostid; + long rank; + long hostid; } layout_t; -typedef struct { - long hostid; - layout_t *topology; - int *node_ranks; - int node_count; - int node_index; - int local_peers; - int subfile_rank; - int world_rank; - int world_size; - bool rank_is_ioc; +typedef struct topology { + long hostid; + layout_t * layout; + int * node_ranks; + int node_count; + int node_index; + int local_peers; + int subfile_rank; + int world_rank; + int world_size; + bool rank_is_ioc; + int n_io_concentrators; + int * io_concentrator; + sf_ioc_selection_t selection_type; } sf_topology_t; -#define K(n) ((n)*1024) -#define DEFAULT_STRIPE_SIZE K(256) /* (1024*1024) */ -#define MAX_DEPTH 256 +typedef struct { + hid_t sf_context_id; + hid_t h5_file_id; + int sf_fid; + size_t sf_write_count; + size_t sf_read_count; + size_t sf_eof; + /* Copy of the HDF5 File 'serial' number */ + unsigned long fileno; + int64_t sf_stripe_size; + int64_t sf_blocksize_per_stripe; + MPI_Comm sf_msg_comm; + MPI_Comm sf_data_comm; + MPI_Comm sf_group_comm; + MPI_Comm sf_intercomm; + int sf_group_size; + int sf_group_rank; + int sf_intercomm_root; + char * subfile_prefix; + char * filename; + sf_topology_t *topology; +} subfiling_context_t; -typedef enum io_ops { - READ_OP = 1, - WRITE_OP = 2, - OPEN_OP = 3, - CLOSE_OP = 4, - INCR_OP = 8, - DECR_OP = 16, -} io_op_t; - -typedef enum { - SF_BADID = (-1), - SF_TOPOLOGY = 1, - SF_CONTEXT, - SF_NTYPES /* number of subfiling object types, MUST BE LAST */ -} SF_OBJ_TYPE; - +typedef struct { + /* {Datasize, Offset, FileID} */ + int64_t header[3]; + int tag; + int source; + int subfile_rank; + hid_t context_id; +} sf_work_request_t; +typedef struct { + hid_t h5_file_id; + hid_t sf_context_id; +} file_map_to_context_t; + +# define K(n) ((n) *1024) +# define DEFAULT_STRIPE_SIZE K(256) /* (1024*1024) */ +# define MAX_DEPTH 1024 -/* MPI Tags are 32 bits, we treat them as unsigned +/* MPI Tags are 32 bits, we treat them as unsigned * to allow the use of the available bits for RPC * selections: * 0000 @@ -108,85 +129,113 @@ typedef enum { * 1010 COLLECTIVE_WRITE * 1011 ///////// * 1100 COLLECTIVE_CLOSE - * + * * 31 28 24 20 16 12 8 4 0| * +-------+-------+-------+-------+-------+-------+-------+-------+ * | | | ACKS | OP | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * + * */ /* Bit 3 SET indicates collectives */ -#define COLL_FUNC (0x1 << 3) +# define COLL_FUNC (0x1 << 3) -#define ACK_PART (0x0acc << 
8) -#define DATA_PART (0xd8da << 8) -#define READY (0xfeed << 8) -#define COMPLETED (0xfed1 << 8) +# define ACK_PART (0x0acc << 8) +# define DATA_PART (0xd8da << 8) +# define READY (0xfeed << 8) +# define COMPLETED (0xfed1 << 8) -#define READ_INDEP (READ_OP) -#define READ_COLL (COLL_FUNC | READ_OP) -#define WRITE_INDEP (WRITE_OP) -#define WRITE_COLL (COLL_FUNC | WRITE_OP) +# define READ_INDEP (READ_OP) +# define READ_COLL (COLL_FUNC | READ_OP) +# define WRITE_INDEP (WRITE_OP) +# define WRITE_COLL (COLL_FUNC | WRITE_OP) -#define WRITE_INDEP_ACK (ACK_PART | WRITE_OP) -#define WRITE_INDEP_DATA (DATA_PART | WRITE_OP) +# define WRITE_INDEP_ACK (ACK_PART | WRITE_OP) +# define WRITE_INDEP_DATA (DATA_PART | WRITE_OP) -#define READ_INDEP_DATA (DATA_PART | READ_OP) +# define READ_INDEP_DATA (DATA_PART | READ_OP) +# define SET_LOGGING (LOGGING_OP) -#define INT32_MASK 0x07FFFFFFFFFFFFFFF +# define INT32_MASK 0x07FFFFFFFFFFFFFFF -extern int sf_verbose_flag; extern int sf_shutdown_flag; extern atomic_int sf_workinprogress; extern atomic_int sf_work_pending; extern atomic_int sf_file_close_count; extern atomic_int sf_file_refcount; - -/* -------------- -Messages IN -------------- -*/ -extern MPI_Comm sf_msg_comm; - -/* -------------- -Messages OUT -------------- -*/ -extern MPI_Comm sf_data_comm; - - - -H5_DLL int H5FD__determine_ioc_count(int world_size, int world_rank, sf_topology_t **thisapp); -H5_DLL int H5FD__init_subfile_context(subfiling_context_t **newContext, int n_iocs, int world_size, int world_rank, bool rank_is_ioc); -H5_DLL void * get_subfiling_object(int64_t object_id); -H5_DLL hid_t get_subfiling_context(void); -H5_DLL int initialize_ioc_threads(subfiling_context_t *sf_context); -H5_DLL int tpool_add_work(sf_work_request_t *); -H5_DLL bool tpool_is_empty(void); -H5_DLL int ioc_main(subfiling_context_t *context); -H5_DLL int queue_write_coll(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); -H5_DLL int queue_read_coll(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); -H5_DLL int queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); -H5_DLL int queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); -H5_DLL int subfiling_close_file(int subfile_rank, MPI_Comm comm); -H5_DLL int subfiling_open_file(const char *prefix, int subfile_rank, MPI_Comm comm); -H5_DLL int queue_file_open(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); -H5_DLL int decrement_file_ref_counts( int subfile_rank, int source, MPI_Comm comm, file_close_cb callback_ftn); -H5_DLL int sf_open_subfiles(hid_t context_id, char *prefix, int flags); -H5_DLL int sf_close_subfiles(hid_t context_id); -H5_DLL int sf_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank); -H5_DLL int sf_read_independent(hid_t context_id, int64_t offset, int64_t elements, int dtype_extent, void *data); -H5_DLL int sf_write_independent(hid_t context_id, int64_t offset, int64_t elements, int dtype_extent, void *data); -H5_DLL int sf_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank); -H5_DLL void delete_subfiling_context(hid_t context_id); -H5_DLL void finalize_ioc_threads(void); -H5_DLL int begin_thread_exclusive(void); -H5_DLL int end_thread_exclusive(void); -H5_DLL int wait_for_thread_main(void); -H5_DLL int finalize_subfile_close(void); +extern int sf_verbose_flag; + +# ifndef NDEBUG +extern FILE *sf_logfile; +# endif + +# ifdef __cplusplus 
+extern "C" { +# endif + +/* clang-format off */ +H5_DLL herr_t H5FDsubfiling_init(sf_ioc_selection_t ioc_select_method, char *ioc_select_option, int64_t *context); +H5_DLL herr_t H5FDsubfiling_finalize(int64_t subfile_id); +H5_DLL int H5FD__determine_ioc_count(int world_size, int world_rank, + sf_ioc_selection_t ioc_select_method, char *ioc_select_option, sf_topology_t **thisapp); +H5_DLL int H5FD__init_subfile_context(sf_topology_t *thisApp, int n_iocs, int world_rank, + subfiling_context_t *newContext); +H5_DLL int64_t record_subfiling_object(sf_obj_type_t type, void *obj); +H5_DLL void * get_subfiling_object(int64_t object_id); +H5_DLL herr_t sf_free_context(subfiling_context_t **sf_context); +H5_DLL int initialize_ioc_threads(subfiling_context_t *sf_context); +H5_DLL int tpool_add_work(sf_work_request_t *); +H5_DLL bool tpool_is_empty(void); +H5_DLL int ioc_main(int64_t context_id); +H5_DLL int queue_write_coll( sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); +H5_DLL int queue_read_coll( sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); +H5_DLL int queue_write_indep( sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); +H5_DLL int queue_read_indep( sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); +H5_DLL int subfiling_close_file(int subfile_rank, int *subfile_fid, MPI_Comm comm); +H5_DLL int subfiling_shutdown(int subfile_rank, int *subfile_fid, MPI_Comm comm); +H5_DLL int subfiling_open_file( sf_work_request_t *msg, const char *prefix, int subfile_rank, int flags); +H5_DLL int queue_file_open( sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); +H5_DLL int decrement_file_ref_counts(sf_work_request_t *msg, int subfile_rank, int source, + MPI_Comm comm, file_close_cb callback_ftn); +H5_DLL int increment_ioc_fini_counts(sf_work_request_t *msg, int subfile_rank, int source, + MPI_Comm comm, file_close_cb callback_ftn); +H5_DLL int sf_open_subfiles(hid_t context_id, char *filename, char *prefix, int flags); +H5_DLL int sf_close_subfiles(hid_t context_id); +H5_DLL int sf_notify_shutdown(hid_t context_id); +H5_DLL int sf_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, + int subfile_rank); +H5_DLL int sf_read_independent(hid_t sf_fid, int64_t offset, int64_t elements, + int dtype_extent, void *data); +H5_DLL int sf_write_independent(hid_t sf_fid, int64_t offset, int64_t elements, + int dtype_extent, const void *data); +H5_DLL int sf_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, + int subfile_rank); +H5_DLL herr_t sf_read_vector(hid_t h5_fid, hssize_t count, haddr_t addrs[], hsize_t sizes[], + void *bufs[] /* in */); +H5_DLL herr_t sf_write_vector(hid_t h5_fid, hssize_t count, haddr_t addrs[], hsize_t sizes[], + void *bufs[] /* in */); +H5_DLL int sf_truncate(hid_t h5_fid, haddr_t addr); +H5_DLL void delete_subfiling_context(hid_t context_id); +H5_DLL void finalize_ioc_threads(void); +H5_DLL int begin_thread_exclusive(void); +H5_DLL int end_thread_exclusive(void); +H5_DLL int wait_for_thread_main(void); +H5_DLL int finalize_subfile_close(void); +H5_DLL char * get_ioc_selection_criteria(sf_ioc_selection_t *selection_criteria); +H5_DLL int active_map_entries(void); +H5_DLL void clear_fid_map_entry(hid_t sf_fid); +H5_DLL hid_t fid_map_to_context(hid_t sf_fid); +H5_DLL void set_verbose_flag(int subfile_rank, int new_value); +H5_DLL int sf_get_mpi_rank(hid_t fid, int *rank); +H5_DLL int sf_get_mpi_size(hid_t fid, int *size); +H5_DLL int 
sf_get_group_comm(hid_t fid, MPI_Comm *comm); +H5_DLL int sf_subfile_set_logging(hid_t sf_fid, int ioc_rank, int flag); + +/* clang-format on */ + +# ifdef __cplusplus +} +# endif #endif diff --git a/src/H5FDsubfile_public.h b/src/H5FDsubfile_public.h index 6e4e23c..32a2785 100644 --- a/src/H5FDsubfile_public.h +++ b/src/H5FDsubfile_public.h @@ -3,9 +3,5 @@ #include "H5FDsubfile_private.h" -herr_t H5FDsubfiling_init(void); -herr_t H5FDsubfiling_finalize(void); - - #endif /* _H5FDsubfile_public_H */ diff --git a/src/H5FDsubfile_threads.c b/src/H5FDsubfile_threads.c index fb99930..fa957a5 100644 --- a/src/H5FDsubfile_threads.c +++ b/src/H5FDsubfile_threads.c @@ -1,161 +1,371 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * Copyright by the Board of Trustees of the University of Illinois. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #include "H5FDsubfile_private.h" -#include "mercury/mercury_util_config.h" -#include "mercury/mercury_log.h" +/* + * NOTES: + * Rather than re-create the code for creating and managing a thread pool, + * I'm utilizing a reasonably well tested implementation from the mercury + * project. At some point, we should revisit this decision or possibly + * directly link against the mercury library. This would make sense if + * we move away from using MPI as the messaging infrastructure and instead + * use mercury for that purpose... + */ + #include "mercury/mercury_log.c" -#include "mercury/mercury_util_error.c" +#include "mercury/mercury_log.h" #include "mercury/mercury_thread.c" -#include "mercury/mercury_thread_mutex.c" -#include "mercury/mercury_thread_condition.h" #include "mercury/mercury_thread_condition.c" +#include "mercury/mercury_thread_condition.h" +#include "mercury/mercury_thread_mutex.c" #include "mercury/mercury_thread_pool.c" #include "mercury/mercury_thread_spin.c" +#include "mercury/mercury_util_config.h" +#include "mercury/mercury_util_error.c" static hg_thread_mutex_t ioc_mutex = PTHREAD_MUTEX_INITIALIZER; static hg_thread_mutex_t ioc_thread_mutex = PTHREAD_MUTEX_INITIALIZER; static hg_thread_pool_t *ioc_thread_pool = NULL; -static hg_thread_t ioc_thread; +static hg_thread_t ioc_thread; #ifndef HG_TEST_NUM_THREADS_DEFAULT -#define HG_TEST_NUM_THREADS_DEFAULT 4 +# define HG_TEST_NUM_THREADS_DEFAULT 4 #endif -#define POOL_CONCURRENT_MAX 64 +#define POOL_CONCURRENT_MAX 256 static struct hg_thread_work pool_request[POOL_CONCURRENT_MAX]; +/*------------------------------------------------------------------------- + * Function: local ioc_thread_main + * + * Purpose: An IO Concentrator instance is initialized with the + * specified subfiling context. + * + * Return: The IO concentrator thread executes as long as the HDF5 + * file associated with this context is open. At file close, + * the thread will return from 'ioc_main' and the thread + * exit status will be checked by the main program. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
+ * + *------------------------------------------------------------------------- + */ static HG_THREAD_RETURN_TYPE ioc_thread_main(void *arg) { + int64_t * context_id = (int64_t *) arg; hg_thread_ret_t thread_ret = (hg_thread_ret_t) 0; - /* Pass along the subfiling_context_t */ - ioc_main(arg); + /* Pass along the subfiling_context_t */ + ioc_main(context_id[0]); - // hg_thread_exit(thread_ret); + /* Upon exit, we can free the input arg */ + free(arg); return thread_ret; } +/*------------------------------------------------------------------------- + * Function: initialize_ioc_threads + * + * Purpose: The principal entry point to initialize the execution + * context for an IO Concentrator (IOC). The main thread + * is responsible for receiving IO requests from each + * HDF5 "client" and distibuting those to helper threads + * for actual processing. We initialize a fixed number + * of helper threads by creating a thread_pool. + * + * Return: SUCCESS (0) or FAIL (-1) if any errors are detected + * for the multi-threaded initialization. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ int initialize_ioc_threads(subfiling_context_t *sf_context) { - int status; - status = hg_thread_mutex_init(&ioc_mutex); - if (status) { - puts("hg_thread_mutex_init failed"); - goto err_exit; - } - status = hg_thread_mutex_init(&ioc_thread_mutex); - if (status) { - puts("hg_thread_mutex_init failed"); - goto err_exit; - } - - status = hg_thread_pool_init(HG_TEST_NUM_THREADS_DEFAULT, &ioc_thread_pool); - if (status) { - puts("hg_thread_pool_init failed"); - goto err_exit; - } - status = hg_thread_create(&ioc_thread, ioc_thread_main, sf_context); - if (status) { - puts("hg_thread_create failed"); - goto err_exit; - } - return 0; + int status; + int64_t *context_id = (int64_t *) malloc(sizeof(int64_t)); + assert(context_id != NULL); + /* Initialize the main IOC thread input argument. + * Each IOC request will utilize this context_id which is + * consistent across all MPI ranks, to ensure that requests + * involving reference counting are correctly using the + * correct file contexts. + */ + context_id[0] = sf_context->sf_context_id; + + /* Initialize a couple of mutex variables that are used + * during IO concentrator operations to serialize + * access to key objects, e.g. reference counting. + */ + status = hg_thread_mutex_init(&ioc_mutex); + if (status) { + puts("hg_thread_mutex_init failed"); + goto err_exit; + } + status = hg_thread_mutex_init(&ioc_thread_mutex); + if (status) { + puts("hg_thread_mutex_init failed"); + goto err_exit; + } + + /* Initialize a thread pool for the IO Concentrator to use */ + status = hg_thread_pool_init(HG_TEST_NUM_THREADS_DEFAULT, &ioc_thread_pool); + if (status) { + puts("hg_thread_pool_init failed"); + goto err_exit; + } + + /* Arguments to hg_thread_create are: + * 1. A pointer to reference the created thread. + * 2. User function pointer for the new thread to execute. + * 3. Pointer to the input argument that gets passed along to the user + * function. + */ + status = hg_thread_create(&ioc_thread, ioc_thread_main, context_id); + if (status) { + puts("hg_thread_create failed"); + goto err_exit; + } + return 0; err_exit: - return -1; + return -1; } - +/*------------------------------------------------------------------------- + * Function: finalize_ioc_threads + * + * Purpose: Normally we shouldn't have any IOC threads running by the + * program exits. 
If we do, this destructor function gets + * called to cleanup + * + * Return: None + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ void __attribute__((destructor)) finalize_ioc_threads(void) { - if (ioc_thread_pool != NULL) { - hg_thread_pool_destroy(ioc_thread_pool); - ioc_thread_pool = NULL; - } + if (ioc_thread_pool != NULL) { + hg_thread_pool_destroy(ioc_thread_pool); + ioc_thread_pool = NULL; + } } - +/*------------------------------------------------------------------------- + * Function: local: handle_work_request + * + * Purpose: Handle a work request from the thread pool work queue. + * We dispatch the specific function as indicated by the + * TAG that has been added to the work request by the + * IOC main thread (which is just a copy of the MPI tag + * associated with the RPC message) and provide the subfiling + * context associated with the HDF5 file. + * + * Any status associated with the function processing is + * returned directly to the client via ACK or NACK messages. + * + * Return: (none) Doesn't fail. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ static HG_THREAD_RETURN_TYPE handle_work_request(void *arg) { - hg_thread_ret_t ret = 0; - sf_work_request_t *msg = (sf_work_request_t *)arg; - int status = 0; - - atomic_fetch_add(&sf_work_pending, 1); // atomic - switch(msg->tag) { - case WRITE_COLL: - status = queue_write_coll( msg, msg->subfile_rank, msg->source, sf_data_comm); + int status = 0; + hg_thread_ret_t ret = 0; + sf_work_request_t * msg = (sf_work_request_t *) arg; + int64_t file_context_id = msg->header[2]; + subfiling_context_t *sf_context = get_subfiling_object(file_context_id); + assert(sf_context != NULL); + + atomic_fetch_add(&sf_work_pending, 1); // atomic + switch (msg->tag) { + case WRITE_COLL: + status = queue_write_coll( + msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm); break; - case READ_COLL: - status = queue_read_coll( msg, msg->subfile_rank, msg->source, sf_data_comm); + case READ_COLL: + status = queue_read_coll( + msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm); break; - case WRITE_INDEP: - status = queue_write_indep( msg, msg->subfile_rank, msg->source, sf_data_comm); + case WRITE_INDEP: + status = queue_write_indep( + msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm); break; - case READ_INDEP: - status = queue_read_indep( msg, msg->subfile_rank, msg->source, sf_data_comm); + case READ_INDEP: + status = queue_read_indep( + msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm); break; - case CLOSE_OP: - status = decrement_file_ref_counts( msg->subfile_rank, msg->source, sf_data_comm, - subfiling_close_file); + case CLOSE_OP: + status = decrement_file_ref_counts(msg, msg->subfile_rank, + msg->source, sf_context->sf_data_comm, subfiling_close_file); break; - case OPEN_OP: - status = queue_file_open( msg, msg->subfile_rank, msg->source, sf_data_comm); + case OPEN_OP: + status = queue_file_open( + msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm); break; - - default: - printf("[ioc(%d)] received message tag(%x)from rank %d\n", msg->subfile_rank, msg->tag, msg->source); + case FINI_OP: + status = increment_ioc_fini_counts(msg, msg->subfile_rank, + msg->source, sf_context->sf_data_comm, subfiling_shutdown); + break; + default: + 
printf("[ioc(%d)] received message tag(%x) from rank %d\n", + msg->subfile_rank, msg->tag, msg->source); status = -1; break; } - - atomic_fetch_sub(&sf_work_pending, 1); // atomic + + atomic_fetch_sub(&sf_work_pending, 1); // atomic if (status < 0) { - printf("[ioc(%d) %s]: Error encounted processing request(%x) from rank(%d\n", - msg->subfile_rank, __func__, msg->tag, msg->source); - fflush(stdout); + printf("[ioc(%d) %s]: Error encountered processing request(%x) from " + "rank(%d)\n", + msg->subfile_rank, __func__, msg->tag, msg->source); + fflush(stdout); } - return ret; + return ret; } -int tpool_add_work(sf_work_request_t *work) +/*------------------------------------------------------------------------- + * Function: tpool_add_work + * + * Purpose: Initiate the handoff of client request processing to a + * thread in the thread pool. A work request is created and + * added to the thread pool work queue. Once a pool thread + * picks it up, handle_work_request() is invoked with the + * work request as its argument. + * + * Return: 0 (the work request is always queued). + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +tpool_add_work(sf_work_request_t *work) { - static int work_index = 0; - hg_thread_mutex_lock(&ioc_mutex); - if (work_index == POOL_CONCURRENT_MAX) - work_index = 0; - pool_request[work_index].func = handle_work_request; - pool_request[work_index].args = work; - hg_thread_pool_post(ioc_thread_pool, &pool_request[work_index++]); - hg_thread_mutex_unlock(&ioc_mutex); - return 0; + static int work_index = 0; + hg_thread_mutex_lock(&ioc_mutex); + if (work_index == POOL_CONCURRENT_MAX) + work_index = 0; + pool_request[work_index].func = handle_work_request; + pool_request[work_index].args = work; + hg_thread_pool_post(ioc_thread_pool, &pool_request[work_index++]); + hg_thread_mutex_unlock(&ioc_mutex); + return 0; } -bool tpool_is_empty(void) +/*------------------------------------------------------------------------- + * Function: tpool_is_empty + * + * Purpose: Utility function to indicate to the caller whether there + * is any remaining work in the thread pool queue. + * + * Return: TRUE or FALSE to indicate whether the work queue is empty. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +bool +tpool_is_empty(void) { - return HG_QUEUE_IS_EMPTY(&ioc_thread_pool->queue); + return HG_QUEUE_IS_EMPTY(&ioc_thread_pool->queue); } -int begin_thread_exclusive(void) +/*------------------------------------------------------------------------- + * Function: begin_thread_exclusive + * + * Purpose: Mutex lock to restrict access to code or variables. + * + * Return: integer result of mutex_lock request. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +begin_thread_exclusive(void) { return hg_thread_mutex_lock(&ioc_thread_mutex); } -int end_thread_exclusive(void) +/*------------------------------------------------------------------------- + * Function: end_thread_exclusive + * + * Purpose: Mutex unlock. Should only be called by the current holder + * of the locked mutex. + * + * Return: result of mutex_unlock operation. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None.
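/*
 * Illustrative sketch only -- not part of this patch: how the IOC receive
 * loop might hand an incoming RPC to tpool_add_work() above.  The real
 * receive loop (ioc_main()) is implemented elsewhere in this patch, and the
 * names incoming, status, subfile_rank and context_id are placeholders for
 * values that loop already has in hand.
 */
#if 0
sf_work_request_t *work = (sf_work_request_t *) malloc(sizeof(*work));

memcpy(work->header, incoming, sizeof(work->header)); /* {size, offset, file id} */
work->tag          = status.MPI_TAG;                  /* WRITE_INDEP, CLOSE_OP, ...  */
work->source       = status.MPI_SOURCE;               /* requesting MPI rank         */
work->subfile_rank = subfile_rank;
work->context_id   = context_id;

tpool_add_work(work);   /* handle_work_request() then runs in a pool thread */
#endif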
+ * + *------------------------------------------------------------------------- + */ +int +end_thread_exclusive(void) { return hg_thread_mutex_unlock(&ioc_thread_mutex); } -int wait_for_thread_main(void) +/*------------------------------------------------------------------------- + * Function: wait_for_thread_main + * + * Purpose: Perform a thread_join on the IOC main thread. + * + * Return: SUCCESS (0) or FAIL (-1) if the thread_join + * does not succeed. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +wait_for_thread_main(void) { - if (hg_thread_join(ioc_thread) == 0) - puts("thread_join succeeded"); - else { - puts("thread_join failed"); - return -1; - } - return 0; + if (hg_thread_join(ioc_thread) != 0) { + return -1; + } + return 0; } diff --git a/src/H5FDsubfiling.c b/src/H5FDsubfiling.c index 5626ed1..7fbfdc7 100644 --- a/src/H5FDsubfiling.c +++ b/src/H5FDsubfiling.c @@ -23,23 +23,30 @@ * application to the same file). */ +#define H5S_FRIEND /*suppress error about including H5Spkg */ #include "H5FDdrvr_module.h" /* This source code file is part of the H5FD driver module */ - #include "H5private.h" /* Generic Functions */ +#include "H5Dprivate.h" /* Dataset stuff */ #include "H5Eprivate.h" /* Error handling */ #include "H5Fprivate.h" /* File access */ +#include "H5CXprivate.h" /* API contexts, etc. */ #include "H5FDprivate.h" /* File drivers */ #include "H5FDsubfiling.h" /* Subfiling file driver */ #include "H5FLprivate.h" /* Free Lists */ #include "H5Iprivate.h" /* IDs */ #include "H5MMprivate.h" /* Memory management */ #include "H5Pprivate.h" /* Property lists */ +#include "H5Spkg.h" /* For selections and creation of subfiling vectors */ /* The driver identification number, initialized at runtime */ static hid_t H5FD_SUBFILING_g = 0; - +/* These are used for the creation of read or write vectors */ +static hssize_t sf_vlen = -1; +static hsize_t *sf_offsets = NULL; +static hsize_t *sf_sizes = NULL; +static void **sf_bufs = NULL; /* The description of a file belonging to this driver. The 'eoa' and 'eof' @@ -104,6 +111,11 @@ typedef struct H5FD_subfiling_t { haddr_t pos; /* current file I/O position */ H5FD_file_op_t op; /* last operation */ char filename[H5FD_MAX_FILENAME_LEN]; /* Copy of file name from open operation */ + MPI_Info info; + MPI_Comm comm; + int mpi_size; + int mpi_rank; + #ifndef H5_HAVE_WIN32_API /* On most systems the combination of device and i-node number uniquely * identify a file. 
Note that Cygwin, MinGW and other Windows POSIX @@ -186,14 +198,28 @@ static herr_t H5FD_subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr, size_t size, void *buf); static herr_t H5FD_subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr, size_t size, const void *buf); + +static herr_t H5FD__subfiling_read_vector(H5FD_t *file, hid_t dxpl_id, + uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], + void *bufs[] /* out */); +static herr_t H5FD__subfiling_write_vector(H5FD_t *file, hid_t dxpl_id, + uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], + void *bufs[] /* in */); + static herr_t H5FD_subfiling_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); static herr_t H5FD_subfiling_lock(H5FD_t *_file, hbool_t rw); static herr_t H5FD_subfiling_unlock(H5FD_t *_file); static herr_t H5FD_subfiling_validate_config(const H5FD_subfiling_fapl_t * fa); +static int H5FD_subfiling_mpi_rank(const H5FD_t *_file); +static int H5FD_subfiling_mpi_size(const H5FD_t *_file); +static MPI_Comm H5FD_subfiling_communicator(const H5FD_t *_file); +static herr_t H5FD_subfiling_get_info(H5FD_t *_file, void **mpi_info); + -static const H5FD_class_t H5FD_subfiling_g = { +static const H5FD_class_mpi_t H5FD_subfiling_g = { + { "subfiling", /* name */ MAXADDR, /* maxaddr */ H5F_CLOSE_WEAK, /* fc_degree */ @@ -221,13 +247,18 @@ static const H5FD_class_t H5FD_subfiling_g = { H5FD_subfiling_get_handle, /* get_handle */ H5FD_subfiling_read, /* read */ H5FD_subfiling_write, /* write */ - NULL, /* read_vector */ - NULL, /* write_vector */ + H5FD__subfiling_read_vector, /* read_vector */ + H5FD__subfiling_write_vector, /* write_vector */ NULL, /* flush */ H5FD_subfiling_truncate, /* truncate */ H5FD_subfiling_lock, /* lock */ H5FD_subfiling_unlock, /* unlock */ H5FD_FLMAP_DICHOTOMY /* fl_map */ + }, + H5FD_subfiling_mpi_rank, + H5FD_subfiling_mpi_size, + H5FD_subfiling_communicator, + H5FD_subfiling_get_info }; /* Declare a free list to manage the H5FD_subfiling_t struct */ @@ -605,19 +636,23 @@ H5FD_subfiling_fapl_free(void *_fa) *------------------------------------------------------------------------- */ static H5FD_t * -H5FD_subfiling_open(const char *name, unsigned flags, hid_t fapl_id, - haddr_t maxaddr) +H5FD_subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr) { H5FD_subfiling_t *file = NULL; /* subfiling VFD info */ - int fd = -1; /* File descriptor */ - int o_flags; /* Flags for open() call */ + int fd = -1; /* File descriptor */ + int o_flags; /* Flags for open() call */ + int mpi_enabled = 0; + int mpi_provides = -1; + int my_rank; #ifdef H5_HAVE_WIN32_API struct _BY_HANDLE_FILE_INFORMATION fileinfo; #endif h5_stat_t sb; H5FD_subfiling_fapl_t fa; H5FD_t *ret_value = NULL; /* Return value */ - + char *dir_path = NULL; + char file_prefix[H5FD_MAX_FILENAME_LEN]; + hid_t h5_file_id; FUNC_ENTER_NOAPI_NOINIT /* Sanity check on file offsets */ @@ -644,14 +679,34 @@ H5FD_subfiling_open(const char *name, unsigned flags, hid_t fapl_id, HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, NULL, "can't get property list") } + if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) { + MPI_Query_thread(&mpi_provides); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + } + /* Open the file */ if((fd = HDopen(name, o_flags, H5_POSIX_CREATE_MODE_RW)) < 0) { int myerrno = errno; - HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file: name = '%s', errno = %d, error message = '%s', flags = %x, o_flags = %x", name, myerrno, 
HDstrerror(myerrno), flags, (unsigned)o_flags); + HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, + "unable to open file: name = '%s', errno = %d, error message = '%s', flags = %x, o_flags = %x", + name, myerrno, HDstrerror(myerrno), flags, (unsigned)o_flags); } /* end if */ - if(HDfstat(fd, &sb) < 0) - HSYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, NULL, "unable to fstat file") + /* Avoid doing an addtional file stat on every MPI rank. + * By default the file open will stat the directory... + * We can be a bit more efficient by having rank 0 broadcast + * the stat buffer. + */ +#if 0 + if (mpi_enabled && (my_rank == 0)) { + int sb_size = sizeof(sb); + + MPI_Bcast(&sb, sb_size, MPI_BYTE, 0, MPI_COMM_WORLD); + } +#else + if(HDfstat(fd, &sb) < 0) + HSYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, NULL, "unable to fstat file") +#endif /* Create the new file struct */ if(NULL == (file = H5FL_CALLOC(H5FD_subfiling_t))) @@ -701,6 +756,32 @@ H5FD_subfiling_open(const char *name, unsigned flags, hid_t fapl_id, HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get property of changing family to single") } /* end if */ + /* Use a FILE_id which is consistent/constant across multiple MPI ranks */ + h5_file_id = (hid_t)file->inode; + dir_path = strrchr(file->filename,'/'); + if (dir_path) { + *dir_path = '\0'; + strcpy(file_prefix, file->filename); + *dir_path = '/'; + dir_path = file_prefix; + } + + /* Only open subfiling if we've enabled MPI */ + if (mpi_enabled && + (mpi_provides == MPI_THREAD_MULTIPLE) && + (sf_open_subfiles(h5_file_id, file->filename, dir_path, o_flags ) < 0)) { + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't open subfiling files") + MPI_Comm_dup(MPI_COMM_WORLD, &file->comm); + MPI_Comm_rank(MPI_COMM_WORLD,&file->mpi_rank); + MPI_Comm_size(MPI_COMM_WORLD,&file->mpi_size); + file->info = MPI_INFO_NULL; + } + else { + /* MPI isn't avail, so neither is subfiling... + * In would be advantageous to replace subfiling parallel + * subfiling with serial.. 
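/*
 * Usage note (illustrative, not part of this patch): because the IO
 * Concentrator runs in a separate thread and issues MPI calls of its own,
 * the check above only opens the subfiles when MPI reports full thread
 * support, e.g. when the application initialized MPI with
 *
 *     int provided;
 *     MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
 *
 * Applications that call plain MPI_Init(), or that get less than
 * MPI_THREAD_MULTIPLE back in 'provided', fall through to the else branch
 * above and the subfiles are never opened.
 */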
+ */ + } /* Set return value */ ret_value = (H5FD_t*)file; @@ -734,13 +815,20 @@ H5FD_subfiling_close(H5FD_t *_file) { H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; herr_t ret_value = SUCCEED; /* Return value */ + int mpi_enabled = 0; FUNC_ENTER_NOAPI_NOINIT /* Sanity check */ HDassert(file); - /* Close the underlying file */ + if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) { + /* Prepare to close the actual subfiles */ + hid_t h5_fid = (hid_t)file->inode; + if (mpi_enabled && (sf_close_subfiles(h5_fid) < 0)) + HSYS_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "sf_close_subfiles returned and error") + } + /* Close the underlying HDF file */ if(HDclose(file->fd) < 0) HSYS_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close file") @@ -822,7 +910,7 @@ done: static herr_t H5FD_subfiling_query(const H5FD_t *_file, unsigned long *flags /* out */) { - const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file; /* subfiling VFD info */ + const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file; /* subfiling VFD info */ FUNC_ENTER_NOAPI_NOINIT_NOERR @@ -841,8 +929,9 @@ H5FD_subfiling_query(const H5FD_t *_file, unsigned long *flags /* out */) *flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */ *flags |= H5FD_FEAT_POSIX_COMPAT_HANDLE; /* get_handle callback returns a POSIX file descriptor */ *flags |= H5FD_FEAT_SUPPORTS_SWMR_IO; /* VFD supports the single-writer/multiple-readers (SWMR) pattern */ - *flags |= H5FD_FEAT_DEFAULT_VFD_COMPATIBLE; /* VFD creates a file which can be opened with the default VFD */ - +#if 0 + *flags |= H5FD_FEAT_HAS_MPI; /* FIXME:: for experimentation only... */ +#endif /* Check for flags that are set by h5repart */ if(file && file->fam_to_single) *flags |= H5FD_FEAT_IGNORE_DRVRINFO; /* Ignore the driver info when file is opened (which eliminates it) */ @@ -869,7 +958,7 @@ H5FD_subfiling_query(const H5FD_t *_file, unsigned long *flags /* out */) static haddr_t H5FD_subfiling_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type) { - const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file; + const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file; FUNC_ENTER_NOAPI_NOINIT_NOERR @@ -894,7 +983,7 @@ H5FD_subfiling_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type) static herr_t H5FD_subfiling_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t addr) { - H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; + H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; FUNC_ENTER_NOAPI_NOINIT_NOERR @@ -982,8 +1071,9 @@ H5FD_subfiling_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size, void *buf /*out*/) { H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; - HDoff_t offset = (HDoff_t)addr; herr_t ret_value = SUCCEED; /* Return value */ + hbool_t addrs_cooked = FALSE; + int mpi_enabled = 0; FUNC_ENTER_NOAPI_NOINIT @@ -996,62 +1086,19 @@ H5FD_subfiling_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, if(REGION_OVERFLOW(addr, size)) HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow, addr = %llu", (unsigned long long)addr) -#ifndef H5_HAVE_PREADWRITE - /* Seek to the correct location (if we don't have pread) */ - if(addr != file->pos || OP_READ != file->op) { - if(HDlseek(file->fd, (HDoff_t)addr, SEEK_SET) < 0) - HSYS_GOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position") - } -#endif /* H5_HAVE_PREADWRITE */ - - /* Read data, being careful of interrupted system calls, partial results, - * and the end 
of the file. - */ - while(size > 0) { - - h5_posix_io_t bytes_in = 0; /* # of bytes to read */ - h5_posix_io_ret_t bytes_read = -1; /* # of bytes actually read */ - - /* Trying to read more bytes than the return type can handle is - * undefined behavior in POSIX. - */ - if(size > H5_POSIX_MAX_IO_BYTES) - bytes_in = H5_POSIX_MAX_IO_BYTES; - else - bytes_in = (h5_posix_io_t)size; - - do { -#ifdef H5_HAVE_PREADWRITE - bytes_read = HDpread(file->fd, buf, bytes_in, offset); - if(bytes_read > 0) - offset += bytes_read; -#else - bytes_read = HDread(file->fd, buf, bytes_in); -#endif /* H5_HAVE_PREADWRITE */ - } while(-1 == bytes_read && EINTR == errno); - - if(-1 == bytes_read) { /* error */ - int myerrno = errno; - time_t mytime = HDtime(NULL); + addr += _file->base_addr; + addrs_cooked = TRUE; /* Follow the example of read_vector (see H5FDint.c) */ - offset = HDlseek(file->fd, (HDoff_t)0, SEEK_CUR); - - HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed: time = %s, filename = '%s', file descriptor = %d, errno = %d, error message = '%s', buf = %p, total read size = %llu, bytes this sub-read = %llu, bytes actually read = %llu, offset = %llu", HDctime(&mytime), file->filename, file->fd, myerrno, HDstrerror(myerrno), buf, (unsigned long long)size, (unsigned long long)bytes_in, (unsigned long long)bytes_read, (unsigned long long)offset); - } /* end if */ - - if(0 == bytes_read) { - /* end of file but not end of format address space */ - HDmemset(buf, 0, size); - break; - } /* end if */ - - HDassert(bytes_read >= 0); - HDassert((size_t)bytes_read <= size); + if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) { + hid_t h5_fid = (hid_t)file->inode; + if (mpi_enabled && (sf_read_independent(h5_fid, (int64_t)addr, (int64_t)size, 1, buf) < 0)) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "subfile read failed") + } + addr += (haddr_t)size; - size -= (size_t)bytes_read; - addr += (haddr_t)bytes_read; - buf = (char *)buf + bytes_read; - } /* end while */ + if ( addrs_cooked ) { + addr -= _file->base_addr; + } /* Update current position */ file->pos = addr; @@ -1087,8 +1134,9 @@ H5FD_subfiling_write(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size, const void *buf) { H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; - HDoff_t offset = (HDoff_t)addr; herr_t ret_value = SUCCEED; /* Return value */ + hbool_t addrs_cooked = FALSE; + int mpi_enabled = 0; FUNC_ENTER_NOAPI_NOINIT @@ -1101,56 +1149,20 @@ H5FD_subfiling_write(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, if(REGION_OVERFLOW(addr, size)) HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow, addr = %llu, size = %llu", (unsigned long long)addr, (unsigned long long)size) -#ifndef H5_HAVE_PREADWRITE - /* Seek to the correct location (if we don't have pwrite) */ - if(addr != file->pos || OP_WRITE != file->op) { - if(HDlseek(file->fd, (HDoff_t)addr, SEEK_SET) < 0) - HSYS_GOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position") - } -#endif /* H5_HAVE_PREADWRITE */ + addr += _file->base_addr; + addrs_cooked = TRUE; /* Follow the example of read_vector (see H5FDint.c) */ - /* Write the data, being careful of interrupted system calls and partial - * results - */ - while(size > 0) { - - h5_posix_io_t bytes_in = 0; /* # of bytes to write */ - h5_posix_io_ret_t bytes_wrote = -1; /* # of bytes written */ - - /* Trying to write more bytes than the return type can handle is - * undefined behavior in POSIX. 
- */ - if(size > H5_POSIX_MAX_IO_BYTES) - bytes_in = H5_POSIX_MAX_IO_BYTES; - else - bytes_in = (h5_posix_io_t)size; - - do { -#ifdef H5_HAVE_PREADWRITE - bytes_wrote = HDpwrite(file->fd, buf, bytes_in, offset); - if(bytes_wrote > 0) - offset += bytes_wrote; -#else - bytes_wrote = HDwrite(file->fd, buf, bytes_in); -#endif /* H5_HAVE_PREADWRITE */ - } while(-1 == bytes_wrote && EINTR == errno); - - if(-1 == bytes_wrote) { /* error */ - int myerrno = errno; - time_t mytime = HDtime(NULL); - - offset = HDlseek(file->fd, (HDoff_t)0, SEEK_CUR); - - HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed: time = %s, filename = '%s', file descriptor = %d, errno = %d, error message = '%s', buf = %p, total write size = %llu, bytes this sub-write = %llu, bytes actually written = %llu, offset = %llu", HDctime(&mytime), file->filename, file->fd, myerrno, HDstrerror(myerrno), buf, (unsigned long long)size, (unsigned long long)bytes_in, (unsigned long long)bytes_wrote, (unsigned long long)offset); - } /* end if */ + if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) { + hid_t h5_fid = (hid_t)file->inode; + if (mpi_enabled && (sf_write_independent(h5_fid, (int64_t)addr, (int64_t)size, 1, buf) < 0)) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "subfile write failed") + } - HDassert(bytes_wrote > 0); - HDassert((size_t)bytes_wrote <= size); + addr += (haddr_t)size; /* Point to the end of the current IO */ - size -= (size_t)bytes_wrote; - addr += (haddr_t)bytes_wrote; - buf = (const char *)buf + bytes_wrote; - } /* end while */ + if ( addrs_cooked ) { + addr -= _file->base_addr; + } /* Update current position and eof */ file->pos = addr; @@ -1168,12 +1180,165 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5FD_subfiling_write() */ + + +/*------------------------------------------------------------------------- + * Function: H5FDsubfile__read_vector (internal function) + * + * Purpose: Perform count reads from the specified file at the offsets + * provided in the addrs array, with the lengths and memory + * types provided in the sizes and types arrays. Data read + * is returned in the buffers provided in the bufs array. + * + * All reads are done according to the data transfer property + * list dxpl_id (which may be the constant H5P_DEFAULT). + * + * Return: Success: SUCCEED + * All reads have completed successfully, and + * the results havce been into the supplied + * buffers. + * + * Failure: FAIL + * The contents of supplied buffers are undefined. + * + * Programmer: JRM -- 6/10/20 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__subfiling_read_vector(H5FD_t *_file, hid_t dxpl_id, uint32_t count, + H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], + void *bufs[] /* out */) +{ + H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + hid_t h5_fid; + + FUNC_ENTER_STATIC + + /* Check arguments + * RAW - Do we really need to check arguments once again? + * These have already been checked in H5FD_subfiling_read_vector (see below)! 
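/*
 * Illustrative call shape only -- not part of this patch: two non-contiguous
 * raw-data reads issued through the vector entry point above.  'file' is an
 * H5FD_t opened with this driver, buf0/buf1 are caller-owned buffers, and
 * the addresses and sizes are made-up values; this static function is
 * normally reached through the VFD dispatch rather than called directly.
 */
#if 0
H5FD_mem_t types[2] = {H5FD_MEM_DRAW, H5FD_MEM_DRAW};
haddr_t    addrs[2] = {4096, 1048576};
size_t     sizes[2] = {512, 8192};
void      *bufs[2]  = {buf0, buf1};

if (H5FD__subfiling_read_vector(file, H5P_DEFAULT, 2, types, addrs, sizes, bufs) < 0)
    /* handle error */;
#endif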
+ */ + if(!file) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL") + + if((!types) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "types parameter can't be NULL if count is positive") + + if((!addrs) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addrs parameter can't be NULL if count is positive") + + if((!sizes) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sizes parameter can't be NULL if count is positive") + + if((!bufs) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs parameter can't be NULL if count is positive") + + /* Get the default dataset transfer property list if the user didn't provide one */ + if(H5P_DEFAULT == dxpl_id) { + dxpl_id = H5P_DATASET_XFER_DEFAULT; + } else { + if(TRUE != H5P_isa_class(dxpl_id, H5P_DATASET_XFER)) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list") + } + + /* Set DXPL for operation */ + H5CX_set_dxpl(dxpl_id); + h5_fid = (hid_t)file->inode; + if(sf_read_vector(h5_fid, count, (hsize_t *)addrs, (hsize_t *)sizes, bufs) != SUCCEED) + HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "file vector write request failed") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} + + +/*------------------------------------------------------------------------- + * Function: H5FDsubfile__write_vector (internal function) + * + * Purpose: Perform count writes to the specified file at the offsets + * provided in the addrs array. Lengths and memory + * types provided in the sizes and types arrays. Data to be + * written is referenced by the bufs array. + * + * All writes are done according to the data transfer property + * list dxpl_id (which may be the constant H5P_DEFAULT). + * + * Return: Success: SUCCEED + * All writes have completed successfully. + * + * Failure: FAIL + * An internal error was encountered, e.g the + * input arguments are not valid, or the actual + * subfiling writes have failed for some reason. + * + * Programmer: JRM -- 6/10/20 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__subfiling_write_vector(H5FD_t *_file, hid_t dxpl_id, uint32_t count, + H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], + void *bufs[] /* in */) +{ + H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + hid_t h5_fid; + + FUNC_ENTER_STATIC + + HDassert(file != NULL); /* sanity check */ + + /* Check arguments + * RAW - Do we really need to check arguments once again? + * These have already been checked in H5FD_subfiling_write_vector (see below)! 
+ */ + if(!file) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL") + + if((!types) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "types parameter can't be NULL if count is positive") + + if((!addrs) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addrs parameter can't be NULL if count is positive") + + if((!sizes) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sizes parameter can't be NULL if count is positive") + + if((!bufs) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs parameter can't be NULL if count is positive") + + /* Get the default dataset transfer property list if the user didn't provide one */ + if(H5P_DEFAULT == dxpl_id) { + dxpl_id = H5P_DATASET_XFER_DEFAULT; + } else { + if(TRUE != H5P_isa_class(dxpl_id, H5P_DATASET_XFER)) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list") + } + + /* Set DXPL for operation */ + H5CX_set_dxpl(dxpl_id); + h5_fid = (hid_t)file->inode; + if(sf_write_vector(h5_fid, count, (hsize_t *)addrs, (hsize_t *)sizes, bufs) != SUCCEED) + HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "file vector write request failed") + +done: + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FDsubfile__write_vector() */ + + /*------------------------------------------------------------------------- * Function: H5FD_subfiling_truncate * - * Purpose: Makes sure that the true file size is the same (or larger) - * than the end-of-address. + * Purpose: Makes sure that the true file size is the same as + * the end-of-allocation. * * Return: SUCCEED/FAIL * @@ -1183,11 +1348,12 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD_subfiling_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, +H5FD_subfiling_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing) { H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; herr_t ret_value = SUCCEED; /* Return value */ + int mpi_enabled = 0; FUNC_ENTER_NOAPI_NOINIT @@ -1195,6 +1361,11 @@ H5FD_subfiling_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, /* Extend the file to make sure it's large enough */ if(!H5F_addr_eq(file->eoa, file->eof)) { + if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) { + hid_t h5_fid = (hid_t)file->inode; + if (mpi_enabled && (sf_truncate(h5_fid, file->eof) < 0)) + HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to extend file properly") + } #ifdef H5_HAVE_WIN32_API LARGE_INTEGER li; /* 64-bit (union) integer for SetFilePointer() call */ DWORD dwPtrLow; /* Low-order pointer bits from SetFilePointer() @@ -1243,9 +1414,9 @@ done: * Function: H5FD_subfiling_lock * * Purpose: To place an advisory lock on a file. 
- * The lock type to apply depends on the parameter "rw": - * TRUE--opens for write: an exclusive lock - * FALSE--opens for read: a shared lock + * The lock type to apply depends on the parameter "rw": + * TRUE--opens for write: an exclusive lock + * FALSE--opens for read: a shared lock * * Return: SUCCEED/FAIL * @@ -1312,3 +1483,568 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5FD_subfiling_unlock() */ +herr_t +H5FD__get_file_ino(const char *name, uint64_t *st_ino) +{ + herr_t ret_value = SUCCEED; /* Return value */ + h5_stat_t sb; + + FUNC_ENTER_PACKAGE + + if(HDstat(name, &sb) < 0) + HSYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, FAIL, "unable to fstat file") + + *st_ino = sb.st_ino; +done: + + FUNC_LEAVE_NOAPI(ret_value) +} + +static +herr_t create_simple_vector( hid_t file_space_id, void *memDataBuf, haddr_t addrBase, hssize_t elements, size_t type_extent, hssize_t *vlen, hsize_t **_offsets, hsize_t **_blocklens, void ***_bufs ) +{ + int n_dims = H5Sget_simple_extent_ndims(file_space_id); + hsize_t *offsets = *_offsets; + hsize_t *blocklens = *_blocklens; + void **bufs = *_bufs; + void *nextBuf = memDataBuf; + + assert(vlen); + assert(_offsets); + assert(_blocklens); + assert(_bufs); + + if (n_dims > 0) { + hsize_t simple_dims[n_dims]; + hsize_t stride[n_dims]; + if (H5Sget_simple_extent_dims(file_space_id, simple_dims, stride) < 0) { + puts("H5Sget_simple_extent_dims returned an error"); + return -1; + } + + if (*vlen < 0) { + offsets = (hsize_t *)malloc((sizeof(haddr_t))); + assert(offsets); + + blocklens = (hsize_t *)malloc((sizeof(hsize_t))); + assert(blocklens); + + bufs = (void **)malloc((sizeof(void **))); + assert(bufs); + } + bufs[0] = nextBuf; + offsets[0] = addrBase; + blocklens[0] = (hsize_t )((hsize_t)elements * type_extent); + + if (*vlen < 0) { + *_offsets = offsets; + *_blocklens = blocklens; + *_bufs = bufs; + } + *vlen = 1; + return 0; + } + return -1; +} + + +static +herr_t create_vector_from_hyperslab( hid_t file_space_id, void *memDataBuf, haddr_t addrBase, size_t type_extent, hssize_t *vlen, hsize_t **_offsets, hsize_t **_blocklens, void ***_bufs ) +{ + herr_t ret_value = SUCCEED; + hssize_t k, n_blocks = H5Sget_select_hyper_nblocks(file_space_id); + + void *nextBuf = memDataBuf; + + hsize_t stride[H5S_MAX_RANK]; + hsize_t count[H5S_MAX_RANK]; + + hsize_t *strides = stride; + hsize_t *counts = count; + + hsize_t *offsets = *_offsets; + hsize_t *blocklens = *_blocklens; + void **bufs = *_bufs; + + assert(vlen); + assert(_offsets); + assert(_blocklens); + assert(_bufs); + assert(n_blocks > 0); + + if (n_blocks > H5S_MAX_RANK) { + /* Allocate a temp for the H5Sget_regular_hyperslab function call */ + if ((strides = (hsize_t *)malloc((size_t)n_blocks * sizeof(hsize_t))) == NULL) { + perror("unable to allocate storage for vector creation"); + return -1; + } + if ((counts = (hsize_t *)malloc((size_t)n_blocks * sizeof(hsize_t))) == NULL) { + perror("unable to allocate storage for vector creation"); + return -1; + } + } + + /* Allocate storage for the vector elements */ + if (*vlen < n_blocks) { + if (offsets) { + offsets = (hsize_t *)realloc(offsets, ((size_t)n_blocks * sizeof(haddr_t))); + } else { + offsets = (hsize_t *)malloc(((size_t)n_blocks * sizeof(haddr_t))); + } + assert(offsets); + if (blocklens) { + blocklens = (hsize_t *)realloc(blocklens, ((size_t)n_blocks * sizeof(hsize_t))); + } else { + blocklens = (hsize_t *)malloc(((size_t)n_blocks * sizeof(hsize_t))); + } + assert(blocklens); + if (bufs) { + bufs = (void **)realloc(bufs, ((size_t)n_blocks * sizeof(void 
**))); + } else { + bufs = (void **)malloc(((size_t)n_blocks * sizeof(void **))); + } + assert(bufs); + *vlen = n_blocks; + } + /* Fill vector elements */ + if ((ret_value = H5Sget_regular_hyperslab(file_space_id, offsets, strides, counts, blocklens)) < 0) { + puts("H5Sget_regular_hyperslab failed"); + return -1; + } + + for(k=0; k < n_blocks; k++) { + bufs[k] = nextBuf; + offsets[k] *= type_extent; + offsets[k] += addrBase; + blocklens[k] *= type_extent; + nextBuf += (strides[k] * type_extent); + } + if (strides != stride) + free(strides); + if (counts != count) + free(counts); + + *_offsets = offsets; + *_blocklens = blocklens; + *_bufs = bufs; + + return ret_value; +} + + +static +herr_t check_dims(int ndims, hsize_t *mem_dims, hsize_t *file_dims, int *diff_index) +{ + int i; + herr_t ret_value = SUCCEED; + for(i=0; i < ndims; i++) { + if (mem_dims[i] != file_dims[i]) { + *diff_index = i; + return 0; + } + } + /* ndims +1 == no differences */ + *diff_index = i; + return ret_value; +} + +static +haddr_t get_data_offset(int mpi_rank, int mpi_size, size_t dtype_extent, const H5S_t *mem_space, const H5S_t *file_space) +{ + haddr_t this_base = 0; + return this_base; +} + + + +static +haddr_t get_base_offset(int mpi_rank, int mpi_size, hid_t mem_space_id, hid_t file_space_id) +{ + haddr_t this_base = 0; + int n_dims; + int is_simple = H5Sis_simple(file_space_id); + /* The 'is_simple' variable is actually a tri value type: + * -1 == failed + * 0 == NOT_SIMPLE + * 1 == SIMPLE + */ + if (is_simple > 0) { + n_dims = H5Sget_simple_extent_ndims(mem_space_id); + if (n_dims > 0) { + hsize_t mem_stride[n_dims]; + hsize_t mem_dims[n_dims]; + hsize_t file_stride[n_dims]; + hsize_t file_dims[n_dims]; + hsize_t total_size; + if (H5Sget_simple_extent_dims(mem_space_id, mem_dims, mem_stride) < 0) + puts("H5Sget_simple_extent_dims returned an error"); + if (H5Sget_simple_extent_dims(file_space_id, file_dims, file_stride) < 0) + puts("H5Sget_simple_extent_dims returned an error"); + + if (n_dims == 1) { + if ((total_size = mem_dims[0] * (hsize_t)mpi_size) == file_dims[0]) { + this_base = (mem_dims[0] * (hsize_t)mpi_rank); + } + } + else { + int diff_index = -1; + if (check_dims(n_dims, mem_dims, file_dims, &diff_index) < 0) + puts("check_dims returned an error"); + if ((total_size = mem_dims[diff_index] * (hsize_t)mpi_size) == file_dims[diff_index]) { + this_base = (mem_dims[diff_index] * (hsize_t)mpi_rank); + } + } + } + } + + return this_base; +} + + + +herr_t +H5FD__dataset_write_contiguous(hid_t h5_file_id, haddr_t dataset_baseAddr, size_t dtype_extent, + int mpi_rank, int mpi_size, void *_dset, hid_t mem_type_id, + hid_t mem_space_id, hid_t file_space_id, hid_t plist_id, const void *buf) +{ + H5D_t *dset = (H5D_t *)_dset; + herr_t ret_value = SUCCEED; /* Return value */ + hssize_t num_elem_file = -1, num_elem_mem = -1; + H5S_sel_type sel_type; + hsize_t mem_nelem, file_nelem; + const H5S_t *mem_space; + const H5S_t *file_space; + + FUNC_ENTER_PACKAGE + + if((num_elem_file = H5Sget_select_npoints(file_space_id)) < 0) + puts("can't get number of points in file selection"); + if((num_elem_mem = H5Sget_select_npoints(mem_space_id)) < 0) + puts("can't get number of points in memory selection"); + + if(num_elem_file != num_elem_mem) + puts("number of elements selected in file and memory dataspaces is different"); + + if (H5S_get_validated_dataspace(mem_space_id, &mem_space) < 0) { + puts("could not get a validated dataspace from mem_space_id"); + } + else mem_nelem = mem_space->extent.nelem; + if 
(H5S_get_validated_dataspace(file_space_id, &file_space) < 0) { + puts("could not get a validated dataspace from file_space_id"); + } + else file_nelem = file_space->extent.nelem; + + if (num_elem_file > 0) { + sel_type = H5Sget_select_type(file_space_id); + switch (sel_type) { + case H5S_SEL_NONE: + // printf("[%d] H5S_SEL_NONE\n", mpi_rank); + break; + case H5S_SEL_POINTS: + { + haddr_t rank_baseAddr; + rank_baseAddr = get_base_offset(mpi_rank, mpi_size, mem_space_id, file_space_id); + rank_baseAddr += dataset_baseAddr; + // printf("[%d] H5S_SEL_POINTS - num_elem_file: %lld: UNSUPPORTED (for now)\n", mpi_rank, num_elem_file); + ret_value = -1; + goto done; + + break; + } + case H5S_SEL_HYPERSLABS: + { + int status; + haddr_t rank_baseAddr; +#if 0 + rank_baseAddr = get_data_offset(mpi_rank, mpi_size, dtype_extent, mem_space, file_space); + +#else + rank_baseAddr = get_base_offset(mpi_rank, mpi_size, mem_space_id, file_space_id); + rank_baseAddr += dataset_baseAddr; +#endif + // printf("[%d] H5S_SEL_HYPERSLABS, file_offset = %lld\n", mpi_rank, rank_baseAddr ); + if ((status = H5Sis_regular_hyperslab(file_space_id)) < 0) { + puts("H5Sis_regular_hyperslab returned an error"); + ret_value = -1; + goto done; + } + if (status > 0) { + hssize_t previous_vlen = sf_vlen; + if ((mem_space->extent.rank == 1)) { + if (sf_offsets == NULL) + sf_offsets = (hsize_t *)malloc(sizeof(hsize_t)); + if (sf_sizes == NULL) + sf_sizes = (hsize_t *)malloc(sizeof(hsize_t)); + if (sf_bufs == NULL) + sf_bufs = (void **)malloc(sizeof(void *)); + sf_vlen = 1; + assert(sf_offsets); + assert(sf_sizes); + assert(sf_bufs); + + sf_offsets[0] = rank_baseAddr; + sf_sizes[0] = num_elem_mem * dtype_extent; + sf_bufs[0] = buf; + } + else if (create_vector_from_hyperslab(file_space_id, buf, rank_baseAddr, dtype_extent, + &sf_vlen, &sf_offsets, &sf_sizes, &sf_bufs) < 0) { + puts("Unable to create vectors"); + ret_value = -1; + goto done; + } + ret_value = sf_write_vector(h5_file_id, sf_vlen, sf_offsets, sf_sizes, sf_bufs); + + /* Possibly restore the sf_vlen value to accurately reflect the malloc sizes */ + if (sf_vlen < previous_vlen) + sf_vlen = previous_vlen; + } + break; + } + case H5S_SEL_ALL: + { + int status; + haddr_t rank_baseAddr; + rank_baseAddr = get_base_offset(mpi_rank, mpi_size, mem_space_id, file_space_id); + rank_baseAddr += dataset_baseAddr; + // printf("[%d] H5S_SEL_ALL\n", mpi_rank); + status = H5Sis_simple(file_space_id); + if (status > 0) { + if (create_simple_vector(file_space_id, buf, rank_baseAddr, num_elem_mem, + dtype_extent, &sf_vlen, &sf_offsets, &sf_sizes, &sf_bufs) < 0) { + puts("Unable to create simple vectors"); + goto done; + } + ret_value = sf_write_vector(h5_file_id, sf_vlen, sf_offsets, sf_sizes, sf_bufs); + } + break; + } + default: + printf("[%d] UNSUPPORTED selection type\n", mpi_rank); + ret_value = -1; + } /* END switch (sel_type) */ + + } /* if (num_elem_file > 0) */ + +done: + + FUNC_LEAVE_NOAPI(ret_value) +} + +herr_t +H5FD__dataset_read_contiguous(hid_t h5_file_id, haddr_t dataset_baseAddr, size_t dtype_extent, + int mpi_rank, int mpi_size, void *_dset, hid_t mem_type_id, + hid_t mem_space_id, hid_t file_space_id, hid_t plist_id, void *buf) +{ + H5FD_t *dset = (H5FD_t *)_dset; + herr_t ret_value = SUCCEED; /* Return value */ + hssize_t num_elem_file = -1, num_elem_mem = -1; + H5S_sel_type sel_type; + + FUNC_ENTER_PACKAGE + if((num_elem_file = H5Sget_select_npoints(file_space_id)) < 0) + puts("can't get number of points in file selection"); + if((num_elem_mem = 
H5Sget_select_npoints(mem_space_id)) < 0) + puts("can't get number of points in memory selection"); + + if(num_elem_file != num_elem_mem) + puts("number of elements selected in file and memory dataspaces is different"); + + if (num_elem_file > 0) { + sel_type = H5Sget_select_type(file_space_id); + switch (sel_type) { + case H5S_SEL_NONE: + // printf("[%d] H5S_SEL_NONE\n", mpi_rank); + break; + case H5S_SEL_POINTS: + { + int status; + haddr_t rank_baseAddr; + rank_baseAddr = get_base_offset(mpi_rank, mpi_size, mem_space_id, file_space_id); + rank_baseAddr += dataset_baseAddr; + // printf("[%d] H5S_SEL_POINTS - num_elem_file: %lld: UNSUPPORTED (for now)\n", mpi_rank, num_elem_file); + ret_value = -1; + goto done; + + break; + } + case H5S_SEL_HYPERSLABS: + { + int status; + haddr_t rank_baseAddr; + const H5S_t *mem_space; + const H5S_t *file_space; + rank_baseAddr = get_base_offset(mpi_rank, mpi_size, mem_space_id, file_space_id); + rank_baseAddr += dataset_baseAddr; + if (H5S_get_validated_dataspace(mem_space_id, &mem_space) < 0) { + puts("could not get a validated dataspace from mem_space_id"); + } + if (H5S_get_validated_dataspace(file_space_id, &file_space) < 0) { + puts("could not get a validated dataspace from file_space_id"); + } + + // printf("[%d] H5S_SEL_HYPERSLABS, file_offset = %lld\n", mpi_rank, rank_baseAddr ); + if ((status = H5Sis_regular_hyperslab(file_space_id)) < 0) { + puts("H5Sis_regular_hyperslab returned an error"); + ret_value = -1; + goto done; + } + if (status > 0) { + hssize_t previous_vlen = sf_vlen; + if (mem_space->extent.rank == 1) { + if (sf_offsets == NULL) + sf_offsets = (hsize_t *)malloc(sizeof(hsize_t)); + if (sf_sizes == NULL) + sf_sizes = (hsize_t *)malloc(sizeof(hsize_t)); + if (sf_bufs == NULL) + sf_bufs = (void **)malloc(sizeof(void *)); + sf_vlen = 1; + assert(sf_offsets); + assert(sf_sizes); + assert(sf_bufs); + + sf_offsets[0] = rank_baseAddr; + sf_sizes[0] = num_elem_mem * dtype_extent; + sf_bufs[0] = buf; + } + else if (create_vector_from_hyperslab(file_space_id, buf, rank_baseAddr, dtype_extent, + &sf_vlen, &sf_offsets, &sf_sizes, &sf_bufs) < 0) { + puts("Unable to create vectors"); + ret_value = -1; + goto done; + } + ret_value = sf_read_vector(h5_file_id, sf_vlen, sf_offsets, sf_sizes, sf_bufs); + + /* Possibly restore the sf_vlen value to accurately reflect the malloc sizes */ + if (sf_vlen < previous_vlen) + sf_vlen = previous_vlen; + } + break; + } + case H5S_SEL_ALL: + { + int status; + haddr_t rank_baseAddr; + rank_baseAddr = get_base_offset(mpi_rank, mpi_size, mem_space_id, file_space_id); + rank_baseAddr += dataset_baseAddr; + // printf("[%d] H5S_SEL_ALL\n", mpi_rank); + status = H5Sis_simple(file_space_id); + if (status > 0) { + if (create_simple_vector(file_space_id, buf, rank_baseAddr, num_elem_mem, + dtype_extent, &sf_vlen, &sf_offsets, &sf_sizes, &sf_bufs) < 0) { + puts("Unable to create simple vectors"); + goto done; + } + ret_value = sf_read_vector(h5_file_id, sf_vlen, sf_offsets, sf_sizes, sf_bufs); + } + break; + } + default: + printf("[%d] UNSUPPORTED selection type\n", mpi_rank); + ret_value = -1; + } /* END switch (sel_type) */ + + } /* if (num_elem_file > 0) */ + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} + +static int +H5FD_subfiling_mpi_rank(const H5FD_t *_file) +{ + const H5FD_subfiling_t *file = (const H5FD_subfiling_t*)_file; + + FUNC_ENTER_STATIC_NOERR + + /* Sanity checks */ + HDassert(file); + + FUNC_LEAVE_NOAPI(file->mpi_rank) +} /* end H5FD__mpio_mpi_rank() */ + + 
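[Editorial illustration -- not part of this patch. The vector and contiguous-dataset helpers above all funnel into the sf_open_subfiles()/sf_write_vector()/sf_close_subfiles() calls declared in H5FDsubfiling.h further down. The following minimal sketch shows how that API might be driven directly; it assumes the prototypes added by this change set, uses the same id for the subfiling context and the h5_fid argument (as the t_subfile_* tests below do), and every other name in it is illustrative.]

/* Illustrative sketch only -- assumes an already-initialized subfiling
 * context identified by 'context_id'.
 */
#include <fcntl.h>
#include "hdf5.h"
#include "H5FDsubfiling.h"

static herr_t
demo_subfile_vector_write(hid_t context_id, const char *h5_filename)
{
    int     buf_a[256] = {0};
    int     buf_b[256] = {0};
    hsize_t addrs[2]   = {0, 4096};                       /* byte offsets in the logical file */
    hsize_t sizes[2]   = {sizeof(buf_a), sizeof(buf_b)};  /* byte counts for each extent      */
    void   *bufs[2]    = {buf_a, buf_b};                  /* matching source buffers          */

    /* Open the subfiles backing 'h5_filename', push both extents in a
     * single vector call, then close the subfiles again.
     */
    if (sf_open_subfiles(context_id, (char *)h5_filename, NULL, O_CREAT | O_TRUNC | O_RDWR) < 0)
        return -1;
    if (sf_write_vector(context_id, (hssize_t)2, addrs, sizes, bufs) < 0)
        return -1;
    if (sf_close_subfiles(context_id) < 0)
        return -1;
    return 0;
}

[The driver's read/write_vector callbacks above do essentially this, except that the offsets, sizes, and buffers come either from the caller's addrs/sizes/bufs arrays or from a dataspace selection flattened by create_simple_vector()/create_vector_from_hyperslab().]
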
+/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_mpi_size + * + * Purpose: Returns the number of MPI processes + * + * Return: Success: non-negative + * Failure: negative + * + * Programmer: Quincey Koziol + * Thursday, May 16, 2002 + * + *------------------------------------------------------------------------- + */ +static int +H5FD_subfiling_mpi_size(const H5FD_t *_file) +{ + const H5FD_subfiling_t *file = (const H5FD_subfiling_t*)_file; + + FUNC_ENTER_STATIC_NOERR + + /* Sanity checks */ + HDassert(file); + + FUNC_LEAVE_NOAPI(file->mpi_size) +} /* end H5FD__subfiling_mpi_size() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_communicator + * + * Purpose: Returns the MPI communicator for the file. + * + * Return: Success: The communicator + * Failure: Can't fail + * + * Programmer: Robb Matzke + * Monday, August 9, 1999 + * + *------------------------------------------------------------------------- + */ +static MPI_Comm +H5FD_subfiling_communicator(const H5FD_t *_file) +{ + const H5FD_subfiling_t *file = (const H5FD_subfiling_t*)_file; + + FUNC_ENTER_STATIC_NOERR + + /* Sanity checks */ + HDassert(file); + + FUNC_LEAVE_NOAPI(file->comm) +} /* end H5FD__subfiling_communicator() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_get_info + * + * Purpose: Returns the file info of SUBFILING file driver. + * + * Returns: Non-negative if succeed or negative if fails. + * + * Programmer: John Mainzer + * April 4, 2017 + * + *------------------------------------------------------------------------- +*/ +static herr_t +H5FD_subfiling_get_info(H5FD_t *_file, void **mpi_info) +{ + H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + + if(!mpi_info) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "mpi info not valid") + + *mpi_info = &(file->info); + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5FD__subfiling_get_info() */ + diff --git a/src/H5FDsubfiling.h b/src/H5FDsubfiling.h index b4b3f1c..8113354 100644 --- a/src/H5FDsubfiling.h +++ b/src/H5FDsubfiling.h @@ -66,9 +66,25 @@ extern "C" { #endif H5_DLL hid_t H5FD_subfiling_init(void); -H5_DLL herr_t H5Pget_fapl_subfiling(hid_t fapl_id, - H5FD_subfiling_fapl_t *fa_out); +H5_DLL herr_t H5Pget_fapl_subfiling(hid_t fapl_id, H5FD_subfiling_fapl_t *fa_out); H5_DLL herr_t H5Pset_fapl_subfiling(hid_t fapl_id, H5FD_subfiling_fapl_t *fa); +H5_DLL herr_t H5FD__get_file_ino(const char *name, uint64_t *st_ino); +H5_DLL herr_t H5FD__dataset_write_contiguous(hid_t h5_file_id, haddr_t dataset_baseAddr, size_t dtype_extent, + int mpi_rank, int mpi_size, void *_dset, hid_t mem_type_id, hid_t mem_space_id, + hid_t file_space_id, hid_t plist_id, const void *buf); +H5_DLL herr_t H5FD__dataset_read_contiguous(hid_t h5_file_id, haddr_t dataset_baseAddr, size_t dtype_extent, + int mpi_rank, int mpi_size, void *_dset, hid_t mem_type_id, hid_t mem_space_id, + hid_t file_space_id, hid_t plist_id, void *buf); + +/* Copied from:: H5FDsubfile_private.h */ +H5_DLL int sf_open_subfiles(hid_t context_id, char *filename, char *prefix, int flags); +H5_DLL int sf_close_subfiles(hid_t context_id); +H5_DLL int sf_read_independent(hid_t sf_fid, int64_t offset, int64_t elements, int dtype_extent, void *data); +H5_DLL int sf_write_independent(hid_t sf_fid, int64_t offset, int64_t elements, int dtype_extent, const void *data); +H5_DLL herr_t 
sf_read_vector(hid_t h5_fid, hssize_t count, hsize_t addrs[], hsize_t sizes[], void *bufs[] /* out */); +H5_DLL herr_t sf_write_vector(hid_t h5_fid, hssize_t count, hsize_t addrs[], hsize_t sizes[], void *bufs[] /* in */); +H5_DLL int sf_truncate(hid_t h5_fid, haddr_t addr); + #ifdef __cplusplus } diff --git a/src/Makefile.am b/src/Makefile.am index e1d8591..995af7b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -207,7 +207,7 @@ $(top_srcdir)/src/H5overflow.h: $(top_srcdir)/src/H5overflow.txt trace: $(libhdf5_la_SOURCES) @for dep in $? dummy; do \ - if test $$dep != "dummy" -a -n "$(PERL)"; then \ + if test $$dep != "dummy" -a -n "$(PERL)"; then \ case "$$dep" in \ *.c) \ $(TRACE) $$dep; \ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 311d753..7da92cd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -145,6 +145,7 @@ if (BUILD_SHARED_LIBS) #----------------------------------------------------------------------------- set (VOL_PLUGIN_LIBS null_vol_connector + h5subfiling_vol ) foreach (vol_lib ${VOL_PLUGIN_LIBS}) diff --git a/test/Makefile.am b/test/Makefile.am index 7ebeae7..8e6a900 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -116,7 +116,8 @@ if HAVE_SHARED_CONDITIONAL # The libh5test library provides common support code for the tests. # The filter_plugin* libraries are for use in filter_plugin.c. # Build them as shared libraries if that option was enabled in configure. - noinst_LTLIBRARIES=libh5test.la libfilter_plugin1_dsets.la libfilter_plugin2_dsets.la libfilter_plugin3_dsets.la libfilter_plugin4_groups.la libnull_vol_connector.la + noinst_LTLIBRARIES=libh5test.la libfilter_plugin1_dsets.la libfilter_plugin2_dsets.la libfilter_plugin3_dsets.la libfilter_plugin4_groups.la \ + libnull_vol_connector.la libh5subfiling_vol.la libfilter_plugin1_dsets_la_SOURCES=filter_plugin1_dsets.c libfilter_plugin2_dsets_la_SOURCES=filter_plugin2_dsets.c libfilter_plugin3_dsets_la_SOURCES=filter_plugin3_dsets.c @@ -133,7 +134,8 @@ if HAVE_SHARED_CONDITIONAL # null_vol_connector is used for testing basic VOL plugin functionality. libnull_vol_connector_la_SOURCES=null_vol_connector.c libnull_vol_connector_la_LDFLAGS=$(AM_LDFLAGS) -avoid-version -module -shared -export-dynamic -rpath /nowhere - + libh5subfiling_vol_la_SOURCES=h5subfiling_vol.c + libh5subfiling_vol_la_LDFLAGS=$(AM_LDFLAGS) -avoid-version -module -shared -export-dynamic -rpath /nowhere else # The libh5test library provides common support code for the tests. 
noinst_LTLIBRARIES=libh5test.la diff --git a/test/tselect.c b/test/tselect.c index c98db5d..305d660 100644 --- a/test/tselect.c +++ b/test/tselect.c @@ -14915,8 +14915,8 @@ test_sel_iter(void) { hid_t sid; /* Dataspace ID */ hid_t iter_id; /* Dataspace selection iterator ID */ - hsize_t dims1[] = {6, 12}; /* 2-D Dataspace dimensions */ - hsize_t coord1[POINT1_NPOINTS][2]; /* Coordinates for point selection */ + hsize_t dims1[] = {6, 12}; /* 2-D Dataspace dimensions */ + hsize_t coord1[POINT1_NPOINTS][2]; /* Coordinates for point selection */ hsize_t start[2]; /* Hyperslab start */ hsize_t stride[2]; /* Hyperslab stride */ hsize_t count[2]; /* Hyperslab block count */ @@ -14928,7 +14928,7 @@ test_sel_iter(void) H5S_sel_type sel_type; /* Selection type */ unsigned sel_share; /* Whether to share selection with dataspace */ unsigned sel_iter_flags; /* Flags for selection iterator creation */ - herr_t ret; /* Generic return value */ + herr_t ret; /* Generic return value */ /* Output message about test being performed */ MESSAGE(6, ("Testing Dataspace Selection Iterators\n")); diff --git a/test/vfd.c b/test/vfd.c index 8b59341..4fe229d 100644 --- a/test/vfd.c +++ b/test/vfd.c @@ -12,7 +12,7 @@ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ /* - * Programmer: Raymond Lu + * Programmer: Raymond Lu * Tuesday, Sept 24, 2002 * * Purpose: Tests the basic features of Virtual File Drivers @@ -2258,51 +2258,46 @@ static int compare_splitter_config_info(hid_t fapl_id, H5FD_splitter_vfd_config_t *info) { int ret_value = 0; - H5FD_splitter_vfd_config_t *fetched_info = NULL; - - if (NULL == (fetched_info = HDcalloc(1, sizeof(H5FD_splitter_vfd_config_t)))) - SPLITTER_TEST_FAULT("memory allocation for fetched_info struct failed"); + H5FD_splitter_vfd_config_t fetched_info; - fetched_info->magic = H5FD_SPLITTER_MAGIC; - fetched_info->version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION; - fetched_info->rw_fapl_id = H5I_INVALID_HID; - fetched_info->wo_fapl_id = H5I_INVALID_HID; + fetched_info.magic = H5FD_SPLITTER_MAGIC; + fetched_info.version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION; + fetched_info.rw_fapl_id = H5I_INVALID_HID; + fetched_info.wo_fapl_id = H5I_INVALID_HID; - if (H5Pget_fapl_splitter(fapl_id, fetched_info) < 0) { - SPLITTER_TEST_FAULT("can't get splitter info"); + if (H5Pget_fapl_splitter(fapl_id, &fetched_info) < 0) { + SPLITTER_TEST_FAULT("can't get splitter info\n"); } if (info->rw_fapl_id == H5P_DEFAULT) { - if (H5Pget_driver(fetched_info->rw_fapl_id) != H5Pget_driver(H5P_FILE_ACCESS_DEFAULT)) { + if (H5Pget_driver(fetched_info.rw_fapl_id) != H5Pget_driver(H5P_FILE_ACCESS_DEFAULT)) { SPLITTER_TEST_FAULT("Read-Write driver mismatch (default)\n"); } } else { - if (H5Pget_driver(fetched_info->rw_fapl_id) != H5Pget_driver(info->rw_fapl_id)) { + if (H5Pget_driver(fetched_info.rw_fapl_id) != H5Pget_driver(info->rw_fapl_id)) { SPLITTER_TEST_FAULT("Read-Write driver mismatch\n"); } } if (info->wo_fapl_id == H5P_DEFAULT) { - if (H5Pget_driver(fetched_info->wo_fapl_id) != H5Pget_driver(H5P_FILE_ACCESS_DEFAULT)) { + if (H5Pget_driver(fetched_info.wo_fapl_id) != H5Pget_driver(H5P_FILE_ACCESS_DEFAULT)) { SPLITTER_TEST_FAULT("Write-Only driver mismatch (default)\n"); } } else { - if (H5Pget_driver(fetched_info->wo_fapl_id) != H5Pget_driver(info->wo_fapl_id)) { + if (H5Pget_driver(fetched_info.wo_fapl_id) != H5Pget_driver(info->wo_fapl_id)) { SPLITTER_TEST_FAULT("Write-Only driver mismatch\n"); } } - if ( (HDstrlen(info->wo_path) != HDstrlen(fetched_info->wo_path)) || - 
HDstrncmp(info->wo_path, fetched_info->wo_path, H5FD_SPLITTER_PATH_MAX)) + if ( (HDstrlen(info->wo_path) != HDstrlen(fetched_info.wo_path)) || + HDstrncmp(info->wo_path, fetched_info.wo_path, H5FD_SPLITTER_PATH_MAX)) { - HDfprintf(stderr, "MISMATCH: '%s' :: '%s'\n", info->wo_path, fetched_info->wo_path); + HDfprintf(stderr, "MISMATCH: '%s' :: '%s'\n", info->wo_path, fetched_info.wo_path); HEXPRINT(H5FD_SPLITTER_PATH_MAX, info->wo_path); - HEXPRINT(H5FD_SPLITTER_PATH_MAX, fetched_info->wo_path); + HEXPRINT(H5FD_SPLITTER_PATH_MAX, fetched_info.wo_path); SPLITTER_TEST_FAULT("Write-Only file path mismatch\n"); } done: - HDfree(fetched_info); - return ret_value; } /* end compare_splitter_config_info() */ @@ -2336,42 +2331,37 @@ run_splitter_test(const struct splitter_dataset_def *data, hid_t space_id = H5I_INVALID_HID; hid_t fapl_id_out = H5I_INVALID_HID; hid_t fapl_id_cpy = H5I_INVALID_HID; - H5FD_splitter_vfd_config_t *vfd_config = NULL; - char *filename_rw = NULL; + H5FD_splitter_vfd_config_t vfd_config; + char filename_rw[H5FD_SPLITTER_PATH_MAX + 1]; FILE *logfile = NULL; int ret_value = 0; - if (NULL == (vfd_config = HDcalloc(1, sizeof(H5FD_splitter_vfd_config_t)))) - SPLITTER_TEST_FAULT("memory allocation for vfd_config struct failed"); - if (NULL == (filename_rw = HDcalloc(H5FD_SPLITTER_PATH_MAX + 1, sizeof(char)))) - SPLITTER_TEST_FAULT("memory allocation for filename_rw string failed"); + vfd_config.magic = H5FD_SPLITTER_MAGIC; + vfd_config.version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION; + vfd_config.ignore_wo_errs = ignore_wo_errors; + vfd_config.rw_fapl_id = sub_fapl_ids[0]; + vfd_config.wo_fapl_id = sub_fapl_ids[1]; - vfd_config->magic = H5FD_SPLITTER_MAGIC; - vfd_config->version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION; - vfd_config->ignore_wo_errs = ignore_wo_errors; - vfd_config->rw_fapl_id = sub_fapl_ids[0]; - vfd_config->wo_fapl_id = sub_fapl_ids[1]; - - if (splitter_prepare_file_paths(vfd_config, filename_rw) < 0) { + if (splitter_prepare_file_paths(&vfd_config, filename_rw) < 0) { SPLITTER_TEST_FAULT("can't prepare file paths\n"); } if (provide_logfile_path == FALSE) { - vfd_config->log_file_path[0] = '\0'; /* reset as empty string */ + *vfd_config.log_file_path = '\0'; /* reset as empty string */ } /* Create a new fapl to use the SPLITTER file driver */ if ((fapl_id = H5Pcreate(H5P_FILE_ACCESS)) == H5I_INVALID_HID) { SPLITTER_TEST_FAULT("can't create FAPL ID\n"); } - if (H5Pset_fapl_splitter(fapl_id, vfd_config) < 0) { + if (H5Pset_fapl_splitter(fapl_id, &vfd_config) < 0) { SPLITTER_TEST_FAULT("can't set splitter FAPL\n"); } if (H5Pget_driver(fapl_id) != H5FD_SPLITTER) { SPLITTER_TEST_FAULT("set FAPL not SPLITTER\n"); } - if (compare_splitter_config_info(fapl_id, vfd_config) < 0) { + if (compare_splitter_config_info(fapl_id, &vfd_config) < 0) { SPLITTER_TEST_FAULT("information mismatch\n"); } @@ -2384,7 +2374,7 @@ run_splitter_test(const struct splitter_dataset_def *data, if (H5I_INVALID_HID == fapl_id_cpy) { SPLITTER_TEST_FAULT("can't copy FAPL\n"); } - if (compare_splitter_config_info(fapl_id_cpy, vfd_config) < 0) { + if (compare_splitter_config_info(fapl_id_cpy, &vfd_config) < 0) { SPLITTER_TEST_FAULT("information mismatch\n"); } if (H5Pclose(fapl_id_cpy) < 0) { @@ -2411,7 +2401,7 @@ run_splitter_test(const struct splitter_dataset_def *data, if (H5Pget_driver(fapl_id_out) != H5FD_SPLITTER) { SPLITTER_TEST_FAULT("wrong file FAPL driver\n"); } - if (compare_splitter_config_info(fapl_id_out, vfd_config) < 0) { + if (compare_splitter_config_info(fapl_id_out, &vfd_config) < 0) 
{ SPLITTER_TEST_FAULT("information mismatch\n"); } if (H5Pclose(fapl_id_out) < 0) { @@ -2449,12 +2439,12 @@ run_splitter_test(const struct splitter_dataset_def *data, } /* Verify that the R/W and W/O files are identical */ - if (h5_compare_file_bytes(filename_rw, vfd_config->wo_path) < 0) { + if (h5_compare_file_bytes(filename_rw, vfd_config.wo_path) < 0) { SPLITTER_TEST_FAULT("files are not byte-for-byte equivalent\n"); } - /* Verify existence of logfile if appropriate */ - logfile = fopen(vfd_config->log_file_path, "r"); + /* Verify existence of logfile iff appropriate */ + logfile = fopen(vfd_config.log_file_path, "r"); if ( (TRUE == provide_logfile_path && NULL == logfile) || (FALSE == provide_logfile_path && NULL != logfile) ) { @@ -2464,22 +2454,19 @@ run_splitter_test(const struct splitter_dataset_def *data, done: if (ret_value < 0) { H5E_BEGIN_TRY { - H5Dclose(dset_id); - H5Sclose(space_id); - H5Pclose(fapl_id_out); - H5Pclose(fapl_id_cpy); - H5Pclose(fapl_id); - H5Fclose(file_id); + (void)H5Dclose(dset_id); + (void)H5Sclose(space_id); + (void)H5Pclose(fapl_id_out); + (void)H5Pclose(fapl_id_cpy); + (void)H5Pclose(fapl_id); + (void)H5Fclose(file_id); } H5E_END_TRY; } - - if (logfile != NULL) + if (logfile != NULL) { fclose(logfile); - - HDfree(vfd_config); - HDfree(filename_rw); - + } return ret_value; + } /* end run_splitter_test() */ @@ -2501,28 +2488,25 @@ done: static int driver_is_splitter_compatible(hid_t fapl_id) { - H5FD_splitter_vfd_config_t *vfd_config = NULL; + H5FD_splitter_vfd_config_t vfd_config; hid_t split_fapl_id = H5I_INVALID_HID; herr_t ret = SUCCEED; int ret_value = 0; - if (NULL == (vfd_config = HDcalloc(1, sizeof(H5FD_splitter_vfd_config_t)))) { - FAIL_PUTS_ERROR("memory allocation for vfd_config struct failed"); - } - - if(H5I_INVALID_HID == (split_fapl_id = H5Pcreate(H5P_FILE_ACCESS))) { + split_fapl_id = H5Pcreate(H5P_FILE_ACCESS); + if (H5I_INVALID_HID == split_fapl_id) { FAIL_PUTS_ERROR("Can't create contained FAPL"); } - vfd_config->magic = H5FD_SPLITTER_MAGIC; - vfd_config->version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION; - vfd_config->ignore_wo_errs = FALSE; - vfd_config->rw_fapl_id = H5P_DEFAULT; - vfd_config->wo_fapl_id = fapl_id; - HDstrncpy(vfd_config->wo_path, "nonesuch", H5FD_SPLITTER_PATH_MAX); - vfd_config->log_file_path[0] = '\0'; + vfd_config.magic = H5FD_SPLITTER_MAGIC; + vfd_config.version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION; + vfd_config.ignore_wo_errs = FALSE; + vfd_config.rw_fapl_id = H5P_DEFAULT; + vfd_config.wo_fapl_id = fapl_id; + HDstrncpy(vfd_config.wo_path, "nonesuch", H5FD_SPLITTER_PATH_MAX); + *vfd_config.log_file_path = '\0'; H5E_BEGIN_TRY { - ret = H5Pset_fapl_splitter(split_fapl_id, vfd_config); + ret = H5Pset_fapl_splitter(split_fapl_id, &vfd_config); } H5E_END_TRY; if (SUCCEED == ret) { ret_value = -1; @@ -2533,17 +2517,12 @@ driver_is_splitter_compatible(hid_t fapl_id) } split_fapl_id = H5I_INVALID_HID; - HDfree(vfd_config); - return ret_value; error: H5E_BEGIN_TRY { - H5Pclose(split_fapl_id); + (void)H5Pclose(split_fapl_id); } H5E_END_TRY; - - HDfree(vfd_config); - return -1; } /* end driver_is_splitter_compatible() */ @@ -2566,24 +2545,19 @@ splitter_RO_test( const struct splitter_dataset_def *data, hid_t child_fapl_id) { - char *filename_rw = NULL; - H5FD_splitter_vfd_config_t *vfd_config = NULL; + char filename_rw[H5FD_SPLITTER_PATH_MAX + 1]; + H5FD_splitter_vfd_config_t vfd_config; hid_t fapl_id = H5I_INVALID_HID; - hid_t file_id = H5I_INVALID_HID; int ret_value = 0; + hid_t file_id = H5I_INVALID_HID; - if (NULL == 
(vfd_config = HDcalloc(1, sizeof(H5FD_splitter_vfd_config_t)))) - SPLITTER_TEST_FAULT("memory allocation for vfd_config struct failed"); - if (NULL == (filename_rw = HDcalloc(H5FD_SPLITTER_PATH_MAX + 1, sizeof(char)))) - SPLITTER_TEST_FAULT("memory allocation for filename_rw string failed"); - - vfd_config->magic = H5FD_SPLITTER_MAGIC; - vfd_config->version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION; - vfd_config->ignore_wo_errs = FALSE; - vfd_config->rw_fapl_id = child_fapl_id; - vfd_config->wo_fapl_id = child_fapl_id; + vfd_config.magic = H5FD_SPLITTER_MAGIC; + vfd_config.version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION; + vfd_config.ignore_wo_errs = FALSE; + vfd_config.rw_fapl_id = child_fapl_id; + vfd_config.wo_fapl_id = child_fapl_id; - if (splitter_prepare_file_paths(vfd_config, filename_rw) < 0) { + if (splitter_prepare_file_paths(&vfd_config, filename_rw) < 0) { SPLITTER_TEST_FAULT("can't prepare splitter file paths\n"); } @@ -2592,7 +2566,7 @@ splitter_RO_test( if (H5I_INVALID_HID == fapl_id) { SPLITTER_TEST_FAULT("can't create FAPL ID\n"); } - if (H5Pset_fapl_splitter(fapl_id, vfd_config) < 0) { + if (H5Pset_fapl_splitter(fapl_id, &vfd_config) < 0) { SPLITTER_TEST_FAULT("can't set splitter FAPL\n"); } if (H5Pget_driver(fapl_id) != H5FD_SPLITTER) { @@ -2614,7 +2588,7 @@ splitter_RO_test( * Should fail. */ - if (splitter_create_single_file_at(vfd_config->wo_path, vfd_config->wo_fapl_id, data) < 0) { + if (splitter_create_single_file_at(vfd_config.wo_path, vfd_config.wo_fapl_id, data) < 0) { SPLITTER_TEST_FAULT("can't write W/O file\n"); } H5E_BEGIN_TRY { @@ -2623,13 +2597,13 @@ splitter_RO_test( if (file_id >= 0) { SPLITTER_TEST_FAULT("R/O open with extant W/O file unexpectedly successful\n"); } - HDremove(vfd_config->wo_path); + HDremove(vfd_config.wo_path); /* Attempt R/O open when only R/W file exists * Should fail. 
*/ - if (splitter_create_single_file_at(filename_rw, vfd_config->rw_fapl_id, data) < 0) { + if (splitter_create_single_file_at(filename_rw, vfd_config.rw_fapl_id, data) < 0) { SPLITTER_TEST_FAULT("can't create R/W file\n"); } H5E_BEGIN_TRY { @@ -2642,7 +2616,7 @@ splitter_RO_test( /* Attempt R/O open when both R/W and W/O files exist */ - if (splitter_create_single_file_at(vfd_config->wo_path, vfd_config->wo_fapl_id, data) < 0) { + if (splitter_create_single_file_at(vfd_config.wo_path, vfd_config.wo_fapl_id, data) < 0) { SPLITTER_TEST_FAULT("can't create W/O file\n"); } file_id = H5Fopen(filename_rw, H5F_ACC_RDONLY, fapl_id); @@ -2668,14 +2642,10 @@ splitter_RO_test( done: if (ret_value < 0) { H5E_BEGIN_TRY { - H5Pclose(fapl_id); - H5Fclose(file_id); + (void)H5Pclose(fapl_id); + (void)H5Fclose(file_id); } H5E_END_TRY; - } - - HDfree(vfd_config); - HDfree(filename_rw); - + } /* end if error */ return ret_value; } /* end splitter_RO_test() */ @@ -2814,9 +2784,9 @@ splitter_create_single_file_at( done: if (ret_value < 0) { H5E_BEGIN_TRY { - H5Dclose(dset_id); - H5Sclose(space_id); - H5Fclose(file_id); + (void)H5Dclose(dset_id); + (void)H5Sclose(space_id); + (void)H5Fclose(file_id); } H5E_END_TRY; } /* end if error */ return ret_value; @@ -2877,7 +2847,7 @@ splitter_compare_expected_data(hid_t file_id, done: if (ret_value < 0) { H5E_BEGIN_TRY { - H5Dclose(dset_id); + (void)H5Dclose(dset_id); } H5E_END_TRY; } return ret_value; @@ -2910,9 +2880,8 @@ done: static int splitter_tentative_open_test(hid_t child_fapl_id) { - const char *filename_tmp = "splitter_tmp.h5"; - char *filename_rw = NULL; - H5FD_splitter_vfd_config_t *vfd_config = NULL; + char filename_rw[H5FD_SPLITTER_PATH_MAX + 1]; + H5FD_splitter_vfd_config_t vfd_config; hid_t fapl_id = H5I_INVALID_HID; hid_t file_id = H5I_INVALID_HID; int buf[SPLITTER_SIZE][SPLITTER_SIZE]; /* for comparison */ @@ -2922,11 +2891,6 @@ splitter_tentative_open_test(hid_t child_fapl_id) struct splitter_dataset_def data; /* for comparison */ int ret_value = 0; - if (NULL == (vfd_config = HDcalloc(1, sizeof(H5FD_splitter_vfd_config_t)))) - SPLITTER_TEST_FAULT("memory allocation for vfd_config struct failed"); - if (NULL == (filename_rw = HDcalloc(H5FD_SPLITTER_PATH_MAX + 1, sizeof(char)))) - SPLITTER_TEST_FAULT("memory allocation for filename_rw string failed"); - /* pre-fill data buffer to write */ for (i=0; i < SPLITTER_SIZE; i++) { for (j=0; j < SPLITTER_SIZE; j++) { @@ -2941,13 +2905,13 @@ splitter_tentative_open_test(hid_t child_fapl_id) data.n_dims = 2; data.dset_name = SPLITTER_DATASET_NAME; - vfd_config->magic = H5FD_SPLITTER_MAGIC; - vfd_config->version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION; - vfd_config->ignore_wo_errs = FALSE; - vfd_config->rw_fapl_id = child_fapl_id; - vfd_config->wo_fapl_id = child_fapl_id; + vfd_config.magic = H5FD_SPLITTER_MAGIC; + vfd_config.version = H5FD_CURR_SPLITTER_VFD_CONFIG_VERSION; + vfd_config.ignore_wo_errs = FALSE; + vfd_config.rw_fapl_id = child_fapl_id; + vfd_config.wo_fapl_id = child_fapl_id; - if (splitter_prepare_file_paths(vfd_config, filename_rw) < 0) { + if (splitter_prepare_file_paths(&vfd_config, filename_rw) < 0) { SPLITTER_TEST_FAULT("can't prepare splitter file paths\n"); } @@ -2955,23 +2919,14 @@ splitter_tentative_open_test(hid_t child_fapl_id) if ((fapl_id = H5Pcreate(H5P_FILE_ACCESS)) == H5I_INVALID_HID) { SPLITTER_TEST_FAULT("can't create FAPL ID\n"); } - if (H5Pset_fapl_splitter(fapl_id, vfd_config) < 0) { + if (H5Pset_fapl_splitter(fapl_id, &vfd_config) < 0) { SPLITTER_TEST_FAULT("can't set 
splitter FAPL\n"); } if (H5Pget_driver(fapl_id) != H5FD_SPLITTER) { SPLITTER_TEST_FAULT("set FAPL not SPLITTER\n"); } - /* Create instance of file on disk. - * Will be copied verbatim as needed, to avoid issues where differences in - * the creation time would befoul comparisons. - */ - if (splitter_create_single_file_at(filename_tmp, child_fapl_id, &data) < 0) { - SPLITTER_TEST_FAULT("can't write W/O file\n"); - } - - /* - * H5Fopen() with RDWR access. + /* H5Fopen() with RDWR access. * Neither file exist already * Should fail. */ @@ -2985,18 +2940,17 @@ splitter_tentative_open_test(hid_t child_fapl_id) if (file_exists(filename_rw, child_fapl_id)) { SPLITTER_TEST_FAULT("R/W file unexpectedly created\n"); } - if (file_exists(vfd_config->wo_path, child_fapl_id)) { + if (file_exists(vfd_config.wo_path, child_fapl_id)) { SPLITTER_TEST_FAULT("W/O file unexpectedly created\n"); } - /* - * H5Fopen() with RDWR access. - * Only W/O file present. + /* H5Fopen() with RDWR access. + * W/O file exists already. * Should fail. */ - if (h5_duplicate_file_by_bytes(filename_tmp, vfd_config->wo_path) < 0) { - SPLITTER_TEST_FAULT("Can't create W/O file copy.\n"); + if (splitter_create_single_file_at(vfd_config.wo_path, child_fapl_id, &data) < 0) { + SPLITTER_TEST_FAULT("can't write W/O file\n"); } H5E_BEGIN_TRY { file_id = H5Fopen(filename_rw, H5F_ACC_RDWR, fapl_id); @@ -3007,22 +2961,21 @@ splitter_tentative_open_test(hid_t child_fapl_id) if (file_exists(filename_rw, child_fapl_id)) { SPLITTER_TEST_FAULT("R/W file unexpectedly created\n"); } - if (!file_exists(vfd_config->wo_path, child_fapl_id)) { + if (!file_exists(vfd_config.wo_path, child_fapl_id)) { SPLITTER_TEST_FAULT("W/O file mysteriously disappeared\n"); } - HDremove(vfd_config->wo_path); - if (file_exists(vfd_config->wo_path, child_fapl_id)) { + HDremove(vfd_config.wo_path); + if (file_exists(vfd_config.wo_path, child_fapl_id)) { SPLITTER_TEST_FAULT("failed to remove W/O file\n"); } - /* - * H5Fopen() with RDWR access. - * Only R/W file present. + /* H5Fopen() with RDWR access. + * R/W file exists already. * Should fail. */ - if (h5_duplicate_file_by_bytes(filename_tmp, filename_rw) < 0) { - SPLITTER_TEST_FAULT("Can't create R/W file copy.\n"); + if (splitter_create_single_file_at(filename_rw, child_fapl_id, &data) < 0) { + SPLITTER_TEST_FAULT("can't write R/W file\n"); } H5E_BEGIN_TRY { file_id = H5Fopen(filename_rw, H5F_ACC_RDWR, fapl_id); @@ -3033,17 +2986,16 @@ splitter_tentative_open_test(hid_t child_fapl_id) if (!file_exists(filename_rw, child_fapl_id)) { SPLITTER_TEST_FAULT("R/W file mysteriously disappeared\n"); } - if (file_exists(vfd_config->wo_path, child_fapl_id)) { + if (file_exists(vfd_config.wo_path, child_fapl_id)) { SPLITTER_TEST_FAULT("W/O file unexpectedly created\n"); } - /* - * H5Fopen() with RDWR access. - * Both files present. + /* H5Fopen() with RDWR access. + * Both files already exist. 
*/ - if (h5_duplicate_file_by_bytes(filename_tmp, vfd_config->wo_path) < 0) { - SPLITTER_TEST_FAULT("Can't create W/O file copy.\n"); + if (splitter_create_single_file_at(vfd_config.wo_path, child_fapl_id, &data) < 0) { + SPLITTER_TEST_FAULT("can't write W/O file\n"); } file_id = H5Fopen(filename_rw, H5F_ACC_RDWR, fapl_id); if (file_id == H5I_INVALID_HID) { @@ -3056,13 +3008,15 @@ splitter_tentative_open_test(hid_t child_fapl_id) if (!file_exists(filename_rw, child_fapl_id)) { SPLITTER_TEST_FAULT("R/W file mysteriously disappared\n"); } - if (!file_exists(vfd_config->wo_path, child_fapl_id)) { + if (!file_exists(vfd_config.wo_path, child_fapl_id)) { SPLITTER_TEST_FAULT("W/O file mysteriously disappeared\n"); } + if (h5_compare_file_bytes(filename_rw, vfd_config.wo_path) < 0) { + SPLITTER_TEST_FAULT("files are not byte-for-byte equivalent\n"); + } - /* - * H5Fcreate() with TRUNC access. - * Both files present. + /* H5Fcreate() with TRUNC access. + * Both files already exist. */ file_id = H5Fcreate(filename_rw, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); @@ -3076,24 +3030,24 @@ splitter_tentative_open_test(hid_t child_fapl_id) if (!file_exists(filename_rw, child_fapl_id)) { SPLITTER_TEST_FAULT("R/W file mysteriously disappared\n"); } - if (!file_exists(vfd_config->wo_path, child_fapl_id)) { + if (!file_exists(vfd_config.wo_path, child_fapl_id)) { SPLITTER_TEST_FAULT("W/O file mysteriously disappeared\n"); } - if (h5_compare_file_bytes(filename_rw, vfd_config->wo_path) < 0) { + if (h5_compare_file_bytes(filename_rw, vfd_config.wo_path) < 0) { SPLITTER_TEST_FAULT("files are not byte-for-byte equivalent\n"); } - HDremove(filename_rw); - HDremove(vfd_config->wo_path); - /* - * H5Fcreate() with TRUNC access. + /* H5Fcreate() with TRUNC access. * R/W already exists. */ - if (h5_duplicate_file_by_bytes(filename_tmp, filename_rw) < 0) { - SPLITTER_TEST_FAULT("Can't create R/W file copy.\n"); + HDremove(filename_rw); + HDremove(vfd_config.wo_path); + if (splitter_create_single_file_at(filename_rw, child_fapl_id, &data) < 0) { + SPLITTER_TEST_FAULT("can't write R/W file\n"); } - if (file_exists(vfd_config->wo_path, child_fapl_id)) { + + if (file_exists(vfd_config.wo_path, child_fapl_id)) { SPLITTER_TEST_FAULT("failed to remove W/O file\n"); } file_id = H5Fcreate(filename_rw, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); @@ -3107,23 +3061,23 @@ splitter_tentative_open_test(hid_t child_fapl_id) if (!file_exists(filename_rw, child_fapl_id)) { SPLITTER_TEST_FAULT("R/W file mysteriously disappared\n"); } - if (!file_exists(vfd_config->wo_path, child_fapl_id)) { + if (!file_exists(vfd_config.wo_path, child_fapl_id)) { SPLITTER_TEST_FAULT("W/O file mysteriously disappeared\n"); } - if (h5_compare_file_bytes(filename_rw, vfd_config->wo_path) < 0) { + if (h5_compare_file_bytes(filename_rw, vfd_config.wo_path) < 0) { SPLITTER_TEST_FAULT("files are not byte-for-byte equivalent\n"); } - HDremove(filename_rw); - HDremove(vfd_config->wo_path); - /* - * H5Fcreate() with TRUNC access. - * Only W/O present. + /* H5Fcreate() with TRUNC access. + * W/O already exists. 
*/ - if (h5_duplicate_file_by_bytes(filename_tmp, vfd_config->wo_path) < 0) { - SPLITTER_TEST_FAULT("Can't create W/O file copy.\n"); + HDremove(filename_rw); + HDremove(vfd_config.wo_path); + if (splitter_create_single_file_at(vfd_config.wo_path, child_fapl_id, &data) < 0) { + SPLITTER_TEST_FAULT("can't write R/W file\n"); } + if (file_exists(filename_rw, child_fapl_id)) { SPLITTER_TEST_FAULT("failed to remove R/W file\n"); } @@ -3138,19 +3092,16 @@ splitter_tentative_open_test(hid_t child_fapl_id) if (!file_exists(filename_rw, child_fapl_id)) { SPLITTER_TEST_FAULT("R/W file mysteriously disappared\n"); } - if (!file_exists(vfd_config->wo_path, child_fapl_id)) { + if (!file_exists(vfd_config.wo_path, child_fapl_id)) { SPLITTER_TEST_FAULT("W/O file mysteriously disappeared\n"); } - if (h5_compare_file_bytes(filename_rw, vfd_config->wo_path) < 0) { + if (h5_compare_file_bytes(filename_rw, vfd_config.wo_path) < 0) { SPLITTER_TEST_FAULT("files are not byte-for-byte equivalent\n"); } - HDremove(filename_rw); - HDremove(vfd_config->wo_path); /* H5Fcreate with both files absent is tested elsewhere */ - /* - * Cleanup + /* Cleanup */ if (H5Pclose(fapl_id) < 0) { @@ -3160,14 +3111,10 @@ splitter_tentative_open_test(hid_t child_fapl_id) done: if (ret_value < 0) { H5E_BEGIN_TRY { - H5Pclose(fapl_id); - H5Fclose(file_id); + (void)H5Pclose(fapl_id); + (void)H5Fclose(file_id); } H5E_END_TRY; - } - - HDfree(vfd_config); - HDfree(filename_rw); - + } /* end if error */ return ret_value; } /* end splitter_tentative_open_test() */ @@ -3204,7 +3151,7 @@ file_exists(const char *filename, hid_t fapl_id) error: H5E_BEGIN_TRY { - H5Fclose(file_id); + (void)H5Fclose(file_id); } H5E_END_TRY; return ret_value; } /* end file_exists() */ @@ -3278,6 +3225,7 @@ test_splitter(void) TEST_ERROR; } + /* Test file creation, utilizing different child FAPLs (default vs. * specified), logfile, and Write Channel error ignoring behavior. */ @@ -3303,6 +3251,7 @@ test_splitter(void) /* TODO: SWMR open? 
*/ /* Concurrent opens with both drivers using the Splitter */ + if (H5Pclose(child_fapl_id) == FAIL) { TEST_ERROR; } @@ -3311,9 +3260,9 @@ test_splitter(void) return 0; error: - if (child_fapl_id != H5I_INVALID_HID) - H5Pclose(child_fapl_id); - + if (child_fapl_id != H5I_INVALID_HID) { + (void)H5Pclose(child_fapl_id); + } return -1; } /* end test_splitter() */ @@ -4088,10 +4037,26 @@ test_subfiling(void) hid_t driver_id = -1; /* ID for this VFD */ unsigned long driver_flags = 0; /* VFD feature flags */ char filename[1024]; /* filename */ - void *os_file_handle = NULL; /* OS file handle */ - hsize_t file_size; /* file size */ H5FD_subfiling_fapl_t fa_in = {H5FD_CURR_SUBFILING_FAPL_T_VERSION}; H5FD_subfiling_fapl_t fa_out; + int require_mpi_finalize = 0; + + +#if defined(HAVE_SERIAL_SUBFILING) + void *os_file_handle = NULL; /* OS file handle */ + hsize_t file_size; /* file size */ +#else + int mpi_enabled; /* if MPI_Init has been called */ + if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) { + if (!mpi_enabled) { + int mpi_provides, require = MPI_THREAD_MULTIPLE; + if ((MPI_Init_thread(NULL, NULL, require, &mpi_provides)) != MPI_SUCCESS) { + TEST_ERROR; + } + require_mpi_finalize++; + } + } +#endif TESTING("subfiling file driver"); @@ -4124,7 +4089,6 @@ test_subfiling(void) if(!(driver_flags & H5FD_FEAT_AGGREGATE_SMALLDATA)) TEST_ERROR if(!(driver_flags & H5FD_FEAT_POSIX_COMPAT_HANDLE)) TEST_ERROR if(!(driver_flags & H5FD_FEAT_SUPPORTS_SWMR_IO)) TEST_ERROR - if(!(driver_flags & H5FD_FEAT_DEFAULT_VFD_COMPATIBLE)) TEST_ERROR /* Check for extra flags not accounted for above */ if(driver_flags != (H5FD_FEAT_AGGREGATE_METADATA @@ -4132,8 +4096,8 @@ test_subfiling(void) | H5FD_FEAT_DATA_SIEVE | H5FD_FEAT_AGGREGATE_SMALLDATA | H5FD_FEAT_POSIX_COMPAT_HANDLE - | H5FD_FEAT_SUPPORTS_SWMR_IO - | H5FD_FEAT_DEFAULT_VFD_COMPATIBLE)) + | H5FD_FEAT_SUPPORTS_SWMR_IO)) + TEST_ERROR if((fid = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id)) < 0) @@ -4158,6 +4122,7 @@ test_subfiling(void) if(H5Pclose(fapl_id_out) < 0) TEST_ERROR; +#if defined(HAVE_SERIAL_SUBFILING) /* Check that we can get an operating-system-specific handle from * the library. * @@ -4180,6 +4145,7 @@ test_subfiling(void) if(file_size < 1 * KB || file_size > 4 * KB) FAIL_PUTS_ERROR("suspicious file size obtained from H5Fget_filesize"); +#endif /* Close and delete the file */ if(H5Fclose(fid) < 0) @@ -4191,6 +4157,9 @@ test_subfiling(void) if(H5Pclose(fapl_id) < 0) TEST_ERROR; + if (require_mpi_finalize) + MPI_Finalize(); + PASSED(); return 0; @@ -4213,7 +4182,8 @@ error: * * Purpose: Tests the basic features of Virtual File Drivers * - * Return: EXIT_SUCCESS/EXIT_FAILURE + * Return: Success: 0 + * Failure: 1 * *------------------------------------------------------------------------- */ @@ -4249,11 +4219,11 @@ main(void) if(nerrors) { HDprintf("***** %d Virtual File Driver TEST%s FAILED! *****\n", nerrors, nerrors > 1 ? 
"S" : ""); - return EXIT_FAILURE; - } + return 1; + } /* end if */ HDprintf("All Virtual File Driver tests passed.\n"); - return EXIT_SUCCESS; + return 0; } /* end main() */ diff --git a/testpar/CMakeLists.txt b/testpar/CMakeLists.txt index c95e01f..67dcd00 100644 --- a/testpar/CMakeLists.txt +++ b/testpar/CMakeLists.txt @@ -74,9 +74,10 @@ set (H5P_TESTS t_init_term t_shapesame t_filters_parallel - t_2Gio t_subfile_openclose t_subfile_readwrite + t_subfile_bench +# t_subfile_bench_hdf ) foreach (h5_testp ${H5P_TESTS}) diff --git a/testpar/t_bigio.c b/testpar/t_bigio.c index f86852a..26ee15f 100644 --- a/testpar/t_bigio.c +++ b/testpar/t_bigio.c @@ -48,6 +48,9 @@ static int mpi_size_g, mpi_rank_g; hsize_t space_dim1 = SPACE_DIM1 * 256; // 4096 hsize_t space_dim2 = SPACE_DIM2; +extern void +set_verbose_flag(int subfile_rank, int new_value); + static void coll_chunktest(const char* filename, int chunk_factor, int select_factor, int api_option, int file_selection, int mem_selection, int mode); @@ -494,7 +497,6 @@ dataset_big_write(void) size_t num_points; B_DATATYPE * wdata; - /* allocate memory for data buffer */ wdata = (B_DATATYPE *)HDmalloc(bigcount*sizeof(B_DATATYPE)); VRFY_G((wdata != NULL), "wdata malloc succeeded"); @@ -516,7 +518,7 @@ dataset_big_write(void) /* Each process takes a slabs of rows. */ if (mpi_rank_g == 0) HDprintf("\nTesting Dataset1 write by ROW\n"); - /* Create a large dataset */ + /* Create a large dataset - global dims as follows:: */ dims[0] = bigcount; dims[1] = (hsize_t)mpi_size_g; @@ -528,6 +530,7 @@ dataset_big_write(void) block[0] = dims[0]/(hsize_t)mpi_size_g; block[1] = dims[1]; + printf("[%d] block[0] = %lld block[1] = %lld\n", mpi_rank_g, block[0], block[1]); stride[0] = block[0]; stride[1] = block[1]; count[0] = 1; @@ -776,6 +779,7 @@ dataset_big_write(void) VRFY_G((ret >= 0), "H5Dclose1 succeeded"); HDfree(wdata); + H5Fclose(fid); } @@ -1922,6 +1926,8 @@ do_express_test(int world_mpi_rank) int main(int argc, char **argv) { int ExpressMode = 0; + int mpi_provides, require = MPI_THREAD_MULTIPLE; + hsize_t newsize = 1048576; /* Set the bigio processing limit to be 'newsize' bytes */ hsize_t oldsize = H5_mpi_set_bigio_count(newsize); @@ -1934,8 +1940,10 @@ int main(int argc, char **argv) if (newsize != oldsize) { bigcount = newsize * 2; } - - MPI_Init(&argc, &argv); + if ( (MPI_Init_thread(&argc, &argv, require, &mpi_provides)) != MPI_SUCCESS) { + HDfprintf(stderr, "FATAL: Unable to initialize MPI\n"); + HDexit(EXIT_FAILURE); + } MPI_Comm_size(MPI_COMM_WORLD,&mpi_size_g); MPI_Comm_rank(MPI_COMM_WORLD,&mpi_rank_g); @@ -1945,7 +1953,7 @@ int main(int argc, char **argv) * calls. By then, MPI calls may not work. */ if (H5dont_atexit() < 0){ - HDprintf("Failed to turn off atexit processing. Continue.\n"); + HDprintf("Failed to turn off atexit processing. Continue.\n"); }; /* set alarm. 
*/ @@ -1953,6 +1961,8 @@ int main(int argc, char **argv) ExpressMode = do_express_test(mpi_rank_g); + set_verbose_flag(0, 1); + dataset_big_write(); MPI_Barrier(MPI_COMM_WORLD); @@ -1976,9 +1986,10 @@ int main(int argc, char **argv) /* turn off alarm */ ALARM_OFF; +#if 0 if (mpi_rank_g == 0) HDremove(FILENAME[0]); - +#endif /* close HDF5 library */ H5close(); diff --git a/testpar/t_subfile_openclose.c b/testpar/t_subfile_openclose.c index fe39f2c..8ccf3c7 100644 --- a/testpar/t_subfile_openclose.c +++ b/testpar/t_subfile_openclose.c @@ -1,6 +1,11 @@ #include +#include +#include +#include +#include +#include + #include "hdf5.h" -#include "H5FDsubfile_public.h" #include "mpi.h" @@ -11,7 +16,9 @@ main(int argc, char **argv) int i, mpi_size, mpi_rank; int loop_count = 20; int mpi_provides, require = MPI_THREAD_MULTIPLE; - hid_t subfile_id = -1; + hid_t subfile_id = 1; + const char *h5_filename = "unused.h5"; + FILE *h5file; MPI_Init_thread(&argc, &argv, require, &mpi_provides); MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); @@ -25,21 +32,22 @@ main(int argc, char **argv) } H5open(); - - if (H5FDsubfiling_init() == SUCCEED) { - subfile_id = get_subfiling_context(); - printf("[%d] subfile_id = %lx\n", mpi_rank, subfile_id); - } - else if (mpi_rank == 0) { - puts("Error: Unable to initialize subfiling!"); - } - + + h5file = fopen(h5_filename, "w+"); for(i=0; i < loop_count; i++) { - sf_open_subfiles(subfile_id, NULL, O_CREAT|O_TRUNC|O_RDWR); + if (mpi_rank == 0) { + printf("loop_count(%d)\n", i); + fflush(stdout); + } + sf_open_subfiles(subfile_id, h5_filename, NULL, O_CREAT|O_TRUNC|O_RDWR); sf_close_subfiles(subfile_id); } - H5FDsubfiling_finalize(); + if (h5file) { + fclose(h5file); + if (mpi_rank == 0) + unlink(h5_filename); + } MPI_Barrier(MPI_COMM_WORLD); diff --git a/testpar/t_subfile_readwrite.c b/testpar/t_subfile_readwrite.c index b4c798a..34f3281 100644 --- a/testpar/t_subfile_readwrite.c +++ b/testpar/t_subfile_readwrite.c @@ -1,6 +1,12 @@ #include +#include +#include +#include +#include +#include +#include + #include "hdf5.h" -#include "H5FDsubfile_public.h" #include "mpi.h" @@ -13,46 +19,59 @@ int mpi_size = -1; int mpi_rank = -1; -static int test_subfile_op(int op_type, hid_t subfile_id, int64_t offset, int64_t local_elements, void *local_data) +static int test_subfile_op(int op_type, hid_t subfile_id, char *prefix, int64_t offset, int64_t local_elements, void *local_data, int reporter) { int i, flags = O_RDWR; int errors = 0; int loop_count = 20; - int64_t local_data_size = local_elements * sizeof(int); + int64_t local_data_size = local_elements * (int64_t)sizeof(int); int64_t total_data_size = 0; + const char *h5_filename = "unused.h5"; + FILE *h5file; int (*subfile_ftn)(hid_t context_id,int64_t offset, int64_t elements, int dtype_extent, void *data) = sf_read_independent; double m_startTime, m_endTime; - double this_time, max_time, min_time, total_time, avg_time; - double bw; + double this_time, avg_time, max_time = 0.0, min_time = 0.0, total_time = 0.0; + double bw = 0.0; const char *OPERATION = "READ"; + if (op_type == WRITE_OP) { flags = O_CREAT|O_TRUNC|O_RDWR; - subfile_ftn = sf_write_independent; + subfile_ftn = (int (*)(long int, long int, long int, int, void *))sf_write_independent; OPERATION = "WRITE"; } + h5file = fopen(h5_filename, "w+"); + for(i=0; i < loop_count; i++) { - m_startTime = MPI_Wtime(); - if (sf_open_subfiles(subfile_id, NULL, flags) < 0) { + // if (mpi_rank == 0) set_verbose_flag(0, 1); + + if (sf_open_subfiles(subfile_id, h5_filename, prefix, flags) < 0) { 
puts("sf_open_subfiles returned an error!"); errors++; goto done; } + + m_startTime = MPI_Wtime(); + if (subfile_ftn(subfile_id, offset, local_elements, sizeof(int), local_data) < 0) { puts("subfile_ftn returned an error!"); errors++; goto done; } + m_endTime = MPI_Wtime(); + if (sf_close_subfiles(subfile_id) < 0) { puts("sf_close_subfiles returned an error!"); errors++; goto done; } - m_endTime = MPI_Wtime(); this_time = m_endTime - m_startTime; + + // if (mpi_rank == 0) set_verbose_flag(0, 0); + if (i == 0) { min_time = this_time; max_time = this_time; @@ -65,11 +84,17 @@ static int test_subfile_op(int op_type, hid_t subfile_id, int64_t offset, int64_ } total_time += this_time; } + if (h5file) { + fclose(h5file); + if (mpi_rank == 0) + unlink(h5_filename); + } + total_data_size = local_data_size * mpi_size; avg_time = total_time / (double) loop_count; bw = ((double)total_data_size)/ avg_time / (1024.0 * 1024.0); - if (mpi_rank == 0) { + if (mpi_rank == reporter) { printf("%s Perf: %lf BW/[MBs] %ld Bytes AvgTime[sec] %lf\n", OPERATION, bw, total_data_size, avg_time); fflush(stdout); } @@ -82,16 +107,14 @@ int main(int argc, char **argv) { int errors = 0; - int mpi_provides, require = MPI_THREAD_MULTIPLE; - hid_t subfile_id = -1; - double m_startTime, m_endTime; - double this_time, max_time, min_time, total_time, avg_time; - double bw; + int proc, mpi_provides, require = MPI_THREAD_MULTIPLE; + hid_t subfile_id = 1; int64_t local_elements = DATA_SIZE; int64_t local_data_size = 0; int64_t offset = 0; int *local_data = NULL; int *verify_data = NULL; + char *prefix = NULL; MPI_Init_thread(&argc, &argv, require, &mpi_provides); MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); @@ -103,14 +126,17 @@ main(int argc, char **argv) local_elements = check_value; } } + if (argc > 2) { + prefix = strdup(argv[2]); + } H5open(); - local_data_size = local_elements * sizeof(int); + local_data_size = local_elements * (int64_t)sizeof(int); local_data = (int *)malloc((size_t)local_data_size); if (local_data) { - int k, base = local_elements * mpi_rank; - offset = local_data_size * mpi_rank; + int k, base = (int)local_elements * mpi_rank; + offset = local_data_size * (int64_t)mpi_rank; for (k=0; k < local_elements; k++) { local_data[k] = k + base; } @@ -128,30 +154,21 @@ main(int argc, char **argv) goto done; } - if (H5FDsubfiling_init() == SUCCEED) { - subfile_id = get_subfiling_context(); - printf("[%d] subfile_id = %lx\n", mpi_rank, subfile_id); - } - else if (mpi_rank == 0) { - puts("Error: Unable to initialize subfiling!"); - errors++; - goto done; - } - - if (test_subfile_op( WRITE_OP, subfile_id, offset, local_elements, local_data)) { - puts("Subfile writing test returned an error!"); - errors++; - goto done; - } - if (test_subfile_op( READ_OP, subfile_id, offset, local_elements, verify_data)) { - puts("Subfile reading test returned an error!"); - errors++; - goto done; + for(proc=0; proc < 10; proc++) { + if (test_subfile_op( WRITE_OP, subfile_id, prefix, offset, local_elements, local_data, proc)) { + puts("Subfile writing test returned an error!"); + errors++; + goto done; + } + if (test_subfile_op( READ_OP, subfile_id, prefix, offset, local_elements, verify_data, proc)) { + puts("Subfile reading test returned an error!"); + errors++; + goto done; + } } done: - H5FDsubfiling_finalize(); MPI_Barrier(MPI_COMM_WORLD); if (local_data) { diff --git a/tools/lib/h5diff.c b/tools/lib/h5diff.c index 8324714..87a3b11 100644 --- a/tools/lib/h5diff.c +++ b/tools/lib/h5diff.c @@ -986,7 +986,7 @@ h5diff(const char *fname1, 
const char *fname2, const char *objname1, const char H5TOOLS_DEBUG("groups traversed - errstat:%d", opts->err_stat); #ifdef H5_HAVE_PARALLEL - if(g_Parallel && !g_CollectInfoOnly) { + if(g_Parallel) { int i; if((HDstrlen(fname1) > MAX_FILENAME) || (HDstrlen(fname2) > MAX_FILENAME)) { @@ -1001,11 +1001,6 @@ h5diff(const char *fname1, const char *fname2, const char *objname1, const char for(i = 1; i < g_nTasks; i++) MPI_Send(filenames, (MAX_FILENAME * 2), MPI_CHAR, i, MPI_TAG_PARALLEL, MPI_COMM_WORLD); } /* end if */ - else if (g_CollectInfoOnly) { - build_match_list (obj1fullname, info1_lp, obj2fullname, info2_lp, &match_list, opts); - - } - #endif H5TOOLS_DEBUG("build_match_list next - errstat:%d", opts->err_stat); diff --git a/tools/lib/h5tools_utils.c b/tools/lib/h5tools_utils.c index 63b3041..6167dd9 100644 --- a/tools/lib/h5tools_utils.c +++ b/tools/lib/h5tools_utils.c @@ -48,7 +48,6 @@ hsize_t H5TOOLS_BUFSIZE = ( 32 * 1024 * 1024); /* 32 MB */ /* ``parallel_print'' variables */ unsigned char g_Parallel = 0; /*0 for serial, 1 for parallel */ -unsigned char g_CollectInfoOnly = 0; char outBuff[OUTBUFF_SIZE]; unsigned outBuffOffset; FILE* overflow_file = NULL; diff --git a/tools/lib/h5tools_utils.h b/tools/lib/h5tools_utils.h index 2cd2eae..07069cc 100644 --- a/tools/lib/h5tools_utils.h +++ b/tools/lib/h5tools_utils.h @@ -32,7 +32,6 @@ extern "C" { H5TOOLS_DLLVAR int g_nTasks; H5TOOLS_DLLVAR unsigned char g_Parallel; -H5TOOLS_DLLVAR unsigned char g_CollectInfoOnly; H5TOOLS_DLLVAR char outBuff[]; H5TOOLS_DLLVAR unsigned outBuffOffset; H5TOOLS_DLLVAR FILE *overflow_file; diff --git a/tools/lib/h5trav.c b/tools/lib/h5trav.c index a9b5b75..dc7e27d 100644 --- a/tools/lib/h5trav.c +++ b/tools/lib/h5trav.c @@ -15,9 +15,6 @@ #include "h5trav.h" #include "h5tools.h" #include "H5private.h" -#ifdef H5_HAVE_PARALLEL -#include "h5tools_utils.h" -#endif /*------------------------------------------------------------------------- * local typedefs @@ -182,10 +179,8 @@ static herr_t traverse_cb(hid_t loc_id, const char *path, const H5L_info2_t *linfo, void *_udata) { - herr_t ret_value = SUCCEED; trav_ud_traverse_t *udata = (trav_ud_traverse_t *)_udata; /* User data */ char *new_name = NULL; - const char *full_name; const char *already_visited = NULL; /* Whether the link/object was already visited */ @@ -206,18 +201,6 @@ traverse_cb(hid_t loc_id, const char *path, const H5L_info2_t *linfo, else full_name = path; -#ifdef H5_HAVE_PARALLEL - if(linfo->type == H5L_TYPE_EXTERNAL) { - h5tool_link_info_t lnk_info; - if ((ret_value = H5tools_get_symlink_info(loc_id, path, &lnk_info, FALSE)) < 0) { - puts("H5tools_get_symlink_info failed!"); - } - else if (ret_value == 0) { - puts("Dangling link?"); - } - printf("Visiting external link: %s\n", path); - } -#endif /* Perform the correct action for different types of links */ if(linfo->type == H5L_TYPE_HARD) { H5O_info2_t oinfo; -- cgit v0.12
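
[Reference sketch only -- not part of the patch. A condensed version of the subfiling-VFD setup that test_subfiling() in test/vfd.c exercises. It assumes an MPI library that provides MPI_THREAD_MULTIPLE and that populating only the version field of H5FD_subfiling_fapl_t is sufficient, exactly as the test does; the function and file names below are illustrative.]

#include <mpi.h>
#include "hdf5.h"
#include "H5FDsubfiling.h"

static int
subfiling_smoke_test(const char *filename)
{
    int                   provided = 0;
    hid_t                 fapl_id  = H5I_INVALID_HID;
    hid_t                 fid      = H5I_INVALID_HID;
    H5FD_subfiling_fapl_t fa       = {H5FD_CURR_SUBFILING_FAPL_T_VERSION};

    /* The subfiling VFD runs I/O concentrator threads, so request
     * MPI_THREAD_MULTIPLE before any HDF5 calls, as test_subfiling() does.
     */
    if (MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided) != MPI_SUCCESS)
        return -1;

    if ((fapl_id = H5Pcreate(H5P_FILE_ACCESS)) < 0)
        goto error;
    if (H5Pset_fapl_subfiling(fapl_id, &fa) < 0)
        goto error;
    if ((fid = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id)) < 0)
        goto error;

    if (H5Fclose(fid) < 0 || H5Pclose(fapl_id) < 0)
        goto error;
    MPI_Finalize();
    return 0;

error:
    MPI_Finalize();
    return -1;
}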