author    jhendersonHDF <jhenderson@hdfgroup.org>    2022-09-16 16:17:30 (GMT)
committer GitHub <noreply@github.com>                2022-09-16 16:17:30 (GMT)
commit    16aa2dbaa0e70bf81f4329a70a45c601433549bb (patch)
tree      7c6debf81d393d9294a2e6d79ca36b53d485348d /src
parent    45178c87a3099a9fef8bae6f7249ca306cf89629 (diff)
Subfiling VFD updates (#2106)
Diffstat (limited to 'src')
-rw-r--r--  src/H5FDsubfiling/H5FDioc.c              805
-rw-r--r--  src/H5FDsubfiling/H5FDioc.h               21
-rw-r--r--  src/H5FDsubfiling/H5FDioc_int.c          295
-rw-r--r--  src/H5FDsubfiling/H5FDioc_priv.h          37
-rw-r--r--  src/H5FDsubfiling/H5FDioc_threads.c      300
-rw-r--r--  src/H5FDsubfiling/H5FDsubfile_int.c      186
-rw-r--r--  src/H5FDsubfiling/H5FDsubfiling.c        832
-rw-r--r--  src/H5FDsubfiling/H5FDsubfiling.h        124
-rw-r--r--  src/H5FDsubfiling/H5subfiling_common.c  2840
-rw-r--r--  src/H5FDsubfiling/H5subfiling_common.h   181
10 files changed, 3380 insertions, 2241 deletions
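
For context, a minimal usage sketch of the slimmed-down IOC configuration this commit leaves behind: H5FD_ioc_config_t now carries only the magic, version, and thread pool size fields, and H5Pset_fapl_ioc() no longer manages an under FAPL or an embedded subfiling configuration. The struct members, macros, and H5Pset_fapl_ioc() call are taken from the diff below; the helper name make_ioc_fapl() and the error-check-free flow are illustrative only, and most applications will instead get this driver set up as a side effect of using the Subfiling VFD.

/* Minimal sketch (illustrative): configure the IOC VFD on a FAPL after
 * this change. Subfiling parameters (stripe size/count, IOC selection)
 * now come from the Subfiling configuration on the FAPL, not from
 * H5FD_ioc_config_t. Error checks omitted for brevity. */
#include "hdf5.h"
#include "H5FDioc.h"

static hid_t
make_ioc_fapl(void)
{
    H5FD_ioc_config_t config;
    hid_t             fapl_id = H5Pcreate(H5P_FILE_ACCESS);

    config.magic            = H5FD_IOC_FAPL_MAGIC;
    config.version          = H5FD_IOC_CURR_FAPL_VERSION;
    config.thread_pool_size = H5FD_IOC_DEFAULT_THREAD_POOL_SIZE;

    H5Pset_fapl_ioc(fapl_id, &config);

    return fapl_id;
}
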
diff --git a/src/H5FDsubfiling/H5FDioc.c b/src/H5FDsubfiling/H5FDioc.c
index 78d060f..11d51de 100644
--- a/src/H5FDsubfiling/H5FDioc.c
+++ b/src/H5FDsubfiling/H5FDioc.c
@@ -47,15 +47,21 @@ typedef struct H5FD_ioc_t {
int fd; /* the filesystem file descriptor */
H5FD_ioc_config_t fa; /* driver-specific file access properties */
+ H5FD_subfiling_params_t subf_config;
+
/* MPI Info */
MPI_Comm comm;
MPI_Info info;
int mpi_rank;
int mpi_size;
- H5FD_t *ioc_file; /* native HDF5 file pointer */
+ uint64_t file_id;
+ int64_t context_id; /* The value used to lookup a subfiling context for the file */
- int64_t context_id; /* The value used to lookup a subfiling context for the file */
+ haddr_t eof;
+ haddr_t eoa;
+ haddr_t last_eoa;
+ haddr_t local_eof;
char *file_dir; /* Directory where we find files */
char *file_path; /* The user defined filename */
@@ -130,9 +136,8 @@ static herr_t H5FD__ioc_ctl(H5FD_t *file, uint64_t op_code, uint64_t flags,
const void *input, void **result);
*/
-static herr_t H5FD__ioc_get_default_config(hid_t fapl_id, H5FD_ioc_config_t *config_out);
+static herr_t H5FD__ioc_get_default_config(H5FD_ioc_config_t *config_out);
static herr_t H5FD__ioc_validate_config(const H5FD_ioc_config_t *fa);
-static int H5FD__copy_plist(hid_t fapl_id, hid_t *id_out_ptr);
static herr_t H5FD__ioc_close_int(H5FD_ioc_t *file_ptr);
@@ -330,10 +335,9 @@ H5Pset_fapl_ioc(hid_t fapl_id, H5FD_ioc_config_t *vfd_config)
if (NULL == (ioc_conf = H5FL_CALLOC(H5FD_ioc_config_t)))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"can't allocate IOC VFD configuration");
- ioc_conf->under_fapl_id = H5I_INVALID_HID;
/* Get IOC VFD defaults */
- if (H5FD__ioc_get_default_config(fapl_id, ioc_conf) < 0)
+ if (H5FD__ioc_get_default_config(ioc_conf) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't get default IOC VFD configuration");
vfd_config = ioc_conf;
@@ -346,9 +350,6 @@ H5Pset_fapl_ioc(hid_t fapl_id, H5FD_ioc_config_t *vfd_config)
done:
if (ioc_conf) {
- if (ioc_conf->under_fapl_id >= 0 && H5I_dec_ref(ioc_conf->under_fapl_id) < 0)
- H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTDEC, FAIL, "can't close IOC under FAPL");
- ioc_conf->under_fapl_id = H5I_INVALID_HID;
H5FL_FREE(H5FD_ioc_config_t, ioc_conf);
}
@@ -393,16 +394,12 @@ H5Pget_fapl_ioc(hid_t fapl_id, H5FD_ioc_config_t *config_out)
}
if (use_default_config) {
- if (H5FD__ioc_get_default_config(fapl_id, config_out) < 0)
+ if (H5FD__ioc_get_default_config(config_out) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get default IOC VFD configuration");
}
else {
/* Copy the IOC fapl data out */
HDmemcpy(config_out, config_ptr, sizeof(H5FD_ioc_config_t));
-
- /* Copy the driver info value */
- if (H5FD__copy_plist(config_ptr->under_fapl_id, &(config_out->under_fapl_id)) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't copy IOC under FAPL");
}
done:
@@ -421,56 +418,18 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD__ioc_get_default_config(hid_t fapl_id, H5FD_ioc_config_t *config_out)
+H5FD__ioc_get_default_config(H5FD_ioc_config_t *config_out)
{
- MPI_Comm comm = MPI_COMM_NULL;
- MPI_Info info = MPI_INFO_NULL;
- herr_t ret_value = SUCCEED;
+ herr_t ret_value = SUCCEED;
HDassert(config_out);
HDmemset(config_out, 0, sizeof(*config_out));
- config_out->magic = H5FD_IOC_FAPL_MAGIC;
- config_out->version = H5FD_IOC_CURR_FAPL_VERSION;
- config_out->under_fapl_id = H5I_INVALID_HID;
-
- /*
- * Use default subfiling configuration. Do NOT call
- * H5Pget_fapl_subfiling here as that can cause issues
- */
- config_out->subf_config.ioc_selection = SELECT_IOC_ONE_PER_NODE;
- config_out->subf_config.stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE;
- config_out->subf_config.stripe_count = 0;
-
- /* Create a default FAPL and choose an appropriate underlying driver */
- if ((config_out->under_fapl_id = H5Pcreate(H5P_FILE_ACCESS)) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTCREATE, FAIL, "can't create default FAPL");
-
- /* Check if any MPI parameters were set on the FAPL */
- if (H5Pget_mpi_params(fapl_id, &comm, &info) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI Comm/Info");
- if (comm == MPI_COMM_NULL)
- comm = MPI_COMM_WORLD;
-
- /* Hardwire MPI I/O VFD for now */
- if (H5Pset_fapl_mpio(config_out->under_fapl_id, comm, info) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI I/O VFD on IOC under FAPL");
-
- /* Specific to this I/O Concentrator */
+ config_out->magic = H5FD_IOC_FAPL_MAGIC;
+ config_out->version = H5FD_IOC_CURR_FAPL_VERSION;
config_out->thread_pool_size = H5FD_IOC_DEFAULT_THREAD_POOL_SIZE;
-done:
- if (H5_mpi_comm_free(&comm) < 0)
- H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTFREE, FAIL, "can't free MPI Communicator");
- if (H5_mpi_info_free(&info) < 0)
- H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTFREE, FAIL, "can't free MPI Info object");
-
- if (ret_value < 0) {
- if (config_out->under_fapl_id >= 0 && H5Pclose(config_out->under_fapl_id) < 0)
- H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTCLOSEOBJ, FAIL, "can't close FAPL");
- }
-
H5_SUBFILING_FUNC_LEAVE;
}
@@ -504,13 +463,6 @@ H5FD__ioc_validate_config(const H5FD_ioc_config_t *fa)
if (fa->magic != H5FD_IOC_FAPL_MAGIC)
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid H5FD_ioc_config_t magic value");
- if (fa->under_fapl_id < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid under FAPL ID");
-
- if (fa->subf_config.ioc_selection < SELECT_IOC_ONE_PER_NODE ||
- fa->subf_config.ioc_selection >= ioc_selection_options)
- H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid IOC selection method");
-
done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_validate_config() */
@@ -518,31 +470,37 @@ done:
/*-------------------------------------------------------------------------
* Function: H5FD__ioc_sb_size
*
- * Purpose: Obtains the number of bytes required to store the driver file
- * access data in the HDF5 superblock.
+ * Purpose: Obtains the number of bytes required to store the driver
+ * file access data in the HDF5 superblock.
*
* Return: Success: Number of bytes required.
*
* Failure: 0 if an error occurs or if the driver has no
* data to store in the superblock.
*
- * NOTE: no public API for H5FD_sb_size, it needs to be added
*-------------------------------------------------------------------------
*/
static hsize_t
-H5FD__ioc_sb_size(H5FD_t *_file)
+H5FD__ioc_sb_size(H5FD_t H5_ATTR_UNUSED *_file)
{
- H5FD_ioc_t *file = (H5FD_ioc_t *)_file;
- hsize_t ret_value = 0;
+ hsize_t ret_value = 0;
H5FD_IOC_LOG_CALL(__func__);
- /* Sanity check */
- HDassert(file);
- HDassert(file->ioc_file);
+ /* Configuration structure magic number */
+ ret_value += sizeof(uint32_t);
+
+ /* Configuration structure version number */
+ ret_value += sizeof(uint32_t);
+
+ /* IOC thread pool size */
+ ret_value += sizeof(int32_t);
+
+ /* Subfiling stripe size */
+ ret_value += sizeof(int64_t);
- if (file->ioc_file)
- ret_value = H5FD_sb_size(file->ioc_file);
+ /* Subfiling stripe count (encoded as int64_t for future) */
+ ret_value += sizeof(int64_t);
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_sb_size */
@@ -552,23 +510,42 @@ H5FD__ioc_sb_size(H5FD_t *_file)
*
* Purpose: Encode driver-specific data into the output arguments.
*
- * Return: SUCCEED/FAIL
+ * Return: Non-negative on success/Negative on failure
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD__ioc_sb_encode(H5FD_t *_file, char *name /*out*/, unsigned char *buf /*out*/)
+H5FD__ioc_sb_encode(H5FD_t *_file, char *name, unsigned char *buf)
{
- H5FD_ioc_t *file = (H5FD_ioc_t *)_file;
- herr_t ret_value = SUCCEED; /* Return value */
+ subfiling_context_t *sf_context = NULL;
+ H5FD_ioc_t *file = (H5FD_ioc_t *)_file;
+ uint8_t *p = (uint8_t *)buf;
+ int64_t tmp64;
+ herr_t ret_value = SUCCEED;
H5FD_IOC_LOG_CALL(__func__);
- /* Sanity check */
- HDassert(file);
- HDassert(file->ioc_file);
+ if (NULL == (sf_context = H5_get_subfiling_object(file->context_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling context object");
- if (file->ioc_file && H5FD_sb_encode(file->ioc_file, name, buf) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTENCODE, FAIL, "unable to encode the superblock in R/W file");
+ /* Encode driver name */
+ HDstrncpy(name, "IOC", 9);
+ name[8] = '\0';
+
+ /* Encode configuration structure magic number */
+ UINT32ENCODE(p, file->fa.magic);
+
+ /* Encode configuration structure version number */
+ UINT32ENCODE(p, file->fa.version);
+
+ /* Encode thread pool size field */
+ INT32ENCODE(p, file->fa.thread_pool_size);
+
+ /* Encode subfiling stripe size */
+ INT64ENCODE(p, sf_context->sf_stripe_size);
+
+ /* Encode subfiling stripe count (number of subfiles) */
+ tmp64 = sf_context->sf_num_subfiles;
+ INT64ENCODE(p, tmp64);
done:
H5_SUBFILING_FUNC_LEAVE;
@@ -579,25 +556,62 @@ done:
*
* Purpose: Decodes the driver information block.
*
- * Return: SUCCEED/FAIL
- *
- * NOTE: no public API for H5FD_sb_size, need to add
+ * Return: Non-negative on success/Negative on failure
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__ioc_sb_decode(H5FD_t *_file, const char *name, const unsigned char *buf)
{
- H5FD_ioc_t *file = (H5FD_ioc_t *)_file;
- herr_t ret_value = SUCCEED; /* Return value */
+ subfiling_context_t *sf_context = NULL;
+ const uint8_t *p = (const uint8_t *)buf;
+ H5FD_ioc_t *file = (H5FD_ioc_t *)_file;
+ int64_t tmp64;
+ herr_t ret_value = SUCCEED;
H5FD_IOC_LOG_CALL(__func__);
- /* Sanity check */
- HDassert(file);
- HDassert(file->ioc_file);
+ if (NULL == (sf_context = H5_get_subfiling_object(file->context_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling context object");
+
+ if (HDstrncmp(name, "IOC", 9))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid driver name in superblock");
+
+ /* Decode configuration structure magic number */
+ UINT32DECODE(p, file->fa.magic);
+
+ /* Decode configuration structure version number */
+ UINT32DECODE(p, file->fa.version);
+
+ /* Decode thread pool size field */
+ INT32DECODE(p, file->fa.thread_pool_size);
+
+ /* Decode subfiling stripe size */
+ INT64DECODE(p, file->subf_config.stripe_size);
- if (H5FD_sb_load(file->ioc_file, name, buf) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTDECODE, FAIL, "unable to decode the superblock in R/W file");
+ /* Decode subfiling stripe count */
+ INT64DECODE(p, tmp64);
+ H5_CHECK_OVERFLOW(tmp64, int64_t, int32_t);
+ file->subf_config.stripe_count = (int32_t)tmp64;
+
+ /* Validate the decoded configuration */
+ if (H5FD__ioc_validate_config(&file->fa) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "decoded IOC VFD configuration info is invalid");
+
+ if (H5_subfiling_validate_config(&file->subf_config) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL,
+ "decoded subfiling configuration parameters are invalid");
+
+ if (file->subf_config.stripe_size != sf_context->sf_stripe_size)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL,
+ "specified subfiling stripe size (%" PRId64
+ ") doesn't match value stored in file (%" PRId64 ")",
+ sf_context->sf_stripe_size, file->subf_config.stripe_size);
+
+ if (file->subf_config.stripe_count != sf_context->sf_num_subfiles)
+ H5_SUBFILING_GOTO_ERROR(
+ H5E_VFL, H5E_BADVALUE, FAIL,
+ "specified subfiling stripe count (%d) doesn't match value stored in file (%" PRId32 ")",
+ sf_context->sf_num_subfiles, file->subf_config.stripe_count);
done:
H5_SUBFILING_FUNC_LEAVE;
@@ -629,40 +643,6 @@ H5FD__ioc_fapl_get(H5FD_t *_file)
} /* end H5FD__ioc_fapl_get() */
/*-------------------------------------------------------------------------
- * Function: H5FD__copy_plist
- *
- * Purpose: Sanity-wrapped H5P_copy_plist() for each channel.
- * Utility function for operation in multiple locations.
- *
- * Return: 0 on success, -1 on error.
- *-------------------------------------------------------------------------
- */
-static int
-H5FD__copy_plist(hid_t fapl_id, hid_t *id_out_ptr)
-{
- int ret_value = 0;
- H5P_genplist_t *plist_ptr = NULL;
-
- H5FD_IOC_LOG_CALL(__func__);
-
- HDassert(id_out_ptr != NULL);
-
- if (FALSE == H5P_isa_class(fapl_id, H5P_FILE_ACCESS))
- H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, -1, "not a file access property list");
-
- plist_ptr = (H5P_genplist_t *)H5I_object(fapl_id);
- if (NULL == plist_ptr)
- H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, -1, "unable to get property list");
-
- *id_out_ptr = H5P_copy_plist(plist_ptr, FALSE);
- if (H5I_INVALID_HID == *id_out_ptr)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADTYPE, -1, "unable to copy file access property list");
-
-done:
- H5_SUBFILING_FUNC_LEAVE;
-} /* end H5FD__copy_plist() */
-
-/*-------------------------------------------------------------------------
* Function: H5FD__ioc_fapl_copy
*
* Purpose: Copies the file access properties.
@@ -688,10 +668,6 @@ H5FD__ioc_fapl_copy(const void *_old_fa)
HDmemcpy(new_fa_ptr, old_fa_ptr, sizeof(H5FD_ioc_config_t));
- /* Copy the FAPL */
- if (H5FD__copy_plist(old_fa_ptr->under_fapl_id, &(new_fa_ptr->under_fapl_id)) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "can't copy the IOC under FAPL");
-
ret_value = (void *)new_fa_ptr;
done:
@@ -721,14 +697,9 @@ H5FD__ioc_fapl_free(void *_fapl)
/* Check arguments */
HDassert(fapl);
- if (fapl->under_fapl_id >= 0 && H5I_dec_ref(fapl->under_fapl_id) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTDEC, FAIL, "can't close IOC under FAPL ID");
- fapl->under_fapl_id = H5I_INVALID_HID;
-
/* Free the property list */
fapl = H5FL_FREE(H5FD_ioc_config_t, fapl);
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_fapl_free() */
@@ -748,10 +719,10 @@ H5FD__ioc_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr)
{
H5FD_ioc_t *file_ptr = NULL; /* Ioc VFD info */
const H5FD_ioc_config_t *config_ptr = NULL; /* Driver-specific property list */
+ subfiling_context_t *sf_context = NULL;
H5FD_ioc_config_t default_config;
- H5FD_class_t *driver = NULL; /* VFD for file */
H5P_genplist_t *plist_ptr = NULL;
- H5FD_driver_prop_t driver_prop; /* Property for driver ID & info */
+ int ioc_flags;
int mpi_inited = 0;
int mpi_code; /* MPI return code */
H5FD_t *ret_value = NULL;
@@ -768,10 +739,15 @@ H5FD__ioc_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr)
if (NULL == (file_ptr = (H5FD_ioc_t *)H5FL_CALLOC(H5FD_ioc_t)))
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTALLOC, NULL, "unable to allocate file struct");
- file_ptr->comm = MPI_COMM_NULL;
- file_ptr->info = MPI_INFO_NULL;
- file_ptr->context_id = -1;
- file_ptr->fa.under_fapl_id = H5I_INVALID_HID;
+ file_ptr->comm = MPI_COMM_NULL;
+ file_ptr->info = MPI_INFO_NULL;
+ file_ptr->file_id = UINT64_MAX;
+ file_ptr->context_id = -1;
+
+ /* Initialize file pointer's subfiling parameters */
+ file_ptr->subf_config.ioc_selection = SELECT_IOC_ONE_PER_NODE;
+ file_ptr->subf_config.stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE;
+ file_ptr->subf_config.stripe_count = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT;
/* Get the driver-specific file access properties */
if (NULL == (plist_ptr = (H5P_genplist_t *)H5I_object(fapl_id)))
@@ -808,7 +784,7 @@ H5FD__ioc_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr)
config_ptr = H5P_peek_driver_info(plist_ptr);
if (!config_ptr || (H5P_FILE_ACCESS_DEFAULT == fapl_id)) {
- if (H5FD__ioc_get_default_config(fapl_id, &default_config) < 0)
+ if (H5FD__ioc_get_default_config(&default_config) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get default IOC VFD configuration");
config_ptr = &default_config;
}
@@ -816,117 +792,87 @@ H5FD__ioc_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr)
/* Fill in the file config values */
HDmemcpy(&file_ptr->fa, config_ptr, sizeof(H5FD_ioc_config_t));
- /* Copy the ioc FAPL. */
- if (H5FD__copy_plist(config_ptr->under_fapl_id, &(file_ptr->fa.under_fapl_id)) < 0) {
- file_ptr->fa.under_fapl_id = H5I_INVALID_HID;
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "can't copy IOC under FAPL");
- }
-
- if (NULL != (file_ptr->file_path = HDrealpath(name, NULL))) {
- if (H5_dirname(file_ptr->file_path, &file_ptr->file_dir) < 0) {
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "couldn't get subfile dirname");
- }
- }
- else {
- if (ENOENT == errno) {
- if (NULL == (file_ptr->file_path = HDstrdup(name)))
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCOPY, NULL, "can't copy file name");
- if (NULL == (file_ptr->file_dir = H5MM_strdup(".")))
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "can't set subfile directory path");
- }
- else
- H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't resolve subfile path");
- }
+ /* Fully resolve the given filepath and get its dirname */
+ if (H5_resolve_pathname(name, file_ptr->comm, &file_ptr->file_path) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't resolve filepath");
+ if (H5_dirname(file_ptr->file_path, &file_ptr->file_dir) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get filepath dirname");
+
+ /* Translate the HDF5 file open flags into standard POSIX open flags */
+ ioc_flags = (H5F_ACC_RDWR & flags) ? O_RDWR : O_RDONLY;
+ if (H5F_ACC_TRUNC & flags)
+ ioc_flags |= O_TRUNC;
+ if (H5F_ACC_CREAT & flags)
+ ioc_flags |= O_CREAT;
+ if (H5F_ACC_EXCL & flags)
+ ioc_flags |= O_EXCL;
- /* Check the underlying driver (sec2/mpio/etc.) */
- if (NULL == (plist_ptr = (H5P_genplist_t *)H5I_object(file_ptr->fa.under_fapl_id)))
+ if (NULL == (plist_ptr = (H5P_genplist_t *)H5I_object(fapl_id)))
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list");
- if (H5P_peek(plist_ptr, H5F_ACS_FILE_DRV_NAME, &driver_prop) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get driver ID & info");
- if (NULL == (driver = (H5FD_class_t *)H5I_object(driver_prop.driver_id)))
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL,
- "invalid driver ID in file access property list");
+ /* Retrieve the subfiling configuration for the current file */
+ if (H5_subfiling_get_config_prop(plist_ptr, &file_ptr->subf_config) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get subfiling configuration from FAPL");
+ if (H5_subfiling_validate_config(&file_ptr->subf_config) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_BADVALUE, NULL, "invalid subfiling configuration");
- if (driver->value != H5_VFD_MPIO) {
- H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL,
- "unable to open file '%s' - only MPI I/O VFD is currently supported", name);
- }
- else {
- subfiling_context_t *sf_context = NULL;
- void *file_handle = NULL;
- int ioc_flags;
- int l_error = 0;
- int g_error = 0;
-
- /* Translate the HDF5 file open flags into standard POSIX open flags */
- ioc_flags = (H5F_ACC_RDWR & flags) ? O_RDWR : O_RDONLY;
- if (H5F_ACC_TRUNC & flags)
- ioc_flags |= O_TRUNC;
- if (H5F_ACC_CREAT & flags)
- ioc_flags |= O_CREAT;
- if (H5F_ACC_EXCL & flags)
- ioc_flags |= O_EXCL;
-
- file_ptr->ioc_file = H5FD_open(file_ptr->file_path, flags, file_ptr->fa.under_fapl_id, HADDR_UNDEF);
- if (file_ptr->ioc_file) {
- if (H5FDget_vfd_handle(file_ptr->ioc_file, file_ptr->fa.under_fapl_id, &file_handle) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get file handle");
- }
- else {
- l_error = 1;
- }
-
- /* Check if any ranks had an issue opening the file */
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Allreduce(&l_error, &g_error, 1, MPI_INT, MPI_SUM, file_ptr->comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(NULL, "MPI_Allreduce failed", mpi_code);
- if (g_error)
- H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL,
- "one or more MPI ranks were unable to open file '%s'", name);
+ /* Retrieve the HDF5 stub file ID for the current file */
+ if (H5_subfiling_get_file_id_prop(plist_ptr, &file_ptr->file_id) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get stub file ID from FAPL");
- /*
- * Open the subfiles for this HDF5 file. A subfiling
- * context ID will be returned, which is used for
- * further interactions with this file's subfiles.
- */
- if (H5_open_subfiles(file_ptr->file_path, file_handle, &file_ptr->fa.subf_config, ioc_flags,
- file_ptr->comm, &file_ptr->context_id) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open subfiles for file '%s'",
- name);
-
- /* Initialize I/O concentrator threads if this MPI rank is an I/O concentrator */
- sf_context = H5_get_subfiling_object(file_ptr->context_id);
- if (sf_context && sf_context->topology->rank_is_ioc) {
- if (initialize_ioc_threads(sf_context) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTINIT, NULL,
- "unable to initialize I/O concentrator threads");
- }
+ /*
+ * Open the subfiles for this HDF5 file. A subfiling
+ * context ID will be returned, which is used for
+ * further interactions with this file's subfiles.
+ */
+ if (H5_open_subfiles(file_ptr->file_path, file_ptr->file_id, &file_ptr->subf_config, ioc_flags,
+ file_ptr->comm, &file_ptr->context_id) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open subfiles for file '%s'",
+ name);
+
+ /* Initialize I/O concentrator threads if this MPI rank is an I/O concentrator */
+ sf_context = H5_get_subfiling_object(file_ptr->context_id);
+ if (sf_context && sf_context->topology->rank_is_ioc) {
+ if (initialize_ioc_threads(sf_context) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTINIT, NULL,
+ "unable to initialize I/O concentrator threads");
}
ret_value = (H5FD_t *)file_ptr;
done:
- /* run a barrier just before exit. The objective is to
- * ensure that the IOCs are fully up and running before
- * we proceed. Note that this barrier is not sufficient
- * by itself -- we also need code in initialize_ioc_threads()
- * to wait until the main IOC thread has finished its
- * initialization.
+ /*
+ * Check if any ranks failed before exit. The objective
+ * here is twofold:
+ *
+ * - prevent possible hangs caused by ranks sending
+ * messages to I/O concentrators that failed and
+ * didn't spin up
+ * - use the barrier semantics of MPI_Allreduce to
+ * ensure that the I/O concentrators are fully up
+ * and running before proceeding.
*/
if (mpi_inited) {
- MPI_Comm barrier_comm = MPI_COMM_WORLD;
+ MPI_Comm reduce_comm = MPI_COMM_WORLD;
+ int mpi_size = -1;
+ int err_result = (ret_value == NULL);
if (file_ptr && (file_ptr->comm != MPI_COMM_NULL))
- barrier_comm = file_ptr->comm;
+ reduce_comm = file_ptr->comm;
- if (MPI_SUCCESS != (mpi_code = MPI_Barrier(barrier_comm)))
- H5_SUBFILING_MPI_DONE_ERROR(NULL, "MPI_Barrier failed", mpi_code);
- }
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(reduce_comm, &mpi_size)))
+ H5_SUBFILING_MPI_DONE_ERROR(NULL, "MPI_Comm_size failed", mpi_code);
- if (config_ptr == &default_config)
- if (H5I_dec_ref(config_ptr->under_fapl_id) < 0)
- H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTCLOSEOBJ, NULL, "can't close IOC under FAPL");
+ if (mpi_size > 1) {
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Allreduce(MPI_IN_PLACE, &err_result, 1, MPI_INT, MPI_MAX, reduce_comm)))
+ H5_SUBFILING_MPI_DONE_ERROR(NULL, "MPI_Allreduce failed", mpi_code);
+ }
+
+ if (err_result)
+ H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL,
+ "one or more MPI ranks were unable to open file '%s'", name);
+ }
if (NULL == ret_value) {
if (file_ptr) {
@@ -945,39 +891,14 @@ H5FD__ioc_close_int(H5FD_ioc_t *file_ptr)
HDassert(file_ptr);
-#ifdef H5FD_IOC_DEBUG
- {
- subfiling_context_t *sf_context = H5_get_subfiling_object(file_ptr->context_id);
- if (sf_context) {
- if (sf_context->topology->rank_is_ioc)
- HDprintf("[%s %d] fd=%d\n", __func__, file_ptr->mpi_rank, sf_context->sf_fid);
- else
- HDprintf("[%s %d] fd=*\n", __func__, file_ptr->mpi_rank);
- }
- else
- HDprintf("[%s %d] invalid subfiling context", __func__, file_ptr->mpi_rank);
- HDfflush(stdout);
- }
-#endif
-
- if (file_ptr->fa.under_fapl_id >= 0 && H5I_dec_ref(file_ptr->fa.under_fapl_id) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_ARGS, FAIL, "can't close IOC under FAPL");
- file_ptr->fa.under_fapl_id = H5I_INVALID_HID;
-
- /* Close underlying file */
- if (file_ptr->ioc_file) {
- if (H5FD_close(file_ptr->ioc_file) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCLOSEFILE, FAIL, "unable to close HDF5 file");
- file_ptr->ioc_file = NULL;
- }
-
if (file_ptr->context_id >= 0) {
subfiling_context_t *sf_context = H5_get_subfiling_object(file_ptr->context_id);
int mpi_code;
/* Don't allow IOC threads to be finalized until everyone gets here */
- if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_ptr->comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+ if (file_ptr->mpi_size > 1)
+ if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_ptr->comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
if (sf_context && sf_context->topology->rank_is_ioc) {
if (finalize_ioc_threads(sf_context) < 0)
@@ -985,7 +906,7 @@ H5FD__ioc_close_int(H5FD_ioc_t *file_ptr)
H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTCLOSEFILE, FAIL, "unable to finalize IOC threads");
}
- if (H5_close_subfiles(file_ptr->context_id) < 0)
+ if (H5_close_subfiles(file_ptr->context_id, file_ptr->comm) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCLOSEFILE, FAIL, "unable to close subfiling file(s)");
file_ptr->context_id = -1;
}
@@ -1053,31 +974,8 @@ H5FD__ioc_cmp(const H5FD_t *_f1, const H5FD_t *_f2)
HDassert(f1);
HDassert(f2);
- if (f1->ioc_file && f1->ioc_file->cls && f1->ioc_file->cls->cmp && f2->ioc_file && f2->ioc_file->cls &&
- f2->ioc_file->cls->cmp) {
- ret_value = H5FD_cmp(f1->ioc_file, f2->ioc_file);
- }
- else {
- h5_stat_t st1;
- h5_stat_t st2;
-
- /*
- * If under VFD has no compare routine, get
- * inode of HDF5 stub file and compare them
- *
- * Note that the compare callback doesn't
- * allow for failure, so we just return -1
- * if stat fails.
- */
- if (HDstat(f1->file_path, &st1) < 0)
- H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, -1, "couldn't stat file");
- if (HDstat(f2->file_path, &st2) < 0)
- H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, -1, "couldn't stat file");
-
- ret_value = (st1.st_ino > st2.st_ino) - (st1.st_ino < st2.st_ino);
- }
+ ret_value = (f1->file_id > f2->file_id) - (f1->file_id < f2->file_id);
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_cmp */
@@ -1091,30 +989,20 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD__ioc_query(const H5FD_t *_file, unsigned long *flags /* out */)
+H5FD__ioc_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out */)
{
- const H5FD_ioc_t *file_ptr = (const H5FD_ioc_t *)_file;
- herr_t ret_value = SUCCEED;
+ herr_t ret_value = SUCCEED;
H5FD_IOC_LOG_CALL(__func__);
- if (file_ptr == NULL) {
- if (flags)
- *flags = 0;
- }
- else if (file_ptr->ioc_file) {
- if (H5FDquery(file_ptr->ioc_file, flags) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTLOCK, FAIL, "unable to query R/W file");
- }
- else {
- /* There is no file. Because this is a pure passthrough VFD,
- * it has no features of its own.
- */
- if (flags)
- *flags = 0;
+ /* Set the VFL feature flags that this driver supports */
+ if (flags) {
+ *flags = 0;
+ *flags |= H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */
+ *flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
+ *flags |= H5FD_FEAT_HAS_MPI; /* This driver uses MPI */
}
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_query() */
@@ -1127,22 +1015,14 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD__ioc_get_type_map(const H5FD_t *_file, H5FD_mem_t *type_map)
+H5FD__ioc_get_type_map(const H5FD_t H5_ATTR_UNUSED *_file, H5FD_mem_t H5_ATTR_UNUSED *type_map)
{
- const H5FD_ioc_t *file = (const H5FD_ioc_t *)_file;
- herr_t ret_value = SUCCEED;
+ herr_t ret_value = SUCCEED;
H5FD_IOC_LOG_CALL(__func__);
- /* Check arguments */
- HDassert(file);
- HDassert(file->ioc_file);
-
- /* Retrieve memory type mapping for R/W channel only */
- if (H5FD_get_fs_type_map(file->ioc_file, type_map) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "unable to allocate for R/W file");
+ /* TODO: placeholder for now */
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_get_type_map() */
@@ -1155,23 +1035,15 @@ done:
*-------------------------------------------------------------------------
*/
static haddr_t
-H5FD__ioc_alloc(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, hsize_t size)
+H5FD__ioc_alloc(H5FD_t H5_ATTR_UNUSED *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNUSED dxpl_id,
+ hsize_t H5_ATTR_UNUSED size)
{
- H5FD_ioc_t *file = (H5FD_ioc_t *)_file; /* VFD file struct */
- haddr_t ret_value = HADDR_UNDEF; /* Return value */
+ haddr_t ret_value = HADDR_UNDEF; /* Return value */
H5FD_IOC_LOG_CALL(__func__);
- /* Check arguments */
- HDassert(file);
- HDassert(file->ioc_file);
-
- /* Allocate memory for each file, only return the return value for R/W file.
- */
- if ((ret_value = H5FDalloc(file->ioc_file, type, dxpl_id, size)) == HADDR_UNDEF)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, HADDR_UNDEF, "unable to allocate for R/W file");
+ /* TODO: placeholder for now */
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_alloc() */
@@ -1184,21 +1056,15 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD__ioc_free(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, hsize_t size)
+H5FD__ioc_free(H5FD_t H5_ATTR_UNUSED *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNUSED dxpl_id,
+ haddr_t H5_ATTR_UNUSED addr, hsize_t H5_ATTR_UNUSED size)
{
- H5FD_ioc_t *file = (H5FD_ioc_t *)_file; /* VFD file struct */
- herr_t ret_value = SUCCEED; /* Return value */
+ herr_t ret_value = SUCCEED; /* Return value */
H5FD_IOC_LOG_CALL(__func__);
- /* Check arguments */
- HDassert(file);
- HDassert(file->ioc_file);
-
- if (H5FDfree(file->ioc_file, type, dxpl_id, addr, size) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free for R/W file");
+ /* TODO: placeholder for now */
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_free() */
@@ -1224,12 +1090,9 @@ H5FD__ioc_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
/* Sanity check */
HDassert(file);
- HDassert(file->ioc_file);
- if ((ret_value = H5FD_get_eoa(file->ioc_file, type)) == HADDR_UNDEF)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, HADDR_UNDEF, "unable to get eoa");
+ ret_value = file->eoa;
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_get_eoa */
@@ -1253,13 +1116,9 @@ H5FD__ioc_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t addr)
/* Sanity check */
HDassert(file);
- HDassert(file->ioc_file);
- HDassert(file->ioc_file);
- if (H5FD_set_eoa(file->ioc_file, type, addr) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTSET, FAIL, "H5FDset_eoa failed for R/W file");
+ file->eoa = addr;
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_set_eoa() */
@@ -1286,16 +1145,14 @@ H5FD__ioc_get_eof(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
/* Sanity check */
HDassert(file);
- HDassert(file->ioc_file);
sf_context = H5_get_subfiling_object(file->context_id);
if (sf_context) {
ret_value = sf_context->sf_eof;
goto done;
}
-
- if (HADDR_UNDEF == (ret_value = H5FD_get_eof(file->ioc_file, type)))
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, HADDR_UNDEF, "unable to get eof");
+ else
+ ret_value = file->eof;
done:
H5_SUBFILING_FUNC_LEAVE;
@@ -1311,22 +1168,15 @@ done:
*--------------------------------------------------------------------------
*/
static herr_t
-H5FD__ioc_get_handle(H5FD_t *_file, hid_t H5_ATTR_UNUSED fapl, void **file_handle)
+H5FD__ioc_get_handle(H5FD_t H5_ATTR_UNUSED *_file, hid_t H5_ATTR_UNUSED fapl,
+ void H5_ATTR_UNUSED **file_handle)
{
- H5FD_ioc_t *file = (H5FD_ioc_t *)_file;
- herr_t ret_value = SUCCEED; /* Return value */
+ herr_t ret_value = SUCCEED;
H5FD_IOC_LOG_CALL(__func__);
- /* Check arguments */
- HDassert(file);
- HDassert(file->ioc_file);
- HDassert(file_handle);
+ /* TODO: placeholder for now */
- if (H5FD_get_vfd_handle(file->ioc_file, file->fa.under_fapl_id, file_handle) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "unable to get handle of R/W file");
-
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_get_handle */
@@ -1362,9 +1212,7 @@ H5FD__ioc_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNUS
if (REGION_OVERFLOW(addr, size))
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow, addr = %" PRIuHADDR, addr);
- /* Public API for dxpl "context" */
- if (H5FDread(file->ioc_file, type, dxpl_id, addr, size, buf) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "Reading from R/W channel failed");
+ ret_value = H5FD__ioc_read_vector_internal(_file, 1, &addr, &size, &buf);
done:
H5_SUBFILING_FUNC_LEAVE;
@@ -1381,19 +1229,15 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD__ioc_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, const void *buf)
+H5FD__ioc_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size,
+ const void *buf)
{
- H5P_genplist_t *plist_ptr = NULL;
- herr_t ret_value = SUCCEED;
-
- if (NULL == (plist_ptr = (H5P_genplist_t *)H5I_object(dxpl_id)))
- H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a property list");
+ herr_t ret_value = SUCCEED;
addr += _file->base_addr;
ret_value = H5FD__ioc_write_vector_internal(_file, 1, &type, &addr, &size, &buf);
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_write() */
@@ -1492,17 +1336,14 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD__ioc_flush(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t closing)
+H5FD__ioc_flush(H5FD_t H5_ATTR_UNUSED *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing)
{
- H5FD_ioc_t *file = (H5FD_ioc_t *)_file;
- herr_t ret_value = SUCCEED; /* Return value */
+ herr_t ret_value = SUCCEED;
H5FD_IOC_LOG_CALL(__func__);
- if (H5FDflush(file->ioc_file, dxpl_id, closing) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFLUSH, FAIL, "unable to flush R/W file");
+ /* TODO: placeholder for now */
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_flush() */
@@ -1515,21 +1356,20 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD__ioc_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing)
+H5FD__ioc_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing)
{
H5FD_ioc_t *file = (H5FD_ioc_t *)_file;
- herr_t ret_value = SUCCEED; /* Return value */
+ herr_t ret_value = SUCCEED;
H5FD_IOC_LOG_CALL(__func__);
HDassert(file);
- HDassert(file->ioc_file);
- HDassert(file->ioc_file);
- if (H5FDtruncate(file->ioc_file, dxpl_id, closing) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "unable to truncate R/W file");
+ /* TODO: placeholder for now since Subfiling does the truncation */
+ if (!H5F_addr_eq(file->eoa, file->last_eoa)) {
+ file->last_eoa = file->eoa;
+ }
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_truncate */
@@ -1542,20 +1382,14 @@ done:
*--------------------------------------------------------------------------
*/
static herr_t
-H5FD__ioc_lock(H5FD_t *_file, hbool_t rw)
+H5FD__ioc_lock(H5FD_t H5_ATTR_UNUSED *_file, hbool_t H5_ATTR_UNUSED rw)
{
- H5FD_ioc_t *file = (H5FD_ioc_t *)_file; /* VFD file struct */
- herr_t ret_value = SUCCEED; /* Return value */
+ herr_t ret_value = SUCCEED;
H5FD_IOC_LOG_CALL(__func__);
- HDassert(file);
- HDassert(file->ioc_file);
-
- if (H5FD_lock(file->ioc_file, rw) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTLOCKFILE, FAIL, "unable to lock file");
+ /* TODO: placeholder for now */
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_lock */
@@ -1568,21 +1402,14 @@ done:
*--------------------------------------------------------------------------
*/
static herr_t
-H5FD__ioc_unlock(H5FD_t *_file)
+H5FD__ioc_unlock(H5FD_t H5_ATTR_UNUSED *_file)
{
- H5FD_ioc_t *file = (H5FD_ioc_t *)_file; /* VFD file struct */
- herr_t ret_value = SUCCEED; /* Return value */
+ herr_t ret_value = SUCCEED;
H5FD_IOC_LOG_CALL(__func__);
- /* Check arguments */
- HDassert(file);
- HDassert(file->ioc_file);
+ /* TODO: placeholder for now */
- if (H5FD_unlock(file->ioc_file) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUNLOCKFILE, FAIL, "unable to unlock file");
-
-done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_unlock */
@@ -1626,8 +1453,9 @@ H5FD__ioc_del(const char *name, hid_t fapl)
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
if (mpi_rank == 0) {
- int n_io_concentrators = 0;
- int num_digits = 0;
+ int64_t read_n_subfiles = 0;
+ int32_t n_subfiles = 0;
+ int num_digits = 0;
if (HDstat(name, &st) < 0)
H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SYSERRSTR, FAIL, "HDstat failed");
@@ -1643,7 +1471,7 @@ H5FD__ioc_del(const char *name, hid_t fapl)
"can't allocate config file name buffer");
/* TODO: No support for subfile directory prefix currently */
- HDsnprintf(tmp_filename, PATH_MAX, "%s/%s" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, file_dirname,
+ HDsnprintf(tmp_filename, PATH_MAX, "%s/" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, file_dirname,
base_filename, (uint64_t)st.st_ino);
if (NULL == (config_file = HDfopen(tmp_filename, "r"))) {
@@ -1659,9 +1487,12 @@ H5FD__ioc_del(const char *name, hid_t fapl)
"can't open subfiling config file");
}
- if (H5_get_num_iocs_from_config_file(config_file, &n_io_concentrators) < 0)
+ if (H5_get_subfiling_config_from_file(config_file, NULL, &read_n_subfiles) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_READERROR, FAIL, "can't read subfiling config file");
+ H5_CHECK_OVERFLOW(read_n_subfiles, int64_t, int32_t);
+ n_subfiles = (int32_t)read_n_subfiles;
+
/* Delete the Subfiling configuration file */
if (EOF == HDfclose(config_file)) {
config_file = NULL;
@@ -1676,12 +1507,12 @@ H5FD__ioc_del(const char *name, hid_t fapl)
"can't delete subfiling config file");
/* Try to delete each of the subfiles */
- num_digits = (int)(HDlog10(n_io_concentrators) + 1);
+ num_digits = (int)(HDlog10(n_subfiles) + 1);
- for (int i = 0; i < n_io_concentrators; i++) {
+ for (int i = 0; i < n_subfiles; i++) {
/* TODO: No support for subfile directory prefix currently */
- HDsnprintf(tmp_filename, PATH_MAX, "%s/%s" H5FD_SUBFILING_FILENAME_TEMPLATE, file_dirname,
- base_filename, (uint64_t)st.st_ino, num_digits, i + 1, n_io_concentrators);
+ HDsnprintf(tmp_filename, PATH_MAX, "%s/" H5FD_SUBFILING_FILENAME_TEMPLATE, file_dirname,
+ base_filename, (uint64_t)st.st_ino, num_digits, i + 1, n_subfiles);
if (HDremove(tmp_filename) < 0) {
#ifdef H5FD_IOC_DEBUG
@@ -1704,8 +1535,16 @@ done:
H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "can't close subfiling config file");
/* Set up a barrier (don't want processes to run ahead of the delete) */
- if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
- H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+ if (comm != MPI_COMM_NULL) {
+ int comm_size = -1;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &comm_size)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ if (comm_size > 1)
+ if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+ }
/* Free duplicated MPI Communicator and Info objects */
if (H5_mpi_comm_free(&comm) < 0)
@@ -1737,19 +1576,15 @@ done:
*--------------------------------------------------------------------------
*/
static herr_t
-H5FD__ioc_write_vector_internal(H5FD_t *_file, uint32_t count, H5FD_mem_t types[], haddr_t addrs[],
- size_t sizes[], const void *bufs[] /* in */)
+H5FD__ioc_write_vector_internal(H5FD_t *_file, uint32_t count, H5FD_mem_t H5_ATTR_UNUSED types[],
+ haddr_t addrs[], size_t sizes[], const void *bufs[] /* in */)
{
subfiling_context_t *sf_context = NULL;
- MPI_Request *active_reqs = NULL;
+ MPI_Request *mpi_reqs = NULL;
H5FD_ioc_t *file_ptr = (H5FD_ioc_t *)_file;
- io_req_t **sf_async_reqs = NULL;
+ io_req_t **sf_io_reqs = NULL;
int64_t sf_context_id = -1;
herr_t ret_value = SUCCEED;
- struct __mpi_req {
- int n_reqs;
- MPI_Request *active_reqs;
- } *mpi_reqs = NULL;
HDassert(_file);
HDassert(addrs);
@@ -1764,22 +1599,20 @@ H5FD__ioc_write_vector_internal(H5FD_t *_file, uint32_t count, H5FD_mem_t types[
if (NULL == (sf_context = H5_get_subfiling_object(sf_context_id)))
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "can't get subfiling context from ID");
HDassert(sf_context->topology);
- HDassert(sf_context->topology->n_io_concentrators);
-
- if (NULL == (active_reqs = HDcalloc((size_t)(count + 2), sizeof(struct __mpi_req))))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "can't allocate active I/O requests array");
-
- if (NULL == (sf_async_reqs = HDcalloc((size_t)count, sizeof(*sf_async_reqs))))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate I/O request array");
/*
- * Note: We allocated extra space in the active_requests (above).
- * The extra should be enough for an integer plus a pointer.
+ * Allocate an array of I/O requests and an array twice that size for
+ * MPI_Request objects. Each write I/O request has an MPI_Request
+ * object for the I/O data transfer and an MPI_Request object that,
+ * when waited on until completion, signifies that the actual I/O
+ * call (currently, HDpwrite) has completed. This is needed for ensuring
+ * that blocking write calls do not return early before the data is
+ * actually written.
*/
- mpi_reqs = (struct __mpi_req *)&active_reqs[count];
- mpi_reqs->n_reqs = (int)count;
- mpi_reqs->active_reqs = active_reqs;
+ if (NULL == (sf_io_reqs = HDcalloc((size_t)count, sizeof(*sf_io_reqs))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate I/O request array");
+ if (NULL == (mpi_reqs = HDmalloc(2 * (size_t)count * sizeof(*mpi_reqs))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate MPI request array");
/* Each pass thru the following should queue an MPI write
* to a new IOC. Both the IOC selection and offset within the
@@ -1794,47 +1627,30 @@ H5FD__ioc_write_vector_internal(H5FD_t *_file, uint32_t count, H5FD_mem_t types[
H5_CHECK_OVERFLOW(addrs[i], haddr_t, int64_t);
H5_CHECK_OVERFLOW(sizes[i], size_t, int64_t);
- write_status =
- ioc__write_independent_async(sf_context_id, sf_context->topology->n_io_concentrators,
- (int64_t)addrs[i], (int64_t)sizes[i], bufs[i], &sf_async_reqs[i]);
+ write_status = ioc__write_independent_async(sf_context_id, (int64_t)addrs[i], (int64_t)sizes[i],
+ bufs[i], &sf_io_reqs[i]);
if (write_status < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "couldn't queue write operation");
- mpi_reqs->active_reqs[i] = sf_async_reqs[i]->completion_func.io_args.io_req;
- }
-
- /*
- * Mirror superblock writes to the stub file so that
- * legacy HDF5 applications can check what type of
- * file they are reading
- */
- for (size_t i = 0; i < (size_t)count; i++) {
- if (types[i] == H5FD_MEM_SUPER) {
- if (H5FDwrite(file_ptr->ioc_file, H5FD_MEM_SUPER, H5P_DEFAULT, addrs[i], sizes[i], bufs[i]) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,
- "couldn't write superblock information to stub file");
- }
+ mpi_reqs[(2 * i)] = sf_io_reqs[i]->io_transfer_req;
+ mpi_reqs[(2 * i) + 1] = sf_io_reqs[i]->io_comp_req;
}
/* Here, we should have queued 'count' async requests.
* We can now try to complete those before returning
* to the caller for the next set of IO operations.
*/
- if (sf_async_reqs[0]->completion_func.io_function)
- ret_value = (*sf_async_reqs[0]->completion_func.io_function)(mpi_reqs);
+ if (ioc__async_completion(mpi_reqs, 2 * (size_t)count) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "can't complete I/O requests");
done:
- if (active_reqs)
- HDfree(active_reqs);
+ HDfree(mpi_reqs);
- if (sf_async_reqs) {
- for (size_t i = 0; i < (size_t)count; i++) {
- if (sf_async_reqs[i]) {
- HDfree(sf_async_reqs[i]);
- }
- }
- HDfree(sf_async_reqs);
+ if (sf_io_reqs) {
+ for (size_t i = 0; i < count; i++)
+ HDfree(sf_io_reqs[i]);
+ HDfree(sf_io_reqs);
}
H5_SUBFILING_FUNC_LEAVE;
@@ -1845,15 +1661,11 @@ H5FD__ioc_read_vector_internal(H5FD_t *_file, uint32_t count, haddr_t addrs[], s
void *bufs[] /* out */)
{
subfiling_context_t *sf_context = NULL;
- MPI_Request *active_reqs = NULL;
+ MPI_Request *mpi_reqs = NULL;
H5FD_ioc_t *file_ptr = (H5FD_ioc_t *)_file;
- io_req_t **sf_async_reqs = NULL;
+ io_req_t **sf_io_reqs = NULL;
int64_t sf_context_id = -1;
herr_t ret_value = SUCCEED;
- struct __mpi_req {
- int n_reqs;
- MPI_Request *active_reqs;
- } *mpi_reqs = NULL;
HDassert(_file);
HDassert(addrs);
@@ -1868,36 +1680,31 @@ H5FD__ioc_read_vector_internal(H5FD_t *_file, uint32_t count, haddr_t addrs[], s
if (NULL == (sf_context = H5_get_subfiling_object(sf_context_id)))
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTGET, FAIL, "can't get subfiling context from ID");
HDassert(sf_context->topology);
- HDassert(sf_context->topology->n_io_concentrators);
-
- if (NULL == (active_reqs = HDcalloc((size_t)(count + 2), sizeof(struct __mpi_req))))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "can't allocate active I/O requests array");
-
- if (NULL == (sf_async_reqs = HDcalloc((size_t)count, sizeof(*sf_async_reqs))))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate I/O request array");
/*
- * Note: We allocated extra space in the active_requests (above).
- * The extra should be enough for an integer plus a pointer.
+ * Allocate an array of I/O requests and an array for MPI_Request
+ * objects. Each read I/O request has an MPI_Request object for the
+ * I/O data transfer that, when waited on until completion, signifies
+ * that the actual I/O call (currently, HDpread) has completed and
+ * the data read from the file has been transferred to the caller.
*/
- mpi_reqs = (struct __mpi_req *)&active_reqs[count];
- mpi_reqs->n_reqs = (int)count;
- mpi_reqs->active_reqs = active_reqs;
+ if (NULL == (sf_io_reqs = HDcalloc((size_t)count, sizeof(*sf_io_reqs))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate I/O request array");
+ if (NULL == (mpi_reqs = HDmalloc((size_t)count * sizeof(*mpi_reqs))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate MPI request array");
for (size_t i = 0; i < (size_t)count; i++) {
int read_status;
H5_CHECK_OVERFLOW(addrs[i], haddr_t, int64_t);
H5_CHECK_OVERFLOW(sizes[i], size_t, int64_t);
- read_status =
- ioc__read_independent_async(sf_context_id, sf_context->topology->n_io_concentrators,
- (int64_t)addrs[i], (int64_t)sizes[i], bufs[i], &sf_async_reqs[i]);
+ read_status = ioc__read_independent_async(sf_context_id, (int64_t)addrs[i], (int64_t)sizes[i],
+ bufs[i], &sf_io_reqs[i]);
if (read_status < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't queue read operation");
- mpi_reqs->active_reqs[i] = sf_async_reqs[i]->completion_func.io_args.io_req;
+ mpi_reqs[i] = sf_io_reqs[i]->io_transfer_req;
}
/* Here, we should have queued 'count' async requests
@@ -1906,20 +1713,16 @@ H5FD__ioc_read_vector_internal(H5FD_t *_file, uint32_t count, haddr_t addrs[], s
* We can now try to complete those before returning
* to the caller for the next set of IO operations.
*/
- if (sf_async_reqs[0]->completion_func.io_function)
- ret_value = (*sf_async_reqs[0]->completion_func.io_function)(mpi_reqs);
+ if (ioc__async_completion(mpi_reqs, (size_t)count) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "can't complete I/O requests");
done:
- if (active_reqs)
- HDfree(active_reqs);
+ HDfree(mpi_reqs);
- if (sf_async_reqs) {
- for (size_t i = 0; i < count; i++) {
- if (sf_async_reqs[i]) {
- HDfree(sf_async_reqs[i]);
- }
- }
- HDfree(sf_async_reqs);
+ if (sf_io_reqs) {
+ for (size_t i = 0; i < count; i++)
+ HDfree(sf_io_reqs[i]);
+ HDfree(sf_io_reqs);
}
H5_SUBFILING_FUNC_LEAVE;
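
The rewritten vector I/O internals above pair two MPI_Request objects with every queued write (one for the MPI_Isend data transfer, one for the MPI_Irecv of the IOC's WRITE_DATA_DONE notification) and hand all of them to ioc__async_completion() before returning. A minimal sketch of that completion pattern follows; ioc__async_completion() itself is not shown in this diff, so reducing it to a single MPI_Waitall over every request is an assumption.

/* Sketch (assumption): complete the paired write requests filled in by
 * H5FD__ioc_write_vector_internal(). For write i, mpi_reqs[2*i] is the
 * data-transfer request and mpi_reqs[2*i + 1] is the request for the
 * IOC's completion notification. */
#include <stddef.h>
#include <mpi.h>

static int
complete_paired_write_reqs(MPI_Request *mpi_reqs, size_t count)
{
    /* Wait on all 2 * count requests: transfer + completion per write */
    if (MPI_SUCCESS != MPI_Waitall((int)(2 * count), mpi_reqs, MPI_STATUSES_IGNORE))
        return -1;

    return 0;
}
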
diff --git a/src/H5FDsubfiling/H5FDioc.h b/src/H5FDsubfiling/H5FDioc.h
index 7173aa9..2b68d9c 100644
--- a/src/H5FDsubfiling/H5FDioc.h
+++ b/src/H5FDsubfiling/H5FDioc.h
@@ -84,11 +84,6 @@
* Property List. A pointer to an instance of this structure is
* a parameter to H5Pset_fapl_ioc() and H5Pget_fapl_ioc().
*
- * The #H5FD_IOC driver shares much of its configuration with the
- * #H5FD_SUBFILING driver and so its configuration structure
- * contains an instance of a H5FD_subfiling_shared_config_t
- * configuration structure.
- *
* \var uint32_t H5FD_ioc_config_t::magic
* A somewhat unique number which distinguishes the #H5FD_IOC driver
* from other drivers. Used in combination with a version number, it
@@ -101,31 +96,17 @@
* number or an error will be raised. Currently, this field should be set
* to #H5FD_IOC_CURR_FAPL_VERSION.
*
- * \var hid_t H5FD_ioc_config_t::under_fapl_id
- * The File Access Property List which is setup with the file driver
- * to use for I/O to the HDF5 stub file. The stub file looks like a
- * typical HDF5 file, but currently only contains the superblock metadata
- * for compatibility with legacy HDF5 applications. The default driver used
- * is currently the #H5FD_MPIO driver.
- *
* \var int32_t H5FD_ioc_config_t::thread_pool_size
* The number of I/O concentrator worker threads to use.
*
* This value can also be set or adjusted with the #H5FD_IOC_THREAD_POOL_SIZE
* environment variable.
*
- * \var H5FD_subfiling_shared_config_t H5FD_ioc_config_t::subf_config
- * Subfiling configuration data for the parent #H5FD_SUBFILING driver. This
- * includes the sub-file stripe size, number of I/O concentrators, IOC
- * selection method, etc.
- *
*/
typedef struct H5FD_ioc_config_t {
uint32_t magic; /* Must be set to H5FD_IOC_FAPL_MAGIC */
uint32_t version; /* Must be set to H5FD_IOC_CURR_FAPL_VERSION */
- hid_t under_fapl_id; /* FAPL setup with the VFD to use for I/O to the HDF5 stub file */
int32_t thread_pool_size; /* Number of I/O concentrator worker threads to use */
- H5FD_subfiling_shared_config_t subf_config; /* Subfiling driver configuration */
} H5FD_ioc_config_t;
//! <!-- [H5FD_ioc_config_t_snip] -->
@@ -152,7 +133,7 @@ H5_DLL hid_t H5FD_ioc_init(void);
*
* The #H5FD_IOC driver is a reference implementation of an "I/O concentrator"
* file driver that works in conjunction with the #H5FD_SUBFILING driver and
- * provides the I/O backend for servicing I/O requests to sub-files.
+ * provides the I/O backend for servicing I/O requests to subfiles.
*
* Typically, an HDF5 application won't need to call this routine directly.
* The #H5FD_IOC driver is usually set up as a side effect of an HDF5 application
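
The H5FDioc_int.c changes below extend calculate_target_ioc() so that a logical file offset maps to a target IOC, a subfile index within that IOC, and an offset within that subfile. A standalone worked example of the same arithmetic follows; the formulas mirror the hunk below, while the concrete stripe size, subfile count, and IOC count are illustrative only.

/* Worked example (illustrative numbers) of the offset -> IOC/subfile
 * mapping used by calculate_target_ioc() in the diff below. */
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
    int64_t stripe_size  = 1048576;         /* 1 MiB stripes */
    int     num_subfiles = 4;               /* 4 subfiles total */
    int     num_iocs     = 2;               /* spread across 2 IOC ranks */
    int64_t file_offset  = 5 * stripe_size; /* 5 MiB into the logical file */

    int64_t stripe_idx  = file_offset / stripe_size;               /* 5 */
    int64_t subfile_row = stripe_idx / num_subfiles;               /* 1 */
    int64_t target_ioc  = (stripe_idx % num_subfiles) % num_iocs;  /* 1 */
    int64_t subfile_idx = (stripe_idx % num_subfiles) / num_iocs;  /* 0 */
    int64_t ioc_offset  = (subfile_row * stripe_size) + (file_offset % stripe_size);

    printf("IOC %" PRId64 ", subfile %" PRId64 ", subfile offset %" PRId64 "\n",
           target_ioc, subfile_idx, ioc_offset);

    return 0;
}
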
diff --git a/src/H5FDsubfiling/H5FDioc_int.c b/src/H5FDsubfiling/H5FDioc_int.c
index 71afef4..e2ba95a 100644
--- a/src/H5FDsubfiling/H5FDioc_int.c
+++ b/src/H5FDsubfiling/H5FDioc_int.c
@@ -16,31 +16,36 @@
#include "H5FDioc_priv.h"
-static int async_completion(void *arg);
-
/*
- * Given a file offset, the stripe size and
- * the number of IOCs, calculate the target
- * IOC for I/O and the file offset for the
- * subfile that IOC controls
+ * Given a file offset, the stripe size, the
+ * number of IOCs and the number of subfiles,
+ * calculate the target IOC for I/O, the index
+ * of the target subfile out of the subfiles
+ * that the IOC controls and the file offset
+ * into that subfile
*/
static inline void
-calculate_target_ioc(int64_t file_offset, int64_t stripe_size, int n_io_concentrators, int64_t *target_ioc,
- int64_t *ioc_file_offset)
+calculate_target_ioc(int64_t file_offset, int64_t stripe_size, int num_io_concentrators, int num_subfiles,
+ int64_t *target_ioc, int64_t *ioc_file_offset, int64_t *ioc_subfile_idx)
{
int64_t stripe_idx;
int64_t subfile_row;
+ int64_t subfile_idx;
+ HDassert(stripe_size > 0);
+ HDassert(num_io_concentrators > 0);
+ HDassert(num_subfiles > 0);
HDassert(target_ioc);
HDassert(ioc_file_offset);
- HDassert(stripe_size > 0);
- HDassert(n_io_concentrators > 0);
+ HDassert(ioc_subfile_idx);
stripe_idx = file_offset / stripe_size;
- subfile_row = stripe_idx / n_io_concentrators;
+ subfile_row = stripe_idx / num_subfiles;
+ subfile_idx = (stripe_idx % num_subfiles) / num_io_concentrators;
- *target_ioc = stripe_idx % n_io_concentrators;
+ *target_ioc = (stripe_idx % num_subfiles) % num_io_concentrators;
*ioc_file_offset = (subfile_row * stripe_size) + (file_offset % stripe_size);
+ *ioc_subfile_idx = subfile_idx;
}
/*
@@ -90,17 +95,20 @@ cast_to_void(const void *data)
*-------------------------------------------------------------------------
*/
herr_t
-ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t offset, int64_t elements,
- const void *data, io_req_t **io_req)
+ioc__write_independent_async(int64_t context_id, int64_t offset, int64_t elements, const void *data,
+ io_req_t **io_req)
{
subfiling_context_t *sf_context = NULL;
MPI_Request ack_request = MPI_REQUEST_NULL;
io_req_t *sf_io_request = NULL;
int64_t ioc_start;
int64_t ioc_offset;
+ int64_t ioc_subfile_idx;
int64_t msg[3] = {0};
int *io_concentrators = NULL;
- int data_tag = 0;
+ int num_io_concentrators;
+ int num_subfiles;
+ int data_tag = 0;
int mpi_code;
herr_t ret_value = SUCCEED;
@@ -111,13 +119,16 @@ ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t
HDassert(sf_context->topology);
HDassert(sf_context->topology->io_concentrators);
- io_concentrators = sf_context->topology->io_concentrators;
+ io_concentrators = sf_context->topology->io_concentrators;
+ num_io_concentrators = sf_context->topology->n_io_concentrators;
+ num_subfiles = sf_context->sf_num_subfiles;
/*
* Calculate the IOC that we'll send the I/O request to
* and the offset within that IOC's subfile
*/
- calculate_target_ioc(offset, sf_context->sf_stripe_size, n_io_concentrators, &ioc_start, &ioc_offset);
+ calculate_target_ioc(offset, sf_context->sf_stripe_size, num_io_concentrators, num_subfiles, &ioc_start,
+ &ioc_offset, &ioc_subfile_idx);
/*
* Wait for memory to be allocated on the target IOC before
@@ -141,37 +152,43 @@ ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t
*/
msg[0] = elements;
msg[1] = ioc_offset;
- msg[2] = context_id;
- if (MPI_SUCCESS != (mpi_code = MPI_Send(msg, 3, MPI_INT64_T, io_concentrators[ioc_start], WRITE_INDEP,
- sf_context->sf_msg_comm)))
+ msg[2] = ioc_subfile_idx;
+ if (MPI_SUCCESS != (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type, io_concentrators[ioc_start],
+ WRITE_INDEP, sf_context->sf_msg_comm)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code);
- /* Wait to receive data tag */
+ /* Wait to receive the data tag from the IOC */
if (MPI_SUCCESS != (mpi_code = MPI_Wait(&ack_request, MPI_STATUS_IGNORE)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Wait failed", mpi_code);
if (data_tag == 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "received NACK from IOC");
- /* At this point in the new implementation, we should queue
- * the async write so that when the top level VFD tells us
- * to complete all pending IO requests, we have all the info
- * we need to accomplish that.
+ /*
+ * Allocate the I/O request object that will
+ * be returned to the caller
*/
if (NULL == (sf_io_request = HDmalloc(sizeof(io_req_t))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_WRITEERROR, FAIL, "couldn't allocate I/O request");
H5_CHECK_OVERFLOW(ioc_start, int64_t, int);
- sf_io_request->completion_func.io_args.ioc = (int)ioc_start;
- sf_io_request->completion_func.io_args.context_id = context_id;
- sf_io_request->completion_func.io_args.offset = offset;
- sf_io_request->completion_func.io_args.elements = elements;
- sf_io_request->completion_func.io_args.data = cast_to_void(data);
- sf_io_request->completion_func.io_args.io_req = MPI_REQUEST_NULL;
- sf_io_request->completion_func.io_function = async_completion;
- sf_io_request->completion_func.pending = 0;
+ sf_io_request->ioc = (int)ioc_start;
+ sf_io_request->context_id = context_id;
+ sf_io_request->offset = offset;
+ sf_io_request->elements = elements;
+ sf_io_request->data = cast_to_void(data);
+ sf_io_request->io_transfer_req = MPI_REQUEST_NULL;
+ sf_io_request->io_comp_req = MPI_REQUEST_NULL;
+ sf_io_request->io_comp_tag = -1;
- sf_io_request->prev = sf_io_request->next = NULL;
+ /*
+ * Start a non-blocking receive from the IOC that signifies
+ * when the actual write is complete
+ */
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Irecv(&sf_io_request->io_comp_tag, 1, MPI_INT, io_concentrators[ioc_start],
+ WRITE_DATA_DONE, sf_context->sf_data_comm, &sf_io_request->io_comp_req)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Irecv failed", mpi_code);
/*
* Start the actual data transfer using the ack received
@@ -180,7 +197,7 @@ ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t
H5_CHECK_OVERFLOW(elements, int64_t, int);
if (MPI_SUCCESS !=
(mpi_code = MPI_Isend(data, (int)elements, MPI_BYTE, io_concentrators[ioc_start], data_tag,
- sf_context->sf_data_comm, &sf_io_request->completion_func.io_args.io_req)))
+ sf_context->sf_data_comm, &sf_io_request->io_transfer_req)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Isend failed", mpi_code);
/*
@@ -193,14 +210,23 @@ ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t
* to the caller.
*/
- sf_io_request->completion_func.pending = 1;
- *io_req = sf_io_request;
+ *io_req = sf_io_request;
done:
if (ret_value < 0) {
if (ack_request != MPI_REQUEST_NULL) {
- if (MPI_SUCCESS != (mpi_code = MPI_Cancel(&ack_request)))
- H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Cancel failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Wait(&ack_request, MPI_STATUS_IGNORE)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Wait failed", mpi_code);
+ }
+ if (sf_io_request) {
+ if (sf_io_request->io_transfer_req != MPI_REQUEST_NULL) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Wait(&sf_io_request->io_transfer_req, MPI_STATUS_IGNORE)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Wait failed", mpi_code);
+ }
+ if (sf_io_request->io_comp_req != MPI_REQUEST_NULL) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Wait(&sf_io_request->io_comp_req, MPI_STATUS_IGNORE)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Wait failed", mpi_code);
+ }
}
HDfree(sf_io_request);
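
Read together, the hunks above amount to a five-step client-side handshake for an independent write. The function below is a simplified standalone model of that ordering using raw MPI calls only: the tag constants are placeholders, the ACK receive is assumed to be posted before the RPC goes out (the hunk that posts it is not shown here), and the HDF5 error macros and context lookups are left out.

    #include <mpi.h>
    #include <stdint.h>

    int
    write_handshake_sketch(MPI_Comm msg_comm, MPI_Comm data_comm, MPI_Datatype rpc_type,
                           int ioc_rank, int64_t elements, int64_t ioc_offset,
                           int64_t subfile_idx, const void *buf,
                           MPI_Request *xfer_req, MPI_Request *done_req, int *done_tag)
    {
        enum { WRITE_INDEP = 1, WRITE_INDEP_ACK = 2, WRITE_DATA_DONE = 3 }; /* placeholders */
        int64_t     msg[3]   = { elements, ioc_offset, subfile_idx };
        int         data_tag = 0;
        MPI_Request ack      = MPI_REQUEST_NULL;

        /* 1) post the ACK receive before sending the RPC so it cannot be missed */
        if (MPI_Irecv(&data_tag, 1, MPI_INT, ioc_rank, WRITE_INDEP_ACK, data_comm, &ack) != MPI_SUCCESS)
            return -1;
        /* 2) send the {elements, offset, subfile index} header as one derived-type element */
        if (MPI_Send(msg, 1, rpc_type, ioc_rank, WRITE_INDEP, msg_comm) != MPI_SUCCESS)
            return -1;
        /* 3) wait for the ACK; a zero tag is a NACK from the IOC */
        if (MPI_Wait(&ack, MPI_STATUS_IGNORE) != MPI_SUCCESS || data_tag == 0)
            return -1;
        /* 4) arrange to hear that the IOC finished the file write ... */
        if (MPI_Irecv(done_tag, 1, MPI_INT, ioc_rank, WRITE_DATA_DONE, data_comm, done_req) != MPI_SUCCESS)
            return -1;
        /* 5) ... then stream the data using the tag the IOC handed back */
        if (MPI_Isend(buf, (int)elements, MPI_BYTE, ioc_rank, data_tag, data_comm, xfer_req) != MPI_SUCCESS)
            return -1;

        return 0;
    }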
@@ -241,81 +267,141 @@ done:
*-------------------------------------------------------------------------
*/
herr_t
-ioc__read_independent_async(int64_t context_id, int n_io_concentrators, int64_t offset, int64_t elements,
- void *data, io_req_t **io_req)
+ioc__read_independent_async(int64_t context_id, int64_t offset, int64_t elements, void *data,
+ io_req_t **io_req)
{
subfiling_context_t *sf_context = NULL;
+ MPI_Request ack_request = MPI_REQUEST_NULL;
io_req_t *sf_io_request = NULL;
+ hbool_t need_data_tag = FALSE;
int64_t ioc_start;
int64_t ioc_offset;
+ int64_t ioc_subfile_idx;
int64_t msg[3] = {0};
int *io_concentrators = NULL;
+ int num_io_concentrators;
+ int num_subfiles;
+ int data_tag = 0;
int mpi_code;
herr_t ret_value = SUCCEED;
HDassert(io_req);
+ H5_CHECK_OVERFLOW(elements, int64_t, int);
+
if (NULL == (sf_context = H5_get_subfiling_object(context_id)))
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "can't get subfiling context from ID");
HDassert(sf_context->topology);
HDassert(sf_context->topology->io_concentrators);
- io_concentrators = sf_context->topology->io_concentrators;
+ io_concentrators = sf_context->topology->io_concentrators;
+ num_io_concentrators = sf_context->topology->n_io_concentrators;
+ num_subfiles = sf_context->sf_num_subfiles;
+
+ /*
+ * If we are using 1 subfile per IOC, we can optimize reads
+ * a little since each read will go to a separate IOC and we
+ * won't be in danger of data being received in an
+ * unpredictable order. However, if some IOCs own more than
+ * 1 subfile, we need to associate each read with a unique
+ * message tag to make sure the data is received in the
+ * correct order.
+ */
+ need_data_tag = num_subfiles != num_io_concentrators;
+ if (!need_data_tag)
+ data_tag = READ_INDEP_DATA;
/*
* Calculate the IOC that we'll send the I/O request to
* and the offset within that IOC's subfile
*/
- calculate_target_ioc(offset, sf_context->sf_stripe_size, n_io_concentrators, &ioc_start, &ioc_offset);
+ calculate_target_ioc(offset, sf_context->sf_stripe_size, num_io_concentrators, num_subfiles, &ioc_start,
+ &ioc_offset, &ioc_subfile_idx);
/*
- * At this point in the new implementation, we should queue
- * the non-blocking recv so that when the top level VFD tells
- * us to complete all pending IO requests, we have all the info
- * we need to accomplish that.
- *
- * Post the early non-blocking receive here.
+ * Allocate the I/O request object that will
+ * be returned to the caller
*/
if (NULL == (sf_io_request = HDmalloc(sizeof(io_req_t))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_READERROR, FAIL, "couldn't allocate I/O request");
H5_CHECK_OVERFLOW(ioc_start, int64_t, int);
- sf_io_request->completion_func.io_args.ioc = (int)ioc_start;
- sf_io_request->completion_func.io_args.context_id = context_id;
- sf_io_request->completion_func.io_args.offset = offset;
- sf_io_request->completion_func.io_args.elements = elements;
- sf_io_request->completion_func.io_args.data = data;
- sf_io_request->completion_func.io_args.io_req = MPI_REQUEST_NULL;
- sf_io_request->completion_func.io_function = async_completion;
- sf_io_request->completion_func.pending = 0;
-
- sf_io_request->prev = sf_io_request->next = NULL;
+ sf_io_request->ioc = (int)ioc_start;
+ sf_io_request->context_id = context_id;
+ sf_io_request->offset = offset;
+ sf_io_request->elements = elements;
+ sf_io_request->data = data;
+ sf_io_request->io_transfer_req = MPI_REQUEST_NULL;
+ sf_io_request->io_comp_req = MPI_REQUEST_NULL;
+ sf_io_request->io_comp_tag = -1;
+
+ if (need_data_tag) {
+ /*
+ * Post an early non-blocking receive for IOC to send an ACK
+ * (or NACK) message with a data tag that we will use for
+ * receiving data
+ */
+ if (MPI_SUCCESS != (mpi_code = MPI_Irecv(&data_tag, 1, MPI_INT, io_concentrators[ioc_start],
+ READ_INDEP_ACK, sf_context->sf_data_comm, &ack_request)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Irecv failed", mpi_code);
+
+ /*
+ * Prepare and send an I/O request to the IOC identified
+ * by the file offset
+ */
+ msg[0] = elements;
+ msg[1] = ioc_offset;
+ msg[2] = ioc_subfile_idx;
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type, io_concentrators[ioc_start], READ_INDEP,
+ sf_context->sf_msg_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code);
+
+ /* Wait to receive the data tag from the IOC */
+ if (MPI_SUCCESS != (mpi_code = MPI_Wait(&ack_request, MPI_STATUS_IGNORE)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Wait failed", mpi_code);
+
+ if (data_tag == 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "received NACK from IOC");
+ }
- H5_CHECK_OVERFLOW(elements, int64_t, int);
+ /*
+ * Post a non-blocking receive for the data from the IOC
+ * using the selected data tag (either the one received
+ * from the IOC or the static READ_INDEP_DATA tag)
+ */
if (MPI_SUCCESS !=
- (mpi_code = MPI_Irecv(data, (int)elements, MPI_BYTE, io_concentrators[ioc_start], READ_INDEP_DATA,
- sf_context->sf_data_comm, &sf_io_request->completion_func.io_args.io_req)))
+ (mpi_code = MPI_Irecv(data, (int)elements, MPI_BYTE, io_concentrators[ioc_start], data_tag,
+ sf_context->sf_data_comm, &sf_io_request->io_transfer_req)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Irecv failed", mpi_code);
- sf_io_request->completion_func.pending = 1;
- *io_req = sf_io_request;
+ if (!need_data_tag) {
+ /*
+ * Prepare and send an I/O request to the IOC identified
+ * by the file offset
+ */
+ msg[0] = elements;
+ msg[1] = ioc_offset;
+ msg[2] = ioc_subfile_idx;
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type, io_concentrators[ioc_start], READ_INDEP,
+ sf_context->sf_msg_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code);
+ }
- /*
- * Prepare and send an I/O request to the IOC identified
- * by the file offset
- */
- msg[0] = elements;
- msg[1] = ioc_offset;
- msg[2] = context_id;
- if (MPI_SUCCESS != (mpi_code = MPI_Send(msg, 3, MPI_INT64_T, io_concentrators[ioc_start], READ_INDEP,
- sf_context->sf_msg_comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code);
+ *io_req = sf_io_request;
done:
if (ret_value < 0) {
- if (sf_io_request && sf_io_request->completion_func.io_args.io_req != MPI_REQUEST_NULL) {
- if (MPI_SUCCESS != (mpi_code = MPI_Cancel(&sf_io_request->completion_func.io_args.io_req)))
- H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Cancel failed", mpi_code);
+ if (ack_request != MPI_REQUEST_NULL) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Wait(&ack_request, MPI_STATUS_IGNORE)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Wait failed", mpi_code);
+ }
+ if (sf_io_request) {
+ if (sf_io_request->io_transfer_req != MPI_REQUEST_NULL) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Wait(&sf_io_request->io_transfer_req, MPI_STATUS_IGNORE)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Wait failed", mpi_code);
+ }
}
HDfree(sf_io_request);
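
The read path differs from the write path mainly in how the data tag is chosen, as the comment near the top of this function explains. A compact sketch of that decision, with placeholder constants:

    #include <stdbool.h>

    enum { READ_INDEP_DATA = 10 };   /* placeholder for the real tag value */

    /* Returns the MPI tag to use for the data receive; 0 means "negotiate a
     * per-request tag through the READ_INDEP_ACK handshake first". */
    static int
    choose_read_data_tag(int num_subfiles, int num_io_concentrators, bool *need_handshake)
    {
        /* With more subfiles than IOCs, one IOC can serve several concurrent reads
         * from the same rank, so a fixed tag could match the wrong message. */
        *need_handshake = (num_subfiles != num_io_concentrators);
        return *need_handshake ? 0 : READ_INDEP_DATA;
    }

When no handshake is needed, the data receive can be posted before the RPC is even sent, which is why the MPI_Send moves after the MPI_Irecv in that branch above.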
@@ -326,56 +412,27 @@ done:
} /* end ioc__read_independent_async() */
/*-------------------------------------------------------------------------
- * Function: async_completion
+ * Function: ioc__async_completion
*
- * Purpose: Given a single io_func_t structure containing the function
- * pointer and it's input arguments and a single MPI_Request
- * argument which needs to be completed, we make progress
- * by calling MPI_Test. In this initial example, we loop
- * until the request is completed as indicated by a non-zero
- * flag variable.
+ * Purpose: IOC function to complete outstanding I/O requests.
+ * Currently just a wrapper around MPI_Waitall on the given
+ * MPI_Request array.
*
- * As we go further with the implementation, we anticipate that
- * rather than testing a single request variable, we will
- * deal with a collection of all pending IO requests (on
- * this rank).
+ * Return: Non-negative on success/Negative on failure
*
- * Return: an integer status. Zero(0) indicates success. Negative
- * values (-1) indicates an error.
*-------------------------------------------------------------------------
*/
-static int
-async_completion(void *arg)
+herr_t
+ioc__async_completion(MPI_Request *mpi_reqs, size_t num_reqs)
{
- int n_reqs;
- int mpi_code;
- int ret_value = 0;
- struct async_arg {
- int n_reqs;
- MPI_Request *sf_reqs;
- } *in_progress = (struct async_arg *)arg;
-
- HDassert(arg);
-
- n_reqs = in_progress->n_reqs;
+ herr_t ret_value = SUCCEED;
+ int mpi_code;
- if (n_reqs < 0) {
-#ifdef H5FD_IOC_DEBUG
- HDprintf("%s: invalid number of in progress I/O requests\n", __func__);
-#endif
+ HDassert(mpi_reqs);
- ret_value = -1;
- goto done;
- }
-
- if (MPI_SUCCESS != (mpi_code = MPI_Waitall(n_reqs, in_progress->sf_reqs, MPI_STATUSES_IGNORE))) {
-#ifdef H5FD_IOC_DEBUG
- HDprintf("%s: MPI_Waitall failed with rc %d\n", __func__, mpi_code);
-#endif
-
- ret_value = -1;
- goto done;
- }
+ H5_CHECK_OVERFLOW(num_reqs, size_t, int);
+ if (MPI_SUCCESS != (mpi_code = MPI_Waitall((int)num_reqs, mpi_reqs, MPI_STATUSES_IGNORE)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Waitall failed", mpi_code);
done:
H5_SUBFILING_FUNC_LEAVE;
diff --git a/src/H5FDsubfiling/H5FDioc_priv.h b/src/H5FDsubfiling/H5FDioc_priv.h
index a86810c..3b0c4d0 100644
--- a/src/H5FDsubfiling/H5FDioc_priv.h
+++ b/src/H5FDsubfiling/H5FDioc_priv.h
@@ -394,26 +394,15 @@ typedef struct ioc_io_queue {
* input arguments for the functions which were originally
* invoked. See below.
*/
-typedef struct _client_io_args {
- int ioc; /* ID of the IO Concentrator handling this IO. */
- int64_t context_id; /* The context id provided for the read or write */
- int64_t offset; /* The file offset for the IO operation */
- int64_t elements; /* How many bytes */
- void *data; /* A pointer to the (contiguous) data segment */
- MPI_Request io_req; /* An MPI request to allow the code to loop while */
- /* making progress on multiple IOs */
-} io_args_t;
-
-typedef struct _client_io_func {
- int (*io_function)(void *this_io); /* pointer to a completion function */
- io_args_t io_args; /* arguments passed to the completion function */
- int pending; /* The function is complete (0) or pending (1)? */
-} io_func_t;
-
typedef struct _io_req {
- struct _io_req *prev; /* A simple list structure containing completion */
- struct _io_req *next; /* functions. These should get removed as IO ops */
- io_func_t completion_func; /* are completed */
+ int ioc; /* ID of the IO Concentrator handling this IO. */
+ int64_t context_id; /* The context id provided for the read or write */
+ int64_t offset; /* The file offset for the IO operation */
+ int64_t elements; /* How many bytes */
+ void *data; /* A pointer to the (contiguous) data segment */
+ MPI_Request io_transfer_req; /* MPI request for Isend/Irecv of I/O data */
+ MPI_Request io_comp_req; /* MPI request signifying when actual I/O is finished */
+ int io_comp_tag; /* MPI tag value used for completed I/O request */
} io_req_t;
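
With the list pointers and completion callback gone, an io_req_t is now just a record of up to two outstanding MPI requests. Below is a hypothetical caller-side helper showing how those requests might be gathered and handed to ioc__async_completion() (declared later in this header); the helper itself is illustrative and assumes the io_req_t definition above.

    #include <mpi.h>
    #include <stdlib.h>

    static int
    complete_pending_io(io_req_t **reqs, size_t count)
    {
        MPI_Request *mpi_reqs;
        size_t       n = 0;

        if (NULL == (mpi_reqs = malloc(2 * count * sizeof(*mpi_reqs))))
            return -1;

        for (size_t i = 0; i < count; i++) {
            mpi_reqs[n++] = reqs[i]->io_transfer_req; /* Isend/Irecv of the data itself */
            mpi_reqs[n++] = reqs[i]->io_comp_req;     /* IOC's "write finished" notification */
        }

        /* MPI_Waitall ignores MPI_REQUEST_NULL entries, so reads (which never set
         * io_comp_req) can be mixed freely with writes in the same array. */
        if (ioc__async_completion(mpi_reqs, n) < 0) {
            free(mpi_reqs);
            return -1;
        }

        free(mpi_reqs);
        return 0;
    }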
extern int *H5FD_IOC_tag_ub_val_ptr;
@@ -425,10 +414,12 @@ extern "C" {
H5_DLL int initialize_ioc_threads(void *_sf_context);
H5_DLL int finalize_ioc_threads(void *_sf_context);
-H5_DLL herr_t ioc__write_independent_async(int64_t context_id, int n_io_concentrators, int64_t offset,
- int64_t elements, const void *data, io_req_t **io_req);
-H5_DLL herr_t ioc__read_independent_async(int64_t context_id, int n_io_concentrators, int64_t offset,
- int64_t elements, void *data, io_req_t **io_req);
+H5_DLL herr_t ioc__write_independent_async(int64_t context_id, int64_t offset, int64_t elements,
+ const void *data, io_req_t **io_req);
+H5_DLL herr_t ioc__read_independent_async(int64_t context_id, int64_t offset, int64_t elements, void *data,
+ io_req_t **io_req);
+
+H5_DLL herr_t ioc__async_completion(MPI_Request *mpi_reqs, size_t num_reqs);
H5_DLL int wait_for_thread_main(void);
diff --git a/src/H5FDsubfiling/H5FDioc_threads.c b/src/H5FDsubfiling/H5FDioc_threads.c
index 813fb3f..b3e8ebc 100644
--- a/src/H5FDsubfiling/H5FDioc_threads.c
+++ b/src/H5FDsubfiling/H5FDioc_threads.c
@@ -72,16 +72,16 @@ static double sf_queue_delay_time = 0.0;
static HG_THREAD_RETURN_TYPE ioc_thread_main(void *arg);
static int ioc_main(ioc_data_t *ioc_data);
-static int ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm,
+static int ioc_file_queue_write_indep(sf_work_request_t *msg, int ioc_idx, int source, MPI_Comm comm,
uint32_t counter);
-static int ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm);
+static int ioc_file_queue_read_indep(sf_work_request_t *msg, int ioc_idx, int source, MPI_Comm comm,
+ uint32_t counter);
static int ioc_file_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size,
- int subfile_rank);
-static int ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size,
- int subfile_rank);
-static int ioc_file_truncate(int fd, int64_t length, int subfile_rank);
-static int ioc_file_report_eof(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm);
+ int ioc_idx);
+static int ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int ioc_idx);
+static int ioc_file_truncate(sf_work_request_t *msg);
+static int ioc_file_report_eof(sf_work_request_t *msg, MPI_Comm comm);
static ioc_io_queue_entry_t *ioc_io_queue_alloc_entry(void);
static void ioc_io_queue_complete_entry(ioc_data_t *ioc_data, ioc_io_queue_entry_t *entry_ptr);
@@ -156,6 +156,8 @@ initialize_ioc_threads(void *_sf_context)
#endif
};
+ sf_context->ioc_data = ioc_data;
+
/* Initialize atomic vars */
atomic_init(&ioc_data->sf_ioc_ready, 0);
atomic_init(&ioc_data->sf_shutdown_flag, 0);
@@ -194,7 +196,7 @@ initialize_ioc_threads(void *_sf_context)
t_end = MPI_Wtime();
#ifdef H5FD_IOC_DEBUG
- if (sf_context->topology->subfile_rank == 0) {
+ if (sf_context->topology->ioc_idx == 0) {
HDprintf("%s: time = %lf seconds\n", __func__, (t_end - t_start));
HDfflush(stdout);
}
@@ -202,8 +204,6 @@ initialize_ioc_threads(void *_sf_context)
#endif
- sf_context->ioc_data = ioc_data;
-
done:
H5_SUBFILING_FUNC_LEAVE;
}
@@ -245,6 +245,7 @@ finalize_ioc_threads(void *_sf_context)
ioc_data->io_queue.num_failed);
HDfree(ioc_data);
+ sf_context->ioc_data = NULL;
H5_SUBFILING_FUNC_LEAVE;
}
@@ -346,7 +347,6 @@ ioc_main(ioc_data_t *ioc_data)
{
subfiling_context_t *context = NULL;
sf_work_request_t wk_req;
- int subfile_rank;
int shutdown_requested;
int ret_value = 0;
@@ -362,8 +362,6 @@ ioc_main(ioc_data_t *ioc_data)
* represent an open file).
*/
- subfile_rank = context->sf_group_rank;
-
/* tell initialize_ioc_threads() that ioc_main() is ready to enter its main loop */
atomic_store(&ioc_data->sf_ioc_ready, 1);
@@ -415,11 +413,11 @@ ioc_main(ioc_data_t *ioc_data)
queue_start_time = MPI_Wtime();
- wk_req.tag = tag;
- wk_req.source = source;
- wk_req.subfile_rank = subfile_rank;
- wk_req.context_id = ioc_data->sf_context_id;
- wk_req.start_time = queue_start_time;
+ wk_req.tag = tag;
+ wk_req.source = source;
+ wk_req.ioc_idx = context->topology->ioc_idx;
+ wk_req.context_id = ioc_data->sf_context_id;
+ wk_req.start_time = queue_start_time;
ioc_io_queue_add_entry(ioc_data, &wk_req);
@@ -506,7 +504,7 @@ handle_work_request(void *arg)
subfiling_context_t *sf_context = NULL;
sf_work_request_t *msg = &(q_entry_ptr->wk_req);
ioc_data_t *ioc_data = NULL;
- int64_t file_context_id = msg->header[2];
+ int64_t file_context_id = msg->context_id;
int op_ret;
hg_thread_ret_t ret_value = 0;
@@ -524,27 +522,27 @@ handle_work_request(void *arg)
switch (msg->tag) {
case WRITE_INDEP:
- op_ret = ioc_file_queue_write_indep(msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm,
+ op_ret = ioc_file_queue_write_indep(msg, msg->ioc_idx, msg->source, sf_context->sf_data_comm,
q_entry_ptr->counter);
break;
case READ_INDEP:
- op_ret = ioc_file_queue_read_indep(msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm);
+ op_ret = ioc_file_queue_read_indep(msg, msg->ioc_idx, msg->source, sf_context->sf_data_comm,
+ q_entry_ptr->counter);
break;
case TRUNC_OP:
- op_ret = ioc_file_truncate(sf_context->sf_fid, q_entry_ptr->wk_req.header[0],
- sf_context->topology->subfile_rank);
+ op_ret = ioc_file_truncate(msg);
break;
case GET_EOF_OP:
- op_ret = ioc_file_report_eof(msg, msg->subfile_rank, msg->source, sf_context->sf_eof_comm);
+ op_ret = ioc_file_report_eof(msg, sf_context->sf_eof_comm);
break;
default:
#ifdef H5_SUBFILING_DEBUG
H5_subfiling_log(file_context_id, "%s: IOC %d received unknown message with tag %x from rank %d",
- __func__, msg->subfile_rank, msg->tag, msg->source);
+ __func__, msg->ioc_idx, msg->tag, msg->source);
#endif
op_ret = -1;
@@ -555,11 +553,11 @@ handle_work_request(void *arg)
if (op_ret < 0) {
#ifdef H5_SUBFILING_DEBUG
- H5_subfiling_log(
- file_context_id,
- "%s: IOC %d request(%s) filename=%s from rank(%d), size=%ld, offset=%ld FAILED with ret %d",
- __func__, msg->subfile_rank, translate_opcode((io_op_t)msg->tag), sf_context->sf_filename,
- msg->source, msg->header[0], msg->header[1], op_ret);
+ H5_subfiling_log(file_context_id,
+ "%s: IOC %d request(%s) from rank(%d), (%" PRId64 ", %" PRId64 ", %" PRId64
+ ") FAILED with ret %d",
+ __func__, msg->ioc_idx, translate_opcode((io_op_t)msg->tag), msg->source,
+ msg->header[0], msg->header[1], msg->header[2], op_ret);
#endif
q_entry_ptr->wk_ret = op_ret;
@@ -686,15 +684,15 @@ from the thread pool threads...
*-------------------------------------------------------------------------
*/
static int
-ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm,
- uint32_t counter)
+ioc_file_queue_write_indep(sf_work_request_t *msg, int ioc_idx, int source, MPI_Comm comm, uint32_t counter)
{
subfiling_context_t *sf_context = NULL;
MPI_Status msg_status;
hbool_t send_nack = FALSE;
+ int64_t file_context_id;
int64_t data_size;
int64_t file_offset;
- int64_t file_context_id;
+ int64_t subfile_idx;
int64_t stripe_id;
haddr_t sf_eof;
#ifdef H5FD_IOC_COLLECT_STATS
@@ -714,10 +712,12 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source,
HDassert(msg);
+ file_context_id = msg->context_id;
+
/* Retrieve the fields of the RPC message for the write operation */
- data_size = msg->header[0];
- file_offset = msg->header[1];
- file_context_id = msg->header[2];
+ data_size = msg->header[0];
+ file_offset = msg->header[1];
+ subfile_idx = msg->header[2];
if (data_size < 0) {
send_nack = TRUE;
@@ -746,7 +746,7 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source,
#ifdef H5_SUBFILING_DEBUG
H5_subfiling_log(file_context_id,
"[ioc(%d) %s]: msg from %d: datasize=%ld\toffset=%ld, queue_delay = %lf seconds\n",
- subfile_rank, __func__, source, data_size, file_offset, t_queue_delay);
+ ioc_idx, __func__, source, data_size, file_offset, t_queue_delay);
#endif
#endif
@@ -764,12 +764,12 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source,
* allows us to distinguish between multiple concurrent
* writes from a single rank.
*/
- HDassert(H5FD_IOC_tag_ub_val_ptr && (*H5FD_IOC_tag_ub_val_ptr >= WRITE_TAG_BASE));
- rcv_tag = (int)(counter % (INT_MAX - WRITE_TAG_BASE));
- rcv_tag %= (*H5FD_IOC_tag_ub_val_ptr - WRITE_TAG_BASE);
- rcv_tag += WRITE_TAG_BASE;
+ HDassert(H5FD_IOC_tag_ub_val_ptr && (*H5FD_IOC_tag_ub_val_ptr >= IO_TAG_BASE));
+ rcv_tag = (int)(counter % (INT_MAX - IO_TAG_BASE));
+ rcv_tag %= (*H5FD_IOC_tag_ub_val_ptr - IO_TAG_BASE);
+ rcv_tag += IO_TAG_BASE;
- if (send_ack_to_client(rcv_tag, source, subfile_rank, WRITE_INDEP_ACK, comm) < 0)
+ if (send_ack_to_client(rcv_tag, source, ioc_idx, WRITE_INDEP_ACK, comm) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, -1, "couldn't send ACK to client");
/* Receive data from client */
@@ -794,13 +794,14 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source,
t_start = t_end;
#ifdef H5_SUBFILING_DEBUG
- H5_subfiling_log(file_context_id, "[ioc(%d) %s] MPI_Recv(%ld bytes, from = %d) status = %d\n",
- subfile_rank, __func__, data_size, source, mpi_code);
+ H5_subfiling_log(file_context_id, "[ioc(%d) %s] MPI_Recv(%ld bytes, from = %d) status = %d\n", ioc_idx,
+ __func__, data_size, source, mpi_code);
#endif
#endif
- sf_fid = sf_context->sf_fid;
+ HDassert(subfile_idx < sf_context->sf_num_fids);
+ sf_fid = sf_context->sf_fids[subfile_idx];
#ifdef H5_SUBFILING_DEBUG
if (sf_fid < 0)
@@ -810,7 +811,7 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source,
if (sf_fid >= 0) {
/* Actually write data received from client into subfile */
- if ((write_ret = ioc_file_write_data(sf_fid, file_offset, recv_buf, data_size, subfile_rank)) < 0)
+ if ((write_ret = ioc_file_write_data(sf_fid, file_offset, recv_buf, data_size, ioc_idx)) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, -1,
"write function(FID=%d, Source=%d) returned an error (%d)", sf_fid,
source, write_ret);
@@ -834,10 +835,17 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source,
H5FD_ioc_end_thread_exclusive();
+ /*
+ * Send a message back to the client that the I/O call has
+ * completed and it is safe to return from the write call
+ */
+ if (MPI_SUCCESS != (mpi_code = MPI_Send(&rcv_tag, 1, MPI_INT, source, WRITE_DATA_DONE, comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(-1, "MPI_Send failed", mpi_code);
+
done:
if (send_nack) {
/* Send NACK back to client so client can handle failure gracefully */
- if (send_nack_to_client(source, subfile_rank, WRITE_INDEP_ACK, comm) < 0)
+ if (send_nack_to_client(source, ioc_idx, WRITE_INDEP_ACK, comm) < 0)
H5_SUBFILING_DONE_ERROR(H5E_IO, H5E_WRITEERROR, -1, "couldn't send NACK to client");
}
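
The rcv_tag arithmetic above is worth a worked example: the first modulo bounds the 32-bit counter, the second folds it into the range of tags the MPI library allows, and the final addition shifts it above IO_TAG_BASE so it can never collide with the fixed control tags. The values below are hypothetical (IO_TAG_BASE and the MPI_TAG_UB attribute value are stand-ins).

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        const int IO_TAG_BASE = 100;         /* stand-in for the real constant */
        const int tag_ub      = 32767;       /* smallest MPI_TAG_UB an MPI library must offer */
        uint32_t  counter     = 4000000000u; /* four-billionth queued request */

        int rcv_tag = (int)(counter % (uint32_t)(INT_MAX - IO_TAG_BASE));
        rcv_tag %= (tag_ub - IO_TAG_BASE);
        rcv_tag += IO_TAG_BASE;

        /* rcv_tag always lands in [IO_TAG_BASE, tag_ub), clear of the control tags */
        printf("rcv_tag = %d\n", rcv_tag);
        return 0;
    }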
@@ -867,13 +875,16 @@ done:
*-------------------------------------------------------------------------
*/
static int
-ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm)
+ioc_file_queue_read_indep(sf_work_request_t *msg, int ioc_idx, int source, MPI_Comm comm, uint32_t counter)
{
subfiling_context_t *sf_context = NULL;
hbool_t send_empty_buf = TRUE;
+ hbool_t send_nack = FALSE;
+ hbool_t need_data_tag = FALSE;
+ int64_t file_context_id;
int64_t data_size;
int64_t file_offset;
- int64_t file_context_id;
+ int64_t subfile_idx;
#ifdef H5FD_IOC_COLLECT_STATS
double t_start;
double t_end;
@@ -881,6 +892,7 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source,
double t_queue_delay;
#endif
char *send_buf = NULL;
+ int send_tag;
int sf_fid;
int read_ret;
int mpi_code;
@@ -888,17 +900,37 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source,
HDassert(msg);
- /* Retrieve the fields of the RPC message for the read operation */
- data_size = msg->header[0];
- file_offset = msg->header[1];
- file_context_id = msg->header[2];
-
- if (data_size < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_BADVALUE, -1, "invalid data size for read");
+ file_context_id = msg->context_id;
sf_context = H5_get_subfiling_object(file_context_id);
HDassert(sf_context);
+ /*
+ * If we are using 1 subfile per IOC, we can optimize reads
+ * a little since each read will go to a separate IOC and we
+ * won't be in danger of data being received in an
+ * unpredictable order. However, if some IOCs own more than
+ * 1 subfile, we need to associate each read with a unique
+ * message tag to make sure the data is received in the
+ * correct order.
+ */
+ need_data_tag = sf_context->sf_num_subfiles != sf_context->topology->n_io_concentrators;
+ if (!need_data_tag)
+ send_tag = READ_INDEP_DATA;
+
+ /* Retrieve the fields of the RPC message for the read operation */
+ data_size = msg->header[0];
+ file_offset = msg->header[1];
+ subfile_idx = msg->header[2];
+
+ if (data_size < 0) {
+ if (need_data_tag) {
+ send_nack = TRUE;
+ send_empty_buf = FALSE;
+ }
+ H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_BADVALUE, -1, "invalid data size for read");
+ }
+
/* Flag that we've attempted to read data from the file */
sf_context->sf_read_count++;
@@ -911,22 +943,48 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source,
#ifdef H5_SUBFILING_DEBUG
H5_subfiling_log(file_context_id,
- "[ioc(%d) %s] msg from %d: datasize=%ld\toffset=%ld queue_delay=%lf seconds\n",
- subfile_rank, __func__, source, data_size, file_offset, t_queue_delay);
+ "[ioc(%d) %s] msg from %d: datasize=%ld\toffset=%ld queue_delay=%lf seconds\n", ioc_idx,
+ __func__, source, data_size, file_offset, t_queue_delay);
#endif
#endif
/* Allocate space to send data read from file to client */
- if (NULL == (send_buf = HDmalloc((size_t)data_size)))
+ if (NULL == (send_buf = HDmalloc((size_t)data_size))) {
+ if (need_data_tag) {
+ send_nack = TRUE;
+ send_empty_buf = FALSE;
+ }
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, -1, "couldn't allocate send buffer for data");
+ }
+
+ if (need_data_tag) {
+ /*
+ * Calculate message tag for the client to use for receiving
+ * data, then send an ACK message to the client with the
+ * calculated message tag. This calculated message tag
+ * allows us to distinguish between multiple concurrent
+ * reads from a single rank, which can happen when a rank
+ * owns multiple subfiles.
+ */
+ HDassert(H5FD_IOC_tag_ub_val_ptr && (*H5FD_IOC_tag_ub_val_ptr >= IO_TAG_BASE));
+ send_tag = (int)(counter % (INT_MAX - IO_TAG_BASE));
+ send_tag %= (*H5FD_IOC_tag_ub_val_ptr - IO_TAG_BASE);
+ send_tag += IO_TAG_BASE;
+
+ if (send_ack_to_client(send_tag, source, ioc_idx, READ_INDEP_ACK, comm) < 0) {
+ send_empty_buf = FALSE;
+ H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, -1, "couldn't send ACK to client");
+ }
+ }
- sf_fid = sf_context->sf_fid;
+ /* Read data from the subfile */
+ HDassert(subfile_idx < sf_context->sf_num_fids);
+ sf_fid = sf_context->sf_fids[subfile_idx];
if (sf_fid < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_BADVALUE, -1, "subfile file descriptor %d is invalid", sf_fid);
- /* Read data from the subfile */
- if ((read_ret = ioc_file_read_data(sf_fid, file_offset, send_buf, data_size, subfile_rank)) < 0) {
+ if ((read_ret = ioc_file_read_data(sf_fid, file_offset, send_buf, data_size, ioc_idx)) < 0) {
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, read_ret,
"read function(FID=%d, Source=%d) returned an error (%d)", sf_fid, source,
read_ret);
@@ -936,8 +994,7 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source,
/* Send read data to the client */
H5_CHECK_OVERFLOW(data_size, int64_t, int);
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Send(send_buf, (int)data_size, MPI_BYTE, source, READ_INDEP_DATA, comm)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Send(send_buf, (int)data_size, MPI_BYTE, source, send_tag, comm)))
H5_SUBFILING_MPI_GOTO_ERROR(-1, "MPI_Send failed", mpi_code);
#ifdef H5FD_IOC_COLLECT_STATS
@@ -947,19 +1004,24 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source,
sf_queue_delay_time += t_queue_delay;
#ifdef H5_SUBFILING_DEBUG
- H5_subfiling_log(sf_context->sf_context_id, "[ioc(%d)] MPI_Send to source(%d) completed\n", subfile_rank,
+ H5_subfiling_log(sf_context->sf_context_id, "[ioc(%d)] MPI_Send to source(%d) completed\n", ioc_idx,
source);
#endif
#endif
done:
+ if (need_data_tag && send_nack) {
+ /* Send NACK back to client so client can handle failure gracefully */
+ if (send_nack_to_client(source, ioc_idx, READ_INDEP_ACK, comm) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_IO, H5E_READERROR, -1, "couldn't send NACK to client");
+ }
if (send_empty_buf) {
/*
* Send an empty message back to client on failure. The client will
* likely get a message truncation error, but at least shouldn't hang.
*/
- if (MPI_SUCCESS != (mpi_code = MPI_Send(NULL, 0, MPI_BYTE, source, READ_INDEP_DATA, comm)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Send(NULL, 0, MPI_BYTE, source, send_tag, comm)))
H5_SUBFILING_MPI_DONE_ERROR(-1, "MPI_Send failed", mpi_code);
}
@@ -978,7 +1040,7 @@ being thread safe.
*/
static int
-ioc_file_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank)
+ioc_file_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int ioc_idx)
{
ssize_t bytes_remaining = (ssize_t)data_size;
ssize_t bytes_written = 0;
@@ -986,7 +1048,7 @@ ioc_file_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data
int ret_value = 0;
#ifndef H5FD_IOC_DEBUG
- (void)subfile_rank;
+ (void)ioc_idx;
#endif
HDcompile_assert(H5_SIZEOF_OFF_T == sizeof(file_offset));
@@ -1000,7 +1062,7 @@ ioc_file_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data
bytes_remaining -= bytes_written;
#ifdef H5FD_IOC_DEBUG
- HDprintf("[ioc(%d) %s]: wrote %ld bytes, remaining=%ld, file_offset=%" PRId64 "\n", subfile_rank,
+ HDprintf("[ioc(%d) %s]: wrote %ld bytes, remaining=%ld, file_offset=%" PRId64 "\n", ioc_idx,
__func__, bytes_written, bytes_remaining, file_offset);
#endif
@@ -1024,7 +1086,7 @@ done:
} /* end ioc_file_write_data() */
static int
-ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank)
+ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int ioc_idx)
{
useconds_t delay = 100;
ssize_t bytes_remaining = (ssize_t)data_size;
@@ -1034,7 +1096,7 @@ ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_
int ret_value = 0;
#ifndef H5FD_IOC_DEBUG
- (void)subfile_rank;
+ (void)ioc_idx;
#endif
HDcompile_assert(H5_SIZEOF_OFF_T == sizeof(file_offset));
@@ -1052,7 +1114,7 @@ ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_
bytes_remaining -= bytes_read;
#ifdef H5FD_IOC_DEBUG
- HDprintf("[ioc(%d) %s]: read %ld bytes, remaining=%ld, file_offset=%" PRId64 "\n", subfile_rank,
+ HDprintf("[ioc(%d) %s]: read %ld bytes, remaining=%ld, file_offset=%" PRId64 "\n", ioc_idx,
__func__, bytes_read, bytes_remaining, file_offset);
#endif
@@ -1069,8 +1131,8 @@ ioc_file_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_
else {
if (retries == 0) {
#ifdef H5FD_IOC_DEBUG
- HDprintf("[ioc(%d) %s]: TIMEOUT: file_offset=%" PRId64 ", data_size=%ld\n", subfile_rank,
- __func__, file_offset, data_size);
+ HDprintf("[ioc(%d) %s]: TIMEOUT: file_offset=%" PRId64 ", data_size=%ld\n", ioc_idx, __func__,
+ file_offset, data_size);
#endif
H5_SUBFILING_SYS_GOTO_ERROR(H5E_IO, H5E_READERROR, -1, "HDpread failed");
@@ -1087,19 +1149,40 @@ done:
} /* end ioc_file_read_data() */
static int
-ioc_file_truncate(int fd, int64_t length, int subfile_rank)
+ioc_file_truncate(sf_work_request_t *msg)
{
- int ret_value = 0;
+ subfiling_context_t *sf_context = NULL;
+ int64_t file_context_id;
+ int64_t length;
+ int64_t subfile_idx;
+ int fd;
+ int ioc_idx;
+ int ret_value = 0;
+
+ HDassert(msg);
+
+ file_context_id = msg->context_id;
+ ioc_idx = msg->ioc_idx;
+
+ length = msg->header[0];
+ subfile_idx = msg->header[1];
#ifndef H5FD_IOC_DEBUG
- (void)subfile_rank;
+ (void)ioc_idx;
#endif
+ if (NULL == (sf_context = H5_get_subfiling_object(file_context_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, -1, "couldn't retrieve subfiling context");
+
+ HDassert(subfile_idx < sf_context->sf_num_fids);
+
+ fd = sf_context->sf_fids[subfile_idx];
+
if (HDftruncate(fd, (off_t)length) != 0)
H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SEEKERROR, -1, "HDftruncate failed");
#ifdef H5FD_IOC_DEBUG
- HDprintf("[ioc(%d) %s]: truncated subfile to %lld bytes. ret = %d\n", subfile_rank, __func__,
+ HDprintf("[ioc(%d) %s]: truncated subfile to %lld bytes. ret = %d\n", ioc_idx, __func__,
(long long)length, errno);
HDfflush(stdout);
#endif
@@ -1111,7 +1194,7 @@ done:
/*-------------------------------------------------------------------------
* Function: ioc_file_report_eof
*
- * Purpose: Determine the target sub-file's eof and report this value
+ * Purpose: Determine the target subfile's eof and report this value
* to the requesting rank.
*
* Notes: This function will have to be reworked once we solve
@@ -1131,40 +1214,48 @@ done:
*/
static int
-ioc_file_report_eof(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm)
+ioc_file_report_eof(sf_work_request_t *msg, MPI_Comm comm)
{
subfiling_context_t *sf_context = NULL;
h5_stat_t sb;
int64_t eof_req_reply[3];
int64_t file_context_id;
+ int64_t subfile_idx;
int fd;
+ int source;
+ int ioc_idx;
int mpi_code;
int ret_value = 0;
HDassert(msg);
- /* first get the EOF of the target file. */
+ file_context_id = msg->context_id;
+ source = msg->source;
+ ioc_idx = msg->ioc_idx;
- file_context_id = msg->header[2];
+ subfile_idx = msg->header[0];
if (NULL == (sf_context = H5_get_subfiling_object(file_context_id)))
H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, -1, "couldn't retrieve subfiling context");
- fd = sf_context->sf_fid;
+ HDassert(subfile_idx < sf_context->sf_num_fids);
+
+ fd = sf_context->sf_fids[subfile_idx];
if (HDfstat(fd, &sb) < 0)
H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SYSERRSTR, -1, "HDfstat failed");
- eof_req_reply[0] = (int64_t)subfile_rank;
+ eof_req_reply[0] = (int64_t)ioc_idx;
eof_req_reply[1] = (int64_t)(sb.st_size);
- eof_req_reply[2] = 0; /* not used */
+ eof_req_reply[2] = subfile_idx;
#ifdef H5_SUBFILING_DEBUG
H5_subfiling_log(file_context_id, "%s: reporting file EOF as %" PRId64 ".", __func__, eof_req_reply[1]);
#endif
/* return the subfile EOF to the querying rank */
- if (MPI_SUCCESS != (mpi_code = MPI_Send(eof_req_reply, 3, MPI_INT64_T, source, GET_EOF_COMPLETED, comm)))
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Send(eof_req_reply, 1, H5_subfiling_rpc_msg_type, source, GET_EOF_COMPLETED, comm)))
H5_SUBFILING_MPI_GOTO_ERROR(-1, "MPI_Send", mpi_code);
done:
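
Since the file context id no longer rides in the header (it is carried in sf_work_request_t's context_id field, filled in by ioc_main()), the three int64_t header slots are reused differently per opcode. A reference summary, gathered from the hunks in this commit, written as a comment block:

    /*
     * RPC header layout (sent/received as one H5_subfiling_rpc_msg_type element);
     * -1 marks padding the receiver ignores:
     *
     *   WRITE_INDEP / READ_INDEP : { data_size,   subfile_offset, subfile_idx }
     *   TRUNC_OP                 : { new_length,  subfile_idx,    -1          }
     *   GET_EOF_OP               : { subfile_idx, -1,             -1          }
     *   GET_EOF_COMPLETED reply  : { ioc_idx,     subfile_eof,    subfile_idx }
     */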
@@ -1272,12 +1363,13 @@ ioc_io_queue_add_entry(ioc_data_t *ioc_data, sf_work_request_t *wk_req_ptr)
atomic_fetch_add(&ioc_data->sf_io_ops_pending, 1);
#ifdef H5_SUBFILING_DEBUG
- H5_subfiling_log(wk_req_ptr->context_id,
- "%s: request %d queued. op = %d, offset/len = %lld/%lld, q-ed/disp/ops_pend = %d/%d/%d.",
- __func__, entry_ptr->counter, (entry_ptr->wk_req.tag),
- (long long)(entry_ptr->wk_req.header[1]), (long long)(entry_ptr->wk_req.header[0]),
- ioc_data->io_queue.num_pending, ioc_data->io_queue.num_in_progress,
- atomic_load(&ioc_data->sf_io_ops_pending));
+ H5_subfiling_log(
+ wk_req_ptr->context_id,
+ "%s: request %d queued. op = %d, req = (%lld, %lld, %lld), q-ed/disp/ops_pend = %d/%d/%d.", __func__,
+ entry_ptr->counter, (entry_ptr->wk_req.tag), (long long)(entry_ptr->wk_req.header[0]),
+ (long long)(entry_ptr->wk_req.header[1]), (long long)(entry_ptr->wk_req.header[2]),
+ ioc_data->io_queue.num_pending, ioc_data->io_queue.num_in_progress,
+ atomic_load(&ioc_data->sf_io_ops_pending));
#endif
HDassert(ioc_data->io_queue.num_pending + ioc_data->io_queue.num_in_progress == ioc_data->io_queue.q_len);
@@ -1478,14 +1570,14 @@ ioc_io_queue_dispatch_eligible_entries(ioc_data_t *ioc_data, hbool_t try_lock)
entry_ptr->thread_wk.args = entry_ptr;
#ifdef H5_SUBFILING_DEBUG
- H5_subfiling_log(entry_ptr->wk_req.context_id,
- "%s: request %d dispatched. op = %d, offset/len = %lld/%lld, "
- "q-ed/disp/ops_pend = %d/%d/%d.",
- __func__, entry_ptr->counter, (entry_ptr->wk_req.tag),
- (long long)(entry_ptr->wk_req.header[1]),
- (long long)(entry_ptr->wk_req.header[0]), ioc_data->io_queue.num_pending,
- ioc_data->io_queue.num_in_progress,
- atomic_load(&ioc_data->sf_io_ops_pending));
+ H5_subfiling_log(
+ entry_ptr->wk_req.context_id,
+ "%s: request %d dispatched. op = %d, req = (%lld, %lld, %lld), "
+ "q-ed/disp/ops_pend = %d/%d/%d.",
+ __func__, entry_ptr->counter, (entry_ptr->wk_req.tag),
+ (long long)(entry_ptr->wk_req.header[0]), (long long)(entry_ptr->wk_req.header[1]),
+ (long long)(entry_ptr->wk_req.header[2]), ioc_data->io_queue.num_pending,
+ ioc_data->io_queue.num_in_progress, atomic_load(&ioc_data->sf_io_ops_pending));
#endif
#ifdef H5FD_IOC_COLLECT_STATS
@@ -1564,12 +1656,12 @@ ioc_io_queue_complete_entry(ioc_data_t *ioc_data, ioc_io_queue_entry_t *entry_pt
#ifdef H5_SUBFILING_DEBUG
H5_subfiling_log(entry_ptr->wk_req.context_id,
- "%s: request %d completed with ret %d. op = %d, offset/len = %lld/%lld, "
+ "%s: request %d completed with ret %d. op = %d, req = (%lld, %lld, %lld), "
"q-ed/disp/ops_pend = %d/%d/%d.",
__func__, entry_ptr->counter, entry_ptr->wk_ret, (entry_ptr->wk_req.tag),
- (long long)(entry_ptr->wk_req.header[1]), (long long)(entry_ptr->wk_req.header[0]),
- ioc_data->io_queue.num_pending, ioc_data->io_queue.num_in_progress,
- atomic_load(&ioc_data->sf_io_ops_pending));
+ (long long)(entry_ptr->wk_req.header[0]), (long long)(entry_ptr->wk_req.header[1]),
+ (long long)(entry_ptr->wk_req.header[2]), ioc_data->io_queue.num_pending,
+ ioc_data->io_queue.num_in_progress, atomic_load(&ioc_data->sf_io_ops_pending));
/*
* If this I/O request is a truncate or "get eof" op, make sure
diff --git a/src/H5FDsubfiling/H5FDsubfile_int.c b/src/H5FDsubfiling/H5FDsubfile_int.c
index 22a5bd0..c089509 100644
--- a/src/H5FDsubfiling/H5FDsubfile_int.c
+++ b/src/H5FDsubfiling/H5FDsubfile_int.c
@@ -30,11 +30,11 @@
* Note: This code should be moved -- most likely to the IOC
* code files.
*
- * Purpose: Apply a truncate operation to the sub-files.
+ * Purpose: Apply a truncate operation to the subfiles.
*
* In the context of the I/O concentrators, the eof must be
* translated into the appropriate value for each of the
- * sub-files, and then applied to same.
+ * subfiles, and then applied to same.
*
* Further, we must ensure that all prior I/O requests complete
* before the truncate is applied.
@@ -44,7 +44,7 @@
* 1) Run a barrier on entry.
*
* 2) Determine if this rank is a IOC. If it is, compute
- * the correct EOF for this sub-file, and send a truncate
+ * the correct EOF for this subfile, and send a truncate
* request to the IOC.
*
* 3) On the IOC thread, allow all pending I/O requests
@@ -72,50 +72,61 @@
herr_t
H5FD__subfiling__truncate_sub_files(hid_t context_id, int64_t logical_file_eof, MPI_Comm comm)
{
- int mpi_code; /* MPI return code */
subfiling_context_t *sf_context = NULL;
- int64_t msg[3] = {
- 0,
- };
- herr_t ret_value = SUCCEED; /* Return value */
+ int64_t msg[3] = {0};
+ int mpi_size;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
/* Barrier on entry */
- if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+ if (mpi_size > 1)
+ if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
if (NULL == (sf_context = (subfiling_context_t *)H5_get_subfiling_object(context_id)))
H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "can't get subfile context");
- /* Test to see if this rank is running an I/O concentrator. */
-
if (sf_context->topology->rank_is_ioc) {
-
- int i;
- int64_t subfile_eof;
int64_t num_full_stripes;
+ int64_t num_leftover_stripes;
int64_t partial_stripe_len;
-#ifndef NDEBUG
- int64_t test_file_eof;
-#endif /* NDEBUG */
- /* if it is, first compute the sub-file EOF */
+ num_full_stripes = logical_file_eof / sf_context->sf_blocksize_per_stripe;
+ partial_stripe_len = logical_file_eof % sf_context->sf_blocksize_per_stripe;
+ num_leftover_stripes = partial_stripe_len / sf_context->sf_stripe_size;
- num_full_stripes = logical_file_eof / sf_context->sf_blocksize_per_stripe;
- partial_stripe_len = logical_file_eof % sf_context->sf_blocksize_per_stripe;
+ /* Compute the EOF for each subfile this IOC owns */
+ for (int i = 0; i < sf_context->sf_num_fids; i++) {
+ int64_t subfile_eof = num_full_stripes * sf_context->sf_stripe_size;
+ int64_t global_subfile_idx;
- subfile_eof = num_full_stripes * sf_context->sf_stripe_size;
+ global_subfile_idx =
+ (i * sf_context->topology->n_io_concentrators) + sf_context->topology->ioc_idx;
- if (sf_context->topology->subfile_rank < (partial_stripe_len / sf_context->sf_stripe_size)) {
+ if (global_subfile_idx < num_leftover_stripes) {
+ subfile_eof += sf_context->sf_stripe_size;
+ }
+ else if (global_subfile_idx == num_leftover_stripes) {
+ subfile_eof += partial_stripe_len % sf_context->sf_stripe_size;
+ }
- subfile_eof += sf_context->sf_stripe_size;
- }
- else if (sf_context->topology->subfile_rank == (partial_stripe_len / sf_context->sf_stripe_size)) {
+ /* Direct the IOC to truncate this subfile to the correct EOF */
+ msg[0] = subfile_eof;
+ msg[1] = i;
+ msg[2] = -1; /* padding -- not used in this message */
- subfile_eof += partial_stripe_len % sf_context->sf_stripe_size;
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type,
+ sf_context->topology->io_concentrators[sf_context->topology->ioc_idx],
+ TRUNC_OP, sf_context->sf_msg_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code);
}
/* sanity check -- compute the file eof using the same mechanism used to
- * compute the sub-file eof. Assert that the computed value and the
+ * compute the subfile eof. Assert that the computed value and the
* actual value match.
*
* Do this only for debug builds -- probably delete this before release.
@@ -124,40 +135,29 @@ H5FD__subfiling__truncate_sub_files(hid_t context_id, int64_t logical_file_eof,
*/
#ifndef NDEBUG
- test_file_eof = 0;
-
- for (i = 0; i < sf_context->topology->n_io_concentrators; i++) {
-
- test_file_eof += num_full_stripes * sf_context->sf_stripe_size;
-
- if (i < (partial_stripe_len / sf_context->sf_stripe_size)) {
-
- test_file_eof += sf_context->sf_stripe_size;
+ {
+ int64_t test_file_eof = 0;
+
+ for (int i = 0; i < sf_context->sf_num_subfiles; i++) {
+ test_file_eof += num_full_stripes * sf_context->sf_stripe_size;
+
+ if (i < num_leftover_stripes) {
+ test_file_eof += sf_context->sf_stripe_size;
+ }
+ else if (i == num_leftover_stripes) {
+ test_file_eof += partial_stripe_len % sf_context->sf_stripe_size;
+ }
}
- else if (i == (partial_stripe_len / sf_context->sf_stripe_size)) {
- test_file_eof += partial_stripe_len % sf_context->sf_stripe_size;
- }
+ HDassert(test_file_eof == logical_file_eof);
}
- HDassert(test_file_eof == logical_file_eof);
#endif /* NDEBUG */
-
- /* then direct the IOC to truncate the sub-file to the correct EOF */
-
- msg[0] = subfile_eof;
- msg[1] = 0; /* padding -- not used in this message */
- msg[2] = context_id;
-
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Send(msg, 3, MPI_INT64_T,
- sf_context->topology->io_concentrators[sf_context->topology->subfile_rank],
- TRUNC_OP, sf_context->sf_msg_comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code);
}
/* Barrier on exit */
- if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+ if (mpi_size > 1)
+ if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
done:
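
The per-subfile truncate EOF computation above is round-robin arithmetic: each of this IOC's local subfiles maps to a global subfile index, and that index decides whether the subfile gets one more full stripe, the trailing partial stripe, or nothing beyond the full stripe rows. Below is a self-contained rehearsal with made-up numbers, assuming sf_blocksize_per_stripe equals stripe_size * num_subfiles (consistent with the NDEBUG sanity check).

    #include <inttypes.h>
    #include <stdio.h>

    int
    main(void)
    {
        const int64_t stripe_size  = 1048576;                 /* 1 MiB */
        const int     num_iocs     = 2;
        const int     num_subfiles = 4;                       /* two subfiles per IOC */
        const int64_t logical_eof  = 6 * stripe_size + 1000;  /* six full stripes + 1000 bytes */

        int64_t block                = stripe_size * num_subfiles;        /* bytes per stripe row */
        int64_t num_full_stripes     = logical_eof / block;               /* 1 */
        int64_t partial_stripe_len   = logical_eof % block;               /* 2 MiB + 1000 */
        int64_t num_leftover_stripes = partial_stripe_len / stripe_size;  /* 2 */

        for (int ioc_idx = 0; ioc_idx < num_iocs; ioc_idx++) {
            for (int i = 0; i < num_subfiles / num_iocs; i++) {
                int64_t subfile_eof        = num_full_stripes * stripe_size;
                int64_t global_subfile_idx = (i * num_iocs) + ioc_idx;

                if (global_subfile_idx < num_leftover_stripes)
                    subfile_eof += stripe_size;                      /* one more full stripe */
                else if (global_subfile_idx == num_leftover_stripes)
                    subfile_eof += partial_stripe_len % stripe_size; /* the trailing 1000 bytes */

                printf("ioc %d, local subfile %d (global %" PRId64 "): eof = %" PRId64 "\n",
                       ioc_idx, i, global_subfile_idx, subfile_eof);
            }
        }
        return 0;
    }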
@@ -176,9 +176,10 @@ done:
* Do this as follows:
*
* 1) allocate an array of int64_t of length equal to the
- * the number of IOCs, and initialize all fields to -1.
+ * number of subfiles, and initialize all fields to -1.
*
- * 2) Send each IOC a message requesting that sub-file's EOF.
+ * 2) Send each subfile's IOC a message requesting that
+ * subfile's EOF.
*
* 3) Await reply from each IOC, storing the reply in
* the appropriate entry in the array allocated in 1.
@@ -197,13 +198,13 @@ done:
* than for the more traditional HDF5 file implementations.
* This statement derives from the fact that unlike "normal"
* HDF5 files, subfiling introduces a multi-file representation
- * of a single HDF5 file. The plurality of sub-files represents
- * a software RAID-0 based HDF5 file. As such, each sub-file
+ * of a single HDF5 file. The plurality of subfiles represents
+ * a software RAID-0 based HDF5 file. As such, each subfile
* contains a designated portion of the address space of the
* virtual HDF5 storage. We have no notion of HDF5 datatypes,
* datasets, metadata, or other HDF5 structures; only BYTES.
*
- * The organization of the bytes within sub-files is consistent
+ * The organization of the bytes within subfiles is consistent
* with the RAID-0 striping, i.e. there are IO Concentrators
* (IOCs) which correspond to a stripe-count (in Lustre) as
* well as a stripe_size. The combination of these two
@@ -220,7 +221,7 @@ done:
* follows.
* 1. At file creation, each IOC is assigned a rank value
* (0 to N-1, where N is the total number of IOCs) and
- * a 'sf_base_addr' = 'subfile_rank' * 'sf_stripe_size')
+ * a 'sf_base_addr' = 'ioc_idx' * 'sf_stripe_size';
* we also determine the 'sf_blocksize_per_stripe' which
* is simply the 'sf_stripe_size' * 'n_ioc_concentrators'
*
@@ -263,9 +264,10 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr)
int64_t msg[3] = {0, 0, 0};
int64_t logical_eof = 0;
int64_t sf_logical_eof;
- int n_io_concentrators = 0; /* copy of value in topology */
- int mpi_code; /* MPI return code */
- herr_t ret_value = SUCCEED; /* Return value */
+ int n_io_concentrators = 0;
+ int num_subfiles = 0;
+ int mpi_code; /* MPI return code */
+ herr_t ret_value = SUCCEED; /* Return value */
HDassert(logical_eof_ptr);
@@ -275,56 +277,60 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr)
HDassert(sf_context->topology);
n_io_concentrators = sf_context->topology->n_io_concentrators;
+ num_subfiles = sf_context->sf_num_subfiles;
HDassert(n_io_concentrators > 0);
+ HDassert(num_subfiles >= n_io_concentrators);
- if (NULL == (sf_eofs = HDmalloc((size_t)n_io_concentrators * sizeof(int64_t))))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate sub-file EOFs array");
- if (NULL == (recv_reqs = HDmalloc((size_t)n_io_concentrators * sizeof(*recv_reqs))))
+ if (NULL == (sf_eofs = HDmalloc((size_t)num_subfiles * sizeof(int64_t))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate subfile EOFs array");
+ if (NULL == (recv_reqs = HDmalloc((size_t)num_subfiles * sizeof(*recv_reqs))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate receive requests array");
- if (NULL == (recv_msg = HDmalloc((size_t)n_io_concentrators * 3 * sizeof(*recv_msg))))
+ if (NULL == (recv_msg = HDmalloc((size_t)num_subfiles * sizeof(msg))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate message array");
- for (int i = 0; i < n_io_concentrators; i++) {
+ for (int i = 0; i < num_subfiles; i++) {
sf_eofs[i] = -1;
recv_reqs[i] = MPI_REQUEST_NULL;
}
- /* Post early non-blocking receives for replies from each IOC */
- for (int i = 0; i < n_io_concentrators; i++) {
- int ioc_rank = sf_context->topology->io_concentrators[i];
+ /* Post early non-blocking receives for the EOF of each subfile */
+ for (int i = 0; i < num_subfiles; i++) {
+ int ioc_rank = sf_context->topology->io_concentrators[i % n_io_concentrators];
- if (MPI_SUCCESS != (mpi_code = MPI_Irecv(&recv_msg[3 * i], 3, MPI_INT64_T, ioc_rank,
+ if (MPI_SUCCESS != (mpi_code = MPI_Irecv(&recv_msg[3 * i], 1, H5_subfiling_rpc_msg_type, ioc_rank,
GET_EOF_COMPLETED, sf_context->sf_eof_comm, &recv_reqs[i])))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Irecv", mpi_code);
}
- /* Send each IOC a message requesting that subfile's EOF */
+ /* Send each subfile's IOC a message requesting that subfile's EOF */
+
+ msg[1] = -1; /* padding -- not used in this message */
+ msg[2] = -1; /* padding -- not used in this message */
- msg[0] = 0; /* padding -- not used in this message */
- msg[1] = 0; /* padding -- not used in this message */
- msg[2] = context_id;
+ for (int i = 0; i < num_subfiles; i++) {
+ int ioc_rank = sf_context->topology->io_concentrators[i % n_io_concentrators];
- for (int i = 0; i < n_io_concentrators; i++) {
- int ioc_rank = sf_context->topology->io_concentrators[i];
+ /* Set subfile index for receiving IOC */
+ msg[0] = i / n_io_concentrators;
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Send(msg, 3, MPI_INT64_T, ioc_rank, GET_EOF_OP, sf_context->sf_msg_comm)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Send(msg, 1, H5_subfiling_rpc_msg_type, ioc_rank, GET_EOF_OP,
+ sf_context->sf_msg_comm)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send", mpi_code);
}
/* Wait for EOF communication to complete */
- if (MPI_SUCCESS != (mpi_code = MPI_Waitall(n_io_concentrators, recv_reqs, MPI_STATUSES_IGNORE)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Waitall(num_subfiles, recv_reqs, MPI_STATUSES_IGNORE)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Waitall", mpi_code);
- for (int i = 0; i < n_io_concentrators; i++) {
+ for (int i = 0; i < num_subfiles; i++) {
int ioc_rank = (int)recv_msg[3 * i];
HDassert(ioc_rank >= 0);
HDassert(ioc_rank < n_io_concentrators);
- HDassert(sf_eofs[ioc_rank] == -1);
+ HDassert(sf_eofs[i] == -1);
- sf_eofs[ioc_rank] = recv_msg[(3 * i) + 1];
+ sf_eofs[i] = recv_msg[(3 * i) + 1];
}
/* 4) After all IOCs have replied, compute the offset of
@@ -333,21 +339,21 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr)
* EOF.
*/
- for (int i = 0; i < n_io_concentrators; i++) {
+ for (int i = 0; i < num_subfiles; i++) {
/* compute number of complete stripes */
sf_logical_eof = sf_eofs[i] / sf_context->sf_stripe_size;
/* multiply by stripe size */
- sf_logical_eof *= sf_context->sf_stripe_size * n_io_concentrators;
+ sf_logical_eof *= sf_context->sf_stripe_size * num_subfiles;
- /* if the sub-file doesn't end on a stripe size boundary, must add in a partial stripe */
+ /* if the subfile doesn't end on a stripe size boundary, must add in a partial stripe */
if (sf_eofs[i] % sf_context->sf_stripe_size > 0) {
/* add in the size of the partial stripe up to but not including this subfile */
sf_logical_eof += i * sf_context->sf_stripe_size;
- /* finally, add in the number of bytes in the last partial stripe depth in the sub-file */
+ /* finally, add in the number of bytes in the last partial stripe depth in the subfile */
sf_logical_eof += sf_eofs[i] % sf_context->sf_stripe_size;
}
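
Going the other way, each subfile EOF yields a candidate logical EOF, and the tail of this loop (just past this hunk) presumably keeps the maximum of those candidates. A companion example, with subfile EOFs corresponding to a logical file of four full 1 MiB stripes plus 1000 bytes spread over four subfiles; the numbers are illustrative only.

    #include <inttypes.h>
    #include <stdio.h>

    int
    main(void)
    {
        const int64_t stripe_size  = 1048576;
        const int     num_subfiles = 4;
        const int64_t sf_eofs[4]   = { 1049576, 1048576, 1048576, 1048576 };
        int64_t       logical_eof  = 0;

        for (int i = 0; i < num_subfiles; i++) {
            /* complete stripes in this subfile, scaled back up to the logical file */
            int64_t cand = (sf_eofs[i] / stripe_size) * stripe_size * num_subfiles;

            if (sf_eofs[i] % stripe_size > 0) {
                cand += i * stripe_size;          /* full stripes before this subfile in the last row */
                cand += sf_eofs[i] % stripe_size; /* this subfile's bytes in the partial stripe */
            }
            if (cand > logical_eof)
                logical_eof = cand;
        }

        printf("logical eof = %" PRId64 "\n", logical_eof); /* 4195304 = 4 MiB + 1000 */
        return 0;
    }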
@@ -365,7 +371,7 @@ H5FD__subfiling__get_real_eof(hid_t context_id, int64_t *logical_eof_ptr)
done:
if (ret_value < 0) {
- for (int i = 0; i < n_io_concentrators; i++) {
+ for (int i = 0; i < num_subfiles; i++) {
if (recv_reqs && (recv_reqs[i] != MPI_REQUEST_NULL)) {
if (MPI_SUCCESS != (mpi_code = MPI_Cancel(&recv_reqs[i])))
H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Cancel", mpi_code);
diff --git a/src/H5FDsubfiling/H5FDsubfiling.c b/src/H5FDsubfiling/H5FDsubfiling.c
index 8fe8f77..33a57e9 100644
--- a/src/H5FDsubfiling/H5FDsubfiling.c
+++ b/src/H5FDsubfiling/H5FDsubfiling.c
@@ -91,7 +91,6 @@ static hbool_t H5FD_mpi_self_initialized = FALSE;
typedef struct H5FD_subfiling_t {
H5FD_t pub; /* public stuff, must be first */
- int fd; /* the filesystem file descriptor */
H5FD_subfiling_config_t fa; /* driver-specific file access properties */
/* MPI Info */
@@ -102,8 +101,10 @@ typedef struct H5FD_subfiling_t {
int mpi_size;
H5FD_t *sf_file;
+ H5FD_t *stub_file;
- int64_t context_id; /* The value used to lookup a subfiling context for the file */
+ uint64_t file_id;
+ int64_t context_id; /* The value used to lookup a subfiling context for the file */
char *file_dir; /* Directory where we find files */
char *file_path; /* The user defined filename */
@@ -146,6 +147,9 @@ typedef struct H5FD_subfiling_t {
/* Prototypes */
static herr_t H5FD__subfiling_term(void);
+static hsize_t H5FD__subfiling_sb_size(H5FD_t *_file);
+static herr_t H5FD__subfiling_sb_encode(H5FD_t *_file, char *name, unsigned char *buf);
+static herr_t H5FD__subfiling_sb_decode(H5FD_t *_file, const char *name, const unsigned char *buf);
static void *H5FD__subfiling_fapl_get(H5FD_t *_file);
static void *H5FD__subfiling_fapl_copy(const void *_old_fa);
static herr_t H5FD__subfiling_fapl_free(void *_fa);
@@ -182,8 +186,8 @@ static herr_t H5FD__subfiling_close_int(H5FD_subfiling_t *file_ptr);
static herr_t init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_nelemts,
size_t dtype_extent, size_t max_iovec_len, int64_t *mem_buf_offset,
- int64_t *target_file_offset, int64_t *io_block_len, int *first_ioc_index,
- int *n_iocs_used, int64_t *max_io_req_per_ioc);
+ int64_t *target_file_offset, int64_t *io_block_len, int *first_subfile_index,
+ int *n_subfiles_used, int64_t *max_io_req_per_subfile);
static herr_t iovec_fill_first(subfiling_context_t *sf_context, int64_t iovec_depth, int64_t target_datasize,
int64_t start_mem_offset, int64_t start_file_offset, int64_t first_io_len,
int64_t *mem_offset_out, int64_t *target_file_offset_out,
@@ -211,9 +215,9 @@ static const H5FD_class_t H5FD_subfiling_g = {
MAXADDR, /* maxaddr */
H5F_CLOSE_WEAK, /* fc_degree */
H5FD__subfiling_term, /* terminate */
- NULL, /* sb_size */
- NULL, /* sb_encode */
- NULL, /* sb_decode */
+ H5FD__subfiling_sb_size, /* sb_size */
+ H5FD__subfiling_sb_encode, /* sb_encode */
+ H5FD__subfiling_sb_decode, /* sb_decode */
sizeof(H5FD_subfiling_config_t), /* fapl_size */
H5FD__subfiling_fapl_get, /* fapl_get */
H5FD__subfiling_fapl_copy, /* fapl_copy */
@@ -326,6 +330,18 @@ H5FD_subfiling_init(void)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, H5I_INVALID_HID,
"can't register atexit handler for MPI_Finalize");
}
+
+ /*
+ * Create the MPI Datatype that will be used
+ * for sending/receiving RPC messages
+ */
+ HDcompile_assert(sizeof(((sf_work_request_t *)NULL)->header) == 3 * sizeof(int64_t));
+ if (H5_subfiling_rpc_msg_type == MPI_DATATYPE_NULL) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_contiguous(3, MPI_INT64_T, &H5_subfiling_rpc_msg_type)))
+ H5_SUBFILING_MPI_GOTO_ERROR(H5I_INVALID_HID, "MPI_Type_contiguous failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&H5_subfiling_rpc_msg_type)))
+ H5_SUBFILING_MPI_GOTO_ERROR(H5I_INVALID_HID, "MPI_Type_commit failed", mpi_code);
+ }
}
/* Set return value */
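
The derived datatype registered here is what lets every RPC above be sent as a single element instead of a count of three MPI_INT64_T values. A minimal standalone version of the setup and one send path, with the HDF5 error handling reduced to plain return codes:

    #include <mpi.h>
    #include <stdint.h>

    static MPI_Datatype rpc_msg_type = MPI_DATATYPE_NULL;

    static int
    create_rpc_msg_type(void)
    {
        if (rpc_msg_type == MPI_DATATYPE_NULL) {
            if (MPI_Type_contiguous(3, MPI_INT64_T, &rpc_msg_type) != MPI_SUCCESS)
                return -1;
            if (MPI_Type_commit(&rpc_msg_type) != MPI_SUCCESS)
                return -1;
        }
        return 0;
    }

    static int
    send_rpc(int64_t a, int64_t b, int64_t c, int ioc_rank, int op_tag, MPI_Comm msg_comm)
    {
        int64_t msg[3] = { a, b, c };

        /* one element of the derived type covers the whole 3 x int64_t header */
        return (MPI_Send(msg, 1, rpc_msg_type, ioc_rank, op_tag, msg_comm) == MPI_SUCCESS) ? 0 : -1;
    }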
@@ -350,6 +366,18 @@ H5FD__subfiling_term(void)
herr_t ret_value = SUCCEED;
if (H5FD_SUBFILING_g >= 0) {
+ int mpi_code;
+
+ /* Free RPC message MPI Datatype */
+ if (H5_subfiling_rpc_msg_type != MPI_DATATYPE_NULL)
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&H5_subfiling_rpc_msg_type)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+
+ /* Clean up resources */
+ if (H5_subfiling_terminate() < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL,
+ "can't cleanup internal subfiling resources");
+
/* Unregister from HDF5 error API */
if (H5subfiling_err_class_g >= 0) {
if (H5Eunregister_class(H5subfiling_err_class_g) < 0)
@@ -402,6 +430,9 @@ H5Pset_fapl_subfiling(hid_t fapl_id, const H5FD_subfiling_config_t *vfd_config)
{
H5FD_subfiling_config_t *subfiling_conf = NULL;
H5P_genplist_t *plist = NULL;
+ H5P_genplist_t *ioc_plist = NULL;
+ MPI_Comm comm = MPI_COMM_NULL;
+ MPI_Info info = MPI_INFO_NULL;
herr_t ret_value = SUCCEED;
/*NO TRACE*/
@@ -427,12 +458,38 @@ H5Pset_fapl_subfiling(hid_t fapl_id, const H5FD_subfiling_config_t *vfd_config)
vfd_config = subfiling_conf;
}
+ /* Check if any MPI parameters were set on the FAPL */
+ if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI communicator from plist");
+ if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI info from plist");
+ if (comm == MPI_COMM_NULL)
+ comm = MPI_COMM_WORLD;
+
+ /* Set MPI parameters on IOC FAPL */
+ if (NULL == (ioc_plist = H5P_object_verify(vfd_config->ioc_fapl_id, H5P_FILE_ACCESS)))
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list");
+ if (H5P_set(ioc_plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI communicator on plist");
+ if (H5P_set(ioc_plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI info on plist");
+
if (H5FD__subfiling_validate_config(vfd_config) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling VFD configuration");
+ /* Set Subfiling configuration on IOC FAPL */
+ if (H5_subfiling_set_config_prop(ioc_plist, &vfd_config->shared_cfg) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL,
+ "can't set subfiling configuration on IOC FAPL");
+
ret_value = H5P_set_driver(plist, H5FD_SUBFILING, vfd_config, NULL);
done:
+ if (H5_mpi_comm_free(&comm) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI Communicator");
+ if (H5_mpi_info_free(&info) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI Info object");
+
if (subfiling_conf) {
if (subfiling_conf->ioc_fapl_id >= 0 && H5I_dec_ref(subfiling_conf->ioc_fapl_id) < 0)
H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTDEC, FAIL, "can't close IOC FAPL");
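For context, a rough application-side sketch of how this FAPL setup is driven. It assumes an MPI-initialized program, a library built with the Subfiling VFD enabled, and that a NULL configuration selects the defaults (as the default-config path above suggests); error checking is omitted:

hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS);
hid_t file_id;

/* Optional: set MPI parameters explicitly; MPI_COMM_WORLD is assumed otherwise */
H5Pset_mpi_params(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);

/* NULL requests the default Subfiling configuration */
H5Pset_fapl_subfiling(fapl_id, NULL);

file_id = H5Fcreate("file.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);

/* ... parallel I/O on file_id ... */

H5Fclose(file_id);
H5Pclose(fapl_id);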
@@ -516,7 +573,7 @@ H5FD__subfiling_get_default_config(hid_t fapl_id, H5FD_subfiling_config_t *confi
config_out->shared_cfg.ioc_selection = SELECT_IOC_ONE_PER_NODE;
config_out->shared_cfg.stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE;
- config_out->shared_cfg.stripe_count = 0;
+ config_out->shared_cfg.stripe_count = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT;
if ((h5_require_ioc = HDgetenv("H5_REQUIRE_IOC")) != NULL) {
int value_check = HDatoi(h5_require_ioc);
@@ -553,9 +610,9 @@ H5FD__subfiling_get_default_config(hid_t fapl_id, H5FD_subfiling_config_t *confi
done:
if (H5_mpi_comm_free(&comm) < 0)
- H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTFREE, FAIL, "can't free MPI Communicator");
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI Communicator");
if (H5_mpi_info_free(&info) < 0)
- H5_SUBFILING_DONE_ERROR(H5E_PLIST, H5E_CANTFREE, FAIL, "can't free MPI Info object");
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI Info object");
if (ret_value < 0) {
if (config_out->ioc_fapl_id >= 0 && H5Pclose(config_out->ioc_fapl_id) < 0)
@@ -603,15 +660,193 @@ H5FD__subfiling_validate_config(const H5FD_subfiling_config_t *fa)
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL,
"Subfiling VFD currently always requires IOC VFD to be used");
- if (fa->shared_cfg.ioc_selection < SELECT_IOC_ONE_PER_NODE ||
- fa->shared_cfg.ioc_selection >= ioc_selection_options)
- H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid IOC selection method");
+ if (H5_subfiling_validate_config(&fa->shared_cfg) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling configuration parameters");
done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__subfiling_validate_config() */
/*-------------------------------------------------------------------------
+ * Function: H5FD__subfiling_sb_size
+ *
+ * Purpose: Returns the size of the subfiling configuration information
+ * to be stored in the superblock.
+ *
+ * Return: Size of subfiling configuration information (never fails)
+ *-------------------------------------------------------------------------
+ */
+static hsize_t
+H5FD__subfiling_sb_size(H5FD_t *_file)
+{
+ H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file;
+ hsize_t ret_value = 0;
+
+ HDassert(file);
+
+ /* Configuration structure magic number */
+ ret_value += sizeof(uint32_t);
+
+ /* Configuration structure version number */
+ ret_value += sizeof(uint32_t);
+
+ /* "Require IOC" field */
+ ret_value += sizeof(int32_t);
+
+ /* Subfiling stripe size */
+ ret_value += sizeof(int64_t);
+
+ /* Subfiling stripe count (encoded as int64_t for future) */
+ ret_value += sizeof(int64_t);
+
+ /* Add superblock information from IOC file if necessary */
+ if (file->sf_file) {
+ /* Account for the 9 bytes that store the IOC VFD's name */
+ ret_value += 9;
+
+ ret_value += H5FD_sb_size(file->sf_file);
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
+} /* end H5FD__subfiling_sb_size() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD__subfiling_sb_encode
+ *
+ * Purpose: Encodes the subfiling configuration information into the
+ * specified buffer.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__subfiling_sb_encode(H5FD_t *_file, char *name, unsigned char *buf)
+{
+ subfiling_context_t *sf_context = NULL;
+ H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file;
+ uint8_t *p = (uint8_t *)buf;
+ int64_t tmp64;
+ int32_t tmp32;
+ herr_t ret_value = SUCCEED;
+
+ if (NULL == (sf_context = H5_get_subfiling_object(file->context_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling context object");
+
+ /* Encode driver name */
+ HDstrncpy(name, "Subfilin", 9);
+ name[8] = '\0';
+
+ /* Encode configuration structure magic number */
+ UINT32ENCODE(p, file->fa.magic);
+
+ /* Encode configuration structure version number */
+ UINT32ENCODE(p, file->fa.version);
+
+ /* Encode "require IOC" field */
+ tmp32 = (int32_t)file->fa.require_ioc;
+ INT32ENCODE(p, tmp32);
+
+ /* Encode subfiling stripe size */
+ INT64ENCODE(p, sf_context->sf_stripe_size);
+
+ /* Encode subfiling stripe count (number of subfiles) */
+ tmp64 = sf_context->sf_num_subfiles;
+ INT64ENCODE(p, tmp64);
+
+ /* Encode IOC VFD configuration information if necessary */
+ if (file->sf_file) {
+ char ioc_name[9];
+
+ HDmemset(ioc_name, 0, sizeof(ioc_name));
+
+ if (H5FD_sb_encode(file->sf_file, ioc_name, p + 9) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTENCODE, FAIL,
+ "unable to encode IOC VFD's superblock information");
+
+ /* Copy the IOC VFD's name into our buffer */
+ HDmemcpy(p, ioc_name, 9);
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+} /* end H5FD__subfiling_sb_encode() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD__subfiling_sb_decode
+ *
+ * Purpose: Decodes the subfiling configuration information from the
+ * specified buffer.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__subfiling_sb_decode(H5FD_t *_file, const char *name, const unsigned char *buf)
+{
+ subfiling_context_t *sf_context = NULL;
+ H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file;
+ const uint8_t *p = (const uint8_t *)buf;
+ int64_t tmp64;
+ int32_t tmp32;
+ herr_t ret_value = SUCCEED;
+
+ if (NULL == (sf_context = H5_get_subfiling_object(file->context_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling context object");
+
+ if (HDstrncmp(name, "Subfilin", 9))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid driver name in superblock");
+
+ /* Decode configuration structure magic number */
+ UINT32DECODE(p, file->fa.magic);
+
+ /* Decode configuration structure version number */
+ UINT32DECODE(p, file->fa.version);
+
+ /* Decode "require IOC" field */
+ INT32DECODE(p, tmp32);
+ file->fa.require_ioc = (hbool_t)tmp32;
+
+ /* Decode subfiling stripe size */
+ INT64DECODE(p, file->fa.shared_cfg.stripe_size);
+
+ /* Decode subfiling stripe count */
+ INT64DECODE(p, tmp64);
+ H5_CHECK_OVERFLOW(tmp64, int64_t, int32_t);
+ file->fa.shared_cfg.stripe_count = (int32_t)tmp64;
+
+ if (file->sf_file) {
+ char ioc_name[9];
+
+ HDmemcpy(ioc_name, p, 9);
+ p += 9;
+
+ if (H5FD_sb_load(file->sf_file, ioc_name, p) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTDECODE, FAIL,
+ "unable to decode IOC VFD's superblock information");
+ }
+
+ /* Validate the decoded configuration */
+ if (H5FD__subfiling_validate_config(&file->fa) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL,
+ "decoded subfiling configuration info is invalid");
+
+ if (file->fa.shared_cfg.stripe_size != sf_context->sf_stripe_size)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL,
+ "specified subfiling stripe size (%" PRId64
+ ") doesn't match value stored in file (%" PRId64 ")",
+ sf_context->sf_stripe_size, file->fa.shared_cfg.stripe_size);
+
+ if (file->fa.shared_cfg.stripe_count != sf_context->sf_num_subfiles)
+ H5_SUBFILING_GOTO_ERROR(
+ H5E_VFL, H5E_BADVALUE, FAIL,
+ "specified subfiling stripe count (%d) doesn't match value stored in file (%" PRId32 ")",
+ sf_context->sf_num_subfiles, file->fa.shared_cfg.stripe_count);
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+} /* end H5FD__subfiling_sb_decode() */
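For reference, the encode/decode pair above implies the following driver info layout. This is an illustrative summary only; the trailing IOC portion is present only when an IOC file is open:

/*
 * Offset  Size  Field
 * ------  ----  ------------------------------------------------------
 *    0      4   configuration structure magic number (uint32)
 *    4      4   configuration structure version number (uint32)
 *    8      4   "require IOC" flag (int32)
 *   12      8   subfiling stripe size (int64)
 *   20      8   subfiling stripe count / number of subfiles (int64)
 *   28      9   IOC VFD driver name (NUL-terminated)
 *   37      -   IOC VFD superblock information (output of H5FD_sb_encode)
 */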
+
+/*-------------------------------------------------------------------------
* Function: H5FD__subfiling_fapl_get
*
* Purpose: Gets a file access property list which could be used to
@@ -797,7 +1032,6 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma
H5FD_driver_prop_t driver_prop; /* Property for driver ID & info */
hbool_t bcasted_eof = FALSE;
int64_t sf_eof = -1;
- void *file_handle = NULL;
int mpi_code; /* MPI return code */
H5FD_t *ret_value = NULL;
@@ -813,6 +1047,7 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTALLOC, NULL, "unable to allocate file struct");
file_ptr->comm = MPI_COMM_NULL;
file_ptr->info = MPI_INFO_NULL;
+ file_ptr->file_id = UINT64_MAX;
file_ptr->context_id = -1;
file_ptr->fa.ioc_fapl_id = H5I_INVALID_HID;
file_ptr->ext_comm = MPI_COMM_NULL;
@@ -868,33 +1103,6 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "can't copy FAPL");
}
- if (NULL != (file_ptr->file_path = HDrealpath(name, NULL))) {
- char *path = NULL;
-
- if (NULL == (path = H5MM_strdup(file_ptr->file_path)))
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCOPY, NULL, "can't copy subfiling subfile path");
- if (H5_dirname(path, &file_ptr->file_dir) < 0) {
- H5MM_free(path);
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "couldn't get subfile dirname");
- }
-
- H5MM_free(path);
- }
- else {
- if (ENOENT == errno) {
- if (NULL == (file_ptr->file_path = HDstrdup(name)))
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCOPY, NULL, "can't copy file name");
- if (NULL == (file_ptr->file_dir = H5MM_strdup(".")))
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "can't set subfile directory path");
- }
- else
- H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't resolve subfile path");
- }
-
- file_ptr->sf_file = H5FD_open(name, flags, file_ptr->fa.ioc_fapl_id, HADDR_UNDEF);
- if (!file_ptr->sf_file)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "unable to open IOC file");
-
/* Check the "native" driver (IOC/sec2/etc.) */
if (NULL == (plist_ptr = H5I_object(file_ptr->fa.ioc_fapl_id)))
H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_BADVALUE, NULL, "invalid IOC FAPL");
@@ -905,17 +1113,36 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL,
"invalid driver ID in file access property list");
- if (driver->value != H5_VFD_IOC && driver->value != H5_VFD_SEC2)
- H5_SUBFILING_GOTO_ERROR(
- H5E_FILE, H5E_CANTOPENFILE, NULL,
- "unable to open file '%s' - only IOC and Sec2 VFDs are currently supported for subfiles", name);
+ if (driver->value != H5_VFD_IOC)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL,
+ "unable to open file '%s' - only IOC VFD is currently supported for subfiles",
+ name);
+
+ /* Fully resolve the given filepath and get its dirname */
+ if (H5_resolve_pathname(name, file_ptr->comm, &file_ptr->file_path) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't resolve filepath");
+ if (H5_dirname(file_ptr->file_path, &file_ptr->file_dir) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get filepath dirname");
+
+ /*
+ * Create/open the HDF5 stub file and get its inode value for
+ * the internal mapping from file inode to subfiling context.
+ */
+ if (H5_open_subfiling_stub_file(file_ptr->file_path, flags, file_ptr->comm, &file_ptr->stub_file,
+ &file_ptr->file_id) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "can't open HDF5 stub file");
- if (H5FDget_vfd_handle(file_ptr->sf_file, file_ptr->fa.ioc_fapl_id, &file_handle) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, NULL, "can't get file handle");
+ /* Set stub file ID on IOC fapl so it can reuse on open */
+ if (H5_subfiling_set_file_id_prop(plist_ptr, file_ptr->file_id) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, NULL, "can't set stub file ID on FAPL");
+
+ /* Open the HDF5 file's subfiles */
+ if (NULL == (file_ptr->sf_file = H5FD_open(name, flags, file_ptr->fa.ioc_fapl_id, HADDR_UNDEF)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "unable to open IOC file");
if (driver->value == H5_VFD_IOC) {
/* Get a copy of the context ID for later use */
- file_ptr->context_id = H5_subfile_fhandle_to_context(file_handle);
+ file_ptr->context_id = H5_subfile_fid_to_context(file_ptr->file_id);
file_ptr->fa.require_ioc = true;
}
else if (driver->value == H5_VFD_SEC2) {
@@ -935,7 +1162,7 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma
* context ID will be returned, which is used for
* further interactions with this file's subfiles.
*/
- if (H5_open_subfiles(file_ptr->file_path, file_handle, &file_ptr->fa.shared_cfg, ioc_flags,
+ if (H5_open_subfiles(file_ptr->file_path, file_ptr->file_id, &file_ptr->fa.shared_cfg, ioc_flags,
file_ptr->comm, &file_ptr->context_id) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open subfiling files = %s\n",
name);
@@ -946,8 +1173,10 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma
sf_eof = -1;
}
- if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file_ptr->comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(NULL, "MPI_Bcast", mpi_code);
+ if (file_ptr->mpi_size > 1) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file_ptr->comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(NULL, "MPI_Bcast", mpi_code);
+ }
bcasted_eof = TRUE;
@@ -971,8 +1200,10 @@ done:
if (!bcasted_eof) {
sf_eof = -1;
- if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file_ptr->comm)))
- H5_SUBFILING_MPI_DONE_ERROR(NULL, "MPI_Bcast failed", mpi_code);
+ if (file_ptr->mpi_size > 1) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file_ptr->comm)))
+ H5_SUBFILING_MPI_DONE_ERROR(NULL, "MPI_Bcast failed", mpi_code);
+ }
}
}
@@ -993,11 +1224,8 @@ H5FD__subfiling_close_int(H5FD_subfiling_t *file_ptr)
if (file_ptr->sf_file && H5FD_close(file_ptr->sf_file) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close subfile");
-
- if (!file_ptr->fa.require_ioc) {
- if (file_ptr->context_id >= 0 && H5_free_subfiling_object(file_ptr->context_id) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free subfiling context object");
- }
+ if (file_ptr->stub_file && H5FD_close(file_ptr->stub_file) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close HDF5 stub file");
/* if set, close the copy of the plist for the underlying VFD. */
if ((file_ptr->fa.ioc_fapl_id >= 0) && (H5I_dec_ref(file_ptr->fa.ioc_fapl_id) < 0))
@@ -1107,7 +1335,6 @@ H5FD__subfiling_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /
*flags |= H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */
*flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
*flags |= H5FD_FEAT_HAS_MPI; /* This driver uses MPI */
- *flags |= H5FD_FEAT_ALLOCATE_EARLY; /* Allocate space early instead of late */
}
H5_SUBFILING_FUNC_LEAVE_API;
@@ -1151,15 +1378,22 @@ H5FD__subfiling_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD__subfiling_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t addr)
+H5FD__subfiling_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr)
{
H5FD_subfiling_t *file_ptr = (H5FD_subfiling_t *)_file;
herr_t ret_value = SUCCEED;
file_ptr->eoa = addr;
+ /* Set EOA for HDF5 stub file */
+ if (file_ptr->mpi_rank == 0) {
+ if (H5FD_set_eoa(file_ptr->stub_file, type, addr) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTSET, FAIL, "can't set HDF5 stub file EOA");
+ }
+
ret_value = H5FD_set_eoa(file_ptr->sf_file, type, addr);
+done:
H5_SUBFILING_FUNC_LEAVE_API;
} /* end H5FD__subfiling_set_eoa() */
@@ -1208,7 +1442,7 @@ H5FD__subfiling_get_handle(H5FD_t *_file, hid_t H5_ATTR_UNUSED fapl, void **file
if (!file_handle)
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file handle not valid");
- *file_handle = &(file->fd);
+ H5FD_get_vfd_handle(file->sf_file, file->fa.ioc_fapl_id, file_handle);
done:
H5_SUBFILING_FUNC_LEAVE_API;
@@ -1230,7 +1464,7 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
+H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size,
void *buf /*out*/)
{
subfiling_context_t *sf_context = NULL;
@@ -1243,7 +1477,7 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr
int64_t *sf_data_size = NULL;
int64_t *sf_offset = NULL;
hbool_t rank0_bcast = FALSE;
- int ioc_total;
+ int num_subfiles;
herr_t ret_value = SUCCEED;
HDassert(file_ptr && file_ptr->pub.cls);
@@ -1286,7 +1520,7 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr
/*
* Retrieve the subfiling context object and the number
- * of I/O concentrators.
+ * of subfiles.
*
* Given the current I/O and the I/O concentrator info,
* we can determine some I/O transaction parameters.
@@ -1300,50 +1534,50 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr
HDassert(sf_context);
HDassert(sf_context->topology);
- ioc_total = sf_context->topology->n_io_concentrators;
+ num_subfiles = sf_context->sf_num_subfiles;
- if (ioc_total == 0) {
- H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid number of I/O concentrators (%d)",
- ioc_total);
+ if (num_subfiles <= 0) {
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid number of subfiles (%d)",
+ num_subfiles);
}
- else if (ioc_total == 1) {
- /***********************************
- * No striping - just a single IOC *
- ***********************************/
+ else if (num_subfiles == 1) {
+ /***************************************
+ * No striping - just a single subfile *
+ ***************************************/
/* Make vector read call to subfile */
- if (H5FDread_vector(file_ptr->sf_file, dxpl_id, 1, &type, &addr, &size, &buf) < 0)
+ if (H5FD_read_vector(file_ptr->sf_file, 1, &type, &addr, &size, &buf) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "read from subfile failed");
}
else {
- int64_t max_io_req_per_ioc;
+ int64_t max_io_req_per_subfile;
int64_t file_offset;
int64_t block_size;
size_t max_depth;
herr_t status;
- int ioc_count = 0;
- int ioc_start = -1;
+ int num_subfiles_used = 0;
+ int first_subfile_idx = -1;
- /*********************************
- * Striping across multiple IOCs *
- *********************************/
+ /*************************************
+ * Striping across multiple subfiles *
+ *************************************/
block_size = sf_context->sf_blocksize_per_stripe;
max_depth = (size / (size_t)block_size) + 2;
/*
- * Given the number of I/O concentrators, allocate vectors (one per IOC)
- * to contain the translation of the I/O request into a collection of I/O
- * requests.
+ * Given the number of subfiles, allocate vectors (one per subfile)
+ * to contain the translation of the I/O request into a collection of
+ * I/O requests.
*/
- if (NULL ==
- (source_data_offset = HDcalloc(1, (size_t)ioc_total * max_depth * sizeof(*source_data_offset))))
+ if (NULL == (source_data_offset =
+ HDcalloc(1, (size_t)num_subfiles * max_depth * sizeof(*source_data_offset))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"can't allocate source data offset I/O vector");
- if (NULL == (sf_data_size = HDcalloc(1, (size_t)ioc_total * max_depth * sizeof(*sf_data_size))))
+ if (NULL == (sf_data_size = HDcalloc(1, (size_t)num_subfiles * max_depth * sizeof(*sf_data_size))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"can't allocate subfile data size I/O vector");
- if (NULL == (sf_offset = HDcalloc(1, (size_t)ioc_total * max_depth * sizeof(*sf_offset))))
+ if (NULL == (sf_offset = HDcalloc(1, (size_t)num_subfiles * max_depth * sizeof(*sf_offset))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"can't allocate subfile offset I/O vector");
@@ -1351,31 +1585,27 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr
/*
* Get the potential set of IOC transactions; e.g., data sizes,
- * offsets and datatypes. These can all be used by either the
- * underlying IOC or by the sec2 driver.
- *
- * For now, assume we're dealing with contiguous datasets. Vector
- * I/O will probably handle the non-contiguous case.
+ * offsets and datatypes.
*/
- status = init_indep_io(sf_context, /* IN: Context used to look up config info */
- file_offset, /* IN: Starting file offset */
- size, /* IN: I/O size */
- 1, /* IN: Data extent of the 'type' assumes byte */
- max_depth, /* IN: Maximum stripe depth */
- source_data_offset, /* OUT: Memory offset */
- sf_offset, /* OUT: File offset */
- sf_data_size, /* OUT: Length of this contiguous block */
- &ioc_start, /* OUT: IOC index corresponding to starting offset */
- &ioc_count, /* OUT: Number of actual IOCs used */
- &max_io_req_per_ioc); /* OUT: Maximum number of requests to any IOC */
+ status = init_indep_io(sf_context, /* IN: Context used to look up config info */
+ file_offset, /* IN: Starting file offset */
+ size, /* IN: I/O size */
+ 1, /* IN: Data extent of the 'type' assumes byte */
+ max_depth, /* IN: Maximum stripe depth */
+ source_data_offset, /* OUT: Memory offset */
+ sf_offset, /* OUT: File offset */
+ sf_data_size, /* OUT: Length of this contiguous block */
+ &first_subfile_idx, /* OUT: Subfile index corresponding to starting offset */
+ &num_subfiles_used, /* OUT: Number of actual subfiles used */
+ &max_io_req_per_subfile); /* OUT: Maximum number of requests to any subfile */
if (status < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't initialize IOC transactions");
- if (max_io_req_per_ioc > 0) {
+ if (max_io_req_per_subfile > 0) {
uint32_t vector_len;
- H5_CHECKED_ASSIGN(vector_len, uint32_t, ioc_count, int);
+ H5_CHECKED_ASSIGN(vector_len, uint32_t, num_subfiles_used, int);
/* Allocate I/O vectors */
if (NULL == (io_types = HDmalloc(vector_len * sizeof(*io_types))))
@@ -1391,20 +1621,20 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"can't allocate subfile I/O buffers vector");
- for (int64_t i = 0; i < max_io_req_per_ioc; i++) {
- uint32_t final_vec_len = vector_len;
- int next_ioc = ioc_start;
+ for (int64_t i = 0; i < max_io_req_per_subfile; i++) {
+ uint32_t final_vec_len = vector_len;
+ int next_subfile_idx = first_subfile_idx;
/* Fill in I/O types, offsets, sizes and buffers vectors */
for (uint32_t k = 0, vec_idx = 0; k < vector_len; k++) {
- size_t idx = (size_t)next_ioc * max_depth + (size_t)i;
+ size_t idx = (size_t)next_subfile_idx * max_depth + (size_t)i;
io_types[vec_idx] = type;
H5_CHECKED_ASSIGN(io_addrs[vec_idx], haddr_t, sf_offset[idx], int64_t);
H5_CHECKED_ASSIGN(io_sizes[vec_idx], size_t, sf_data_size[idx], int64_t);
io_bufs[vec_idx] = ((char *)buf + source_data_offset[idx]);
- next_ioc = (next_ioc + 1) % ioc_total;
+ next_subfile_idx = (next_subfile_idx + 1) % num_subfiles;
/* Skip 0-sized I/Os */
if (io_sizes[vec_idx] == 0) {
@@ -1417,13 +1647,13 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr
if (!rank0_bcast || (file_ptr->mpi_rank == 0)) {
/* Make vector read call to subfile */
- if (H5FDread_vector(file_ptr->sf_file, dxpl_id, final_vec_len, io_types, io_addrs,
- io_sizes, io_bufs) < 0)
+ if (H5FD_read_vector(file_ptr->sf_file, final_vec_len, io_types, io_addrs, io_sizes,
+ io_bufs) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "read from subfile failed");
}
}
- if (rank0_bcast) {
+ if (rank0_bcast && (file_ptr->mpi_size > 1)) {
H5_CHECK_OVERFLOW(size, size_t, int);
if (MPI_SUCCESS != MPI_Bcast(buf, (int)size, MPI_BYTE, 0, file_ptr->comm))
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "can't broadcast data from rank 0");
@@ -1470,7 +1700,7 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
+H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size,
const void *buf /*in*/)
{
subfiling_context_t *sf_context = NULL;
@@ -1482,7 +1712,7 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add
int64_t *source_data_offset = NULL;
int64_t *sf_data_size = NULL;
int64_t *sf_offset = NULL;
- int ioc_total;
+ int num_subfiles;
herr_t ret_value = SUCCEED;
HDassert(file_ptr && file_ptr->pub.cls);
@@ -1522,7 +1752,7 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add
/*
* Retrieve the subfiling context object and the number
- * of I/O concentrators.
+ * of subfiles.
*
* Given the current I/O and the I/O concentrator info,
* we can determine some I/O transaction parameters.
@@ -1536,50 +1766,61 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add
HDassert(sf_context);
HDassert(sf_context->topology);
- ioc_total = sf_context->topology->n_io_concentrators;
+ num_subfiles = sf_context->sf_num_subfiles;
- if (ioc_total == 0) {
- H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid number of I/O concentrators (%d)",
- ioc_total);
+ if (num_subfiles <= 0) {
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid number of subfiles (%d)",
+ num_subfiles);
}
- else if (ioc_total == 1) {
- /***********************************
- * No striping - just a single IOC *
- ***********************************/
+ else if (num_subfiles == 1) {
+ /***************************************
+ * No striping - just a single subfile *
+ ***************************************/
/* Make vector write call to subfile */
- if (H5FDwrite_vector(file_ptr->sf_file, dxpl_id, 1, &type, &addr, &size, &buf) < 0)
+ if (H5FD_write_vector(file_ptr->sf_file, 1, &type, &addr, &size, &buf) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "write to subfile failed");
+
+ /*
+ * Mirror superblock writes to the stub file so that
+ * legacy HDF5 applications can check what type of
+ * file they are reading
+ */
+ if ((type == H5FD_MEM_SUPER) && (file_ptr->mpi_rank == 0)) {
+ if (H5FD_write_vector(file_ptr->stub_file, 1, &type, &addr, &size, &buf) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,
+ "couldn't write superblock information to stub file");
+ }
}
else {
- int64_t max_io_req_per_ioc;
+ int64_t max_io_req_per_subfile;
int64_t file_offset;
int64_t block_size;
size_t max_depth;
herr_t status;
- int ioc_count = 0;
- int ioc_start = -1;
+ int num_subfiles_used = 0;
+ int first_subfile_idx = -1;
- /*********************************
- * Striping across multiple IOCs *
- *********************************/
+ /*************************************
+ * Striping across multiple subfiles *
+ *************************************/
block_size = sf_context->sf_blocksize_per_stripe;
max_depth = (size / (size_t)block_size) + 2;
/*
- * Given the number of I/O concentrators, allocate vectors (one per IOC)
- * to contain the translation of the I/O request into a collection of I/O
- * requests.
+ * Given the number of subfiles, allocate vectors (one per subfile)
+ * to contain the translation of the I/O request into a collection of
+ * I/O requests.
*/
- if (NULL ==
- (source_data_offset = HDcalloc(1, (size_t)ioc_total * max_depth * sizeof(*source_data_offset))))
+ if (NULL == (source_data_offset =
+ HDcalloc(1, (size_t)num_subfiles * max_depth * sizeof(*source_data_offset))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"can't allocate source data offset I/O vector");
- if (NULL == (sf_data_size = HDcalloc(1, (size_t)ioc_total * max_depth * sizeof(*sf_data_size))))
+ if (NULL == (sf_data_size = HDcalloc(1, (size_t)num_subfiles * max_depth * sizeof(*sf_data_size))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"can't allocate subfile data size I/O vector");
- if (NULL == (sf_offset = HDcalloc(1, (size_t)ioc_total * max_depth * sizeof(*sf_offset))))
+ if (NULL == (sf_offset = HDcalloc(1, (size_t)num_subfiles * max_depth * sizeof(*sf_offset))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"can't allocate subfile offset I/O vector");
@@ -1587,31 +1828,27 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add
/*
* Get the potential set of IOC transactions; e.g., data sizes,
- * offsets and datatypes. These can all be used by either the
- * underlying IOC or by the sec2 driver.
- *
- * For now, assume we're dealing with contiguous datasets. Vector
- * I/O will probably handle the non-contiguous case.
+ * offsets and datatypes.
*/
- status = init_indep_io(sf_context, /* IN: Context used to look up config info */
- file_offset, /* IN: Starting file offset */
- size, /* IN: I/O size */
- 1, /* IN: Data extent of the 'type' assumes byte */
- max_depth, /* IN: Maximum stripe depth */
- source_data_offset, /* OUT: Memory offset */
- sf_offset, /* OUT: File offset */
- sf_data_size, /* OUT: Length of this contiguous block */
- &ioc_start, /* OUT: IOC index corresponding to starting offset */
- &ioc_count, /* OUT: Number of actual IOCs used */
- &max_io_req_per_ioc); /* OUT: Maximum number of requests to any IOC */
+ status = init_indep_io(sf_context, /* IN: Context used to look up config info */
+ file_offset, /* IN: Starting file offset */
+ size, /* IN: I/O size */
+ 1, /* IN: Data extent of the 'type' assumes byte */
+ max_depth, /* IN: Maximum stripe depth */
+ source_data_offset, /* OUT: Memory offset */
+ sf_offset, /* OUT: File offset */
+ sf_data_size, /* OUT: Length of this contiguous block */
+ &first_subfile_idx, /* OUT: Subfile index corresponding to starting offset */
+ &num_subfiles_used, /* OUT: Number of actual subfiles used */
+ &max_io_req_per_subfile); /* OUT: Maximum number of requests to any subfile */
if (status < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't initialize IOC transactions");
- if (max_io_req_per_ioc > 0) {
+ if (max_io_req_per_subfile > 0) {
uint32_t vector_len;
- H5_CHECKED_ASSIGN(vector_len, uint32_t, ioc_count, int);
+ H5_CHECKED_ASSIGN(vector_len, uint32_t, num_subfiles_used, int);
/* Allocate I/O vectors */
if (NULL == (io_types = HDmalloc(vector_len * sizeof(*io_types))))
@@ -1627,20 +1864,20 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"can't allocate subfile I/O buffers vector");
- for (int64_t i = 0; i < max_io_req_per_ioc; i++) {
- uint32_t final_vec_len = vector_len;
- int next_ioc = ioc_start;
+ for (int64_t i = 0; i < max_io_req_per_subfile; i++) {
+ uint32_t final_vec_len = vector_len;
+ int next_subfile_idx = first_subfile_idx;
/* Fill in I/O types, offsets, sizes and buffers vectors */
for (uint32_t k = 0, vec_idx = 0; k < vector_len; k++) {
- size_t idx = (size_t)next_ioc * max_depth + (size_t)i;
+ size_t idx = (size_t)next_subfile_idx * max_depth + (size_t)i;
io_types[vec_idx] = type;
H5_CHECKED_ASSIGN(io_addrs[vec_idx], haddr_t, sf_offset[idx], int64_t);
H5_CHECKED_ASSIGN(io_sizes[vec_idx], size_t, sf_data_size[idx], int64_t);
io_bufs[vec_idx] = ((const char *)buf + source_data_offset[idx]);
- next_ioc = (next_ioc + 1) % ioc_total;
+ next_subfile_idx = (next_subfile_idx + 1) % num_subfiles;
/* Skip 0-sized I/Os */
if (io_sizes[vec_idx] == 0) {
@@ -1652,9 +1889,25 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add
}
/* Make vector write call to subfile */
- if (H5FDwrite_vector(file_ptr->sf_file, dxpl_id, final_vec_len, io_types, io_addrs, io_sizes,
- io_bufs) < 0)
+ if (H5FD_write_vector(file_ptr->sf_file, final_vec_len, io_types, io_addrs, io_sizes,
+ io_bufs) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "write to subfile failed");
+
+ /*
+ * Mirror superblock writes to the stub file so that
+ * legacy HDF5 applications can check what type of
+ * file they are reading
+ */
+ if (file_ptr->mpi_rank == 0) {
+ for (size_t count_idx = 0; count_idx < (size_t)final_vec_len; count_idx++) {
+ if (io_types[count_idx] == H5FD_MEM_SUPER) {
+ if (H5FD_write(file_ptr->stub_file, H5FD_MEM_SUPER, io_addrs[count_idx],
+ io_sizes[count_idx], io_bufs[count_idx]) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL,
+ "couldn't write superblock information to stub file");
+ }
+ }
+ }
}
}
}
@@ -2044,31 +2297,43 @@ H5FD__subfiling_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5
int64_t eoa;
int mpi_code;
- if (!H5CX_get_mpi_file_flushing())
- if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+ if (!H5CX_get_mpi_file_flushing()) {
+ if (file->mpi_size > 1)
+ if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+ }
if (0 == file->mpi_rank) {
if (H5FD__subfiling__get_real_eof(file->context_id, &sf_eof) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get EOF");
}
- if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file->comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ if (file->mpi_size > 1) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sf_eof, 1, MPI_INT64_T, 0, file->comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
if (sf_eof < 0)
H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid EOF");
H5_CHECKED_ASSIGN(eoa, int64_t, file->eoa, haddr_t);
- /* truncate sub-files */
- /* This is a hack. We should be doing the truncate of the sub-files via calls to
+ /* truncate subfiles */
+ /* This is a hack. We should be doing the truncate of the subfiles via calls to
* H5FD_truncate() with the IOC. However, that system is messed up at present.
 * Thus the following hack.
* JRM -- 12/18/21
*/
if (H5FD__subfiling__truncate_sub_files(file->context_id, eoa, file->comm) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "sub-file truncate request failed");
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "subfile truncate request failed");
+
+#if 0 /* TODO: Should be truncated only to size of superblock metadata */
+ /* Truncate the HDF5 stub file */
+ if (file->mpi_rank == 0) {
+ if (H5FD_truncate(file->stub_file, closing) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "stub file truncate request failed");
+ }
+#endif
/* Reset last file I/O information */
file->pos = HADDR_UNDEF;
@@ -2271,24 +2536,24 @@ done:
* As a consequence of not allowing use of MPI derived
* datatypes in the VFD layer, we need to accommodate the
* possibility that large I/O transactions will be required to
- * use multiple I/Os per IOC.
+ * use multiple I/Os per subfile.
*
- * Example: Using 4 IOCs, each with 1M stripe-depth; when
- * presented an I/O request for 8MB then at a minimum each IOC
- * will require 2 I/Os of 1MB each. Depending on the starting
- * file offset, the 2 I/Os can instead be 3...
+ * Example: Using 4 subfiles, each with 1M stripe-depth; when
+ * presented an I/O request for 8MB then at a minimum each
+ * subfile will require 2 I/Os of 1MB each. Depending on the
+ * starting file offset, the 2 I/Os can instead be 3...
*
* To fully describe the I/O transactions for reads and writes
* the output arrays are therefore arrays of I/O vectors,
 * where each vector has a length which corresponds to the
- * max number of I/O transactions per IOC. In the example
+ * max number of I/O transactions per subfile. In the example
* above, these vector lengths can be 2 or 3. The actual
* length is determined by the 'container_depth' variable.
*
- * For I/O operations which involve a subset of I/O
- * concentrators, the vector entries for the unused I/O
- * concentrators IOCs will have lengths of zero and be empty.
- * The 'container_depth' in this case will always be 1.
+ * For I/O operations which involve a subset of subfiles, the
+ * vector entries for the unused subfiles will have lengths of
+ * zero and be empty. The 'container_depth' in this case will
+ * always be 1.
*
* sf_context (IN)
* - the subfiling context for the file
@@ -2308,37 +2573,37 @@ done:
* the output arrays `mem_buf_offset`, `io_block_len`
* and `sf_offset`. NOTE that this routine expects each
* of these output arrays to have enough space allocated
- * for one I/O vector PER I/O concentrator. Therefore,
- * the total size of each output array should be at least
- * `max_iovec_len * n_io_concentrators`.
+ * for one I/O vector PER subfile. Therefore, the total
+ * size of each output array should be at least
+ * `max_iovec_len * num_subfiles`.
*
* mem_buf_offset (OUT)
- * - output array of vectors (one vector for each IOC)
+ * - output array of vectors (one vector for each subfile)
* containing the set of offsets into the memory buffer
* for I/O
*
* target_file_offset (OUT)
- * - output array of vectors (one vector for each IOC)
+ * - output array of vectors (one vector for each subfile)
* containing the set of offsets into the target file
*
* io_block_len (OUT)
- * - output array of vectors (one vector for each IOC)
+ * - output array of vectors (one vector for each subfile)
* containing the set of block lengths for each source
* buffer/target file offset.
*
- * first_ioc_index (OUT)
- * - the index of the first I/O concentrator that this I/O
- * operation begins at
+ * first_subfile_index (OUT)
+ * - the index of the first subfile that this I/O operation
+ * begins at
*
- * n_iocs_used (OUT)
- * - the number of I/O concentrators actually used for this
- * I/O operation, which may be different from the total
- * number of I/O concentrators for the file
+ * n_subfiles_used (OUT)
+ * - the number of subfiles actually used for this I/O
+ * operation, which may be different from the total
+ * number of subfiles for the file
*
- * max_io_req_per_ioc (OUT)
+ * max_io_req_per_subfile (OUT)
* - the maximum number of I/O requests to any particular
- * I/O concentrator, or the maximum "depth" of each I/O
- * vector in the output arrays.
+ * subfile, or the maximum "depth" of each I/O vector
+ * in the output arrays.
*
* Return: Non-negative on success/Negative on failure
*
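As a worked illustration of the vector-depth reasoning above, consider a hypothetical case of 4 subfiles with a 1 MiB stripe size and an 8 MiB request starting 6 MiB into the logical file. The variable names mirror the locals used by init_indep_io() below, but the snippet is only a sketch of the arithmetic:

#include <stdint.h>

static int64_t
example_max_iovec_depth(void)
{
    const int64_t stripe_size  = 1048576;              /* 1 MiB stripes       */
    const int     num_subfiles = 4;
    const int64_t file_offset  = 6 * stripe_size;      /* I/O starts 6 MiB in */
    const int64_t data_size    = 8 * stripe_size;      /* 8 MiB request       */

    int64_t stripe_idx       = file_offset / stripe_size;        /* 6  */
    int64_t first_subfile    = stripe_idx % num_subfiles;        /* 2  */
    int64_t final_offset     = file_offset + data_size - 1;
    int64_t final_stripe_idx = final_offset / stripe_size;       /* 13 */
    int64_t last_subfile     = final_stripe_idx % num_subfiles;  /* 1  */

    /* Maximum number of I/O requests to any one subfile (I/O vector "depth") */
    int64_t max_iovec_depth =
        (((final_stripe_idx - last_subfile) - (stripe_idx - first_subfile)) / num_subfiles) + 1; /* 3 */

    if (last_subfile < first_subfile)
        max_iovec_depth--; /* 2 */

    return max_iovec_depth;
}

Each of the 4 subfiles ends up servicing exactly two 1 MiB requests in this case, matching the 8 MiB example given in the comment above.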
@@ -2347,7 +2612,8 @@ done:
static herr_t
init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_nelemts, size_t dtype_extent,
size_t max_iovec_len, int64_t *mem_buf_offset, int64_t *target_file_offset,
- int64_t *io_block_len, int *first_ioc_index, int *n_iocs_used, int64_t *max_io_req_per_ioc)
+ int64_t *io_block_len, int *first_subfile_index, int *n_subfiles_used,
+ int64_t *max_io_req_per_subfile)
{
int64_t stripe_size = 0;
int64_t block_size = 0;
@@ -2360,8 +2626,8 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne
int64_t final_offset = 0;
int64_t start_length = 0;
int64_t final_length = 0;
- int64_t ioc_start = 0;
- int64_t ioc_final = 0;
+ int64_t first_subfile = 0;
+ int64_t last_subfile = 0;
int64_t start_row = 0;
int64_t row_offset = 0;
int64_t row_stripe_idx_start = 0;
@@ -2370,41 +2636,44 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne
int64_t curr_max_iovec_depth = 0;
int64_t total_bytes = 0;
int64_t mem_offset = 0;
- int ioc_count = 0;
+ int num_subfiles = 0;
herr_t ret_value = SUCCEED;
HDassert(sf_context);
- HDassert(sf_context->topology);
- HDassert(sf_context->topology->n_io_concentrators > 0);
HDassert(sf_context->sf_stripe_size > 0);
HDassert(sf_context->sf_blocksize_per_stripe > 0);
+ HDassert(sf_context->sf_num_subfiles > 0);
+ HDassert(sf_context->topology);
HDassert(mem_buf_offset);
HDassert(target_file_offset);
HDassert(io_block_len);
- HDassert(first_ioc_index);
- HDassert(n_iocs_used);
- HDassert(max_io_req_per_ioc);
+ HDassert(first_subfile_index);
+ HDassert(n_subfiles_used);
+ HDassert(max_io_req_per_subfile);
- *first_ioc_index = 0;
- *n_iocs_used = 0;
- *max_io_req_per_ioc = 0;
+ *first_subfile_index = 0;
+ *n_subfiles_used = 0;
+ *max_io_req_per_subfile = 0;
/*
* Retrieve the needed fields from the subfiling context.
*
- * ioc_count
- * - the total number of I/O concentrators in the
- * application topology
* stripe_size
* - the size of the data striping across the file's subfiles
* block_size
* - the size of a "block" across the IOCs, as calculated
- * by the stripe size multiplied by the number of I/O
- * concentrators
+ * by the stripe size multiplied by the number of
+ * subfiles
+ * num_subfiles
+ * - the total number of subfiles for the logical
+ * HDF5 file
+ * num_io_concentrators
+ * - the number of I/O concentrators currently being
+ * used
*/
- ioc_count = sf_context->topology->n_io_concentrators;
- stripe_size = sf_context->sf_stripe_size;
- block_size = sf_context->sf_blocksize_per_stripe;
+ stripe_size = sf_context->sf_stripe_size;
+ block_size = sf_context->sf_blocksize_per_stripe;
+ num_subfiles = sf_context->sf_num_subfiles;
H5_CHECKED_ASSIGN(data_size, int64_t, (io_nelemts * dtype_extent), size_t);
@@ -2415,16 +2684,16 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne
* - a stripe "index" given by the file offset divided by the
* stripe size. Note that when the file offset equals or exceeds
* the block size, we simply wrap around. So, for example, if 4
- * I/O concentrators are being used with a stripe size of 1MiB,
- * the block size would be 4MiB and file offset 4096 would have
- * a stripe index of 4 and reside in the same subfile as stripe
- * index 0 (offsets 0-1023)
+ * subfiles are being used with a stripe size of 1KiB, the block
+ * size would be 4KiB and file offset 4096 would have a stripe
+ * index of 4 and reside in the same subfile as stripe index 0
+ * (offsets 0-1023)
* offset_in_stripe
* - the relative offset in the stripe that the starting file
* offset resides in
* offset_in_block
- * - the relative offset in the "block" of stripes across the I/O
- * concentrators
+ * - the relative offset in the "block" of stripes across the
+ * subfiles
* final_offset
* - the last offset in the virtual file covered by this I/O
* operation. Simply the I/O size added to the starting file
@@ -2442,19 +2711,18 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne
HDassert(final_length <= stripe_size);
/*
- * Determine which I/O concentrator the I/O request begins
- * in and which "row" the I/O request begins in within the
- * "block" of stripes across the I/O concentrators. Note that
- * "row" here is just a conceptual way to think of how a block
- * of data stripes is laid out across the I/O concentrator
- * subfiles. A block's "column" size in bytes is equal to the
- * stripe size multiplied the number of I/O concentrators.
- * Therefore, file offsets that are multiples of the block size
- * begin a new "row".
+ * Determine which subfile the I/O request begins in and which
+ * "row" the I/O request begins in within the "block" of stripes
+ * across the subfiles. Note that "row" here is just a conceptual
+ * way to think of how a block of data stripes is laid out across
+ * the subfiles. A block's "column" size in bytes is equal to the
+ * stripe size multiplied by the number of subfiles. Therefore,
+ * file offsets that are multiples of the block size begin a new
+ * "row".
*/
- start_row = stripe_idx / ioc_count;
- ioc_start = stripe_idx % ioc_count;
- H5_CHECK_OVERFLOW(ioc_start, int64_t, int);
+ start_row = stripe_idx / num_subfiles;
+ first_subfile = stripe_idx % num_subfiles;
+ H5_CHECK_OVERFLOW(first_subfile, int64_t, int);
/*
* Set initial file offset for starting "row"
@@ -2464,53 +2732,52 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne
/*
* Determine the stripe "index" of the last offset in the
- * virtual file and, from that, determine the I/O concentrator
- * that the I/O request ends in.
+ * virtual file and, from that, determine the subfile that
+ * the I/O request ends in.
*/
final_stripe_idx = final_offset / stripe_size;
- ioc_final = final_stripe_idx % ioc_count;
+ last_subfile = final_stripe_idx % num_subfiles;
/*
* Determine how "deep" the resulting I/O vectors are at
* most by calculating the maximum number of "rows" spanned
* for any particular subfile; e.g. the maximum number of
- * I/O requests for any particular I/O concentrator
+ * I/O requests for any particular subfile
*/
- row_stripe_idx_start = stripe_idx - ioc_start;
- row_stripe_idx_final = final_stripe_idx - ioc_final;
- max_iovec_depth = ((row_stripe_idx_final - row_stripe_idx_start) / ioc_count) + 1;
+ row_stripe_idx_start = stripe_idx - first_subfile;
+ row_stripe_idx_final = final_stripe_idx - last_subfile;
+ max_iovec_depth = ((row_stripe_idx_final - row_stripe_idx_start) / num_subfiles) + 1;
- if (ioc_final < ioc_start)
+ if (last_subfile < first_subfile)
max_iovec_depth--;
/* Set returned parameters early */
- *first_ioc_index = (int)ioc_start;
- *n_iocs_used = ioc_count;
- *max_io_req_per_ioc = max_iovec_depth;
+ *first_subfile_index = (int)first_subfile;
+ *n_subfiles_used = num_subfiles;
+ *max_io_req_per_subfile = max_iovec_depth;
#ifdef H5_SUBFILING_DEBUG
H5_subfiling_log(sf_context->sf_context_id,
"%s: FILE OFFSET = %" PRId64 ", DATA SIZE = %zu, STRIPE SIZE = %" PRId64, __func__,
file_offset, io_nelemts, stripe_size);
H5_subfiling_log(sf_context->sf_context_id,
- "%s: IOC START = %" PRId64 ", IOC FINAL = %" PRId64 ", "
+ "%s: FIRST SUBFILE = %" PRId64 ", LAST SUBFILE = %" PRId64 ", "
"MAX IOVEC DEPTH = %" PRId64 ", START LENGTH = %" PRId64 ", FINAL LENGTH = %" PRId64,
- __func__, ioc_start, ioc_final, max_iovec_depth, start_length, final_length);
+ __func__, first_subfile, last_subfile, max_iovec_depth, start_length, final_length);
#endif
/*
- * Loop through the set of I/O concentrators to determine
- * the various vector components for each. I/O concentrators
- * whose data size is zero will not have I/O requests passed
- * to them.
+ * Loop through the set of subfiles to determine the various
+ * vector components for each. Subfiles whose data size is
+ * zero will not have I/O requests passed to them.
*/
curr_stripe_idx = stripe_idx;
curr_max_iovec_depth = max_iovec_depth;
- for (int i = 0, k = (int)ioc_start; i < ioc_count; i++) {
+ for (int i = 0, k = (int)first_subfile; i < num_subfiles; i++) {
int64_t *_mem_buf_offset;
int64_t *_target_file_offset;
int64_t *_io_block_len;
- int64_t ioc_bytes = 0;
+ int64_t subfile_bytes = 0;
int64_t iovec_depth;
hbool_t is_first = FALSE;
hbool_t is_last = FALSE;
@@ -2532,14 +2799,14 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne
HDmemset(_io_block_len, 0, (max_iovec_len * sizeof(*_io_block_len)));
if (total_bytes == data_size) {
- *n_iocs_used = i;
+ *n_subfiles_used = i;
goto done;
}
if (total_bytes < data_size) {
int64_t num_full_stripes = iovec_depth;
- if (k == ioc_start) {
+ if (k == first_subfile) {
is_first = TRUE;
/*
@@ -2547,12 +2814,12 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne
* starting on a stripe boundary
*/
if (start_length < stripe_size) {
- ioc_bytes += start_length;
+ subfile_bytes += start_length;
num_full_stripes--;
}
}
- if (k == ioc_final) {
+ if (k == last_subfile) {
is_last = TRUE;
/*
@@ -2560,34 +2827,35 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne
* ending on a stripe boundary
*/
if (final_length < stripe_size) {
- ioc_bytes += final_length;
+ subfile_bytes += final_length;
if (num_full_stripes)
num_full_stripes--;
}
}
- /* Account for IOCs with uniform segments */
+ /* Account for subfiles with uniform segments */
if (!is_first && !is_last) {
hbool_t thin_uniform_section = FALSE;
- if (ioc_final >= ioc_start) {
+ if (last_subfile >= first_subfile) {
/*
- * When an IOC has an index value that is greater
- * than both the starting IOC and ending IOC indices,
- * it is a "thinner" section with a smaller I/O vector
- * depth.
+ * When a subfile has an index value that is greater
+ * than both the starting subfile and ending subfile
+ * indices, it is a "thinner" section with a smaller
+ * I/O vector depth.
*/
- thin_uniform_section = (k > ioc_start) && (k > ioc_final);
+ thin_uniform_section = (k > first_subfile) && (k > last_subfile);
}
- if (ioc_final < ioc_start) {
+ if (last_subfile < first_subfile) {
/*
- * This can also happen when the IOC with the final
- * data segment has a smaller IOC index than the IOC
- * with the first data segment and the current IOC
- * index falls between the two.
+ * This can also happen when the subfile with the final
+ * data segment has a smaller subfile index than the
+ * subfile with the first data segment and the current
+ * subfile index falls between the two.
*/
- thin_uniform_section = thin_uniform_section || ((ioc_final < k) && (k < ioc_start));
+ thin_uniform_section =
+ thin_uniform_section || ((last_subfile < k) && (k < first_subfile));
}
if (thin_uniform_section) {
@@ -2605,45 +2873,45 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne
* size of the fully selected I/O stripes to the
* running bytes total
*/
- ioc_bytes += num_full_stripes * stripe_size;
- total_bytes += ioc_bytes;
+ subfile_bytes += num_full_stripes * stripe_size;
+ total_bytes += subfile_bytes;
}
_mem_buf_offset[0] = mem_offset;
_target_file_offset[0] = row_offset + offset_in_block;
- _io_block_len[0] = ioc_bytes;
+ _io_block_len[0] = subfile_bytes;
- if (ioc_count > 1) {
+ if (num_subfiles > 1) {
int64_t curr_file_offset = row_offset + offset_in_block;
/* Fill the I/O vectors */
if (is_first) {
if (is_last) { /* First + Last */
- if (iovec_fill_first_last(sf_context, iovec_depth, ioc_bytes, mem_offset,
+ if (iovec_fill_first_last(sf_context, iovec_depth, subfile_bytes, mem_offset,
curr_file_offset, start_length, final_length, _mem_buf_offset,
_target_file_offset, _io_block_len) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't fill I/O vectors");
}
else { /* First ONLY */
- if (iovec_fill_first(sf_context, iovec_depth, ioc_bytes, mem_offset, curr_file_offset,
+ if (iovec_fill_first(sf_context, iovec_depth, subfile_bytes, mem_offset, curr_file_offset,
start_length, _mem_buf_offset, _target_file_offset,
_io_block_len) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't fill I/O vectors");
}
/* Move the memory pointer to the starting location
- * for next IOC request.
+ * for next subfile I/O request.
*/
mem_offset += start_length;
}
else if (is_last) { /* Last ONLY */
- if (iovec_fill_last(sf_context, iovec_depth, ioc_bytes, mem_offset, curr_file_offset,
+ if (iovec_fill_last(sf_context, iovec_depth, subfile_bytes, mem_offset, curr_file_offset,
final_length, _mem_buf_offset, _target_file_offset, _io_block_len) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't fill I/O vectors");
mem_offset += stripe_size;
}
else { /* Everything else (uniform) */
- if (iovec_fill_uniform(sf_context, iovec_depth, ioc_bytes, mem_offset, curr_file_offset,
+ if (iovec_fill_uniform(sf_context, iovec_depth, subfile_bytes, mem_offset, curr_file_offset,
_mem_buf_offset, _target_file_offset, _io_block_len) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTINIT, FAIL, "can't fill I/O vectors");
@@ -2656,10 +2924,10 @@ init_indep_io(subfiling_context_t *sf_context, int64_t file_offset, size_t io_ne
k++;
curr_stripe_idx++;
- if (k == ioc_count) {
+ if (k == num_subfiles) {
k = 0;
offset_in_block = 0;
- curr_max_iovec_depth = ((final_stripe_idx - curr_stripe_idx) / ioc_count) + 1;
+ curr_max_iovec_depth = ((final_stripe_idx - curr_stripe_idx) / num_subfiles) + 1;
row_offset += block_size;
}
diff --git a/src/H5FDsubfiling/H5FDsubfiling.h b/src/H5FDsubfiling/H5FDsubfiling.h
index 3bc448b..93d0c3e 100644
--- a/src/H5FDsubfiling/H5FDsubfiling.h
+++ b/src/H5FDsubfiling/H5FDsubfiling.h
@@ -48,21 +48,51 @@
/**
* \def H5FD_SUBFILING_DEFAULT_STRIPE_SIZE
- * The default stripe size (in bytes) for data stripes in sub-files
+ * The default stripe size (in bytes) for data stripes in subfiles
*/
#define H5FD_SUBFILING_DEFAULT_STRIPE_SIZE (32 * 1024 * 1024)
/**
+ * \def H5FD_SUBFILING_DEFAULT_STRIPE_COUNT
+ * Macro for the default Subfiling stripe count value. The default
+ * is currently to use one subfile per node.
+ */
+#define H5FD_SUBFILING_DEFAULT_STRIPE_COUNT -1
+
+/**
* \def H5FD_SUBFILING_FILENAME_TEMPLATE
- * The basic template for a sub-file filename
+ * The basic template for a subfile filename. The format specifiers
+ * correspond to:
+ *
+ * %s -> base filename, e.g. "file.h5"
+ * %PRIu64 -> file inode, e.g. 11273556
+ * %0*d -> number (starting at 1) signifying the Nth (out of total
+ * number of subfiles) subfile. Zero-padded according
+ * to the number of digits in the number of subfiles
+ * (calculated by log10(num_subfiles) + 1)
+ * %d -> number of subfiles
+ *
+ * yielding filenames such as:
+ *
+ * file.h5.subfile_11273556_01_of_10
+ * file.h5.subfile_11273556_02_of_10
+ * file.h5.subfile_11273556_10_of_10
*/
-#define H5FD_SUBFILING_FILENAME_TEMPLATE ".subfile_%" PRIu64 "_%0*d_of_%d"
+#define H5FD_SUBFILING_FILENAME_TEMPLATE "%s.subfile_%" PRIu64 "_%0*d_of_%d"
/**
* \def H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE
- * The basic template for a #H5FD_SUBFILING driver configuration filename
+ * The basic template for a #H5FD_SUBFILING driver configuration filename.
+ * The format specifiers correspond to:
+ *
+ * %s -> base filename, e.g. "file.h5"
+ * %PRIu64 -> file inode, e.g. 11273556
+ *
+ * yielding a filename such as:
+ *
+ * file.h5.subfile_11273556.config
*/
-#define H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE ".subfile_%" PRIu64 ".config"
+#define H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE "%s.subfile_%" PRIu64 ".config"
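A small stand-alone sketch of how these templates expand, using hypothetical values for the base name, inode and subfile count; the local defines simply mirror the two macros above so the example compiles on its own:

#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Local mirrors of the two filename templates documented above */
#define EX_SUBFILE_TEMPLATE "%s.subfile_%" PRIu64 "_%0*d_of_%d"
#define EX_CONFIG_TEMPLATE  "%s.subfile_%" PRIu64 ".config"

int
main(void)
{
    char     name[256];
    uint64_t file_inode   = 11273556; /* hypothetical stub file inode */
    int      num_subfiles = 10;
    int      num_digits   = (int)(log10(num_subfiles) + 1);

    snprintf(name, sizeof(name), EX_SUBFILE_TEMPLATE, "file.h5", file_inode, num_digits, 1, num_subfiles);
    printf("%s\n", name); /* file.h5.subfile_11273556_01_of_10 */

    snprintf(name, sizeof(name), EX_CONFIG_TEMPLATE, "file.h5", file_inode);
    printf("%s\n", name); /* file.h5.subfile_11273556.config */

    return 0;
}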
/*
* Environment variables interpreted by the HDF5 Subfiling feature
@@ -71,7 +101,7 @@
/**
* \def H5FD_SUBFILING_STRIPE_SIZE
* Macro for name of the environment variable that specifies the size
- * (in bytes) for data stripes in sub-files
+ * (in bytes) for data stripes in subfiles
*
* The value set for this environment variable is interpreted as a
* long long value and must be > 0.
@@ -112,7 +142,7 @@
/**
* \def H5FD_SUBFILING_SUBFILE_PREFIX
* Macro for name of the environment variable that specifies a prefix
- * to apply to the filenames generated for sub-files
+ * to apply to the filenames generated for subfiles
*
* The value set for this environment variable is interpreted as a
* pathname.
@@ -153,53 +183,56 @@
* Unused. Sentinel value
*/
typedef enum {
- SELECT_IOC_ONE_PER_NODE = 0, /* Default */
+ SELECT_IOC_ONE_PER_NODE = 0, /* Default */
SELECT_IOC_EVERY_NTH_RANK, /* Starting at rank 0, select-next += N */
- SELECT_IOC_WITH_CONFIG, /* NOT IMPLEMENTED: Read-from-file */
- SELECT_IOC_TOTAL, /* Starting at rank 0, mpi_size / total */
- ioc_selection_options /* Sentinel value */
+ SELECT_IOC_WITH_CONFIG, /* NOT IMPLEMENTED: Read-from-file */
+ SELECT_IOC_TOTAL, /* Starting at rank 0, mpi_size / total */
+ ioc_selection_options /* Sentinel value */
} H5FD_subfiling_ioc_select_t;
/**
- * \struct H5FD_subfiling_shared_config_t
- * \brief Subfiling configuration structure that is shared between the #H5FD_SUBFILING
+ * \struct H5FD_subfiling_params_t
+ * \brief Subfiling parameter structure that is shared between the #H5FD_SUBFILING
* and #H5FD_IOC drivers
*
- * \var H5FD_subfiling_ioc_select_t H5FD_subfiling_shared_config_t::ioc_selection
+ * \var H5FD_subfiling_ioc_select_t H5FD_subfiling_params_t::ioc_selection
* The method to use for selecting MPI ranks to be I/O concentrators. The
* current default is to select one MPI rank per node to be an I/O concentrator.
* Refer to #H5FD_subfiling_ioc_select_t for a description of the algorithms
* available for use.
*
- * \var int64_t H5FD_subfiling_shared_config_t::stripe_size
+ * \var int64_t H5FD_subfiling_params_t::stripe_size
* The stripe size defines the size (in bytes) of the data stripes in the
- * sub-files for the logical HDF5 file. Data is striped across the sub-files
+ * subfiles for the logical HDF5 file. Data is striped across the subfiles
* in a round-robin wrap-around fashion in segments equal to the stripe size.
*
- * For example, in an HDF5 file consisting of four sub-files with a 1MiB stripe
- * size, the first and fifth 1MiB of data would reside in the first sub-file,
- * the second and sixth 1MiB of data would reside in the second sub-file and so
+ * For example, in an HDF5 file consisting of four subfiles with a 1MiB stripe
+ * size, the first and fifth 1MiB of data would reside in the first subfile,
+ * the second and sixth 1MiB of data would reside in the second subfile and so
* on.
*
* This value can also be set or adjusted with the #H5FD_SUBFILING_STRIPE_SIZE
* environment variable.
*
- * \var int32_t H5FD_subfiling_shared_config_t::stripe_count
- * The number of I/O concentrators (and, currently, the number of sub-files)
- * to use for the logical HDF5 file. This value is used in conjunction with
- * the IOC selection method to determine which MPI ranks will be assigned as
- * I/O concentrators.
- *
- * Alternatively, the mapping between MPI ranks and I/O concentrators can be
- * set or adjusted with a combination of the #ioc_selection field and the
- * #H5FD_SUBFILING_IOC_PER_NODE and #H5FD_SUBFILING_IOC_SELECTION_CRITERIA
- * environment variables.
+ * \var int32_t H5FD_subfiling_params_t::stripe_count
+ * The target number of subfiles to use for the logical HDF5 file. The current
+ * default is to use one subfile per node, but it can be useful to set a
+ * different target number of subfiles, especially if the HDF5 application will
+ * pre-create the HDF5 file on a single MPI rank. In that particular case, the
+ * single rank will need to know how many subfiles the logical HDF5 file will
+ * consist of in order to properly pre-create the file.
+ *
+ * This value is used in conjunction with the IOC selection method to determine
+ * which MPI ranks will be assigned as I/O concentrators. Alternatively, the
+ * mapping between MPI ranks and I/O concentrators can be set or adjusted with a
+ * combination of the #ioc_selection field and the #H5FD_SUBFILING_IOC_PER_NODE
+ * and #H5FD_SUBFILING_IOC_SELECTION_CRITERIA environment variables.
*/
-typedef struct H5FD_subfiling_shared_config_t {
- H5FD_subfiling_ioc_select_t ioc_selection; /* Method to select I/O concentrators */
- int64_t stripe_size; /* Size (in bytes) of data stripes in sub-files */
- int32_t stripe_count; /* Number of I/O concentrators to use */
-} H5FD_subfiling_shared_config_t;
+typedef struct H5FD_subfiling_params_t {
+ H5FD_subfiling_ioc_select_t ioc_selection; /* Method to select I/O concentrators */
+ int64_t stripe_size; /* Size (in bytes) of data stripes in subfiles */
+ int32_t stripe_count; /* Target number of subfiles to use */
+} H5FD_subfiling_params_t;
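The round-robin striping described for stripe_size above reduces to simple modular arithmetic. The following editorial sketch shows one mapping from a logical file offset to a subfile index and subfile-local offset that is consistent with the description; the variable names are illustrative and the code is not taken from the patch:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        int64_t stripe_size  = 1024 * 1024; /* 1 MiB stripes, as in the example above */
        int32_t stripe_count = 4;           /* four subfiles */
        int64_t offset       = 5 * stripe_size + 100; /* 100 bytes into the sixth stripe */

        int64_t stripe_idx  = offset / stripe_size;                 /* 5 */
        int32_t subfile_idx = (int32_t)(stripe_idx % stripe_count); /* 1 -> the second subfile */
        int64_t subfile_off = (stripe_idx / stripe_count) * stripe_size
                              + (offset % stripe_size);             /* second stripe of that subfile, plus 100 */

        printf("subfile %" PRId32 ", offset %" PRId64 "\n", subfile_idx, subfile_off);

        return 0;
    }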
//! <!-- [H5FD_subfiling_config_t_snip] -->
/**
@@ -226,7 +259,7 @@ typedef struct H5FD_subfiling_shared_config_t {
* \var hid_t H5FD_subfiling_config_t::ioc_fapl_id
* The File Access Property List which is setup with the file driver that
* the #H5FD_SUBFILING driver will use for servicing I/O requests to the
- * sub-files. Currently, the File Access Property List must be setup with
+ * subfiles. Currently, the File Access Property List must be setup with
* the #H5FD_IOC driver by calling H5Pset_fapl_ioc(), but future development
* may allow other file drivers to be used.
*
@@ -235,19 +268,18 @@ typedef struct H5FD_subfiling_shared_config_t {
* use the #H5FD_IOC driver for its I/O operations. This field should currently
* always be set to TRUE.
*
- * \var H5FD_subfiling_shared_config_t H5FD_subfiling_config_t::shared_cfg
+ * \var H5FD_subfiling_params_t H5FD_subfiling_config_t::shared_cfg
* A structure which contains the subfiling parameters that are shared between
- * the #H5FD_SUBFILING and #H5FD_IOC drivers. This includes the sub-file stripe
- * size, number of I/O concentrators, IOC selection method, etc.
+ * the #H5FD_SUBFILING and #H5FD_IOC drivers. This includes the subfile stripe
+ * size, stripe count, IOC selection method, etc.
*
*/
typedef struct H5FD_subfiling_config_t {
- uint32_t magic; /* Must be set to H5FD_SUBFILING_FAPL_MAGIC */
- uint32_t version; /* Must be set to H5FD_SUBFILING_CURR_FAPL_VERSION */
- hid_t ioc_fapl_id; /* The FAPL setup with the stacked VFD to use for I/O concentrators */
- hbool_t require_ioc; /* Whether to use the IOC VFD (currently must always be TRUE) */
- H5FD_subfiling_shared_config_t
- shared_cfg; /* Subfiling/IOC parameters (stripe size, stripe count, etc.) */
+ uint32_t magic; /* Must be set to H5FD_SUBFILING_FAPL_MAGIC */
+ uint32_t version; /* Must be set to H5FD_SUBFILING_CURR_FAPL_VERSION */
+ hid_t ioc_fapl_id; /* The FAPL setup with the stacked VFD to use for I/O concentrators */
+ hbool_t require_ioc; /* Whether to use the IOC VFD (currently must always be TRUE) */
+ H5FD_subfiling_params_t shared_cfg; /* Subfiling/IOC parameters (stripe size, stripe count, etc.) */
} H5FD_subfiling_config_t;
//! <!-- [H5FD_subfiling_config_t_snip] -->
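For reference, a hedged sketch of how an application might populate this structure on a FAPL, starting from the driver defaults. It assumes the public H5Pget_fapl_subfiling()/H5Pset_fapl_subfiling() calls and an already-initialized MPI environment; the helper name and stripe values are illustrative, not part of the patch:

    #include <mpi.h>
    #include "hdf5.h"
    #include "H5FDsubfiling.h"

    /* Illustrative helper: build a FAPL configured for the Subfiling VFD */
    static hid_t
    make_subfiling_fapl(MPI_Comm comm, MPI_Info info)
    {
        H5FD_subfiling_config_t cfg;
        hid_t                   fapl_id = H5Pcreate(H5P_FILE_ACCESS);

        if (fapl_id < 0)
            return H5I_INVALID_HID;
        if (H5Pset_mpi_params(fapl_id, comm, info) < 0)
            goto error;

        /* Start from the driver defaults, then adjust the shared parameters */
        if (H5Pget_fapl_subfiling(fapl_id, &cfg) < 0)
            goto error;
        cfg.shared_cfg.ioc_selection = SELECT_IOC_ONE_PER_NODE;
        cfg.shared_cfg.stripe_size   = 16 * 1024 * 1024;                    /* 16 MiB stripes */
        cfg.shared_cfg.stripe_count  = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT; /* one subfile per node */

        if (H5Pset_fapl_subfiling(fapl_id, &cfg) < 0)
            goto error;

        return fapl_id;

    error:
        H5Pclose(fapl_id);
        return H5I_INVALID_HID;
    }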
@@ -274,8 +306,8 @@ H5_DLL hid_t H5FD_subfiling_init(void);
*
* The #H5FD_SUBFILING driver is an MPI-based file driver that allows an
* HDF5 application to distribute a logical HDF5 file across a collection
- * of "sub-files" in equal-sized data segment "stripes". I/O to the logical
- * HDF5 file is then directed to the appropriate "sub-file" according to the
+ * of "subfiles" in equal-sized data segment "stripes". I/O to the logical
+ * HDF5 file is then directed to the appropriate "subfile" according to the
* #H5FD_SUBFILING configuration and a system of I/O concentrators, which
* are MPI ranks operating worker threads.
*
diff --git a/src/H5FDsubfiling/H5subfiling_common.c b/src/H5FDsubfiling/H5subfiling_common.c
index 9cc2c65..a1cca65 100644
--- a/src/H5FDsubfiling/H5subfiling_common.c
+++ b/src/H5FDsubfiling/H5subfiling_common.c
@@ -19,9 +19,9 @@
#include "H5MMprivate.h"
-typedef struct { /* Format of a context map entry */
- void *file_handle; /* key value (linear search of the cache) */
- int64_t sf_context_id; /* The return value if matching file_handle */
+typedef struct { /* Format of a context map entry */
+ uint64_t file_id; /* key value (linear search of the cache) */
+ int64_t sf_context_id; /* The return value if matching file_handle */
} file_map_to_context_t;
/* Identifiers for HDF5's error API */
@@ -30,423 +30,52 @@ hid_t H5subfiling_err_class_g = H5I_INVALID_HID;
char H5subfiling_mpi_error_str[MPI_MAX_ERROR_STRING];
int H5subfiling_mpi_error_str_len;
-static subfiling_context_t *sf_context_cache = NULL;
-static sf_topology_t *sf_topology_cache = NULL;
+/* MPI Datatype used to send/receive an RPC message */
+MPI_Datatype H5_subfiling_rpc_msg_type = MPI_DATATYPE_NULL;
-static size_t sf_context_cache_limit = 16;
-static size_t sf_topology_cache_limit = 4;
+static subfiling_context_t **sf_context_cache = NULL;
+static sf_topology_t **sf_topology_cache = NULL;
+
+static size_t sf_context_cache_size = 0;
+static size_t sf_topology_cache_size = 0;
+static size_t sf_context_cache_num_entries = 0;
+static size_t sf_topology_cache_num_entries = 0;
static file_map_to_context_t *sf_open_file_map = NULL;
static int sf_file_map_size = 0;
-#define DEFAULT_FILE_MAP_ENTRIES 8
+#define DEFAULT_CONTEXT_CACHE_SIZE 16
+#define DEFAULT_TOPOLOGY_CACHE_SIZE 4
+#define DEFAULT_FILE_MAP_ENTRIES 8
+
+static herr_t H5_free_subfiling_object(int64_t object_id);
static herr_t H5_free_subfiling_object_int(subfiling_context_t *sf_context);
static herr_t H5_free_subfiling_topology(sf_topology_t *topology);
-static herr_t init_subfiling(H5FD_subfiling_shared_config_t *subfiling_config, MPI_Comm comm,
+static herr_t init_subfiling(const char *base_filename, uint64_t file_id,
+ H5FD_subfiling_params_t *subfiling_config, int file_acc_flags, MPI_Comm comm,
int64_t *context_id_out);
-static herr_t init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm,
+static herr_t init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm,
sf_topology_t **app_topology_out);
-static herr_t init_subfiling_context(subfiling_context_t *sf_context,
- H5FD_subfiling_shared_config_t *subfiling_config,
+static herr_t get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type,
+ char **ioc_sel_info_str);
+static herr_t find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config,
+ long iocs_per_node, sf_topology_t **app_topology);
+static herr_t init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm);
+static herr_t gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm);
+static int compare_layout_nodelocal(const void *layout1, const void *layout2);
+static herr_t identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride);
+static herr_t init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename,
+ uint64_t file_id, H5FD_subfiling_params_t *subfiling_config,
sf_topology_t *app_topology, MPI_Comm file_comm);
static herr_t open_subfile_with_context(subfiling_context_t *sf_context, int file_acc_flags);
-static herr_t record_fid_to_subfile(void *file_handle, int64_t subfile_context_id, int *next_index);
-static herr_t ioc_open_file(int64_t file_context_id, int file_acc_flags);
-static herr_t generate_subfile_name(subfiling_context_t *sf_context, int file_acc_flags, char *filename_out,
- size_t filename_out_len, char **filename_basename_out,
- char **subfile_dir_out);
+static herr_t record_fid_to_subfile(uint64_t file_id, int64_t subfile_context_id, int *next_index);
+static void clear_fid_map_entry(uint64_t file_id, int64_t sf_context_id);
+static herr_t ioc_open_files(int64_t file_context_id, int file_acc_flags);
static herr_t create_config_file(subfiling_context_t *sf_context, const char *base_filename,
const char *subfile_dir, hbool_t truncate_if_exists);
-static herr_t open_config_file(subfiling_context_t *sf_context, const char *base_filename,
- const char *subfile_dir, const char *mode, FILE **config_file_out);
-
-static int get_next_fid_map_index(void);
-static void clear_fid_map_entry(void *file_handle, int64_t sf_context_id);
-static int compare_hostid(const void *h1, const void *h2);
-static herr_t get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type,
- char **ioc_sel_info_str);
-static int count_nodes(sf_topology_t *info, MPI_Comm comm);
-static herr_t gather_topology_info(sf_topology_t *info, MPI_Comm comm);
-static int identify_ioc_ranks(sf_topology_t *info, int node_count, int iocs_per_node);
-static inline void assign_ioc_ranks(sf_topology_t *app_topology, int ioc_count, int rank_multiple);
-
-static int
-get_next_fid_map_index(void)
-{
- int index = 0;
-
- HDassert(sf_open_file_map || (sf_file_map_size == 0));
-
- for (int i = 0; i < sf_file_map_size; i++) {
- if (sf_open_file_map[i].file_handle == NULL) {
- index = i;
- break;
- }
- }
-
- /* A valid index should always be found here */
- HDassert(index >= 0);
- HDassert((sf_file_map_size == 0) || (index < sf_file_map_size));
-
- return index;
-}
-
-/*-------------------------------------------------------------------------
- * Function: clear_fid_map_entry
- *
- * Purpose: Remove the map entry associated with the file->inode.
- * This is done at file close.
- *
- * Return: None
- * Errors: Cannot fail.
- *
- * Programmer: Richard Warren
- * 7/17/2020
- *
- * Changes: Initial Version/None.
- *
- *-------------------------------------------------------------------------
- */
-static void
-clear_fid_map_entry(void *file_handle, int64_t sf_context_id)
-{
- if (sf_open_file_map) {
- for (int i = 0; i < sf_file_map_size; i++) {
- if ((sf_open_file_map[i].file_handle == file_handle) &&
- (sf_open_file_map[i].sf_context_id == sf_context_id)) {
- sf_open_file_map[i].file_handle = NULL;
- sf_open_file_map[i].sf_context_id = -1;
- return;
- }
- }
- }
-} /* end clear_fid_map_entry() */
-
-/*
- * ---------------------------------------------------
- * Topology discovery related functions for choosing
- * I/O Concentrator (IOC) ranks.
- * Currently, the default approach for assigning an IOC
- * is select the lowest MPI rank on each node.
- *
- * The approach collectively generates N tuples
- * consisting of the MPI rank and hostid. This
- * collection is then sorted by hostid and scanned
- * to identify the IOC ranks.
- *
- * As time permits, addition assignment methods will
- * be implemented, e.g. 1-per-Nranks or via a config
- * option. Additional selection methodologies can
- * be included as users get more experience using the
- * subfiling implementation.
- * ---------------------------------------------------
- */
-
-/*-------------------------------------------------------------------------
- * Function: compare_hostid
- *
- * Purpose: qsort sorting function.
- * Compares tuples of 'layout_t'. The sorting is based on
- * the long hostid values.
- *
- * Return: result of: (hostid1 > hostid2)
- *
- * Programmer: Richard Warren
- * 7/17/2020
- *
- * Changes: Initial Version/None.
- *
- *-------------------------------------------------------------------------
- */
-static int
-compare_hostid(const void *h1, const void *h2)
-{
- const layout_t *host1 = (const layout_t *)h1;
- const layout_t *host2 = (const layout_t *)h2;
- return (host1->hostid > host2->hostid);
-}
-
-/*
--------------------------------------------------------------------------
- Programmer: Richard Warren
- Purpose: Return a character string which represents either the
- default selection method: SELECT_IOC_ONE_PER_NODE; or
- if the user has selected a method via the environment
- variable (H5FD_SUBFILING_IOC_SELECTION_CRITERIA), we
- return that along with any optional qualifier with for
- that method.
-
- Errors: None.
-
- Revision History -- Initial implementation
--------------------------------------------------------------------------
-*/
-static herr_t
-get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type, char **ioc_sel_info_str)
-{
- char *opt_value = NULL;
- char *env_value = HDgetenv(H5FD_SUBFILING_IOC_SELECTION_CRITERIA);
- herr_t ret_value = SUCCEED;
-
- HDassert(ioc_selection_type);
- HDassert(ioc_sel_info_str);
-
- *ioc_sel_info_str = NULL;
-
- if (env_value) {
- long check_value;
-
- /*
- * For non-default options, the environment variable
- * should have the following form: integer:[integer|string]
- * In particular, EveryNthRank == 1:64 or every 64 ranks assign an IOC
- * or WithConfig == 2:/<full_path_to_config_file>
- */
- if ((opt_value = HDstrchr(env_value, ':')))
- *opt_value++ = '\0';
-
- errno = 0;
- check_value = HDstrtol(env_value, NULL, 0);
-
- if (errno == ERANGE)
- H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
- "couldn't parse value from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA
- " environment variable");
-
- if ((check_value < 0) || (check_value >= ioc_selection_options))
- H5_SUBFILING_GOTO_ERROR(
- H5E_VFL, H5E_BADVALUE, FAIL,
- "invalid IOC selection type value %ld from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA
- " environment variable",
- check_value);
-
- *ioc_selection_type = (H5FD_subfiling_ioc_select_t)check_value;
- *ioc_sel_info_str = opt_value;
- }
-
-done:
- H5_SUBFILING_FUNC_LEAVE;
-}
-
-/*-------------------------------------------------------------------------
- * Function: count_nodes
- *
- * Purpose: Initializes the sorted collection of hostid+mpi_rank
- * tuples. After initialization, the collection is scanned
- * to determine the number of unique hostid entries. This
- * value will determine the number of actual I/O concentrators
- * that available to the application. A side effect is to
- * identify the 'node_index' of the current process.
- *
- * Return: The number of unique hostid's (nodes).
- * Errors: MPI_Abort if memory cannot be allocated.
- *
- * Programmer: Richard Warren
- * 7/17/2020
- *
- * Changes: Initial Version/None.
- *
- *-------------------------------------------------------------------------
- */
-static int
-count_nodes(sf_topology_t *info, MPI_Comm comm)
-{
- app_layout_t *app_layout = NULL;
- long nextid;
- int node_count;
- int hostid_index = -1;
- int my_rank;
- int mpi_code;
- int ret_value = 0;
-
- HDassert(info);
- HDassert(info->app_layout);
- HDassert(info->app_layout->layout);
- HDassert(info->app_layout->node_ranks);
- HDassert(MPI_COMM_NULL != comm);
-
- if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &my_rank)))
- H5_SUBFILING_MPI_GOTO_ERROR(-1, "MPI_Comm_rank failed", mpi_code);
-
- app_layout = info->app_layout;
- node_count = app_layout->node_count;
-
- nextid = app_layout->layout[0].hostid;
- /* Possibly record my hostid_index */
- if (app_layout->layout[0].rank == my_rank) {
- hostid_index = 0;
- }
-
- app_layout->node_ranks[0] = 0; /* Add index */
- node_count = 1;
-
- /* Recall that the topology array has been sorted! */
- for (int k = 1; k < app_layout->world_size; k++) {
- /* Possibly record my hostid_index */
- if (app_layout->layout[k].rank == my_rank)
- hostid_index = k;
- if (app_layout->layout[k].hostid != nextid) {
- nextid = app_layout->layout[k].hostid;
- /* Record the index of new hostid */
- app_layout->node_ranks[node_count++] = k;
- }
- }
-
- /* Mark the end of the node_ranks */
- app_layout->node_ranks[node_count] = app_layout->world_size;
- /* Save the index where we first located my hostid */
- app_layout->node_index = hostid_index;
-
- app_layout->node_count = node_count;
-
- ret_value = node_count;
-
-done:
- H5_SUBFILING_FUNC_LEAVE;
-}
-
-/*-------------------------------------------------------------------------
- * Function: gather_topology_info
- *
- * Purpose: Collectively generate a sorted collection of hostid+mpi_rank
- * tuples. The result is returned in the 'topology' field
- * of the sf_topology_t structure.
- *
- * Return: Non-negative on success/Negative on failure
- *
- * Programmer: Richard Warren
- * 7/17/2020
- *
- * Changes: Initial Version/None.
- *
- *-------------------------------------------------------------------------
- */
-static herr_t
-gather_topology_info(sf_topology_t *info, MPI_Comm comm)
-{
- app_layout_t *app_layout = NULL;
- layout_t my_hostinfo;
- long hostid;
- int sf_world_size;
- int sf_world_rank;
- herr_t ret_value = SUCCEED;
-
- HDassert(info);
- HDassert(info->app_layout);
- HDassert(info->app_layout->layout);
- HDassert(MPI_COMM_NULL != comm);
-
- app_layout = info->app_layout;
- sf_world_size = app_layout->world_size;
- sf_world_rank = app_layout->world_rank;
-
- hostid = gethostid();
-
- my_hostinfo.hostid = hostid;
- my_hostinfo.rank = sf_world_rank;
-
- app_layout->hostid = hostid;
- app_layout->layout[sf_world_rank] = my_hostinfo;
-
- if (sf_world_size > 1) {
- int mpi_code;
-
- if (MPI_SUCCESS !=
- (mpi_code = MPI_Allgather(&my_hostinfo, 2, MPI_LONG, app_layout->layout, 2, MPI_LONG, comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code);
-
- HDqsort(app_layout->layout, (size_t)sf_world_size, sizeof(layout_t), compare_hostid);
- }
-
-done:
- H5_SUBFILING_FUNC_LEAVE;
-}
-
-/*-------------------------------------------------------------------------
- * Function: identify_ioc_ranks
- *
- * Purpose: We've already identified the number of unique nodes and
- * have a sorted list layout_t structures. Under normal
- * conditions, we only utilize a single IOC per node. Under
- * that circumstance, we only need to fill the io_concentrator
- * vector from the node_ranks array (which contains the index
- * into the layout array of lowest MPI rank on each node) into
- * the io_concentrator vector;
- * Otherwise, while determining the number of local_peers per
- * node, we can also select one or more additional IOCs.
- *
- * As a side effect, we fill the 'ioc_concentrator' vector
- * and set the 'rank_is_ioc' flag to TRUE if our rank is
- * identified as owning an I/O Concentrator (IOC).
- *
- *-------------------------------------------------------------------------
- */
-static int
-identify_ioc_ranks(sf_topology_t *info, int node_count, int iocs_per_node)
-{
- app_layout_t *app_layout = NULL;
- int total_ioc_count = 0;
-
- HDassert(info);
- HDassert(info->app_layout);
-
- app_layout = info->app_layout;
-
- for (int n = 0; n < node_count; n++) {
- int node_index = app_layout->node_ranks[n];
- int local_peer_count = app_layout->node_ranks[n + 1] - app_layout->node_ranks[n];
-
- info->io_concentrators[total_ioc_count++] = (int)(app_layout->layout[node_index++].rank);
-
- if (app_layout->layout[node_index - 1].rank == app_layout->world_rank) {
- info->subfile_rank = total_ioc_count - 1;
- info->rank_is_ioc = TRUE;
- }
-
- for (int k = 1; k < iocs_per_node; k++) {
- if (k < local_peer_count) {
- if (app_layout->layout[node_index].rank == app_layout->world_rank) {
- info->rank_is_ioc = TRUE;
- info->subfile_rank = total_ioc_count;
- }
- info->io_concentrators[total_ioc_count++] = (int)(app_layout->layout[node_index++].rank);
- }
- }
- }
-
- info->n_io_concentrators = total_ioc_count;
-
- return total_ioc_count;
-} /* end identify_ioc_ranks() */
-
-static inline void
-assign_ioc_ranks(sf_topology_t *app_topology, int ioc_count, int rank_multiple)
-{
- app_layout_t *app_layout = NULL;
- int *io_concentrators = NULL;
-
- HDassert(app_topology);
- HDassert(app_topology->app_layout);
- HDassert(app_topology->io_concentrators);
-
- app_layout = app_topology->app_layout;
- io_concentrators = app_topology->io_concentrators;
-
- /* fill the io_concentrators values based on the application layout */
- if (io_concentrators) {
- int ioc_index;
- for (int k = 0, ioc_next = 0; ioc_next < ioc_count; ioc_next++) {
- ioc_index = rank_multiple * k++;
- io_concentrators[ioc_next] = (int)(app_layout->layout[ioc_index].rank);
- if (io_concentrators[ioc_next] == app_layout->world_rank) {
- app_topology->subfile_rank = ioc_next;
- app_topology->rank_is_ioc = TRUE;
- }
- }
- app_topology->n_io_concentrators = ioc_count;
- }
-} /* end assign_ioc_ranks() */
+static herr_t open_config_file(const char *base_filename, const char *subfile_dir, uint64_t file_id,
+ const char *mode, FILE **config_file_out);
/*-------------------------------------------------------------------------
* Function: H5_new_subfiling_object_id
@@ -459,10 +88,19 @@ assign_ioc_ranks(sf_topology_t *app_topology, int ioc_count, int rank_multiple)
*-------------------------------------------------------------------------
*/
int64_t
-H5_new_subfiling_object_id(sf_obj_type_t obj_type, int64_t index_val)
+H5_new_subfiling_object_id(sf_obj_type_t obj_type)
{
- if (obj_type != SF_CONTEXT && obj_type != SF_TOPOLOGY)
+ int64_t index_val = 0;
+
+ if (obj_type == SF_CONTEXT) {
+ index_val = (int64_t)sf_context_cache_num_entries;
+ }
+ else if (obj_type == SF_TOPOLOGY) {
+ index_val = (int64_t)sf_topology_cache_num_entries;
+ }
+ else
return -1;
+
if (index_val < 0)
return -1;
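Since the object type is later recovered with `(object_id >> 32) & 0x0FFFF`, the ID evidently packs the object type into the upper bits and the cache index into the lower 32 bits. A minimal editorial sketch of that layout follows; the encode expression and the local enum values are assumptions, not copied from the patch:

    #include <stdint.h>

    enum sketch_obj_type { SKETCH_SF_CONTEXT = 1, SKETCH_SF_TOPOLOGY = 2 };

    /* Assumed encoding: type in bits 32+, cache index in the low 32 bits */
    static int64_t
    sketch_encode_id(enum sketch_obj_type type, int64_t index_val)
    {
        return ((int64_t)type << 32) | (index_val & 0xFFFFFFFF);
    }

    /* Decode mirrors the expression used elsewhere in this file */
    static int64_t
    sketch_decode_type(int64_t object_id)
    {
        return (object_id >> 32) & 0x0FFFF;
    }

    static int64_t
    sketch_decode_index(int64_t object_id)
    {
        return object_id & 0xFFFFFFFF;
    }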
@@ -492,12 +130,6 @@ H5_new_subfiling_object_id(sf_obj_type_t obj_type, int64_t index_val)
*
*-------------------------------------------------------------------------
*/
-/*
- * TODO: we don't appear to ever use this for retrieving a subfile topology
- * object. Might be able to refactor to just return a subfile context
- * object.
- */
-/* TODO: no way of freeing caches on close currently */
void *
H5_get_subfiling_object(int64_t object_id)
{
@@ -512,7 +144,7 @@ H5_get_subfiling_object(int64_t object_id)
if (obj_type == SF_CONTEXT) {
/* Contexts provide information principally about
* the application and how the data layout is managed
- * over some number of sub-files. The important
+ * over some number of subfiles. The important
* parameters are the number of subfiles (or, in the
* context of IOCs, the MPI ranks and counts of the
* processes which host an I/O Concentrator). We
@@ -522,58 +154,121 @@ H5_get_subfiling_object(int64_t object_id)
/* Create subfiling context cache if it doesn't exist */
if (!sf_context_cache) {
- if (NULL == (sf_context_cache = HDcalloc(sf_context_cache_limit, sizeof(subfiling_context_t))))
+ if (NULL == (sf_context_cache = HDcalloc(DEFAULT_CONTEXT_CACHE_SIZE, sizeof(*sf_context_cache))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL,
"couldn't allocate space for subfiling context cache");
+ sf_context_cache_size = DEFAULT_CONTEXT_CACHE_SIZE;
+ sf_context_cache_num_entries = 0;
}
/* Make more space in context cache if needed */
- if ((size_t)obj_index == sf_context_cache_limit) {
+ if ((size_t)obj_index >= sf_context_cache_size) {
size_t old_num_entries;
+ size_t new_size;
void *tmp_realloc;
- old_num_entries = sf_context_cache_limit;
+ old_num_entries = sf_context_cache_num_entries;
- sf_context_cache_limit *= 2;
+ new_size = (sf_context_cache_size * 3) / 2;
- if (NULL == (tmp_realloc = HDrealloc(sf_context_cache,
- sf_context_cache_limit * sizeof(subfiling_context_t))))
+ if (NULL == (tmp_realloc = HDrealloc(sf_context_cache, new_size * sizeof(*sf_context_cache))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL,
"couldn't allocate space for subfiling context cache");
- sf_context_cache = tmp_realloc;
+ sf_context_cache = tmp_realloc;
+ sf_context_cache_size = new_size;
/* Clear newly-allocated entries */
- HDmemset(&sf_context_cache[obj_index], 0,
- (sf_context_cache_limit - old_num_entries) * sizeof(subfiling_context_t));
+ HDmemset(&sf_context_cache[old_num_entries], 0,
+ (sf_context_cache_size - old_num_entries) * sizeof(*sf_context_cache));
+
+ /*
+ * If we had to make more space, the given object index
+ * should always fall within range after a single re-allocation
+ */
+ HDassert((size_t)obj_index < sf_context_cache_size);
}
- /* Return direct pointer to the context cache entry */
- return (void *)&sf_context_cache[obj_index];
+ /*
+ * Since this cache currently just keeps all entries until
+ * application exit, context entry indices should just be
+ * consecutive
+ */
+ HDassert((size_t)obj_index <= sf_context_cache_num_entries);
+ if ((size_t)obj_index < sf_context_cache_num_entries)
+ ret_value = sf_context_cache[obj_index];
+ else {
+ HDassert(!sf_context_cache[sf_context_cache_num_entries]);
+
+ /* Allocate a new subfiling context object */
+ if (NULL == (ret_value = HDcalloc(1, sizeof(subfiling_context_t))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL,
+ "couldn't allocate subfiling context object");
+
+ sf_context_cache[sf_context_cache_num_entries++] = ret_value;
+ }
}
else if (obj_type == SF_TOPOLOGY) {
/* Create subfiling topology cache if it doesn't exist */
if (!sf_topology_cache) {
- if (NULL == (sf_topology_cache = HDcalloc(sf_topology_cache_limit, sizeof(sf_topology_t))))
+ if (NULL ==
+ (sf_topology_cache = HDcalloc(DEFAULT_TOPOLOGY_CACHE_SIZE, sizeof(*sf_topology_cache))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL,
"couldn't allocate space for subfiling topology cache");
+ sf_topology_cache_size = DEFAULT_TOPOLOGY_CACHE_SIZE;
+ sf_topology_cache_num_entries = 0;
}
- /* We will likely only cache a single topology
- * which is that of the original parallel application.
- * In that context, we will identify the number of
- * nodes along with the number of MPI ranks on a node.
+ /* Make more space in topology cache if needed */
+ if ((size_t)obj_index >= sf_topology_cache_size) {
+ size_t old_num_entries;
+ size_t new_size;
+ void *tmp_realloc;
+
+ old_num_entries = sf_topology_cache_num_entries;
+
+ new_size = (sf_topology_cache_size * 3) / 2;
+
+ if (NULL == (tmp_realloc = HDrealloc(sf_topology_cache, new_size * sizeof(*sf_topology_cache))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL,
+ "couldn't allocate space for subfiling topology cache");
+
+ sf_topology_cache = tmp_realloc;
+ sf_topology_cache_size = new_size;
+
+ /* Clear newly-allocated entries */
+ HDmemset(&sf_topology_cache[old_num_entries], 0,
+ (sf_topology_cache_size - old_num_entries) * sizeof(*sf_topology_cache));
+
+ /*
+ * If we had to make more space, the given object index
+ * should always fall within range after a single re-allocation
+ */
+ HDassert((size_t)obj_index < sf_topology_cache_size);
+ }
+
+ /*
+ * Since this cache currently just keeps all entries until
+ * application exit, topology entry indices should just be
+ * consecutive
*/
- if ((size_t)obj_index >= sf_topology_cache_limit)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL,
- "invalid object index for subfiling topology object ID");
+ HDassert((size_t)obj_index <= sf_topology_cache_num_entries);
+ if ((size_t)obj_index < sf_topology_cache_num_entries)
+ ret_value = sf_topology_cache[obj_index];
+ else {
+ HDassert(!sf_topology_cache[sf_topology_cache_num_entries]);
+
+ /* Allocate a new subfiling topology object */
+ if (NULL == (ret_value = HDmalloc(sizeof(sf_topology_t))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL,
+ "couldn't allocate subfiling topology object");
- /* Return direct pointer to the topology cache entry */
- return (void *)&sf_topology_cache[obj_index];
+ sf_topology_cache[sf_topology_cache_num_entries++] = ret_value;
+ }
}
-
#ifdef H5_SUBFILING_DEBUG
- HDprintf("%s: Unknown subfiling object type for ID %" PRId64 "\n", __func__, object_id);
+ else
+ HDprintf("%s: Unknown subfiling object type for ID %" PRId64 "\n", __func__, object_id);
#endif
done:
@@ -586,27 +281,55 @@ done:
* Purpose: Frees the underlying subfiling object for a given subfiling
* object ID.
*
+ * NOTE: Currently we assume that all created subfiling
+ * objects are cached in the (very simple) context/topology
+ * cache until application exit, so the only time a subfiling
+ * object should be freed by this routine is if something
+ * fails right after creating one. Otherwise, the internal
+ * indexing for the relevant cache will be invalid.
+ *
* Return: Non-negative on success/Negative on failure
*
*-------------------------------------------------------------------------
*/
-herr_t
+static herr_t
H5_free_subfiling_object(int64_t object_id)
{
- subfiling_context_t *sf_context = NULL;
- int64_t obj_type = (object_id >> 32) & 0x0FFFF;
- herr_t ret_value = SUCCEED;
+ int64_t obj_type = (object_id >> 32) & 0x0FFFF;
+ herr_t ret_value = SUCCEED;
- if (obj_type != SF_CONTEXT)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid subfiling object type for ID %" PRId64,
- object_id);
+ if (obj_type == SF_CONTEXT) {
+ subfiling_context_t *sf_context;
- if (NULL == (sf_context = H5_get_subfiling_object(object_id)))
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
- "couldn't get subfiling context for subfiling object ID");
+ if (NULL == (sf_context = H5_get_subfiling_object(object_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "couldn't get subfiling context for subfiling object ID");
+
+ if (H5_free_subfiling_object_int(sf_context) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object");
+
+ HDassert(sf_context_cache_num_entries > 0);
+ HDassert(sf_context == sf_context_cache[sf_context_cache_num_entries - 1]);
+ sf_context_cache[sf_context_cache_num_entries - 1] = NULL;
+ sf_context_cache_num_entries--;
+ }
+ else {
+ sf_topology_t *sf_topology;
+
+ HDassert(obj_type == SF_TOPOLOGY);
- if (H5_free_subfiling_object_int(sf_context) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object");
+ if (NULL == (sf_topology = H5_get_subfiling_object(object_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "couldn't get subfiling context for subfiling object ID");
+
+ if (H5_free_subfiling_topology(sf_topology) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling topology");
+
+ HDassert(sf_topology_cache_num_entries > 0);
+ HDassert(sf_topology == sf_topology_cache[sf_topology_cache_num_entries - 1]);
+ sf_topology_cache[sf_topology_cache_num_entries - 1] = NULL;
+ sf_topology_cache_num_entries--;
+ }
done:
H5_SUBFILING_FUNC_LEAVE;
@@ -617,25 +340,10 @@ H5_free_subfiling_object_int(subfiling_context_t *sf_context)
{
HDassert(sf_context);
-#ifdef H5_SUBFILING_DEBUG
- if (sf_context->sf_logfile) {
- struct tm *tm = NULL;
- time_t cur_time;
-
- cur_time = time(NULL);
- tm = localtime(&cur_time);
-
- H5_subfiling_log(sf_context->sf_context_id, "\n-- LOGGING FINISH - %s", asctime(tm));
-
- HDfclose(sf_context->sf_logfile);
- sf_context->sf_logfile = NULL;
- }
-#endif
-
sf_context->sf_context_id = -1;
sf_context->h5_file_id = UINT64_MAX;
- sf_context->h5_file_handle = NULL;
- sf_context->sf_fid = -1;
+ sf_context->sf_num_fids = 0;
+ sf_context->sf_num_subfiles = -1;
sf_context->sf_write_count = 0;
sf_context->sf_read_count = 0;
sf_context->sf_eof = HADDR_UNDEF;
@@ -658,52 +366,63 @@ H5_free_subfiling_object_int(subfiling_context_t *sf_context)
return FAIL;
sf_context->sf_eof_comm = MPI_COMM_NULL;
}
- if (sf_context->sf_barrier_comm != MPI_COMM_NULL) {
- if (H5_mpi_comm_free(&sf_context->sf_barrier_comm) < 0)
+ if (sf_context->sf_node_comm != MPI_COMM_NULL) {
+ if (H5_mpi_comm_free(&sf_context->sf_node_comm) < 0)
return FAIL;
- sf_context->sf_barrier_comm = MPI_COMM_NULL;
+ sf_context->sf_node_comm = MPI_COMM_NULL;
}
if (sf_context->sf_group_comm != MPI_COMM_NULL) {
if (H5_mpi_comm_free(&sf_context->sf_group_comm) < 0)
return FAIL;
sf_context->sf_group_comm = MPI_COMM_NULL;
}
- if (sf_context->sf_intercomm != MPI_COMM_NULL) {
- if (H5_mpi_comm_free(&sf_context->sf_intercomm) < 0)
- return FAIL;
- sf_context->sf_intercomm = MPI_COMM_NULL;
- }
- sf_context->sf_group_size = -1;
- sf_context->sf_group_rank = -1;
- sf_context->sf_intercomm_root = -1;
+ sf_context->sf_group_size = -1;
+ sf_context->sf_group_rank = -1;
HDfree(sf_context->subfile_prefix);
sf_context->subfile_prefix = NULL;
- HDfree(sf_context->sf_filename);
- sf_context->sf_filename = NULL;
-
HDfree(sf_context->h5_filename);
sf_context->h5_filename = NULL;
- if (H5_free_subfiling_topology(sf_context->topology) < 0)
- return FAIL;
+ HDfree(sf_context->sf_fids);
+ sf_context->sf_fids = NULL;
+
+ /*
+ * Currently we assume that all created application topology
+ * objects are cached until application exit and may be shared
+ * among multiple subfiling contexts, so we free them elsewhere,
+ * rather than here, to avoid issues with stale pointers.
+ */
sf_context->topology = NULL;
+ HDfree(sf_context);
+
return SUCCEED;
}
static herr_t
H5_free_subfiling_topology(sf_topology_t *topology)
{
+ herr_t ret_value = SUCCEED;
+
HDassert(topology);
- topology->subfile_rank = -1;
- topology->n_io_concentrators = 0;
+#ifndef NDEBUG
+ {
+ hbool_t topology_cached = FALSE;
- HDfree(topology->subfile_fd);
- topology->subfile_fd = NULL;
+ /* Make sure this application topology object is in the cache */
+ for (size_t i = 0; i < sf_topology_cache_num_entries; i++)
+ if (topology == sf_topology_cache[i])
+ topology_cached = TRUE;
+ HDassert(topology_cached);
+ }
+#endif
+
+ topology->ioc_idx = -1;
+ topology->n_io_concentrators = 0;
if (topology->app_layout) {
HDfree(topology->app_layout->layout);
@@ -720,9 +439,134 @@ H5_free_subfiling_topology(sf_topology_t *topology)
HDfree(topology->io_concentrators);
topology->io_concentrators = NULL;
+ if (H5_mpi_comm_free(&topology->app_comm) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
+
HDfree(topology);
- return SUCCEED;
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_open_subfiling_stub_file
+ *
+ * Purpose: Opens the stub file for an HDF5 file created with the
+ * Subfiling VFD. This stub file only contains some superblock
+ * metadata that can allow HDF5 applications to determine that
+ * the file is an HDF5 file and was created with the Subfiling
+ * VFD.
+ *
+ * This routine is collective across `file_comm`; once the
+ * stub file has been opened, the inode value for the file is
+ * retrieved and broadcasted to all MPI ranks in `file_comm`
+ * for future use.
+ *
+ * To avoid unnecessary overhead from a large-scale file open,
+ * this stub file is currently only opened on MPI rank 0. Note
+ * that this assumes that all the relevant metadata will be
+ * written from MPI rank 0. This should be fine for now since
+ * the HDF5 file signature and Subfiling driver info are really
+ * all that's needed, but this should be revisited since the
+ * file metadata can and will come from other MPI ranks as
+ * well.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_open_subfiling_stub_file(const char *name, unsigned flags, MPI_Comm file_comm, H5FD_t **file_ptr,
+ uint64_t *file_id)
+{
+ H5P_genplist_t *plist = NULL;
+ uint64_t stub_file_id = UINT64_MAX;
+ hbool_t bcasted_inode = FALSE;
+ H5FD_t *stub_file = NULL;
+ hid_t fapl_id = H5I_INVALID_HID;
+ int mpi_rank = 0;
+ int mpi_size = 1;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ if (!name)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling stub file name");
+ if (file_comm == MPI_COMM_NULL)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid MPI communicator");
+ if (!file_id)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL file ID pointer");
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(file_comm, &mpi_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_comm, &mpi_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ if (!file_ptr && (mpi_rank == 0))
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL stub file pointer");
+
+ /* Open stub file on MPI rank 0 only */
+ if (mpi_rank == 0) {
+ h5_stat_t st;
+ MPI_Comm stub_comm = MPI_COMM_SELF;
+ MPI_Info stub_info = MPI_INFO_NULL;
+
+ if ((fapl_id = H5P_create_id(H5P_CLS_FILE_ACCESS_g, FALSE)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL, "can't create FAPL for stub file");
+ if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access property list");
+
+ /* Use MPI I/O driver for stub file to allow access to vector I/O */
+ if (H5P_set(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &stub_comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI communicator");
+ if (H5P_set(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &stub_info) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI info object");
+ if (H5P_set_driver(plist, H5FD_MPIO, NULL, NULL) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI I/O driver on FAPL");
+
+ if (NULL == (stub_file = H5FD_open(name, flags, fapl_id, HADDR_UNDEF)))
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "couldn't open HDF5 stub file");
+
+ HDcompile_assert(sizeof(uint64_t) >= sizeof(ino_t));
+
+ /* Retrieve Inode value for stub file */
+ if (HDstat(name, &st) < 0) {
+ stub_file_id = UINT64_MAX;
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "couldn't stat HDF5 stub file, errno = %d, error message = '%s'", errno,
+ HDstrerror(errno));
+ }
+ else
+ stub_file_id = (uint64_t)st.st_ino;
+ }
+
+ bcasted_inode = TRUE;
+
+ if (mpi_size > 1) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&stub_file_id, 1, MPI_UINT64_T, 0, file_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+
+ if (stub_file_id == UINT64_MAX)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "couldn't get inode value for HDF5 stub file");
+
+ if (file_ptr)
+ *file_ptr = stub_file;
+ *file_id = stub_file_id;
+
+done:
+ if (fapl_id >= 0 && H5I_dec_ref(fapl_id) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_ID, H5E_CANTDEC, FAIL, "can't close FAPL ID");
+
+ if (ret_value < 0) {
+ if (!bcasted_inode && (mpi_size > 1)) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&stub_file_id, 1, MPI_UINT64_T, 0, file_comm)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+ if (stub_file) {
+ if (H5FD_close(stub_file) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "couldn't close HDF5 stub file");
+ }
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
}
/*-------------------------------------------------------------------------
@@ -752,16 +596,12 @@ H5_free_subfiling_topology(sf_topology_t *topology)
* Changes: Initial Version/None.
*-------------------------------------------------------------------------
*/
-/* TODO: revise description */
herr_t
-H5_open_subfiles(const char *base_filename, void *file_handle,
- H5FD_subfiling_shared_config_t *subfiling_config, int file_acc_flags, MPI_Comm file_comm,
- int64_t *context_id_out)
+H5_open_subfiles(const char *base_filename, uint64_t file_id, H5FD_subfiling_params_t *subfiling_config,
+ int file_acc_flags, MPI_Comm file_comm, int64_t *context_id_out)
{
subfiling_context_t *sf_context = NULL;
int64_t context_id = -1;
- int l_errors = 0;
- int g_errors = 0;
int mpi_code;
herr_t ret_value = SUCCEED;
@@ -775,20 +615,13 @@ H5_open_subfiles(const char *base_filename, void *file_handle,
H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "invalid subfiling context ID pointer");
/* Initialize new subfiling context ID based on configuration information */
- if (init_subfiling(subfiling_config, file_comm, &context_id) < 0)
+ if (init_subfiling(base_filename, file_id, subfiling_config, file_acc_flags, file_comm, &context_id) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize subfiling context");
/* Retrieve the subfiling object for the newly-created context ID */
if (NULL == (sf_context = H5_get_subfiling_object(context_id)))
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't get subfiling object from context ID");
- /* Save some basic things in the new subfiling context */
- sf_context->h5_file_handle = file_handle;
-
- if (NULL == (sf_context->h5_filename = HDstrdup(base_filename)))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "couldn't allocate space for subfiling filename");
-
/*
* If we're actually using the IOCs, we will
* start the service threads on the identified
@@ -802,7 +635,6 @@ H5_open_subfiles(const char *base_filename, void *file_handle,
struct tm *tm = NULL;
time_t cur_time;
int mpi_rank;
- int mpi_code;
/* Open debugging logfile */
@@ -825,24 +657,30 @@ H5_open_subfiles(const char *base_filename, void *file_handle,
*context_id_out = context_id;
done:
- if (ret_value < 0) {
- l_errors = 1;
- }
-
/*
* Form consensus on whether opening subfiles was
* successful
*/
- if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&l_errors, &g_errors, 1, MPI_INT, MPI_SUM, file_comm)))
- H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Allreduce failed", mpi_code);
+ {
+ int mpi_size = -1;
+ int err_result = (ret_value < 0);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_comm, &mpi_size)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
- if (g_errors > 0) {
- H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL,
- "one or more IOC ranks couldn't open subfiles");
+ if (mpi_size > 1) {
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Allreduce(MPI_IN_PLACE, &err_result, 1, MPI_INT, MPI_MAX, file_comm)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Allreduce failed", mpi_code);
+ }
+
+ if (err_result)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL,
+ "one or more IOC ranks couldn't open subfiles");
}
if (ret_value < 0) {
- clear_fid_map_entry(file_handle, context_id);
+ clear_fid_map_entry(file_id, context_id);
if (context_id >= 0 && H5_free_subfiling_object(context_id) < 0)
H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object");
@@ -873,54 +711,175 @@ done:
-------------------------------------------------------------------------
*/
static herr_t
-init_subfiling(H5FD_subfiling_shared_config_t *subfiling_config, MPI_Comm comm, int64_t *context_id_out)
+init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_params_t *subfiling_config,
+ int file_acc_flags, MPI_Comm comm, int64_t *context_id_out)
{
- subfiling_context_t *new_context = NULL;
- sf_topology_t *app_topology = NULL;
- int64_t context_id = -1;
- int file_index = -1;
- herr_t ret_value = SUCCEED;
+ subfiling_context_t *new_context = NULL;
+ sf_topology_t *app_topology = NULL;
+ MPI_Comm node_comm = MPI_COMM_NULL;
+ int64_t context_id = -1;
+ FILE *config_file = NULL;
+ char *file_basename = NULL;
+ char *subfile_dir = NULL;
+ int mpi_rank;
+ int mpi_size;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
HDassert(context_id_out);
- file_index = get_next_fid_map_index();
- HDassert(file_index >= 0);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
/* Use the file's index to create a new subfiling context ID */
- if ((context_id = H5_new_subfiling_object_id(SF_CONTEXT, file_index)) < 0)
+ if ((context_id = H5_new_subfiling_object_id(SF_CONTEXT)) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't create new subfiling context ID");
/* Create a new subfiling context object with the created context ID */
if (NULL == (new_context = H5_get_subfiling_object(context_id)))
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't create new subfiling object");
+ new_context->sf_context_id = -1;
+ new_context->topology = NULL;
+ new_context->sf_msg_comm = MPI_COMM_NULL;
+ new_context->sf_data_comm = MPI_COMM_NULL;
+ new_context->sf_eof_comm = MPI_COMM_NULL;
+ new_context->sf_node_comm = MPI_COMM_NULL;
+ new_context->sf_group_comm = MPI_COMM_NULL;
+
+ /*
+ * If there's an existing subfiling configuration file for
+ * this file, read the stripe size and number of subfiles
+ * from it
+ */
+ if (0 == (file_acc_flags & O_CREAT)) {
+ int64_t config[2] = {0, 0}; /* {stripe size, num subfiles} */
+
+ if (mpi_rank == 0) {
+ /* TODO: currently no support for subfile prefix */
+ if (H5_dirname(base_filename, &subfile_dir) < 0)
+ config[0] = -1;
+
+ if (config[0] >= 0) {
+ if (H5_basename(base_filename, &file_basename) < 0)
+ config[0] = -1;
+ }
+
+ if (config[0] >= 0) {
+ if (open_config_file(file_basename, subfile_dir, file_id, "r", &config_file) < 0)
+ config[0] = -1;
+ }
+
+ if (config[0] >= 0) {
+ if (!config_file)
+ config[0] = -2; /* No config file; use setting from configuration */
+ else {
+ /*
+ * If a subfiling configuration file exists and we aren't truncating
+ * it, read the number of subfiles used at file creation time.
+ */
+ if (H5_get_subfiling_config_from_file(config_file, &config[0], &config[1]) < 0)
+ config[0] = -1;
+ }
+ }
+ }
+
+ if (mpi_size > 1) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(config, 2, MPI_INT64_T, 0, comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+
+ /*
+ * Override the stripe size and stripe count settings in the
+ * application's subfiling configuration if we read values
+ * from an existing subfiling configuration file
+ */
+ if (config[0] == -1)
+ H5_SUBFILING_GOTO_ERROR(
+ H5E_FILE, H5E_CANTOPENFILE, FAIL,
+ "lead process couldn't read the number of subfiles from subfiling configuration file");
+ else {
+ if (config[0] > 0)
+ subfiling_config->stripe_size = config[0];
+ if (config[1] > 0) {
+ H5_CHECK_OVERFLOW(config[1], int64_t, int32_t);
+ subfiling_config->stripe_count = (int32_t)config[1];
+ }
+ }
+ }
+ else {
+ char *env_value = NULL;
+
+ /* Check for a subfiling stripe size setting from the environment */
+ if ((env_value = HDgetenv(H5FD_SUBFILING_STRIPE_SIZE))) {
+ long long stripe_size = -1;
+
+ errno = 0;
+
+ stripe_size = HDstrtoll(env_value, NULL, 0);
+ if (ERANGE == errno)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL,
+ "invalid stripe size setting for " H5FD_SUBFILING_STRIPE_SIZE);
+
+ if (stripe_size > 0) {
+ subfiling_config->stripe_size = (int64_t)stripe_size;
+ }
+ }
+ }
+
+#if H5_CHECK_MPI_VERSION(3, 0)
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+
+ /* Create an MPI sub-communicator for intra-node communications */
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, mpi_rank, MPI_INFO_NULL, &node_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_split_type failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(node_comm, MPI_ERRORS_RETURN)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code);
+#else
+#error "MPI-3 required for MPI_Comm_split_type"
+#endif
/*
* Setup the application topology information, including the computed
* number and distribution map of the set of I/O concentrators
*/
- if (init_app_topology(subfiling_config->ioc_selection, comm, &app_topology) < 0)
+ if (init_app_topology(subfiling_config, comm, node_comm, &app_topology) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application topology");
new_context->sf_context_id = context_id;
- if (init_subfiling_context(new_context, subfiling_config, app_topology, comm) < 0)
+ if (init_subfiling_context(new_context, base_filename, file_id, subfiling_config, app_topology, comm) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL,
"couldn't initialize subfiling application topology object");
-
- new_context->sf_base_addr = 0;
- if (new_context->topology->rank_is_ioc) {
- new_context->sf_base_addr =
- (int64_t)(new_context->topology->subfile_rank * new_context->sf_stripe_size);
- }
+ new_context->sf_node_comm = node_comm;
*context_id_out = context_id;
done:
+ if (config_file && (EOF == HDfclose(config_file)))
+ H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL,
+ "couldn't close subfiling configuration file");
+
+ H5MM_free(file_basename);
+ H5MM_free(subfile_dir);
+
if (ret_value < 0) {
- HDfree(app_topology);
+ if (app_topology) {
+ if (H5_free_subfiling_topology(app_topology) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling topology");
+ }
+
+ if (H5_mpi_comm_free(&node_comm) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free MPI communicator");
if (context_id >= 0 && H5_free_subfiling_object(context_id) < 0)
H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object");
+
+ *context_id_out = -1;
}
H5_SUBFILING_FUNC_LEAVE;
@@ -929,76 +888,89 @@ done:
/*-------------------------------------------------------------------------
* Function: init_app_topology
*
- * Purpose: Once a sorted collection of hostid/mpi_rank tuples has been
- * created and the number of unique hostids (nodes) has
- * been determined, we may modify this "default" value for
- * the number of IO Concentrators for this application.
+ * Purpose: Determine the topology of the application so that MPI ranks
+ * can be assigned as I/O concentrators. The default is to use
+ * 1 MPI rank per node as an I/O concentrator, but this can be
+ * changed by the application's subfiling configuration, or by
+ * an environment variable (H5FD_SUBFILING_IOC_PER_NODE).
*
- * The default of one(1) IO concentrator per node can be
- * changed (principally for testing) by environment variable.
- * if IOC_COUNT_PER_NODE is defined, then that integer value
- * is utilized as a multiplier to modify the set of
- * IO Concentrator ranks.
- *
- * The cached results will be replicated within the
- * subfiling_context_t structure and is utilized as a map from
- * io concentrator rank to MPI communicator rank for message
- * sends and receives.
- *
- * Return: The number of IO Concentrator ranks. We also cache
- * the MPI ranks in the 'io_concentrator' vector variable.
- * The length of this vector is cached as 'n_io_concentrators'.
- * Errors: MPI_Abort if memory cannot be allocated.
- *
- * Programmer: Richard Warren
- * 7/17/2020
- *
- * Changes: - Initial Version/None.
- * - Updated the API to allow a variety of methods for
- * determining the number and MPI ranks that will have
- * IO Concentrators. The default approach will define
- * a single IOC per node.
+ * Return: Non-negative on success/Negative on failure
*
*-------------------------------------------------------------------------
*/
static herr_t
-init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm,
+init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm,
sf_topology_t **app_topology_out)
{
- sf_topology_t *app_topology = NULL;
- app_layout_t *app_layout = NULL;
- char *env_value = NULL;
- char *ioc_sel_str = NULL;
- long ioc_select_val = -1;
- long iocs_per_node = 1;
- int ioc_count = 0;
- int comm_rank;
- int comm_size;
- int mpi_code;
- herr_t ret_value = SUCCEED;
+ H5FD_subfiling_ioc_select_t ioc_selection_type;
+ sf_topology_t *app_topology = NULL;
+ int64_t topology_id = -1;
+ char *env_value = NULL;
+ char *ioc_sel_str = NULL;
+ long ioc_select_val = -1;
+ long iocs_per_node = 1;
+ int ioc_count = 0;
+ int rank_multiple = 1;
+ int comm_rank;
+ int comm_size;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+ HDassert(subfiling_config);
HDassert(MPI_COMM_NULL != comm);
+ HDassert(MPI_COMM_NULL != node_comm);
HDassert(app_topology_out);
HDassert(!*app_topology_out);
if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &comm_rank)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
-
if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &comm_size)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+ ioc_selection_type = subfiling_config->ioc_selection;
+
/* Check if an IOC selection type was specified by environment variable */
if (get_ioc_selection_criteria_from_env(&ioc_selection_type, &ioc_sel_str) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
"couldn't get IOC selection type from environment");
- /* Sanity checking on different IOC selection strategies */
+ /*
+ * Check parameters for the specified IOC selection strategy
+ * and determine the maximum number of I/O concentrators
+ */
switch (ioc_selection_type) {
- case SELECT_IOC_EVERY_NTH_RANK: {
- errno = 0;
+ case SELECT_IOC_ONE_PER_NODE: {
+ if (comm_size > 1) {
+ /* Check for an IOC-per-node value set in the environment */
+ if ((env_value = HDgetenv(H5FD_SUBFILING_IOC_PER_NODE))) {
+ errno = 0;
+ ioc_select_val = HDstrtol(env_value, NULL, 0);
+ if ((ERANGE == errno)) {
+ HDprintf("invalid value '%s' for " H5FD_SUBFILING_IOC_PER_NODE "\n", env_value);
+ ioc_select_val = 1;
+ }
+
+ if (ioc_select_val > 0)
+ iocs_per_node = ioc_select_val;
+ }
+ }
+
+ /* IOC count will be adjusted after number of nodes is determined */
+ H5_CHECK_OVERFLOW(iocs_per_node, long, int);
+ ioc_count = (int)iocs_per_node;
+
+ break;
+ }
+ case SELECT_IOC_EVERY_NTH_RANK: {
+ /*
+ * User specifies a rank multiple value. Selection starts
+ * with rank 0 and then the user-specified stride is applied
+ * to identify other IOC ranks.
+ */
ioc_select_val = 1;
if (ioc_sel_str) {
+ errno = 0;
ioc_select_val = HDstrtol(ioc_sel_str, NULL, 0);
if ((ERANGE == errno) || (ioc_select_val <= 0)) {
HDprintf("invalid IOC selection strategy string '%s' for strategy "
@@ -1009,20 +981,25 @@ init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm,
}
}
- break;
- }
+ H5_CHECK_OVERFLOW(ioc_select_val, long, int);
+ ioc_count = (comm_size / (int)ioc_select_val);
+
+ if ((comm_size % ioc_select_val) != 0) {
+ ioc_count++;
+ }
- case SELECT_IOC_WITH_CONFIG:
- HDprintf("SELECT_IOC_WITH_CONFIG IOC selection strategy not supported yet; defaulting to "
- "SELECT_IOC_ONE_PER_NODE\n");
- ioc_selection_type = SELECT_IOC_ONE_PER_NODE;
break;
+ }
case SELECT_IOC_TOTAL: {
- errno = 0;
-
+ /*
+ * User specifies a total number of I/O concentrators.
+ * Starting with rank 0, a stride of (mpi_size / total)
+ * is applied to identify other IOC ranks.
+ */
ioc_select_val = 1;
if (ioc_sel_str) {
+ errno = 0;
ioc_select_val = HDstrtol(ioc_sel_str, NULL, 0);
if ((ERANGE == errno) || (ioc_select_val <= 0) || (ioc_select_val >= comm_size)) {
HDprintf("invalid IOC selection strategy string '%s' for strategy SELECT_IOC_TOTAL; "
@@ -1033,113 +1010,626 @@ init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm,
}
}
+ H5_CHECK_OVERFLOW(ioc_select_val, long, int);
+ ioc_count = (int)ioc_select_val;
+
+ rank_multiple = (comm_size / ioc_count);
+
break;
}
+ case SELECT_IOC_WITH_CONFIG:
default:
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid IOC selection strategy");
+ break;
+ }
+
+ /*
+ * TODO: A different IOC selection string from the environment than what was
+ * used originally will cause the IOCs to be assigned differently than
+ * expected. While this generally shouldn't cause issues (other than
+ * for the SELECT_IOC_TOTAL case), this should still be dealt with
+ * eventually.
+ */
+ /* Check the subfiling topology cache to see if there's a matching object */
+ if (find_cached_topology_info(comm, subfiling_config, iocs_per_node, &app_topology) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "can't check for cached subfiling topology object");
+ HDassert(!app_topology || (app_topology->selection_type == ioc_selection_type));
+
+ if (!app_topology) {
+ /* Generate an ID for the application topology object */
+ if ((topology_id = H5_new_subfiling_object_id(SF_TOPOLOGY)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get ID for subfiling topology object");
+
+ /* Get a new application topology object from the cache */
+ if (NULL == (app_topology = H5_get_subfiling_object(topology_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling topology object");
+ app_topology->app_layout = NULL;
+ app_topology->app_comm = MPI_COMM_NULL;
+ app_topology->rank_is_ioc = FALSE;
+ app_topology->ioc_idx = -1;
+ app_topology->n_io_concentrators = ioc_count;
+ app_topology->io_concentrators = NULL;
+ app_topology->selection_type = ioc_selection_type;
+
+ if (H5_mpi_comm_dup(comm, &app_topology->app_comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTCOPY, FAIL, "can't duplicate MPI communicator");
+
+ if (init_app_layout(app_topology, comm, node_comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application layout");
+ HDassert(app_topology->app_layout);
+ HDassert(app_topology->app_layout->layout);
+ HDassert(app_topology->app_layout->node_ranks);
+ HDassert(app_topology->app_layout->node_count > 0);
+
+ /*
+ * Now that the application node count has been determined, adjust the
+ * number of I/O concentrators for the SELECT_IOC_ONE_PER_NODE case
+ */
+ if (app_topology->selection_type == SELECT_IOC_ONE_PER_NODE)
+ app_topology->n_io_concentrators = (int)iocs_per_node * app_topology->app_layout->node_count;
+
+ /*
+ * Make sure the number of I/O concentrators doesn't
+ * exceed the specified number of subfiles
+ */
+ if (subfiling_config->stripe_count != H5FD_SUBFILING_DEFAULT_STRIPE_COUNT) {
+ if (app_topology->n_io_concentrators > subfiling_config->stripe_count)
+ app_topology->n_io_concentrators = subfiling_config->stripe_count;
+ }
+
+ /*
+ * Determine which ranks are I/O concentrator ranks, based on the
+ * given IOC selection strategy and MPI information.
+ */
+ if (identify_ioc_ranks(app_topology, rank_multiple) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL,
+ "couldn't determine which MPI ranks are I/O concentrators");
+ }
+
+ *app_topology_out = app_topology;
+
+done:
+ if (ret_value < 0) {
+ if (app_topology && (topology_id >= 0)) {
+ if (H5_free_subfiling_object(topology_id) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free subfiling topology object");
+ }
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*
+-------------------------------------------------------------------------
+ Programmer: Richard Warren
+  Purpose:     Return the IOC selection method, which is either the
+               default selection method, SELECT_IOC_ONE_PER_NODE, or,
+               if the user has selected a method via the environment
+               variable (H5FD_SUBFILING_IOC_SELECTION_CRITERIA), that
+               method along with any optional qualifier for it.
+
+ Errors: None.
+
+ Revision History -- Initial implementation
+-------------------------------------------------------------------------
+*/
+static herr_t
+get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type, char **ioc_sel_info_str)
+{
+ char *opt_value = NULL;
+ char *env_value = HDgetenv(H5FD_SUBFILING_IOC_SELECTION_CRITERIA);
+ herr_t ret_value = SUCCEED;
+
+ HDassert(ioc_selection_type);
+ HDassert(ioc_sel_info_str);
+
+ *ioc_sel_info_str = NULL;
+
+ if (env_value) {
+ long check_value;
+
+ /*
+ * For non-default options, the environment variable
+ * should have the following form: integer:[integer|string]
+         * In particular, EveryNthRank == 1:64, i.e. every 64th rank is assigned as an IOC,
+ * or WithConfig == 2:/<full_path_to_config_file>
+ */
+ if ((opt_value = HDstrchr(env_value, ':')))
+ *opt_value++ = '\0';
+
+ errno = 0;
+ check_value = HDstrtol(env_value, NULL, 0);
+
+ if (errno == ERANGE)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "couldn't parse value from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA
+ " environment variable");
+
+ if ((check_value < 0) || (check_value >= ioc_selection_options))
+ H5_SUBFILING_GOTO_ERROR(
+ H5E_VFL, H5E_BADVALUE, FAIL,
+ "invalid IOC selection type value %ld from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA
+ " environment variable",
+ check_value);
+
+ *ioc_selection_type = (H5FD_subfiling_ioc_select_t)check_value;
+ *ioc_sel_info_str = opt_value;
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
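/*
 * Illustrative sketch (not from the patch itself): a minimal standalone
 * program showing how the "integer:[integer|string]" form parsed above
 * splits into a selection type and an optional qualifier. The value
 * "1:64" (EveryNthRank with a stride of 64, per the comment above) is
 * purely hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
    char  env_value[] = "1:64";      /* hypothetical selection criteria string */
    char *opt_value   = strchr(env_value, ':');
    long  type;

    if (opt_value)
        *opt_value++ = '\0';         /* split "1:64" into "1" and "64" */

    type = strtol(env_value, NULL, 0);

    printf("selection type %ld, qualifier '%s'\n", type, opt_value ? opt_value : "(none)");

    return 0;
}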
+
+/*-------------------------------------------------------------------------
+ * Function: find_cached_topology_info
+ *
+ * Purpose: Given an MPI communicator and IOC selection strategy,
+ * checks the subfiling topology cached to see if any matching
+ * topology objects have been cached.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config, long iocs_per_node,
+ sf_topology_t **app_topology)
+{
+ H5FD_subfiling_ioc_select_t ioc_selection_type;
+ int32_t stripe_count;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(subf_config);
+
+ ioc_selection_type = subf_config->ioc_selection;
+ stripe_count = subf_config->stripe_count;
+
+ for (size_t i = 0; i < sf_topology_cache_num_entries; i++) {
+ sf_topology_t *cached_topology = sf_topology_cache[i];
+ int result;
+ int mpi_code;
+
+ HDassert(cached_topology);
+
+ /*
+ * If the selection types differ, just reject the cached topology
+ * for now rather than checking if the mapping is equivalent
+ */
+ if (ioc_selection_type != cached_topology->selection_type)
+ continue;
+
+ /*
+ * If the number of I/O concentrators in the cached topology
+ * is greater than the specified target number of subfiles,
+ * reject the cached topology
+ */
+ if (stripe_count != H5FD_SUBFILING_DEFAULT_STRIPE_COUNT) {
+ if (stripe_count < cached_topology->n_io_concentrators)
+ continue;
+ }
+
+ if (cached_topology->selection_type == SELECT_IOC_ONE_PER_NODE) {
+ HDassert(iocs_per_node >= 1);
+ HDassert(cached_topology->app_layout->node_count > 0);
+
+ /*
+             * If an IOCs-per-node setting was set in the environment and would
+ * cause the application topology to differ from the cached topology
+ * we found, don't reuse the cached topology
+ */
+ if (cached_topology->n_io_concentrators !=
+ (iocs_per_node * cached_topology->app_layout->node_count))
+ continue;
+ }
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_compare(comm, cached_topology->app_comm, &result)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_compare failed", mpi_code);
+
+ if (MPI_IDENT == result || MPI_CONGRUENT == result) {
+ *app_topology = cached_topology;
break;
+ }
}
- /* Allocate new application topology information object */
- if (NULL == (app_topology = HDcalloc(1, sizeof(*app_topology))))
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: init_app_layout
+ *
+ * Purpose: Determines the layout of MPI ranks across nodes in order to
+ * figure out the final application topology
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm)
+{
+ app_layout_t *app_layout = NULL;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(app_topology);
+ HDassert(!app_topology->app_layout);
+ HDassert(MPI_COMM_NULL != comm);
+ HDassert(MPI_COMM_NULL != node_comm);
+
+ if (NULL == (app_layout = HDcalloc(1, sizeof(*app_layout))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "couldn't create new subfiling topology object");
+ "couldn't allocate application layout structure");
- app_topology->subfile_rank = -1;
- app_topology->selection_type = ioc_selection_type;
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &app_layout->world_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &app_layout->world_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(node_comm, &app_layout->node_local_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(node_comm, &app_layout->node_local_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
- if (NULL == (app_topology->io_concentrators = HDcalloc((size_t)comm_size, sizeof(int))))
+ if (NULL == (app_layout->layout = HDmalloc((size_t)app_layout->world_size * sizeof(*app_layout->layout))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "couldn't allocate array of I/O concentrator ranks");
+ "couldn't allocate application layout array");
- if (!app_layout) {
- if (NULL == (app_layout = HDcalloc(1, sizeof(*app_layout))))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "couldn't allocate application layout structure");
+ /* Gather the list of layout_t pairs to all ranks */
+ if (gather_topology_info(app_layout, comm, node_comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "can't gather application topology info");
- if (NULL == (app_layout->node_ranks = HDcalloc(1, ((size_t)comm_size + 1) * sizeof(int))))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "couldn't allocate application layout node rank array");
+ /* Sort the list according to the node local lead rank values */
+ HDqsort(app_layout->layout, (size_t)app_layout->world_size, sizeof(layout_t), compare_layout_nodelocal);
- if (NULL == (app_layout->layout = HDcalloc(1, ((size_t)comm_size + 1) * sizeof(layout_t))))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "couldn't allocate application layout array");
- }
+ /*
+ * Count the number of nodes by checking how many
+ * entries have a node local rank value of 0
+ */
+ app_layout->node_count = 0;
+ for (size_t i = 0; i < (size_t)app_layout->world_size; i++)
+ if (app_layout->layout[i].node_local_rank == 0)
+ app_layout->node_count++;
+
+ HDassert(app_layout->node_count > 0);
+
+ if (NULL ==
+ (app_layout->node_ranks = HDmalloc((size_t)app_layout->node_count * sizeof(*app_layout->node_ranks))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate application layout node rank array");
- app_layout->world_size = comm_size;
- app_layout->world_rank = comm_rank;
+ /*
+ * Record the rank value of the "lead"
+ * MPI rank on each node for later use
+ */
+ for (size_t i = 0, node_rank_index = 0; i < (size_t)app_layout->world_size; i++) {
+ if (app_layout->layout[i].node_local_rank == 0) {
+ HDassert(node_rank_index < (size_t)app_layout->node_count);
+ app_layout->node_ranks[node_rank_index++] = app_layout->layout[i].rank;
+ }
+ }
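/*
 * Illustrative example (not from the patch): with 4 ranks spread evenly
 * over 2 nodes, the sorted layout array has node_local_rank == 0 at two
 * positions, so node_count becomes 2 and node_ranks[] records the world
 * rank of the lead rank on each of those two nodes.
 */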
app_topology->app_layout = app_layout;
- gather_topology_info(app_topology, comm);
+done:
+ if (ret_value < 0) {
+ if (app_layout) {
+ HDfree(app_layout->layout);
+ HDfree(app_layout->node_ranks);
+ HDfree(app_layout);
+ }
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: gather_topology_info
+ *
+ * Purpose: Collectively generate a list of layout_t structures
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm)
+{
+ MPI_Group file_group = MPI_GROUP_NULL;
+ MPI_Group node_group = MPI_GROUP_NULL;
+ layout_t my_layout_info;
+ layout_t *layout_info_partial = NULL;
+ MPI_Comm aggr_comm = MPI_COMM_NULL;
+ int *recv_counts = NULL;
+ int *recv_displs = NULL;
+ int sf_world_size;
+ int sf_world_rank;
+ int node_local_rank;
+ int node_local_size;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(app_layout);
+ HDassert(app_layout->layout);
+ HDassert(MPI_COMM_NULL != comm);
+
+ sf_world_rank = app_layout->world_rank;
+ sf_world_size = app_layout->world_size;
+ node_local_rank = app_layout->node_local_rank;
+ node_local_size = app_layout->node_local_size;
+
+ my_layout_info.rank = sf_world_rank;
+ my_layout_info.node_local_rank = node_local_rank;
+ my_layout_info.node_local_size = node_local_size;
/*
- * Determine which ranks are I/O concentrator ranks, based on the
- * given IOC selection strategy and MPI information.
+ * Get the rank value for the "lead" rank on this
+ * rank's node so that we can group the layout_t
+ * information for all node-local ranks together
*/
- switch (ioc_selection_type) {
+ {
+ const int local_lead = 0;
+ int lead_rank;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_group(comm, &file_group)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_group failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_group(intra_comm, &node_group)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_group failed", mpi_code);
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Group_translate_ranks(node_group, 1, &local_lead, file_group, &lead_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Group_translate_ranks failed", mpi_code);
+
+ if (MPI_UNDEFINED == lead_rank)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't determine lead rank on node");
+
+ my_layout_info.node_lead_rank = lead_rank;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&node_group)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Group_free failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&file_group)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Group_free failed", mpi_code);
+ }
+
+ app_layout->layout[sf_world_rank] = my_layout_info;
+
+ if (sf_world_size > 1) {
+#ifdef H5_SUBFILING_PREFER_ALLGATHER_TOPOLOGY
+ (void)intra_comm;
+
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Allgather(&my_layout_info, 4, MPI_INT, app_layout->layout, 4, MPI_INT, comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code);
+#else
+ int aggr_comm_size = 0;
+
+ HDassert(MPI_COMM_NULL != intra_comm);
+
+ /* Split the file communicator into a sub-group of one rank per node */
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_split(comm, node_local_rank, sf_world_rank, &aggr_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_split failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(aggr_comm, &aggr_comm_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ /* Allocate a partial layout info array to aggregate into from node-local ranks */
+ if (node_local_rank == 0) {
+ if (NULL ==
+ (layout_info_partial = HDmalloc((size_t)node_local_size * sizeof(*layout_info_partial))))
+ /* Push error, but participate in gather operation */
+ H5_SUBFILING_DONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "can't allocate layout info array");
+ }
+
+        /* Gather node-local layout info to a single master rank on each node */
+ if (MPI_SUCCESS != (mpi_code = MPI_Gather(&my_layout_info, 4, MPI_INT, layout_info_partial, 4,
+ MPI_INT, 0, intra_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code);
+
+ /* Gather total layout info from/to each master rank on each node */
+ if (node_local_rank == 0) {
+ int send_size = 4 * node_local_size;
+
+ if (NULL == (recv_counts = HDmalloc((size_t)aggr_comm_size * sizeof(*recv_counts))))
+ H5_SUBFILING_DONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "can't allocate receive counts array");
+ if (NULL == (recv_displs = HDmalloc((size_t)aggr_comm_size * sizeof(*recv_displs))))
+ H5_SUBFILING_DONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "can't allocate receive displacements array");
+
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Allgather(&send_size, 1, MPI_INT, recv_counts, 1, MPI_INT, aggr_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code);
+
+ recv_displs[0] = 0;
+ for (int i = 1; i < aggr_comm_size; i++)
+ recv_displs[i] = recv_displs[i - 1] + recv_counts[i - 1];
+
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Allgatherv(layout_info_partial, send_size, MPI_INT, app_layout->layout,
+ recv_counts, recv_displs, MPI_INT, aggr_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgatherv failed", mpi_code);
+
+ HDfree(recv_displs);
+ HDfree(recv_counts);
+ recv_displs = NULL;
+ recv_counts = NULL;
+ }
+
+ /*
+ * Each master rank on each node distributes the total
+ * layout info back to other node-local ranks
+ */
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Bcast(app_layout->layout, 4 * sf_world_size, MPI_INT, 0, intra_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+#endif
+ }
+
+done:
+ HDfree(recv_displs);
+ HDfree(recv_counts);
+ HDfree(layout_info_partial);
+
+ if (H5_mpi_comm_free(&aggr_comm) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
+
+ if (node_group != MPI_GROUP_NULL)
+ if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&node_group)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Group_free failed", mpi_code);
+ if (file_group != MPI_GROUP_NULL)
+ if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&file_group)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Group_free failed", mpi_code);
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
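/*
 * Illustrative sketch (not from the patch itself) of the simpler
 * MPI_Allgather path above, where every rank contributes one 4-int
 * record and receives the full per-rank table; record_t stands in for
 * the real layout_t and MPI is assumed to be initialized elsewhere.
 */
#include <mpi.h>
#include <stdlib.h>

typedef struct {
    int rank;
    int node_lead_rank;
    int node_local_rank;
    int node_local_size;
} record_t;

static int
gather_records(MPI_Comm comm, record_t **table_out)
{
    record_t  my_record;
    record_t *table = NULL;
    int       comm_rank;
    int       comm_size;

    MPI_Comm_rank(comm, &comm_rank);
    MPI_Comm_size(comm, &comm_size);

    my_record.rank            = comm_rank;
    my_record.node_lead_rank  = 0;         /* placeholder values for the sketch */
    my_record.node_local_rank = 0;
    my_record.node_local_size = comm_size;

    if (NULL == (table = malloc((size_t)comm_size * sizeof(*table))))
        return -1;

    /* Each record travels as 4 MPI_INTs, matching the counts used above */
    if (MPI_SUCCESS != MPI_Allgather(&my_record, 4, MPI_INT, table, 4, MPI_INT, comm)) {
        free(table);
        return -1;
    }

    *table_out = table;
    return 0;
}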
+
+/*-------------------------------------------------------------------------
+ * Function: compare_layout_nodelocal
+ *
+ * Purpose: Qsort sorting callback that sorts layout_t structures
+ * according to their node local lead MPI rank values. Ties
+ * are broken according to their regular node local MPI rank
+ * values
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+compare_layout_nodelocal(const void *layout1, const void *layout2)
+{
+ const layout_t *l1 = (const layout_t *)layout1;
+ const layout_t *l2 = (const layout_t *)layout2;
+
+ if (l1->node_lead_rank == l2->node_lead_rank) {
+ return (l1->node_local_rank > l2->node_local_rank) - (l1->node_local_rank < l2->node_local_rank);
+ }
+ else
+ return (l1->node_lead_rank > l2->node_lead_rank) - (l1->node_lead_rank < l2->node_lead_rank);
+}
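/*
 * Illustrative sketch (not from the patch itself) of the ordering this
 * comparator produces; rec_t is a stand-in for layout_t with only the
 * two fields that matter for the sort.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct {
    int node_lead_rank;
    int node_local_rank;
} rec_t;

static int
cmp(const void *a, const void *b)
{
    const rec_t *r1 = a;
    const rec_t *r2 = b;

    if (r1->node_lead_rank == r2->node_lead_rank)
        return (r1->node_local_rank > r2->node_local_rank) - (r1->node_local_rank < r2->node_local_rank);

    return (r1->node_lead_rank > r2->node_lead_rank) - (r1->node_lead_rank < r2->node_lead_rank);
}

int
main(void)
{
    rec_t recs[] = {{2, 1}, {0, 1}, {2, 0}, {0, 0}};

    qsort(recs, sizeof(recs) / sizeof(recs[0]), sizeof(recs[0]), cmp);

    /* Prints (0,0), (0,1), (2,0), (2,1): grouped by lead rank, then local rank */
    for (size_t i = 0; i < sizeof(recs) / sizeof(recs[0]); i++)
        printf("lead %d, local %d\n", recs[i].node_lead_rank, recs[i].node_local_rank);

    return 0;
}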
+
+/*-------------------------------------------------------------------------
+ * Function: identify_ioc_ranks
+ *
+ * Purpose:     We've already identified the number of unique nodes and
+ *              have a sorted list of layout_t structures.  Under normal
+ *              conditions, we only utilize a single IOC per node.  In
+ *              that case, we simply fill the io_concentrators vector
+ *              from the node_ranks array (which contains the index into
+ *              the layout array of the lowest MPI rank on each node).
+ *              Otherwise, while walking the list of ranks on each node,
+ *              we can also select one or more additional IOCs per node.
+ *
+ * As a side effect, we fill the 'io_concentrators' vector
+ * and set the 'rank_is_ioc' flag to TRUE if our rank is
+ * identified as owning an I/O Concentrator (IOC).
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
+{
+ app_layout_t *app_layout = NULL;
+ int *io_concentrators = NULL;
+ int max_iocs = 0;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(app_topology);
+ HDassert(!app_topology->io_concentrators);
+ HDassert(app_topology->n_io_concentrators > 0);
+ HDassert(app_topology->app_layout);
+ HDassert(app_topology->app_layout->layout);
+ HDassert(app_topology->app_layout->node_count > 0);
+
+ app_layout = app_topology->app_layout;
+
+ max_iocs = app_topology->n_io_concentrators;
+
+ if (NULL == (app_topology->io_concentrators = HDmalloc((size_t)app_topology->n_io_concentrators *
+ sizeof(*app_topology->io_concentrators))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate array of I/O concentrator ranks");
+
+ io_concentrators = app_topology->io_concentrators;
+
+ switch (app_topology->selection_type) {
case SELECT_IOC_ONE_PER_NODE: {
- int node_count;
+ int total_ioc_count = 0;
+ int iocs_per_node = 1;
- app_topology->selection_type = SELECT_IOC_ONE_PER_NODE;
+ if (app_topology->n_io_concentrators > app_layout->node_count)
+ iocs_per_node = app_topology->n_io_concentrators / app_layout->node_count;
- if ((node_count = count_nodes(app_topology, comm)) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
- "couldn't determine number of nodes used");
+ HDassert(app_layout->node_ranks);
- /* Check for an IOC-per-node value set in the environment */
- if ((env_value = HDgetenv(H5FD_SUBFILING_IOC_PER_NODE))) {
- errno = 0;
- ioc_select_val = HDstrtol(env_value, NULL, 0);
- if ((ERANGE == errno)) {
- HDprintf("invalid value '%s' for " H5FD_SUBFILING_IOC_PER_NODE "\n", env_value);
- ioc_select_val = 1;
+ for (size_t i = 0; i < (size_t)app_layout->node_count; i++) {
+ int node_index = app_layout->node_ranks[i];
+ int local_size = app_layout->layout[node_index].node_local_size;
+
+ HDassert(total_ioc_count < app_topology->n_io_concentrators);
+ io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank;
+
+ if (app_layout->world_rank == io_concentrators[total_ioc_count]) {
+ app_topology->ioc_idx = total_ioc_count;
+ app_topology->rank_is_ioc = TRUE;
}
- if (ioc_select_val > 0)
- iocs_per_node = ioc_select_val;
- }
+ total_ioc_count++;
- H5_CHECK_OVERFLOW(iocs_per_node, long, int);
- ioc_count = identify_ioc_ranks(app_topology, node_count, (int)iocs_per_node);
+ for (size_t j = 1; j < (size_t)iocs_per_node; j++) {
+ if (total_ioc_count >= max_iocs)
+ break;
+ if (j >= (size_t)local_size)
+ break;
- break;
- }
+ HDassert(total_ioc_count < app_topology->n_io_concentrators);
+ io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank;
- case SELECT_IOC_EVERY_NTH_RANK: {
- /*
- * User specifies a rank multiple value. Selection starts
- * with rank 0 and then the user-specified stride is applied\
- * to identify other IOC ranks.
- */
+ if (app_layout->world_rank == io_concentrators[total_ioc_count]) {
+ app_topology->ioc_idx = total_ioc_count;
+ app_topology->rank_is_ioc = TRUE;
+ }
- H5_CHECK_OVERFLOW(ioc_select_val, long, int);
- ioc_count = (comm_size / (int)ioc_select_val);
+ total_ioc_count++;
+ }
- if ((comm_size % ioc_select_val) != 0) {
- ioc_count++;
+ if (total_ioc_count >= max_iocs)
+ break;
}
- assign_ioc_ranks(app_topology, ioc_count, (int)ioc_select_val);
+ /* Set final number of I/O concentrators after adjustments */
+ app_topology->n_io_concentrators = total_ioc_count;
break;
}
+ case SELECT_IOC_EVERY_NTH_RANK:
case SELECT_IOC_TOTAL: {
- int rank_multiple = 0;
+ int world_size = app_layout->world_size;
+ int ioc_next = 0;
- /*
- * User specifies a total number of I/O concentrators.
- * Starting with rank 0, a stride of (mpi_size / total)
- * is applied to identify other IOC ranks.
- */
+ HDassert(rank_stride > 0);
- H5_CHECK_OVERFLOW(ioc_select_val, long, int);
- ioc_count = (int)ioc_select_val;
+ for (int i = 0; ioc_next < app_topology->n_io_concentrators; ioc_next++) {
+ int ioc_index = rank_stride * i++;
- rank_multiple = (comm_size / ioc_count);
+ if (ioc_index >= world_size)
+ break;
+
+ io_concentrators[ioc_next] = app_layout->layout[ioc_index].rank;
+
+ if (app_layout->world_rank == io_concentrators[ioc_next]) {
+ app_topology->ioc_idx = ioc_next;
+ app_topology->rank_is_ioc = TRUE;
+ }
+
+ if (ioc_next + 1 >= max_iocs)
+ break;
+ }
- assign_ioc_ranks(app_topology, ioc_count, rank_multiple);
+ /* Set final number of I/O concentrators after adjustments */
+ app_topology->n_io_concentrators = ioc_next;
break;
}
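/*
 * Illustrative example (not from the patch): with 8 MPI ranks and a
 * rank_stride of 3, the loop above picks ranks 0, 3 and 6 as I/O
 * concentrators, stopping earlier if max_iocs entries are filled first.
 */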
@@ -1150,31 +1640,10 @@ init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm,
break;
}
- HDassert(ioc_count > 0);
- app_topology->n_io_concentrators = ioc_count;
-
- /*
- * Create a vector of "potential" file descriptors
- * which can be indexed by the IOC ID
- */
- if (NULL == (app_topology->subfile_fd = HDcalloc((size_t)ioc_count, sizeof(int))))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "couldn't allocate subfile file descriptor array");
-
- *app_topology_out = app_topology;
-
done:
if (ret_value < 0) {
- if (app_layout) {
- HDfree(app_layout->layout);
- HDfree(app_layout->node_ranks);
- HDfree(app_layout);
- }
- if (app_topology) {
- HDfree(app_topology->subfile_fd);
+ if (app_topology)
HDfree(app_topology->io_concentrators);
- HDfree(app_topology);
- }
}
H5_SUBFILING_FUNC_LEAVE;
@@ -1196,77 +1665,104 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-init_subfiling_context(subfiling_context_t *sf_context, H5FD_subfiling_shared_config_t *subfiling_config,
- sf_topology_t *app_topology, MPI_Comm file_comm)
+init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename, uint64_t file_id,
+ H5FD_subfiling_params_t *subfiling_config, sf_topology_t *app_topology,
+ MPI_Comm file_comm)
{
char *env_value = NULL;
- int comm_rank;
+ int mpi_rank;
int mpi_code;
herr_t ret_value = SUCCEED;
HDassert(sf_context);
HDassert(sf_context->topology == NULL);
+ HDassert(sf_context->sf_context_id >= 0);
+ HDassert(base_filename);
+ HDassert(file_id != UINT64_MAX);
HDassert(subfiling_config);
HDassert(app_topology);
HDassert(app_topology->n_io_concentrators > 0);
HDassert(MPI_COMM_NULL != file_comm);
- sf_context->topology = app_topology;
+ sf_context->h5_file_id = file_id;
+ sf_context->sf_fids = NULL;
+ sf_context->sf_num_fids = 0;
+ sf_context->sf_num_subfiles = subfiling_config->stripe_count;
+ sf_context->sf_write_count = 0;
+ sf_context->sf_read_count = 0;
+ sf_context->sf_eof = HADDR_UNDEF;
+ sf_context->sf_stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE;
+ sf_context->sf_base_addr = 0;
sf_context->sf_msg_comm = MPI_COMM_NULL;
sf_context->sf_data_comm = MPI_COMM_NULL;
sf_context->sf_eof_comm = MPI_COMM_NULL;
- sf_context->sf_barrier_comm = MPI_COMM_NULL;
+ sf_context->sf_node_comm = MPI_COMM_NULL;
sf_context->sf_group_comm = MPI_COMM_NULL;
- sf_context->sf_intercomm = MPI_COMM_NULL;
- sf_context->sf_stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE;
- sf_context->sf_write_count = 0;
- sf_context->sf_read_count = 0;
- sf_context->sf_eof = HADDR_UNDEF;
- sf_context->h5_file_handle = NULL;
- sf_context->sf_fid = -1;
sf_context->sf_group_size = 1;
sf_context->sf_group_rank = 0;
- sf_context->h5_filename = NULL;
- sf_context->sf_filename = NULL;
sf_context->subfile_prefix = NULL;
+ sf_context->h5_filename = NULL;
sf_context->ioc_data = NULL;
+ sf_context->topology = app_topology;
#ifdef H5_SUBFILING_DEBUG
sf_context->sf_logfile = NULL;
#endif
+ if (NULL == (sf_context->h5_filename = HDstrdup(base_filename)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate space for subfiling filename");
+
+ /* Check for a subfile name prefix setting in the environment */
+ if ((env_value = HDgetenv(H5FD_SUBFILING_SUBFILE_PREFIX))) {
+ if (NULL == (sf_context->subfile_prefix = HDstrdup(env_value)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix value");
+ }
+
/*
- * Set IOC stripe size from subfiling configuration, then check
- * for a setting from the environment
+ * Set IOC stripe size from subfiling configuration
*/
if (subfiling_config->stripe_size > 0)
sf_context->sf_stripe_size = subfiling_config->stripe_size;
- if ((env_value = HDgetenv(H5FD_SUBFILING_STRIPE_SIZE))) {
- long long stripe_size = -1;
-
- errno = 0;
-
- stripe_size = HDstrtoll(env_value, NULL, 0);
- if (ERANGE == errno)
- H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL,
- "invalid stripe size setting for " H5FD_SUBFILING_STRIPE_SIZE);
-
- if (stripe_size > 0) {
- sf_context->sf_stripe_size = (int64_t)stripe_size;
- }
- }
+ /*
+ * If still set to the default, set the number of subfiles
+ * according to the default mapping of 1 I/O concentrator
+ * -> 1 subfile
+ */
+ if (sf_context->sf_num_subfiles == H5FD_SUBFILING_DEFAULT_STRIPE_COUNT)
+ sf_context->sf_num_subfiles = app_topology->n_io_concentrators;
/*
* Set blocksize per stripe value after possibly adjusting
- * for user-specified subfile stripe size
+ * for user-specified subfile stripe size and number of
+ * subfiles
*/
- sf_context->sf_blocksize_per_stripe = sf_context->sf_stripe_size * app_topology->n_io_concentrators;
+ sf_context->sf_blocksize_per_stripe = sf_context->sf_stripe_size * sf_context->sf_num_subfiles;
- /* Check for a subfile name prefix setting in the environment */
- if ((env_value = HDgetenv(H5FD_SUBFILING_SUBFILE_PREFIX))) {
- if (NULL == (sf_context->subfile_prefix = HDstrdup(env_value)))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix value");
+ if (app_topology->rank_is_ioc) {
+ int leftover_subfiles;
+
+ /* Adjust base address after stripe size is set, if necessary */
+ sf_context->sf_base_addr = (int64_t)(app_topology->ioc_idx * sf_context->sf_stripe_size);
+
+ /*
+ * Calculate the number of subfiles this rank owns by
+ * round-robining them across the available IOCs and
+ * then allocate an array for the subfile IDs
+ */
+ sf_context->sf_num_fids = sf_context->sf_num_subfiles / app_topology->n_io_concentrators;
+
+ leftover_subfiles = sf_context->sf_num_subfiles % app_topology->n_io_concentrators;
+ if (leftover_subfiles && (leftover_subfiles > app_topology->ioc_idx))
+ sf_context->sf_num_fids++;
+
+ if (NULL ==
+ (sf_context->sf_fids = HDmalloc((size_t)sf_context->sf_num_fids * sizeof(*sf_context->sf_fids))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate subfile IDs array");
+
+ for (int i = 0; i < sf_context->sf_num_fids; i++)
+ sf_context->sf_fids[i] = -1;
}
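/*
 * Illustrative example (not from the patch): with 5 subfiles striped
 * across 2 I/O concentrators, each IOC starts with 5 / 2 = 2 subfile
 * descriptors, and the single leftover subfile (5 % 2 = 1) goes to
 * IOC 0, so IOC 0 ends up owning 3 subfiles and IOC 1 owns 2.
 */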
/*
@@ -1274,7 +1770,7 @@ init_subfiling_context(subfiling_context_t *sf_context, H5FD_subfiling_shared_co
* to/from IOC ranks
*/
- if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(file_comm, &comm_rank)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(file_comm, &mpi_rank)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(file_comm, &sf_context->sf_msg_comm)))
@@ -1295,15 +1791,9 @@ init_subfiling_context(subfiling_context_t *sf_context, H5FD_subfiling_shared_co
if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(sf_context->sf_eof_comm, MPI_ERRORS_RETURN)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code);
- if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(file_comm, &sf_context->sf_barrier_comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code);
-
- if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(sf_context->sf_barrier_comm, MPI_ERRORS_RETURN)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code);
-
/* Create an MPI sub-communicator for IOC ranks */
if (app_topology->n_io_concentrators > 1) {
- if (MPI_SUCCESS != (mpi_code = MPI_Comm_split(file_comm, app_topology->rank_is_ioc, comm_rank,
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_split(file_comm, app_topology->rank_is_ioc, mpi_rank,
&sf_context->sf_group_comm)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_split failed", mpi_code);
@@ -1314,11 +1804,18 @@ init_subfiling_context(subfiling_context_t *sf_context, H5FD_subfiling_shared_co
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
}
-done:
- if (ret_value < 0) {
- H5_free_subfiling_object_int(sf_context);
- }
+ /* Perform some final validation of subfiling configuration */
+ if (sf_context->sf_stripe_size <= 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid subfiling stripe size (%" PRId64 ")",
+ sf_context->sf_stripe_size);
+
+ if (sf_context->sf_num_subfiles <= 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid subfiling stripe count (%d)",
+ sf_context->sf_num_subfiles);
+ HDassert(sf_context->sf_num_subfiles >= app_topology->n_io_concentrators);
+
+done:
H5_SUBFILING_FUNC_LEAVE;
}
@@ -1362,37 +1859,29 @@ open_subfile_with_context(subfiling_context_t *sf_context, int file_acc_flags)
herr_t ret_value = SUCCEED;
HDassert(sf_context);
+ HDassert(sf_context->h5_file_id != UINT64_MAX);
/*
- * Save the HDF5 file ID (fid) to subfile context mapping.
+ * Save the HDF5 file ID (e.g., inode) to subfile context mapping.
* There shouldn't be any issue, but check the status and
* return if there was a problem.
*/
- if (record_fid_to_subfile(sf_context->h5_file_handle, sf_context->sf_context_id, NULL) < 0)
+ if (record_fid_to_subfile(sf_context->h5_file_id, sf_context->sf_context_id, NULL) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL,
"couldn't record HDF5 file ID to subfile context mapping");
/*
* If this rank is an I/O concentrator, actually open
- * the subfile belonging to this IOC rank
+ * the subfiles belonging to this IOC rank
*/
if (sf_context->topology->rank_is_ioc) {
- h5_stat_t st;
-
- /* Retrieve Inode value for HDF5 stub file */
- if (HDstat(sf_context->h5_filename, &st) < 0)
- H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't stat HDF5 stub file");
-
- HDcompile_assert(sizeof(uint64_t) >= sizeof(ino_t));
- sf_context->h5_file_id = (uint64_t)st.st_ino;
-
- if (ioc_open_file(sf_context->sf_context_id, file_acc_flags) < 0)
+ if (ioc_open_files(sf_context->sf_context_id, file_acc_flags) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, "IOC couldn't open subfile");
}
done:
if (ret_value < 0) {
- clear_fid_map_entry(sf_context->h5_file_handle, sf_context->sf_context_id);
+ clear_fid_map_entry(sf_context->h5_file_id, sf_context->sf_context_id);
}
H5_SUBFILING_FUNC_LEAVE;
@@ -1429,29 +1918,29 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-record_fid_to_subfile(void *file_handle, int64_t subfile_context_id, int *next_index)
+record_fid_to_subfile(uint64_t file_id, int64_t subfile_context_id, int *next_index)
{
int index;
herr_t ret_value = SUCCEED;
- if (sf_file_map_size == 0) {
+ if (!sf_open_file_map) {
if (NULL ==
(sf_open_file_map = HDmalloc((size_t)DEFAULT_FILE_MAP_ENTRIES * sizeof(*sf_open_file_map))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate open file mapping");
sf_file_map_size = DEFAULT_FILE_MAP_ENTRIES;
for (int i = 0; i < sf_file_map_size; i++) {
- sf_open_file_map[i].file_handle = NULL;
+ sf_open_file_map[i].file_id = UINT64_MAX;
sf_open_file_map[i].sf_context_id = -1;
}
}
for (index = 0; index < sf_file_map_size; index++) {
- if (sf_open_file_map[index].file_handle == file_handle)
+ if (sf_open_file_map[index].file_id == file_id)
goto done;
- if (sf_open_file_map[index].file_handle == NULL) {
- sf_open_file_map[index].file_handle = file_handle;
+ if (sf_open_file_map[index].file_id == UINT64_MAX) {
+ sf_open_file_map[index].file_id = file_id;
sf_open_file_map[index].sf_context_id = subfile_context_id;
if (next_index) {
@@ -1474,14 +1963,14 @@ record_fid_to_subfile(void *file_handle, int64_t subfile_context_id, int *next_i
sf_file_map_size *= 2;
for (int i = index; i < sf_file_map_size; i++) {
- sf_open_file_map[i].file_handle = NULL;
+ sf_open_file_map[i].file_id = UINT64_MAX;
}
if (next_index) {
*next_index = index;
}
- sf_open_file_map[index].file_handle = file_handle;
+ sf_open_file_map[index].file_id = file_id;
sf_open_file_map[index++].sf_context_id = subfile_context_id;
}
@@ -1490,13 +1979,44 @@ done:
}
/*-------------------------------------------------------------------------
- * Function: ioc_open_file
+ * Function: clear_fid_map_entry
+ *
+ * Purpose: Remove the map entry associated with the file->inode.
+ * This is done at file close.
+ *
+ * Return: None
+ * Errors: Cannot fail.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static void
+clear_fid_map_entry(uint64_t file_id, int64_t sf_context_id)
+{
+ if (sf_open_file_map) {
+ for (int i = 0; i < sf_file_map_size; i++) {
+ if ((sf_open_file_map[i].file_id == file_id) &&
+ (sf_open_file_map[i].sf_context_id == sf_context_id)) {
+ sf_open_file_map[i].file_id = UINT64_MAX;
+ sf_open_file_map[i].sf_context_id = -1;
+ return;
+ }
+ }
+ }
+} /* end clear_fid_map_entry() */
+
+/*-------------------------------------------------------------------------
+ * Function: ioc_open_files
*
* Purpose: This function is called by an I/O concentrator in order to
- * open the subfile it is responsible for.
+ * open the subfiles it is responsible for.
*
- * The name of the subfile to be opened is generated based on
- * values from either:
+ * The names of the subfiles to be opened are generated based
+ * on values from either:
*
* - The corresponding subfiling configuration file, if one
* exists and the HDF5 file isn't being truncated
@@ -1504,7 +2024,7 @@ done:
* subfiling configuration file doesn't exist or the HDF5
* file is being truncated
*
- * After the subfile has been opened, a subfiling
+ * After the subfiles have been opened, a subfiling
* configuration file will be created if this is a file
* creation operation. If the truncate flag is specified, the
* subfiling configuration file will be re-created in order to
@@ -1528,40 +2048,83 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-ioc_open_file(int64_t file_context_id, int file_acc_flags)
+ioc_open_files(int64_t file_context_id, int file_acc_flags)
{
- subfiling_context_t *sf_context = NULL;
- mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
- char *filepath = NULL;
- char *subfile_dir = NULL;
- char *base = NULL;
- int fd = -1;
- herr_t ret_value = SUCCEED;
+ subfiling_context_t *sf_context = NULL;
+ mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+ char *filepath = NULL;
+ char *subfile_dir = NULL;
+ char *base = NULL;
+ int num_subfiles = 0;
+ int num_digits = 0;
+ herr_t ret_value = SUCCEED;
if (NULL == (sf_context = H5_get_subfiling_object(file_context_id)))
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL,
"couldn't get subfiling object from context ID");
- /* Only IOC ranks should be here */
+ HDassert(sf_context->h5_file_id != UINT64_MAX);
+ HDassert(sf_context->h5_filename);
+ HDassert(sf_context->sf_fids);
+ HDassert(sf_context->sf_num_subfiles > 0);
+ HDassert(sf_context->sf_num_fids > 0);
HDassert(sf_context->topology);
- HDassert(sf_context->topology->subfile_rank >= 0);
+ HDassert(sf_context->topology->ioc_idx >= 0); /* Only IOC ranks should be here */
+
+ /* Get the basename of the full HDF5 filename */
+ if (H5_basename(sf_context->h5_filename, &base) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get HDF5 file basename");
+
+ /*
+ * Get the directory prefix where subfiles will be placed.
+ * Under normal circumstances, the subfiles are co-located
+ * with the HDF5 file, but users may specify a different
+ * directory name.
+ */
+ if (sf_context->subfile_prefix) {
+ if (NULL == (subfile_dir = H5MM_strdup(sf_context->subfile_prefix)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix");
+ }
+ else {
+ if (H5_dirname(sf_context->h5_filename, &subfile_dir) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't get HDF5 file dirname");
+ }
- if (NULL == (filepath = HDcalloc(1, PATH_MAX)))
+ if (NULL == (filepath = HDmalloc(PATH_MAX)))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"couldn't allocate space for subfile filename");
- /* Generate the name of the subfile that this IOC rank will open */
- if (generate_subfile_name(sf_context, file_acc_flags, filepath, PATH_MAX, &base, &subfile_dir) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, "couldn't generate name for subfile");
+ num_subfiles = sf_context->sf_num_subfiles;
+ num_digits = (int)(HDlog10(num_subfiles) + 1);
+
+ /*
+ * For each subfile this IOC rank owns, generate the name
+ * of the subfile and create/open it
+ */
+ for (int i = 0; i < sf_context->sf_num_fids; i++) {
+ int subfile_idx;
+
+ /* Round-robin subfiles among the available IOCs */
+ subfile_idx = (i * sf_context->topology->n_io_concentrators) + sf_context->topology->ioc_idx + 1;
- if (NULL == (sf_context->sf_filename = HDstrdup(filepath)))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile name");
+ /*
+ * Generate the name of the subfile. The subfile naming should
+ * produce files of the following form:
+ * If we assume the HDF5 file is named ABC.h5, and 20 subfiles
+ * are used, then the subfiles will have names:
+ * ABC.h5.subfile_<file-number>_01_of_20,
+ * ABC.h5.subfile_<file-number>_02_of_20, etc.
+ *
+ * and the configuration file will be named:
+ * ABC.h5.subfile_<file-number>.config
+ */
+ HDsnprintf(filepath, PATH_MAX, "%s/" H5FD_SUBFILING_FILENAME_TEMPLATE, subfile_dir, base,
+ sf_context->h5_file_id, num_digits, subfile_idx, num_subfiles);
- /* Attempt to create/open the subfile for this IOC rank */
- if ((fd = HDopen(filepath, file_acc_flags, mode)) < 0)
- H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "failed to open subfile");
+ if ((sf_context->sf_fids[i] = HDopen(filepath, file_acc_flags, mode)) < 0)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "failed to open subfile");
+ }
- sf_context->sf_fid = fd;
if (file_acc_flags & O_CREAT)
sf_context->sf_eof = 0;
@@ -1569,7 +2132,7 @@ ioc_open_file(int64_t file_context_id, int file_acc_flags)
* If subfiles were created (rather than simply opened),
* check if we also need to create a config file.
*/
- if ((file_acc_flags & O_CREAT) && (sf_context->topology->subfile_rank == 0)) {
+ if ((file_acc_flags & O_CREAT) && (sf_context->topology->ioc_idx == 0)) {
if (create_config_file(sf_context, base, subfile_dir, (file_acc_flags & O_TRUNC)) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTCREATE, FAIL,
"couldn't create subfiling configuration file");
@@ -1578,12 +2141,10 @@ ioc_open_file(int64_t file_context_id, int file_acc_flags)
done:
if (ret_value < 0) {
if (sf_context) {
- HDfree(sf_context->sf_filename);
- sf_context->sf_filename = NULL;
-
- if (sf_context->sf_fid >= 0) {
- HDclose(sf_context->sf_fid);
- sf_context->sf_fid = -1;
+ for (int i = 0; i < sf_context->sf_num_fids; i++) {
+ if (sf_context->sf_fids[i] >= 0 && HDclose(sf_context->sf_fids[i]) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "failed to close subfile");
+ sf_context->sf_fids[i] = -1;
}
}
}
@@ -1595,144 +2156,6 @@ done:
H5_SUBFILING_FUNC_LEAVE;
}
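/*
 * Illustrative sketch (not from the patch itself) of how the round-robin
 * index computed above distributes subfiles among IOCs: subfile k
 * (1-based) is opened by IOC (k - 1) % n_io_concentrators, so with 2
 * IOCs and 5 subfiles, IOC 0 opens subfiles 1, 3 and 5 while IOC 1
 * opens subfiles 2 and 4, each named according to the
 * ABC.h5.subfile_<file-number>_NN_of_MM pattern described above.
 */
#include <stdio.h>

int
main(void)
{
    int n_iocs       = 2;
    int num_subfiles = 5;

    for (int k = 1; k <= num_subfiles; k++)
        printf("subfile %d of %d is opened by IOC %d\n", k, num_subfiles, (k - 1) % n_iocs);

    return 0;
}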
-/*
- * Generate the name of the subfile this IOC rank will open,
- * based on available information.
- *
- * This may include:
- * - the subfiling configuration (from a subfiling configuration
- * file if one exists, or from the subfiling context object
- * otherwise)
- * - the base file's name and ID (inode or similar)
- * - the IOC's rank value within the set of I/O concentrators
- * - an optional filename prefix specified by the user
- */
-static herr_t
-generate_subfile_name(subfiling_context_t *sf_context, int file_acc_flags, char *filename_out,
- size_t filename_out_len, char **filename_basename_out, char **subfile_dir_out)
-{
- FILE *config_file = NULL;
- char *subfile_dir = NULL;
- char *prefix = NULL;
- char *base = NULL;
- int n_io_concentrators;
- int num_digits;
- herr_t ret_value = SUCCEED;
-
- HDassert(sf_context);
- HDassert(sf_context->h5_filename);
- HDassert(filename_out);
- HDassert(filename_basename_out);
- HDassert(subfile_dir_out);
-
- *filename_basename_out = NULL;
- *subfile_dir_out = NULL;
-
- /*
- * Initially use the number of I/O concentrators specified in the
- * subfiling context. However, if there's an existing subfiling
- * configuration file (and we aren't truncating it) we will use
- * the number specified there instead, as that should be the actual
- * number that the subfile names were originally generated with.
- * The current subfiling context may have a different number of I/O
- * concentrators specified; e.g. a simple serial file open for
- * reading purposes (think h5dump) might only be using 1 I/O
- * concentrator, whereas the file was created with several I/O
- * concentrators.
- */
- n_io_concentrators = sf_context->topology->n_io_concentrators;
-
- if (NULL == (prefix = HDmalloc(PATH_MAX)))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
- "couldn't allocate space for subfile prefix");
-
- /* Under normal operation, we co-locate subfiles with the HDF5 file */
- HDstrncpy(prefix, sf_context->h5_filename, PATH_MAX - 1);
- prefix[PATH_MAX - 1] = '\0';
-
- if (H5_basename(prefix, &base) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get subfile basename");
-
- if (sf_context->subfile_prefix) {
- /* Note: Users may specify a directory name which is inaccessible
- * from where the current is running. In particular, "node-local"
- * storage is not uniformly available to all processes.
- * We would like to check if the user pathname unavailable and
- * if so, we could default to creating the subfiles in the
- * current directory. (?)
- */
- if (NULL == (subfile_dir = H5MM_strdup(sf_context->subfile_prefix)))
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix");
- }
- else {
- if (H5_dirname(prefix, &subfile_dir) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't get subfile prefix");
- }
-
- /*
- * Open the file's subfiling configuration file, if it exists and
- * we aren't truncating the file.
- */
- if (0 == (file_acc_flags & O_TRUNC)) {
- if (open_config_file(sf_context, base, subfile_dir, "r", &config_file) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL,
- "couldn't open existing subfiling configuration file");
- }
-
- /*
- * If a subfiling configuration file exists and we aren't truncating
- * it, read the number of I/O concentrators used at file creation time
- * in order to generate the correct subfile names.
- */
- if (config_file) {
- if (H5_get_num_iocs_from_config_file(config_file, &n_io_concentrators) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL,
- "couldn't read from subfiling configuration file");
- }
-
- /*
- * Generate the name of the subfile. The subfile naming should
- * produce files of the following form:
- * If we assume the HDF5 file is named ABC.h5, and 20 I/O
- * concentrators are used, then the subfiles will have names:
- * ABC.h5.subfile_<file-number>_01_of_20,
- * ABC.h5.subfile_<file-number>_02_of_20, etc.
- *
- * and the configuration file will be named:
- * ABC.h5.subfile_<file-number>.config
- */
- num_digits = (int)(HDlog10(n_io_concentrators) + 1);
- HDsnprintf(filename_out, filename_out_len, "%s/%s" H5FD_SUBFILING_FILENAME_TEMPLATE, subfile_dir, base,
- sf_context->h5_file_id, num_digits, sf_context->topology->subfile_rank + 1,
- n_io_concentrators);
-
- *filename_basename_out = base;
- *subfile_dir_out = subfile_dir;
-
-done:
- if (config_file && (EOF == HDfclose(config_file)))
- H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL,
- "couldn't close subfiling configuration file");
-
- if (ret_value < 0) {
- H5MM_free(subfile_dir);
- H5MM_free(base);
-
- if (*filename_basename_out) {
- H5MM_free(*filename_basename_out);
- *filename_basename_out = NULL;
- }
- if (*subfile_dir_out) {
- H5MM_free(*subfile_dir_out);
- *subfile_dir_out = NULL;
- }
- }
-
- HDfree(prefix);
-
- H5_SUBFILING_FUNC_LEAVE;
-}
-
/*-------------------------------------------------------------------------
* Function: create_config_file
*
@@ -1742,6 +2165,7 @@ done:
*
* - the stripe size for the file's subfiles
* - the number of I/O concentrators used for I/O to the file's subfiles
+ * - the number of subfiles the logical HDF5 file consists of
* - the base HDF5 filename
* - the optional directory prefix where the file's subfiles are placed
* - the names of each of the file's subfiles
@@ -1777,7 +2201,7 @@ create_config_file(subfiling_context_t *sf_context, const char *base_filename, c
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"couldn't allocate space for subfiling configuration filename");
- HDsnprintf(config_filename, PATH_MAX, "%s/%s" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir,
+ HDsnprintf(config_filename, PATH_MAX, "%s/" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir,
base_filename, sf_context->h5_file_id);
/* Determine whether a subfiling configuration file exists */
@@ -1796,9 +2220,8 @@ create_config_file(subfiling_context_t *sf_context, const char *base_filename, c
* O_TRUNC flag was specified. In this case, truncate
* the existing config file and create a new one.
*/
- /* TODO: if truncating, consider removing old stale config files. */
if (!config_file_exists || truncate_if_exists) {
- int n_io_concentrators = sf_context->topology->n_io_concentrators;
+ int n_subfiles = sf_context->sf_num_subfiles;
int num_digits;
if (NULL == (config_file = HDfopen(config_filename, "w+")))
@@ -1816,7 +2239,13 @@ create_config_file(subfiling_context_t *sf_context, const char *base_filename, c
"failed to write to subfiling configuration file");
/* Write the number of I/O concentrators to the configuration file */
- HDsnprintf(line_buf, PATH_MAX, "aggregator_count=%d\n", n_io_concentrators);
+ HDsnprintf(line_buf, PATH_MAX, "aggregator_count=%d\n", sf_context->topology->n_io_concentrators);
+ if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL,
+ "failed to write to subfiling configuration file");
+
+ /* Write the number of subfiles to the configuration file */
+ HDsnprintf(line_buf, PATH_MAX, "subfile_count=%d\n", n_subfiles);
if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1)
H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL,
"failed to write to subfiling configuration file");
@@ -1834,10 +2263,10 @@ create_config_file(subfiling_context_t *sf_context, const char *base_filename, c
"failed to write to subfiling configuration file");
/* Write out each subfile name to the configuration file */
- num_digits = (int)(HDlog10(n_io_concentrators) + 1);
- for (int k = 0; k < n_io_concentrators; k++) {
- HDsnprintf(line_buf, PATH_MAX, "%s" H5FD_SUBFILING_FILENAME_TEMPLATE "\n", base_filename,
- sf_context->h5_file_id, num_digits, k + 1, n_io_concentrators);
+ num_digits = (int)(HDlog10(n_subfiles) + 1);
+ for (int k = 0; k < n_subfiles; k++) {
+ HDsnprintf(line_buf, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE "\n", base_filename,
+ sf_context->h5_file_id, num_digits, k + 1, n_subfiles);
if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1)
H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL,
@@ -1873,8 +2302,8 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-open_config_file(subfiling_context_t *sf_context, const char *base_filename, const char *subfile_dir,
- const char *mode, FILE **config_file_out)
+open_config_file(const char *base_filename, const char *subfile_dir, uint64_t file_id, const char *mode,
+ FILE **config_file_out)
{
hbool_t config_file_exists = FALSE;
FILE *config_file = NULL;
@@ -1882,17 +2311,14 @@ open_config_file(subfiling_context_t *sf_context, const char *base_filename, con
int ret = 0;
herr_t ret_value = SUCCEED;
- HDassert(sf_context);
HDassert(base_filename);
HDassert(subfile_dir);
+ HDassert(file_id != UINT64_MAX);
HDassert(mode);
HDassert(config_file_out);
*config_file_out = NULL;
- if (sf_context->h5_file_id == UINT64_MAX)
- H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid HDF5 file ID %" PRIu64,
- sf_context->h5_file_id);
if (*base_filename == '\0')
H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid base HDF5 filename '%s'",
base_filename);
@@ -1903,8 +2329,8 @@ open_config_file(subfiling_context_t *sf_context, const char *base_filename, con
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"couldn't allocate space for subfiling configuration filename");
- HDsnprintf(config_filename, PATH_MAX, "%s/%s" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir,
- base_filename, sf_context->h5_file_id);
+ HDsnprintf(config_filename, PATH_MAX, "%s/" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir,
+ base_filename, file_id);
/* Determine whether a subfiling configuration file exists */
errno = 0;
@@ -1938,26 +2364,26 @@ done:
}
/*-------------------------------------------------------------------------
- * Function: H5_get_num_iocs_from_config_file
+ * Function: H5_get_subfiling_config_from_file
*
- * Purpose: Reads a Subfiling configuration file to get the number of
- * I/O concentrators used for the logical HDF5 file.
+ * Purpose: Reads a Subfiling configuration file to get the stripe size
+ * and number of subfiles used for the logical HDF5 file.
*
* Return: Non-negative on success/Negative on failure
*
*-------------------------------------------------------------------------
*/
herr_t
-H5_get_num_iocs_from_config_file(FILE *config_file, int *n_io_concentrators)
+H5_get_subfiling_config_from_file(FILE *config_file, int64_t *stripe_size, int64_t *num_subfiles)
{
- char *config_buf = NULL;
- char *ioc_substr = NULL;
- long config_file_len = 0;
- int read_n_io_concs = 0;
- herr_t ret_value = SUCCEED;
+ int64_t read_stripe_size = 0;
+ int64_t read_num_subfiles = 0;
+ char *config_buf = NULL;
+ char *substr = NULL;
+ long config_file_len = 0;
+ herr_t ret_value = SUCCEED;
HDassert(config_file);
- HDassert(n_io_concentrators);
if (HDfseek(config_file, 0, SEEK_END) < 0)
H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL,
@@ -1981,22 +2407,40 @@ H5_get_num_iocs_from_config_file(FILE *config_file, int *n_io_concentrators)
config_buf[config_file_len] = '\0';
- if (NULL == (ioc_substr = HDstrstr(config_buf, "aggregator_count")))
- H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL,
- "malformed subfiling configuration file - no aggregator count entry");
+ if (stripe_size) {
+ if (NULL == (substr = HDstrstr(config_buf, "stripe_size")))
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL,
+ "malformed subfiling configuration file - no stripe size entry");
+
+ if (EOF == HDsscanf(substr, "stripe_size=%" PRId64, &read_stripe_size))
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL,
+ "couldn't get stripe size from subfiling configuration file");
- if (EOF == HDsscanf(ioc_substr, "aggregator_count=%d", &read_n_io_concs))
- H5_SUBFILING_SYS_GOTO_ERROR(
- H5E_FILE, H5E_CANTGET, FAIL,
- "couldn't get number of I/O concentrators from subfiling configuration file");
+ if (read_stripe_size <= 0)
+ H5_SUBFILING_GOTO_ERROR(
+ H5E_FILE, H5E_BADVALUE, FAIL,
+ "invalid stripe size (%" PRId64 ") read from subfiling configuration file", read_stripe_size);
+
+ *stripe_size = read_stripe_size;
+ }
- if (read_n_io_concs <= 0)
- H5_SUBFILING_GOTO_ERROR(
- H5E_FILE, H5E_BADVALUE, FAIL,
- "invalid number of I/O concentrators (%d) read from subfiling configuration file",
- read_n_io_concs);
+ if (num_subfiles) {
+ if (NULL == (substr = HDstrstr(config_buf, "subfile_count")))
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL,
+ "malformed subfiling configuration file - no subfile count entry");
- *n_io_concentrators = read_n_io_concs;
+ if (EOF == HDsscanf(substr, "subfile_count=%" PRId64, &read_num_subfiles))
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL,
+ "couldn't get number of subfiles from subfiling configuration file");
+
+ if (read_num_subfiles <= 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL,
+ "invalid number of subfiles (%" PRId64
+ ") read from subfiling configuration file",
+ read_num_subfiles);
+
+ *num_subfiles = read_num_subfiles;
+ }
done:
HDfree(config_buf);
@@ -2005,6 +2449,135 @@ done:
}
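For reference, here is a minimal caller-side sketch of the renamed helper; the wrapper function, its name, and its error handling are hypothetical and simply assume the internal H5subfiling_common.h declarations are in scope:

#include "H5subfiling_common.h"

/* Read both stripe parameters back from an on-disk subfiling config file */
static herr_t
read_subfiling_params(const char *config_path, int64_t *stripe_size, int64_t *num_subfiles)
{
    FILE  *config_file = NULL;
    herr_t ret_value   = FAIL;

    if (NULL == (config_file = HDfopen(config_path, "r")))
        return FAIL;

    /* Either output pointer may be NULL if the caller only needs one value */
    if (H5_get_subfiling_config_from_file(config_file, stripe_size, num_subfiles) >= 0)
        ret_value = SUCCEED;

    HDfclose(config_file);

    return ret_value;
}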
/*-------------------------------------------------------------------------
+ * Function: H5_resolve_pathname
+ *
+ * Purpose: Simple wrapper routine around realpath(3) to fully resolve
+ * a given filepath. Collective across the specified MPI
+ * communicator in order to minimize file system contention
+ * between MPI ranks.
+ *
+ * The resolved filepath returned through `resolved_filepath`
+ * must be freed by the caller with HDfree.
+ *
+ * Return:      Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_resolve_pathname(const char *filepath, MPI_Comm comm, char **resolved_filepath)
+{
+ hsize_t path_len = HSIZE_UNDEF;
+ hbool_t bcasted_path_len = FALSE;
+ hbool_t bcasted_path = FALSE;
+ char *resolved_path = NULL;
+ char *file_basename = NULL;
+ char *file_dirname = NULL;
+ char *cwd = NULL;
+ int mpi_rank;
+ int mpi_size;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(filepath);
+ HDassert(resolved_filepath);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ if (mpi_rank == 0) {
+ if (NULL == (resolved_path = HDrealpath(filepath, NULL))) {
+ if (ENOENT == errno) {
+ if (H5_dirname(filepath, &file_dirname) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get file dirname");
+
+ /* If filepath is just the filename, set up path using CWD */
+ if (!HDstrcmp(file_dirname, ".")) {
+ if (NULL == (resolved_path = HDmalloc(PATH_MAX)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "can't allocate buffer for filepath");
+ if (H5_basename(filepath, &file_basename) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get file basename");
+ if (NULL == (cwd = HDmalloc(PATH_MAX)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "can't allocate buffer for CWD");
+
+ if (NULL == HDgetcwd(cwd, PATH_MAX))
+ H5_SUBFILING_GOTO_ERROR(
+ H5E_VFL, H5E_CANTGET, FAIL,
+ "can't get current working directory, errno = %d, error message = '%s'", errno,
+ HDstrerror(errno));
+
+ HDsnprintf(resolved_path, PATH_MAX, "%s/%s", cwd, file_basename);
+ }
+ else {
+ /* Otherwise, just use what was given as the pathname */
+ if (NULL == (resolved_path = HDstrdup(filepath)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't copy filename");
+ }
+ }
+ else
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "can't resolve subfile path, errno = %d, error message = '%s'", errno,
+ HDstrerror(errno));
+ }
+
+ if (resolved_path) {
+ H5_CHECKED_ASSIGN(path_len, hsize_t, (HDstrlen(resolved_path) + 1), size_t);
+ }
+ else
+ path_len = HSIZE_UNDEF;
+ }
+
+ /* Broadcast the size of the resolved filepath string to other ranks */
+ bcasted_path_len = TRUE;
+ if (mpi_size > 1) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&path_len, 1, HSIZE_AS_MPI_TYPE, 0, comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+
+ if (path_len == HSIZE_UNDEF)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't resolve filepath");
+
+ if (mpi_rank != 0) {
+ if (NULL == (resolved_path = HDmalloc(path_len)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate file name buffer");
+ }
+
+ /* Broadcast the resolved filepath to other ranks */
+ bcasted_path = TRUE;
+ if (mpi_size > 1) {
+ H5_CHECK_OVERFLOW(path_len, hsize_t, int);
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(resolved_path, (int)path_len, MPI_CHAR, 0, comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+
+ *resolved_filepath = resolved_path;
+
+done:
+ HDfree(cwd);
+ H5MM_free(file_basename);
+ H5MM_free(file_dirname);
+
+ if (ret_value < 0) {
+ if (!bcasted_path_len) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&path_len, 1, HSIZE_AS_MPI_TYPE, 0, comm)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+ if (!bcasted_path && (path_len != HSIZE_UNDEF)) {
+ H5_CHECK_OVERFLOW(path_len, hsize_t, int);
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(resolved_path, (int)path_len, MPI_CHAR, 0, comm)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+
+ HDfree(resolved_path);
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
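A hedged usage sketch for the new path-resolution helper, assuming every rank in file_comm passes the same user-supplied name (the variable names here are illustrative):

char *resolved_path = NULL;

/* Collective: only rank 0 touches the file system, then broadcasts the result */
if (H5_resolve_pathname(filename, file_comm, &resolved_path) < 0)
    H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't resolve file path");

/* ... use resolved_path ... */

HDfree(resolved_path); /* the caller owns and frees the resolved path */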
+
+/*-------------------------------------------------------------------------
* Function: H5_close_subfiles
*
* Purpose: This is a simple wrapper function for the internal version
@@ -2046,35 +2619,39 @@ done:
*-------------------------------------------------------------------------
*/
herr_t
-H5_close_subfiles(int64_t subfiling_context_id)
+H5_close_subfiles(int64_t subfiling_context_id, MPI_Comm file_comm)
{
subfiling_context_t *sf_context = NULL;
MPI_Request barrier_req = MPI_REQUEST_NULL;
+ int mpi_size;
int mpi_code;
herr_t ret_value = SUCCEED;
if (NULL == (sf_context = H5_get_subfiling_object(subfiling_context_id)))
H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "couldn't get subfiling object from context ID");
- /* We make the subfile close operation collective.
- * Otherwise, there may be a race condition between
- * our closing the subfiles and the user application
- * moving ahead and possibly re-opening a file.
- *
- * If we can, we utilize an async barrier which gives
- * us the opportunity to reduce the CPU load due to
- * MPI spinning while waiting for the barrier to
- * complete. This is especially important if there
- * is heavy thread utilization due to subfiling
- * activities, i.e. the thread pool might be
- * extremely busy servicing I/O requests from all
- * HDF5 application ranks.
- */
-#if MPI_VERSION > 3 || (MPI_VERSION == 3 && MPI_SUBVERSION >= 1)
- {
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_comm, &mpi_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ /* We make the subfile close operation collective.
+ * Otherwise, there may be a race condition between
+ * our closing the subfiles and the user application
+ * moving ahead and possibly re-opening a file.
+ *
+ * If we can, we utilize an async barrier which gives
+ * us the opportunity to reduce the CPU load due to
+ * MPI spinning while waiting for the barrier to
+ * complete. This is especially important if there
+ * is heavy thread utilization due to subfiling
+ * activities, i.e. the thread pool might be
+ * extremely busy servicing I/O requests from all
+ * HDF5 application ranks.
+ */
+ if (mpi_size > 1) {
+#if H5_CHECK_MPI_VERSION(3, 1)
int barrier_complete = 0;
- if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(sf_context->sf_barrier_comm, &barrier_req)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(file_comm, &barrier_req)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Ibarrier failed", mpi_code);
while (!barrier_complete) {
@@ -2084,24 +2661,25 @@ H5_close_subfiles(int64_t subfiling_context_id)
if (MPI_SUCCESS != (mpi_code = MPI_Test(&barrier_req, &barrier_complete, MPI_STATUS_IGNORE)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Test failed", mpi_code);
}
- }
#else
- if (MPI_SUCCESS != (mpi_code = MPI_Barrier(sf_context->sf_barrier_comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
#endif
+ }
/* The map from file handle to subfiling context can now be cleared */
- if (sf_context->h5_file_handle != NULL) {
- clear_fid_map_entry(sf_context->h5_file_handle, sf_context->sf_context_id);
+ if (sf_context->h5_file_id != UINT64_MAX) {
+ clear_fid_map_entry(sf_context->h5_file_id, sf_context->sf_context_id);
}
if (sf_context->topology->rank_is_ioc) {
- if (sf_context->sf_fid >= 0) {
- errno = 0;
- if (HDclose(sf_context->sf_fid) < 0)
- H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "couldn't close subfile");
-
- sf_context->sf_fid = -1;
+ if (sf_context->sf_fids) {
+ for (int i = 0; i < sf_context->sf_num_fids; i++) {
+ errno = 0;
+ if (sf_context->sf_fids[i] >= 0 && HDclose(sf_context->sf_fids[i]) < 0)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "couldn't close subfile");
+ sf_context->sf_fids[i] = -1;
+ }
}
}
@@ -2110,11 +2688,11 @@ H5_close_subfiles(int64_t subfiling_context_id)
* and opening another file before this file is completely closed
* down.
*/
-#if MPI_VERSION > 3 || (MPI_VERSION == 3 && MPI_SUBVERSION >= 1)
- {
+ if (mpi_size > 1) {
+#if H5_CHECK_MPI_VERSION(3, 1)
int barrier_complete = 0;
- if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(sf_context->sf_barrier_comm, &barrier_req)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(file_comm, &barrier_req)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Ibarrier failed", mpi_code);
while (!barrier_complete) {
@@ -2124,24 +2702,213 @@ H5_close_subfiles(int64_t subfiling_context_id)
if (MPI_SUCCESS != (mpi_code = MPI_Test(&barrier_req, &barrier_complete, MPI_STATUS_IGNORE)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Test failed", mpi_code);
}
- }
#else
- if (MPI_SUCCESS != (mpi_code = MPI_Barrier(sf_context->sf_barrier_comm)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+#endif
+ }
+
+#ifdef H5_SUBFILING_DEBUG
+ if (sf_context->sf_logfile) {
+ struct tm *tm = NULL;
+ time_t cur_time;
+
+ cur_time = time(NULL);
+ tm = localtime(&cur_time);
+
+ H5_subfiling_log(sf_context->sf_context_id, "\n-- LOGGING FINISH - %s", asctime(tm));
+
+ HDfclose(sf_context->sf_logfile);
+ sf_context->sf_logfile = NULL;
+ }
#endif
done:
- if (sf_context && H5_free_subfiling_object_int(sf_context) < 0)
- H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTFREE, FAIL, "couldn't free subfiling context object");
+ H5_SUBFILING_FUNC_LEAVE;
+}
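A short sketch of the updated close call: the caller now passes the file's MPI communicator explicitly so the collective barriers run over the ranks that opened the file (the file_ptr fields shown are illustrative):

/* Collective across all ranks that hold the logical HDF5 file open */
if (H5_close_subfiles(file_ptr->context_id, file_ptr->comm) < 0)
    H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "failed to close subfiles");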
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfiling_set_config_prop
+ *
+ * Purpose: Sets the specified Subfiling VFD configuration as a
+ * property on the given FAPL pointer. The Subfiling VFD uses
+ * this property to pass its configuration down to the IOC VFD
+ * without needing each IOC VFD to include it as part of its
+ * public configuration.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_subfiling_set_config_prop(H5P_genplist_t *plist_ptr, const H5FD_subfiling_params_t *vfd_config)
+{
+ htri_t prop_exists = FAIL;
+ herr_t ret_value = SUCCEED;
+
+ if (!plist_ptr)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer");
+ if (!vfd_config)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling configuration pointer");
+
+ if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_CONFIG_PROP)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL,
+ "can't check if subfiling configuration property exists in FAPL");
+
+ if (prop_exists) {
+ if (H5P_set(plist_ptr, H5FD_SUBFILING_CONFIG_PROP, vfd_config) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL,
+ "can't set subfiling configuration property on FAPL");
+ }
+ else {
+ union {
+ const void *const_ptr_to_data;
+ void *ptr_to_data;
+ } eliminate_const_warning;
+
+ /*
+         * Cast away const since H5P_insert's signature for "value"
+         * doesn't match H5P_set's
+ */
+ eliminate_const_warning.const_ptr_to_data = vfd_config;
+
+ if (H5P_insert(plist_ptr, H5FD_SUBFILING_CONFIG_PROP, sizeof(H5FD_subfiling_params_t),
+ eliminate_const_warning.ptr_to_data, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL,
+ "unable to register subfiling configuration property in FAPL");
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
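The const-stripping union used above can be illustrated with a small standalone C program; this toy example is not HDF5-specific and only shows why the union avoids a cast-qual warning when an API wants a non-const pointer but the caller holds a const one:

#include <stdio.h>

static void takes_nonconst(void *p) { (void)p; }

int
main(void)
{
    const int value = 42;
    union {
        const void *const_ptr;
        void       *ptr;
    } u;

    u.const_ptr = &value;  /* store the const-qualified pointer...          */
    takes_nonconst(u.ptr); /* ...and read it back through the other member  */

    printf("%d\n", value);
    return 0;
}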
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfiling_get_config_prop
+ *
+ * Purpose: Retrieves the Subfiling VFD configuration from the given
+ * FAPL pointer. The Subfiling VFD uses this property to pass
+ * its configuration down to the IOC VFD without needing each
+ * IOC VFD to include it as part of its public configuration.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_subfiling_get_config_prop(H5P_genplist_t *plist_ptr, H5FD_subfiling_params_t *vfd_config)
+{
+ htri_t prop_exists = FAIL;
+ herr_t ret_value = SUCCEED;
+
+ if (!plist_ptr)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer");
+ if (!vfd_config)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling configuration pointer");
+
+ if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_CONFIG_PROP)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL,
+ "can't check if subfiling configuration property exists in FAPL");
+
+ if (prop_exists) {
+ if (H5P_get(plist_ptr, H5FD_SUBFILING_CONFIG_PROP, vfd_config) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL,
+ "can't get subfiling configuration property from FAPL");
+ }
+ else {
+ vfd_config->ioc_selection = SELECT_IOC_ONE_PER_NODE;
+ vfd_config->stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE;
+ vfd_config->stripe_count = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT;
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
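Taken together, the two property helpers form a simple round trip inside the VFD stack; a hedged sketch, assuming plist_ptr is an H5P_genplist_t * already resolved from the FAPL id:

H5FD_subfiling_params_t params;

params.ioc_selection = SELECT_IOC_ONE_PER_NODE;
params.stripe_size   = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE;
params.stripe_count  = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT;

/* Subfiling VFD side: stash the configuration on the FAPL */
if (H5_subfiling_set_config_prop(plist_ptr, &params) < 0)
    H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set subfiling config property");

/* IOC VFD side: read it back; defaults are returned if it was never set */
if (H5_subfiling_get_config_prop(plist_ptr, &params) < 0)
    H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get subfiling config property");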
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfiling_set_file_id_prop
+ *
+ * Purpose: Sets the specified file ID (Inode) value as a property on
+ * the given FAPL pointer. The Subfiling VFD uses this
+ * property to pass the HDF5 stub file ID value down to the
+ * IOC VFD.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_subfiling_set_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t file_id)
+{
+ htri_t prop_exists = FAIL;
+ herr_t ret_value = SUCCEED;
+
+ if (!plist_ptr)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer");
+ if (file_id == UINT64_MAX)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid file ID value");
+
+ if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL,
+ "can't check if file ID property exists in FAPL");
+
+ if (prop_exists) {
+ if (H5P_set(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID, &file_id) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set file ID property on FAPL");
+ }
+ else {
+ if (H5P_insert(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID, sizeof(uint64_t), &file_id, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL,
+ "unable to register file ID property in FAPL");
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfiling_get_file_id_prop
+ *
+ * Purpose: Retrieves the file ID (Inode) value from the given FAPL
+ * pointer. The Subfiling VFD uses this property to pass the
+ * HDF5 stub file ID value down to the IOC VFD.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_subfiling_get_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t *file_id)
+{
+ htri_t prop_exists = FAIL;
+ herr_t ret_value = SUCCEED;
+
+ if (!plist_ptr)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer");
+ if (!file_id)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL file ID pointer");
+ if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL,
+ "can't check if file ID property exists in FAPL");
+
+ if (prop_exists) {
+ if (H5P_get(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID, file_id) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get file ID property from FAPL");
+ }
+ else
+ *file_id = UINT64_MAX;
+
+done:
H5_SUBFILING_FUNC_LEAVE;
}
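Similarly, the file-ID property pair carries the stub file's inode-based ID from the Subfiling VFD down to the IOC VFD; a brief hedged sketch (plist_ptr and file_id are illustrative):

uint64_t file_id_out = UINT64_MAX;

/* Subfiling VFD: record the stub file's ID on the FAPL */
if (H5_subfiling_set_file_id_prop(plist_ptr, file_id) < 0)
    H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set file ID property");

/* IOC VFD: retrieve it; UINT64_MAX is returned if the property was never set */
if (H5_subfiling_get_file_id_prop(plist_ptr, &file_id_out) < 0)
    H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get file ID property");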
/*-------------------------------------------------------------------------
- * Function: H5_subfile_fhandle_to_context
+ * Function: H5_subfile_fid_to_context
*
* Purpose: This is a basic lookup function which returns the subfiling
- * context id associated with the specified file handle.
+ * context id associated with the specified file ID.
*
* Return: Non-negative subfiling context ID if the context exists
* Negative on failure or if the subfiling context doesn't
@@ -2155,7 +2922,7 @@ done:
*-------------------------------------------------------------------------
*/
int64_t
-H5_subfile_fhandle_to_context(void *file_handle)
+H5_subfile_fid_to_context(uint64_t file_id)
{
int64_t ret_value = -1;
@@ -2163,14 +2930,107 @@ H5_subfile_fhandle_to_context(void *file_handle)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, -1, "open file map is NULL");
for (int i = 0; i < sf_file_map_size; i++) {
- if (sf_open_file_map[i].file_handle == file_handle) {
+ if (sf_open_file_map[i].file_id == file_id) {
return sf_open_file_map[i].sf_context_id;
}
}
done:
H5_SUBFILING_FUNC_LEAVE;
-} /* end H5_subfile_fhandle_to_context() */
+} /* end H5_subfile_fid_to_context() */
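A small sketch of the renamed lookup in context, chaining it with H5_get_subfiling_object (variable names illustrative):

int64_t              context_id = -1;
subfiling_context_t *sf_context = NULL;

if ((context_id = H5_subfile_fid_to_context(file_id)) < 0)
    H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "no subfiling context for this file ID");

if (NULL == (sf_context = H5_get_subfiling_object(context_id)))
    H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't get subfiling context object");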
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfiling_validate_config
+ *
+ * Purpose: Checks that the given subfiling configuration parameters
+ * are valid
+ *
+ * Return: Non-negative on success/Negative on failure
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_subfiling_validate_config(const H5FD_subfiling_params_t *subf_config)
+{
+ H5FD_subfiling_ioc_select_t ioc_sel_type;
+ herr_t ret_value = SUCCEED;
+
+ if (!subf_config)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL subfiling configuration pointer");
+
+ /*
+ * Compare against each IOC selection value directly since
+ * the enum might be a signed or unsigned type and a comparison
+ * against < 0 could generate a warning
+ */
+ ioc_sel_type = subf_config->ioc_selection;
+ if (ioc_sel_type != SELECT_IOC_ONE_PER_NODE && ioc_sel_type != SELECT_IOC_EVERY_NTH_RANK &&
+ ioc_sel_type != SELECT_IOC_WITH_CONFIG && ioc_sel_type != SELECT_IOC_TOTAL)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid IOC selection method");
+
+ if (subf_config->stripe_size <= 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid stripe size");
+
+ if (subf_config->stripe_count <= 0 && subf_config->stripe_count != H5FD_SUBFILING_DEFAULT_STRIPE_COUNT)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid stripe count");
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
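A minimal sketch of validating a caller-supplied configuration before use; the concrete stripe size is an example value only:

H5FD_subfiling_params_t cfg;

cfg.ioc_selection = SELECT_IOC_ONE_PER_NODE;
cfg.stripe_size   = 32 * 1024 * 1024;                    /* 32 MiB stripes (example value)            */
cfg.stripe_count  = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT; /* special default accepted by the check     */

if (H5_subfiling_validate_config(&cfg) < 0)
    H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling configuration");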
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfiling_terminate
+ *
+ * Purpose: A cleanup routine to be called by the Subfiling VFD when
+ * it is terminating. Cleans up internal resources such as the
+ * context and topology caches.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_subfiling_terminate(void)
+{
+ herr_t ret_value = SUCCEED;
+
+ /* Clean up subfiling context and topology caches */
+ if (sf_context_cache) {
+ for (size_t i = 0; i < sf_context_cache_num_entries; i++) {
+ if (H5_free_subfiling_object_int(sf_context_cache[i]) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL,
+ "couldn't free subfiling context object");
+ sf_context_cache[i] = NULL;
+ }
+
+ sf_context_cache_size = 0;
+ sf_context_cache_num_entries = 0;
+
+ HDfree(sf_context_cache);
+ sf_context_cache = NULL;
+ }
+ if (sf_topology_cache) {
+ for (size_t i = 0; i < sf_topology_cache_num_entries; i++) {
+ if (H5_free_subfiling_topology(sf_topology_cache[i]) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL,
+ "couldn't free subfiling topology object");
+ sf_topology_cache[i] = NULL;
+ }
+
+ sf_topology_cache_size = 0;
+ sf_topology_cache_num_entries = 0;
+
+ HDfree(sf_topology_cache);
+ sf_topology_cache = NULL;
+ }
+
+ /* Clean up the file ID to context object mapping */
+ sf_file_map_size = 0;
+ HDfree(sf_open_file_map);
+ sf_open_file_map = NULL;
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
#ifdef H5_SUBFILING_DEBUG
void
diff --git a/src/H5FDsubfiling/H5subfiling_common.h b/src/H5FDsubfiling/H5subfiling_common.h
index 6e2965f..ba6dfdc 100644
--- a/src/H5FDsubfiling/H5subfiling_common.h
+++ b/src/H5FDsubfiling/H5subfiling_common.h
@@ -20,17 +20,49 @@
#include <stdatomic.h>
#include "H5private.h"
+#include "H5FDprivate.h"
#include "H5Iprivate.h"
+#include "H5Pprivate.h"
#include "H5FDsubfiling.h"
#include "H5FDioc.h"
+#ifndef PATH_MAX
+#define PATH_MAX 4096
+#endif
+
/*
* Some definitions for debugging the Subfiling feature
*/
/* #define H5_SUBFILING_DEBUG */
/*
+ * Some definitions for controlling performance across
+ * different machines where some types of MPI operations
+ * may be better optimized than others
+ */
+/* #define H5_SUBFILING_PREFER_ALLGATHER_TOPOLOGY */
+#ifndef H5_SUBFILING_PREFER_ALLGATHER_TOPOLOGY
+#if !H5_CHECK_MPI_VERSION(3, 0)
+#error "MPI 3 required for MPI_Comm_split_type"
+#endif
+#endif
+
+/*
+ * Name of the HDF5 FAPL property that the Subfiling VFD
+ * uses to pass its configuration down to the underlying
+ * IOC VFD
+ */
+#define H5FD_SUBFILING_CONFIG_PROP "H5FD_SUBFILING_CONFIG_PROP"
+
+/*
+ * Name of the HDF5 FAPL property that the Subfiling VFD
+ * uses to pass the HDF5 stub file's Inode value to the
+ * underlying IOC VFD
+ */
+#define H5FD_SUBFILING_STUB_FILE_ID "H5FD_SUBFILING_STUB_FILE_ID"
+
+/*
* MPI Tags are 32 bits, we treat them as unsigned
* to allow the use of the available bits for RPC
* selections, i.e. a message from the VFD read or write functions
@@ -80,8 +112,10 @@
/* MPI tag values for data communicator */
#define WRITE_INDEP_ACK 0
-#define READ_INDEP_DATA 1
-#define WRITE_TAG_BASE 2
+#define READ_INDEP_ACK 1
+#define READ_INDEP_DATA 2
+#define WRITE_DATA_DONE 3
+#define IO_TAG_BASE 4
/*
* Object type definitions for subfiling objects.
@@ -112,70 +146,70 @@ typedef enum io_ops {
LOGGING_OP = 16
} io_op_t;
-/* Every application rank will record their MPI rank
- * and hostid as a structure. These eventually get
- * communicated to MPI rank zero(0) and sorted before
- * being broadcast. The resulting sorted vector
- * provides a basis for determining which MPI ranks
- * will host an IO Concentrator (IOC), e.g. For
- * default behavior, we choose the first vector entry
- * associated with a "new" hostid.
+/*
+ * Every MPI rank in a file's communicator records its
+ * rank within the file communicator and its node-local
+ * rank within the node's communicator. The resulting
+ * information is then broadcast to all MPI ranks and
+ * provides the basis for determining which ranks will
+ * host an I/O concentrator.
*/
typedef struct {
- long rank;
- long hostid;
+ int rank;
+ int node_local_rank;
+ int node_local_size;
+ int node_lead_rank;
} layout_t;
-/* This typedef defines a fixed process layout which
+/*
+ * This typedef defines a fixed process layout which
* can be reused for any number of file open operations
*/
typedef struct app_layout_t {
- long hostid; /* value returned by gethostid() */
- layout_t *layout; /* Vector of {rank,hostid} values */
- int *node_ranks; /* ranks extracted from sorted layout */
- int node_count; /* Total nodes (different hostids) */
- int node_index; /* My node: index into node_ranks */
- int local_peers; /* How may local peers on my node */
- int world_rank; /* My MPI rank */
- int world_size; /* Total number of MPI ranks */
+ layout_t *layout; /* Array of (rank, node local rank, node local size) values */
+ int *node_ranks; /* Array of lowest MPI rank values on each node */
+ int node_count; /* Total number of nodes */
+ int world_rank; /* MPI rank in file communicator */
+ int world_size; /* Size of file communicator */
+ int node_local_rank; /* MPI rank on node */
+ int node_local_size; /* Size of node intra-communicator */
} app_layout_t;
/* This typedef defines things related to IOC selections */
typedef struct topology {
- app_layout_t *app_layout; /* Pointer to our layout struct */
- bool rank_is_ioc; /* Indicates that we host an IOC */
- int subfile_rank; /* Valid only if rank_is_ioc */
- int n_io_concentrators; /* Number of IO concentrators */
- int *io_concentrators; /* Vector of ranks which are IOCs */
- int *subfile_fd; /* file descriptor (if IOC) */
- H5FD_subfiling_ioc_select_t selection_type; /* Cache our IOC selection criteria */
+ app_layout_t *app_layout; /* Pointer to our layout struct */
+ MPI_Comm app_comm; /* MPI communicator for this topology */
+ bool rank_is_ioc; /* Indicates that we host an IOC */
+ int ioc_idx; /* Valid only if rank_is_ioc */
+ int n_io_concentrators; /* Number of I/O concentrators */
+ int *io_concentrators; /* Vector of ranks which are IOCs */
+ H5FD_subfiling_ioc_select_t selection_type; /* Cache our IOC selection criteria */
} sf_topology_t;
typedef struct {
int64_t sf_context_id; /* Generated context ID which embeds the cache index */
- uint64_t h5_file_id; /* GUID (basically the inode value) */
- void *h5_file_handle; /* Low-level handle for the HDF5 stub file */
- int sf_fid; /* value returned by open(file,..) */
- size_t sf_write_count; /* Statistics: write_count */
- size_t sf_read_count; /* Statistics: read_count */
- haddr_t sf_eof; /* File eof */
- int64_t sf_stripe_size; /* Stripe-depth */
- int64_t sf_blocksize_per_stripe; /* Stripe-depth X n_IOCs */
- int64_t sf_base_addr; /* For an IOC, our base address */
- MPI_Comm sf_msg_comm; /* MPI comm used to send RPC msg */
- MPI_Comm sf_data_comm; /* MPI comm used to move data */
- MPI_Comm sf_eof_comm; /* MPI comm used to communicate EOF */
- MPI_Comm sf_barrier_comm; /* MPI comm used for barrier operations */
- MPI_Comm sf_group_comm; /* Not used: for IOC collectives */
- MPI_Comm sf_intercomm; /* Not used: for msgs to all IOC */
- int sf_group_size; /* IOC count (in sf_group_comm) */
- int sf_group_rank; /* IOC rank (in sf_group_comm) */
- int sf_intercomm_root; /* Not used: for IOC comms */
- char *subfile_prefix; /* If subfiles are node-local */
- char *sf_filename; /* A generated subfile name */
- char *h5_filename; /* The user supplied file name */
- void *ioc_data; /* Private data for underlying IOC */
- sf_topology_t *topology; /* pointer to our topology */
+ uint64_t h5_file_id; /* GUID (basically the inode value) */
+ int *sf_fids; /* Array of file IDs for subfiles this rank owns */
+ int sf_num_fids; /* Number of subfiles this rank owns */
+ int sf_num_subfiles; /* Total number of subfiles for logical HDF5 file */
+ size_t sf_write_count; /* Statistics: write_count */
+ size_t sf_read_count; /* Statistics: read_count */
+ haddr_t sf_eof; /* File eof */
+ int64_t sf_stripe_size; /* Stripe-depth */
+ int64_t sf_blocksize_per_stripe; /* Stripe-depth X n_IOCs */
+ int64_t sf_base_addr; /* For an IOC, our base address */
+ MPI_Comm sf_msg_comm; /* MPI comm used to send RPC msg */
+ MPI_Comm sf_data_comm; /* MPI comm used to move data */
+ MPI_Comm sf_eof_comm; /* MPI comm used to communicate EOF */
+ MPI_Comm sf_node_comm; /* MPI comm used for intra-node comms */
+ MPI_Comm sf_group_comm; /* Not used: for IOC collectives */
+ int sf_group_size; /* IOC count (in sf_group_comm) */
+ int sf_group_rank; /* IOC rank (in sf_group_comm) */
+ char *subfile_prefix; /* If subfiles are node-local */
+ char *h5_filename; /* The user supplied file name */
+ void *ioc_data; /* Private data for underlying IOC */
+ sf_topology_t *topology; /* Pointer to our topology */
#ifdef H5_SUBFILING_DEBUG
char sf_logfile_name[PATH_MAX];
@@ -189,30 +223,45 @@ typedef struct {
* an easy gathering of statistics by the IO Concentrator.
*/
typedef struct {
- /* {Datasize, Offset, FileID} */
- int64_t header[3]; /* The basic RPC input plus */
- int tag; /* the supplied OPCODE tag */
- int source; /* Rank of who sent the message */
- int subfile_rank; /* The IOC rank */
- int64_t context_id; /* context to be used to complete */
- double start_time; /* the request, + time of receipt */
- /* from which we calc Time(queued) */
+ int64_t header[3]; /* The basic RPC input */
+ int tag; /* the supplied OPCODE tag */
+ int source; /* Rank of who sent the message */
+ int ioc_idx; /* The IOC rank */
+ int64_t context_id; /* context to be used to complete */
+ double start_time; /* the request, + time of receipt */
+ /* from which we calc Time(queued) */
} sf_work_request_t;
+/* MPI Datatype used to send/receive an RPC message */
+extern MPI_Datatype H5_subfiling_rpc_msg_type;
+
#ifdef __cplusplus
extern "C" {
#endif
-H5_DLL herr_t H5_open_subfiles(const char *base_filename, void *h5_file_handle,
- H5FD_subfiling_shared_config_t *subfiling_config, int file_acc_flags,
+H5_DLL herr_t H5_open_subfiling_stub_file(const char *name, unsigned flags, MPI_Comm file_comm,
+ H5FD_t **file_ptr, uint64_t *file_id);
+H5_DLL herr_t H5_open_subfiles(const char *base_filename, uint64_t file_id,
+ H5FD_subfiling_params_t *subfiling_config, int file_acc_flags,
MPI_Comm file_comm, int64_t *context_id_out);
-H5_DLL herr_t H5_close_subfiles(int64_t subfiling_context_id);
+H5_DLL herr_t H5_close_subfiles(int64_t subfiling_context_id, MPI_Comm file_comm);
-H5_DLL int64_t H5_new_subfiling_object_id(sf_obj_type_t obj_type, int64_t index_val);
+H5_DLL int64_t H5_new_subfiling_object_id(sf_obj_type_t obj_type);
H5_DLL void *H5_get_subfiling_object(int64_t object_id);
-H5_DLL int64_t H5_subfile_fhandle_to_context(void *file_handle);
-H5_DLL herr_t H5_free_subfiling_object(int64_t object_id);
-H5_DLL herr_t H5_get_num_iocs_from_config_file(FILE *config_file, int *n_io_concentrators);
+H5_DLL herr_t H5_get_subfiling_config_from_file(FILE *config_file, int64_t *stripe_size,
+ int64_t *num_subfiles);
+H5_DLL herr_t H5_resolve_pathname(const char *filepath, MPI_Comm comm, char **resolved_filepath);
+
+H5_DLL herr_t H5_subfiling_set_config_prop(H5P_genplist_t *plist_ptr,
+ const H5FD_subfiling_params_t *vfd_config);
+H5_DLL herr_t H5_subfiling_get_config_prop(H5P_genplist_t *plist_ptr, H5FD_subfiling_params_t *vfd_config);
+H5_DLL herr_t H5_subfiling_set_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t file_id);
+H5_DLL herr_t H5_subfiling_get_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t *file_id);
+H5_DLL int64_t H5_subfile_fid_to_context(uint64_t file_id);
+
+H5_DLL herr_t H5_subfiling_validate_config(const H5FD_subfiling_params_t *subf_config);
+
+H5_DLL herr_t H5_subfiling_terminate(void);
H5_DLL void H5_subfiling_log(int64_t sf_context_id, const char *fmt, ...);