summaryrefslogtreecommitdiffstats
path: root/src/H5FDsubfiling/H5subfiling_common.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/H5FDsubfiling/H5subfiling_common.c')
-rw-r--r--src/H5FDsubfiling/H5subfiling_common.c3070
1 files changed, 3070 insertions, 0 deletions
diff --git a/src/H5FDsubfiling/H5subfiling_common.c b/src/H5FDsubfiling/H5subfiling_common.c
new file mode 100644
index 0000000..3e83cf5
--- /dev/null
+++ b/src/H5FDsubfiling/H5subfiling_common.c
@@ -0,0 +1,3070 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the COPYING file, which can be found at the root of the source code *
+ * distribution tree, or in https://www.hdfgroup.org/licenses. *
+ * If you do not have access to either file, you may request a copy from *
+ * help@hdfgroup.org. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*
+ * Generic code for integrating an HDF5 VFD with the subfiling feature
+ */
+
+#include "H5subfiling_common.h"
+#include "H5subfiling_err.h"
+
+#include "H5MMprivate.h"
+
+typedef struct { /* Format of a context map entry */
+ uint64_t file_id; /* key value (linear search of the cache) */
+ int64_t sf_context_id; /* The return value if matching file_handle */
+} file_map_to_context_t;
+
+/* Identifiers for HDF5's error API */
+hid_t H5subfiling_err_stack_g = H5I_INVALID_HID;
+hid_t H5subfiling_err_class_g = H5I_INVALID_HID;
+char H5subfiling_mpi_error_str[MPI_MAX_ERROR_STRING];
+int H5subfiling_mpi_error_str_len;
+
+/* MPI Datatype used to send/receive an RPC message */
+MPI_Datatype H5_subfiling_rpc_msg_type = MPI_DATATYPE_NULL;
+
+static subfiling_context_t **sf_context_cache = NULL;
+static sf_topology_t **sf_topology_cache = NULL;
+
+static size_t sf_context_cache_size = 0;
+static size_t sf_topology_cache_size = 0;
+static size_t sf_context_cache_num_entries = 0;
+static size_t sf_topology_cache_num_entries = 0;
+
+static file_map_to_context_t *sf_open_file_map = NULL;
+static int sf_file_map_size = 0;
+
+#define DEFAULT_CONTEXT_CACHE_SIZE 16
+#define DEFAULT_TOPOLOGY_CACHE_SIZE 4
+#define DEFAULT_FILE_MAP_ENTRIES 8
+
+static herr_t H5_free_subfiling_object(int64_t object_id);
+static herr_t H5_free_subfiling_object_int(subfiling_context_t *sf_context);
+static herr_t H5_free_subfiling_topology(sf_topology_t *topology);
+
+static herr_t init_subfiling(const char *base_filename, uint64_t file_id,
+ H5FD_subfiling_params_t *subfiling_config, int file_acc_flags, MPI_Comm comm,
+ int64_t *context_id_out);
+static herr_t init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm,
+ sf_topology_t **app_topology_out);
+static herr_t get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type,
+ char **ioc_sel_info_str);
+static herr_t find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config,
+ long iocs_per_node, sf_topology_t **app_topology);
+static herr_t init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm);
+static herr_t gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm);
+static int compare_layout_nodelocal(const void *layout1, const void *layout2);
+static herr_t identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride);
+static herr_t init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename,
+ uint64_t file_id, H5FD_subfiling_params_t *subfiling_config,
+ sf_topology_t *app_topology, MPI_Comm file_comm);
+static herr_t open_subfile_with_context(subfiling_context_t *sf_context, int file_acc_flags);
+static herr_t record_fid_to_subfile(uint64_t file_id, int64_t subfile_context_id, int *next_index);
+static void clear_fid_map_entry(uint64_t file_id, int64_t sf_context_id);
+static herr_t ioc_open_files(int64_t file_context_id, int file_acc_flags);
+static herr_t create_config_file(subfiling_context_t *sf_context, const char *base_filename,
+ const char *subfile_dir, hbool_t truncate_if_exists);
+static herr_t open_config_file(const char *base_filename, const char *subfile_dir, uint64_t file_id,
+ const char *mode, FILE **config_file_out);
+
+/*-------------------------------------------------------------------------
+ * Function: H5_new_subfiling_object_id
+ *
+ * Purpose: Given a subfiling object type and an index value, generates
+ * a new subfiling object ID.
+ *
+ * Return: Non-negative object ID on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+int64_t
+H5_new_subfiling_object_id(sf_obj_type_t obj_type)
+{
+ int64_t index_val = 0;
+
+ if (obj_type == SF_CONTEXT) {
+ index_val = (int64_t)sf_context_cache_num_entries;
+ }
+ else if (obj_type == SF_TOPOLOGY) {
+ index_val = (int64_t)sf_topology_cache_num_entries;
+ }
+ else
+ return -1;
+
+ if (index_val < 0)
+ return -1;
+
+ return (((int64_t)obj_type << 32) | index_val);
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_get_subfiling_object
+ *
+ * Purpose: Given a subfiling object ID, returns a pointer to the
+ * underlying object, which can be either a subfiling context
+ * object (subfiling_context_t) or a subfiling topology
+ * object (sf_topology_t).
+ *
+ * A subfiling object ID contains the object type in the upper
+ * 32 bits and an index value in the lower 32 bits.
+ *
+ * Subfiling contexts are 1 per open file. If only one file is
+ * open at a time, then only a single subfiling context cache
+ * entry will be used.
+ *
+ * Return: Pointer to underlying subfiling object if subfiling object
+ * ID is valid
+ *
+ * NULL if subfiling object ID is invalid or an internal
+ * failure occurs
+ *
+ *-------------------------------------------------------------------------
+ */
+void *
+H5_get_subfiling_object(int64_t object_id)
+{
+ int64_t obj_type = (object_id >> 32) & 0x0FFFF;
+ int64_t obj_index = object_id & 0x0FFFF;
+ void *ret_value = NULL;
+
+ if (obj_index < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL,
+ "invalid object index for subfiling object ID %" PRId64, object_id);
+
+ if (obj_type == SF_CONTEXT) {
+ /* Contexts provide information principally about
+ * the application and how the data layout is managed
+ * over some number of subfiles. The important
+ * parameters are the number of subfiles (or in the
+ * context of IOCs, the MPI ranks and counts of the
+ * processes which host an I/O Concentrator. We
+ * also provide a map of IOC rank to MPI rank
+ * to facilitate the communication of I/O requests.
+ */
+
+ /* Create subfiling context cache if it doesn't exist */
+ if (!sf_context_cache) {
+ if (NULL == (sf_context_cache = HDcalloc(DEFAULT_CONTEXT_CACHE_SIZE, sizeof(*sf_context_cache))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL,
+ "couldn't allocate space for subfiling context cache");
+ sf_context_cache_size = DEFAULT_CONTEXT_CACHE_SIZE;
+ sf_context_cache_num_entries = 0;
+ }
+
+ /* Make more space in context cache if needed */
+ if ((size_t)obj_index >= sf_context_cache_size) {
+ size_t old_num_entries;
+ size_t new_size;
+ void *tmp_realloc;
+
+ old_num_entries = sf_context_cache_num_entries;
+
+ new_size = (sf_context_cache_size * 3) / 2;
+
+ if (NULL == (tmp_realloc = HDrealloc(sf_context_cache, new_size * sizeof(*sf_context_cache))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL,
+ "couldn't allocate space for subfiling context cache");
+
+ sf_context_cache = tmp_realloc;
+ sf_context_cache_size = new_size;
+
+ /* Clear newly-allocated entries */
+ HDmemset(&sf_context_cache[old_num_entries], 0,
+ (sf_context_cache_size - old_num_entries) * sizeof(*sf_context_cache));
+
+ /*
+ * If we had to make more space, the given object index
+ * should always fall within range after a single re-allocation
+ */
+ HDassert((size_t)obj_index < sf_context_cache_size);
+ }
+
+ /*
+ * Since this cache currently just keeps all entries until
+ * application exit, context entry indices should just be
+ * consecutive
+ */
+ HDassert((size_t)obj_index <= sf_context_cache_num_entries);
+ if ((size_t)obj_index < sf_context_cache_num_entries)
+ ret_value = sf_context_cache[obj_index];
+ else {
+ HDassert(!sf_context_cache[sf_context_cache_num_entries]);
+
+ /* Allocate a new subfiling context object */
+ if (NULL == (ret_value = HDcalloc(1, sizeof(subfiling_context_t))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL,
+ "couldn't allocate subfiling context object");
+
+ sf_context_cache[sf_context_cache_num_entries++] = ret_value;
+ }
+ }
+ else if (obj_type == SF_TOPOLOGY) {
+ /* Create subfiling topology cache if it doesn't exist */
+ if (!sf_topology_cache) {
+ if (NULL ==
+ (sf_topology_cache = HDcalloc(DEFAULT_TOPOLOGY_CACHE_SIZE, sizeof(*sf_topology_cache))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL,
+ "couldn't allocate space for subfiling topology cache");
+ sf_topology_cache_size = DEFAULT_TOPOLOGY_CACHE_SIZE;
+ sf_topology_cache_num_entries = 0;
+ }
+
+ /* Make more space in topology cache if needed */
+ if ((size_t)obj_index >= sf_topology_cache_size) {
+ size_t old_num_entries;
+ size_t new_size;
+ void *tmp_realloc;
+
+ old_num_entries = sf_topology_cache_num_entries;
+
+ new_size = (sf_topology_cache_size * 3) / 2;
+
+ if (NULL == (tmp_realloc = HDrealloc(sf_topology_cache, new_size * sizeof(*sf_topology_cache))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL,
+ "couldn't allocate space for subfiling topology cache");
+
+ sf_topology_cache = tmp_realloc;
+ sf_topology_cache_size = new_size;
+
+ /* Clear newly-allocated entries */
+ HDmemset(&sf_topology_cache[old_num_entries], 0,
+ (sf_topology_cache_size - old_num_entries) * sizeof(*sf_topology_cache));
+
+ /*
+ * If we had to make more space, the given object index
+ * should always fall within range after a single re-allocation
+ */
+ HDassert((size_t)obj_index < sf_topology_cache_size);
+ }
+
+ /*
+ * Since this cache currently just keeps all entries until
+ * application exit, topology entry indices should just be
+ * consecutive
+ */
+ HDassert((size_t)obj_index <= sf_topology_cache_num_entries);
+ if ((size_t)obj_index < sf_topology_cache_num_entries)
+ ret_value = sf_topology_cache[obj_index];
+ else {
+ HDassert(!sf_topology_cache[sf_topology_cache_num_entries]);
+
+ /* Allocate a new subfiling topology object */
+ if (NULL == (ret_value = HDmalloc(sizeof(sf_topology_t))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL,
+ "couldn't allocate subfiling topology object");
+
+ sf_topology_cache[sf_topology_cache_num_entries++] = ret_value;
+ }
+ }
+#ifdef H5_SUBFILING_DEBUG
+ else
+ HDprintf("%s: Unknown subfiling object type for ID %" PRId64 "\n", __func__, object_id);
+#endif
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_free_subfiling_object
+ *
+ * Purpose: Frees the underlying subfiling object for a given subfiling
+ * object ID.
+ *
+ * NOTE: Currently we assume that all created subfiling
+ * objects are cached in the (very simple) context/topology
+ * cache until application exit, so the only time a subfiling
+ * object should be freed by this routine is if something
+ * fails right after creating one. Otherwise, the internal
+ * indexing for the relevant cache will be invalid.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5_free_subfiling_object(int64_t object_id)
+{
+ int64_t obj_type = (object_id >> 32) & 0x0FFFF;
+ herr_t ret_value = SUCCEED;
+
+ if (obj_type == SF_CONTEXT) {
+ subfiling_context_t *sf_context;
+
+ if (NULL == (sf_context = H5_get_subfiling_object(object_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "couldn't get subfiling context for subfiling object ID");
+
+ if (H5_free_subfiling_object_int(sf_context) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object");
+
+ HDassert(sf_context_cache_num_entries > 0);
+ HDassert(sf_context == sf_context_cache[sf_context_cache_num_entries - 1]);
+ sf_context_cache[sf_context_cache_num_entries - 1] = NULL;
+ sf_context_cache_num_entries--;
+ }
+ else {
+ sf_topology_t *sf_topology;
+
+ HDassert(obj_type == SF_TOPOLOGY);
+
+ if (NULL == (sf_topology = H5_get_subfiling_object(object_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "couldn't get subfiling context for subfiling object ID");
+
+ if (H5_free_subfiling_topology(sf_topology) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling topology");
+
+ HDassert(sf_topology_cache_num_entries > 0);
+ HDassert(sf_topology == sf_topology_cache[sf_topology_cache_num_entries - 1]);
+ sf_topology_cache[sf_topology_cache_num_entries - 1] = NULL;
+ sf_topology_cache_num_entries--;
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+static herr_t
+H5_free_subfiling_object_int(subfiling_context_t *sf_context)
+{
+ HDassert(sf_context);
+
+ sf_context->sf_context_id = -1;
+ sf_context->h5_file_id = UINT64_MAX;
+ sf_context->sf_num_fids = 0;
+ sf_context->sf_num_subfiles = -1;
+ sf_context->sf_write_count = 0;
+ sf_context->sf_read_count = 0;
+ sf_context->sf_eof = HADDR_UNDEF;
+ sf_context->sf_stripe_size = -1;
+ sf_context->sf_blocksize_per_stripe = -1;
+ sf_context->sf_base_addr = -1;
+
+ if (sf_context->sf_msg_comm != MPI_COMM_NULL) {
+ if (H5_mpi_comm_free(&sf_context->sf_msg_comm) < 0)
+ return FAIL;
+ sf_context->sf_msg_comm = MPI_COMM_NULL;
+ }
+ if (sf_context->sf_data_comm != MPI_COMM_NULL) {
+ if (H5_mpi_comm_free(&sf_context->sf_data_comm) < 0)
+ return FAIL;
+ sf_context->sf_data_comm = MPI_COMM_NULL;
+ }
+ if (sf_context->sf_eof_comm != MPI_COMM_NULL) {
+ if (H5_mpi_comm_free(&sf_context->sf_eof_comm) < 0)
+ return FAIL;
+ sf_context->sf_eof_comm = MPI_COMM_NULL;
+ }
+ if (sf_context->sf_node_comm != MPI_COMM_NULL) {
+ if (H5_mpi_comm_free(&sf_context->sf_node_comm) < 0)
+ return FAIL;
+ sf_context->sf_node_comm = MPI_COMM_NULL;
+ }
+ if (sf_context->sf_group_comm != MPI_COMM_NULL) {
+ if (H5_mpi_comm_free(&sf_context->sf_group_comm) < 0)
+ return FAIL;
+ sf_context->sf_group_comm = MPI_COMM_NULL;
+ }
+
+ sf_context->sf_group_size = -1;
+ sf_context->sf_group_rank = -1;
+
+ HDfree(sf_context->subfile_prefix);
+ sf_context->subfile_prefix = NULL;
+
+ HDfree(sf_context->h5_filename);
+ sf_context->h5_filename = NULL;
+
+ HDfree(sf_context->sf_fids);
+ sf_context->sf_fids = NULL;
+
+ /*
+ * Currently we assume that all created application topology
+ * objects are cached until application exit and may be shared
+ * among multiple subfiling contexts, so we free them separately
+ * from here to avoid issues with stale pointers.
+ */
+ sf_context->topology = NULL;
+
+ HDfree(sf_context);
+
+ return SUCCEED;
+}
+
+static herr_t
+H5_free_subfiling_topology(sf_topology_t *topology)
+{
+ herr_t ret_value = SUCCEED;
+
+ HDassert(topology);
+
+#ifndef NDEBUG
+ {
+ hbool_t topology_cached = FALSE;
+
+ /* Make sure this application topology object is in the cache */
+ for (size_t i = 0; i < sf_topology_cache_num_entries; i++)
+ if (topology == sf_topology_cache[i])
+ topology_cached = TRUE;
+ HDassert(topology_cached);
+ }
+#endif
+
+ topology->ioc_idx = -1;
+ topology->n_io_concentrators = 0;
+
+ if (topology->app_layout) {
+ HDfree(topology->app_layout->layout);
+ topology->app_layout->layout = NULL;
+
+ HDfree(topology->app_layout->node_ranks);
+ topology->app_layout->node_ranks = NULL;
+
+ HDfree(topology->app_layout);
+ }
+
+ topology->app_layout = NULL;
+
+ HDfree(topology->io_concentrators);
+ topology->io_concentrators = NULL;
+
+ if (H5_mpi_comm_free(&topology->app_comm) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
+
+ HDfree(topology);
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_open_subfiling_stub_file
+ *
+ * Purpose: Opens the stub file for an HDF5 file created with the
+ * Subfiling VFD. This stub file only contains some superblock
+ * metadata that can allow HDF5 applications to determine that
+ * the file is an HDF5 file and was created with the Subfiling
+ * VFD.
+ *
+ * This routine is collective across `file_comm`; once the
+ * stub file has been opened, the inode value for the file is
+ * retrieved and broadcasted to all MPI ranks in `file_comm`
+ * for future use.
+ *
+ * To avoid unnecessary overhead from a large-scale file open,
+ * this stub file is currently only opened on MPI rank 0. Note
+ * that this assumes that all the relevant metadata will be
+ * written from MPI rank 0. This should be fine for now since
+ * the HDF file signature and Subfiling driver info is really
+ * all that's needed, but this should be revisited since the
+ * file metadata can and will come from other MPI ranks as
+ * well.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_open_subfiling_stub_file(const char *name, unsigned flags, MPI_Comm file_comm, H5FD_t **file_ptr,
+ uint64_t *file_id)
+{
+ H5P_genplist_t *plist = NULL;
+ uint64_t stub_file_id = UINT64_MAX;
+ hbool_t bcasted_inode = FALSE;
+ H5FD_t *stub_file = NULL;
+ hid_t fapl_id = H5I_INVALID_HID;
+ int mpi_rank = 0;
+ int mpi_size = 1;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ if (!name)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling stub file name");
+ if (file_comm == MPI_COMM_NULL)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid MPI communicator");
+ if (!file_id)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL file ID pointer");
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(file_comm, &mpi_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_comm, &mpi_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ if (!file_ptr && (mpi_rank == 0))
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL stub file pointer");
+
+ /* Open stub file on MPI rank 0 only */
+ if (mpi_rank == 0) {
+ h5_stat_t st;
+ MPI_Comm stub_comm = MPI_COMM_SELF;
+ MPI_Info stub_info = MPI_INFO_NULL;
+
+ if ((fapl_id = H5P_create_id(H5P_CLS_FILE_ACCESS_g, FALSE)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL, "can't create FAPL for stub file");
+ if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access property list");
+
+ /* Use MPI I/O driver for stub file to allow access to vector I/O */
+ if (H5P_set(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &stub_comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI communicator");
+ if (H5P_set(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &stub_info) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI info object");
+ if (H5P_set_driver(plist, H5FD_MPIO, NULL, NULL) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI I/O driver on FAPL");
+
+ if (NULL == (stub_file = H5FD_open(name, flags, fapl_id, HADDR_UNDEF)))
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "couldn't open HDF5 stub file");
+
+ HDcompile_assert(sizeof(uint64_t) >= sizeof(ino_t));
+
+ /* Retrieve Inode value for stub file */
+ if (HDstat(name, &st) < 0) {
+ stub_file_id = UINT64_MAX;
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "couldn't stat HDF5 stub file, errno = %d, error message = '%s'", errno,
+ HDstrerror(errno));
+ }
+ else
+ stub_file_id = (uint64_t)st.st_ino;
+ }
+
+ bcasted_inode = TRUE;
+
+ if (mpi_size > 1) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&stub_file_id, 1, MPI_UINT64_T, 0, file_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+
+ if (stub_file_id == UINT64_MAX)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "couldn't get inode value for HDF5 stub file");
+
+ if (file_ptr)
+ *file_ptr = stub_file;
+ *file_id = stub_file_id;
+
+done:
+ if (fapl_id >= 0 && H5I_dec_ref(fapl_id) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_ID, H5E_CANTDEC, FAIL, "can't close FAPL ID");
+
+ if (ret_value < 0) {
+ if (!bcasted_inode && (mpi_size > 1)) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&stub_file_id, 1, MPI_UINT64_T, 0, file_comm)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+ if (stub_file) {
+ if (H5FD_close(stub_file) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "couldn't close HDF5 stub file");
+ }
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_open_subfiles
+ *
+ * Purpose: Wrapper for the internal 'open__subfiles' function
+ * Similar to the other public wrapper functions, we
+ * discover (via the sf_context) the number of io concentrators
+ * and pass that to the internal function so that vector
+ * storage arrays can be stack based rather than explicitly
+ * allocated and freed.
+ *
+ * The Internal function is responsible for sending all IOC
+ * instances, the (sub)file open requests.
+ *
+ * Prior to calling the internal open function, we initialize
+ * a new subfiling context that contains topology info and
+ * new MPI communicators that facilitate messaging between
+ * HDF5 clients and the IOCs.
+ *
+ * Return: Success (0) or Faiure (non-zero)
+ * Errors: If MPI operations fail for some reason.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_open_subfiles(const char *base_filename, uint64_t file_id, H5FD_subfiling_params_t *subfiling_config,
+ int file_acc_flags, MPI_Comm file_comm, int64_t *context_id_out)
+{
+ subfiling_context_t *sf_context = NULL;
+ int64_t context_id = -1;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ if (!base_filename)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "invalid subfiling base filename");
+
+ if (!subfiling_config)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "invalid subfiling configuration");
+
+ if (!context_id_out)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "invalid subfiling context ID pointer");
+
+ /* Initialize new subfiling context ID based on configuration information */
+ if (init_subfiling(base_filename, file_id, subfiling_config, file_acc_flags, file_comm, &context_id) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize subfiling context");
+
+ /* Retrieve the subfiling object for the newly-created context ID */
+ if (NULL == (sf_context = H5_get_subfiling_object(context_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't get subfiling object from context ID");
+
+ /*
+ * If we're actually using the IOCs, we will
+ * start the service threads on the identified
+ * ranks as part of the subfile opening.
+ */
+ if (open_subfile_with_context(sf_context, file_acc_flags) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "couldn't open subfiling subfiles");
+
+#ifdef H5_SUBFILING_DEBUG
+ {
+ struct tm *tm = NULL;
+ time_t cur_time;
+ int mpi_rank;
+
+ /* Open debugging logfile */
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(file_comm, &mpi_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+
+ HDsnprintf(sf_context->sf_logfile_name, PATH_MAX, "%s.log.%d", sf_context->h5_filename, mpi_rank);
+
+ if (NULL == (sf_context->sf_logfile = HDfopen(sf_context->sf_logfile_name, "a")))
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL,
+ "couldn't open subfiling debug logfile");
+
+ cur_time = time(NULL);
+ tm = localtime(&cur_time);
+
+ H5_subfiling_log(context_id, "-- LOGGING BEGIN - %s", asctime(tm));
+ }
+#endif
+
+ *context_id_out = context_id;
+
+done:
+ /*
+ * Form consensus on whether opening subfiles was
+ * successful
+ */
+ {
+ int mpi_size = -1;
+ int err_result = (ret_value < 0);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_comm, &mpi_size)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ if (mpi_size > 1) {
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Allreduce(MPI_IN_PLACE, &err_result, 1, MPI_INT, MPI_MAX, file_comm)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Allreduce failed", mpi_code);
+ }
+
+ if (err_result)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL,
+ "one or more IOC ranks couldn't open subfiles");
+ }
+
+ if (ret_value < 0) {
+ clear_fid_map_entry(file_id, context_id);
+
+ if (context_id >= 0 && H5_free_subfiling_object(context_id) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object");
+
+ *context_id_out = -1;
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*
+-------------------------------------------------------------------------
+ Programmer: Richard Warren
+ Purpose: Called as part of a file open operation, we initialize a
+ subfiling context which includes the application topology
+ along with other relevant info such as the MPI objects
+ (communicators) for communicating with IO concentrators.
+ We also identify which MPI ranks will have IOC threads
+ started on them.
+
+ We return a context ID via the 'sf_context' variable.
+
+ Errors: returns an error if we detect any initialization errors,
+ including malloc failures or any resource allocation
+ problems.
+
+ Revision History -- Initial implementation
+-------------------------------------------------------------------------
+*/
+static herr_t
+init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_params_t *subfiling_config,
+ int file_acc_flags, MPI_Comm comm, int64_t *context_id_out)
+{
+ subfiling_context_t *new_context = NULL;
+ sf_topology_t *app_topology = NULL;
+ MPI_Comm node_comm = MPI_COMM_NULL;
+ int64_t context_id = -1;
+ FILE *config_file = NULL;
+ char *file_basename = NULL;
+ char *subfile_dir = NULL;
+ int mpi_rank;
+ int mpi_size;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(context_id_out);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ /* Use the file's index to create a new subfiling context ID */
+ if ((context_id = H5_new_subfiling_object_id(SF_CONTEXT)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't create new subfiling context ID");
+
+ /* Create a new subfiling context object with the created context ID */
+ if (NULL == (new_context = H5_get_subfiling_object(context_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't create new subfiling object");
+ new_context->sf_context_id = -1;
+ new_context->topology = NULL;
+ new_context->sf_msg_comm = MPI_COMM_NULL;
+ new_context->sf_data_comm = MPI_COMM_NULL;
+ new_context->sf_eof_comm = MPI_COMM_NULL;
+ new_context->sf_node_comm = MPI_COMM_NULL;
+ new_context->sf_group_comm = MPI_COMM_NULL;
+
+ /*
+ * If there's an existing subfiling configuration file for
+ * this file, read the stripe size and number of subfiles
+ * from it
+ */
+ if (0 == (file_acc_flags & O_CREAT)) {
+ int64_t config[2] = {0, 0}; /* {stripe size, num subfiles} */
+
+ if (mpi_rank == 0) {
+ /* TODO: currently no support for subfile prefix */
+ if (H5_dirname(base_filename, &subfile_dir) < 0)
+ config[0] = -1;
+
+ if (config[0] >= 0) {
+ if (H5_basename(base_filename, &file_basename) < 0)
+ config[0] = -1;
+ }
+
+ if (config[0] >= 0) {
+ if (open_config_file(file_basename, subfile_dir, file_id, "r", &config_file) < 0)
+ config[0] = -1;
+ }
+
+ if (config[0] >= 0) {
+ if (!config_file)
+ config[0] = -2; /* No config file; use setting from configuration */
+ else {
+ /*
+ * If a subfiling configuration file exists and we aren't truncating
+ * it, read the number of subfiles used at file creation time.
+ */
+ if (H5_get_subfiling_config_from_file(config_file, &config[0], &config[1]) < 0)
+ config[0] = -1;
+ }
+ }
+ }
+
+ if (mpi_size > 1) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(config, 2, MPI_INT64_T, 0, comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+
+ /*
+ * Override the stripe size and stripe count settings in the
+ * application's subfiling configuration if we read values
+ * from an existing subfiling configuration file
+ */
+ if (config[0] == -1)
+ H5_SUBFILING_GOTO_ERROR(
+ H5E_FILE, H5E_CANTOPENFILE, FAIL,
+ "lead process couldn't read the number of subfiles from subfiling configuration file");
+ else {
+ if (config[0] > 0)
+ subfiling_config->stripe_size = config[0];
+ if (config[1] > 0) {
+ H5_CHECK_OVERFLOW(config[1], int64_t, int32_t);
+ subfiling_config->stripe_count = (int32_t)config[1];
+ }
+ }
+ }
+ else {
+ char *env_value = NULL;
+
+ /* Check for a subfiling stripe size setting from the environment */
+ if ((env_value = HDgetenv(H5FD_SUBFILING_STRIPE_SIZE))) {
+ long long stripe_size = -1;
+
+ errno = 0;
+
+ stripe_size = HDstrtoll(env_value, NULL, 0);
+ if (ERANGE == errno)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL,
+ "invalid stripe size setting for " H5FD_SUBFILING_STRIPE_SIZE);
+
+ if (stripe_size > 0) {
+ subfiling_config->stripe_size = (int64_t)stripe_size;
+ }
+ }
+ }
+
+#if H5_CHECK_MPI_VERSION(3, 0)
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+
+ /* Create an MPI sub-communicator for intra-node communications */
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, mpi_rank, MPI_INFO_NULL, &node_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_split_type failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(node_comm, MPI_ERRORS_RETURN)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code);
+#else
+#error "MPI-3 required for MPI_Comm_split_type"
+#endif
+
+ /*
+ * Setup the application topology information, including the computed
+ * number and distribution map of the set of I/O concentrators
+ */
+ if (init_app_topology(subfiling_config, comm, node_comm, &app_topology) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application topology");
+
+ new_context->sf_context_id = context_id;
+
+ if (init_subfiling_context(new_context, base_filename, file_id, subfiling_config, app_topology, comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL,
+ "couldn't initialize subfiling application topology object");
+ new_context->sf_node_comm = node_comm;
+
+ *context_id_out = context_id;
+
+done:
+ if (config_file && (EOF == HDfclose(config_file)))
+ H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL,
+ "couldn't close subfiling configuration file");
+
+ H5MM_free(file_basename);
+ H5MM_free(subfile_dir);
+
+ if (ret_value < 0) {
+ if (app_topology) {
+ if (H5_free_subfiling_topology(app_topology) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling topology");
+ }
+
+ if (H5_mpi_comm_free(&node_comm) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free MPI communicator");
+
+ if (context_id >= 0 && H5_free_subfiling_object(context_id) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object");
+
+ *context_id_out = -1;
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: init_app_topology
+ *
+ * Purpose: Determine the topology of the application so that MPI ranks
+ * can be assigned as I/O concentrators. The default is to use
+ * 1 MPI rank per node as an I/O concentrator, but this can be
+ * changed by the application's subfiling configuration, or by
+ * an environment variable (H5FD_SUBFILING_IOC_PER_NODE).
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm,
+ sf_topology_t **app_topology_out)
+{
+ H5FD_subfiling_ioc_select_t ioc_selection_type;
+ sf_topology_t *app_topology = NULL;
+ int64_t topology_id = -1;
+ char *env_value = NULL;
+ char *ioc_sel_str = NULL;
+ long ioc_select_val = -1;
+ long iocs_per_node = 1;
+ int ioc_count = 0;
+ int rank_multiple = 1;
+ int comm_rank;
+ int comm_size;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(subfiling_config);
+ HDassert(MPI_COMM_NULL != comm);
+ HDassert(MPI_COMM_NULL != node_comm);
+ HDassert(app_topology_out);
+ HDassert(!*app_topology_out);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &comm_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &comm_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ ioc_selection_type = subfiling_config->ioc_selection;
+
+ /* Check if an IOC selection type was specified by environment variable */
+ if (get_ioc_selection_criteria_from_env(&ioc_selection_type, &ioc_sel_str) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "couldn't get IOC selection type from environment");
+
+ /*
+ * Check parameters for the specified IOC selection strategy
+ * and determine the maximum number of I/O concentrators
+ */
+ switch (ioc_selection_type) {
+ case SELECT_IOC_ONE_PER_NODE: {
+ if (comm_size > 1) {
+ /* Check for an IOC-per-node value set in the environment */
+ if ((env_value = HDgetenv(H5FD_SUBFILING_IOC_PER_NODE))) {
+ errno = 0;
+ ioc_select_val = HDstrtol(env_value, NULL, 0);
+ if ((ERANGE == errno)) {
+ HDprintf("invalid value '%s' for " H5FD_SUBFILING_IOC_PER_NODE "\n", env_value);
+ ioc_select_val = 1;
+ }
+
+ if (ioc_select_val > 0)
+ iocs_per_node = ioc_select_val;
+ }
+ }
+
+ /* IOC count will be adjusted after number of nodes is determined */
+ H5_CHECK_OVERFLOW(iocs_per_node, long, int);
+ ioc_count = (int)iocs_per_node;
+
+ break;
+ }
+
+ case SELECT_IOC_EVERY_NTH_RANK: {
+ /*
+ * User specifies a rank multiple value. Selection starts
+ * with rank 0 and then the user-specified stride is applied
+ * to identify other IOC ranks.
+ */
+ ioc_select_val = 1;
+ if (ioc_sel_str) {
+ errno = 0;
+ ioc_select_val = HDstrtol(ioc_sel_str, NULL, 0);
+ if ((ERANGE == errno) || (ioc_select_val <= 0)) {
+ HDprintf("invalid IOC selection strategy string '%s' for strategy "
+ "SELECT_IOC_EVERY_NTH_RANK; defaulting to SELECT_IOC_ONE_PER_NODE\n",
+ ioc_sel_str);
+ ioc_select_val = 1;
+ ioc_selection_type = SELECT_IOC_ONE_PER_NODE;
+ }
+ }
+
+ H5_CHECK_OVERFLOW(ioc_select_val, long, int);
+ ioc_count = (comm_size / (int)ioc_select_val);
+
+ if ((comm_size % ioc_select_val) != 0) {
+ ioc_count++;
+ }
+
+ break;
+ }
+
+ case SELECT_IOC_TOTAL: {
+ /*
+ * User specifies a total number of I/O concentrators.
+ * Starting with rank 0, a stride of (mpi_size / total)
+ * is applied to identify other IOC ranks.
+ */
+ ioc_select_val = 1;
+ if (ioc_sel_str) {
+ errno = 0;
+ ioc_select_val = HDstrtol(ioc_sel_str, NULL, 0);
+ if ((ERANGE == errno) || (ioc_select_val <= 0) || (ioc_select_val >= comm_size)) {
+ HDprintf("invalid IOC selection strategy string '%s' for strategy SELECT_IOC_TOTAL; "
+ "defaulting to SELECT_IOC_ONE_PER_NODE\n",
+ ioc_sel_str);
+ ioc_select_val = 1;
+ ioc_selection_type = SELECT_IOC_ONE_PER_NODE;
+ }
+ }
+
+ H5_CHECK_OVERFLOW(ioc_select_val, long, int);
+ ioc_count = (int)ioc_select_val;
+
+ rank_multiple = (comm_size / ioc_count);
+
+ break;
+ }
+
+ case SELECT_IOC_WITH_CONFIG:
+ default:
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid IOC selection strategy");
+ break;
+ }
+
+ /*
+ * TODO: A different IOC selection string from the environment than what was
+ * used originally will cause the IOCs to be assigned differently than
+ * expected. While this generally shouldn't cause issues (other than
+ * for the SELECT_IOC_TOTAL case), this should still be dealt with
+ * eventually.
+ */
+ /* Check the subfiling topology cache to see if there's a matching object */
+ if (find_cached_topology_info(comm, subfiling_config, iocs_per_node, &app_topology) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "can't check for cached subfiling topology object");
+ HDassert(!app_topology || (app_topology->selection_type == ioc_selection_type));
+
+ if (!app_topology) {
+ /* Generate an ID for the application topology object */
+ if ((topology_id = H5_new_subfiling_object_id(SF_TOPOLOGY)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get ID for subfiling topology object");
+
+ /* Get a new application topology object from the cache */
+ if (NULL == (app_topology = H5_get_subfiling_object(topology_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling topology object");
+ app_topology->app_layout = NULL;
+ app_topology->app_comm = MPI_COMM_NULL;
+ app_topology->rank_is_ioc = FALSE;
+ app_topology->ioc_idx = -1;
+ app_topology->n_io_concentrators = ioc_count;
+ app_topology->io_concentrators = NULL;
+ app_topology->selection_type = ioc_selection_type;
+
+ if (H5_mpi_comm_dup(comm, &app_topology->app_comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTCOPY, FAIL, "can't duplicate MPI communicator");
+
+ if (init_app_layout(app_topology, comm, node_comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application layout");
+ HDassert(app_topology->app_layout);
+ HDassert(app_topology->app_layout->layout);
+ HDassert(app_topology->app_layout->node_ranks);
+ HDassert(app_topology->app_layout->node_count > 0);
+
+ /*
+ * Now that the application node count has been determined, adjust the
+ * number of I/O concentrators for the SELECT_IOC_ONE_PER_NODE case
+ */
+ if (app_topology->selection_type == SELECT_IOC_ONE_PER_NODE)
+ app_topology->n_io_concentrators = (int)iocs_per_node * app_topology->app_layout->node_count;
+
+ /*
+ * Make sure the number of I/O concentrators doesn't
+ * exceed the specified number of subfiles
+ */
+ if (subfiling_config->stripe_count != H5FD_SUBFILING_DEFAULT_STRIPE_COUNT) {
+ if (app_topology->n_io_concentrators > subfiling_config->stripe_count)
+ app_topology->n_io_concentrators = subfiling_config->stripe_count;
+ }
+
+ /*
+ * Determine which ranks are I/O concentrator ranks, based on the
+ * given IOC selection strategy and MPI information.
+ */
+ if (identify_ioc_ranks(app_topology, rank_multiple) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL,
+ "couldn't determine which MPI ranks are I/O concentrators");
+ }
+
+ *app_topology_out = app_topology;
+
+done:
+ if (ret_value < 0) {
+ if (app_topology && (topology_id >= 0)) {
+ if (H5_free_subfiling_object(topology_id) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free subfiling topology object");
+ }
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*
+-------------------------------------------------------------------------
+ Programmer: Richard Warren
+ Purpose: Return a character string which represents either the
+ default selection method: SELECT_IOC_ONE_PER_NODE; or
+ if the user has selected a method via the environment
+ variable (H5FD_SUBFILING_IOC_SELECTION_CRITERIA), we
+ return that along with any optional qualifier with for
+ that method.
+
+ Errors: None.
+
+ Revision History -- Initial implementation
+-------------------------------------------------------------------------
+*/
+static herr_t
+get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type, char **ioc_sel_info_str)
+{
+ char *opt_value = NULL;
+ char *env_value = HDgetenv(H5FD_SUBFILING_IOC_SELECTION_CRITERIA);
+ herr_t ret_value = SUCCEED;
+
+ HDassert(ioc_selection_type);
+ HDassert(ioc_sel_info_str);
+
+ *ioc_sel_info_str = NULL;
+
+ if (env_value) {
+ long check_value;
+
+ /*
+ * For non-default options, the environment variable
+ * should have the following form: integer:[integer|string]
+ * In particular, EveryNthRank == 1:64 or every 64 ranks assign an IOC
+ * or WithConfig == 2:/<full_path_to_config_file>
+ */
+ if ((opt_value = HDstrchr(env_value, ':')))
+ *opt_value++ = '\0';
+
+ errno = 0;
+ check_value = HDstrtol(env_value, NULL, 0);
+
+ if (errno == ERANGE)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "couldn't parse value from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA
+ " environment variable");
+
+ if ((check_value < 0) || (check_value >= ioc_selection_options))
+ H5_SUBFILING_GOTO_ERROR(
+ H5E_VFL, H5E_BADVALUE, FAIL,
+ "invalid IOC selection type value %ld from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA
+ " environment variable",
+ check_value);
+
+ *ioc_selection_type = (H5FD_subfiling_ioc_select_t)check_value;
+ *ioc_sel_info_str = opt_value;
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: find_cached_topology_info
+ *
+ * Purpose: Given an MPI communicator and IOC selection strategy,
+ * checks the subfiling topology cached to see if any matching
+ * topology objects have been cached.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config, long iocs_per_node,
+ sf_topology_t **app_topology)
+{
+ H5FD_subfiling_ioc_select_t ioc_selection_type;
+ int32_t stripe_count;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(subf_config);
+
+ ioc_selection_type = subf_config->ioc_selection;
+ stripe_count = subf_config->stripe_count;
+
+ for (size_t i = 0; i < sf_topology_cache_num_entries; i++) {
+ sf_topology_t *cached_topology = sf_topology_cache[i];
+ int result;
+ int mpi_code;
+
+ HDassert(cached_topology);
+
+ /*
+ * If the selection types differ, just reject the cached topology
+ * for now rather than checking if the mapping is equivalent
+ */
+ if (ioc_selection_type != cached_topology->selection_type)
+ continue;
+
+ /*
+ * If the number of I/O concentrators in the cached topology
+ * is greater than the specified target number of subfiles,
+ * reject the cached topology
+ */
+ if (stripe_count != H5FD_SUBFILING_DEFAULT_STRIPE_COUNT) {
+ if (stripe_count < cached_topology->n_io_concentrators)
+ continue;
+ }
+
+ if (cached_topology->selection_type == SELECT_IOC_ONE_PER_NODE) {
+ HDassert(iocs_per_node >= 1);
+ HDassert(cached_topology->app_layout->node_count > 0);
+
+ /*
+ * If a IOCs-per-node setting was set in the environment and would
+ * cause the application topology to differ from the cached topology
+ * we found, don't reuse the cached topology
+ */
+ if (cached_topology->n_io_concentrators !=
+ (iocs_per_node * cached_topology->app_layout->node_count))
+ continue;
+ }
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_compare(comm, cached_topology->app_comm, &result)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_compare failed", mpi_code);
+
+ if (MPI_IDENT == result || MPI_CONGRUENT == result) {
+ *app_topology = cached_topology;
+ break;
+ }
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: init_app_layout
+ *
+ * Purpose: Determines the layout of MPI ranks across nodes in order to
+ * figure out the final application topology
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm)
+{
+ app_layout_t *app_layout = NULL;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(app_topology);
+ HDassert(!app_topology->app_layout);
+ HDassert(MPI_COMM_NULL != comm);
+ HDassert(MPI_COMM_NULL != node_comm);
+
+ if (NULL == (app_layout = HDcalloc(1, sizeof(*app_layout))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate application layout structure");
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &app_layout->world_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &app_layout->world_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(node_comm, &app_layout->node_local_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(node_comm, &app_layout->node_local_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ if (NULL == (app_layout->layout = HDmalloc((size_t)app_layout->world_size * sizeof(*app_layout->layout))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate application layout array");
+
+ /* Gather the list of layout_t pairs to all ranks */
+ if (gather_topology_info(app_layout, comm, node_comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "can't gather application topology info");
+
+ /* Sort the list according to the node local lead rank values */
+ HDqsort(app_layout->layout, (size_t)app_layout->world_size, sizeof(layout_t), compare_layout_nodelocal);
+
+ /*
+ * Count the number of nodes by checking how many
+ * entries have a node local rank value of 0
+ */
+ app_layout->node_count = 0;
+ for (size_t i = 0; i < (size_t)app_layout->world_size; i++)
+ if (app_layout->layout[i].node_local_rank == 0)
+ app_layout->node_count++;
+
+ HDassert(app_layout->node_count > 0);
+
+ if (NULL ==
+ (app_layout->node_ranks = HDmalloc((size_t)app_layout->node_count * sizeof(*app_layout->node_ranks))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate application layout node rank array");
+
+ /*
+ * Record the rank value of the "lead"
+ * MPI rank on each node for later use
+ */
+ for (size_t i = 0, node_rank_index = 0; i < (size_t)app_layout->world_size; i++) {
+ if (app_layout->layout[i].node_local_rank == 0) {
+ HDassert(node_rank_index < (size_t)app_layout->node_count);
+ app_layout->node_ranks[node_rank_index++] = app_layout->layout[i].rank;
+ }
+ }
+
+ app_topology->app_layout = app_layout;
+
+done:
+ if (ret_value < 0) {
+ if (app_layout) {
+ HDfree(app_layout->layout);
+ HDfree(app_layout->node_ranks);
+ HDfree(app_layout);
+ }
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: gather_topology_info
+ *
+ * Purpose: Collectively generate a list of layout_t structures
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm)
+{
+ MPI_Group file_group = MPI_GROUP_NULL;
+ MPI_Group node_group = MPI_GROUP_NULL;
+ layout_t my_layout_info;
+ layout_t *layout_info_partial = NULL;
+ MPI_Comm aggr_comm = MPI_COMM_NULL;
+ int *recv_counts = NULL;
+ int *recv_displs = NULL;
+ int sf_world_size;
+ int sf_world_rank;
+ int node_local_rank;
+ int node_local_size;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(app_layout);
+ HDassert(app_layout->layout);
+ HDassert(MPI_COMM_NULL != comm);
+
+ sf_world_rank = app_layout->world_rank;
+ sf_world_size = app_layout->world_size;
+ node_local_rank = app_layout->node_local_rank;
+ node_local_size = app_layout->node_local_size;
+
+ my_layout_info.rank = sf_world_rank;
+ my_layout_info.node_local_rank = node_local_rank;
+ my_layout_info.node_local_size = node_local_size;
+
+ /*
+ * Get the rank value for the "lead" rank on this
+ * rank's node so that we can group the layout_t
+ * information for all node-local ranks together
+ */
+ {
+ const int local_lead = 0;
+ int lead_rank;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_group(comm, &file_group)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_group failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_group(intra_comm, &node_group)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_group failed", mpi_code);
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Group_translate_ranks(node_group, 1, &local_lead, file_group, &lead_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Group_translate_ranks failed", mpi_code);
+
+ if (MPI_UNDEFINED == lead_rank)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't determine lead rank on node");
+
+ my_layout_info.node_lead_rank = lead_rank;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&node_group)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Group_free failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&file_group)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Group_free failed", mpi_code);
+ }
+
+ app_layout->layout[sf_world_rank] = my_layout_info;
+
+ if (sf_world_size > 1) {
+#ifdef H5_SUBFILING_PREFER_ALLGATHER_TOPOLOGY
+ (void)intra_comm;
+
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Allgather(&my_layout_info, 4, MPI_INT, app_layout->layout, 4, MPI_INT, comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code);
+#else
+ int aggr_comm_size = 0;
+
+ HDassert(MPI_COMM_NULL != intra_comm);
+
+ /* Split the file communicator into a sub-group of one rank per node */
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_split(comm, node_local_rank, sf_world_rank, &aggr_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_split failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(aggr_comm, &aggr_comm_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ /* Allocate a partial layout info array to aggregate into from node-local ranks */
+ if (node_local_rank == 0) {
+ if (NULL ==
+ (layout_info_partial = HDmalloc((size_t)node_local_size * sizeof(*layout_info_partial))))
+ /* Push error, but participate in gather operation */
+ H5_SUBFILING_DONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "can't allocate layout info array");
+ }
+
+ /* Gather node-local layout info to single master rank on each node */
+ if (MPI_SUCCESS != (mpi_code = MPI_Gather(&my_layout_info, 4, MPI_INT, layout_info_partial, 4,
+ MPI_INT, 0, intra_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code);
+
+ /* Gather total layout info from/to each master rank on each node */
+ if (node_local_rank == 0) {
+ int send_size = 4 * node_local_size;
+
+ if (NULL == (recv_counts = HDmalloc((size_t)aggr_comm_size * sizeof(*recv_counts))))
+ H5_SUBFILING_DONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "can't allocate receive counts array");
+ if (NULL == (recv_displs = HDmalloc((size_t)aggr_comm_size * sizeof(*recv_displs))))
+ H5_SUBFILING_DONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "can't allocate receive displacements array");
+
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Allgather(&send_size, 1, MPI_INT, recv_counts, 1, MPI_INT, aggr_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code);
+
+ recv_displs[0] = 0;
+ for (int i = 1; i < aggr_comm_size; i++)
+ recv_displs[i] = recv_displs[i - 1] + recv_counts[i - 1];
+
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Allgatherv(layout_info_partial, send_size, MPI_INT, app_layout->layout,
+ recv_counts, recv_displs, MPI_INT, aggr_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgatherv failed", mpi_code);
+
+ HDfree(recv_displs);
+ HDfree(recv_counts);
+ recv_displs = NULL;
+ recv_counts = NULL;
+ }
+
+ /*
+ * Each master rank on each node distributes the total
+ * layout info back to other node-local ranks
+ */
+ if (MPI_SUCCESS !=
+ (mpi_code = MPI_Bcast(app_layout->layout, 4 * sf_world_size, MPI_INT, 0, intra_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+#endif
+ }
+
+done:
+ HDfree(recv_displs);
+ HDfree(recv_counts);
+ HDfree(layout_info_partial);
+
+ if (H5_mpi_comm_free(&aggr_comm) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
+
+ if (node_group != MPI_GROUP_NULL)
+ if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&node_group)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Group_free failed", mpi_code);
+ if (file_group != MPI_GROUP_NULL)
+ if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&file_group)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Group_free failed", mpi_code);
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: compare_layout_nodelocal
+ *
+ * Purpose: Qsort sorting callback that sorts layout_t structures
+ * according to their node local lead MPI rank values. Ties
+ * are broken according to their regular node local MPI rank
+ * values
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+compare_layout_nodelocal(const void *layout1, const void *layout2)
+{
+ const layout_t *l1 = (const layout_t *)layout1;
+ const layout_t *l2 = (const layout_t *)layout2;
+
+ if (l1->node_lead_rank == l2->node_lead_rank) {
+ return (l1->node_local_rank > l2->node_local_rank) - (l1->node_local_rank < l2->node_local_rank);
+ }
+ else
+ return (l1->node_lead_rank > l2->node_lead_rank) - (l1->node_lead_rank < l2->node_lead_rank);
+}
+
+/*-------------------------------------------------------------------------
+ * Function: identify_ioc_ranks
+ *
+ * Purpose: We've already identified the number of unique nodes and
+ * have a sorted list of layout_t structures. Under normal
+ * conditions, we only utilize a single IOC per node. Under
+ * that circumstance, we only need to fill the
+ * io_concentrators vector from the node_ranks array (which
+ * contains the index into the layout array of lowest MPI rank
+ * on each node) into the io_concentrators vector; Otherwise,
+ * while determining the number of local ranks per node, we
+ * can also select one or more additional IOCs.
+ *
+ * As a side effect, we fill the 'io_concentrators' vector
+ * and set the 'rank_is_ioc' flag to TRUE if our rank is
+ * identified as owning an I/O Concentrator (IOC).
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
+{
+ app_layout_t *app_layout = NULL;
+ int *io_concentrators = NULL;
+ int max_iocs = 0;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(app_topology);
+ HDassert(!app_topology->io_concentrators);
+ HDassert(app_topology->n_io_concentrators > 0);
+ HDassert(app_topology->app_layout);
+ HDassert(app_topology->app_layout->layout);
+ HDassert(app_topology->app_layout->node_count > 0);
+
+ app_layout = app_topology->app_layout;
+
+ max_iocs = app_topology->n_io_concentrators;
+
+ if (NULL == (app_topology->io_concentrators = HDmalloc((size_t)app_topology->n_io_concentrators *
+ sizeof(*app_topology->io_concentrators))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate array of I/O concentrator ranks");
+
+ io_concentrators = app_topology->io_concentrators;
+
+ switch (app_topology->selection_type) {
+ case SELECT_IOC_ONE_PER_NODE: {
+ int total_ioc_count = 0;
+ int iocs_per_node = 1;
+
+ if (app_topology->n_io_concentrators > app_layout->node_count)
+ iocs_per_node = app_topology->n_io_concentrators / app_layout->node_count;
+
+ HDassert(app_layout->node_ranks);
+
+ for (size_t i = 0; i < (size_t)app_layout->node_count; i++) {
+ int node_index = app_layout->node_ranks[i];
+ int local_size = app_layout->layout[node_index].node_local_size;
+
+ HDassert(total_ioc_count < app_topology->n_io_concentrators);
+ io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank;
+
+ if (app_layout->world_rank == io_concentrators[total_ioc_count]) {
+ app_topology->ioc_idx = total_ioc_count;
+ app_topology->rank_is_ioc = TRUE;
+ }
+
+ total_ioc_count++;
+
+ for (size_t j = 1; j < (size_t)iocs_per_node; j++) {
+ if (total_ioc_count >= max_iocs)
+ break;
+ if (j >= (size_t)local_size)
+ break;
+
+ HDassert(total_ioc_count < app_topology->n_io_concentrators);
+ io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank;
+
+ if (app_layout->world_rank == io_concentrators[total_ioc_count]) {
+ app_topology->ioc_idx = total_ioc_count;
+ app_topology->rank_is_ioc = TRUE;
+ }
+
+ total_ioc_count++;
+ }
+
+ if (total_ioc_count >= max_iocs)
+ break;
+ }
+
+ /* Set final number of I/O concentrators after adjustments */
+ app_topology->n_io_concentrators = total_ioc_count;
+
+ break;
+ }
+
+ case SELECT_IOC_EVERY_NTH_RANK:
+ case SELECT_IOC_TOTAL: {
+ int world_size = app_layout->world_size;
+ int ioc_next = 0;
+
+ HDassert(rank_stride > 0);
+
+ for (int i = 0; ioc_next < app_topology->n_io_concentrators; ioc_next++) {
+ int ioc_index = rank_stride * i++;
+
+ if (ioc_index >= world_size)
+ break;
+
+ io_concentrators[ioc_next] = app_layout->layout[ioc_index].rank;
+
+ if (app_layout->world_rank == io_concentrators[ioc_next]) {
+ app_topology->ioc_idx = ioc_next;
+ app_topology->rank_is_ioc = TRUE;
+ }
+
+ if (ioc_next + 1 >= max_iocs)
+ break;
+ }
+
+ /* Set final number of I/O concentrators after adjustments */
+ app_topology->n_io_concentrators = ioc_next;
+
+ break;
+ }
+
+ case SELECT_IOC_WITH_CONFIG:
+ default:
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid IOC selection strategy");
+ break;
+ }
+
+done:
+ if (ret_value < 0) {
+ if (app_topology)
+ HDfree(app_topology->io_concentrators);
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: init_subfile_context
+ *
+ * Purpose: Called as part of the HDF5 file + subfiling opening.
+ * This initializes the subfiling context and associates
+ * this context with the specific HDF5 file.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename, uint64_t file_id,
+ H5FD_subfiling_params_t *subfiling_config, sf_topology_t *app_topology,
+ MPI_Comm file_comm)
+{
+ char *env_value = NULL;
+ int mpi_rank;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(sf_context);
+ HDassert(sf_context->topology == NULL);
+ HDassert(sf_context->sf_context_id >= 0);
+ HDassert(base_filename);
+ HDassert(file_id != UINT64_MAX);
+ HDassert(subfiling_config);
+ HDassert(app_topology);
+ HDassert(app_topology->n_io_concentrators > 0);
+ HDassert(MPI_COMM_NULL != file_comm);
+
+ sf_context->h5_file_id = file_id;
+ sf_context->sf_fids = NULL;
+ sf_context->sf_num_fids = 0;
+ sf_context->sf_num_subfiles = subfiling_config->stripe_count;
+ sf_context->sf_write_count = 0;
+ sf_context->sf_read_count = 0;
+ sf_context->sf_eof = HADDR_UNDEF;
+ sf_context->sf_stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE;
+ sf_context->sf_base_addr = 0;
+ sf_context->sf_msg_comm = MPI_COMM_NULL;
+ sf_context->sf_data_comm = MPI_COMM_NULL;
+ sf_context->sf_eof_comm = MPI_COMM_NULL;
+ sf_context->sf_node_comm = MPI_COMM_NULL;
+ sf_context->sf_group_comm = MPI_COMM_NULL;
+ sf_context->sf_group_size = 1;
+ sf_context->sf_group_rank = 0;
+ sf_context->subfile_prefix = NULL;
+ sf_context->h5_filename = NULL;
+ sf_context->ioc_data = NULL;
+ sf_context->topology = app_topology;
+
+#ifdef H5_SUBFILING_DEBUG
+ sf_context->sf_logfile = NULL;
+#endif
+
+ if (NULL == (sf_context->h5_filename = HDstrdup(base_filename)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate space for subfiling filename");
+
+ /* Check for a subfile name prefix setting in the environment */
+ if ((env_value = HDgetenv(H5FD_SUBFILING_SUBFILE_PREFIX))) {
+ if (NULL == (sf_context->subfile_prefix = HDstrdup(env_value)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix value");
+ }
+
+ /*
+ * Set IOC stripe size from subfiling configuration
+ */
+ if (subfiling_config->stripe_size > 0)
+ sf_context->sf_stripe_size = subfiling_config->stripe_size;
+
+ /*
+ * If still set to the default, set the number of subfiles
+ * according to the default mapping of 1 I/O concentrator
+ * -> 1 subfile
+ */
+ if (sf_context->sf_num_subfiles == H5FD_SUBFILING_DEFAULT_STRIPE_COUNT)
+ sf_context->sf_num_subfiles = app_topology->n_io_concentrators;
+
+ /*
+ * Set blocksize per stripe value after possibly adjusting
+ * for user-specified subfile stripe size and number of
+ * subfiles
+ */
+ sf_context->sf_blocksize_per_stripe = sf_context->sf_stripe_size * sf_context->sf_num_subfiles;
+
+ if (app_topology->rank_is_ioc) {
+ int leftover_subfiles;
+
+ /* Adjust base address after stripe size is set, if necessary */
+ sf_context->sf_base_addr = (int64_t)(app_topology->ioc_idx * sf_context->sf_stripe_size);
+
+ /*
+ * Calculate the number of subfiles this rank owns by
+ * round-robining them across the available IOCs and
+ * then allocate an array for the subfile IDs
+ */
+ sf_context->sf_num_fids = sf_context->sf_num_subfiles / app_topology->n_io_concentrators;
+
+ leftover_subfiles = sf_context->sf_num_subfiles % app_topology->n_io_concentrators;
+ if (leftover_subfiles && (leftover_subfiles > app_topology->ioc_idx))
+ sf_context->sf_num_fids++;
+
+ if (NULL ==
+ (sf_context->sf_fids = HDmalloc((size_t)sf_context->sf_num_fids * sizeof(*sf_context->sf_fids))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate subfile IDs array");
+
+ for (int i = 0; i < sf_context->sf_num_fids; i++)
+ sf_context->sf_fids[i] = -1;
+ }
+
+ /*
+ * Set up various MPI sub-communicators for MPI operations
+ * to/from IOC ranks
+ */
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(file_comm, &mpi_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(file_comm, &sf_context->sf_msg_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(sf_context->sf_msg_comm, MPI_ERRORS_RETURN)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(file_comm, &sf_context->sf_data_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(sf_context->sf_data_comm, MPI_ERRORS_RETURN)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(file_comm, &sf_context->sf_eof_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(sf_context->sf_eof_comm, MPI_ERRORS_RETURN)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code);
+
+ /* Create an MPI sub-communicator for IOC ranks */
+ if (app_topology->n_io_concentrators > 1) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_split(file_comm, app_topology->rank_is_ioc, mpi_rank,
+ &sf_context->sf_group_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_split failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(sf_context->sf_group_comm, &sf_context->sf_group_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(sf_context->sf_group_comm, &sf_context->sf_group_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+ }
+
+ /* Perform some final validation of subfiling configuration */
+ if (sf_context->sf_stripe_size <= 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid subfiling stripe size (%" PRId64 ")",
+ sf_context->sf_stripe_size);
+
+ if (sf_context->sf_num_subfiles <= 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid subfiling stripe count (%d)",
+ sf_context->sf_num_subfiles);
+
+ HDassert(sf_context->sf_num_subfiles >= app_topology->n_io_concentrators);
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: open_subfile_with_context
+ *
+ * Purpose: While we cannot know a priori, whether an HDF client will
+ * need to access data across the entirety of a file, e.g.
+ * an individual MPI rank may read or write only small
+ * segments of the entire file space; this function sends
+ * a file OPEN_OP to every IO concentrator.
+ *
+ * Prior to opening any subfiles, the H5FDopen will have
+ * created an HDF5 file with the user specified naming.
+ * A path prefix will be selected and is available as
+ * an input argument.
+ *
+ * The opened HDF5 file handle will contain device and
+ * inode values, these being constant for all processes
+ * opening the shared file. The inode value is utilized
+ * as a key value and is associated with the sf_context
+ * which we receive as one of the input arguments.
+ *
+ * IO Concentrator threads will be initialized on MPI ranks
+ * which have been identified via application toplogy
+ * discovery. The number and mapping of IOC to MPI_rank
+ * is part of the sf_context->topology structure.
+ *
+ * Return: Success (0) or Faiure (non-zero)
+ * Errors: If MPI operations fail for some reason.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+open_subfile_with_context(subfiling_context_t *sf_context, int file_acc_flags)
+{
+ herr_t ret_value = SUCCEED;
+
+ HDassert(sf_context);
+ HDassert(sf_context->h5_file_id != UINT64_MAX);
+
+ /*
+ * Save the HDF5 file ID (e.g., inode) to subfile context mapping.
+ * There shouldn't be any issue, but check the status and
+ * return if there was a problem.
+ */
+ if (record_fid_to_subfile(sf_context->h5_file_id, sf_context->sf_context_id, NULL) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL,
+ "couldn't record HDF5 file ID to subfile context mapping");
+
+ /*
+ * If this rank is an I/O concentrator, actually open
+ * the subfiles belonging to this IOC rank
+ */
+ if (sf_context->topology->rank_is_ioc) {
+ if (ioc_open_files(sf_context->sf_context_id, file_acc_flags) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, "IOC couldn't open subfile");
+ }
+
+done:
+ if (ret_value < 0) {
+ clear_fid_map_entry(sf_context->h5_file_id, sf_context->sf_context_id);
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: record_fid_to_subfile
+ *
+ * Purpose: Every opened HDF5 file will have (if utilizing subfiling)
+ * a subfiling context associated with it. It is important that
+ * the HDF5 file index is a constant rather than utilizing a
+ * posix file handle since files can be opened multiple times
+ * and with each file open, a new file handle will be assigned.
+ * Note that in such a case, the actual filesystem id will be
+ * retained.
+ *
+ * We utilize that filesystem id (ino_t inode) so that
+ * irrespective of what process opens a common file, the
+ * subfiling system will generate a consistent context for this
+ * file across all parallel ranks.
+ *
+ * This function simply records the filesystem handle to
+ * subfiling context mapping.
+ *
+ * Return: SUCCEED or FAIL.
+ * Errors: FAILs ONLY if storage for the mapping entry cannot
+ * be allocated.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+record_fid_to_subfile(uint64_t file_id, int64_t subfile_context_id, int *next_index)
+{
+ int index;
+ herr_t ret_value = SUCCEED;
+
+ if (!sf_open_file_map) {
+ if (NULL ==
+ (sf_open_file_map = HDmalloc((size_t)DEFAULT_FILE_MAP_ENTRIES * sizeof(*sf_open_file_map))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate open file mapping");
+
+ sf_file_map_size = DEFAULT_FILE_MAP_ENTRIES;
+ for (int i = 0; i < sf_file_map_size; i++) {
+ sf_open_file_map[i].file_id = UINT64_MAX;
+ sf_open_file_map[i].sf_context_id = -1;
+ }
+ }
+
+ for (index = 0; index < sf_file_map_size; index++) {
+ if (sf_open_file_map[index].file_id == file_id)
+ goto done;
+
+ if (sf_open_file_map[index].file_id == UINT64_MAX) {
+ sf_open_file_map[index].file_id = file_id;
+ sf_open_file_map[index].sf_context_id = subfile_context_id;
+
+ if (next_index) {
+ *next_index = index;
+ }
+
+ goto done;
+ }
+ }
+
+ if (index == sf_file_map_size) {
+ void *tmp_realloc;
+
+ if (NULL == (tmp_realloc = HDrealloc(sf_open_file_map,
+ ((size_t)(sf_file_map_size * 2) * sizeof(*sf_open_file_map)))))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't reallocate open file mapping");
+
+ sf_open_file_map = tmp_realloc;
+ sf_file_map_size *= 2;
+
+ for (int i = index; i < sf_file_map_size; i++) {
+ sf_open_file_map[i].file_id = UINT64_MAX;
+ }
+
+ if (next_index) {
+ *next_index = index;
+ }
+
+ sf_open_file_map[index].file_id = file_id;
+ sf_open_file_map[index++].sf_context_id = subfile_context_id;
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: clear_fid_map_entry
+ *
+ * Purpose: Remove the map entry associated with the file->inode.
+ * This is done at file close.
+ *
+ * Return: None
+ * Errors: Cannot fail.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static void
+clear_fid_map_entry(uint64_t file_id, int64_t sf_context_id)
+{
+ if (sf_open_file_map) {
+ for (int i = 0; i < sf_file_map_size; i++) {
+ if ((sf_open_file_map[i].file_id == file_id) &&
+ (sf_open_file_map[i].sf_context_id == sf_context_id)) {
+ sf_open_file_map[i].file_id = UINT64_MAX;
+ sf_open_file_map[i].sf_context_id = -1;
+ return;
+ }
+ }
+ }
+} /* end clear_fid_map_entry() */
+
+/*-------------------------------------------------------------------------
+ * Function: ioc_open_files
+ *
+ * Purpose: This function is called by an I/O concentrator in order to
+ * open the subfiles it is responsible for.
+ *
+ * The names of the subfiles to be opened are generated based
+ * on values from either:
+ *
+ * - The corresponding subfiling configuration file, if one
+ * exists and the HDF5 file isn't being truncated
+ * - The current subfiling context object for the file, if a
+ * subfiling configuration file doesn't exist or the HDF5
+ * file is being truncated
+ *
+ * After the subfiles have been opened, a subfiling
+ * configuration file will be created if this is a file
+ * creation operation. If the truncate flag is specified, the
+ * subfiling configuration file will be re-created in order to
+ * account for any possible changes in the subfiling
+ * configuration.
+ *
+ * Note that the HDF5 file opening protocol may attempt to
+ * open a file twice. A first open attempt is made without any
+ * truncate or other flags which would modify the file state
+ * if it already exists. Then, if this tentative open wasn't
+ * sufficient, the file is closed and a second file open using
+ * the user supplied open flags is invoked.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+ioc_open_files(int64_t file_context_id, int file_acc_flags)
+{
+ subfiling_context_t *sf_context = NULL;
+ mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+ char *filepath = NULL;
+ char *subfile_dir = NULL;
+ char *base = NULL;
+ int num_subfiles = 0;
+ int num_digits = 0;
+ herr_t ret_value = SUCCEED;
+
+ if (NULL == (sf_context = H5_get_subfiling_object(file_context_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL,
+ "couldn't get subfiling object from context ID");
+
+ HDassert(sf_context->h5_file_id != UINT64_MAX);
+ HDassert(sf_context->h5_filename);
+ HDassert(sf_context->sf_fids);
+ HDassert(sf_context->sf_num_subfiles > 0);
+ HDassert(sf_context->sf_num_fids > 0);
+ HDassert(sf_context->topology);
+ HDassert(sf_context->topology->ioc_idx >= 0); /* Only IOC ranks should be here */
+
+ /* Get the basename of the full HDF5 filename */
+ if (H5_basename(sf_context->h5_filename, &base) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get HDF5 file basename");
+
+ /*
+ * Get the directory prefix where subfiles will be placed.
+ * Under normal circumstances, the subfiles are co-located
+ * with the HDF5 file, but users may specify a different
+ * directory name.
+ */
+ if (sf_context->subfile_prefix) {
+ if (NULL == (subfile_dir = H5MM_strdup(sf_context->subfile_prefix)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix");
+ }
+ else {
+ if (H5_dirname(sf_context->h5_filename, &subfile_dir) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't get HDF5 file dirname");
+ }
+
+ if (NULL == (filepath = HDmalloc(PATH_MAX)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate space for subfile filename");
+
+ num_subfiles = sf_context->sf_num_subfiles;
+ num_digits = (int)(HDlog10(num_subfiles) + 1);
+
+ /*
+ * For each subfile this IOC rank owns, generate the name
+ * of the subfile and create/open it
+ */
+ for (int i = 0; i < sf_context->sf_num_fids; i++) {
+ int subfile_idx;
+
+ /* Round-robin subfiles among the available IOCs */
+ subfile_idx = (i * sf_context->topology->n_io_concentrators) + sf_context->topology->ioc_idx + 1;
+
+ /*
+ * Generate the name of the subfile. The subfile naming should
+ * produce files of the following form:
+ * If we assume the HDF5 file is named ABC.h5, and 20 subfiles
+ * are used, then the subfiles will have names:
+ * ABC.h5.subfile_<file-number>_01_of_20,
+ * ABC.h5.subfile_<file-number>_02_of_20, etc.
+ *
+ * and the configuration file will be named:
+ * ABC.h5.subfile_<file-number>.config
+ */
+ HDsnprintf(filepath, PATH_MAX, "%s/" H5FD_SUBFILING_FILENAME_TEMPLATE, subfile_dir, base,
+ sf_context->h5_file_id, num_digits, subfile_idx, num_subfiles);
+
+ if ((sf_context->sf_fids[i] = HDopen(filepath, file_acc_flags, mode)) < 0)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "failed to open subfile");
+ }
+
+ if (file_acc_flags & O_CREAT)
+ sf_context->sf_eof = 0;
+
+ /*
+ * If subfiles were created (rather than simply opened),
+ * check if we also need to create a config file.
+ */
+ if ((file_acc_flags & O_CREAT) && (sf_context->topology->ioc_idx == 0)) {
+ if (create_config_file(sf_context, base, subfile_dir, (file_acc_flags & O_TRUNC)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTCREATE, FAIL,
+ "couldn't create subfiling configuration file");
+ }
+
+done:
+ if (ret_value < 0) {
+ if (sf_context) {
+ for (int i = 0; i < sf_context->sf_num_fids; i++) {
+ if (sf_context->sf_fids[i] >= 0 && HDclose(sf_context->sf_fids[i]) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "failed to close subfile");
+ sf_context->sf_fids[i] = -1;
+ }
+ }
+ }
+
+ H5MM_free(base);
+ H5MM_free(subfile_dir);
+ HDfree(filepath);
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: create_config_file
+ *
+ * Purpose: Creates a configuration file that contains
+ * subfiling-related information for a file. This file
+ * includes information such as:
+ *
+ * - the stripe size for the file's subfiles
+ * - the number of I/O concentrators used for I/O to the file's subfiles
+ * - the number of subfiles the logical HDF5 file consists of
+ * - the base HDF5 filename
+ * - the optional directory prefix where the file's subfiles are placed
+ * - the names of each of the file's subfiles
+ *
+ * Return: Non-negative on success/Negative on failure
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+create_config_file(subfiling_context_t *sf_context, const char *base_filename, const char *subfile_dir,
+ hbool_t truncate_if_exists)
+{
+ hbool_t config_file_exists = FALSE;
+ FILE *config_file = NULL;
+ char *config_filename = NULL;
+ char *line_buf = NULL;
+ int ret = 0;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(sf_context);
+ HDassert(base_filename);
+ HDassert(subfile_dir);
+
+ if (sf_context->h5_file_id == UINT64_MAX)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid HDF5 file ID %" PRIu64,
+ sf_context->h5_file_id);
+ if (*base_filename == '\0')
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid base HDF5 filename '%s'",
+ base_filename);
+ if (*subfile_dir == '\0')
+ subfile_dir = ".";
+
+ if (NULL == (config_filename = HDmalloc(PATH_MAX)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate space for subfiling configuration filename");
+
+ HDsnprintf(config_filename, PATH_MAX, "%s/" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir,
+ base_filename, sf_context->h5_file_id);
+
+ /* Determine whether a subfiling configuration file exists */
+ errno = 0;
+ ret = HDaccess(config_filename, F_OK);
+
+ config_file_exists = (ret == 0) || ((ret < 0) && (ENOENT != errno));
+
+ if (config_file_exists && (ret != 0))
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL,
+ "couldn't check existence of subfiling configuration file");
+
+ /*
+ * If a config file doesn't exist, create one. If a
+ * config file does exist, don't touch it unless the
+ * O_TRUNC flag was specified. In this case, truncate
+ * the existing config file and create a new one.
+ */
+ if (!config_file_exists || truncate_if_exists) {
+ int n_subfiles = sf_context->sf_num_subfiles;
+ int num_digits;
+
+ if (NULL == (config_file = HDfopen(config_filename, "w+")))
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL,
+ "couldn't open subfiling configuration file");
+
+ if (NULL == (line_buf = HDmalloc(PATH_MAX)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate buffer for writing to subfiling configuration file");
+
+ /* Write the subfiling stripe size to the configuration file */
+ HDsnprintf(line_buf, PATH_MAX, "stripe_size=%" PRId64 "\n", sf_context->sf_stripe_size);
+ if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL,
+ "failed to write to subfiling configuration file");
+
+ /* Write the number of I/O concentrators to the configuration file */
+ HDsnprintf(line_buf, PATH_MAX, "aggregator_count=%d\n", sf_context->topology->n_io_concentrators);
+ if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL,
+ "failed to write to subfiling configuration file");
+
+ /* Write the number of subfiles to the configuration file */
+ HDsnprintf(line_buf, PATH_MAX, "subfile_count=%d\n", n_subfiles);
+ if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL,
+ "failed to write to subfiling configuration file");
+
+ /* Write the base HDF5 filename to the configuration file */
+ HDsnprintf(line_buf, PATH_MAX, "hdf5_file=%s\n", sf_context->h5_filename);
+ if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL,
+ "failed to write to subfiling configuration file");
+
+ /* Write the optional subfile directory prefix to the configuration file */
+ HDsnprintf(line_buf, PATH_MAX, "subfile_dir=%s\n", subfile_dir);
+ if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL,
+ "failed to write to subfiling configuration file");
+
+ /* Write out each subfile name to the configuration file */
+ num_digits = (int)(HDlog10(n_subfiles) + 1);
+ for (int k = 0; k < n_subfiles; k++) {
+ HDsnprintf(line_buf, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE "\n", base_filename,
+ sf_context->h5_file_id, num_digits, k + 1, n_subfiles);
+
+ if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL,
+ "failed to write to subfiling configuration file");
+ }
+ }
+
+done:
+ if (config_file) {
+ if (EOF == HDfclose(config_file))
+ H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL,
+ "couldn't close subfiling configuration file");
+ }
+
+ HDfree(line_buf);
+ HDfree(config_filename);
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: open_config_file
+ *
+ * Purpose: Opens the subfiling configuration file for a given HDF5
+ * file and sets `config_file_out`, if a configuration file
+ * exists. Otherwise, `config_file_out` is set to NULL.
+ *
+ * It is the caller's responsibility to check
+ * `config_file_out` on success and close an opened file as
+ * necessary.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+open_config_file(const char *base_filename, const char *subfile_dir, uint64_t file_id, const char *mode,
+ FILE **config_file_out)
+{
+ hbool_t config_file_exists = FALSE;
+ FILE *config_file = NULL;
+ char *config_filename = NULL;
+ int ret = 0;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(base_filename);
+ HDassert(subfile_dir);
+ HDassert(file_id != UINT64_MAX);
+ HDassert(mode);
+ HDassert(config_file_out);
+
+ *config_file_out = NULL;
+
+ if (*base_filename == '\0')
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid base HDF5 filename '%s'",
+ base_filename);
+ if (*subfile_dir == '\0')
+ subfile_dir = ".";
+
+ if (NULL == (config_filename = HDmalloc(PATH_MAX)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate space for subfiling configuration filename");
+
+ HDsnprintf(config_filename, PATH_MAX, "%s/" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir,
+ base_filename, file_id);
+
+ /* Determine whether a subfiling configuration file exists */
+ errno = 0;
+ ret = HDaccess(config_filename, F_OK);
+
+ config_file_exists = (ret == 0) || ((ret < 0) && (ENOENT != errno));
+
+ if (!config_file_exists)
+ goto done;
+
+ if (config_file_exists && (ret != 0))
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL,
+ "couldn't check existence of subfiling configuration file");
+
+ if (NULL == (config_file = HDfopen(config_filename, mode)))
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL,
+ "couldn't open subfiling configuration file");
+
+ *config_file_out = config_file;
+
+done:
+ if (ret_value < 0) {
+ if (config_file && (EOF == HDfclose(config_file)))
+ H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL,
+ "couldn't close subfiling configuration file");
+ }
+
+ HDfree(config_filename);
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_get_subfiling_config_from_file
+ *
+ * Purpose: Reads a Subfiling configuration file to get the stripe size
+ * and number of subfiles used for the logical HDF5 file.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_get_subfiling_config_from_file(FILE *config_file, int64_t *stripe_size, int64_t *num_subfiles)
+{
+ int64_t read_stripe_size = 0;
+ int64_t read_num_subfiles = 0;
+ char *config_buf = NULL;
+ char *substr = NULL;
+ long config_file_len = 0;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(config_file);
+
+ if (HDfseek(config_file, 0, SEEK_END) < 0)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL,
+ "couldn't seek to end of subfiling configuration file");
+
+ if ((config_file_len = HDftell(config_file)) < 0)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL,
+ "couldn't get size of subfiling configuration file");
+
+ if (HDfseek(config_file, 0, SEEK_SET) < 0)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL,
+ "couldn't seek to beginning of subfiling configuration file");
+
+ if (NULL == (config_buf = HDmalloc((size_t)config_file_len + 1)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "couldn't allocate space for reading from subfiling configuration file");
+
+ if (HDfread(config_buf, (size_t)config_file_len, 1, config_file) != 1)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_READERROR, FAIL,
+ "couldn't read from subfiling configuration file");
+
+ config_buf[config_file_len] = '\0';
+
+ if (stripe_size) {
+ if (NULL == (substr = HDstrstr(config_buf, "stripe_size")))
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL,
+ "malformed subfiling configuration file - no stripe size entry");
+
+ if (EOF == HDsscanf(substr, "stripe_size=%" PRId64, &read_stripe_size))
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL,
+ "couldn't get stripe size from subfiling configuration file");
+
+ if (read_stripe_size <= 0)
+ H5_SUBFILING_GOTO_ERROR(
+ H5E_FILE, H5E_BADVALUE, FAIL,
+ "invalid stripe size (%" PRId64 ") read from subfiling configuration file", read_stripe_size);
+
+ *stripe_size = read_stripe_size;
+ }
+
+ if (num_subfiles) {
+ if (NULL == (substr = HDstrstr(config_buf, "subfile_count")))
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL,
+ "malformed subfiling configuration file - no subfile count entry");
+
+ if (EOF == HDsscanf(substr, "subfile_count=%" PRId64, &read_num_subfiles))
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL,
+ "couldn't get number of subfiles from subfiling configuration file");
+
+ if (read_num_subfiles <= 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL,
+ "invalid number of subfiles (%" PRId64
+ ") read from subfiling configuration file",
+ read_num_subfiles);
+
+ *num_subfiles = read_num_subfiles;
+ }
+
+done:
+ HDfree(config_buf);
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_resolve_pathname
+ *
+ * Purpose: Simple wrapper routine around realpath(3) to fully resolve
+ * a given filepath. Collective across the specified MPI
+ * communicator in order to minimize file system contention
+ * between MPI ranks.
+ *
+ * The resolved filepath returned through `resolved_filepath`
+ * must be freed by the caller with HDfree.
+ *
+ * Return Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_resolve_pathname(const char *filepath, MPI_Comm comm, char **resolved_filepath)
+{
+ hsize_t path_len = HSIZE_UNDEF;
+ hbool_t bcasted_path_len = FALSE;
+ hbool_t bcasted_path = FALSE;
+ char *resolved_path = NULL;
+ char *file_basename = NULL;
+ char *file_dirname = NULL;
+ char *cwd = NULL;
+ int mpi_rank;
+ int mpi_size;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ HDassert(filepath);
+ HDassert(resolved_filepath);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ if (mpi_rank == 0) {
+ if (NULL == (resolved_path = HDrealpath(filepath, NULL))) {
+ if (ENOENT == errno) {
+ if (H5_dirname(filepath, &file_dirname) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get file dirname");
+
+ /* If filepath is just the filename, set up path using CWD */
+ if (!HDstrcmp(file_dirname, ".")) {
+ if (NULL == (resolved_path = HDmalloc(PATH_MAX)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "can't allocate buffer for filepath");
+ if (H5_basename(filepath, &file_basename) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get file basename");
+ if (NULL == (cwd = HDmalloc(PATH_MAX)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+ "can't allocate buffer for CWD");
+
+ if (NULL == HDgetcwd(cwd, PATH_MAX))
+ H5_SUBFILING_GOTO_ERROR(
+ H5E_VFL, H5E_CANTGET, FAIL,
+ "can't get current working directory, errno = %d, error message = '%s'", errno,
+ HDstrerror(errno));
+
+ HDsnprintf(resolved_path, PATH_MAX, "%s/%s", cwd, file_basename);
+ }
+ else {
+ /* Otherwise, just use what was given as the pathname */
+ if (NULL == (resolved_path = HDstrdup(filepath)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't copy filename");
+ }
+ }
+ else
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "can't resolve subfile path, errno = %d, error message = '%s'", errno,
+ HDstrerror(errno));
+ }
+
+ if (resolved_path) {
+ H5_CHECKED_ASSIGN(path_len, hsize_t, (HDstrlen(resolved_path) + 1), size_t);
+ }
+ else
+ path_len = HSIZE_UNDEF;
+ }
+
+ /* Broadcast the size of the resolved filepath string to other ranks */
+ bcasted_path_len = TRUE;
+ if (mpi_size > 1) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&path_len, 1, HSIZE_AS_MPI_TYPE, 0, comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+
+ if (path_len == HSIZE_UNDEF)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't resolve filepath");
+
+ if (mpi_rank != 0) {
+ if (NULL == (resolved_path = HDmalloc(path_len)))
+ H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate file name buffer");
+ }
+
+ /* Broadcast the resolved filepath to other ranks */
+ bcasted_path = TRUE;
+ if (mpi_size > 1) {
+ H5_CHECK_OVERFLOW(path_len, hsize_t, int);
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(resolved_path, (int)path_len, MPI_CHAR, 0, comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+
+ *resolved_filepath = resolved_path;
+
+done:
+ HDfree(cwd);
+ H5MM_free(file_basename);
+ H5MM_free(file_dirname);
+
+ if (ret_value < 0) {
+ if (!bcasted_path_len) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&path_len, 1, HSIZE_AS_MPI_TYPE, 0, comm)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+ if (!bcasted_path && (path_len != HSIZE_UNDEF)) {
+ H5_CHECK_OVERFLOW(path_len, hsize_t, int);
+ if (MPI_SUCCESS != (mpi_code = MPI_Bcast(resolved_path, (int)path_len, MPI_CHAR, 0, comm)))
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+ }
+
+ HDfree(resolved_path);
+ }
+
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_close_subfiles
+ *
+ * Purpose: This is a simple wrapper function for the internal version
+ * which actually manages all subfile closing via commands
+ * to the set of IO Concentrators.
+ *
+ * Return: Success (0) or Faiure (non-zero)
+ * Errors: If MPI operations fail for some reason.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+/*-------------------------------------------------------------------------
+ * Function: Internal close__subfiles
+ *
+ * Purpose: When closing and HDF5 file, we need to close any associated
+ * subfiles as well. This function cycles through all known
+ * IO Concentrators to send a file CLOSE_OP command.
+ *
+ * This function is collective across all MPI ranks which
+ * have opened HDF5 file which associated with the provided
+ * sf_context. Once the request has been issued by all
+ * ranks, the subfile at each IOC will be closed and an
+ * completion ACK will be received.
+ *
+ * Once the subfiles are closed, we initiate a teardown of
+ * the IOC and associated thread_pool threads.
+ *
+ * Return: Success (0) or Faiure (non-zero)
+ * Errors: If MPI operations fail for some reason.
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_close_subfiles(int64_t subfiling_context_id, MPI_Comm file_comm)
+{
+ subfiling_context_t *sf_context = NULL;
+ MPI_Request barrier_req = MPI_REQUEST_NULL;
+ int mpi_size;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
+ if (NULL == (sf_context = H5_get_subfiling_object(subfiling_context_id)))
+ H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "couldn't get subfiling object from context ID");
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_comm, &mpi_size)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code);
+
+ /* We make the subfile close operation collective.
+ * Otherwise, there may be a race condition between
+ * our closing the subfiles and the user application
+ * moving ahead and possibly re-opening a file.
+ *
+ * If we can, we utilize an async barrier which gives
+ * us the opportunity to reduce the CPU load due to
+ * MPI spinning while waiting for the barrier to
+ * complete. This is especially important if there
+ * is heavy thread utilization due to subfiling
+ * activities, i.e. the thread pool might be
+ * extremely busy servicing I/O requests from all
+ * HDF5 application ranks.
+ */
+ if (mpi_size > 1) {
+#if H5_CHECK_MPI_VERSION(3, 1)
+ int barrier_complete = 0;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(file_comm, &barrier_req)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Ibarrier failed", mpi_code);
+
+ while (!barrier_complete) {
+ useconds_t t_delay = 5;
+ usleep(t_delay);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Test(&barrier_req, &barrier_complete, MPI_STATUS_IGNORE)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Test failed", mpi_code);
+ }
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+#endif
+ }
+
+ /* The map from file handle to subfiling context can now be cleared */
+ if (sf_context->h5_file_id != UINT64_MAX) {
+ clear_fid_map_entry(sf_context->h5_file_id, sf_context->sf_context_id);
+ }
+
+ if (sf_context->topology->rank_is_ioc) {
+ if (sf_context->sf_fids) {
+ for (int i = 0; i < sf_context->sf_num_fids; i++) {
+ errno = 0;
+ if (sf_context->sf_fids[i] >= 0 && HDclose(sf_context->sf_fids[i]) < 0)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "couldn't close subfile");
+ sf_context->sf_fids[i] = -1;
+ }
+ }
+ }
+
+ /*
+ * Run another barrier to prevent some ranks from running ahead,
+ * and opening another file before this file is completely closed
+ * down.
+ */
+ if (mpi_size > 1) {
+#if H5_CHECK_MPI_VERSION(3, 1)
+ int barrier_complete = 0;
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(file_comm, &barrier_req)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Ibarrier failed", mpi_code);
+
+ while (!barrier_complete) {
+ useconds_t t_delay = 5;
+ usleep(t_delay);
+
+ if (MPI_SUCCESS != (mpi_code = MPI_Test(&barrier_req, &barrier_complete, MPI_STATUS_IGNORE)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Test failed", mpi_code);
+ }
+#else
+ if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_comm)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+#endif
+ }
+
+#ifdef H5_SUBFILING_DEBUG
+ if (sf_context->sf_logfile) {
+ struct tm *tm = NULL;
+ time_t cur_time;
+
+ cur_time = time(NULL);
+ tm = localtime(&cur_time);
+
+ H5_subfiling_log(sf_context->sf_context_id, "\n-- LOGGING FINISH - %s", asctime(tm));
+
+ HDfclose(sf_context->sf_logfile);
+ sf_context->sf_logfile = NULL;
+ }
+#endif
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfiling_set_config_prop
+ *
+ * Purpose: Sets the specified Subfiling VFD configuration as a
+ * property on the given FAPL pointer. The Subfiling VFD uses
+ * this property to pass its configuration down to the IOC VFD
+ * without needing each IOC VFD to include it as part of its
+ * public configuration.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_subfiling_set_config_prop(H5P_genplist_t *plist_ptr, const H5FD_subfiling_params_t *vfd_config)
+{
+ htri_t prop_exists = FAIL;
+ herr_t ret_value = SUCCEED;
+
+ if (!plist_ptr)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer");
+ if (!vfd_config)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling configuration pointer");
+
+ if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_CONFIG_PROP)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL,
+ "can't check if subfiling configuration property exists in FAPL");
+
+ if (prop_exists) {
+ if (H5P_set(plist_ptr, H5FD_SUBFILING_CONFIG_PROP, vfd_config) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL,
+ "can't set subfiling configuration property on FAPL");
+ }
+ else {
+ union {
+ const void *const_ptr_to_data;
+ void *ptr_to_data;
+ } eliminate_const_warning;
+
+ /*
+ * Cast away const since H5P_insert doesn't match the signature
+ * for "value" as H5P_set
+ */
+ eliminate_const_warning.const_ptr_to_data = vfd_config;
+
+ if (H5P_insert(plist_ptr, H5FD_SUBFILING_CONFIG_PROP, sizeof(H5FD_subfiling_params_t),
+ eliminate_const_warning.ptr_to_data, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL,
+ "unable to register subfiling configuration property in FAPL");
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfiling_get_config_prop
+ *
+ * Purpose: Retrieves the Subfiling VFD configuration from the given
+ * FAPL pointer. The Subfiling VFD uses this property to pass
+ * its configuration down to the IOC VFD without needing each
+ * IOC VFD to include it as part of its public configuration.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_subfiling_get_config_prop(H5P_genplist_t *plist_ptr, H5FD_subfiling_params_t *vfd_config)
+{
+ htri_t prop_exists = FAIL;
+ herr_t ret_value = SUCCEED;
+
+ if (!plist_ptr)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer");
+ if (!vfd_config)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling configuration pointer");
+
+ if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_CONFIG_PROP)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL,
+ "can't check if subfiling configuration property exists in FAPL");
+
+ if (prop_exists) {
+ if (H5P_get(plist_ptr, H5FD_SUBFILING_CONFIG_PROP, vfd_config) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL,
+ "can't get subfiling configuration property from FAPL");
+ }
+ else {
+ vfd_config->ioc_selection = SELECT_IOC_ONE_PER_NODE;
+ vfd_config->stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE;
+ vfd_config->stripe_count = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT;
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfiling_set_file_id_prop
+ *
+ * Purpose: Sets the specified file ID (Inode) value as a property on
+ * the given FAPL pointer. The Subfiling VFD uses this
+ * property to pass the HDF5 stub file ID value down to the
+ * IOC VFD.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_subfiling_set_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t file_id)
+{
+ htri_t prop_exists = FAIL;
+ herr_t ret_value = SUCCEED;
+
+ if (!plist_ptr)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer");
+ if (file_id == UINT64_MAX)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid file ID value");
+
+ if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL,
+ "can't check if file ID property exists in FAPL");
+
+ if (prop_exists) {
+ if (H5P_set(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID, &file_id) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set file ID property on FAPL");
+ }
+ else {
+ if (H5P_insert(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID, sizeof(uint64_t), &file_id, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL,
+ "unable to register file ID property in FAPL");
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfiling_get_file_id_prop
+ *
+ * Purpose: Retrieves the file ID (Inode) value from the given FAPL
+ * pointer. The Subfiling VFD uses this property to pass the
+ * HDF5 stub file ID value down to the IOC VFD.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_subfiling_get_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t *file_id)
+{
+ htri_t prop_exists = FAIL;
+ herr_t ret_value = SUCCEED;
+
+ if (!plist_ptr)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer");
+ if (!file_id)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL file ID pointer");
+
+ if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID)) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL,
+ "can't check if file ID property exists in FAPL");
+
+ if (prop_exists) {
+ if (H5P_get(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID, file_id) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get file ID property from FAPL");
+ }
+ else
+ *file_id = UINT64_MAX;
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfile_fid_to_context
+ *
+ * Purpose: This is a basic lookup function which returns the subfiling
+ * context id associated with the specified file ID.
+ *
+ * Return: Non-negative subfiling context ID if the context exists
+ * Negative on failure or if the subfiling context doesn't
+ * exist
+ *
+ * Programmer: Richard Warren
+ * 7/17/2020
+ *
+ * Changes: Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int64_t
+H5_subfile_fid_to_context(uint64_t file_id)
+{
+ int64_t ret_value = -1;
+
+ if (!sf_open_file_map)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, -1, "open file map is NULL");
+
+ for (int i = 0; i < sf_file_map_size; i++) {
+ if (sf_open_file_map[i].file_id == file_id) {
+ return sf_open_file_map[i].sf_context_id;
+ }
+ }
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+} /* end H5_subfile_fid_to_context() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfiling_validate_config
+ *
+ * Purpose: Checks that the given subfiling configuration parameters
+ * are valid
+ *
+ * Return: Non-negative on success/Negative on failure
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_subfiling_validate_config(const H5FD_subfiling_params_t *subf_config)
+{
+ H5FD_subfiling_ioc_select_t ioc_sel_type;
+ herr_t ret_value = SUCCEED;
+
+ if (!subf_config)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL subfiling configuration pointer");
+
+ /*
+ * Compare against each IOC selection value directly since
+ * the enum might be a signed or unsigned type and a comparison
+ * against < 0 could generate a warning
+ */
+ ioc_sel_type = subf_config->ioc_selection;
+ if (ioc_sel_type != SELECT_IOC_ONE_PER_NODE && ioc_sel_type != SELECT_IOC_EVERY_NTH_RANK &&
+ ioc_sel_type != SELECT_IOC_WITH_CONFIG && ioc_sel_type != SELECT_IOC_TOTAL)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid IOC selection method");
+
+ if (subf_config->stripe_size <= 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid stripe size");
+
+ if (subf_config->stripe_count <= 0 && subf_config->stripe_count != H5FD_SUBFILING_DEFAULT_STRIPE_COUNT)
+ H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid stripe count");
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+/*-------------------------------------------------------------------------
+ * Function: H5_subfiling_terminate
+ *
+ * Purpose: A cleanup routine to be called by the Subfiling VFD when
+ * it is terminating. Cleans up internal resources such as the
+ * context and topology caches.
+ *
+ * Return: Non-negative on success/Negative on failure
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5_subfiling_terminate(void)
+{
+ herr_t ret_value = SUCCEED;
+
+ /* Clean up subfiling context and topology caches */
+ if (sf_context_cache) {
+ for (size_t i = 0; i < sf_context_cache_num_entries; i++) {
+ if (H5_free_subfiling_object_int(sf_context_cache[i]) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL,
+ "couldn't free subfiling context object");
+ sf_context_cache[i] = NULL;
+ }
+
+ sf_context_cache_size = 0;
+ sf_context_cache_num_entries = 0;
+
+ HDfree(sf_context_cache);
+ sf_context_cache = NULL;
+ }
+ if (sf_topology_cache) {
+ for (size_t i = 0; i < sf_topology_cache_num_entries; i++) {
+ if (H5_free_subfiling_topology(sf_topology_cache[i]) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL,
+ "couldn't free subfiling topology object");
+ sf_topology_cache[i] = NULL;
+ }
+
+ sf_topology_cache_size = 0;
+ sf_topology_cache_num_entries = 0;
+
+ HDfree(sf_topology_cache);
+ sf_topology_cache = NULL;
+ }
+
+ /* Clean up the file ID to context object mapping */
+ sf_file_map_size = 0;
+ HDfree(sf_open_file_map);
+ sf_open_file_map = NULL;
+
+done:
+ H5_SUBFILING_FUNC_LEAVE;
+}
+
+#ifdef H5_SUBFILING_DEBUG
+void
+H5_subfiling_log(int64_t sf_context_id, const char *fmt, ...)
+{
+ subfiling_context_t *sf_context = NULL;
+ va_list log_args;
+
+ va_start(log_args, fmt);
+
+ /* Retrieve the subfiling object for the newly-created context ID */
+ if (NULL == (sf_context = H5_get_subfiling_object(sf_context_id))) {
+ HDprintf("%s: couldn't get subfiling object from context ID\n", __func__);
+ goto done;
+ }
+
+ H5FD_ioc_begin_thread_exclusive();
+
+ if (sf_context->sf_logfile) {
+ HDvfprintf(sf_context->sf_logfile, fmt, log_args);
+ HDfputs("\n", sf_context->sf_logfile);
+ HDfflush(sf_context->sf_logfile);
+ }
+ else {
+ HDvprintf(fmt, log_args);
+ HDputs("");
+ HDfflush(stdout);
+ }
+
+ H5FD_ioc_end_thread_exclusive();
+
+done:
+ va_end(log_args);
+
+ return;
+}
+#endif