diff options
Diffstat (limited to 'src/H5FDsubfiling/H5subfiling_common.c')
-rw-r--r-- | src/H5FDsubfiling/H5subfiling_common.c | 2840 |
1 files changed, 1850 insertions, 990 deletions
diff --git a/src/H5FDsubfiling/H5subfiling_common.c b/src/H5FDsubfiling/H5subfiling_common.c index 9cc2c65..a1cca65 100644 --- a/src/H5FDsubfiling/H5subfiling_common.c +++ b/src/H5FDsubfiling/H5subfiling_common.c @@ -19,9 +19,9 @@ #include "H5MMprivate.h" -typedef struct { /* Format of a context map entry */ - void *file_handle; /* key value (linear search of the cache) */ - int64_t sf_context_id; /* The return value if matching file_handle */ +typedef struct { /* Format of a context map entry */ + uint64_t file_id; /* key value (linear search of the cache) */ + int64_t sf_context_id; /* The return value if matching file_handle */ } file_map_to_context_t; /* Identifiers for HDF5's error API */ @@ -30,423 +30,52 @@ hid_t H5subfiling_err_class_g = H5I_INVALID_HID; char H5subfiling_mpi_error_str[MPI_MAX_ERROR_STRING]; int H5subfiling_mpi_error_str_len; -static subfiling_context_t *sf_context_cache = NULL; -static sf_topology_t *sf_topology_cache = NULL; +/* MPI Datatype used to send/receive an RPC message */ +MPI_Datatype H5_subfiling_rpc_msg_type = MPI_DATATYPE_NULL; -static size_t sf_context_cache_limit = 16; -static size_t sf_topology_cache_limit = 4; +static subfiling_context_t **sf_context_cache = NULL; +static sf_topology_t **sf_topology_cache = NULL; + +static size_t sf_context_cache_size = 0; +static size_t sf_topology_cache_size = 0; +static size_t sf_context_cache_num_entries = 0; +static size_t sf_topology_cache_num_entries = 0; static file_map_to_context_t *sf_open_file_map = NULL; static int sf_file_map_size = 0; -#define DEFAULT_FILE_MAP_ENTRIES 8 +#define DEFAULT_CONTEXT_CACHE_SIZE 16 +#define DEFAULT_TOPOLOGY_CACHE_SIZE 4 +#define DEFAULT_FILE_MAP_ENTRIES 8 + +static herr_t H5_free_subfiling_object(int64_t object_id); static herr_t H5_free_subfiling_object_int(subfiling_context_t *sf_context); static herr_t H5_free_subfiling_topology(sf_topology_t *topology); -static herr_t init_subfiling(H5FD_subfiling_shared_config_t *subfiling_config, MPI_Comm comm, +static herr_t init_subfiling(const char *base_filename, uint64_t file_id, + H5FD_subfiling_params_t *subfiling_config, int file_acc_flags, MPI_Comm comm, int64_t *context_id_out); -static herr_t init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm, +static herr_t init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm, sf_topology_t **app_topology_out); -static herr_t init_subfiling_context(subfiling_context_t *sf_context, - H5FD_subfiling_shared_config_t *subfiling_config, +static herr_t get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type, + char **ioc_sel_info_str); +static herr_t find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config, + long iocs_per_node, sf_topology_t **app_topology); +static herr_t init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm); +static herr_t gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm); +static int compare_layout_nodelocal(const void *layout1, const void *layout2); +static herr_t identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride); +static herr_t init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename, + uint64_t file_id, H5FD_subfiling_params_t *subfiling_config, sf_topology_t *app_topology, MPI_Comm file_comm); static herr_t open_subfile_with_context(subfiling_context_t *sf_context, int file_acc_flags); -static herr_t record_fid_to_subfile(void *file_handle, int64_t subfile_context_id, int *next_index); -static herr_t ioc_open_file(int64_t file_context_id, int file_acc_flags); -static herr_t generate_subfile_name(subfiling_context_t *sf_context, int file_acc_flags, char *filename_out, - size_t filename_out_len, char **filename_basename_out, - char **subfile_dir_out); +static herr_t record_fid_to_subfile(uint64_t file_id, int64_t subfile_context_id, int *next_index); +static void clear_fid_map_entry(uint64_t file_id, int64_t sf_context_id); +static herr_t ioc_open_files(int64_t file_context_id, int file_acc_flags); static herr_t create_config_file(subfiling_context_t *sf_context, const char *base_filename, const char *subfile_dir, hbool_t truncate_if_exists); -static herr_t open_config_file(subfiling_context_t *sf_context, const char *base_filename, - const char *subfile_dir, const char *mode, FILE **config_file_out); - -static int get_next_fid_map_index(void); -static void clear_fid_map_entry(void *file_handle, int64_t sf_context_id); -static int compare_hostid(const void *h1, const void *h2); -static herr_t get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type, - char **ioc_sel_info_str); -static int count_nodes(sf_topology_t *info, MPI_Comm comm); -static herr_t gather_topology_info(sf_topology_t *info, MPI_Comm comm); -static int identify_ioc_ranks(sf_topology_t *info, int node_count, int iocs_per_node); -static inline void assign_ioc_ranks(sf_topology_t *app_topology, int ioc_count, int rank_multiple); - -static int -get_next_fid_map_index(void) -{ - int index = 0; - - HDassert(sf_open_file_map || (sf_file_map_size == 0)); - - for (int i = 0; i < sf_file_map_size; i++) { - if (sf_open_file_map[i].file_handle == NULL) { - index = i; - break; - } - } - - /* A valid index should always be found here */ - HDassert(index >= 0); - HDassert((sf_file_map_size == 0) || (index < sf_file_map_size)); - - return index; -} - -/*------------------------------------------------------------------------- - * Function: clear_fid_map_entry - * - * Purpose: Remove the map entry associated with the file->inode. - * This is done at file close. - * - * Return: None - * Errors: Cannot fail. - * - * Programmer: Richard Warren - * 7/17/2020 - * - * Changes: Initial Version/None. - * - *------------------------------------------------------------------------- - */ -static void -clear_fid_map_entry(void *file_handle, int64_t sf_context_id) -{ - if (sf_open_file_map) { - for (int i = 0; i < sf_file_map_size; i++) { - if ((sf_open_file_map[i].file_handle == file_handle) && - (sf_open_file_map[i].sf_context_id == sf_context_id)) { - sf_open_file_map[i].file_handle = NULL; - sf_open_file_map[i].sf_context_id = -1; - return; - } - } - } -} /* end clear_fid_map_entry() */ - -/* - * --------------------------------------------------- - * Topology discovery related functions for choosing - * I/O Concentrator (IOC) ranks. - * Currently, the default approach for assigning an IOC - * is select the lowest MPI rank on each node. - * - * The approach collectively generates N tuples - * consisting of the MPI rank and hostid. This - * collection is then sorted by hostid and scanned - * to identify the IOC ranks. - * - * As time permits, addition assignment methods will - * be implemented, e.g. 1-per-Nranks or via a config - * option. Additional selection methodologies can - * be included as users get more experience using the - * subfiling implementation. - * --------------------------------------------------- - */ - -/*------------------------------------------------------------------------- - * Function: compare_hostid - * - * Purpose: qsort sorting function. - * Compares tuples of 'layout_t'. The sorting is based on - * the long hostid values. - * - * Return: result of: (hostid1 > hostid2) - * - * Programmer: Richard Warren - * 7/17/2020 - * - * Changes: Initial Version/None. - * - *------------------------------------------------------------------------- - */ -static int -compare_hostid(const void *h1, const void *h2) -{ - const layout_t *host1 = (const layout_t *)h1; - const layout_t *host2 = (const layout_t *)h2; - return (host1->hostid > host2->hostid); -} - -/* -------------------------------------------------------------------------- - Programmer: Richard Warren - Purpose: Return a character string which represents either the - default selection method: SELECT_IOC_ONE_PER_NODE; or - if the user has selected a method via the environment - variable (H5FD_SUBFILING_IOC_SELECTION_CRITERIA), we - return that along with any optional qualifier with for - that method. - - Errors: None. - - Revision History -- Initial implementation -------------------------------------------------------------------------- -*/ -static herr_t -get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type, char **ioc_sel_info_str) -{ - char *opt_value = NULL; - char *env_value = HDgetenv(H5FD_SUBFILING_IOC_SELECTION_CRITERIA); - herr_t ret_value = SUCCEED; - - HDassert(ioc_selection_type); - HDassert(ioc_sel_info_str); - - *ioc_sel_info_str = NULL; - - if (env_value) { - long check_value; - - /* - * For non-default options, the environment variable - * should have the following form: integer:[integer|string] - * In particular, EveryNthRank == 1:64 or every 64 ranks assign an IOC - * or WithConfig == 2:/<full_path_to_config_file> - */ - if ((opt_value = HDstrchr(env_value, ':'))) - *opt_value++ = '\0'; - - errno = 0; - check_value = HDstrtol(env_value, NULL, 0); - - if (errno == ERANGE) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, - "couldn't parse value from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA - " environment variable"); - - if ((check_value < 0) || (check_value >= ioc_selection_options)) - H5_SUBFILING_GOTO_ERROR( - H5E_VFL, H5E_BADVALUE, FAIL, - "invalid IOC selection type value %ld from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA - " environment variable", - check_value); - - *ioc_selection_type = (H5FD_subfiling_ioc_select_t)check_value; - *ioc_sel_info_str = opt_value; - } - -done: - H5_SUBFILING_FUNC_LEAVE; -} - -/*------------------------------------------------------------------------- - * Function: count_nodes - * - * Purpose: Initializes the sorted collection of hostid+mpi_rank - * tuples. After initialization, the collection is scanned - * to determine the number of unique hostid entries. This - * value will determine the number of actual I/O concentrators - * that available to the application. A side effect is to - * identify the 'node_index' of the current process. - * - * Return: The number of unique hostid's (nodes). - * Errors: MPI_Abort if memory cannot be allocated. - * - * Programmer: Richard Warren - * 7/17/2020 - * - * Changes: Initial Version/None. - * - *------------------------------------------------------------------------- - */ -static int -count_nodes(sf_topology_t *info, MPI_Comm comm) -{ - app_layout_t *app_layout = NULL; - long nextid; - int node_count; - int hostid_index = -1; - int my_rank; - int mpi_code; - int ret_value = 0; - - HDassert(info); - HDassert(info->app_layout); - HDassert(info->app_layout->layout); - HDassert(info->app_layout->node_ranks); - HDassert(MPI_COMM_NULL != comm); - - if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &my_rank))) - H5_SUBFILING_MPI_GOTO_ERROR(-1, "MPI_Comm_rank failed", mpi_code); - - app_layout = info->app_layout; - node_count = app_layout->node_count; - - nextid = app_layout->layout[0].hostid; - /* Possibly record my hostid_index */ - if (app_layout->layout[0].rank == my_rank) { - hostid_index = 0; - } - - app_layout->node_ranks[0] = 0; /* Add index */ - node_count = 1; - - /* Recall that the topology array has been sorted! */ - for (int k = 1; k < app_layout->world_size; k++) { - /* Possibly record my hostid_index */ - if (app_layout->layout[k].rank == my_rank) - hostid_index = k; - if (app_layout->layout[k].hostid != nextid) { - nextid = app_layout->layout[k].hostid; - /* Record the index of new hostid */ - app_layout->node_ranks[node_count++] = k; - } - } - - /* Mark the end of the node_ranks */ - app_layout->node_ranks[node_count] = app_layout->world_size; - /* Save the index where we first located my hostid */ - app_layout->node_index = hostid_index; - - app_layout->node_count = node_count; - - ret_value = node_count; - -done: - H5_SUBFILING_FUNC_LEAVE; -} - -/*------------------------------------------------------------------------- - * Function: gather_topology_info - * - * Purpose: Collectively generate a sorted collection of hostid+mpi_rank - * tuples. The result is returned in the 'topology' field - * of the sf_topology_t structure. - * - * Return: Non-negative on success/Negative on failure - * - * Programmer: Richard Warren - * 7/17/2020 - * - * Changes: Initial Version/None. - * - *------------------------------------------------------------------------- - */ -static herr_t -gather_topology_info(sf_topology_t *info, MPI_Comm comm) -{ - app_layout_t *app_layout = NULL; - layout_t my_hostinfo; - long hostid; - int sf_world_size; - int sf_world_rank; - herr_t ret_value = SUCCEED; - - HDassert(info); - HDassert(info->app_layout); - HDassert(info->app_layout->layout); - HDassert(MPI_COMM_NULL != comm); - - app_layout = info->app_layout; - sf_world_size = app_layout->world_size; - sf_world_rank = app_layout->world_rank; - - hostid = gethostid(); - - my_hostinfo.hostid = hostid; - my_hostinfo.rank = sf_world_rank; - - app_layout->hostid = hostid; - app_layout->layout[sf_world_rank] = my_hostinfo; - - if (sf_world_size > 1) { - int mpi_code; - - if (MPI_SUCCESS != - (mpi_code = MPI_Allgather(&my_hostinfo, 2, MPI_LONG, app_layout->layout, 2, MPI_LONG, comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code); - - HDqsort(app_layout->layout, (size_t)sf_world_size, sizeof(layout_t), compare_hostid); - } - -done: - H5_SUBFILING_FUNC_LEAVE; -} - -/*------------------------------------------------------------------------- - * Function: identify_ioc_ranks - * - * Purpose: We've already identified the number of unique nodes and - * have a sorted list layout_t structures. Under normal - * conditions, we only utilize a single IOC per node. Under - * that circumstance, we only need to fill the io_concentrator - * vector from the node_ranks array (which contains the index - * into the layout array of lowest MPI rank on each node) into - * the io_concentrator vector; - * Otherwise, while determining the number of local_peers per - * node, we can also select one or more additional IOCs. - * - * As a side effect, we fill the 'ioc_concentrator' vector - * and set the 'rank_is_ioc' flag to TRUE if our rank is - * identified as owning an I/O Concentrator (IOC). - * - *------------------------------------------------------------------------- - */ -static int -identify_ioc_ranks(sf_topology_t *info, int node_count, int iocs_per_node) -{ - app_layout_t *app_layout = NULL; - int total_ioc_count = 0; - - HDassert(info); - HDassert(info->app_layout); - - app_layout = info->app_layout; - - for (int n = 0; n < node_count; n++) { - int node_index = app_layout->node_ranks[n]; - int local_peer_count = app_layout->node_ranks[n + 1] - app_layout->node_ranks[n]; - - info->io_concentrators[total_ioc_count++] = (int)(app_layout->layout[node_index++].rank); - - if (app_layout->layout[node_index - 1].rank == app_layout->world_rank) { - info->subfile_rank = total_ioc_count - 1; - info->rank_is_ioc = TRUE; - } - - for (int k = 1; k < iocs_per_node; k++) { - if (k < local_peer_count) { - if (app_layout->layout[node_index].rank == app_layout->world_rank) { - info->rank_is_ioc = TRUE; - info->subfile_rank = total_ioc_count; - } - info->io_concentrators[total_ioc_count++] = (int)(app_layout->layout[node_index++].rank); - } - } - } - - info->n_io_concentrators = total_ioc_count; - - return total_ioc_count; -} /* end identify_ioc_ranks() */ - -static inline void -assign_ioc_ranks(sf_topology_t *app_topology, int ioc_count, int rank_multiple) -{ - app_layout_t *app_layout = NULL; - int *io_concentrators = NULL; - - HDassert(app_topology); - HDassert(app_topology->app_layout); - HDassert(app_topology->io_concentrators); - - app_layout = app_topology->app_layout; - io_concentrators = app_topology->io_concentrators; - - /* fill the io_concentrators values based on the application layout */ - if (io_concentrators) { - int ioc_index; - for (int k = 0, ioc_next = 0; ioc_next < ioc_count; ioc_next++) { - ioc_index = rank_multiple * k++; - io_concentrators[ioc_next] = (int)(app_layout->layout[ioc_index].rank); - if (io_concentrators[ioc_next] == app_layout->world_rank) { - app_topology->subfile_rank = ioc_next; - app_topology->rank_is_ioc = TRUE; - } - } - app_topology->n_io_concentrators = ioc_count; - } -} /* end assign_ioc_ranks() */ +static herr_t open_config_file(const char *base_filename, const char *subfile_dir, uint64_t file_id, + const char *mode, FILE **config_file_out); /*------------------------------------------------------------------------- * Function: H5_new_subfiling_object_id @@ -459,10 +88,19 @@ assign_ioc_ranks(sf_topology_t *app_topology, int ioc_count, int rank_multiple) *------------------------------------------------------------------------- */ int64_t -H5_new_subfiling_object_id(sf_obj_type_t obj_type, int64_t index_val) +H5_new_subfiling_object_id(sf_obj_type_t obj_type) { - if (obj_type != SF_CONTEXT && obj_type != SF_TOPOLOGY) + int64_t index_val = 0; + + if (obj_type == SF_CONTEXT) { + index_val = (int64_t)sf_context_cache_num_entries; + } + else if (obj_type == SF_TOPOLOGY) { + index_val = (int64_t)sf_topology_cache_num_entries; + } + else return -1; + if (index_val < 0) return -1; @@ -492,12 +130,6 @@ H5_new_subfiling_object_id(sf_obj_type_t obj_type, int64_t index_val) * *------------------------------------------------------------------------- */ -/* - * TODO: we don't appear to ever use this for retrieving a subfile topology - * object. Might be able to refactor to just return a subfile context - * object. - */ -/* TODO: no way of freeing caches on close currently */ void * H5_get_subfiling_object(int64_t object_id) { @@ -512,7 +144,7 @@ H5_get_subfiling_object(int64_t object_id) if (obj_type == SF_CONTEXT) { /* Contexts provide information principally about * the application and how the data layout is managed - * over some number of sub-files. The important + * over some number of subfiles. The important * parameters are the number of subfiles (or in the * context of IOCs, the MPI ranks and counts of the * processes which host an I/O Concentrator. We @@ -522,58 +154,121 @@ H5_get_subfiling_object(int64_t object_id) /* Create subfiling context cache if it doesn't exist */ if (!sf_context_cache) { - if (NULL == (sf_context_cache = HDcalloc(sf_context_cache_limit, sizeof(subfiling_context_t)))) + if (NULL == (sf_context_cache = HDcalloc(DEFAULT_CONTEXT_CACHE_SIZE, sizeof(*sf_context_cache)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "couldn't allocate space for subfiling context cache"); + sf_context_cache_size = DEFAULT_CONTEXT_CACHE_SIZE; + sf_context_cache_num_entries = 0; } /* Make more space in context cache if needed */ - if ((size_t)obj_index == sf_context_cache_limit) { + if ((size_t)obj_index >= sf_context_cache_size) { size_t old_num_entries; + size_t new_size; void *tmp_realloc; - old_num_entries = sf_context_cache_limit; + old_num_entries = sf_context_cache_num_entries; - sf_context_cache_limit *= 2; + new_size = (sf_context_cache_size * 3) / 2; - if (NULL == (tmp_realloc = HDrealloc(sf_context_cache, - sf_context_cache_limit * sizeof(subfiling_context_t)))) + if (NULL == (tmp_realloc = HDrealloc(sf_context_cache, new_size * sizeof(*sf_context_cache)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "couldn't allocate space for subfiling context cache"); - sf_context_cache = tmp_realloc; + sf_context_cache = tmp_realloc; + sf_context_cache_size = new_size; /* Clear newly-allocated entries */ - HDmemset(&sf_context_cache[obj_index], 0, - (sf_context_cache_limit - old_num_entries) * sizeof(subfiling_context_t)); + HDmemset(&sf_context_cache[old_num_entries], 0, + (sf_context_cache_size - old_num_entries) * sizeof(*sf_context_cache)); + + /* + * If we had to make more space, the given object index + * should always fall within range after a single re-allocation + */ + HDassert((size_t)obj_index < sf_context_cache_size); } - /* Return direct pointer to the context cache entry */ - return (void *)&sf_context_cache[obj_index]; + /* + * Since this cache currently just keeps all entries until + * application exit, context entry indices should just be + * consecutive + */ + HDassert((size_t)obj_index <= sf_context_cache_num_entries); + if ((size_t)obj_index < sf_context_cache_num_entries) + ret_value = sf_context_cache[obj_index]; + else { + HDassert(!sf_context_cache[sf_context_cache_num_entries]); + + /* Allocate a new subfiling context object */ + if (NULL == (ret_value = HDcalloc(1, sizeof(subfiling_context_t)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, + "couldn't allocate subfiling context object"); + + sf_context_cache[sf_context_cache_num_entries++] = ret_value; + } } else if (obj_type == SF_TOPOLOGY) { /* Create subfiling topology cache if it doesn't exist */ if (!sf_topology_cache) { - if (NULL == (sf_topology_cache = HDcalloc(sf_topology_cache_limit, sizeof(sf_topology_t)))) + if (NULL == + (sf_topology_cache = HDcalloc(DEFAULT_TOPOLOGY_CACHE_SIZE, sizeof(*sf_topology_cache)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "couldn't allocate space for subfiling topology cache"); + sf_topology_cache_size = DEFAULT_TOPOLOGY_CACHE_SIZE; + sf_topology_cache_num_entries = 0; } - /* We will likely only cache a single topology - * which is that of the original parallel application. - * In that context, we will identify the number of - * nodes along with the number of MPI ranks on a node. + /* Make more space in topology cache if needed */ + if ((size_t)obj_index >= sf_topology_cache_size) { + size_t old_num_entries; + size_t new_size; + void *tmp_realloc; + + old_num_entries = sf_topology_cache_num_entries; + + new_size = (sf_topology_cache_size * 3) / 2; + + if (NULL == (tmp_realloc = HDrealloc(sf_topology_cache, new_size * sizeof(*sf_topology_cache)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, + "couldn't allocate space for subfiling topology cache"); + + sf_topology_cache = tmp_realloc; + sf_topology_cache_size = new_size; + + /* Clear newly-allocated entries */ + HDmemset(&sf_topology_cache[old_num_entries], 0, + (sf_topology_cache_size - old_num_entries) * sizeof(*sf_topology_cache)); + + /* + * If we had to make more space, the given object index + * should always fall within range after a single re-allocation + */ + HDassert((size_t)obj_index < sf_topology_cache_size); + } + + /* + * Since this cache currently just keeps all entries until + * application exit, topology entry indices should just be + * consecutive */ - if ((size_t)obj_index >= sf_topology_cache_limit) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, - "invalid object index for subfiling topology object ID"); + HDassert((size_t)obj_index <= sf_topology_cache_num_entries); + if ((size_t)obj_index < sf_topology_cache_num_entries) + ret_value = sf_topology_cache[obj_index]; + else { + HDassert(!sf_topology_cache[sf_topology_cache_num_entries]); + + /* Allocate a new subfiling topology object */ + if (NULL == (ret_value = HDmalloc(sizeof(sf_topology_t)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, + "couldn't allocate subfiling topology object"); - /* Return direct pointer to the topology cache entry */ - return (void *)&sf_topology_cache[obj_index]; + sf_topology_cache[sf_topology_cache_num_entries++] = ret_value; + } } - #ifdef H5_SUBFILING_DEBUG - HDprintf("%s: Unknown subfiling object type for ID %" PRId64 "\n", __func__, object_id); + else + HDprintf("%s: Unknown subfiling object type for ID %" PRId64 "\n", __func__, object_id); #endif done: @@ -586,27 +281,55 @@ done: * Purpose: Frees the underlying subfiling object for a given subfiling * object ID. * + * NOTE: Currently we assume that all created subfiling + * objects are cached in the (very simple) context/topology + * cache until application exit, so the only time a subfiling + * object should be freed by this routine is if something + * fails right after creating one. Otherwise, the internal + * indexing for the relevant cache will be invalid. + * * Return: Non-negative on success/Negative on failure * *------------------------------------------------------------------------- */ -herr_t +static herr_t H5_free_subfiling_object(int64_t object_id) { - subfiling_context_t *sf_context = NULL; - int64_t obj_type = (object_id >> 32) & 0x0FFFF; - herr_t ret_value = SUCCEED; + int64_t obj_type = (object_id >> 32) & 0x0FFFF; + herr_t ret_value = SUCCEED; - if (obj_type != SF_CONTEXT) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid subfiling object type for ID %" PRId64, - object_id); + if (obj_type == SF_CONTEXT) { + subfiling_context_t *sf_context; - if (NULL == (sf_context = H5_get_subfiling_object(object_id))) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, - "couldn't get subfiling context for subfiling object ID"); + if (NULL == (sf_context = H5_get_subfiling_object(object_id))) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "couldn't get subfiling context for subfiling object ID"); + + if (H5_free_subfiling_object_int(sf_context) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object"); + + HDassert(sf_context_cache_num_entries > 0); + HDassert(sf_context == sf_context_cache[sf_context_cache_num_entries - 1]); + sf_context_cache[sf_context_cache_num_entries - 1] = NULL; + sf_context_cache_num_entries--; + } + else { + sf_topology_t *sf_topology; + + HDassert(obj_type == SF_TOPOLOGY); - if (H5_free_subfiling_object_int(sf_context) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object"); + if (NULL == (sf_topology = H5_get_subfiling_object(object_id))) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "couldn't get subfiling context for subfiling object ID"); + + if (H5_free_subfiling_topology(sf_topology) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling topology"); + + HDassert(sf_topology_cache_num_entries > 0); + HDassert(sf_topology == sf_topology_cache[sf_topology_cache_num_entries - 1]); + sf_topology_cache[sf_topology_cache_num_entries - 1] = NULL; + sf_topology_cache_num_entries--; + } done: H5_SUBFILING_FUNC_LEAVE; @@ -617,25 +340,10 @@ H5_free_subfiling_object_int(subfiling_context_t *sf_context) { HDassert(sf_context); -#ifdef H5_SUBFILING_DEBUG - if (sf_context->sf_logfile) { - struct tm *tm = NULL; - time_t cur_time; - - cur_time = time(NULL); - tm = localtime(&cur_time); - - H5_subfiling_log(sf_context->sf_context_id, "\n-- LOGGING FINISH - %s", asctime(tm)); - - HDfclose(sf_context->sf_logfile); - sf_context->sf_logfile = NULL; - } -#endif - sf_context->sf_context_id = -1; sf_context->h5_file_id = UINT64_MAX; - sf_context->h5_file_handle = NULL; - sf_context->sf_fid = -1; + sf_context->sf_num_fids = 0; + sf_context->sf_num_subfiles = -1; sf_context->sf_write_count = 0; sf_context->sf_read_count = 0; sf_context->sf_eof = HADDR_UNDEF; @@ -658,52 +366,63 @@ H5_free_subfiling_object_int(subfiling_context_t *sf_context) return FAIL; sf_context->sf_eof_comm = MPI_COMM_NULL; } - if (sf_context->sf_barrier_comm != MPI_COMM_NULL) { - if (H5_mpi_comm_free(&sf_context->sf_barrier_comm) < 0) + if (sf_context->sf_node_comm != MPI_COMM_NULL) { + if (H5_mpi_comm_free(&sf_context->sf_node_comm) < 0) return FAIL; - sf_context->sf_barrier_comm = MPI_COMM_NULL; + sf_context->sf_node_comm = MPI_COMM_NULL; } if (sf_context->sf_group_comm != MPI_COMM_NULL) { if (H5_mpi_comm_free(&sf_context->sf_group_comm) < 0) return FAIL; sf_context->sf_group_comm = MPI_COMM_NULL; } - if (sf_context->sf_intercomm != MPI_COMM_NULL) { - if (H5_mpi_comm_free(&sf_context->sf_intercomm) < 0) - return FAIL; - sf_context->sf_intercomm = MPI_COMM_NULL; - } - sf_context->sf_group_size = -1; - sf_context->sf_group_rank = -1; - sf_context->sf_intercomm_root = -1; + sf_context->sf_group_size = -1; + sf_context->sf_group_rank = -1; HDfree(sf_context->subfile_prefix); sf_context->subfile_prefix = NULL; - HDfree(sf_context->sf_filename); - sf_context->sf_filename = NULL; - HDfree(sf_context->h5_filename); sf_context->h5_filename = NULL; - if (H5_free_subfiling_topology(sf_context->topology) < 0) - return FAIL; + HDfree(sf_context->sf_fids); + sf_context->sf_fids = NULL; + + /* + * Currently we assume that all created application topology + * objects are cached until application exit and may be shared + * among multiple subfiling contexts, so we free them separately + * from here to avoid issues with stale pointers. + */ sf_context->topology = NULL; + HDfree(sf_context); + return SUCCEED; } static herr_t H5_free_subfiling_topology(sf_topology_t *topology) { + herr_t ret_value = SUCCEED; + HDassert(topology); - topology->subfile_rank = -1; - topology->n_io_concentrators = 0; +#ifndef NDEBUG + { + hbool_t topology_cached = FALSE; - HDfree(topology->subfile_fd); - topology->subfile_fd = NULL; + /* Make sure this application topology object is in the cache */ + for (size_t i = 0; i < sf_topology_cache_num_entries; i++) + if (topology == sf_topology_cache[i]) + topology_cached = TRUE; + HDassert(topology_cached); + } +#endif + + topology->ioc_idx = -1; + topology->n_io_concentrators = 0; if (topology->app_layout) { HDfree(topology->app_layout->layout); @@ -720,9 +439,134 @@ H5_free_subfiling_topology(sf_topology_t *topology) HDfree(topology->io_concentrators); topology->io_concentrators = NULL; + if (H5_mpi_comm_free(&topology->app_comm) < 0) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator"); + HDfree(topology); - return SUCCEED; + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: H5_open_subfiling_stub_file + * + * Purpose: Opens the stub file for an HDF5 file created with the + * Subfiling VFD. This stub file only contains some superblock + * metadata that can allow HDF5 applications to determine that + * the file is an HDF5 file and was created with the Subfiling + * VFD. + * + * This routine is collective across `file_comm`; once the + * stub file has been opened, the inode value for the file is + * retrieved and broadcasted to all MPI ranks in `file_comm` + * for future use. + * + * To avoid unnecessary overhead from a large-scale file open, + * this stub file is currently only opened on MPI rank 0. Note + * that this assumes that all the relevant metadata will be + * written from MPI rank 0. This should be fine for now since + * the HDF file signature and Subfiling driver info is really + * all that's needed, but this should be revisited since the + * file metadata can and will come from other MPI ranks as + * well. + * + * Return: Non-negative on success/Negative on failure + *------------------------------------------------------------------------- + */ +herr_t +H5_open_subfiling_stub_file(const char *name, unsigned flags, MPI_Comm file_comm, H5FD_t **file_ptr, + uint64_t *file_id) +{ + H5P_genplist_t *plist = NULL; + uint64_t stub_file_id = UINT64_MAX; + hbool_t bcasted_inode = FALSE; + H5FD_t *stub_file = NULL; + hid_t fapl_id = H5I_INVALID_HID; + int mpi_rank = 0; + int mpi_size = 1; + int mpi_code; + herr_t ret_value = SUCCEED; + + if (!name) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling stub file name"); + if (file_comm == MPI_COMM_NULL) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid MPI communicator"); + if (!file_id) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL file ID pointer"); + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(file_comm, &mpi_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_comm, &mpi_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + + if (!file_ptr && (mpi_rank == 0)) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL stub file pointer"); + + /* Open stub file on MPI rank 0 only */ + if (mpi_rank == 0) { + h5_stat_t st; + MPI_Comm stub_comm = MPI_COMM_SELF; + MPI_Info stub_info = MPI_INFO_NULL; + + if ((fapl_id = H5P_create_id(H5P_CLS_FILE_ACCESS_g, FALSE)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL, "can't create FAPL for stub file"); + if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS))) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access property list"); + + /* Use MPI I/O driver for stub file to allow access to vector I/O */ + if (H5P_set(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &stub_comm) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI communicator"); + if (H5P_set(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &stub_info) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI info object"); + if (H5P_set_driver(plist, H5FD_MPIO, NULL, NULL) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI I/O driver on FAPL"); + + if (NULL == (stub_file = H5FD_open(name, flags, fapl_id, HADDR_UNDEF))) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "couldn't open HDF5 stub file"); + + HDcompile_assert(sizeof(uint64_t) >= sizeof(ino_t)); + + /* Retrieve Inode value for stub file */ + if (HDstat(name, &st) < 0) { + stub_file_id = UINT64_MAX; + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "couldn't stat HDF5 stub file, errno = %d, error message = '%s'", errno, + HDstrerror(errno)); + } + else + stub_file_id = (uint64_t)st.st_ino; + } + + bcasted_inode = TRUE; + + if (mpi_size > 1) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&stub_file_id, 1, MPI_UINT64_T, 0, file_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + + if (stub_file_id == UINT64_MAX) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "couldn't get inode value for HDF5 stub file"); + + if (file_ptr) + *file_ptr = stub_file; + *file_id = stub_file_id; + +done: + if (fapl_id >= 0 && H5I_dec_ref(fapl_id) < 0) + H5_SUBFILING_DONE_ERROR(H5E_ID, H5E_CANTDEC, FAIL, "can't close FAPL ID"); + + if (ret_value < 0) { + if (!bcasted_inode && (mpi_size > 1)) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&stub_file_id, 1, MPI_UINT64_T, 0, file_comm))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + if (stub_file) { + if (H5FD_close(stub_file) < 0) + H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "couldn't close HDF5 stub file"); + } + } + + H5_SUBFILING_FUNC_LEAVE; } /*------------------------------------------------------------------------- @@ -752,16 +596,12 @@ H5_free_subfiling_topology(sf_topology_t *topology) * Changes: Initial Version/None. *------------------------------------------------------------------------- */ -/* TODO: revise description */ herr_t -H5_open_subfiles(const char *base_filename, void *file_handle, - H5FD_subfiling_shared_config_t *subfiling_config, int file_acc_flags, MPI_Comm file_comm, - int64_t *context_id_out) +H5_open_subfiles(const char *base_filename, uint64_t file_id, H5FD_subfiling_params_t *subfiling_config, + int file_acc_flags, MPI_Comm file_comm, int64_t *context_id_out) { subfiling_context_t *sf_context = NULL; int64_t context_id = -1; - int l_errors = 0; - int g_errors = 0; int mpi_code; herr_t ret_value = SUCCEED; @@ -775,20 +615,13 @@ H5_open_subfiles(const char *base_filename, void *file_handle, H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "invalid subfiling context ID pointer"); /* Initialize new subfiling context ID based on configuration information */ - if (init_subfiling(subfiling_config, file_comm, &context_id) < 0) + if (init_subfiling(base_filename, file_id, subfiling_config, file_acc_flags, file_comm, &context_id) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize subfiling context"); /* Retrieve the subfiling object for the newly-created context ID */ if (NULL == (sf_context = H5_get_subfiling_object(context_id))) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't get subfiling object from context ID"); - /* Save some basic things in the new subfiling context */ - sf_context->h5_file_handle = file_handle; - - if (NULL == (sf_context->h5_filename = HDstrdup(base_filename))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate space for subfiling filename"); - /* * If we're actually using the IOCs, we will * start the service threads on the identified @@ -802,7 +635,6 @@ H5_open_subfiles(const char *base_filename, void *file_handle, struct tm *tm = NULL; time_t cur_time; int mpi_rank; - int mpi_code; /* Open debugging logfile */ @@ -825,24 +657,30 @@ H5_open_subfiles(const char *base_filename, void *file_handle, *context_id_out = context_id; done: - if (ret_value < 0) { - l_errors = 1; - } - /* * Form consensus on whether opening subfiles was * successful */ - if (MPI_SUCCESS != (mpi_code = MPI_Allreduce(&l_errors, &g_errors, 1, MPI_INT, MPI_SUM, file_comm))) - H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Allreduce failed", mpi_code); + { + int mpi_size = -1; + int err_result = (ret_value < 0); + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_comm, &mpi_size))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); - if (g_errors > 0) { - H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, - "one or more IOC ranks couldn't open subfiles"); + if (mpi_size > 1) { + if (MPI_SUCCESS != + (mpi_code = MPI_Allreduce(MPI_IN_PLACE, &err_result, 1, MPI_INT, MPI_MAX, file_comm))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Allreduce failed", mpi_code); + } + + if (err_result) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, + "one or more IOC ranks couldn't open subfiles"); } if (ret_value < 0) { - clear_fid_map_entry(file_handle, context_id); + clear_fid_map_entry(file_id, context_id); if (context_id >= 0 && H5_free_subfiling_object(context_id) < 0) H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object"); @@ -873,54 +711,175 @@ done: ------------------------------------------------------------------------- */ static herr_t -init_subfiling(H5FD_subfiling_shared_config_t *subfiling_config, MPI_Comm comm, int64_t *context_id_out) +init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_params_t *subfiling_config, + int file_acc_flags, MPI_Comm comm, int64_t *context_id_out) { - subfiling_context_t *new_context = NULL; - sf_topology_t *app_topology = NULL; - int64_t context_id = -1; - int file_index = -1; - herr_t ret_value = SUCCEED; + subfiling_context_t *new_context = NULL; + sf_topology_t *app_topology = NULL; + MPI_Comm node_comm = MPI_COMM_NULL; + int64_t context_id = -1; + FILE *config_file = NULL; + char *file_basename = NULL; + char *subfile_dir = NULL; + int mpi_rank; + int mpi_size; + int mpi_code; + herr_t ret_value = SUCCEED; HDassert(context_id_out); - file_index = get_next_fid_map_index(); - HDassert(file_index >= 0); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); /* Use the file's index to create a new subfiling context ID */ - if ((context_id = H5_new_subfiling_object_id(SF_CONTEXT, file_index)) < 0) + if ((context_id = H5_new_subfiling_object_id(SF_CONTEXT)) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't create new subfiling context ID"); /* Create a new subfiling context object with the created context ID */ if (NULL == (new_context = H5_get_subfiling_object(context_id))) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't create new subfiling object"); + new_context->sf_context_id = -1; + new_context->topology = NULL; + new_context->sf_msg_comm = MPI_COMM_NULL; + new_context->sf_data_comm = MPI_COMM_NULL; + new_context->sf_eof_comm = MPI_COMM_NULL; + new_context->sf_node_comm = MPI_COMM_NULL; + new_context->sf_group_comm = MPI_COMM_NULL; + + /* + * If there's an existing subfiling configuration file for + * this file, read the stripe size and number of subfiles + * from it + */ + if (0 == (file_acc_flags & O_CREAT)) { + int64_t config[2] = {0, 0}; /* {stripe size, num subfiles} */ + + if (mpi_rank == 0) { + /* TODO: currently no support for subfile prefix */ + if (H5_dirname(base_filename, &subfile_dir) < 0) + config[0] = -1; + + if (config[0] >= 0) { + if (H5_basename(base_filename, &file_basename) < 0) + config[0] = -1; + } + + if (config[0] >= 0) { + if (open_config_file(file_basename, subfile_dir, file_id, "r", &config_file) < 0) + config[0] = -1; + } + + if (config[0] >= 0) { + if (!config_file) + config[0] = -2; /* No config file; use setting from configuration */ + else { + /* + * If a subfiling configuration file exists and we aren't truncating + * it, read the number of subfiles used at file creation time. + */ + if (H5_get_subfiling_config_from_file(config_file, &config[0], &config[1]) < 0) + config[0] = -1; + } + } + } + + if (mpi_size > 1) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(config, 2, MPI_INT64_T, 0, comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + + /* + * Override the stripe size and stripe count settings in the + * application's subfiling configuration if we read values + * from an existing subfiling configuration file + */ + if (config[0] == -1) + H5_SUBFILING_GOTO_ERROR( + H5E_FILE, H5E_CANTOPENFILE, FAIL, + "lead process couldn't read the number of subfiles from subfiling configuration file"); + else { + if (config[0] > 0) + subfiling_config->stripe_size = config[0]; + if (config[1] > 0) { + H5_CHECK_OVERFLOW(config[1], int64_t, int32_t); + subfiling_config->stripe_count = (int32_t)config[1]; + } + } + } + else { + char *env_value = NULL; + + /* Check for a subfiling stripe size setting from the environment */ + if ((env_value = HDgetenv(H5FD_SUBFILING_STRIPE_SIZE))) { + long long stripe_size = -1; + + errno = 0; + + stripe_size = HDstrtoll(env_value, NULL, 0); + if (ERANGE == errno) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "invalid stripe size setting for " H5FD_SUBFILING_STRIPE_SIZE); + + if (stripe_size > 0) { + subfiling_config->stripe_size = (int64_t)stripe_size; + } + } + } + +#if H5_CHECK_MPI_VERSION(3, 0) + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + + /* Create an MPI sub-communicator for intra-node communications */ + if (MPI_SUCCESS != + (mpi_code = MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, mpi_rank, MPI_INFO_NULL, &node_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_split_type failed", mpi_code); + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(node_comm, MPI_ERRORS_RETURN))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code); +#else +#error "MPI-3 required for MPI_Comm_split_type" +#endif /* * Setup the application topology information, including the computed * number and distribution map of the set of I/O concentrators */ - if (init_app_topology(subfiling_config->ioc_selection, comm, &app_topology) < 0) + if (init_app_topology(subfiling_config, comm, node_comm, &app_topology) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application topology"); new_context->sf_context_id = context_id; - if (init_subfiling_context(new_context, subfiling_config, app_topology, comm) < 0) + if (init_subfiling_context(new_context, base_filename, file_id, subfiling_config, app_topology, comm) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize subfiling application topology object"); - - new_context->sf_base_addr = 0; - if (new_context->topology->rank_is_ioc) { - new_context->sf_base_addr = - (int64_t)(new_context->topology->subfile_rank * new_context->sf_stripe_size); - } + new_context->sf_node_comm = node_comm; *context_id_out = context_id; done: + if (config_file && (EOF == HDfclose(config_file))) + H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, + "couldn't close subfiling configuration file"); + + H5MM_free(file_basename); + H5MM_free(subfile_dir); + if (ret_value < 0) { - HDfree(app_topology); + if (app_topology) { + if (H5_free_subfiling_topology(app_topology) < 0) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling topology"); + } + + if (H5_mpi_comm_free(&node_comm) < 0) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free MPI communicator"); if (context_id >= 0 && H5_free_subfiling_object(context_id) < 0) H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object"); + + *context_id_out = -1; } H5_SUBFILING_FUNC_LEAVE; @@ -929,76 +888,89 @@ done: /*------------------------------------------------------------------------- * Function: init_app_topology * - * Purpose: Once a sorted collection of hostid/mpi_rank tuples has been - * created and the number of unique hostids (nodes) has - * been determined, we may modify this "default" value for - * the number of IO Concentrators for this application. + * Purpose: Determine the topology of the application so that MPI ranks + * can be assigned as I/O concentrators. The default is to use + * 1 MPI rank per node as an I/O concentrator, but this can be + * changed by the application's subfiling configuration, or by + * an environment variable (H5FD_SUBFILING_IOC_PER_NODE). * - * The default of one(1) IO concentrator per node can be - * changed (principally for testing) by environment variable. - * if IOC_COUNT_PER_NODE is defined, then that integer value - * is utilized as a multiplier to modify the set of - * IO Concentrator ranks. - * - * The cached results will be replicated within the - * subfiling_context_t structure and is utilized as a map from - * io concentrator rank to MPI communicator rank for message - * sends and receives. - * - * Return: The number of IO Concentrator ranks. We also cache - * the MPI ranks in the 'io_concentrator' vector variable. - * The length of this vector is cached as 'n_io_concentrators'. - * Errors: MPI_Abort if memory cannot be allocated. - * - * Programmer: Richard Warren - * 7/17/2020 - * - * Changes: - Initial Version/None. - * - Updated the API to allow a variety of methods for - * determining the number and MPI ranks that will have - * IO Concentrators. The default approach will define - * a single IOC per node. + * Return: Non-negative on success/Negative on failure * *------------------------------------------------------------------------- */ static herr_t -init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm, +init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm, sf_topology_t **app_topology_out) { - sf_topology_t *app_topology = NULL; - app_layout_t *app_layout = NULL; - char *env_value = NULL; - char *ioc_sel_str = NULL; - long ioc_select_val = -1; - long iocs_per_node = 1; - int ioc_count = 0; - int comm_rank; - int comm_size; - int mpi_code; - herr_t ret_value = SUCCEED; + H5FD_subfiling_ioc_select_t ioc_selection_type; + sf_topology_t *app_topology = NULL; + int64_t topology_id = -1; + char *env_value = NULL; + char *ioc_sel_str = NULL; + long ioc_select_val = -1; + long iocs_per_node = 1; + int ioc_count = 0; + int rank_multiple = 1; + int comm_rank; + int comm_size; + int mpi_code; + herr_t ret_value = SUCCEED; + HDassert(subfiling_config); HDassert(MPI_COMM_NULL != comm); + HDassert(MPI_COMM_NULL != node_comm); HDassert(app_topology_out); HDassert(!*app_topology_out); if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &comm_rank))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); - if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &comm_size))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + ioc_selection_type = subfiling_config->ioc_selection; + /* Check if an IOC selection type was specified by environment variable */ if (get_ioc_selection_criteria_from_env(&ioc_selection_type, &ioc_sel_str) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't get IOC selection type from environment"); - /* Sanity checking on different IOC selection strategies */ + /* + * Check parameters for the specified IOC selection strategy + * and determine the maximum number of I/O concentrators + */ switch (ioc_selection_type) { - case SELECT_IOC_EVERY_NTH_RANK: { - errno = 0; + case SELECT_IOC_ONE_PER_NODE: { + if (comm_size > 1) { + /* Check for an IOC-per-node value set in the environment */ + if ((env_value = HDgetenv(H5FD_SUBFILING_IOC_PER_NODE))) { + errno = 0; + ioc_select_val = HDstrtol(env_value, NULL, 0); + if ((ERANGE == errno)) { + HDprintf("invalid value '%s' for " H5FD_SUBFILING_IOC_PER_NODE "\n", env_value); + ioc_select_val = 1; + } + + if (ioc_select_val > 0) + iocs_per_node = ioc_select_val; + } + } + + /* IOC count will be adjusted after number of nodes is determined */ + H5_CHECK_OVERFLOW(iocs_per_node, long, int); + ioc_count = (int)iocs_per_node; + + break; + } + case SELECT_IOC_EVERY_NTH_RANK: { + /* + * User specifies a rank multiple value. Selection starts + * with rank 0 and then the user-specified stride is applied + * to identify other IOC ranks. + */ ioc_select_val = 1; if (ioc_sel_str) { + errno = 0; ioc_select_val = HDstrtol(ioc_sel_str, NULL, 0); if ((ERANGE == errno) || (ioc_select_val <= 0)) { HDprintf("invalid IOC selection strategy string '%s' for strategy " @@ -1009,20 +981,25 @@ init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm, } } - break; - } + H5_CHECK_OVERFLOW(ioc_select_val, long, int); + ioc_count = (comm_size / (int)ioc_select_val); + + if ((comm_size % ioc_select_val) != 0) { + ioc_count++; + } - case SELECT_IOC_WITH_CONFIG: - HDprintf("SELECT_IOC_WITH_CONFIG IOC selection strategy not supported yet; defaulting to " - "SELECT_IOC_ONE_PER_NODE\n"); - ioc_selection_type = SELECT_IOC_ONE_PER_NODE; break; + } case SELECT_IOC_TOTAL: { - errno = 0; - + /* + * User specifies a total number of I/O concentrators. + * Starting with rank 0, a stride of (mpi_size / total) + * is applied to identify other IOC ranks. + */ ioc_select_val = 1; if (ioc_sel_str) { + errno = 0; ioc_select_val = HDstrtol(ioc_sel_str, NULL, 0); if ((ERANGE == errno) || (ioc_select_val <= 0) || (ioc_select_val >= comm_size)) { HDprintf("invalid IOC selection strategy string '%s' for strategy SELECT_IOC_TOTAL; " @@ -1033,113 +1010,626 @@ init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm, } } + H5_CHECK_OVERFLOW(ioc_select_val, long, int); + ioc_count = (int)ioc_select_val; + + rank_multiple = (comm_size / ioc_count); + break; } + case SELECT_IOC_WITH_CONFIG: default: + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid IOC selection strategy"); + break; + } + + /* + * TODO: A different IOC selection string from the environment than what was + * used originally will cause the IOCs to be assigned differently than + * expected. While this generally shouldn't cause issues (other than + * for the SELECT_IOC_TOTAL case), this should still be dealt with + * eventually. + */ + /* Check the subfiling topology cache to see if there's a matching object */ + if (find_cached_topology_info(comm, subfiling_config, iocs_per_node, &app_topology) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "can't check for cached subfiling topology object"); + HDassert(!app_topology || (app_topology->selection_type == ioc_selection_type)); + + if (!app_topology) { + /* Generate an ID for the application topology object */ + if ((topology_id = H5_new_subfiling_object_id(SF_TOPOLOGY)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get ID for subfiling topology object"); + + /* Get a new application topology object from the cache */ + if (NULL == (app_topology = H5_get_subfiling_object(topology_id))) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get subfiling topology object"); + app_topology->app_layout = NULL; + app_topology->app_comm = MPI_COMM_NULL; + app_topology->rank_is_ioc = FALSE; + app_topology->ioc_idx = -1; + app_topology->n_io_concentrators = ioc_count; + app_topology->io_concentrators = NULL; + app_topology->selection_type = ioc_selection_type; + + if (H5_mpi_comm_dup(comm, &app_topology->app_comm) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTCOPY, FAIL, "can't duplicate MPI communicator"); + + if (init_app_layout(app_topology, comm, node_comm) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application layout"); + HDassert(app_topology->app_layout); + HDassert(app_topology->app_layout->layout); + HDassert(app_topology->app_layout->node_ranks); + HDassert(app_topology->app_layout->node_count > 0); + + /* + * Now that the application node count has been determined, adjust the + * number of I/O concentrators for the SELECT_IOC_ONE_PER_NODE case + */ + if (app_topology->selection_type == SELECT_IOC_ONE_PER_NODE) + app_topology->n_io_concentrators = (int)iocs_per_node * app_topology->app_layout->node_count; + + /* + * Make sure the number of I/O concentrators doesn't + * exceed the specified number of subfiles + */ + if (subfiling_config->stripe_count != H5FD_SUBFILING_DEFAULT_STRIPE_COUNT) { + if (app_topology->n_io_concentrators > subfiling_config->stripe_count) + app_topology->n_io_concentrators = subfiling_config->stripe_count; + } + + /* + * Determine which ranks are I/O concentrator ranks, based on the + * given IOC selection strategy and MPI information. + */ + if (identify_ioc_ranks(app_topology, rank_multiple) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, + "couldn't determine which MPI ranks are I/O concentrators"); + } + + *app_topology_out = app_topology; + +done: + if (ret_value < 0) { + if (app_topology && (topology_id >= 0)) { + if (H5_free_subfiling_object(topology_id) < 0) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free subfiling topology object"); + } + } + + H5_SUBFILING_FUNC_LEAVE; +} + +/* +------------------------------------------------------------------------- + Programmer: Richard Warren + Purpose: Return a character string which represents either the + default selection method: SELECT_IOC_ONE_PER_NODE; or + if the user has selected a method via the environment + variable (H5FD_SUBFILING_IOC_SELECTION_CRITERIA), we + return that along with any optional qualifier with for + that method. + + Errors: None. + + Revision History -- Initial implementation +------------------------------------------------------------------------- +*/ +static herr_t +get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type, char **ioc_sel_info_str) +{ + char *opt_value = NULL; + char *env_value = HDgetenv(H5FD_SUBFILING_IOC_SELECTION_CRITERIA); + herr_t ret_value = SUCCEED; + + HDassert(ioc_selection_type); + HDassert(ioc_sel_info_str); + + *ioc_sel_info_str = NULL; + + if (env_value) { + long check_value; + + /* + * For non-default options, the environment variable + * should have the following form: integer:[integer|string] + * In particular, EveryNthRank == 1:64 or every 64 ranks assign an IOC + * or WithConfig == 2:/<full_path_to_config_file> + */ + if ((opt_value = HDstrchr(env_value, ':'))) + *opt_value++ = '\0'; + + errno = 0; + check_value = HDstrtol(env_value, NULL, 0); + + if (errno == ERANGE) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "couldn't parse value from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA + " environment variable"); + + if ((check_value < 0) || (check_value >= ioc_selection_options)) + H5_SUBFILING_GOTO_ERROR( + H5E_VFL, H5E_BADVALUE, FAIL, + "invalid IOC selection type value %ld from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA + " environment variable", + check_value); + + *ioc_selection_type = (H5FD_subfiling_ioc_select_t)check_value; + *ioc_sel_info_str = opt_value; + } + +done: + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: find_cached_topology_info + * + * Purpose: Given an MPI communicator and IOC selection strategy, + * checks the subfiling topology cached to see if any matching + * topology objects have been cached. + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +static herr_t +find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config, long iocs_per_node, + sf_topology_t **app_topology) +{ + H5FD_subfiling_ioc_select_t ioc_selection_type; + int32_t stripe_count; + herr_t ret_value = SUCCEED; + + HDassert(subf_config); + + ioc_selection_type = subf_config->ioc_selection; + stripe_count = subf_config->stripe_count; + + for (size_t i = 0; i < sf_topology_cache_num_entries; i++) { + sf_topology_t *cached_topology = sf_topology_cache[i]; + int result; + int mpi_code; + + HDassert(cached_topology); + + /* + * If the selection types differ, just reject the cached topology + * for now rather than checking if the mapping is equivalent + */ + if (ioc_selection_type != cached_topology->selection_type) + continue; + + /* + * If the number of I/O concentrators in the cached topology + * is greater than the specified target number of subfiles, + * reject the cached topology + */ + if (stripe_count != H5FD_SUBFILING_DEFAULT_STRIPE_COUNT) { + if (stripe_count < cached_topology->n_io_concentrators) + continue; + } + + if (cached_topology->selection_type == SELECT_IOC_ONE_PER_NODE) { + HDassert(iocs_per_node >= 1); + HDassert(cached_topology->app_layout->node_count > 0); + + /* + * If a IOCs-per-node setting was set in the environment and would + * cause the application topology to differ from the cached topology + * we found, don't reuse the cached topology + */ + if (cached_topology->n_io_concentrators != + (iocs_per_node * cached_topology->app_layout->node_count)) + continue; + } + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_compare(comm, cached_topology->app_comm, &result))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_compare failed", mpi_code); + + if (MPI_IDENT == result || MPI_CONGRUENT == result) { + *app_topology = cached_topology; break; + } } - /* Allocate new application topology information object */ - if (NULL == (app_topology = HDcalloc(1, sizeof(*app_topology)))) +done: + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: init_app_layout + * + * Purpose: Determines the layout of MPI ranks across nodes in order to + * figure out the final application topology + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +static herr_t +init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm) +{ + app_layout_t *app_layout = NULL; + int mpi_code; + herr_t ret_value = SUCCEED; + + HDassert(app_topology); + HDassert(!app_topology->app_layout); + HDassert(MPI_COMM_NULL != comm); + HDassert(MPI_COMM_NULL != node_comm); + + if (NULL == (app_layout = HDcalloc(1, sizeof(*app_layout)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't create new subfiling topology object"); + "couldn't allocate application layout structure"); - app_topology->subfile_rank = -1; - app_topology->selection_type = ioc_selection_type; + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &app_layout->world_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &app_layout->world_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(node_comm, &app_layout->node_local_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(node_comm, &app_layout->node_local_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); - if (NULL == (app_topology->io_concentrators = HDcalloc((size_t)comm_size, sizeof(int)))) + if (NULL == (app_layout->layout = HDmalloc((size_t)app_layout->world_size * sizeof(*app_layout->layout)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate array of I/O concentrator ranks"); + "couldn't allocate application layout array"); - if (!app_layout) { - if (NULL == (app_layout = HDcalloc(1, sizeof(*app_layout)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate application layout structure"); + /* Gather the list of layout_t pairs to all ranks */ + if (gather_topology_info(app_layout, comm, node_comm) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "can't gather application topology info"); - if (NULL == (app_layout->node_ranks = HDcalloc(1, ((size_t)comm_size + 1) * sizeof(int)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate application layout node rank array"); + /* Sort the list according to the node local lead rank values */ + HDqsort(app_layout->layout, (size_t)app_layout->world_size, sizeof(layout_t), compare_layout_nodelocal); - if (NULL == (app_layout->layout = HDcalloc(1, ((size_t)comm_size + 1) * sizeof(layout_t)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate application layout array"); - } + /* + * Count the number of nodes by checking how many + * entries have a node local rank value of 0 + */ + app_layout->node_count = 0; + for (size_t i = 0; i < (size_t)app_layout->world_size; i++) + if (app_layout->layout[i].node_local_rank == 0) + app_layout->node_count++; + + HDassert(app_layout->node_count > 0); + + if (NULL == + (app_layout->node_ranks = HDmalloc((size_t)app_layout->node_count * sizeof(*app_layout->node_ranks)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "couldn't allocate application layout node rank array"); - app_layout->world_size = comm_size; - app_layout->world_rank = comm_rank; + /* + * Record the rank value of the "lead" + * MPI rank on each node for later use + */ + for (size_t i = 0, node_rank_index = 0; i < (size_t)app_layout->world_size; i++) { + if (app_layout->layout[i].node_local_rank == 0) { + HDassert(node_rank_index < (size_t)app_layout->node_count); + app_layout->node_ranks[node_rank_index++] = app_layout->layout[i].rank; + } + } app_topology->app_layout = app_layout; - gather_topology_info(app_topology, comm); +done: + if (ret_value < 0) { + if (app_layout) { + HDfree(app_layout->layout); + HDfree(app_layout->node_ranks); + HDfree(app_layout); + } + } + + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: gather_topology_info + * + * Purpose: Collectively generate a list of layout_t structures + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +static herr_t +gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm) +{ + MPI_Group file_group = MPI_GROUP_NULL; + MPI_Group node_group = MPI_GROUP_NULL; + layout_t my_layout_info; + layout_t *layout_info_partial = NULL; + MPI_Comm aggr_comm = MPI_COMM_NULL; + int *recv_counts = NULL; + int *recv_displs = NULL; + int sf_world_size; + int sf_world_rank; + int node_local_rank; + int node_local_size; + int mpi_code; + herr_t ret_value = SUCCEED; + + HDassert(app_layout); + HDassert(app_layout->layout); + HDassert(MPI_COMM_NULL != comm); + + sf_world_rank = app_layout->world_rank; + sf_world_size = app_layout->world_size; + node_local_rank = app_layout->node_local_rank; + node_local_size = app_layout->node_local_size; + + my_layout_info.rank = sf_world_rank; + my_layout_info.node_local_rank = node_local_rank; + my_layout_info.node_local_size = node_local_size; /* - * Determine which ranks are I/O concentrator ranks, based on the - * given IOC selection strategy and MPI information. + * Get the rank value for the "lead" rank on this + * rank's node so that we can group the layout_t + * information for all node-local ranks together */ - switch (ioc_selection_type) { + { + const int local_lead = 0; + int lead_rank; + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_group(comm, &file_group))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_group failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_group(intra_comm, &node_group))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_group failed", mpi_code); + if (MPI_SUCCESS != + (mpi_code = MPI_Group_translate_ranks(node_group, 1, &local_lead, file_group, &lead_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Group_translate_ranks failed", mpi_code); + + if (MPI_UNDEFINED == lead_rank) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't determine lead rank on node"); + + my_layout_info.node_lead_rank = lead_rank; + + if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&node_group))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Group_free failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&file_group))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Group_free failed", mpi_code); + } + + app_layout->layout[sf_world_rank] = my_layout_info; + + if (sf_world_size > 1) { +#ifdef H5_SUBFILING_PREFER_ALLGATHER_TOPOLOGY + (void)intra_comm; + + if (MPI_SUCCESS != + (mpi_code = MPI_Allgather(&my_layout_info, 4, MPI_INT, app_layout->layout, 4, MPI_INT, comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code); +#else + int aggr_comm_size = 0; + + HDassert(MPI_COMM_NULL != intra_comm); + + /* Split the file communicator into a sub-group of one rank per node */ + if (MPI_SUCCESS != (mpi_code = MPI_Comm_split(comm, node_local_rank, sf_world_rank, &aggr_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_split failed", mpi_code); + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(aggr_comm, &aggr_comm_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + + /* Allocate a partial layout info array to aggregate into from node-local ranks */ + if (node_local_rank == 0) { + if (NULL == + (layout_info_partial = HDmalloc((size_t)node_local_size * sizeof(*layout_info_partial)))) + /* Push error, but participate in gather operation */ + H5_SUBFILING_DONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "can't allocate layout info array"); + } + + /* Gather node-local layout info to single master rank on each node */ + if (MPI_SUCCESS != (mpi_code = MPI_Gather(&my_layout_info, 4, MPI_INT, layout_info_partial, 4, + MPI_INT, 0, intra_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Gather failed", mpi_code); + + /* Gather total layout info from/to each master rank on each node */ + if (node_local_rank == 0) { + int send_size = 4 * node_local_size; + + if (NULL == (recv_counts = HDmalloc((size_t)aggr_comm_size * sizeof(*recv_counts)))) + H5_SUBFILING_DONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "can't allocate receive counts array"); + if (NULL == (recv_displs = HDmalloc((size_t)aggr_comm_size * sizeof(*recv_displs)))) + H5_SUBFILING_DONE_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "can't allocate receive displacements array"); + + if (MPI_SUCCESS != + (mpi_code = MPI_Allgather(&send_size, 1, MPI_INT, recv_counts, 1, MPI_INT, aggr_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgather failed", mpi_code); + + recv_displs[0] = 0; + for (int i = 1; i < aggr_comm_size; i++) + recv_displs[i] = recv_displs[i - 1] + recv_counts[i - 1]; + + if (MPI_SUCCESS != + (mpi_code = MPI_Allgatherv(layout_info_partial, send_size, MPI_INT, app_layout->layout, + recv_counts, recv_displs, MPI_INT, aggr_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Allgatherv failed", mpi_code); + + HDfree(recv_displs); + HDfree(recv_counts); + recv_displs = NULL; + recv_counts = NULL; + } + + /* + * Each master rank on each node distributes the total + * layout info back to other node-local ranks + */ + if (MPI_SUCCESS != + (mpi_code = MPI_Bcast(app_layout->layout, 4 * sf_world_size, MPI_INT, 0, intra_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); +#endif + } + +done: + HDfree(recv_displs); + HDfree(recv_counts); + HDfree(layout_info_partial); + + if (H5_mpi_comm_free(&aggr_comm) < 0) + H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator"); + + if (node_group != MPI_GROUP_NULL) + if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&node_group))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Group_free failed", mpi_code); + if (file_group != MPI_GROUP_NULL) + if (MPI_SUCCESS != (mpi_code = MPI_Group_free(&file_group))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Group_free failed", mpi_code); + + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: compare_layout_nodelocal + * + * Purpose: Qsort sorting callback that sorts layout_t structures + * according to their node local lead MPI rank values. Ties + * are broken according to their regular node local MPI rank + * values + * + *------------------------------------------------------------------------- + */ +static int +compare_layout_nodelocal(const void *layout1, const void *layout2) +{ + const layout_t *l1 = (const layout_t *)layout1; + const layout_t *l2 = (const layout_t *)layout2; + + if (l1->node_lead_rank == l2->node_lead_rank) { + return (l1->node_local_rank > l2->node_local_rank) - (l1->node_local_rank < l2->node_local_rank); + } + else + return (l1->node_lead_rank > l2->node_lead_rank) - (l1->node_lead_rank < l2->node_lead_rank); +} + +/*------------------------------------------------------------------------- + * Function: identify_ioc_ranks + * + * Purpose: We've already identified the number of unique nodes and + * have a sorted list of layout_t structures. Under normal + * conditions, we only utilize a single IOC per node. Under + * that circumstance, we only need to fill the + * io_concentrators vector from the node_ranks array (which + * contains the index into the layout array of lowest MPI rank + * on each node) into the io_concentrators vector; Otherwise, + * while determining the number of local ranks per node, we + * can also select one or more additional IOCs. + * + * As a side effect, we fill the 'io_concentrators' vector + * and set the 'rank_is_ioc' flag to TRUE if our rank is + * identified as owning an I/O Concentrator (IOC). + * + *------------------------------------------------------------------------- + */ +static herr_t +identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride) +{ + app_layout_t *app_layout = NULL; + int *io_concentrators = NULL; + int max_iocs = 0; + herr_t ret_value = SUCCEED; + + HDassert(app_topology); + HDassert(!app_topology->io_concentrators); + HDassert(app_topology->n_io_concentrators > 0); + HDassert(app_topology->app_layout); + HDassert(app_topology->app_layout->layout); + HDassert(app_topology->app_layout->node_count > 0); + + app_layout = app_topology->app_layout; + + max_iocs = app_topology->n_io_concentrators; + + if (NULL == (app_topology->io_concentrators = HDmalloc((size_t)app_topology->n_io_concentrators * + sizeof(*app_topology->io_concentrators)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "couldn't allocate array of I/O concentrator ranks"); + + io_concentrators = app_topology->io_concentrators; + + switch (app_topology->selection_type) { case SELECT_IOC_ONE_PER_NODE: { - int node_count; + int total_ioc_count = 0; + int iocs_per_node = 1; - app_topology->selection_type = SELECT_IOC_ONE_PER_NODE; + if (app_topology->n_io_concentrators > app_layout->node_count) + iocs_per_node = app_topology->n_io_concentrators / app_layout->node_count; - if ((node_count = count_nodes(app_topology, comm)) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, - "couldn't determine number of nodes used"); + HDassert(app_layout->node_ranks); - /* Check for an IOC-per-node value set in the environment */ - if ((env_value = HDgetenv(H5FD_SUBFILING_IOC_PER_NODE))) { - errno = 0; - ioc_select_val = HDstrtol(env_value, NULL, 0); - if ((ERANGE == errno)) { - HDprintf("invalid value '%s' for " H5FD_SUBFILING_IOC_PER_NODE "\n", env_value); - ioc_select_val = 1; + for (size_t i = 0; i < (size_t)app_layout->node_count; i++) { + int node_index = app_layout->node_ranks[i]; + int local_size = app_layout->layout[node_index].node_local_size; + + HDassert(total_ioc_count < app_topology->n_io_concentrators); + io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank; + + if (app_layout->world_rank == io_concentrators[total_ioc_count]) { + app_topology->ioc_idx = total_ioc_count; + app_topology->rank_is_ioc = TRUE; } - if (ioc_select_val > 0) - iocs_per_node = ioc_select_val; - } + total_ioc_count++; - H5_CHECK_OVERFLOW(iocs_per_node, long, int); - ioc_count = identify_ioc_ranks(app_topology, node_count, (int)iocs_per_node); + for (size_t j = 1; j < (size_t)iocs_per_node; j++) { + if (total_ioc_count >= max_iocs) + break; + if (j >= (size_t)local_size) + break; - break; - } + HDassert(total_ioc_count < app_topology->n_io_concentrators); + io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank; - case SELECT_IOC_EVERY_NTH_RANK: { - /* - * User specifies a rank multiple value. Selection starts - * with rank 0 and then the user-specified stride is applied\ - * to identify other IOC ranks. - */ + if (app_layout->world_rank == io_concentrators[total_ioc_count]) { + app_topology->ioc_idx = total_ioc_count; + app_topology->rank_is_ioc = TRUE; + } - H5_CHECK_OVERFLOW(ioc_select_val, long, int); - ioc_count = (comm_size / (int)ioc_select_val); + total_ioc_count++; + } - if ((comm_size % ioc_select_val) != 0) { - ioc_count++; + if (total_ioc_count >= max_iocs) + break; } - assign_ioc_ranks(app_topology, ioc_count, (int)ioc_select_val); + /* Set final number of I/O concentrators after adjustments */ + app_topology->n_io_concentrators = total_ioc_count; break; } + case SELECT_IOC_EVERY_NTH_RANK: case SELECT_IOC_TOTAL: { - int rank_multiple = 0; + int world_size = app_layout->world_size; + int ioc_next = 0; - /* - * User specifies a total number of I/O concentrators. - * Starting with rank 0, a stride of (mpi_size / total) - * is applied to identify other IOC ranks. - */ + HDassert(rank_stride > 0); - H5_CHECK_OVERFLOW(ioc_select_val, long, int); - ioc_count = (int)ioc_select_val; + for (int i = 0; ioc_next < app_topology->n_io_concentrators; ioc_next++) { + int ioc_index = rank_stride * i++; - rank_multiple = (comm_size / ioc_count); + if (ioc_index >= world_size) + break; + + io_concentrators[ioc_next] = app_layout->layout[ioc_index].rank; + + if (app_layout->world_rank == io_concentrators[ioc_next]) { + app_topology->ioc_idx = ioc_next; + app_topology->rank_is_ioc = TRUE; + } + + if (ioc_next + 1 >= max_iocs) + break; + } - assign_ioc_ranks(app_topology, ioc_count, rank_multiple); + /* Set final number of I/O concentrators after adjustments */ + app_topology->n_io_concentrators = ioc_next; break; } @@ -1150,31 +1640,10 @@ init_app_topology(H5FD_subfiling_ioc_select_t ioc_selection_type, MPI_Comm comm, break; } - HDassert(ioc_count > 0); - app_topology->n_io_concentrators = ioc_count; - - /* - * Create a vector of "potential" file descriptors - * which can be indexed by the IOC ID - */ - if (NULL == (app_topology->subfile_fd = HDcalloc((size_t)ioc_count, sizeof(int)))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate subfile file descriptor array"); - - *app_topology_out = app_topology; - done: if (ret_value < 0) { - if (app_layout) { - HDfree(app_layout->layout); - HDfree(app_layout->node_ranks); - HDfree(app_layout); - } - if (app_topology) { - HDfree(app_topology->subfile_fd); + if (app_topology) HDfree(app_topology->io_concentrators); - HDfree(app_topology); - } } H5_SUBFILING_FUNC_LEAVE; @@ -1196,77 +1665,104 @@ done: *------------------------------------------------------------------------- */ static herr_t -init_subfiling_context(subfiling_context_t *sf_context, H5FD_subfiling_shared_config_t *subfiling_config, - sf_topology_t *app_topology, MPI_Comm file_comm) +init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename, uint64_t file_id, + H5FD_subfiling_params_t *subfiling_config, sf_topology_t *app_topology, + MPI_Comm file_comm) { char *env_value = NULL; - int comm_rank; + int mpi_rank; int mpi_code; herr_t ret_value = SUCCEED; HDassert(sf_context); HDassert(sf_context->topology == NULL); + HDassert(sf_context->sf_context_id >= 0); + HDassert(base_filename); + HDassert(file_id != UINT64_MAX); HDassert(subfiling_config); HDassert(app_topology); HDassert(app_topology->n_io_concentrators > 0); HDassert(MPI_COMM_NULL != file_comm); - sf_context->topology = app_topology; + sf_context->h5_file_id = file_id; + sf_context->sf_fids = NULL; + sf_context->sf_num_fids = 0; + sf_context->sf_num_subfiles = subfiling_config->stripe_count; + sf_context->sf_write_count = 0; + sf_context->sf_read_count = 0; + sf_context->sf_eof = HADDR_UNDEF; + sf_context->sf_stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; + sf_context->sf_base_addr = 0; sf_context->sf_msg_comm = MPI_COMM_NULL; sf_context->sf_data_comm = MPI_COMM_NULL; sf_context->sf_eof_comm = MPI_COMM_NULL; - sf_context->sf_barrier_comm = MPI_COMM_NULL; + sf_context->sf_node_comm = MPI_COMM_NULL; sf_context->sf_group_comm = MPI_COMM_NULL; - sf_context->sf_intercomm = MPI_COMM_NULL; - sf_context->sf_stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; - sf_context->sf_write_count = 0; - sf_context->sf_read_count = 0; - sf_context->sf_eof = HADDR_UNDEF; - sf_context->h5_file_handle = NULL; - sf_context->sf_fid = -1; sf_context->sf_group_size = 1; sf_context->sf_group_rank = 0; - sf_context->h5_filename = NULL; - sf_context->sf_filename = NULL; sf_context->subfile_prefix = NULL; + sf_context->h5_filename = NULL; sf_context->ioc_data = NULL; + sf_context->topology = app_topology; #ifdef H5_SUBFILING_DEBUG sf_context->sf_logfile = NULL; #endif + if (NULL == (sf_context->h5_filename = HDstrdup(base_filename))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "couldn't allocate space for subfiling filename"); + + /* Check for a subfile name prefix setting in the environment */ + if ((env_value = HDgetenv(H5FD_SUBFILING_SUBFILE_PREFIX))) { + if (NULL == (sf_context->subfile_prefix = HDstrdup(env_value))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix value"); + } + /* - * Set IOC stripe size from subfiling configuration, then check - * for a setting from the environment + * Set IOC stripe size from subfiling configuration */ if (subfiling_config->stripe_size > 0) sf_context->sf_stripe_size = subfiling_config->stripe_size; - if ((env_value = HDgetenv(H5FD_SUBFILING_STRIPE_SIZE))) { - long long stripe_size = -1; - - errno = 0; - - stripe_size = HDstrtoll(env_value, NULL, 0); - if (ERANGE == errno) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, - "invalid stripe size setting for " H5FD_SUBFILING_STRIPE_SIZE); - - if (stripe_size > 0) { - sf_context->sf_stripe_size = (int64_t)stripe_size; - } - } + /* + * If still set to the default, set the number of subfiles + * according to the default mapping of 1 I/O concentrator + * -> 1 subfile + */ + if (sf_context->sf_num_subfiles == H5FD_SUBFILING_DEFAULT_STRIPE_COUNT) + sf_context->sf_num_subfiles = app_topology->n_io_concentrators; /* * Set blocksize per stripe value after possibly adjusting - * for user-specified subfile stripe size + * for user-specified subfile stripe size and number of + * subfiles */ - sf_context->sf_blocksize_per_stripe = sf_context->sf_stripe_size * app_topology->n_io_concentrators; + sf_context->sf_blocksize_per_stripe = sf_context->sf_stripe_size * sf_context->sf_num_subfiles; - /* Check for a subfile name prefix setting in the environment */ - if ((env_value = HDgetenv(H5FD_SUBFILING_SUBFILE_PREFIX))) { - if (NULL == (sf_context->subfile_prefix = HDstrdup(env_value))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix value"); + if (app_topology->rank_is_ioc) { + int leftover_subfiles; + + /* Adjust base address after stripe size is set, if necessary */ + sf_context->sf_base_addr = (int64_t)(app_topology->ioc_idx * sf_context->sf_stripe_size); + + /* + * Calculate the number of subfiles this rank owns by + * round-robining them across the available IOCs and + * then allocate an array for the subfile IDs + */ + sf_context->sf_num_fids = sf_context->sf_num_subfiles / app_topology->n_io_concentrators; + + leftover_subfiles = sf_context->sf_num_subfiles % app_topology->n_io_concentrators; + if (leftover_subfiles && (leftover_subfiles > app_topology->ioc_idx)) + sf_context->sf_num_fids++; + + if (NULL == + (sf_context->sf_fids = HDmalloc((size_t)sf_context->sf_num_fids * sizeof(*sf_context->sf_fids)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate subfile IDs array"); + + for (int i = 0; i < sf_context->sf_num_fids; i++) + sf_context->sf_fids[i] = -1; } /* @@ -1274,7 +1770,7 @@ init_subfiling_context(subfiling_context_t *sf_context, H5FD_subfiling_shared_co * to/from IOC ranks */ - if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(file_comm, &comm_rank))) + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(file_comm, &mpi_rank))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(file_comm, &sf_context->sf_msg_comm))) @@ -1295,15 +1791,9 @@ init_subfiling_context(subfiling_context_t *sf_context, H5FD_subfiling_shared_co if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(sf_context->sf_eof_comm, MPI_ERRORS_RETURN))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code); - if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(file_comm, &sf_context->sf_barrier_comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code); - - if (MPI_SUCCESS != (mpi_code = MPI_Comm_set_errhandler(sf_context->sf_barrier_comm, MPI_ERRORS_RETURN))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_set_errhandler failed", mpi_code); - /* Create an MPI sub-communicator for IOC ranks */ if (app_topology->n_io_concentrators > 1) { - if (MPI_SUCCESS != (mpi_code = MPI_Comm_split(file_comm, app_topology->rank_is_ioc, comm_rank, + if (MPI_SUCCESS != (mpi_code = MPI_Comm_split(file_comm, app_topology->rank_is_ioc, mpi_rank, &sf_context->sf_group_comm))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_split failed", mpi_code); @@ -1314,11 +1804,18 @@ init_subfiling_context(subfiling_context_t *sf_context, H5FD_subfiling_shared_co H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); } -done: - if (ret_value < 0) { - H5_free_subfiling_object_int(sf_context); - } + /* Perform some final validation of subfiling configuration */ + if (sf_context->sf_stripe_size <= 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid subfiling stripe size (%" PRId64 ")", + sf_context->sf_stripe_size); + + if (sf_context->sf_num_subfiles <= 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "invalid subfiling stripe count (%d)", + sf_context->sf_num_subfiles); + HDassert(sf_context->sf_num_subfiles >= app_topology->n_io_concentrators); + +done: H5_SUBFILING_FUNC_LEAVE; } @@ -1362,37 +1859,29 @@ open_subfile_with_context(subfiling_context_t *sf_context, int file_acc_flags) herr_t ret_value = SUCCEED; HDassert(sf_context); + HDassert(sf_context->h5_file_id != UINT64_MAX); /* - * Save the HDF5 file ID (fid) to subfile context mapping. + * Save the HDF5 file ID (e.g., inode) to subfile context mapping. * There shouldn't be any issue, but check the status and * return if there was a problem. */ - if (record_fid_to_subfile(sf_context->h5_file_handle, sf_context->sf_context_id, NULL) < 0) + if (record_fid_to_subfile(sf_context->h5_file_id, sf_context->sf_context_id, NULL) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't record HDF5 file ID to subfile context mapping"); /* * If this rank is an I/O concentrator, actually open - * the subfile belonging to this IOC rank + * the subfiles belonging to this IOC rank */ if (sf_context->topology->rank_is_ioc) { - h5_stat_t st; - - /* Retrieve Inode value for HDF5 stub file */ - if (HDstat(sf_context->h5_filename, &st) < 0) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't stat HDF5 stub file"); - - HDcompile_assert(sizeof(uint64_t) >= sizeof(ino_t)); - sf_context->h5_file_id = (uint64_t)st.st_ino; - - if (ioc_open_file(sf_context->sf_context_id, file_acc_flags) < 0) + if (ioc_open_files(sf_context->sf_context_id, file_acc_flags) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, "IOC couldn't open subfile"); } done: if (ret_value < 0) { - clear_fid_map_entry(sf_context->h5_file_handle, sf_context->sf_context_id); + clear_fid_map_entry(sf_context->h5_file_id, sf_context->sf_context_id); } H5_SUBFILING_FUNC_LEAVE; @@ -1429,29 +1918,29 @@ done: *------------------------------------------------------------------------- */ static herr_t -record_fid_to_subfile(void *file_handle, int64_t subfile_context_id, int *next_index) +record_fid_to_subfile(uint64_t file_id, int64_t subfile_context_id, int *next_index) { int index; herr_t ret_value = SUCCEED; - if (sf_file_map_size == 0) { + if (!sf_open_file_map) { if (NULL == (sf_open_file_map = HDmalloc((size_t)DEFAULT_FILE_MAP_ENTRIES * sizeof(*sf_open_file_map)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate open file mapping"); sf_file_map_size = DEFAULT_FILE_MAP_ENTRIES; for (int i = 0; i < sf_file_map_size; i++) { - sf_open_file_map[i].file_handle = NULL; + sf_open_file_map[i].file_id = UINT64_MAX; sf_open_file_map[i].sf_context_id = -1; } } for (index = 0; index < sf_file_map_size; index++) { - if (sf_open_file_map[index].file_handle == file_handle) + if (sf_open_file_map[index].file_id == file_id) goto done; - if (sf_open_file_map[index].file_handle == NULL) { - sf_open_file_map[index].file_handle = file_handle; + if (sf_open_file_map[index].file_id == UINT64_MAX) { + sf_open_file_map[index].file_id = file_id; sf_open_file_map[index].sf_context_id = subfile_context_id; if (next_index) { @@ -1474,14 +1963,14 @@ record_fid_to_subfile(void *file_handle, int64_t subfile_context_id, int *next_i sf_file_map_size *= 2; for (int i = index; i < sf_file_map_size; i++) { - sf_open_file_map[i].file_handle = NULL; + sf_open_file_map[i].file_id = UINT64_MAX; } if (next_index) { *next_index = index; } - sf_open_file_map[index].file_handle = file_handle; + sf_open_file_map[index].file_id = file_id; sf_open_file_map[index++].sf_context_id = subfile_context_id; } @@ -1490,13 +1979,44 @@ done: } /*------------------------------------------------------------------------- - * Function: ioc_open_file + * Function: clear_fid_map_entry + * + * Purpose: Remove the map entry associated with the file->inode. + * This is done at file close. + * + * Return: None + * Errors: Cannot fail. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +static void +clear_fid_map_entry(uint64_t file_id, int64_t sf_context_id) +{ + if (sf_open_file_map) { + for (int i = 0; i < sf_file_map_size; i++) { + if ((sf_open_file_map[i].file_id == file_id) && + (sf_open_file_map[i].sf_context_id == sf_context_id)) { + sf_open_file_map[i].file_id = UINT64_MAX; + sf_open_file_map[i].sf_context_id = -1; + return; + } + } + } +} /* end clear_fid_map_entry() */ + +/*------------------------------------------------------------------------- + * Function: ioc_open_files * * Purpose: This function is called by an I/O concentrator in order to - * open the subfile it is responsible for. + * open the subfiles it is responsible for. * - * The name of the subfile to be opened is generated based on - * values from either: + * The names of the subfiles to be opened are generated based + * on values from either: * * - The corresponding subfiling configuration file, if one * exists and the HDF5 file isn't being truncated @@ -1504,7 +2024,7 @@ done: * subfiling configuration file doesn't exist or the HDF5 * file is being truncated * - * After the subfile has been opened, a subfiling + * After the subfiles have been opened, a subfiling * configuration file will be created if this is a file * creation operation. If the truncate flag is specified, the * subfiling configuration file will be re-created in order to @@ -1528,40 +2048,83 @@ done: *------------------------------------------------------------------------- */ static herr_t -ioc_open_file(int64_t file_context_id, int file_acc_flags) +ioc_open_files(int64_t file_context_id, int file_acc_flags) { - subfiling_context_t *sf_context = NULL; - mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; - char *filepath = NULL; - char *subfile_dir = NULL; - char *base = NULL; - int fd = -1; - herr_t ret_value = SUCCEED; + subfiling_context_t *sf_context = NULL; + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + char *filepath = NULL; + char *subfile_dir = NULL; + char *base = NULL; + int num_subfiles = 0; + int num_digits = 0; + herr_t ret_value = SUCCEED; if (NULL == (sf_context = H5_get_subfiling_object(file_context_id))) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, "couldn't get subfiling object from context ID"); - /* Only IOC ranks should be here */ + HDassert(sf_context->h5_file_id != UINT64_MAX); + HDassert(sf_context->h5_filename); + HDassert(sf_context->sf_fids); + HDassert(sf_context->sf_num_subfiles > 0); + HDassert(sf_context->sf_num_fids > 0); HDassert(sf_context->topology); - HDassert(sf_context->topology->subfile_rank >= 0); + HDassert(sf_context->topology->ioc_idx >= 0); /* Only IOC ranks should be here */ + + /* Get the basename of the full HDF5 filename */ + if (H5_basename(sf_context->h5_filename, &base) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get HDF5 file basename"); + + /* + * Get the directory prefix where subfiles will be placed. + * Under normal circumstances, the subfiles are co-located + * with the HDF5 file, but users may specify a different + * directory name. + */ + if (sf_context->subfile_prefix) { + if (NULL == (subfile_dir = H5MM_strdup(sf_context->subfile_prefix))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix"); + } + else { + if (H5_dirname(sf_context->h5_filename, &subfile_dir) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't get HDF5 file dirname"); + } - if (NULL == (filepath = HDcalloc(1, PATH_MAX))) + if (NULL == (filepath = HDmalloc(PATH_MAX))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate space for subfile filename"); - /* Generate the name of the subfile that this IOC rank will open */ - if (generate_subfile_name(sf_context, file_acc_flags, filepath, PATH_MAX, &base, &subfile_dir) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, "couldn't generate name for subfile"); + num_subfiles = sf_context->sf_num_subfiles; + num_digits = (int)(HDlog10(num_subfiles) + 1); + + /* + * For each subfile this IOC rank owns, generate the name + * of the subfile and create/open it + */ + for (int i = 0; i < sf_context->sf_num_fids; i++) { + int subfile_idx; + + /* Round-robin subfiles among the available IOCs */ + subfile_idx = (i * sf_context->topology->n_io_concentrators) + sf_context->topology->ioc_idx + 1; - if (NULL == (sf_context->sf_filename = HDstrdup(filepath))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile name"); + /* + * Generate the name of the subfile. The subfile naming should + * produce files of the following form: + * If we assume the HDF5 file is named ABC.h5, and 20 subfiles + * are used, then the subfiles will have names: + * ABC.h5.subfile_<file-number>_01_of_20, + * ABC.h5.subfile_<file-number>_02_of_20, etc. + * + * and the configuration file will be named: + * ABC.h5.subfile_<file-number>.config + */ + HDsnprintf(filepath, PATH_MAX, "%s/" H5FD_SUBFILING_FILENAME_TEMPLATE, subfile_dir, base, + sf_context->h5_file_id, num_digits, subfile_idx, num_subfiles); - /* Attempt to create/open the subfile for this IOC rank */ - if ((fd = HDopen(filepath, file_acc_flags, mode)) < 0) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "failed to open subfile"); + if ((sf_context->sf_fids[i] = HDopen(filepath, file_acc_flags, mode)) < 0) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, FAIL, "failed to open subfile"); + } - sf_context->sf_fid = fd; if (file_acc_flags & O_CREAT) sf_context->sf_eof = 0; @@ -1569,7 +2132,7 @@ ioc_open_file(int64_t file_context_id, int file_acc_flags) * If subfiles were created (rather than simply opened), * check if we also need to create a config file. */ - if ((file_acc_flags & O_CREAT) && (sf_context->topology->subfile_rank == 0)) { + if ((file_acc_flags & O_CREAT) && (sf_context->topology->ioc_idx == 0)) { if (create_config_file(sf_context, base, subfile_dir, (file_acc_flags & O_TRUNC)) < 0) H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTCREATE, FAIL, "couldn't create subfiling configuration file"); @@ -1578,12 +2141,10 @@ ioc_open_file(int64_t file_context_id, int file_acc_flags) done: if (ret_value < 0) { if (sf_context) { - HDfree(sf_context->sf_filename); - sf_context->sf_filename = NULL; - - if (sf_context->sf_fid >= 0) { - HDclose(sf_context->sf_fid); - sf_context->sf_fid = -1; + for (int i = 0; i < sf_context->sf_num_fids; i++) { + if (sf_context->sf_fids[i] >= 0 && HDclose(sf_context->sf_fids[i]) < 0) + H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "failed to close subfile"); + sf_context->sf_fids[i] = -1; } } } @@ -1595,144 +2156,6 @@ done: H5_SUBFILING_FUNC_LEAVE; } -/* - * Generate the name of the subfile this IOC rank will open, - * based on available information. - * - * This may include: - * - the subfiling configuration (from a subfiling configuration - * file if one exists, or from the subfiling context object - * otherwise) - * - the base file's name and ID (inode or similar) - * - the IOC's rank value within the set of I/O concentrators - * - an optional filename prefix specified by the user - */ -static herr_t -generate_subfile_name(subfiling_context_t *sf_context, int file_acc_flags, char *filename_out, - size_t filename_out_len, char **filename_basename_out, char **subfile_dir_out) -{ - FILE *config_file = NULL; - char *subfile_dir = NULL; - char *prefix = NULL; - char *base = NULL; - int n_io_concentrators; - int num_digits; - herr_t ret_value = SUCCEED; - - HDassert(sf_context); - HDassert(sf_context->h5_filename); - HDassert(filename_out); - HDassert(filename_basename_out); - HDassert(subfile_dir_out); - - *filename_basename_out = NULL; - *subfile_dir_out = NULL; - - /* - * Initially use the number of I/O concentrators specified in the - * subfiling context. However, if there's an existing subfiling - * configuration file (and we aren't truncating it) we will use - * the number specified there instead, as that should be the actual - * number that the subfile names were originally generated with. - * The current subfiling context may have a different number of I/O - * concentrators specified; e.g. a simple serial file open for - * reading purposes (think h5dump) might only be using 1 I/O - * concentrator, whereas the file was created with several I/O - * concentrators. - */ - n_io_concentrators = sf_context->topology->n_io_concentrators; - - if (NULL == (prefix = HDmalloc(PATH_MAX))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, - "couldn't allocate space for subfile prefix"); - - /* Under normal operation, we co-locate subfiles with the HDF5 file */ - HDstrncpy(prefix, sf_context->h5_filename, PATH_MAX - 1); - prefix[PATH_MAX - 1] = '\0'; - - if (H5_basename(prefix, &base) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get subfile basename"); - - if (sf_context->subfile_prefix) { - /* Note: Users may specify a directory name which is inaccessible - * from where the current is running. In particular, "node-local" - * storage is not uniformly available to all processes. - * We would like to check if the user pathname unavailable and - * if so, we could default to creating the subfiles in the - * current directory. (?) - */ - if (NULL == (subfile_dir = H5MM_strdup(sf_context->subfile_prefix))) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't copy subfile prefix"); - } - else { - if (H5_dirname(prefix, &subfile_dir) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't get subfile prefix"); - } - - /* - * Open the file's subfiling configuration file, if it exists and - * we aren't truncating the file. - */ - if (0 == (file_acc_flags & O_TRUNC)) { - if (open_config_file(sf_context, base, subfile_dir, "r", &config_file) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, FAIL, - "couldn't open existing subfiling configuration file"); - } - - /* - * If a subfiling configuration file exists and we aren't truncating - * it, read the number of I/O concentrators used at file creation time - * in order to generate the correct subfile names. - */ - if (config_file) { - if (H5_get_num_iocs_from_config_file(config_file, &n_io_concentrators) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, - "couldn't read from subfiling configuration file"); - } - - /* - * Generate the name of the subfile. The subfile naming should - * produce files of the following form: - * If we assume the HDF5 file is named ABC.h5, and 20 I/O - * concentrators are used, then the subfiles will have names: - * ABC.h5.subfile_<file-number>_01_of_20, - * ABC.h5.subfile_<file-number>_02_of_20, etc. - * - * and the configuration file will be named: - * ABC.h5.subfile_<file-number>.config - */ - num_digits = (int)(HDlog10(n_io_concentrators) + 1); - HDsnprintf(filename_out, filename_out_len, "%s/%s" H5FD_SUBFILING_FILENAME_TEMPLATE, subfile_dir, base, - sf_context->h5_file_id, num_digits, sf_context->topology->subfile_rank + 1, - n_io_concentrators); - - *filename_basename_out = base; - *subfile_dir_out = subfile_dir; - -done: - if (config_file && (EOF == HDfclose(config_file))) - H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, - "couldn't close subfiling configuration file"); - - if (ret_value < 0) { - H5MM_free(subfile_dir); - H5MM_free(base); - - if (*filename_basename_out) { - H5MM_free(*filename_basename_out); - *filename_basename_out = NULL; - } - if (*subfile_dir_out) { - H5MM_free(*subfile_dir_out); - *subfile_dir_out = NULL; - } - } - - HDfree(prefix); - - H5_SUBFILING_FUNC_LEAVE; -} - /*------------------------------------------------------------------------- * Function: create_config_file * @@ -1742,6 +2165,7 @@ done: * * - the stripe size for the file's subfiles * - the number of I/O concentrators used for I/O to the file's subfiles + * - the number of subfiles the logical HDF5 file consists of * - the base HDF5 filename * - the optional directory prefix where the file's subfiles are placed * - the names of each of the file's subfiles @@ -1777,7 +2201,7 @@ create_config_file(subfiling_context_t *sf_context, const char *base_filename, c H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate space for subfiling configuration filename"); - HDsnprintf(config_filename, PATH_MAX, "%s/%s" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir, + HDsnprintf(config_filename, PATH_MAX, "%s/" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir, base_filename, sf_context->h5_file_id); /* Determine whether a subfiling configuration file exists */ @@ -1796,9 +2220,8 @@ create_config_file(subfiling_context_t *sf_context, const char *base_filename, c * O_TRUNC flag was specified. In this case, truncate * the existing config file and create a new one. */ - /* TODO: if truncating, consider removing old stale config files. */ if (!config_file_exists || truncate_if_exists) { - int n_io_concentrators = sf_context->topology->n_io_concentrators; + int n_subfiles = sf_context->sf_num_subfiles; int num_digits; if (NULL == (config_file = HDfopen(config_filename, "w+"))) @@ -1816,7 +2239,13 @@ create_config_file(subfiling_context_t *sf_context, const char *base_filename, c "failed to write to subfiling configuration file"); /* Write the number of I/O concentrators to the configuration file */ - HDsnprintf(line_buf, PATH_MAX, "aggregator_count=%d\n", n_io_concentrators); + HDsnprintf(line_buf, PATH_MAX, "aggregator_count=%d\n", sf_context->topology->n_io_concentrators); + if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, + "failed to write to subfiling configuration file"); + + /* Write the number of subfiles to the configuration file */ + HDsnprintf(line_buf, PATH_MAX, "subfile_count=%d\n", n_subfiles); if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1) H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, "failed to write to subfiling configuration file"); @@ -1834,10 +2263,10 @@ create_config_file(subfiling_context_t *sf_context, const char *base_filename, c "failed to write to subfiling configuration file"); /* Write out each subfile name to the configuration file */ - num_digits = (int)(HDlog10(n_io_concentrators) + 1); - for (int k = 0; k < n_io_concentrators; k++) { - HDsnprintf(line_buf, PATH_MAX, "%s" H5FD_SUBFILING_FILENAME_TEMPLATE "\n", base_filename, - sf_context->h5_file_id, num_digits, k + 1, n_io_concentrators); + num_digits = (int)(HDlog10(n_subfiles) + 1); + for (int k = 0; k < n_subfiles; k++) { + HDsnprintf(line_buf, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE "\n", base_filename, + sf_context->h5_file_id, num_digits, k + 1, n_subfiles); if (HDfwrite(line_buf, HDstrlen(line_buf), 1, config_file) != 1) H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_WRITEERROR, FAIL, @@ -1873,8 +2302,8 @@ done: *------------------------------------------------------------------------- */ static herr_t -open_config_file(subfiling_context_t *sf_context, const char *base_filename, const char *subfile_dir, - const char *mode, FILE **config_file_out) +open_config_file(const char *base_filename, const char *subfile_dir, uint64_t file_id, const char *mode, + FILE **config_file_out) { hbool_t config_file_exists = FALSE; FILE *config_file = NULL; @@ -1882,17 +2311,14 @@ open_config_file(subfiling_context_t *sf_context, const char *base_filename, con int ret = 0; herr_t ret_value = SUCCEED; - HDassert(sf_context); HDassert(base_filename); HDassert(subfile_dir); + HDassert(file_id != UINT64_MAX); HDassert(mode); HDassert(config_file_out); *config_file_out = NULL; - if (sf_context->h5_file_id == UINT64_MAX) - H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid HDF5 file ID %" PRIu64, - sf_context->h5_file_id); if (*base_filename == '\0') H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "invalid base HDF5 filename '%s'", base_filename); @@ -1903,8 +2329,8 @@ open_config_file(subfiling_context_t *sf_context, const char *base_filename, con H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate space for subfiling configuration filename"); - HDsnprintf(config_filename, PATH_MAX, "%s/%s" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir, - base_filename, sf_context->h5_file_id); + HDsnprintf(config_filename, PATH_MAX, "%s/" H5FD_SUBFILING_CONFIG_FILENAME_TEMPLATE, subfile_dir, + base_filename, file_id); /* Determine whether a subfiling configuration file exists */ errno = 0; @@ -1938,26 +2364,26 @@ done: } /*------------------------------------------------------------------------- - * Function: H5_get_num_iocs_from_config_file + * Function: H5_get_subfiling_config_from_file * - * Purpose: Reads a Subfiling configuration file to get the number of - * I/O concentrators used for the logical HDF5 file. + * Purpose: Reads a Subfiling configuration file to get the stripe size + * and number of subfiles used for the logical HDF5 file. * * Return: Non-negative on success/Negative on failure * *------------------------------------------------------------------------- */ herr_t -H5_get_num_iocs_from_config_file(FILE *config_file, int *n_io_concentrators) +H5_get_subfiling_config_from_file(FILE *config_file, int64_t *stripe_size, int64_t *num_subfiles) { - char *config_buf = NULL; - char *ioc_substr = NULL; - long config_file_len = 0; - int read_n_io_concs = 0; - herr_t ret_value = SUCCEED; + int64_t read_stripe_size = 0; + int64_t read_num_subfiles = 0; + char *config_buf = NULL; + char *substr = NULL; + long config_file_len = 0; + herr_t ret_value = SUCCEED; HDassert(config_file); - HDassert(n_io_concentrators); if (HDfseek(config_file, 0, SEEK_END) < 0) H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SEEKERROR, FAIL, @@ -1981,22 +2407,40 @@ H5_get_num_iocs_from_config_file(FILE *config_file, int *n_io_concentrators) config_buf[config_file_len] = '\0'; - if (NULL == (ioc_substr = HDstrstr(config_buf, "aggregator_count"))) - H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, - "malformed subfiling configuration file - no aggregator count entry"); + if (stripe_size) { + if (NULL == (substr = HDstrstr(config_buf, "stripe_size"))) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, + "malformed subfiling configuration file - no stripe size entry"); + + if (EOF == HDsscanf(substr, "stripe_size=%" PRId64, &read_stripe_size)) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, + "couldn't get stripe size from subfiling configuration file"); - if (EOF == HDsscanf(ioc_substr, "aggregator_count=%d", &read_n_io_concs)) - H5_SUBFILING_SYS_GOTO_ERROR( - H5E_FILE, H5E_CANTGET, FAIL, - "couldn't get number of I/O concentrators from subfiling configuration file"); + if (read_stripe_size <= 0) + H5_SUBFILING_GOTO_ERROR( + H5E_FILE, H5E_BADVALUE, FAIL, + "invalid stripe size (%" PRId64 ") read from subfiling configuration file", read_stripe_size); + + *stripe_size = read_stripe_size; + } - if (read_n_io_concs <= 0) - H5_SUBFILING_GOTO_ERROR( - H5E_FILE, H5E_BADVALUE, FAIL, - "invalid number of I/O concentrators (%d) read from subfiling configuration file", - read_n_io_concs); + if (num_subfiles) { + if (NULL == (substr = HDstrstr(config_buf, "subfile_count"))) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, + "malformed subfiling configuration file - no subfile count entry"); - *n_io_concentrators = read_n_io_concs; + if (EOF == HDsscanf(substr, "subfile_count=%" PRId64, &read_num_subfiles)) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, + "couldn't get number of subfiles from subfiling configuration file"); + + if (read_num_subfiles <= 0) + H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, + "invalid number of subfiles (%" PRId64 + ") read from subfiling configuration file", + read_num_subfiles); + + *num_subfiles = read_num_subfiles; + } done: HDfree(config_buf); @@ -2005,6 +2449,135 @@ done: } /*------------------------------------------------------------------------- + * Function: H5_resolve_pathname + * + * Purpose: Simple wrapper routine around realpath(3) to fully resolve + * a given filepath. Collective across the specified MPI + * communicator in order to minimize file system contention + * between MPI ranks. + * + * The resolved filepath returned through `resolved_filepath` + * must be freed by the caller with HDfree. + * + * Return Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_resolve_pathname(const char *filepath, MPI_Comm comm, char **resolved_filepath) +{ + hsize_t path_len = HSIZE_UNDEF; + hbool_t bcasted_path_len = FALSE; + hbool_t bcasted_path = FALSE; + char *resolved_path = NULL; + char *file_basename = NULL; + char *file_dirname = NULL; + char *cwd = NULL; + int mpi_rank; + int mpi_size; + int mpi_code; + herr_t ret_value = SUCCEED; + + HDassert(filepath); + HDassert(resolved_filepath); + + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + + if (mpi_rank == 0) { + if (NULL == (resolved_path = HDrealpath(filepath, NULL))) { + if (ENOENT == errno) { + if (H5_dirname(filepath, &file_dirname) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get file dirname"); + + /* If filepath is just the filename, set up path using CWD */ + if (!HDstrcmp(file_dirname, ".")) { + if (NULL == (resolved_path = HDmalloc(PATH_MAX))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "can't allocate buffer for filepath"); + if (H5_basename(filepath, &file_basename) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get file basename"); + if (NULL == (cwd = HDmalloc(PATH_MAX))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "can't allocate buffer for CWD"); + + if (NULL == HDgetcwd(cwd, PATH_MAX)) + H5_SUBFILING_GOTO_ERROR( + H5E_VFL, H5E_CANTGET, FAIL, + "can't get current working directory, errno = %d, error message = '%s'", errno, + HDstrerror(errno)); + + HDsnprintf(resolved_path, PATH_MAX, "%s/%s", cwd, file_basename); + } + else { + /* Otherwise, just use what was given as the pathname */ + if (NULL == (resolved_path = HDstrdup(filepath))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't copy filename"); + } + } + else + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "can't resolve subfile path, errno = %d, error message = '%s'", errno, + HDstrerror(errno)); + } + + if (resolved_path) { + H5_CHECKED_ASSIGN(path_len, hsize_t, (HDstrlen(resolved_path) + 1), size_t); + } + else + path_len = HSIZE_UNDEF; + } + + /* Broadcast the size of the resolved filepath string to other ranks */ + bcasted_path_len = TRUE; + if (mpi_size > 1) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&path_len, 1, HSIZE_AS_MPI_TYPE, 0, comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + + if (path_len == HSIZE_UNDEF) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't resolve filepath"); + + if (mpi_rank != 0) { + if (NULL == (resolved_path = HDmalloc(path_len))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate file name buffer"); + } + + /* Broadcast the resolved filepath to other ranks */ + bcasted_path = TRUE; + if (mpi_size > 1) { + H5_CHECK_OVERFLOW(path_len, hsize_t, int); + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(resolved_path, (int)path_len, MPI_CHAR, 0, comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + + *resolved_filepath = resolved_path; + +done: + HDfree(cwd); + H5MM_free(file_basename); + H5MM_free(file_dirname); + + if (ret_value < 0) { + if (!bcasted_path_len) { + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&path_len, 1, HSIZE_AS_MPI_TYPE, 0, comm))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + if (!bcasted_path && (path_len != HSIZE_UNDEF)) { + H5_CHECK_OVERFLOW(path_len, hsize_t, int); + if (MPI_SUCCESS != (mpi_code = MPI_Bcast(resolved_path, (int)path_len, MPI_CHAR, 0, comm))) + H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + } + + HDfree(resolved_path); + } + + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- * Function: H5_close_subfiles * * Purpose: This is a simple wrapper function for the internal version @@ -2046,35 +2619,39 @@ done: *------------------------------------------------------------------------- */ herr_t -H5_close_subfiles(int64_t subfiling_context_id) +H5_close_subfiles(int64_t subfiling_context_id, MPI_Comm file_comm) { subfiling_context_t *sf_context = NULL; MPI_Request barrier_req = MPI_REQUEST_NULL; + int mpi_size; int mpi_code; herr_t ret_value = SUCCEED; if (NULL == (sf_context = H5_get_subfiling_object(subfiling_context_id))) H5_SUBFILING_GOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "couldn't get subfiling object from context ID"); - /* We make the subfile close operation collective. - * Otherwise, there may be a race condition between - * our closing the subfiles and the user application - * moving ahead and possibly re-opening a file. - * - * If we can, we utilize an async barrier which gives - * us the opportunity to reduce the CPU load due to - * MPI spinning while waiting for the barrier to - * complete. This is especially important if there - * is heavy thread utilization due to subfiling - * activities, i.e. the thread pool might be - * extremely busy servicing I/O requests from all - * HDF5 application ranks. - */ -#if MPI_VERSION > 3 || (MPI_VERSION == 3 && MPI_SUBVERSION >= 1) - { + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(file_comm, &mpi_size))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Comm_size failed", mpi_code); + + /* We make the subfile close operation collective. + * Otherwise, there may be a race condition between + * our closing the subfiles and the user application + * moving ahead and possibly re-opening a file. + * + * If we can, we utilize an async barrier which gives + * us the opportunity to reduce the CPU load due to + * MPI spinning while waiting for the barrier to + * complete. This is especially important if there + * is heavy thread utilization due to subfiling + * activities, i.e. the thread pool might be + * extremely busy servicing I/O requests from all + * HDF5 application ranks. + */ + if (mpi_size > 1) { +#if H5_CHECK_MPI_VERSION(3, 1) int barrier_complete = 0; - if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(sf_context->sf_barrier_comm, &barrier_req))) + if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(file_comm, &barrier_req))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Ibarrier failed", mpi_code); while (!barrier_complete) { @@ -2084,24 +2661,25 @@ H5_close_subfiles(int64_t subfiling_context_id) if (MPI_SUCCESS != (mpi_code = MPI_Test(&barrier_req, &barrier_complete, MPI_STATUS_IGNORE))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Test failed", mpi_code); } - } #else - if (MPI_SUCCESS != (mpi_code = MPI_Barrier(sf_context->sf_barrier_comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); #endif + } /* The map from file handle to subfiling context can now be cleared */ - if (sf_context->h5_file_handle != NULL) { - clear_fid_map_entry(sf_context->h5_file_handle, sf_context->sf_context_id); + if (sf_context->h5_file_id != UINT64_MAX) { + clear_fid_map_entry(sf_context->h5_file_id, sf_context->sf_context_id); } if (sf_context->topology->rank_is_ioc) { - if (sf_context->sf_fid >= 0) { - errno = 0; - if (HDclose(sf_context->sf_fid) < 0) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "couldn't close subfile"); - - sf_context->sf_fid = -1; + if (sf_context->sf_fids) { + for (int i = 0; i < sf_context->sf_num_fids; i++) { + errno = 0; + if (sf_context->sf_fids[i] >= 0 && HDclose(sf_context->sf_fids[i]) < 0) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_CANTCLOSEFILE, FAIL, "couldn't close subfile"); + sf_context->sf_fids[i] = -1; + } } } @@ -2110,11 +2688,11 @@ H5_close_subfiles(int64_t subfiling_context_id) * and opening another file before this file is completely closed * down. */ -#if MPI_VERSION > 3 || (MPI_VERSION == 3 && MPI_SUBVERSION >= 1) - { + if (mpi_size > 1) { +#if H5_CHECK_MPI_VERSION(3, 1) int barrier_complete = 0; - if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(sf_context->sf_barrier_comm, &barrier_req))) + if (MPI_SUCCESS != (mpi_code = MPI_Ibarrier(file_comm, &barrier_req))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Ibarrier failed", mpi_code); while (!barrier_complete) { @@ -2124,24 +2702,213 @@ H5_close_subfiles(int64_t subfiling_context_id) if (MPI_SUCCESS != (mpi_code = MPI_Test(&barrier_req, &barrier_complete, MPI_STATUS_IGNORE))) H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Test failed", mpi_code); } - } #else - if (MPI_SUCCESS != (mpi_code = MPI_Barrier(sf_context->sf_barrier_comm))) - H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); +#endif + } + +#ifdef H5_SUBFILING_DEBUG + if (sf_context->sf_logfile) { + struct tm *tm = NULL; + time_t cur_time; + + cur_time = time(NULL); + tm = localtime(&cur_time); + + H5_subfiling_log(sf_context->sf_context_id, "\n-- LOGGING FINISH - %s", asctime(tm)); + + HDfclose(sf_context->sf_logfile); + sf_context->sf_logfile = NULL; + } #endif done: - if (sf_context && H5_free_subfiling_object_int(sf_context) < 0) - H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTFREE, FAIL, "couldn't free subfiling context object"); + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: H5_subfiling_set_config_prop + * + * Purpose: Sets the specified Subfiling VFD configuration as a + * property on the given FAPL pointer. The Subfiling VFD uses + * this property to pass its configuration down to the IOC VFD + * without needing each IOC VFD to include it as part of its + * public configuration. + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_subfiling_set_config_prop(H5P_genplist_t *plist_ptr, const H5FD_subfiling_params_t *vfd_config) +{ + htri_t prop_exists = FAIL; + herr_t ret_value = SUCCEED; + + if (!plist_ptr) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer"); + if (!vfd_config) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling configuration pointer"); + + if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_CONFIG_PROP)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, + "can't check if subfiling configuration property exists in FAPL"); + + if (prop_exists) { + if (H5P_set(plist_ptr, H5FD_SUBFILING_CONFIG_PROP, vfd_config) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, + "can't set subfiling configuration property on FAPL"); + } + else { + union { + const void *const_ptr_to_data; + void *ptr_to_data; + } eliminate_const_warning; + + /* + * Cast away const since H5P_insert doesn't match the signature + * for "value" as H5P_set + */ + eliminate_const_warning.const_ptr_to_data = vfd_config; + + if (H5P_insert(plist_ptr, H5FD_SUBFILING_CONFIG_PROP, sizeof(H5FD_subfiling_params_t), + eliminate_const_warning.ptr_to_data, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL, + "unable to register subfiling configuration property in FAPL"); + } + +done: + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: H5_subfiling_get_config_prop + * + * Purpose: Retrieves the Subfiling VFD configuration from the given + * FAPL pointer. The Subfiling VFD uses this property to pass + * its configuration down to the IOC VFD without needing each + * IOC VFD to include it as part of its public configuration. + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_subfiling_get_config_prop(H5P_genplist_t *plist_ptr, H5FD_subfiling_params_t *vfd_config) +{ + htri_t prop_exists = FAIL; + herr_t ret_value = SUCCEED; + + if (!plist_ptr) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer"); + if (!vfd_config) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling configuration pointer"); + + if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_CONFIG_PROP)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, + "can't check if subfiling configuration property exists in FAPL"); + + if (prop_exists) { + if (H5P_get(plist_ptr, H5FD_SUBFILING_CONFIG_PROP, vfd_config) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, + "can't get subfiling configuration property from FAPL"); + } + else { + vfd_config->ioc_selection = SELECT_IOC_ONE_PER_NODE; + vfd_config->stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; + vfd_config->stripe_count = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT; + } + +done: + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: H5_subfiling_set_file_id_prop + * + * Purpose: Sets the specified file ID (Inode) value as a property on + * the given FAPL pointer. The Subfiling VFD uses this + * property to pass the HDF5 stub file ID value down to the + * IOC VFD. + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_subfiling_set_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t file_id) +{ + htri_t prop_exists = FAIL; + herr_t ret_value = SUCCEED; + + if (!plist_ptr) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer"); + if (file_id == UINT64_MAX) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid file ID value"); + + if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, + "can't check if file ID property exists in FAPL"); + + if (prop_exists) { + if (H5P_set(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID, &file_id) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set file ID property on FAPL"); + } + else { + if (H5P_insert(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID, sizeof(uint64_t), &file_id, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTREGISTER, FAIL, + "unable to register file ID property in FAPL"); + } + +done: + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: H5_subfiling_get_file_id_prop + * + * Purpose: Retrieves the file ID (Inode) value from the given FAPL + * pointer. The Subfiling VFD uses this property to pass the + * HDF5 stub file ID value down to the IOC VFD. + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_subfiling_get_file_id_prop(H5P_genplist_t *plist_ptr, uint64_t *file_id) +{ + htri_t prop_exists = FAIL; + herr_t ret_value = SUCCEED; + + if (!plist_ptr) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL FAPL pointer"); + if (!file_id) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL file ID pointer"); + if ((prop_exists = H5P_exist_plist(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID)) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, + "can't check if file ID property exists in FAPL"); + + if (prop_exists) { + if (H5P_get(plist_ptr, H5FD_SUBFILING_STUB_FILE_ID, file_id) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get file ID property from FAPL"); + } + else + *file_id = UINT64_MAX; + +done: H5_SUBFILING_FUNC_LEAVE; } /*------------------------------------------------------------------------- - * Function: H5_subfile_fhandle_to_context + * Function: H5_subfile_fid_to_context * * Purpose: This is a basic lookup function which returns the subfiling - * context id associated with the specified file handle. + * context id associated with the specified file ID. * * Return: Non-negative subfiling context ID if the context exists * Negative on failure or if the subfiling context doesn't @@ -2155,7 +2922,7 @@ done: *------------------------------------------------------------------------- */ int64_t -H5_subfile_fhandle_to_context(void *file_handle) +H5_subfile_fid_to_context(uint64_t file_id) { int64_t ret_value = -1; @@ -2163,14 +2930,107 @@ H5_subfile_fhandle_to_context(void *file_handle) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, -1, "open file map is NULL"); for (int i = 0; i < sf_file_map_size; i++) { - if (sf_open_file_map[i].file_handle == file_handle) { + if (sf_open_file_map[i].file_id == file_id) { return sf_open_file_map[i].sf_context_id; } } done: H5_SUBFILING_FUNC_LEAVE; -} /* end H5_subfile_fhandle_to_context() */ +} /* end H5_subfile_fid_to_context() */ + +/*------------------------------------------------------------------------- + * Function: H5_subfiling_validate_config + * + * Purpose: Checks that the given subfiling configuration parameters + * are valid + * + * Return: Non-negative on success/Negative on failure + *------------------------------------------------------------------------- + */ +herr_t +H5_subfiling_validate_config(const H5FD_subfiling_params_t *subf_config) +{ + H5FD_subfiling_ioc_select_t ioc_sel_type; + herr_t ret_value = SUCCEED; + + if (!subf_config) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "NULL subfiling configuration pointer"); + + /* + * Compare against each IOC selection value directly since + * the enum might be a signed or unsigned type and a comparison + * against < 0 could generate a warning + */ + ioc_sel_type = subf_config->ioc_selection; + if (ioc_sel_type != SELECT_IOC_ONE_PER_NODE && ioc_sel_type != SELECT_IOC_EVERY_NTH_RANK && + ioc_sel_type != SELECT_IOC_WITH_CONFIG && ioc_sel_type != SELECT_IOC_TOTAL) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid IOC selection method"); + + if (subf_config->stripe_size <= 0) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid stripe size"); + + if (subf_config->stripe_count <= 0 && subf_config->stripe_count != H5FD_SUBFILING_DEFAULT_STRIPE_COUNT) + H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid stripe count"); + +done: + H5_SUBFILING_FUNC_LEAVE; +} + +/*------------------------------------------------------------------------- + * Function: H5_subfiling_terminate + * + * Purpose: A cleanup routine to be called by the Subfiling VFD when + * it is terminating. Cleans up internal resources such as the + * context and topology caches. + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +herr_t +H5_subfiling_terminate(void) +{ + herr_t ret_value = SUCCEED; + + /* Clean up subfiling context and topology caches */ + if (sf_context_cache) { + for (size_t i = 0; i < sf_context_cache_num_entries; i++) { + if (H5_free_subfiling_object_int(sf_context_cache[i]) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, + "couldn't free subfiling context object"); + sf_context_cache[i] = NULL; + } + + sf_context_cache_size = 0; + sf_context_cache_num_entries = 0; + + HDfree(sf_context_cache); + sf_context_cache = NULL; + } + if (sf_topology_cache) { + for (size_t i = 0; i < sf_topology_cache_num_entries; i++) { + if (H5_free_subfiling_topology(sf_topology_cache[i]) < 0) + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, + "couldn't free subfiling topology object"); + sf_topology_cache[i] = NULL; + } + + sf_topology_cache_size = 0; + sf_topology_cache_num_entries = 0; + + HDfree(sf_topology_cache); + sf_topology_cache = NULL; + } + + /* Clean up the file ID to context object mapping */ + sf_file_map_size = 0; + HDfree(sf_open_file_map); + sf_open_file_map = NULL; + +done: + H5_SUBFILING_FUNC_LEAVE; +} #ifdef H5_SUBFILING_DEBUG void |