author    jhendersonHDF <jhenderson@hdfgroup.org>   2023-09-07 01:47:29 (GMT)
committer GitHub <noreply@github.com>               2023-09-07 01:47:29 (GMT)
commit    fcf2e6542483cb8a07bbf9134ac9c2b849c38562 (patch)
tree      5b206bcda79ab7881f27ced82af805bf940c4535
parent    60040c4791cda2c4bee721393a94447a522a754a (diff)
Fix Subfiling VFD IOC assignment bug (#3456) (#3518)
-rw-r--r--   release_docs/RELEASE.txt                 13
-rw-r--r--   src/H5FDsubfiling/H5FDsubfiling_priv.h    5
-rw-r--r--   src/H5FDsubfiling/H5subfiling_common.c   66
-rw-r--r--   src/H5FDsubfiling/H5subfiling_common.h   19
4 files changed, 84 insertions, 19 deletions
diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt
index ba0edba..0c35f11 100644
--- a/release_docs/RELEASE.txt
+++ b/release_docs/RELEASE.txt
@@ -109,6 +109,19 @@ Bug Fixes since HDF5-1.14.2 release
===================================
Library
-------
+ - Fixed a bug in the way the Subfiling VFD assigns I/O concentrators
+
+ During a file open operation, the Subfiling VFD determines the topology
+ of the application and uses it to select a subset of MPI ranks that
+ I/O will be forwarded to, called I/O concentrators. The code for this
+ had previously assumed that the parallel job launcher application
+ (e.g., mpirun or srun) would distribute MPI ranks sequentially,
+ filling all the processors on one node before assigning ranks on the
+ next node. When the launcher application instead mapped MPI ranks to
+ nodes in a different fashion, such as round-robin, the Subfiling VFD
+ could incorrectly identify MPI ranks as I/O concentrators, leading
+ to missing subfiles.
+
- Fixed a file space allocation bug in the parallel library for chunked
datasets
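
[Editor's note] To see why the assumption in the release note matters, here is a minimal standalone C sketch, not from the patch, contrasting the two placement policies; the node and rank counts are invented for illustration. With block placement a node's ranks are contiguous in rank order; round-robin interleaves them across nodes.

    #include <stdio.h>

    /* Print rank -> node maps for 6 ranks across 2 nodes under the two
     * placement policies. With block placement the old code's assumption
     * holds; with round-robin placement it does not. */
    int main(void)
    {
        int world_size = 6, nodes = 2, ppn = 3; /* ppn = processes per node */

        for (int rank = 0; rank < world_size; rank++) {
            int block_node  = rank / ppn;   /* fill node 0, then node 1 */
            int rrobin_node = rank % nodes; /* alternate between nodes  */
            printf("rank %d: block -> node %d, round-robin -> node %d\n",
                   rank, block_node, rrobin_node);
        }

        return 0;
    }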
diff --git a/src/H5FDsubfiling/H5FDsubfiling_priv.h b/src/H5FDsubfiling/H5FDsubfiling_priv.h
index 96a7322..08fef7d 100644
--- a/src/H5FDsubfiling/H5FDsubfiling_priv.h
+++ b/src/H5FDsubfiling/H5FDsubfiling_priv.h
@@ -40,11 +40,6 @@
#include "H5subfiling_common.h"
#include "H5subfiling_err.h"
-/*
- * Some definitions for debugging the Subfiling VFD
- */
-/* #define H5FD_SUBFILING_DEBUG */
-
#define DRIVER_INFO_MESSAGE_MAX_INFO 65536
#define DRIVER_INFO_MESSAGE_MAX_LENGTH 65552 /* MAX_INFO + sizeof(info_header_t) */
diff --git a/src/H5FDsubfiling/H5subfiling_common.c b/src/H5FDsubfiling/H5subfiling_common.c
index 63791c1..34aaa93 100644
--- a/src/H5FDsubfiling/H5subfiling_common.c
+++ b/src/H5FDsubfiling/H5subfiling_common.c
@@ -55,8 +55,8 @@ static herr_t H5_free_subfiling_topology(sf_topology_t *topology);
static herr_t init_subfiling(const char *base_filename, uint64_t file_id,
H5FD_subfiling_params_t *subfiling_config, int file_acc_flags, MPI_Comm comm,
int64_t *context_id_out);
-static herr_t init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm,
- sf_topology_t **app_topology_out);
+static herr_t init_app_topology(int64_t sf_context_id, H5FD_subfiling_params_t *subfiling_config,
+ MPI_Comm comm, MPI_Comm node_comm, sf_topology_t **app_topology_out);
static herr_t get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type,
char **ioc_sel_info_str);
static herr_t find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config,
@@ -64,7 +64,7 @@ static herr_t find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *
static herr_t init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm);
static herr_t gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm);
static int compare_layout_nodelocal(const void *layout1, const void *layout2);
-static herr_t identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride);
+static herr_t identify_ioc_ranks(int64_t sf_context_id, sf_topology_t *app_topology, int rank_stride);
static herr_t init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename,
uint64_t file_id, H5FD_subfiling_params_t *subfiling_config,
sf_topology_t *app_topology, MPI_Comm file_comm);
@@ -886,7 +886,7 @@ init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_param
* Setup the application topology information, including the computed
* number and distribution map of the set of I/O concentrators
*/
- if (init_app_topology(subfiling_config, comm, node_comm, &app_topology) < 0)
+ if (init_app_topology(context_id, subfiling_config, comm, node_comm, &app_topology) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application topology");
new_context->sf_context_id = context_id;
@@ -938,8 +938,8 @@ done:
*-------------------------------------------------------------------------
*/
static herr_t
-init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm,
- sf_topology_t **app_topology_out)
+init_app_topology(int64_t sf_context_id, H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm,
+ MPI_Comm node_comm, sf_topology_t **app_topology_out)
{
H5FD_subfiling_ioc_select_t ioc_selection_type;
sf_topology_t *app_topology = NULL;
@@ -1142,7 +1142,7 @@ init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_
* Determine which ranks are I/O concentrator ranks, based on the
* given IOC selection strategy and MPI information.
*/
- if (identify_ioc_ranks(app_topology, rank_multiple) < 0)
+ if (identify_ioc_ranks(sf_context_id, app_topology, rank_multiple) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL,
"couldn't determine which MPI ranks are I/O concentrators");
}
@@ -1600,7 +1600,7 @@ compare_layout_nodelocal(const void *layout1, const void *layout2)
*-------------------------------------------------------------------------
*/
static herr_t
-identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
+identify_ioc_ranks(int64_t sf_context_id, sf_topology_t *app_topology, int rank_stride)
{
app_layout_t *app_layout = NULL;
int *io_concentrators = NULL;
@@ -1613,6 +1613,11 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
assert(app_topology->app_layout);
assert(app_topology->app_layout->layout);
assert(app_topology->app_layout->node_count > 0);
+ assert(app_topology->app_layout->node_count <= app_topology->app_layout->world_size);
+
+#ifndef H5_SUBFILING_DEBUG
+ (void)sf_context_id;
+#endif
app_layout = app_topology->app_layout;
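
[Editor's note] The #ifndef/(void) pair added above is the usual C idiom for a parameter that is only consumed by conditionally compiled logging. A small self-contained sketch of the pattern, using a hypothetical DEBUG macro in place of H5_SUBFILING_DEBUG:

    #include <stdio.h>

    /* 'id' is only used by the logging call under the DEBUG guard;
     * the (void) cast keeps -Wunused-parameter quiet otherwise. */
    static void do_work(long id)
    {
    #ifndef DEBUG
        (void)id;
    #endif

    #ifdef DEBUG
        printf("working on context %ld\n", id);
    #endif
        /* ... real work that does not need 'id' ... */
    }

    int main(void)
    {
        do_work(42);
        return 0;
    }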
@@ -1629,36 +1634,52 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
case SELECT_IOC_ONE_PER_NODE: {
int total_ioc_count = 0;
int iocs_per_node = 1;
+ int last_lead_rank;
if (app_topology->n_io_concentrators > app_layout->node_count)
iocs_per_node = app_topology->n_io_concentrators / app_layout->node_count;
- assert(app_layout->node_ranks);
+ /*
+ * NOTE: The below code assumes that the app_layout->layout
+ * array was sorted according to the node_lead_rank field,
+ * such that entries for MPI ranks on the same node will occur
+ * together in the array.
+ */
- for (size_t i = 0; i < (size_t)app_layout->node_count; i++) {
- int node_index = app_layout->node_ranks[i];
- int local_size = app_layout->layout[node_index].node_local_size;
+ last_lead_rank = app_layout->layout[0].node_lead_rank;
+ for (size_t i = 0, layout_idx = 0; i < (size_t)app_layout->node_count; i++) {
+ int local_size = app_layout->layout[layout_idx].node_local_size;
+ /* Assign first I/O concentrator from this node */
assert(total_ioc_count < app_topology->n_io_concentrators);
- io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank;
+ io_concentrators[total_ioc_count] = app_layout->layout[layout_idx++].rank;
if (app_layout->world_rank == io_concentrators[total_ioc_count]) {
+ assert(!app_topology->rank_is_ioc);
+
app_topology->ioc_idx = total_ioc_count;
app_topology->rank_is_ioc = TRUE;
}
total_ioc_count++;
+ /* Assign any additional I/O concentrators from this node */
for (size_t j = 1; j < (size_t)iocs_per_node; j++) {
if (total_ioc_count >= max_iocs)
break;
if (j >= (size_t)local_size)
break;
+ /* Sanity check to make sure this rank is on the same node */
+ assert(app_layout->layout[layout_idx].node_lead_rank ==
+ app_layout->layout[layout_idx - 1].node_lead_rank);
+
assert(total_ioc_count < app_topology->n_io_concentrators);
- io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank;
+ io_concentrators[total_ioc_count] = app_layout->layout[layout_idx++].rank;
if (app_layout->world_rank == io_concentrators[total_ioc_count]) {
+ assert(!app_topology->rank_is_ioc);
+
app_topology->ioc_idx = total_ioc_count;
app_topology->rank_is_ioc = TRUE;
}
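
[Editor's note] The NOTE in this hunk states the invariant the new loop depends on: entries sharing a node_lead_rank must be contiguous. The body of compare_layout_nodelocal is not shown in this diff, so the following is only a plausible sketch of an ordering that would establish that invariant; the tie-break on node_local_rank is an assumption.

    #include <stdio.h>
    #include <stdlib.h>

    /* Simplified stand-in for the library's per-rank layout entry */
    typedef struct {
        int rank;            /* world rank                    */
        int node_local_rank; /* rank within the node          */
        int node_lead_rank;  /* lowest world rank on the node */
    } layout_t;

    /* Group entries by node first, then by node-local rank, so that
     * all entries for one node end up contiguous in the array. */
    static int compare_layout(const void *a, const void *b)
    {
        const layout_t *l1 = (const layout_t *)a;
        const layout_t *l2 = (const layout_t *)b;

        if (l1->node_lead_rank != l2->node_lead_rank)
            return (l1->node_lead_rank < l2->node_lead_rank) ? -1 : 1;
        if (l1->node_local_rank != l2->node_local_rank)
            return (l1->node_local_rank < l2->node_local_rank) ? -1 : 1;
        return 0;
    }

    int main(void)
    {
        /* Round-robin placement: ranks 0,2,4 on one node (lead 0),
         * ranks 1,3,5 on another (lead 1), initially in world order. */
        layout_t layout[] = { {0, 0, 0}, {1, 0, 1}, {2, 1, 0},
                              {3, 1, 1}, {4, 2, 0}, {5, 2, 1} };

        qsort(layout, 6, sizeof(layout_t), compare_layout);

        for (size_t i = 0; i < 6; i++)
            printf("rank %d (node lead %d)\n", layout[i].rank,
                   layout[i].node_lead_rank);
        return 0;
    }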
@@ -1668,8 +1689,25 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
if (total_ioc_count >= max_iocs)
break;
+
+ /* Find the block of layout structures for the next node */
+ while ((layout_idx < (size_t)app_layout->world_size) &&
+ (last_lead_rank == app_layout->layout[layout_idx].node_lead_rank))
+ layout_idx++;
+
+ if (layout_idx >= (size_t)app_layout->world_size)
+ break;
+
+ last_lead_rank = app_layout->layout[layout_idx].node_lead_rank;
}
+#ifdef H5_SUBFILING_DEBUG
+ if (app_topology->n_io_concentrators != total_ioc_count)
+ H5_subfiling_log(sf_context_id,
+ "%s: **WARN** Number of I/O concentrators adjusted from %d to %d", __func__,
+ app_topology->n_io_concentrators, total_ioc_count);
+#endif
+
/* Set final number of I/O concentrators after adjustments */
app_topology->n_io_concentrators = total_ioc_count;
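
[Editor's note] Taken together, the rewritten loop no longer indexes through the removed node_ranks array; it walks the sorted layout array one node-sized block at a time, taking up to iocs_per_node ranks from the front of each block. A compressed standalone sketch of that traversal, with an invented round-robin layout already sorted by node_lead_rank:

    #include <stdio.h>

    typedef struct { int rank; int node_lead_rank; } entry_t;

    int main(void)
    {
        /* 6 ranks on 2 nodes, placed round-robin, then sorted so that
         * entries sharing a node_lead_rank are contiguous. */
        entry_t layout[] = { {0, 0}, {2, 0}, {4, 0}, {1, 1}, {3, 1}, {5, 1} };
        int world_size = 6, iocs_per_node = 2;
        size_t idx = 0;
        int last_lead = layout[0].node_lead_rank;

        while (1) {
            /* Take the first iocs_per_node ranks of this node's block */
            for (int j = 0; j < iocs_per_node && idx < (size_t)world_size &&
                            layout[idx].node_lead_rank == last_lead; j++, idx++)
                printf("IOC: rank %d\n", layout[idx].rank);

            /* Skip the rest of this node's block */
            while (idx < (size_t)world_size &&
                   layout[idx].node_lead_rank == last_lead)
                idx++;

            if (idx >= (size_t)world_size)
                break;
            last_lead = layout[idx].node_lead_rank;
        }
        return 0;
    }

Under the old node_ranks indexing, the round-robin layout above would have picked wrong ranks; walking lead-rank blocks works for any placement.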
diff --git a/src/H5FDsubfiling/H5subfiling_common.h b/src/H5FDsubfiling/H5subfiling_common.h
index 51d8b22..395183a 100644
--- a/src/H5FDsubfiling/H5subfiling_common.h
+++ b/src/H5FDsubfiling/H5subfiling_common.h
@@ -155,6 +155,25 @@ typedef enum io_ops {
* will be broadcast to all MPI ranks and will
* provide a basis for determining which MPI ranks
* will host an I/O concentrator.
+ *
+ * - rank
+ * The MPI rank value for this processor
+ *
+ * - node_local_rank
+ * The MPI rank value for this processor in an MPI
+ * communicator that only involves MPI ranks on the
+ * same node as this processor
+ *
+ * - node_local_size
+ * The number of MPI ranks on the same node as this
+ * processor, including this processor itself
+ *
+ * - node_lead_rank
+ * The lowest MPI rank value for processors on the
+ * same node as this processor (possibly the MPI
+ * rank value for this processor); denotes a
+ * "lead" MPI rank for certain operations.
+ *
*/
typedef struct {
int rank;
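
[Editor's note] The fields documented above map naturally onto MPI's shared-memory communicator facilities. As a hedged illustration only (the library's actual values come from its own topology-gathering code, e.g. gather_topology_info), one way such values could be computed:

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        int world_rank, node_local_rank, node_local_size, node_lead_rank;
        MPI_Comm node_comm;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

        /* Group ranks that share a node; keying on world_rank makes
         * node-local rank 0 the lowest world rank on the node. */
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, world_rank,
                            MPI_INFO_NULL, &node_comm);
        MPI_Comm_rank(node_comm, &node_local_rank);
        MPI_Comm_size(node_comm, &node_local_size);

        /* Broadcast the lead (lowest) world rank to everyone on the node */
        node_lead_rank = world_rank;
        MPI_Bcast(&node_lead_rank, 1, MPI_INT, 0, node_comm);

        printf("rank %d: local %d/%d, lead %d\n", world_rank, node_local_rank,
               node_local_size, node_lead_rank);

        MPI_Comm_free(&node_comm);
        MPI_Finalize();
        return 0;
    }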