summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: jhendersonHDF <jhenderson@hdfgroup.org> 2023-04-27 16:52:11 (GMT)
committer: GitHub <noreply@github.com> 2023-04-27 16:52:11 (GMT)
commit: b5ecb0af6dc0665d16ee7f257332741fcbef91a2 (patch)
tree: 386a9862561463cf5c790d933ebe29467fda4840 /src
parent: 14a19b8c905328344c7d33505986d340eab34933 (diff)
download: hdf5-b5ecb0af6dc0665d16ee7f257332741fcbef91a2.zip
          hdf5-b5ecb0af6dc0665d16ee7f257332741fcbef91a2.tar.gz
          hdf5-b5ecb0af6dc0665d16ee7f257332741fcbef91a2.tar.bz2
Subfiling VFD - check if MPI is finalized during VFD termination (#2683)
Diffstat (limited to 'src')
-rw-r--r-- src/H5FDsubfiling/H5FDioc.c 18
-rw-r--r-- src/H5FDsubfiling/H5FDsubfiling.c 42
-rw-r--r-- src/H5FDsubfiling/H5subfiling_common.c 55
3 files changed, 87 insertions, 28 deletions
diff --git a/src/H5FDsubfiling/H5FDioc.c b/src/H5FDsubfiling/H5FDioc.c
index 2fd8b64..7d20021 100644
--- a/src/H5FDsubfiling/H5FDioc.c
+++ b/src/H5FDsubfiling/H5FDioc.c
@@ -887,16 +887,20 @@ done:
static herr_t
H5FD__ioc_close_int(H5FD_ioc_t *file_ptr)
{
+ int mpi_finalized;
+ int mpi_code;
herr_t ret_value = SUCCEED;
HDassert(file_ptr);
+ if (MPI_SUCCESS != (mpi_code = MPI_Finalized(&mpi_finalized)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Finalized failed", mpi_code);
+
if (file_ptr->context_id >= 0) {
subfiling_context_t *sf_context = H5_get_subfiling_object(file_ptr->context_id);
- int mpi_code;
/* Don't allow IOC threads to be finalized until everyone gets here */
- if (file_ptr->mpi_size > 1)
+ if (!mpi_finalized && (file_ptr->mpi_size > 1))
if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_ptr->comm)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
@@ -911,10 +915,12 @@ H5FD__ioc_close_int(H5FD_ioc_t *file_ptr)
file_ptr->context_id = -1;
}
- if (H5_mpi_comm_free(&file_ptr->comm) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Communicator");
- if (H5_mpi_info_free(&file_ptr->info) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Info object");
+ if (!mpi_finalized) {
+ if (H5_mpi_comm_free(&file_ptr->comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Communicator");
+ if (H5_mpi_info_free(&file_ptr->info) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Info object");
+ }
done:
HDfree(file_ptr->file_path);
diff --git a/src/H5FDsubfiling/H5FDsubfiling.c b/src/H5FDsubfiling/H5FDsubfiling.c
index e086190..64c92ed 100644
--- a/src/H5FDsubfiling/H5FDsubfiling.c
+++ b/src/H5FDsubfiling/H5FDsubfiling.c
@@ -374,12 +374,29 @@ H5FD__subfiling_term(void)
herr_t ret_value = SUCCEED;
if (H5FD_SUBFILING_g >= 0) {
+ int mpi_finalized;
int mpi_code;
+ /*
+ * Retrieve status of whether MPI has already been terminated.
+ * This can happen if an HDF5 ID is left unclosed and HDF5
+ * shuts down after MPI_Finalize() is called in an application.
+ */
+ if (MPI_SUCCESS != (mpi_code = MPI_Finalized(&mpi_finalized)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Finalized failed", mpi_code);
+
/* Free RPC message MPI Datatype */
- if (H5_subfiling_rpc_msg_type != MPI_DATATYPE_NULL)
- if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&H5_subfiling_rpc_msg_type)))
- H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+ if (H5_subfiling_rpc_msg_type != MPI_DATATYPE_NULL) {
+ if (!mpi_finalized) {
+ if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&H5_subfiling_rpc_msg_type)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
+ }
+#ifdef H5FD_SUBFILING_DEBUG
+ else
+ HDprintf("** WARNING **: HDF5 is terminating the Subfiling VFD after MPI_Finalize() was "
+ "called - an HDF5 ID was probably left unclosed\n");
+#endif
+ }
/* Clean up resources */
if (H5_subfiling_terminate() < 0)
@@ -1297,10 +1314,15 @@ done:
static herr_t
H5FD__subfiling_close_int(H5FD_subfiling_t *file_ptr)
{
+ int mpi_finalized;
+ int mpi_code;
herr_t ret_value = SUCCEED;
HDassert(file_ptr);
+ if (MPI_SUCCESS != (mpi_code = MPI_Finalized(&mpi_finalized)))
+ H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Finalized failed", mpi_code);
+
if (file_ptr->sf_file && H5FD_close(file_ptr->sf_file) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close subfile");
if (file_ptr->stub_file && H5FD_close(file_ptr->stub_file) < 0)
@@ -1311,13 +1333,15 @@ H5FD__subfiling_close_int(H5FD_subfiling_t *file_ptr)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_ARGS, FAIL, "can't close IOC FAPL");
file_ptr->fa.ioc_fapl_id = H5I_INVALID_HID;
- if (H5_mpi_comm_free(&file_ptr->comm) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Communicator");
- if (H5_mpi_info_free(&file_ptr->info) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Info object");
+ if (!mpi_finalized) {
+ if (H5_mpi_comm_free(&file_ptr->comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Communicator");
+ if (H5_mpi_info_free(&file_ptr->info) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Info object");
- if (H5_mpi_comm_free(&file_ptr->ext_comm) < 0)
- H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
+ if (H5_mpi_comm_free(&file_ptr->ext_comm) < 0)
+ H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
+ }
file_ptr->fail_to_encode = FALSE;
diff --git a/src/H5FDsubfiling/H5subfiling_common.c b/src/H5FDsubfiling/H5subfiling_common.c
index 58f3643..8fea794 100644
--- a/src/H5FDsubfiling/H5subfiling_common.c
+++ b/src/H5FDsubfiling/H5subfiling_common.c
@@ -338,8 +338,18 @@ done:
static herr_t
H5_free_subfiling_object_int(subfiling_context_t *sf_context)
{
+ int mpi_finalized;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
+
HDassert(sf_context);
+ if (MPI_SUCCESS != (mpi_code = MPI_Finalized(&mpi_finalized))) {
+ /* Assume MPI is finalized or worse, and try to clean up what we can */
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Finalized failed", mpi_code);
+ mpi_finalized = 1;
+ }
+
sf_context->sf_context_id = -1;
sf_context->h5_file_id = UINT64_MAX;
sf_context->sf_num_fids = 0;
@@ -352,28 +362,38 @@ H5_free_subfiling_object_int(subfiling_context_t *sf_context)
sf_context->sf_base_addr = -1;
if (sf_context->sf_msg_comm != MPI_COMM_NULL) {
- if (H5_mpi_comm_free(&sf_context->sf_msg_comm) < 0)
- return FAIL;
+ if (!mpi_finalized) {
+ if (H5_mpi_comm_free(&sf_context->sf_msg_comm) < 0)
+ return FAIL;
+ }
sf_context->sf_msg_comm = MPI_COMM_NULL;
}
if (sf_context->sf_data_comm != MPI_COMM_NULL) {
- if (H5_mpi_comm_free(&sf_context->sf_data_comm) < 0)
- return FAIL;
+ if (!mpi_finalized) {
+ if (H5_mpi_comm_free(&sf_context->sf_data_comm) < 0)
+ return FAIL;
+ }
sf_context->sf_data_comm = MPI_COMM_NULL;
}
if (sf_context->sf_eof_comm != MPI_COMM_NULL) {
- if (H5_mpi_comm_free(&sf_context->sf_eof_comm) < 0)
- return FAIL;
+ if (!mpi_finalized) {
+ if (H5_mpi_comm_free(&sf_context->sf_eof_comm) < 0)
+ return FAIL;
+ }
sf_context->sf_eof_comm = MPI_COMM_NULL;
}
if (sf_context->sf_node_comm != MPI_COMM_NULL) {
- if (H5_mpi_comm_free(&sf_context->sf_node_comm) < 0)
- return FAIL;
+ if (!mpi_finalized) {
+ if (H5_mpi_comm_free(&sf_context->sf_node_comm) < 0)
+ return FAIL;
+ }
sf_context->sf_node_comm = MPI_COMM_NULL;
}
if (sf_context->sf_group_comm != MPI_COMM_NULL) {
- if (H5_mpi_comm_free(&sf_context->sf_group_comm) < 0)
- return FAIL;
+ if (!mpi_finalized) {
+ if (H5_mpi_comm_free(&sf_context->sf_group_comm) < 0)
+ return FAIL;
+ }
sf_context->sf_group_comm = MPI_COMM_NULL;
}
@@ -402,16 +422,24 @@ H5_free_subfiling_object_int(subfiling_context_t *sf_context)
HDfree(sf_context);
- return SUCCEED;
+ H5_SUBFILING_FUNC_LEAVE;
}
static herr_t
H5_free_subfiling_topology(sf_topology_t *topology)
{
+ int mpi_finalized;
+ int mpi_code;
herr_t ret_value = SUCCEED;
HDassert(topology);
+ if (MPI_SUCCESS != (mpi_code = MPI_Finalized(&mpi_finalized))) {
+ /* Assume MPI is finalized or worse, but clean up what we can */
+ H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Finalized failed", mpi_code);
+ mpi_finalized = 1;
+ }
+
#ifndef NDEBUG
{
hbool_t topology_cached = FALSE;
@@ -442,8 +470,9 @@ H5_free_subfiling_topology(sf_topology_t *topology)
HDfree(topology->io_concentrators);
topology->io_concentrators = NULL;
- if (H5_mpi_comm_free(&topology->app_comm) < 0)
- H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
+ if (!mpi_finalized)
+ if (H5_mpi_comm_free(&topology->app_comm) < 0)
+ H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
HDfree(topology);