From e3fb9cdfb6d762af74513be8cceee98c03560b24 Mon Sep 17 00:00:00 2001
From: Quincey Koziol
Date: Thu, 20 Aug 2020 14:31:35 -0500
Subject: Avoid creating MPI datatypes on ranks with 0 chunks to write

---
 src/H5Dchunk.c | 121 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 68 insertions(+), 53 deletions(-)

diff --git a/src/H5Dchunk.c b/src/H5Dchunk.c
index ee83564..f4c1c8a 100644
--- a/src/H5Dchunk.c
+++ b/src/H5Dchunk.c
@@ -4961,6 +4961,7 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_info_t *chunk_info,
     H5FD_mpio_xfer_t prev_xfer_mode;    /* Previous data xfer mode */
     hbool_t have_xfer_mode = FALSE;     /* Whether the previous xffer mode has been retrieved */
     hbool_t need_addr_sort = FALSE;
+    hbool_t created_mpi_datatypes = FALSE;      /* Whether MPI datatypes were created */
     int i;                              /* Local index variable */
     herr_t ret_value = SUCCEED;         /* Return value */
 
@@ -4983,9 +4984,9 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_info_t *chunk_info,
         HGOTO_ERROR(H5E_DATASET, H5E_BADVALUE, FAIL, "Resulted in division by zero")
     num_blocks = (size_t)(chunk_info->num_io / (size_t)mpi_size); /* value should be the same on all procs */
 
-    /* after evenly distributing the blocks between processes, are
-       there any leftover blocks for each individual process
-       (round-robin) */
+    /* After evenly distributing the blocks between processes, are there any
+     * leftover blocks for each individual process (round-robin)?
+     */
     leftover_blocks = (size_t)(chunk_info->num_io % (size_t)mpi_size);
 
     /* Cast values to types needed by MPI */
@@ -4993,58 +4994,70 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_info_t *chunk_info,
     H5_CHECKED_ASSIGN(leftover, int, leftover_blocks, size_t);
     H5_CHECKED_ASSIGN(block_len, int, chunk_size, size_t);
 
-    /* Allocate buffers */
-    /* (MSC - should not need block_lens if MPI_type_create_hindexed_block is working) */
-    if(NULL == (block_lens = (int *)H5MM_malloc((size_t)(blocks + 1) * sizeof(int))))
-        HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk lengths buffer")
-    if(NULL == (chunk_disp_array = (MPI_Aint *)H5MM_malloc((size_t)(blocks + 1) * sizeof(MPI_Aint))))
-        HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk file displacement buffer")
+    /* Check if we have any chunks to write on this rank */
+    if(num_blocks > 0 || (leftover && leftover > mpi_rank)) {
+        /* Allocate buffers */
+        /* (MSC - should not need block_lens if MPI_type_create_hindexed_block is working) */
+        if(NULL == (block_lens = (int *)H5MM_malloc((size_t)(blocks + 1) * sizeof(int))))
+            HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk lengths buffer")
+        if(NULL == (chunk_disp_array = (MPI_Aint *)H5MM_malloc((size_t)(blocks + 1) * sizeof(MPI_Aint))))
+            HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk file displacement buffer")
 
-    for(i = 0 ; i < blocks ; i++) {
-        /* store the chunk address as an MPI_Aint */
-        chunk_disp_array[i] = (MPI_Aint)(chunk_info->addr[i + mpi_rank*blocks]);
+        for(i = 0 ; i < blocks ; i++) {
+            /* store the chunk address as an MPI_Aint */
+            chunk_disp_array[i] = (MPI_Aint)(chunk_info->addr[i + (mpi_rank * blocks)]);
 
-        /* MSC - should not need this if MPI_type_create_hindexed_block is working */
-        block_lens[i] = block_len;
+            /* MSC - should not need this if MPI_type_create_hindexed_block is working */
+            block_lens[i] = block_len;
 
-        /* make sure that the addresses in the datatype are
-           monotonically non decreasing */
-        if(i && (chunk_disp_array[i] < chunk_disp_array[i - 1]))
-            need_addr_sort = TRUE;
-    } /* end for */
+            /* Make sure that the addresses in the datatype are
+             * monotonically non-decreasing
+             */
+            if(i && (chunk_disp_array[i] < chunk_disp_array[i - 1]))
+                need_addr_sort = TRUE;
+        } /* end for */
 
-    /* calculate if there are any leftover blocks after evenly
-       distributing. If there are, then round robin the distribution
-       to processes 0 -> leftover. */
-    if(leftover && leftover > mpi_rank) {
-        chunk_disp_array[blocks] = (MPI_Aint)chunk_info->addr[blocks*mpi_size + mpi_rank];
-        if(blocks && (chunk_disp_array[blocks] < chunk_disp_array[blocks - 1]))
-            need_addr_sort = TRUE;
-        block_lens[blocks] = block_len;
-        blocks++;
-    }
+        /* Calculate if there are any leftover blocks after evenly
+         * distributing. If there are, then round-robin the distribution
+         * to processes 0 -> leftover.
+         */
+        if(leftover && leftover > mpi_rank) {
+            chunk_disp_array[blocks] = (MPI_Aint)chunk_info->addr[(blocks * mpi_size) + mpi_rank];
+            if(blocks && (chunk_disp_array[blocks] < chunk_disp_array[blocks - 1]))
+                need_addr_sort = TRUE;
+            block_lens[blocks] = block_len;
+            blocks++;
+        }
 
-    /*
-     * Ensure that the blocks are sorted in monotonically non-decreasing
-     * order of offset in the file.
-     */
-    if(need_addr_sort)
-        HDqsort(chunk_disp_array, blocks, sizeof(MPI_Aint), H5D__chunk_cmp_addr);
+        /* Ensure that the blocks are sorted in monotonically non-decreasing
+         * order of offset in the file.
+         */
+        if(need_addr_sort)
+            HDqsort(chunk_disp_array, blocks, sizeof(MPI_Aint), H5D__chunk_cmp_addr);
 
-    /* MSC - should use this if MPI_type_create_hindexed block is working:
-     * mpi_code = MPI_Type_create_hindexed_block(blocks, block_len, chunk_disp_array, MPI_BYTE, &file_type);
-     */
-    mpi_code = MPI_Type_create_hindexed(blocks, block_lens, chunk_disp_array, MPI_BYTE, &file_type);
-    if(mpi_code != MPI_SUCCESS)
-        HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
-    if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&file_type)))
-        HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
-
-    mpi_code = MPI_Type_create_hvector(blocks, block_len, 0, MPI_BYTE, &mem_type);
-    if(mpi_code != MPI_SUCCESS)
-        HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hvector failed", mpi_code)
-    if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&mem_type)))
-        HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+        /* MSC - should use this if MPI_type_create_hindexed block is working:
+         * mpi_code = MPI_Type_create_hindexed_block(blocks, block_len, chunk_disp_array, MPI_BYTE, &file_type);
+         */
+        mpi_code = MPI_Type_create_hindexed(blocks, block_lens, chunk_disp_array, MPI_BYTE, &file_type);
+        if(mpi_code != MPI_SUCCESS)
+            HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
+        if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&file_type)))
+            HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+        mpi_code = MPI_Type_create_hvector(blocks, block_len, 0, MPI_BYTE, &mem_type);
+        if(mpi_code != MPI_SUCCESS)
+            HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hvector failed", mpi_code)
+        if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&mem_type)))
+            HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
+
+        /* Indicate that the MPI types were created */
+        created_mpi_datatypes = TRUE;
+    } /* end if */
+    else {
+        /* Set up file & memory MPI types, to participate in collective write */
+        file_type = MPI_BYTE;
+        mem_type = MPI_BYTE;
+    } /* end else */
 
     /* Set MPI-IO VFD properties */
 
@@ -5076,10 +5089,12 @@ done:
        HDONE_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set transfer mode")
 
    /* free things */
-    if(MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type)))
-        HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
-    if(MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type)))
-        HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+    if(created_mpi_datatypes) {
+        if(MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type)))
+            HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+        if(MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type)))
+            HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+    } /* end if */
 
     H5MM_xfree(chunk_disp_array);
     H5MM_xfree(block_lens);
--
cgit v0.12

From be11bcf2ae80906fbd8aed55b7c3cfed821f6a43 Mon Sep 17 00:00:00 2001
From: Quincey Koziol
Date: Thu, 20 Aug 2020 17:08:24 -0500
Subject: Simplify code to avoid using a boolean to free MPI types

---
 src/H5Dchunk.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/H5Dchunk.c b/src/H5Dchunk.c
index f4c1c8a..ccffc3e 100644
--- a/src/H5Dchunk.c
+++ b/src/H5Dchunk.c
@@ -4957,11 +4957,10 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_info_t *chunk_info,
     int blocks, leftover, block_len;    /* converted to int for MPI */
     MPI_Aint *chunk_disp_array = NULL;
     int *block_lens = NULL;
-    MPI_Datatype mem_type, file_type;
+    MPI_Datatype mem_type = MPI_DATATYPE_NULL, file_type = MPI_DATATYPE_NULL;
     H5FD_mpio_xfer_t prev_xfer_mode;    /* Previous data xfer mode */
     hbool_t have_xfer_mode = FALSE;     /* Whether the previous xffer mode has been retrieved */
     hbool_t need_addr_sort = FALSE;
-    hbool_t created_mpi_datatypes = FALSE;      /* Whether MPI datatypes were created */
     int i;                              /* Local index variable */
     herr_t ret_value = SUCCEED;         /* Return value */
 
@@ -5049,9 +5048,6 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_info_t *chunk_info,
             HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hvector failed", mpi_code)
         if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&mem_type)))
             HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
-
-        /* Indicate that the MPI types were created */
-        created_mpi_datatypes = TRUE;
     } /* end if */
     else {
         /* Set up file & memory MPI types, to participate in collective write */
@@ -5089,12 +5085,12 @@ done:
         HDONE_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set transfer mode")
 
     /* free things */
-    if(created_mpi_datatypes) {
+    if(MPI_DATATYPE_NULL != file_type)
         if(MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type)))
             HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
+    if(MPI_DATATYPE_NULL != mem_type)
         if(MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type)))
             HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
-    } /* end if */
 
     H5MM_xfree(chunk_disp_array);
     H5MM_xfree(block_lens);
--
cgit v0.12

From 5a23ba8b59a2d91754dd8334aba12c860378c20a Mon Sep 17 00:00:00 2001
From: Quincey Koziol
Date: Thu, 20 Aug 2020 18:22:39 -0500
Subject: Don't free builtin MPI_BYTE MPI type

---
 src/H5Dchunk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/H5Dchunk.c b/src/H5Dchunk.c
index ccffc3e..9543654 100644
--- a/src/H5Dchunk.c
+++ b/src/H5Dchunk.c
@@ -5085,10 +5085,10 @@ done:
         HDONE_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set transfer mode")
 
     /* free things */
-    if(MPI_DATATYPE_NULL != file_type)
+    if(MPI_DATATYPE_NULL != file_type && MPI_BYTE != file_type)
        if(MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type)))
            HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
-    if(MPI_DATATYPE_NULL != mem_type)
+    if(MPI_DATATYPE_NULL != mem_type && MPI_BYTE != mem_type)
        if(MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type)))
HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) H5MM_xfree(chunk_disp_array); -- cgit v0.12 From 3c5512eac4ecbd6f6a0fe1276de89c4fd2ae48e5 Mon Sep 17 00:00:00 2001 From: Quincey Koziol Date: Thu, 20 Aug 2020 18:24:07 -0500 Subject: Simplify default use of MPI_BYTE --- src/H5Dchunk.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/H5Dchunk.c b/src/H5Dchunk.c index 9543654..42ffd3e 100644 --- a/src/H5Dchunk.c +++ b/src/H5Dchunk.c @@ -4957,7 +4957,7 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_info_t *chunk_info, int blocks, leftover, block_len; /* converted to int for MPI */ MPI_Aint *chunk_disp_array = NULL; int *block_lens = NULL; - MPI_Datatype mem_type = MPI_DATATYPE_NULL, file_type = MPI_DATATYPE_NULL; + MPI_Datatype mem_type = MPI_BYTE, file_type = MPI_BYTE; H5FD_mpio_xfer_t prev_xfer_mode; /* Previous data xfer mode */ hbool_t have_xfer_mode = FALSE; /* Whether the previous xffer mode has been retrieved */ hbool_t need_addr_sort = FALSE; @@ -5049,11 +5049,6 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_info_t *chunk_info, if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&mem_type))) HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code) } /* end if */ - else { - /* Set up file & memory MPI types, to participate in collective write */ - file_type = MPI_BYTE; - mem_type = MPI_BYTE; - } /* end else */ /* Set MPI-IO VFD properties */ @@ -5085,10 +5080,10 @@ done: HDONE_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set transfer mode") /* free things */ - if(MPI_DATATYPE_NULL != file_type && MPI_BYTE != file_type) + if(MPI_BYTE != file_type) if(MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type))) HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) - if(MPI_DATATYPE_NULL != mem_type && MPI_BYTE != mem_type) + if(MPI_BYTE != mem_type) if(MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type))) HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code) H5MM_xfree(chunk_disp_array); -- cgit v0.12 From 3e4f255c5031f14a68d387df0762f7a2210df5a3 Mon Sep 17 00:00:00 2001 From: Quincey Koziol Date: Fri, 21 Aug 2020 16:28:19 -0500 Subject: Added release note for MPI type fix --- release_docs/RELEASE.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index be88cd1..a7e8735 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -143,6 +143,12 @@ Bug Fixes since HDF5-1.12.0 release ================================== Library ------- + - Avoid setting up complex MPI types with 0-length vectors, which some + MPI implementations don't handle well. (In particular, IBM + SpectrumScale MPI on the Summit system at ORNL) + + (QAK - 2020/08/21) + - Explicitly declared dlopen to use RTLD_LOCAL dlopen documentation states that if neither RTLD_GLOBAL nor -- cgit v0.12
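
For reference, the pattern this series converges on can be sketched outside of HDF5. The program below is a minimal standalone MPI-IO example, not the library's code path: the file name, block layout, and per-rank block counts are invented for illustration. It shows the same idea as the patches above: default both datatypes to MPI_BYTE, build and commit derived datatypes only on ranks that actually own blocks, have every rank enter the collective write (empty ranks contribute a zero count), and afterwards free only derived datatypes, never the builtin MPI_BYTE. For brevity it uses MPI_Type_create_hindexed_block and MPI_Type_contiguous, where the patched H5D__chunk_collective_fill() uses MPI_Type_create_hindexed and MPI_Type_create_hvector.

/* Standalone sketch (not HDF5 code): guarded datatype creation for a
 * collective write where some ranks have nothing to write.
 */
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    const int block_len = 4;             /* bytes per block (stand-in for chunk size) */
    int nblocks = (rank % 2) ? 2 : 0;    /* even ranks own no blocks, like ranks with 0 chunks */

    /* Default both types to MPI_BYTE so every rank can join the collective call */
    MPI_Datatype file_type = MPI_BYTE;
    MPI_Datatype mem_type  = MPI_BYTE;
    char *buf = NULL;

    if(nblocks > 0) {
        MPI_Aint *disps = malloc((size_t)nblocks * sizeof(MPI_Aint));
        buf = malloc((size_t)(nblocks * block_len));
        for(int i = 0; i < nblocks; i++) {
            /* arbitrary, monotonically increasing file offsets for this rank */
            disps[i] = (MPI_Aint)((rank * nblocks + i) * block_len);
            for(int j = 0; j < block_len; j++)
                buf[i * block_len + j] = (char)rank;
        }

        /* One file-space block per owned chunk, at its file offset */
        MPI_Type_create_hindexed_block(nblocks, block_len, disps, MPI_BYTE, &file_type);
        MPI_Type_commit(&file_type);

        /* Matching contiguous memory type */
        MPI_Type_contiguous(nblocks * block_len, MPI_BYTE, &mem_type);
        MPI_Type_commit(&mem_type);

        free(disps);
    }

    MPI_File fh;
    MPI_File_open(MPI_COMM_WORLD, "fill.bin",   /* arbitrary example file name */
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
    MPI_File_set_view(fh, 0, MPI_BYTE, file_type, "native", MPI_INFO_NULL);

    /* Collective write: ranks without blocks contribute a zero count */
    MPI_File_write_all(fh, buf, (nblocks > 0) ? 1 : 0, mem_type, MPI_STATUS_IGNORE);
    MPI_File_close(&fh);

    /* Only free derived types; freeing the builtin MPI_BYTE is an error */
    if(file_type != MPI_BYTE)
        MPI_Type_free(&file_type);
    if(mem_type != MPI_BYTE)
        MPI_Type_free(&mem_type);

    free(buf);
    MPI_Finalize();
    return 0;
}

Because ranks with no chunks never construct derived datatypes at all, this arrangement avoids the 0-length-vector datatypes that the release note says some MPI implementations handle poorly, while still keeping every rank inside the collective write.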