From 663321087a73e760a028517584731eb8ef308ba2 Mon Sep 17 00:00:00 2001 From: Houjun Tang Date: Mon, 11 Jul 2022 13:59:19 -0700 Subject: Support for UnifyFS with MPI_File_sync (#1801) * Initial implementation for supporting UnifyFS in HDF5 with MPI_File_sync after write * Committing clang-format changes * Fix format * Fix env variable and return value check * Fix flag retrieve * Fix issues with getting/setting the flag * Fix merge conflicts * Update * Committing clang-format changes * Update based on suggestions * Committing clang-format changes Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> --- src/H5Dio.c | 12 ++++++++++++ src/H5FDmpi.c | 37 +++++++++++++++++++++++++++++++++++++ src/H5FDmpio.c | 44 ++++++++++++++++++++++++++++++++++---------- src/H5FDprivate.h | 1 + src/H5FDpublic.h | 1 + src/H5Fmpi.c | 1 - src/H5Fprivate.h | 1 + src/H5Fquery.c | 31 +++++++++++++++++++++++++++++++ src/H5mpi.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/H5private.h | 1 + 10 files changed, 172 insertions(+), 11 deletions(-) diff --git a/src/H5Dio.c b/src/H5Dio.c index 470b245..a7cb937 100644 --- a/src/H5Dio.c +++ b/src/H5Dio.c @@ -830,6 +830,18 @@ H5D__ioinfo_adjust(H5D_io_info_t *io_info, const H5D_t *dset, const H5S_t *file_ } /* end if */ } /* end if */ else { + /* Fail when file sync is required, since it requires collective write */ + if (io_info->op_type == H5D_IO_OP_WRITE) { + hbool_t mpi_file_sync_required = FALSE; + if (H5F_shared_get_mpi_file_sync_required(io_info->f_sh, &mpi_file_sync_required) < 0) + HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get MPI file_sync_required flag") + + if (mpi_file_sync_required) + HGOTO_ERROR( + H5E_DATASET, H5E_NO_INDEPENDENT, FAIL, + "Can't perform independent write when MPI_File_sync is required by ROMIO driver.") + } + /* Check if there are any filters in the pipeline. If there are, * we cannot break to independent I/O if this is a write operation * with multiple ranks involved; otherwise, there will be metadata diff --git a/src/H5FDmpi.c b/src/H5FDmpi.c index 45791e8..80f2ead 100644 --- a/src/H5FDmpi.c +++ b/src/H5FDmpi.c @@ -252,6 +252,43 @@ H5FD_mpi_haddr_to_MPIOff(haddr_t addr, MPI_Offset *mpi_off /*out*/) FUNC_LEAVE_NOAPI(ret_value) } +/*------------------------------------------------------------------------- + * Function: H5FD_mpi_get_file_sync_required + * + * Purpose: Retrieves the mpi_file_sync_required used for the file + * + * Return: Success: Non-negative + * + * Failure: Negative + * + * Programmer: Houjun Tang + * May 19, 2022 + * + *------------------------------------------------------------------------- + */ +herr_t +H5FD_mpi_get_file_sync_required(H5FD_t *file, hbool_t *file_sync_required) +{ + const H5FD_class_t *cls; + uint64_t flags = H5FD_CTL_ROUTE_TO_TERMINAL_VFD_FLAG; + void * file_sync_required_ptr = (void *)(&file_sync_required); + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI(FAIL) + + HDassert(file); + cls = (const H5FD_class_t *)(file->cls); + HDassert(cls); + HDassert(cls->ctl); /* All MPI drivers must implement this */ + + /* Dispatch to driver */ + if ((cls->ctl)(file, H5FD_CTL_GET_MPI_FILE_SYNC_OPCODE, flags, NULL, file_sync_required_ptr) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "driver get_mpi_file_synce request failed") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD_mpi_get_file_sync_required() */ + #ifdef NOT_YET /*------------------------------------------------------------------------- diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index ce32bc7..f442644 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -60,16 +60,17 @@ static char H5FD_mpi_native_g[] = "native"; * driver doesn't bother to keep it updated since it's an expensive operation. */ typedef struct H5FD_mpio_t { - H5FD_t pub; /* Public stuff, must be first */ - MPI_File f; /* MPIO file handle */ - MPI_Comm comm; /* MPI Communicator */ - MPI_Info info; /* MPI info object */ - int mpi_rank; /* This process's rank */ - int mpi_size; /* Total number of processes */ - haddr_t eof; /* End-of-file marker */ - haddr_t eoa; /* End-of-address marker */ - haddr_t last_eoa; /* Last known end-of-address marker */ - haddr_t local_eof; /* Local end-of-file address for each process */ + H5FD_t pub; /* Public stuff, must be first */ + MPI_File f; /* MPIO file handle */ + MPI_Comm comm; /* MPI Communicator */ + MPI_Info info; /* MPI info object */ + int mpi_rank; /* This process's rank */ + int mpi_size; /* Total number of processes */ + haddr_t eof; /* End-of-file marker */ + haddr_t eoa; /* End-of-address marker */ + haddr_t last_eoa; /* Last known end-of-address marker */ + haddr_t local_eof; /* Local end-of-file address for each process */ + hbool_t mpi_file_sync_required; /* Whether the ROMIO driver requires MPI_File_sync after write */ } H5FD_mpio_t; /* Private Prototypes */ @@ -964,6 +965,10 @@ H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t H5_ATTR file->mpi_rank = mpi_rank; file->mpi_size = mpi_size; + /* Retrieve the flag indicating whether MPI_File_sync is needed after each write */ + if (H5_mpio_get_file_sync_required(fh, &file->mpi_file_sync_required) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "unable to get mpi_file_sync_required hint") + /* Only processor p0 will get the filesize and broadcast it. */ if (mpi_rank == 0) { /* If MPI_File_get_size fails, broadcast file size as -1 to signal error */ @@ -1629,6 +1634,12 @@ H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, h if (MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat))) HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code) + + /* Do MPI_File_sync when needed by underlying ROMIO driver */ + if (file->mpi_file_sync_required) { + if (MPI_SUCCESS != (mpi_code = MPI_File_sync(file->f))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_sync failed", mpi_code) + } } /* end if */ else { if (type != H5FD_MEM_DRAW) @@ -2638,6 +2649,12 @@ H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t co if (MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat))) HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code) + + /* Do MPI_File_sync when needed by underlying ROMIO driver */ + if (file->mpi_file_sync_required) { + if (MPI_SUCCESS != (mpi_code = MPI_File_sync(file->f))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_sync failed", mpi_code) + } } /* end if */ else if (size_i > 0) { #ifdef H5FDmpio_DEBUG @@ -3020,6 +3037,7 @@ done: * H5FD_CTL_GET_MPI_COMMUNICATOR_OPCODE * H5FD_CTL_GET_MPI_RANK_OPCODE * H5FD_CTL_GET_MPI_SIZE_OPCODE + * H5FD_CTL_GET_MPI_FILE_SYNC_OPCODE * * Note that these opcodes must be supported by all VFDs that * support MPI. @@ -3063,6 +3081,12 @@ H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void H5_AT **((int **)output) = file->mpi_size; break; + case H5FD_CTL_GET_MPI_FILE_SYNC_OPCODE: + HDassert(output); + HDassert(*output); + **((hbool_t **)output) = file->mpi_file_sync_required; + break; + default: /* unknown op code */ if (flags & H5FD_CTL_FAIL_IF_UNKNOWN_FLAG) { diff --git a/src/H5FDprivate.h b/src/H5FDprivate.h index bcbc693..9f0c15f 100644 --- a/src/H5FDprivate.h +++ b/src/H5FDprivate.h @@ -194,6 +194,7 @@ H5_DLL herr_t H5FD_get_mpio_atomicity(H5FD_t *file, hbool_t *flag); H5_DLL int H5FD_mpi_get_rank(H5FD_t *file); H5_DLL int H5FD_mpi_get_size(H5FD_t *file); H5_DLL MPI_Comm H5FD_mpi_get_comm(H5FD_t *file); +H5_DLL herr_t H5FD_mpi_get_file_sync_required(H5FD_t *file, hbool_t *file_sync_required); #endif /* H5_HAVE_PARALLEL */ #endif /* H5FDprivate_H */ diff --git a/src/H5FDpublic.h b/src/H5FDpublic.h index 5221e35..e19d4e7 100644 --- a/src/H5FDpublic.h +++ b/src/H5FDpublic.h @@ -196,6 +196,7 @@ #define H5FD_CTL_MEM_ALLOC 5 #define H5FD_CTL_MEM_FREE 6 #define H5FD_CTL_MEM_COPY 7 +#define H5FD_CTL_GET_MPI_FILE_SYNC_OPCODE 8 /* ctl function flags: */ diff --git a/src/H5Fmpi.c b/src/H5Fmpi.c index 02d8d52..8768104 100644 --- a/src/H5Fmpi.c +++ b/src/H5Fmpi.c @@ -587,5 +587,4 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5F_mpi_get_file_block_type() */ - #endif /* H5_HAVE_PARALLEL */ diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h index 629aee1..efce71d 100644 --- a/src/H5Fprivate.h +++ b/src/H5Fprivate.h @@ -974,6 +974,7 @@ H5_DLL herr_t H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mp H5_DLL herr_t H5F_mpi_get_file_block_type(hbool_t commit, MPI_Datatype *new_type, hbool_t *new_type_derived); H5_DLL hbool_t H5F_get_coll_metadata_reads(const H5F_t *f); H5_DLL void H5F_set_coll_metadata_reads(H5F_t *f, H5P_coll_md_read_flag_t *file_flag, hbool_t *context_flag); +H5_DLL herr_t H5F_shared_get_mpi_file_sync_required(const H5F_shared_t *f_sh, hbool_t *flag); #endif /* H5_HAVE_PARALLEL */ /* External file cache routines */ diff --git a/src/H5Fquery.c b/src/H5Fquery.c index a625897..b2b0972 100644 --- a/src/H5Fquery.c +++ b/src/H5Fquery.c @@ -1068,6 +1068,37 @@ H5F_coll_md_read(const H5F_t *f) FUNC_LEAVE_NOAPI(f->shared->coll_md_read) } /* end H5F_coll_md_read() */ + +/*------------------------------------------------------------------------- + * Function: H5F_shared_get_mpi_file_sync_required + * + * Purpose: Returns the mpi_file_sync_required flag + * + * Return: Success: Non-negative + * Failure: Negative + * + * Programmer: Houjun Tang + * May 19, 2022 + * + *------------------------------------------------------------------------- + */ +herr_t +H5F_shared_get_mpi_file_sync_required(const H5F_shared_t *f_sh, hbool_t *flag /*out*/) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + HDassert(f_sh); + HDassert(flag); + + /* Dispatch to driver */ + if ((ret_value = H5FD_mpi_get_file_sync_required(f_sh->lf, flag)) < 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "driver get_file_sync_required request failed") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5F_shared_get_mpi_file_sync_required() */ #endif /* H5_HAVE_PARALLEL */ /*------------------------------------------------------------------------- diff --git a/src/H5mpi.c b/src/H5mpi.c index f5d709a..f951e1e 100644 --- a/src/H5mpi.c +++ b/src/H5mpi.c @@ -782,4 +782,58 @@ done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5_mpio_gatherv_alloc_simple() */ +/*------------------------------------------------------------------------- + * Function: H5_mpio_get_file_sync_required + * + * Purpose: Retrieve the MPI hint indicating whether the data written + * by the MPI ROMIO driver is immediately visible to all MPI + * ranks. + * + * Notes: This routine is designed for supporting UnifyFS that needs + * MPI_File_sync in order to make the written data available + * to all ranks. + * + * Return: Non-negative on success/Negative on failure + * + * Programmer: Houjun Tang, April 7, 2022 + * + *------------------------------------------------------------------------- + */ +herr_t +H5_mpio_get_file_sync_required(MPI_File fh, hbool_t *file_sync_required) +{ + MPI_Info info_used; + int flag; + char value[MPI_MAX_INFO_VAL]; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI(FAIL) + + HDassert(file_sync_required); + + *file_sync_required = FALSE; + + if (MPI_SUCCESS != MPI_File_get_info(fh, &info_used)) + HGOTO_ERROR(H5E_LIB, H5E_CANTGET, FAIL, "can't get MPI info") + + if (MPI_SUCCESS != + MPI_Info_get(info_used, "romio_visibility_immediate", MPI_MAX_INFO_VAL - 1, value, &flag)) + HGOTO_ERROR(H5E_LIB, H5E_CANTGET, FAIL, "can't get MPI info") + + if (flag && !HDstrcmp(value, "false")) + *file_sync_required = TRUE; + + if (MPI_SUCCESS != MPI_Info_free(&info_used)) + HGOTO_ERROR(H5E_LIB, H5E_CANTFREE, FAIL, "can't free MPI info") + + /* Force setting the flag via env variable (temp solution before the flag is implemented in MPI) */ + char *sync_env_var = HDgetenv("HDF5_DO_MPI_FILE_SYNC"); + if (sync_env_var && (!HDstrcmp(sync_env_var, "TRUE") || !HDstrcmp(sync_env_var, "1"))) + *file_sync_required = TRUE; + if (sync_env_var && (!HDstrcmp(sync_env_var, "FALSE") || !HDstrcmp(sync_env_var, "0"))) + *file_sync_required = FALSE; + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5_mpio_get_file_sync_required() */ #endif /* H5_HAVE_PARALLEL */ diff --git a/src/H5private.h b/src/H5private.h index b19b146..ed15d8b 100644 --- a/src/H5private.h +++ b/src/H5private.h @@ -2614,6 +2614,7 @@ H5_DLL herr_t H5_mpio_gatherv_alloc_simple(void *send_buf, int send_count, MPI_ MPI_Datatype recv_type, hbool_t allgather, int root, MPI_Comm comm, int mpi_rank, int mpi_size, void **out_buf, size_t *out_buf_num_entries); +H5_DLL herr_t H5_mpio_get_file_sync_required(MPI_File fh, hbool_t *file_sync_required); #endif /* H5_HAVE_PARALLEL */ /* Functions for debugging */ -- cgit v0.12