diff options
Diffstat (limited to 'src/H5FDmpio.c')
-rw-r--r-- | src/H5FDmpio.c | 187 |
1 files changed, 108 insertions, 79 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index 9417d46..ee3277d 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -5,12 +5,10 @@ * * * This file is part of HDF5. The full HDF5 copyright notice, including * * terms governing use, modification, and redistribution, is contained in * - * the files COPYING and Copyright.html. COPYING can be found at the root * - * of the source code distribution tree; Copyright.html can be found at the * - * root level of an installed copy of the electronic HDF5 document set and * - * is linked from the top-level documents page. It can also be found at * - * http://hdfgroup.org/HDF5/doc/Copyright.html. If you do not have * - * access to either file, you may request a copy from help@hdfgroup.org. * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ /* @@ -25,6 +23,7 @@ #include "H5private.h" /* Generic Functions */ +#include "H5CXprivate.h" /* API Contexts */ #include "H5Dprivate.h" /* Dataset functions */ #include "H5Eprivate.h" /* Error handling */ #include "H5Fprivate.h" /* File access */ @@ -1061,7 +1060,7 @@ done: fprintf(stdout, "Leaving H5FD_mpio_open\n" ); #endif FUNC_LEAVE_NOAPI(ret_value) -} +} /* end H5FD_mpio_open() */ /*------------------------------------------------------------------------- @@ -1152,10 +1151,11 @@ H5FD_mpio_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out /* Set the VFL feature flags that this driver supports */ if(flags) { *flags=0; - *flags|=H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */ - *flags|=H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */ - *flags|=H5FD_FEAT_HAS_MPI; /* This driver uses MPI */ - *flags|=H5FD_FEAT_ALLOCATE_EARLY; /* Allocate space early instead of late */ + *flags |= H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */ + *flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */ + *flags |= H5FD_FEAT_HAS_MPI; /* This driver uses MPI */ + *flags |= H5FD_FEAT_ALLOCATE_EARLY; /* Allocate space early instead of late */ + *flags |= H5FD_FEAT_DEFAULT_VFD_COMPATIBLE; /* VFD creates a file which can be opened with the default VFD */ } /* end if */ FUNC_LEAVE_NOAPI(SUCCEED) @@ -1411,8 +1411,8 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t dxpl_id, haddr_t addr, size_t size, - void *buf/*out*/) +H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, + hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size, void *buf/*out*/) { H5FD_mpio_t *file = (H5FD_mpio_t*)_file; MPI_Offset mpi_off; @@ -1431,7 +1431,6 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t dxpl_id, had int io_size; /* Actual number of bytes requested */ int n; #endif - H5P_genplist_t *plist = NULL; /* Property list pointer */ hbool_t use_view_this_time = FALSE; herr_t ret_value = SUCCEED; @@ -1443,9 +1442,6 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t dxpl_id, had #endif HDassert(file); HDassert(H5FD_MPIO==file->pub.driver_id); - /* Make certain we have the correct type of property list */ - HDassert(H5I_GENPROP_LST==H5I_get_type(dxpl_id)); - HDassert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER)); HDassert(buf); /* Portably initialize MPI status variable */ @@ -1468,13 +1464,9 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t dxpl_id, had if(type == H5FD_MEM_DRAW) { H5FD_mpio_xfer_t xfer_mode; /* I/O tranfer mode */ - /* Obtain the data transfer properties */ - if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list") - - /* get the transfer mode from the dxpl */ - if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode)<0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode") + /* Get the transfer mode from the API context */ + if(H5CX_get_io_xfer_mode(&xfer_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode") /* * Set up for a fancy xfer using complex types, or single byte block. We @@ -1488,11 +1480,9 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t dxpl_id, had /* Remember that views are used */ use_view_this_time = TRUE; - /* prepare for a full-blown xfer using btype, ftype, and disp */ - if(H5P_get(plist, H5FD_MPI_XFER_MEM_MPI_TYPE_NAME, &buf_type) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property") - if(H5P_get(plist, H5FD_MPI_XFER_FILE_MPI_TYPE_NAME, &file_type) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property") + /* Prepare for a full-blown xfer using btype, ftype, and disp */ + if(H5CX_get_mpi_coll_datatypes(&buf_type, &file_type) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O datatypes") /* * Set the file view when we are using MPI derived types @@ -1509,18 +1499,15 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t dxpl_id, had /* Read the data. */ if(use_view_this_time) { - H5FD_mpio_collective_opt_t coll_opt_mode; + H5FD_mpio_collective_opt_t coll_opt_mode; #ifdef H5FDmpio_DEBUG if (H5FD_mpio_Debug[(int)'t']) fprintf(stdout, "H5FD_mpio_read: using MPIO collective mode\n"); #endif /* Get the collective_opt property to check whether the application wants to do IO individually. */ - HDassert(plist); - - /* get the transfer mode from the dxpl */ - if(H5P_get(plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME, &coll_opt_mode) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property") + if(H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property") if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) { #ifdef H5FDmpio_DEBUG @@ -1711,8 +1698,8 @@ done: *------------------------------------------------------------------------- */ static herr_t -H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, - size_t size, const void *buf) +H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, + haddr_t addr, size_t size, const void *buf) { H5FD_mpio_t *file = (H5FD_mpio_t*)_file; MPI_Offset mpi_off; @@ -1730,7 +1717,6 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, #endif int size_i; hbool_t use_view_this_time = FALSE; - H5P_genplist_t *plist = NULL; /* Property list pointer */ H5FD_mpio_xfer_t xfer_mode; /* I/O tranfer mode */ herr_t ret_value = SUCCEED; @@ -1742,11 +1728,11 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, #endif HDassert(file); HDassert(H5FD_MPIO==file->pub.driver_id); - /* Make certain we have the correct type of property list */ - HDassert(H5I_GENPROP_LST==H5I_get_type(dxpl_id)); - HDassert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER)); HDassert(buf); + /* Verify that no data is written when between MPI_Barrier()s during file flush */ + HDassert(!H5CX_get_mpi_file_flushing()); + /* Portably initialize MPI status variable */ HDmemset(&mpi_stat, 0, sizeof(MPI_Status)); @@ -1762,13 +1748,9 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, fprintf(stdout, "in H5FD_mpio_write mpi_off=%ld size_i=%d\n", (long)mpi_off, size_i); #endif - /* Obtain the data transfer properties */ - if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list") - - /* get the transfer mode from the dxpl */ - if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode)<0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode") + /* Get the transfer mode from the API context */ + if(H5CX_get_io_xfer_mode(&xfer_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode") /* * Set up for a fancy xfer using complex types, or single byte block. We @@ -1782,11 +1764,9 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, /* Remember that views are used */ use_view_this_time = TRUE; - /* prepare for a full-blown xfer using btype, ftype, and disp */ - if(H5P_get(plist, H5FD_MPI_XFER_MEM_MPI_TYPE_NAME, &buf_type) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property") - if(H5P_get(plist, H5FD_MPI_XFER_FILE_MPI_TYPE_NAME, &file_type) < 0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property") + /* Prepare for a full-blown xfer using btype, ftype, and disp */ + if(H5CX_get_mpi_coll_datatypes(&buf_type, &file_type) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O datatypes") /* * Set the file view when we are using MPI derived types @@ -1810,10 +1790,8 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, #endif /* Get the collective_opt property to check whether the application wants to do IO individually. */ - HDassert(plist); - /* get the transfer mode from the dxpl */ - if(H5P_get(plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME, &coll_opt_mode)<0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property") + if(H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property") if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) { #ifdef H5FDmpio_DEBUG @@ -1917,10 +1895,9 @@ H5FD_mpio_flush(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t closing) HDassert(H5FD_MPIO == file->pub.driver_id); /* Only sync the file if we are not going to immediately close it */ - if(!closing) { + if(!closing) if(MPI_SUCCESS != (mpi_code = MPI_File_sync(file->f))) HMPI_GOTO_ERROR(FAIL, "MPI_File_sync failed", mpi_code) - } /* end if */ done: #ifdef H5FDmpio_DEBUG @@ -1937,12 +1914,33 @@ done: * * Purpose: Make certain the file's size matches it's allocated size * + * This is a little sticky in the mpio case, as it is not + * easy for us to track the current EOF by extracting it from + * write calls. + * + * Instead, we first check to see if the eoa has changed since + * the last call to this function. If it has, we call + * MPI_File_get_size() to determine the current EOF, and + * only call MPI_File_set_size() if this value disagrees + * with the current eoa. + * * Return: Success: Non-negative * Failure: Negative * * Programmer: Quincey Koziol * January 31, 2008 * + * Changes: Heavily reworked to avoid unnecessary MPI_File_set_size() + * calls. The hope is that these calls are superfluous in the + * typical case, allowing us to avoid truncates most of the + * time. + * + * The basic idea is to query the file system to get the + * current eof, and only truncate if the file systems + * conception of the eof disagrees with our eoa. + * + * JRM -- 10/27/17 + * *------------------------------------------------------------------------- */ static herr_t @@ -1960,29 +1958,60 @@ H5FD_mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_ HDassert(file); HDassert(H5FD_MPIO == file->pub.driver_id); - /* Extend the file to make sure it's large enough, then sync. - * Unfortunately, keeping track of EOF is an expensive operation, so - * we can't just check whether EOF<EOA like with other drivers. - * Therefore we'll just read the byte at EOA-1 and then write it back. */ - if(file->eoa > file->last_eoa) { - int mpi_code; /* mpi return code */ - MPI_Offset mpi_off; + if(!H5F_addr_eq(file->eoa, file->last_eoa)) { + int mpi_code; /* mpi return code */ + MPI_Offset size; + MPI_Offset needed_eof; + + /* In principle, it is possible for the size returned by the + * call to MPI_File_get_size() to depend on whether writes from + * all proceeses have completed at the time process 0 makes the + * call. + * + * In practice, most (all?) truncate calls will come after a barrier + * and with no interviening writes to the file (with the possible + * exception of sueprblock / superblock extension message updates). + * + * Check the "MPI file closing" flag in the API context to determine + * if we can skip the barrier. + */ + if(!H5CX_get_mpi_file_flushing()) + if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) - if(H5FD_mpi_haddr_to_MPIOff(file->eoa, &mpi_off) < 0) - HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset") + /* Only processor p0 will get the filesize and broadcast it. */ + /* (Note that throwing an error here will cause non-rank 0 processes + * to hang in following Bcast. -QAK, 3/17/2018) + */ + if(0 == file->mpi_rank) + if(MPI_SUCCESS != (mpi_code = MPI_File_get_size(file->f, &size))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code) - /* Extend the file's size */ - if(MPI_SUCCESS != (mpi_code = MPI_File_set_size(file->f, mpi_off))) - HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code) + /* Broadcast file size */ + if(MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code) - /* Don't let any proc return until all have extended the file. - * (Prevents race condition where some processes go ahead and write - * more data to the file before all the processes have finished making - * it the shorter length, potentially truncating the file and dropping - * the new data written) - */ - if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) + if(H5FD_mpi_haddr_to_MPIOff(file->eoa, &needed_eof) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset") + + /* eoa != eof. Set eof to eoa */ + if(size != needed_eof) { + /* Extend the file's size */ + if(MPI_SUCCESS != (mpi_code = MPI_File_set_size(file->f, needed_eof))) + HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code) + + /* In general, we must wait until all processes have finished + * the truncate before any process can continue, since it is + * possible that a process would write at the end of the + * file, and this write would be discarded by the truncate. + * + * While this is an issue for a user initiated flush, it may + * not be an issue at file close. If so, we may be able to + * optimize out the following barrier in that case. + */ + if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) + } /* end if */ /* Update the 'last' eoa value */ file->last_eoa = file->eoa; |