diff options
author | Quincey Koziol <koziol@hdfgroup.org> | 2004-01-31 01:38:44 (GMT) |
---|---|---|
committer | Quincey Koziol <koziol@hdfgroup.org> | 2004-01-31 01:38:44 (GMT) |
commit | 138bc92ebdb7c6e1ad379dcdabae21bf0a79ab0d (patch) | |
tree | 046bd488f60127ac3a6ba0edbd482b44f022c788 /src/H5FDmpio.c | |
parent | f499912c3247e592a0eeef7207b917428756b094 (diff) | |
download | hdf5-138bc92ebdb7c6e1ad379dcdabae21bf0a79ab0d.zip hdf5-138bc92ebdb7c6e1ad379dcdabae21bf0a79ab0d.tar.gz hdf5-138bc92ebdb7c6e1ad379dcdabae21bf0a79ab0d.tar.bz2 |
[svn-r8126] Purpose:
Bug fix/optimization
Description:
Address slowdown in MPI-I/O file metadata operations that was introduced
mid-stream. We now _require_ a POSIX compliant parallel file system for the
MPI-I/O file driver (as well as for the MPI-POSIX file driver).
Also optimized file open operation when the file is being created by
reducing the number of collective & syncronizing calls.
Additionally, refactor the MPI routines into a common place, eliminating
duplicated code.
Platforms tested:
FreeBSD 4.9 (sleipnir) w/parallel
h5committest
Diffstat (limited to 'src/H5FDmpio.c')
-rw-r--r-- | src/H5FDmpio.c | 742 |
1 files changed, 176 insertions, 566 deletions
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index 0a62158..6c5d50e 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -30,7 +30,7 @@ #include "H5Eprivate.h" /* Error handling */ #include "H5Fprivate.h" /* File access */ #include "H5FDprivate.h" /* File drivers */ -#include "H5FDmpio.h" /* MPI I/O file driver */ +#include "H5FDmpi.h" /* MPI-based file drivers */ #include "H5Iprivate.h" /* IDs */ #include "H5MMprivate.h" /* Memory management */ #include "H5Pprivate.h" /* Property lists */ @@ -57,14 +57,13 @@ typedef struct H5FD_mpio_t { MPI_Info info; /*file information */ int mpi_rank; /* This process's rank */ int mpi_size; /* Total number of processes */ + hbool_t truncate_pending; /* Whether a file truncation is pending */ haddr_t eof; /*end-of-file marker */ haddr_t eoa; /*end-of-address marker */ haddr_t last_eoa; /* Last known end-of-address marker */ } H5FD_mpio_t; -/* Prototypes */ -static haddr_t H5FD_mpio_MPIOff_to_haddr(MPI_Offset mpi_off); -static herr_t H5FD_mpio_haddr_to_MPIOff(haddr_t addr, MPI_Offset *mpi_off/*out*/); +/* Private Prototypes */ /* Callbacks */ static void *H5FD_mpio_fapl_get(H5FD_t *_file); @@ -83,9 +82,9 @@ static herr_t H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, hadd static herr_t H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, const void *buf); static herr_t H5FD_mpio_flush(H5FD_t *_file, hid_t dxpl_id, unsigned closing); -static herr_t H5FD_mpio_comm_info_dup(MPI_Comm comm, MPI_Info info, - MPI_Comm *comm_new, MPI_Info *info_new); -static herr_t H5FD_mpio_comm_info_free(MPI_Comm *comm, MPI_Info *info); +static int H5FD_mpio_mpi_rank(const H5FD_t *_file); +static int H5FD_mpio_mpi_size(const H5FD_t *_file); +static MPI_Comm H5FD_mpio_communicator(const H5FD_t *_file); /* MPIO-specific file access properties */ typedef struct H5FD_mpio_fapl_t { @@ -94,7 +93,8 @@ typedef struct H5FD_mpio_fapl_t { } H5FD_mpio_fapl_t; /* The MPIO file driver information */ -static const H5FD_class_t H5FD_mpio_g = { +static const H5FD_class_mpi_t H5FD_mpio_g = { + { /* Start of superclass information */ "mpio", /*name */ HADDR_MAX, /*maxaddr */ H5F_CLOSE_SEMI, /* fc_degree */ @@ -124,6 +124,10 @@ static const H5FD_class_t H5FD_mpio_g = { NULL, /*lock */ NULL, /*unlock */ H5FD_FLMAP_SINGLE /*fl_map */ + }, /* End of superclass information */ + H5FD_mpio_mpi_rank, /*get_rank */ + H5FD_mpio_mpi_size, /*get_size */ + H5FD_mpio_communicator /*get_comm */ }; #ifdef H5FDmpio_DEBUG @@ -146,29 +150,10 @@ static int H5FD_mpio_Debug[256] = 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; #endif -#ifdef OLD_METADATA_WRITE -/* Global var to allow elimination of redundant metadata writes - * to be controlled by the value of an environment variable. */ -/* Use the elimination by default unless this is the Intel Red machine */ -#ifndef __PUMAGON__ -hbool_t H5_mpi_1_metawrite_g = TRUE; -#else -hbool_t H5_mpi_1_metawrite_g = FALSE; -#endif -#endif /* OLD_METADATA_WRITE */ - /* Interface initialization */ #define INTERFACE_INIT H5FD_mpio_init static int interface_initialize_g = 0; -/* ======== Temporary, Local data transfer properties ======== */ -/* Definitions for memory MPI type property */ -#define H5FD_MPIO_XFER_MEM_MPI_TYPE_NAME "H5FD_mpio_mem_mpi_type" -#define H5FD_MPIO_XFER_MEM_MPI_TYPE_SIZE sizeof(MPI_Datatype) -/* Definitions for file MPI type property */ -#define H5FD_MPIO_XFER_FILE_MPI_TYPE_NAME "H5FD_mpio_file_mpi_type" -#define H5FD_MPIO_XFER_FILE_MPI_TYPE_SIZE sizeof(MPI_Datatype) - /*------------------------------------------------------------------------- * Function: H5FD_mpio_init @@ -198,7 +183,7 @@ H5FD_mpio_init(void) FUNC_ENTER_NOAPI(H5FD_mpio_init, FAIL) if (H5I_VFL!=H5Iget_type(H5FD_MPIO_g)) - H5FD_MPIO_g = H5FDregister(&H5FD_mpio_g); + H5FD_MPIO_g = H5FD_register((const H5FD_class_t *)&H5FD_mpio_g,sizeof(H5FD_class_mpi_t)); #ifdef H5FDmpio_DEBUG if (!H5FD_mpio_Debug_inited) @@ -504,304 +489,6 @@ done: /*------------------------------------------------------------------------- - * Function: H5FD_mpio_communicator - * - * Purpose: Returns the MPI communicator for the file. - * - * Return: Success: The communicator - * - * Failure: NULL - * - * Programmer: Robb Matzke - * Monday, August 9, 1999 - * - * Modifications: - * - *------------------------------------------------------------------------- - */ -MPI_Comm -H5FD_mpio_communicator(H5FD_t *_file) -{ - H5FD_mpio_t *file = (H5FD_mpio_t*)_file; - MPI_Comm ret_value; /* Return value */ - - FUNC_ENTER_NOAPI(H5FD_mpio_communicator, MPI_COMM_NULL) - - assert(file); - assert(H5FD_MPIO==file->pub.driver_id); - - /* Set return value */ - ret_value=file->comm; - -done: - FUNC_LEAVE_NOAPI(ret_value) -} - - -/*------------------------------------------------------------------------- - * Function: H5FD_mpio_mpi_rank - * - * Purpose: Returns the MPI rank for a process - * - * Return: Success: non-negative - * Failure: negative - * - * Programmer: Quincey Koziol - * Thursday, May 16, 2002 - * - * Modifications: - * - *------------------------------------------------------------------------- - */ -int -H5FD_mpio_mpi_rank(H5FD_t *_file) -{ - H5FD_mpio_t *file = (H5FD_mpio_t*)_file; - int ret_value; /* Return value */ - - FUNC_ENTER_NOAPI(H5FD_mpio_mpi_rank, FAIL) - - assert(file); - assert(H5FD_MPIO==file->pub.driver_id); - - /* Set return value */ - ret_value=file->mpi_rank; - -done: - FUNC_LEAVE_NOAPI(ret_value) -} /* end H5FD_mpio_mpi_rank() */ - - -/*------------------------------------------------------------------------- - * Function: H5FD_mpio_mpi_size - * - * Purpose: Returns the number of MPI processes - * - * Return: Success: non-negative - * Failure: negative - * - * Programmer: Quincey Koziol - * Thursday, May 16, 2002 - * - * Modifications: - * - *------------------------------------------------------------------------- - */ -int -H5FD_mpio_mpi_size(H5FD_t *_file) -{ - H5FD_mpio_t *file = (H5FD_mpio_t*)_file; - int ret_value; /* Return value */ - - FUNC_ENTER_NOAPI(H5FD_mpio_mpi_size, FAIL) - - assert(file); - assert(H5FD_MPIO==file->pub.driver_id); - - /* Set return value */ - ret_value=file->mpi_size; - -done: - FUNC_LEAVE_NOAPI(ret_value) -} /* end H5FD_mpio_mpi_size() */ - - -/*------------------------------------------------------------------------- - * Function: H5FD_mpio_setup - * - * Purpose: Set the buffer type BTYPE, file type FTYPE for a data - * transfer. Also request a MPI type transfer. - * - * Return: Success: 0 - * Failure: -1 - * - * Programmer: Robb Matzke - * Monday, August 9, 1999 - * - * Modifications: - * - * Quincey Koziol - 2002/06/17 - * Removed 'disp' parameter, read & write routines will use - * the address of the dataset in MPI_File_set_view() calls, as - * necessary. - * - * Quincey Koziol - 2002/06/17 - * Changed to set temporary properties in a dxpl, instead of - * flags in the file struct, which will make this more threadsafe. - * - *------------------------------------------------------------------------- - */ -herr_t -H5FD_mpio_setup(hid_t dxpl_id, MPI_Datatype btype, MPI_Datatype ftype) -{ - H5P_genplist_t *plist; /* Property list pointer */ - herr_t ret_value=SUCCEED; /* Return value */ - - FUNC_ENTER_NOAPI(H5FD_mpio_setup, FAIL) - - /* Check arguments */ - if(NULL == (plist = H5P_object_verify(dxpl_id,H5P_DATASET_XFER))) - HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dataset transfer list") - - /* Set buffer MPI type */ - if(H5P_insert(plist,H5FD_MPIO_XFER_MEM_MPI_TYPE_NAME,H5FD_MPIO_XFER_MEM_MPI_TYPE_SIZE,&btype,NULL,NULL,NULL,NULL,NULL,NULL)<0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't insert MPI-I/O property") - - /* Set file MPI type */ - if(H5P_insert(plist,H5FD_MPIO_XFER_FILE_MPI_TYPE_NAME,H5FD_MPIO_XFER_FILE_MPI_TYPE_SIZE,&ftype,NULL,NULL,NULL,NULL,NULL,NULL)<0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't insert MPI-I/O property") - -done: - FUNC_LEAVE_NOAPI(ret_value) -} - - -/*------------------------------------------------------------------------- - * Function: H5FD_mpio_teardown - * - * Purpose: Remove the temporary MPI-I/O properties from dxpl. - * - * Return: Success: Non-negative - * Failure: Negative - * - * Programmer: Quincey Koziol - * Monday, June 17, 2002 - * - * Modifications: - * - *------------------------------------------------------------------------- - */ -herr_t -H5FD_mpio_teardown(hid_t dxpl_id) -{ - H5P_genplist_t *plist; /* Property list pointer */ - herr_t ret_value=SUCCEED; /* Return value */ - - FUNC_ENTER_NOAPI(H5FD_mpio_teardown, FAIL) - - /* Check arguments */ - if(NULL == (plist = H5P_object_verify(dxpl_id,H5P_DATASET_XFER))) - HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dataset transfer list") - - /* Remove buffer MPI type */ - if(H5P_remove(dxpl_id,plist,H5FD_MPIO_XFER_MEM_MPI_TYPE_NAME)<0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTDELETE, FAIL, "can't remove MPI-I/O property") - - /* Remove file MPI type */ - if(H5P_remove(dxpl_id,plist,H5FD_MPIO_XFER_FILE_MPI_TYPE_NAME)<0) - HGOTO_ERROR(H5E_PLIST, H5E_CANTDELETE, FAIL, "can't remove MPI-I/O property") - -done: - FUNC_LEAVE_NOAPI(ret_value) -} - - -/*------------------------------------------------------------------------- - * Function: H5FD_mpio_wait_for_left_neighbor - * - * Purpose: Blocks until (empty) msg is received from immediately - * lower-rank neighbor. In conjunction with - * H5FD_mpio_signal_right_neighbor, useful for enforcing - * 1-process-at-at-time access to critical regions to avoid race - * conditions (though it is overkill to require that the - * processes be allowed to proceed strictly in order of their - * rank). - * - * Note: This routine doesn't read or write any file, just performs - * interprocess coordination. It really should reside in a - * separate package of such routines. - * - * Return: Success: 0 - * Failure: -1 - * - * Programmer: rky - * 19981207 - * - * Modifications: - * Robb Matzke, 1999-08-09 - * Modified to work with the virtual file layer. - *------------------------------------------------------------------------- - */ -herr_t -H5FD_mpio_wait_for_left_neighbor(H5FD_t *_file) -{ - H5FD_mpio_t *file = (H5FD_mpio_t*)_file; - char msgbuf[1]; - MPI_Status rcvstat; - int mpi_code; /* mpi return code */ - herr_t ret_value=SUCCEED; /* Return value */ - - FUNC_ENTER_NOAPI(H5FD_mpio_wait_for_left_neighbor, FAIL) - - assert(file); - assert(H5FD_MPIO==file->pub.driver_id); - - /* Portably initialize MPI status variable */ - HDmemset(&rcvstat,0,sizeof(MPI_Status)); - - /* p0 has no left neighbor; all other procs wait for msg */ - if (file->mpi_rank != 0) { - if (MPI_SUCCESS != (mpi_code=MPI_Recv( &msgbuf, 1, MPI_CHAR, - file->mpi_rank-1, MPI_ANY_TAG, file->comm, &rcvstat ))) - HMPI_GOTO_ERROR(FAIL, "MPI_Recv failed", mpi_code) - } - -done: - FUNC_LEAVE_NOAPI(ret_value) -} - - -/*------------------------------------------------------------------------- - * Function: H5FD_mpio_signal_right_neighbor - * - * Purpose: Blocks until (empty) msg is received from immediately - * lower-rank neighbor. In conjunction with - * H5FD_mpio_wait_for_left_neighbor, useful for enforcing - * 1-process-at-at-time access to critical regions to avoid race - * conditions (though it is overkill to require that the - * processes be allowed to proceed strictly in order of their - * rank). - * - * Note: This routine doesn't read or write any file, just performs - * interprocess coordination. It really should reside in a - * separate package of such routines. - * - * Return: Success: 0 - * Failure: -1 - * - * Programmer: rky - * 19981207 - * - * Modifications: - * Robb Matzke, 1999-08-09 - * Modified to work with the virtual file layer. - *------------------------------------------------------------------------- - */ -herr_t -H5FD_mpio_signal_right_neighbor(H5FD_t *_file) -{ - H5FD_mpio_t *file = (H5FD_mpio_t*)_file; - char msgbuf[1]; - int mpi_code; /* mpi return code */ - herr_t ret_value=SUCCEED; /* Return value */ - - FUNC_ENTER_NOAPI(H5FD_mpio_signal_right_neighbor, FAIL) - - assert(file); - assert(H5FD_MPIO==file->pub.driver_id); - - if (file->mpi_rank != (file->mpi_size-1)) { - if (MPI_SUCCESS != (mpi_code=MPI_Send(&msgbuf, 0/*empty msg*/, MPI_CHAR, - file->mpi_rank+1, 0, file->comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code) - } - -done: - FUNC_LEAVE_NOAPI(ret_value) -} - - -/*------------------------------------------------------------------------- * Function: H5FD_mpio_fapl_get * * Purpose: Returns a file access property list which could be used to @@ -837,7 +524,7 @@ H5FD_mpio_fapl_get(H5FD_t *_file) HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed") /* Duplicate communicator and Info object. */ - if (FAIL==H5FD_mpio_comm_info_dup(file->comm, file->info, + if (FAIL==H5FD_mpi_comm_info_dup(file->comm, file->info, &fa->comm, &fa->info)) HGOTO_ERROR(H5E_INTERNAL, H5E_CANTCOPY, NULL, "Communicator/Info duplicate failed") @@ -885,7 +572,7 @@ fprintf(stderr, "enter H5FD_mpio_fapl_copy\n"); HDmemcpy(new_fa, old_fa, sizeof(H5FD_mpio_fapl_t)); /* Duplicate communicator and Info object. */ - if (FAIL==H5FD_mpio_comm_info_dup(old_fa->comm, old_fa->info, + if (FAIL==H5FD_mpi_comm_info_dup(old_fa->comm, old_fa->info, &new_fa->comm, &new_fa->info)) HGOTO_ERROR(H5E_INTERNAL, H5E_CANTCOPY, NULL, "Communicator/Info duplicate failed") ret_value = new_fa; @@ -936,7 +623,7 @@ fprintf(stderr, "in H5FD_mpio_fapl_free\n"); /* Free the internal communicator and INFO object */ assert(MPI_COMM_NULL!=fa->comm); - H5FD_mpio_comm_info_free(&fa->comm, &fa->info); + H5FD_mpi_comm_info_free(&fa->comm, &fa->info); H5MM_xfree(fa); done: @@ -1008,10 +695,9 @@ H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id, const H5FD_mpio_fapl_t *fa=NULL; H5FD_mpio_fapl_t _fa; H5P_genplist_t *plist; /* Property list pointer */ - H5FD_t *ret_value; /* Return value */ MPI_Comm comm_dup=MPI_COMM_NULL; MPI_Info info_dup=MPI_INFO_NULL; - + H5FD_t *ret_value; /* Return value */ FUNC_ENTER_NOAPI(H5FD_mpio_open, NULL) @@ -1035,7 +721,7 @@ H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id, } /* Duplicate communicator and Info object for use by this file. */ - if (FAIL==H5FD_mpio_comm_info_dup(fa->comm, fa->info, &comm_dup, &info_dup)) + if (FAIL==H5FD_mpi_comm_info_dup(fa->comm, fa->info, &comm_dup, &info_dup)) HGOTO_ERROR(H5E_INTERNAL, H5E_CANTCOPY, NULL, "Communicator/Info duplicate failed") /* convert HDF5 flags to MPI-IO flags */ @@ -1065,8 +751,7 @@ H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id, #endif /*OKAY: CAST DISCARDS CONST*/ - mpi_code=MPI_File_open(comm_dup, (char*)name, mpi_amode, info_dup, &fh); - if (MPI_SUCCESS != mpi_code) + if (MPI_SUCCESS != (mpi_code=MPI_File_open(comm_dup, (char*)name, mpi_amode, info_dup, &fh))) HMPI_GOTO_ERROR(NULL, "MPI_File_open failed", mpi_code) file_opened=1; @@ -1076,40 +761,64 @@ H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id, if (MPI_SUCCESS != (mpi_code=MPI_Comm_size (comm_dup, &mpi_size))) HMPI_GOTO_ERROR(NULL, "MPI_Comm_size failed", mpi_code) -/* Following changes in handling file-truncation made be rkyates and ppweidhaas, sep 99 */ - - /* Only processor p0 will get the filesize and broadcast it. */ - if (mpi_rank == 0) { - /* Get current file size */ - if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(fh, &size))) - HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code) - } - - /* Broadcast file-size */ - if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&size, sizeof(MPI_Offset), MPI_BYTE, 0, comm_dup))) - HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code) - - /* Only if size > 0, truncate the file - if requested */ - if (size && (flags & H5F_ACC_TRUNC)) { - if (MPI_SUCCESS != (mpi_code=MPI_File_set_size(fh, (MPI_Offset)0))) - HMPI_GOTO_ERROR(NULL, "MPI_File_set_size failed", mpi_code) - - /* Don't let any proc return until all have truncated the file. */ - if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(comm_dup))) - HMPI_GOTO_ERROR(NULL, "MPI_Barrier failed", mpi_code) - size = 0; - } - /* Build the return value and initialize it */ if (NULL==(file=H5MM_calloc(sizeof(H5FD_mpio_t)))) HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed") - file->f = fh; file->comm = comm_dup; file->info = info_dup; file->mpi_rank = mpi_rank; file->mpi_size = mpi_size; - file->eof = H5FD_mpio_MPIOff_to_haddr(size); + + /* Determine if the file should be truncated */ + if(flags & H5F_ACC_TRUNC) { +#ifdef H5_MPI_FILE_SET_SIZE_BIG + /* Indicate that a 'truncate' operation is pending on the file */ + file->truncate_pending=TRUE; + + /* File is treated as zero size now */ + size=0; +#else /* H5_MPI_FILE_SET_SIZE_BIG */ + /* Only processor p0 will get the filesize and broadcast it. */ + if (mpi_rank == 0) { + /* Get current file size */ + if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(fh, &size))) + HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code) + } /* end if */ + + /* Broadcast file-size */ + if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&size, sizeof(MPI_Offset), MPI_BYTE, 0, comm_dup))) + HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code) + + /* Only truncate the file if it is non-zero length */ + if(size) { + if (MPI_SUCCESS != (mpi_code=MPI_File_set_size(fh, (MPI_Offset)0))) + HMPI_GOTO_ERROR(NULL, "MPI_File_set_size failed", mpi_code) + + /* Don't let any proc return until all have truncated the file. */ + if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(comm_dup))) + HMPI_GOTO_ERROR(NULL, "MPI_Barrier failed", mpi_code) + + /* File is zero size now */ + size = 0; + } /* end if */ +#endif /* H5_MPI_FILE_SET_SIZE_BIG */ + } /* end if */ + else { + /* Only processor p0 will get the filesize and broadcast it. */ + if (mpi_rank == 0) { + /* Get current file size */ + if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(fh, &size))) + HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code) + } /* end if */ + + /* Broadcast file size */ + if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&size, sizeof(MPI_Offset), MPI_BYTE, 0, comm_dup))) + HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code) + } /* end else */ + + /* Set the size of the file (from library's perspective) */ + file->eof = H5FD_mpi_MPIOff_to_haddr(size); /* Set return value */ ret_value=(H5FD_t*)file; @@ -1161,8 +870,8 @@ static herr_t H5FD_mpio_close(H5FD_t *_file) { H5FD_mpio_t *file = (H5FD_mpio_t*)_file; - int mpi_code; /* mpi return code */ - herr_t ret_value=SUCCEED; /* Return value */ + int mpi_code; /* MPI return code */ + herr_t ret_value=SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(H5FD_mpio_close, FAIL) @@ -1172,13 +881,32 @@ H5FD_mpio_close(H5FD_t *_file) #endif assert(file); assert(H5FD_MPIO==file->pub.driver_id); + assert(file->eoa>0); + +#ifdef H5_MPI_FILE_SET_SIZE_BIG + /* Check if we should truncate the file */ + if(file->truncate_pending) { + MPI_Offset mpi_off; /* Offset to write test data at */ + + /* Some numeric conversions */ + if (H5FD_mpi_haddr_to_MPIOff(file->eoa, &mpi_off)<0) + HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off") + + /* Truncate the extra data off the end */ + /* (Don't worry about a barrier after the call, since we are closing) */ + if (MPI_SUCCESS != (mpi_code=MPI_File_set_size(file->f, mpi_off))) + HMPI_DONE_ERROR(NULL, "MPI_File_set_size failed", mpi_code) + } /* end if */ +#else /* H5_MPI_FILE_SET_SIZE_BIG */ + assert(!file->truncate_pending); +#endif /* H5_MPI_FILE_SET_SIZE_BIG */ /* MPI_File_close sets argument to MPI_FILE_NULL */ if (MPI_SUCCESS != (mpi_code=MPI_File_close(&(file->f)/*in,out*/))) HMPI_GOTO_ERROR(FAIL, "MPI_File_close failed", mpi_code) /* Clean up other stuff */ - H5FD_mpio_comm_info_free(&file->comm, &file->info); + H5FD_mpi_comm_info_free(&file->comm, &file->info); H5MM_xfree(file); done: @@ -1490,7 +1218,7 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add HDmemset(&mpi_stat,0,sizeof(MPI_Status)); /* some numeric conversions */ - if (H5FD_mpio_haddr_to_MPIOff(addr, &mpi_off/*out*/)<0) + if (H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off/*out*/)<0) HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off") size_i = (int)size; if ((hsize_t)size_i != size) @@ -1525,16 +1253,16 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add use_view_this_time=TRUE; /* prepare for a full-blown xfer using btype, ftype, and disp */ - if(H5P_get(plist,H5FD_MPIO_XFER_MEM_MPI_TYPE_NAME,&buf_type)<0) + if(H5P_get(plist,H5FD_MPI_XFER_MEM_MPI_TYPE_NAME,&buf_type)<0) HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property") - if(H5P_get(plist,H5FD_MPIO_XFER_FILE_MPI_TYPE_NAME,&file_type)<0) + if(H5P_get(plist,H5FD_MPI_XFER_FILE_MPI_TYPE_NAME,&file_type)<0) HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property") /* * Set the file view when we are using MPI derived types */ /*OKAY: CAST DISCARDS CONST QUALIFIER*/ - if (MPI_SUCCESS != (mpi_code=MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, (char*)"native", file->info))) + if (MPI_SUCCESS != (mpi_code=MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, H5FD_mpi_native_g, file->info))) HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) /* When using types, use the address as the displacement for @@ -1557,7 +1285,7 @@ H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t dxpl_id, haddr_t add * Reset the file view when we used MPI derived types */ /*OKAY: CAST DISCARDS CONST QUALIFIER*/ - if (MPI_SUCCESS != (mpi_code=MPI_File_set_view(file->f, 0, MPI_BYTE, MPI_BYTE, (char*)"native", file->info))) + if (MPI_SUCCESS != (mpi_code=MPI_File_set_view(file->f, 0, MPI_BYTE, MPI_BYTE, H5FD_mpi_native_g, file->info))) HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) } else { if (MPI_SUCCESS!= (mpi_code=MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat))) @@ -1754,7 +1482,7 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, HDmemset(&mpi_stat,0,sizeof(MPI_Status)); /* some numeric conversions */ - if (H5FD_mpio_haddr_to_MPIOff(addr, &mpi_off)<0) + if (H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off)<0) HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off") size_i = (int)size; if ((hsize_t)size_i != size) @@ -1789,16 +1517,16 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, use_view_this_time=TRUE; /* prepare for a full-blown xfer using btype, ftype, and disp */ - if(H5P_get(plist,H5FD_MPIO_XFER_MEM_MPI_TYPE_NAME,&buf_type)<0) + if(H5P_get(plist,H5FD_MPI_XFER_MEM_MPI_TYPE_NAME,&buf_type)<0) HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property") - if(H5P_get(plist,H5FD_MPIO_XFER_FILE_MPI_TYPE_NAME,&file_type)<0) + if(H5P_get(plist,H5FD_MPI_XFER_FILE_MPI_TYPE_NAME,&file_type)<0) HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property") /* * Set the file view when we are using MPI derived types */ /*OKAY: CAST DISCARDS CONST QUALIFIER*/ - if (MPI_SUCCESS != (mpi_code=MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, (char*)"native", file->info))) + if (MPI_SUCCESS != (mpi_code=MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, H5FD_mpi_native_g, file->info))) HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) /* When using types, use the address as the displacement for @@ -1828,36 +1556,17 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm))) HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code) -#ifdef OLD_METADATA_WRITE - /* Only p<round> will do the actual write if all procs in comm write same metadata */ - if (H5_mpi_1_metawrite_g) { - if (file->mpi_rank != H5_PAR_META_WRITE) { + /* Only one process will do the actual write if all procs in comm write same metadata */ + if (file->mpi_rank != H5_PAR_META_WRITE) { #ifdef H5FDmpio_DEBUG - if (H5FD_mpio_Debug[(int)'w']) { - fprintf(stdout, - " proc %d: in H5FD_mpio_write (write omitted)\n", - file->mpi_rank ); - } -#endif - HGOTO_DONE(SUCCEED) /* skip the actual write */ + if (H5FD_mpio_Debug[(int)'w']) { + fprintf(stdout, + " proc %d: in H5FD_mpio_write (write omitted)\n", + file->mpi_rank ); } - } -#else /* OLD_METADATA_WRITE */ - /* Remember that views are used */ - use_view_this_time=TRUE; - - /* - * Set the file view when we are using MPI derived types - */ - /*OKAY: CAST DISCARDS CONST QUALIFIER*/ - if (MPI_SUCCESS != (mpi_code=MPI_File_set_view(file->f, mpi_off, MPI_BYTE, MPI_BYTE, (char*)"native", file->info))) - HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) - - /* When using types, use the address as the displacement for - * MPI_File_set_view and reset the address for the read to zero - */ - mpi_off=0; -#endif /* OLD_METADATA_WRITE */ +#endif + HGOTO_DONE(SUCCEED) /* skip the actual write */ + } /* end if */ } /* end if */ /* Write the data. */ @@ -1874,7 +1583,7 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, * Reset the file view when we used MPI derived types */ /*OKAY: CAST DISCARDS CONST QUALIFIER*/ - if (MPI_SUCCESS != (mpi_code=MPI_File_set_view(file->f, 0, MPI_BYTE, MPI_BYTE, (char*)"native", file->info))) + if (MPI_SUCCESS != (mpi_code=MPI_File_set_view(file->f, 0, MPI_BYTE, MPI_BYTE, H5FD_mpi_native_g, file->info))) HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code) } else { /*OKAY: CAST DISCARDS CONST QUALIFIER*/ @@ -1906,20 +1615,17 @@ H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, file->eof = HADDR_UNDEF; done: -#ifdef OLD_METADATA_WRITE - /* if only p<round> writes, need to broadcast the ret_value to other processes */ - if ((type!=H5FD_MEM_DRAW) && H5_mpi_1_metawrite_g) { + /* if only one process writes, need to broadcast the ret_value to other processes */ + if (type!=H5FD_MEM_DRAW) { if (MPI_SUCCESS != (mpi_code=MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, H5_PAR_META_WRITE, file->comm))) HMPI_DONE_ERROR(FAIL, "MPI_Bcast failed", mpi_code) } /* end if */ -#endif /* OLD_METADATA_WRITE */ #ifdef H5FDmpio_DEBUG if (H5FD_mpio_Debug[(int)'t']) fprintf(stdout, "proc %d: Leaving H5FD_mpio_write with ret_value=%d\n", file->mpi_rank, ret_value ); #endif - FUNC_LEAVE_NOAPI(ret_value) } @@ -1960,10 +1666,6 @@ H5FD_mpio_flush(H5FD_t *_file, hid_t UNUSED dxpl_id, unsigned closing) int mpi_code; /* mpi return code */ MPI_Offset mpi_off; herr_t ret_value=SUCCEED; -#ifndef H5_MPI_FILE_SET_SIZE_BIG - uint8_t byte=0; - MPI_Status mpi_stat; -#endif /* H5_MPI_FILE_SET_SIZE_BIG */ FUNC_ENTER_NOAPI(H5FD_mpio_flush, FAIL) @@ -1974,26 +1676,30 @@ H5FD_mpio_flush(H5FD_t *_file, hid_t UNUSED dxpl_id, unsigned closing) assert(file); assert(H5FD_MPIO==file->pub.driver_id); -#ifndef H5_MPI_FILE_SET_SIZE_BIG - /* Portably initialize MPI status variable */ - HDmemset(&mpi_stat,0,sizeof(MPI_Status)); -#endif /* H5_MPI_FILE_SET_SIZE_BIG */ - /* Extend the file to make sure it's large enough, then sync. * Unfortunately, keeping track of EOF is an expensive operation, so * we can't just check whether EOF<EOA like with other drivers. * Therefore we'll just read the byte at EOA-1 and then write it back. */ if(file->eoa>file->last_eoa) { #ifdef H5_MPI_FILE_SET_SIZE_BIG - if (H5FD_mpio_haddr_to_MPIOff(file->eoa, &mpi_off)<0) + if (H5FD_mpi_haddr_to_MPIOff(file->eoa, &mpi_off)<0) HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset") /* Extend the file's size */ if (MPI_SUCCESS != (mpi_code=MPI_File_set_size(file->f, mpi_off))) HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code) + + /* File does not need to be truncated now */ + file->truncate_pending=FALSE; #else /* H5_MPI_FILE_SET_SIZE_BIG */ if (0==file->mpi_rank) { - if (H5FD_mpio_haddr_to_MPIOff(file->eoa-1, &mpi_off)<0) + uint8_t byte=0; + MPI_Status mpi_stat; + + /* Portably initialize MPI status variable */ + HDmemset(&mpi_stat,0,sizeof(MPI_Status)); + + if (H5FD_mpi_haddr_to_MPIOff(file->eoa-1, &mpi_off)<0) HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset") if (MPI_SUCCESS != (mpi_code=MPI_File_read_at(file->f, mpi_off, &byte, 1, MPI_BYTE, &mpi_stat))) HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code) @@ -2032,201 +1738,105 @@ done: /*------------------------------------------------------------------------- - * Function: H5FD_mpio_MPIOff_to_haddr + * Function: H5FD_mpio_mpi_rank * - * Purpose: Convert an MPI_Offset value to haddr_t. + * Purpose: Returns the MPI rank for a process * - * Return: Success: The haddr_t equivalent of the MPI_OFF - * argument. - * - * Failure: HADDR_UNDEF + * Return: Success: non-negative + * Failure: negative * - * Programmer: Unknown - * January 30, 1998 + * Programmer: Quincey Koziol + * Thursday, May 16, 2002 * * Modifications: - * Robb Matzke, 1999-04-23 - * An error is reported for address overflows. The ADDR output - * argument is optional. * - * Robb Matzke, 1999-08-06 - * Modified to work with the virtual file layer. - *------------------------------------------------------------------------- + *------------------------------------------------------------------------- */ -static haddr_t -H5FD_mpio_MPIOff_to_haddr(MPI_Offset mpi_off) +static int +H5FD_mpio_mpi_rank(const H5FD_t *_file) { - haddr_t ret_value=HADDR_UNDEF; + const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file; + int ret_value; /* Return value */ + + FUNC_ENTER_NOAPI(H5FD_mpio_mpi_rank, FAIL) - FUNC_ENTER_NOAPI_NOINIT_NOFUNC(H5FD_mpio_MPIOff_to_haddr) + assert(file); + assert(H5FD_MPIO==file->pub.driver_id); - if (mpi_off != (MPI_Offset)(haddr_t)mpi_off) - ret_value=HADDR_UNDEF; - else - ret_value=(haddr_t)mpi_off; + /* Set return value */ + ret_value=file->mpi_rank; +done: FUNC_LEAVE_NOAPI(ret_value) -} +} /* end H5FD_mpio_mpi_rank() */ /*------------------------------------------------------------------------- - * Function: H5FD_mpio_haddr_to_MPIOff - * - * Purpose: Convert an haddr_t value to MPI_Offset. + * Function: H5FD_mpio_mpi_size * - * Return: Success: Non-negative, the MPI_OFF argument contains - * the converted value. + * Purpose: Returns the number of MPI processes * - * Failure: Negative, MPI_OFF is undefined. + * Return: Success: non-negative + * Failure: negative * - * Programmer: Unknown - * January 30, 1998 + * Programmer: Quincey Koziol + * Thursday, May 16, 2002 * * Modifications: - * Robb Matzke, 1999-04-23 - * An error is reported for address overflows. The ADDR output - * argument is optional. - * - * Robb Matzke, 1999-07-28 - * The ADDR argument is passed by value. * - * Robb Matzke, 1999-08-06 - * Modified to work with the virtual file layer. *------------------------------------------------------------------------- */ -static herr_t -H5FD_mpio_haddr_to_MPIOff(haddr_t addr, MPI_Offset *mpi_off/*out*/) +static int +H5FD_mpio_mpi_size(const H5FD_t *_file) { - herr_t ret_value=FAIL; - - FUNC_ENTER_NOAPI_NOINIT_NOFUNC(H5FD_mpio_haddr_to_MPIOff) - - if (mpi_off) - *mpi_off = (MPI_Offset)addr; - if (addr != (haddr_t)(MPI_Offset)addr) - ret_value=FAIL; - else - ret_value=SUCCEED; - - FUNC_LEAVE_NOAPI(ret_value) -} + const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file; + int ret_value; /* Return value */ - -/*------------------------------------------------------------------------- - * Function: H5FD_mpio_comm_info_dup - * - * Purpose: Make duplicates of communicator and Info object. - * If the Info object is in fact MPI_INFO_NULL, no duplicate - * is made but the same value assigned to the new Info object - * handle. - * - * Return: Success: Non-negative. The new communicator and Info - * object handles are returned via comm_new and - * info_new pointers. - * - * Failure: Negative. - * - * Programmer: Albert Cheng - * Jan 8, 2003 - * - * Modifications: - *------------------------------------------------------------------------- - */ -static herr_t -H5FD_mpio_comm_info_dup(MPI_Comm comm, MPI_Info info, MPI_Comm *comm_new, MPI_Info *info_new) -{ - herr_t ret_value=SUCCEED; - MPI_Comm comm_dup=MPI_COMM_NULL; - MPI_Info info_dup=MPI_INFO_NULL; - int mpi_code; - - FUNC_ENTER_NOAPI(H5FD_mpio_comm_info_dup, FAIL) + FUNC_ENTER_NOAPI(H5FD_mpio_mpi_size, FAIL) -#ifdef H5FDmpio_DEBUG -if (H5FD_mpio_Debug[(int)'t']) -fprintf(stderr, "In H5FD_mpio_comm_info_dup: argument comm/info = %d/%ld\n", comm, (long)info); -#endif - /* Check arguments */ - if (MPI_COMM_NULL == comm) - HGOTO_ERROR(H5E_INTERNAL, H5E_BADVALUE, FAIL, "not a valid argument") - if (!comm_new || !info_new) - HGOTO_ERROR(H5E_INTERNAL, H5E_BADVALUE, FAIL, "bad pointers") - - /* Dup them. Using temporary variables for error recovery cleanup. */ - if (MPI_SUCCESS != (mpi_code=MPI_Comm_dup(comm, &comm_dup))) - HMPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code) - if (MPI_INFO_NULL != info){ - if (MPI_SUCCESS != (mpi_code=MPI_Info_dup(info, &info_dup))) - HMPI_GOTO_ERROR(FAIL, "MPI_Info_dup failed", mpi_code) - }else{ - /* No dup, just copy it. */ - info_dup = info; - } + assert(file); + assert(H5FD_MPIO==file->pub.driver_id); - /* copy them to the return arguments */ - *comm_new = comm_dup; - *info_new = info_dup; + /* Set return value */ + ret_value=file->mpi_size; done: - if (FAIL == ret_value){ - /* need to free anything created here */ - if (MPI_COMM_NULL != comm_dup) - MPI_Comm_free(&comm_dup); - if (MPI_INFO_NULL != info_dup) - MPI_Info_free(&info_dup); - } - -#ifdef H5FDmpio_DEBUG -if (H5FD_mpio_Debug[(int)'t']) -fprintf(stderr, "Leaving H5FD_mpio_comm_info_dup\n"); -#endif FUNC_LEAVE_NOAPI(ret_value) -} +} /* end H5FD_mpio_mpi_size() */ /*------------------------------------------------------------------------- - * Function: H5FD_mpio_comm_info_free + * Function: H5FD_mpio_communicator * - * Purpose: Free the communicator and Info object. - * If comm or info is in fact MPI_COMM_NULL or MPI_INFO_NULL - * respectively, no action occurs to it. + * Purpose: Returns the MPI communicator for the file. * - * Return: Success: Non-negative. The values the pointers refer - * to will be set to the corresponding NULL - * handles. + * Return: Success: The communicator * - * Failure: Negative. + * Failure: NULL * - * Programmer: Albert Cheng - * Jan 8, 2003 + * Programmer: Robb Matzke + * Monday, August 9, 1999 * * Modifications: + * *------------------------------------------------------------------------- */ -static herr_t -H5FD_mpio_comm_info_free(MPI_Comm *comm, MPI_Info *info) +static MPI_Comm +H5FD_mpio_communicator(const H5FD_t *_file) { - herr_t ret_value=SUCCEED; - FUNC_ENTER_NOAPI(H5FD_mpio_comm_info_free, FAIL) + const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file; + MPI_Comm ret_value; /* Return value */ -#ifdef H5FDmpio_DEBUG -if (H5FD_mpio_Debug[(int)'t']) -fprintf(stderr, "in H5FD_mpio_comm_info_free\n"); -#endif - /* Check arguments */ - if (!comm || !info) - HGOTO_ERROR(H5E_INTERNAL, H5E_BADVALUE, FAIL, "not a valid argument") + FUNC_ENTER_NOAPI(H5FD_mpio_communicator, MPI_COMM_NULL) + + assert(file); + assert(H5FD_MPIO==file->pub.driver_id); - if (MPI_COMM_NULL != *comm) - MPI_Comm_free(comm); - if (MPI_INFO_NULL != *info) - MPI_Info_free(info); + /* Set return value */ + ret_value=file->comm; done: -#ifdef H5FDmpio_DEBUG -if (H5FD_mpio_Debug[(int)'t']) -fprintf(stderr, "Leaving H5FD_mpio_comm_info_free\n"); -#endif FUNC_LEAVE_NOAPI(ret_value) } + #endif /* H5_HAVE_PARALLEL */ |