diff options
author | Quincey Koziol <koziol@hdfgroup.org> | 2002-07-15 15:21:32 (GMT) |
---|---|---|
committer | Quincey Koziol <koziol@hdfgroup.org> | 2002-07-15 15:21:32 (GMT) |
commit | 363ec52b7cab5638e3d1479b67068754a2e10011 (patch) | |
tree | 23fb21609fb9a9bbadfee4c13d27326df0413d4b | |
parent | c3b0c0f3c4ed622450862fdfd2ade5eb96d4ef1a (diff) | |
download | hdf5-363ec52b7cab5638e3d1479b67068754a2e10011.zip hdf5-363ec52b7cab5638e3d1479b67068754a2e10011.tar.gz hdf5-363ec52b7cab5638e3d1479b67068754a2e10011.tar.bz2 |
[svn-r5799] Purpose:
New feature.
Description:
Added MPI-posix VFL driver. This driver uses MPI to coordinate actions, but
performs I/O directly with posix sec(2) I/O functions. This driver should
_NOT_ be used if the file accessed is not on a parallel filesystem.
Platforms tested:
FreeBSD 4.6 (sleipnir) w/parallel & IRIX64 6.5 (modi4) w/parallel
-rw-r--r-- | src/H5D.c | 38 | ||||
-rw-r--r-- | src/H5Distore.c | 44 | ||||
-rw-r--r-- | src/H5Dseq.c | 79 | ||||
-rw-r--r-- | src/H5F.c | 5 | ||||
-rw-r--r-- | src/H5FD.c | 2 | ||||
-rw-r--r-- | src/H5FDmpio.c | 4 | ||||
-rw-r--r-- | src/H5FDmpiposix.c | 1135 | ||||
-rw-r--r-- | src/H5FDmpiposix.h | 62 | ||||
-rw-r--r-- | src/H5Farray.c | 76 | ||||
-rw-r--r-- | src/H5Fistore.c | 44 | ||||
-rw-r--r-- | src/H5Fpkg.h | 1 | ||||
-rw-r--r-- | src/H5Fseq.c | 79 | ||||
-rw-r--r-- | src/Makefile.in | 14 | ||||
-rw-r--r-- | src/hdf5.h | 11 |
14 files changed, 1427 insertions, 167 deletions
@@ -30,10 +30,12 @@ /*#define H5D_DEBUG*/ /* - * The MPIO driver is needed because there are kludges in this file and - * places where we check for things that aren't handled by this driver. + * The MPIO & MPIPOSIX drivers are needed because there are kludges in this + * file and places where we check for things that aren't handled by these + * drivers. */ #include "H5FDmpio.h" +#include "H5FDmpiposix.h" #ifdef H5_HAVE_PARALLEL /* Remove this if H5R_DATASET_REGION is no longer used in this file */ @@ -1541,8 +1543,8 @@ H5D_create(H5G_entry_t *loc, const char *name, const H5T_t *type, HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, NULL, "unable to locate insertion point"); #ifdef H5_HAVE_PARALLEL - /* If MPIO is used, no filter support yet. */ - if(IS_H5FD_MPIO(f) && dcpl_pline.nfilters > 0) + /* If MPIO or MPIPOSIX is used, no filter support yet. */ + if((IS_H5FD_MPIO(f) || IS_H5FD_MPIPOSIX(f)) && dcpl_pline.nfilters > 0) HGOTO_ERROR(H5E_DATASET, H5E_UNSUPPORTED, NULL, "Parallel I/O does not support filters yet"); #endif /*H5_HAVE_PARALLEL*/ @@ -2018,8 +2020,8 @@ H5D_open_oid(H5G_entry_t *ent) HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, NULL, "can't set pipeline"); #ifdef H5_HAVE_PARALLEL - /* If MPIO is used, no filter support yet. */ - if(IS_H5FD_MPIO(dataset->ent.file) && pline.nfilters > 0) + /* If MPIO or MPIPOSIX is used, no filter support yet. */ + if((IS_H5FD_MPIO(dataset->ent.file) || IS_H5FD_MPIPOSIX(dataset->ent.file)) && pline.nfilters > 0) HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, NULL, "Parallel IO does not support filters yet"); #endif /*H5_HAVE_PARALLEL*/ @@ -2224,7 +2226,7 @@ H5D_read(H5D_t *dataset, const H5T_t *mem_type, const H5S_t *mem_space, H5S_t *free_this_space=NULL; /*data space to free */ #ifdef H5_HAVE_PARALLEL H5FD_mpio_dxpl_t *dx = NULL; - H5FD_mpio_xfer_t xfer_mode; /*xfer_mode for this request */ + H5FD_mpio_xfer_t xfer_mode=H5FD_MPIO_INDEPENDENT; /*xfer_mode for this request */ hbool_t xfer_mode_changed=0; /*xfer_mode needs restore */ hbool_t doing_mpio=0; /*This is an MPIO access */ #endif /*H5_HAVE_PARALLEL*/ @@ -2280,10 +2282,10 @@ H5D_read(H5D_t *dataset, const H5T_t *mem_type, const H5S_t *mem_space, xfer_mode = dx->xfer_mode; } } /* end if */ - /* Collective access is not permissible without the MPIO driver */ + /* Collective access is not permissible without the MPIO or MPIPOSIX driver */ if (doing_mpio && xfer_mode==H5FD_MPIO_COLLECTIVE && - !(IS_H5FD_MPIO(dataset->ent.file))) - HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "collective access for MPIO driver only"); + !(IS_H5FD_MPIO(dataset->ent.file) || IS_H5FD_MPIPOSIX(dataset->ent.file))) + HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "collective access for MPIO & MPIPOSIX drivers only"); /* Set the "parallel I/O possible" flag, for H5S_find() */ if (H5S_mpi_opt_types_g && IS_H5FD_MPIO(dataset->ent.file)) { @@ -2640,7 +2642,7 @@ H5D_write(H5D_t *dataset, const H5T_t *mem_type, const H5S_t *mem_space, H5S_t *free_this_space=NULL; /*data space to free */ #ifdef H5_HAVE_PARALLEL H5FD_mpio_dxpl_t *dx = NULL; - H5FD_mpio_xfer_t xfer_mode; /*xfer_mode for this request */ + H5FD_mpio_xfer_t xfer_mode=H5FD_MPIO_INDEPENDENT; /*xfer_mode for this request */ hbool_t xfer_mode_changed=0; /*xfer_mode needs restore */ hbool_t doing_mpio=0; /*This is an MPIO access */ #endif /*H5_HAVE_PARALLEL*/ @@ -2677,17 +2679,17 @@ H5D_write(H5D_t *dataset, const H5T_t *mem_type, const H5S_t *mem_space, HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset creation property list"); #ifdef H5_HAVE_PARALLEL - /* If MPIO is used, no VL datatype support yet. */ + /* If MPIO or MPIPOSIX is used, no VL datatype support yet. */ /* This is because they use the global heap in the file and we don't */ /* support parallel access of that yet */ - if (IS_H5FD_MPIO(dataset->ent.file) && H5T_get_class(mem_type)==H5T_VLEN) + if ( (IS_H5FD_MPIO(dataset->ent.file) || IS_H5FD_MPIPOSIX(dataset->ent.file)) && H5T_get_class(mem_type)==H5T_VLEN) HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing VL datatypes yet"); #endif /*H5_HAVE_PARALLEL*/ #ifdef H5_HAVE_PARALLEL - /* If MPIO is used, no dataset region reference support yet. */ + /* If MPIO or MPIPOSIX is used, no dataset region reference datatype support yet. */ /* This is because they use the global heap in the file and we don't */ /* support parallel access of that yet */ - if (IS_H5FD_MPIO(dataset->ent.file) && + if ((IS_H5FD_MPIO(dataset->ent.file) || IS_H5FD_MPIPOSIX(dataset->ent.file)) && H5T_get_class(mem_type)==H5T_REFERENCE && H5T_get_ref_type(mem_type)==H5R_DATASET_REGION) HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "Parallel IO does not support writing region reference datatypes yet"); @@ -2715,9 +2717,9 @@ H5D_write(H5D_t *dataset, const H5T_t *mem_type, const H5S_t *mem_space, xfer_mode = dx->xfer_mode; } } /* end if */ - /* Collective access is not permissible without the MPIO driver */ + /* Collective access is not permissible without the MPIO or MPIPOSIX driver */ if (doing_mpio && xfer_mode==H5FD_MPIO_COLLECTIVE && - !(IS_H5FD_MPIO(dataset->ent.file))) + !(IS_H5FD_MPIO(dataset->ent.file) || IS_H5FD_MPIPOSIX(dataset->ent.file))) HGOTO_ERROR (H5E_DATASET, H5E_UNSUPPORTED, FAIL, "collective access for MPIO driver only"); /* Set the "parallel I/O possible" flag, for H5S_find() */ @@ -3269,7 +3271,7 @@ H5D_init_storage(H5D_t *dset, const H5S_t *space) * If the dataset is accessed via parallel I/O, allocate file space * for all chunks now and initialize each chunk with the fill value. */ - if (IS_H5FD_MPIO(dset->ent.file)) { + if (IS_H5FD_MPIO(dset->ent.file) || IS_H5FD_MPIPOSIX(dset->ent.file)) { /* We only handle simple data spaces so far */ int ndims; hsize_t dim[H5O_LAYOUT_NDIMS]; diff --git a/src/H5Distore.c b/src/H5Distore.c index a44ce5a..a9f7f56 100644 --- a/src/H5Distore.c +++ b/src/H5Distore.c @@ -45,8 +45,9 @@ #include "H5Sprivate.h" /* Dataspaces */ #include "H5Vprivate.h" -/* MPIO driver needed for special checks */ +/* MPIO & MPIPOSIX drivers needed for special checks */ #include "H5FDmpio.h" +#include "H5FDmpiposix.h" /* * Feature: If this constant is defined then every cache preemption and load @@ -1781,12 +1782,12 @@ H5F_istore_read(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, #ifdef H5_HAVE_PARALLEL /* - * If MPIO is used and file can be written to, we must bypass the + * If MPIO or MPIPOSIX is used and file can be written to, we must bypass the * chunk-cache scheme because other MPI processes could be writing to * other elements in the same chunk. * Do a direct write-through of only the elements requested. */ - || (IS_H5FD_MPIO(f) && (H5F_ACC_RDWR & f->shared->flags)) + || ((IS_H5FD_MPIO(f) ||IS_H5FD_MPIPOSIX(f)) && (H5F_ACC_RDWR & f->shared->flags)) #endif /* H5_HAVE_PARALLEL */ ) { H5O_layout_t l; /* temporary layout */ @@ -1965,11 +1966,11 @@ H5F_istore_write(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, #ifdef H5_HAVE_PARALLEL /* - * If MPIO is used, must bypass the chunk-cache scheme because other + * If MPIO or MPIPOSIX is used, must bypass the chunk-cache scheme because other * MPI processes could be writing to other elements in the same chunk. * Do a direct write-through of only the elements requested. */ - || (IS_H5FD_MPIO(f) && (H5F_ACC_RDWR & f->shared->flags)) + || ((IS_H5FD_MPIO(f) ||IS_H5FD_MPIPOSIX(f)) && (H5F_ACC_RDWR & f->shared->flags)) #endif /* H5_HAVE_PARALLEL */ ) { H5O_layout_t l; /* temporary layout */ @@ -2416,10 +2417,22 @@ H5F_istore_allocate(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, } /* end if */ /* Retrieve up MPI parameters */ - if ((mpi_rank=H5FD_mpio_mpi_rank(f->shared->lf))<0) - HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI rank"); - if ((mpi_size=H5FD_mpio_mpi_size(f->shared->lf))<0) - HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI size"); + if(IS_H5FD_MPIO(f)) { + if ((mpi_rank=H5FD_mpio_mpi_rank(f->shared->lf))<0) + HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI rank"); + if ((mpi_size=H5FD_mpio_mpi_size(f->shared->lf))<0) + HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI size"); + } /* end if */ + else { + /* Sanity Check */ + assert(IS_H5FD_MPIPOSIX(f)); + + /* Get the MPI rank & size */ + if ((mpi_rank=H5FD_mpiposix_mpi_rank(f->shared->lf))<0) + HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI rank"); + if ((mpi_size=H5FD_mpiposix_mpi_size(f->shared->lf))<0) + HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI size"); + } /* end else */ /* Loop over all chunks */ carry=0; @@ -2469,8 +2482,17 @@ H5F_istore_allocate(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, * still writing out chunks and other processes race ahead to read * them in, getting bogus data. */ - if (MPI_Barrier(H5FD_mpio_communicator(f->shared->lf))) - HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_Barrier failed"); + if(IS_H5FD_MPIO(f)) { + if (MPI_Barrier(H5FD_mpio_communicator(f->shared->lf))) + HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_Barrier failed"); + } /* end if */ + else { + /* Sanity Check */ + assert(IS_H5FD_MPIPOSIX(f)); + + if (MPI_Barrier(H5FD_mpiposix_communicator(f->shared->lf))) + HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_Barrier failed"); + } /* end else */ } /* end if */ done: diff --git a/src/H5Dseq.c b/src/H5Dseq.c index 0a4da84..8f988ff 100644 --- a/src/H5Dseq.c +++ b/src/H5Dseq.c @@ -26,8 +26,9 @@ #include "H5Pprivate.h" #include "H5Vprivate.h" -/* MPIO driver functions are needed for some special checks */ +/* MPIO & MPIPOSIX driver functions are needed for some special checks */ #include "H5FDmpio.h" +#include "H5FDmpiposix.h" /* Interface initialization */ #define PABLO_MASK H5Fseq_mask @@ -182,27 +183,29 @@ H5F_seq_readv(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, #ifdef H5_HAVE_PARALLEL { - /* Get the transfer mode */ H5FD_mpio_dxpl_t *dx; hid_t driver_id; /* VFL driver ID */ - /* Get the plist structure */ - if(NULL == (plist = H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID"); - - /* Get the driver ID */ - if(H5P_get(plist, H5D_XFER_VFL_ID_NAME, &driver_id)<0) - HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver ID"); - - /* Check if we are using the MPIO driver */ - if(H5FD_MPIO==driver_id) { - /* Get the driver information */ - if(H5P_get(plist, H5D_XFER_VFL_INFO_NAME, &dx)<0) - HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver info"); - - /* Check if we are not using independent I/O */ - if(H5FD_MPIO_INDEPENDENT!=dx->xfer_mode) - xfer_mode = dx->xfer_mode; + /* Get the transfer mode for MPIO transfers */ + if(IS_H5FD_MPIO(f)) { + /* Get the plist structure */ + if(NULL == (plist = H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID"); + + /* Get the driver ID */ + if(H5P_get(plist, H5D_XFER_VFL_ID_NAME, &driver_id)<0) + HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver ID"); + + /* Check if we are using the MPIO driver (for the DXPL) */ + if(H5FD_MPIO==driver_id) { + /* Get the driver information */ + if(H5P_get(plist, H5D_XFER_VFL_INFO_NAME, &dx)<0) + HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver info"); + + /* Check if we are not using independent I/O */ + if(H5FD_MPIO_INDEPENDENT!=dx->xfer_mode) + xfer_mode = dx->xfer_mode; + } /* end if */ } /* end if */ } @@ -564,27 +567,29 @@ H5F_seq_writev(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, #ifdef H5_HAVE_PARALLEL { - /* Get the transfer mode */ H5FD_mpio_dxpl_t *dx; hid_t driver_id; /* VFL driver ID */ - /* Get the plist structure */ - if(NULL == (plist = H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID"); - - /* Get the driver ID */ - if(H5P_get(plist, H5D_XFER_VFL_ID_NAME, &driver_id)<0) - HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver ID"); - - /* Check if we are using the MPIO driver */ - if(H5FD_MPIO==driver_id) { - /* Get the driver information */ - if(H5P_get(plist, H5D_XFER_VFL_INFO_NAME, &dx)<0) - HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver info"); - - /* Check if we are not using independent I/O */ - if(H5FD_MPIO_INDEPENDENT!=dx->xfer_mode) - xfer_mode = dx->xfer_mode; + /* Get the transfer mode for MPIO transfers */ + if(IS_H5FD_MPIO(f)) { + /* Get the plist structure */ + if(NULL == (plist = H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID"); + + /* Get the driver ID */ + if(H5P_get(plist, H5D_XFER_VFL_ID_NAME, &driver_id)<0) + HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver ID"); + + /* Check if we are using the MPIO driver (for the DXPL) */ + if(H5FD_MPIO==driver_id) { + /* Get the driver information */ + if(H5P_get(plist, H5D_XFER_VFL_INFO_NAME, &dx)<0) + HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver info"); + + /* Check if we are not using independent I/O */ + if(H5FD_MPIO_INDEPENDENT!=dx->xfer_mode) + xfer_mode = dx->xfer_mode; + } /* end if */ } /* end if */ } @@ -22,6 +22,7 @@ #include "H5FDcore.h" /*temporary in-memory files */ #include "H5FDfamily.h" /*family of files */ #include "H5FDmpio.h" /*MPI-2 I/O */ +#include "H5FDmpiposix.h" /*MPI-2 & posix I/O */ #include "H5FDgass.h" /*GASS I/O */ #include "H5FDstream.h" /*in-memory files streamed via sockets */ #include "H5FDsrb.h" /*SRB I/O */ @@ -215,7 +216,7 @@ H5F_init_interface(void) /* Allow MPI buf-and-file-type optimizations? */ const char *s = HDgetenv ("HDF5_MPI_1_METAWRITE"); if (s && HDisdigit(*s)) { - H5_mpi_1_metawrite_g = (int)HDstrtol (s, NULL, 0); + H5_mpiposix_1_metawrite_g = H5_mpi_1_metawrite_g = (int)HDstrtol (s, NULL, 0); } } #endif /* H5_HAVE_PARALLEL */ @@ -304,6 +305,7 @@ H5F_init_interface(void) if ((status=H5FD_MULTI)<0) goto end_registration; #ifdef H5_HAVE_PARALLEL if ((status=H5FD_MPIO)<0) goto end_registration; + if ((status=H5FD_MPIPOSIX)<0) goto end_registration; #endif /* H5_HAVE_PARALLEL */ #ifdef H5_HAVE_STREAM if ((status=H5FD_STREAM)<0) goto end_registration; @@ -2571,7 +2573,6 @@ H5F_close(H5F_t *f) { H5F_close_degree_t fc_degree; /* What action to take when closing the last file ID for a file */ hid_t *oid_list; /* List of IDs still open */ - unsigned oid_count; /* Number of IDs still open */ unsigned i; /* Local index variable */ unsigned closing=0; /* Indicate that the file will be closed */ herr_t ret_value = SUCCEED; /* Return value */ @@ -1611,7 +1611,7 @@ H5FD_real_alloc(H5FD_t *file, H5FD_mem_t type, hsize_t size) } } else { hsize_t wasted; - haddr_t oldeoa; + haddr_t oldeoa=0; haddr_t eoa = (file->cls->get_eoa)(file); #ifdef H5F_DEBUG diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index 7f36862..df4b95c 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -72,9 +72,9 @@ static haddr_t H5FD_mpio_get_eoa(H5FD_t *_file); static herr_t H5FD_mpio_set_eoa(H5FD_t *_file, haddr_t addr); static haddr_t H5FD_mpio_get_eof(H5FD_t *_file); static herr_t H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr, - size_t size, void *buf); + size_t size, void *buf); static herr_t H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr, - size_t size, const void *buf); + size_t size, const void *buf); static herr_t H5FD_mpio_flush(H5FD_t *_file, unsigned closing); /* MPIO-specific file access properties */ diff --git a/src/H5FDmpiposix.c b/src/H5FDmpiposix.c new file mode 100644 index 0000000..82533d0 --- /dev/null +++ b/src/H5FDmpiposix.c @@ -0,0 +1,1135 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by the Board of Trustees of the University of Illinois. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the files COPYING and Copyright.html. COPYING can be found at the root * + * of the source code distribution tree; Copyright.html can be found at the * + * root level of an installed copy of the electronic HDF5 document set and * + * is linked from the top-level documents page. It can also be found at * + * http://hdf.ncsa.uiuc.edu/HDF5/doc/Copyright.html. If you do not have * + * access to either file, you may request a copy from hdfhelp@ncsa.uiuc.edu. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* + * Programmer: Quincey Koziol <koziol@ncsa.uiuc.ed> + * Thursday, July 11, 2002 + * + * Purpose: This is a "combination" MPI-2 and posix I/O driver. + * It uses MPI for coodinating the actions of several processes + * and posix I/O calls to do the actual I/O to the disk. + * + * This driver was derived from the H5FDmpio.c driver and may + * share bugs/quirks/etc. + * + * Limitations: + * There is no "collective" I/O mode with this driver. + * + * This will almost certainly _not_ work correctly for files + * accessed on distributed parallel systems with the file located + * on a non-parallel filesystem. + * + */ +#include "H5private.h" /*library functions */ +#include "H5Eprivate.h" /*error handling */ +#include "H5Fprivate.h" /*files */ +#include "H5FDprivate.h" /*file driver */ +#include "H5FDmpiposix.h" /* MPI/posix I/O file driver */ +#include "H5Iprivate.h" /* IDs */ +#include "H5MMprivate.h" /*memory allocation */ +#include "H5Pprivate.h" /*property lists */ + +/* + * The driver identification number, initialized at runtime if H5_HAVE_PARALLEL + * is defined. This allows applications to still have the H5FD_MPIPOSIX + * "constants" in their source code (it also makes this file strictly ANSI + * compliant when H5_HAVE_PARALLEL isn't defined) + */ +static hid_t H5FD_MPIPOSIX_g = 0; + +#ifdef H5_HAVE_PARALLEL + +/* File operations */ +#define OP_UNKNOWN 0 +#define OP_READ 1 +#define OP_WRITE 2 + +/* + * The description of a file belonging to this driver. + * The EOF value + * is only used just after the file is opened in order for the library to + * determine whether the file is empty, truncated, or okay. The MPIPOSIX driver + * doesn't bother to keep it updated since it's an expensive operation. + */ +typedef struct H5FD_mpiposix_t { + H5FD_t pub; /*public stuff, must be first */ + int fd; /*the unix file handle */ + MPI_Comm comm; /*communicator */ + int mpi_rank; /* This process's rank */ + int mpi_size; /* Total number of processes */ + int mpi_round; /* Current round robin process (for metadata I/O) */ + haddr_t eof; /*end-of-file marker */ + haddr_t eoa; /*end-of-address marker */ + haddr_t last_eoa; /* Last known end-of-address marker */ + haddr_t pos; /* Current file I/O position */ + int op; /* Last file I/O operation */ +#ifndef WIN32 + /* + * On most systems the combination of device and i-node number uniquely + * identify a file. + */ + dev_t device; /*file device number */ + ino_t inode; /*file i-node number */ +#else + /* + * On WIN32 the low-order word of a unique identifier associated with the + * file and the volume serial number uniquely identify a file. This number + * (which, both? -rpm) may change when the system is restarted or when the + * file is opened. After a process opens a file, the identifier is + * constant until the file is closed. An application can use this + * identifier and the volume serial number to determine whether two + * handles refer to the same file. + */ + int fileindexlo; + int fileindexhi; +#endif +} H5FD_mpiposix_t; + +/* + * This driver supports systems that have the lseek64() function by defining + * some macros here so we don't have to have conditional compilations later + * throughout the code. + * + * file_offset_t: The datatype for file offsets, the second argument of + * the lseek() or lseek64() call. + * + * file_seek: The function which adjusts the current file position, + * either lseek() or lseek64(). + */ +/* adding for windows NT file system support. */ +/* pvn: added __MWERKS__ support. */ + +#ifdef H5_HAVE_LSEEK64 +# define file_offset_t off64_t +# define file_seek lseek64 +# define file_truncate ftruncate64 +#elif defined (WIN32) && !defined(__MWERKS__) +# /*MSVC*/ +# define file_offset_t __int64 +# define file_seek _lseeki64 +# define file_truncate _ftruncatei64 +#else +# define file_offset_t off_t +# define file_seek HDlseek +# define file_truncate HDftruncate +#endif + +/* + * These macros check for overflow of various quantities. These macros + * assume that file_offset_t is signed and haddr_t and size_t are unsigned. + * + * ADDR_OVERFLOW: Checks whether a file address of type `haddr_t' + * is too large to be represented by the second argument + * of the file seek function. + * + * SIZE_OVERFLOW: Checks whether a buffer size of type `hsize_t' is too + * large to be represented by the `size_t' type. + * + * REGION_OVERFLOW: Checks whether an address and size pair describe data + * which can be addressed entirely by the second + * argument of the file seek function. + */ +#define MAXADDR (((haddr_t)1<<(8*sizeof(file_offset_t)-1))-1) +#define ADDR_OVERFLOW(A) (HADDR_UNDEF==(A) || \ + ((A) & ~(haddr_t)MAXADDR)) +#define SIZE_OVERFLOW(Z) ((Z) & ~(hsize_t)MAXADDR) +#define REGION_OVERFLOW(A,Z) (ADDR_OVERFLOW(A) || SIZE_OVERFLOW(Z) || \ + sizeof(file_offset_t)<sizeof(size_t) || \ + HADDR_UNDEF==(A)+(Z) || \ + (file_offset_t)((A)+(Z))<(file_offset_t)(A)) + +/* Callbacks */ +static void *H5FD_mpiposix_fapl_get(H5FD_t *_file); +static H5FD_t *H5FD_mpiposix_open(const char *name, unsigned flags, hid_t fapl_id, + haddr_t maxaddr); +static herr_t H5FD_mpiposix_close(H5FD_t *_file); +static int H5FD_mpiposix_cmp(const H5FD_t *_f1, const H5FD_t *_f2); +static herr_t H5FD_mpiposix_query(const H5FD_t *_f1, unsigned long *flags); +static haddr_t H5FD_mpiposix_get_eoa(H5FD_t *_file); +static herr_t H5FD_mpiposix_set_eoa(H5FD_t *_file, haddr_t addr); +static haddr_t H5FD_mpiposix_get_eof(H5FD_t *_file); +static herr_t H5FD_mpiposix_read(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr, + size_t size, void *buf); +static herr_t H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr, + size_t size, const void *buf); +static herr_t H5FD_mpiposix_flush(H5FD_t *_file, unsigned closing); + +/* MPIPOSIX-specific file access properties */ +typedef struct H5FD_mpiposix_fapl_t { + MPI_Comm comm; /*communicator */ +} H5FD_mpiposix_fapl_t; + +/* The MPIPOSIX file driver information */ +static const H5FD_class_t H5FD_mpiposix_g = { + "mpiposix", /*name */ + MAXADDR, /*maxaddr */ + H5F_CLOSE_SEMI, /* fc_degree */ + NULL, /*sb_size */ + NULL, /*sb_encode */ + NULL, /*sb_decode */ + sizeof(H5FD_mpiposix_fapl_t), /*fapl_size */ + H5FD_mpiposix_fapl_get, /*fapl_get */ + NULL, /*fapl_copy */ + NULL, /*fapl_free */ + 0, /*dxpl_size */ + NULL, /*dxpl_copy */ + NULL, /*dxpl_free */ + H5FD_mpiposix_open, /*open */ + H5FD_mpiposix_close, /*close */ + H5FD_mpiposix_cmp, /*cmp */ + H5FD_mpiposix_query, /*query */ + NULL, /*alloc */ + NULL, /*free */ + H5FD_mpiposix_get_eoa, /*get_eoa */ + H5FD_mpiposix_set_eoa, /*set_eoa */ + H5FD_mpiposix_get_eof, /*get_eof */ + H5FD_mpiposix_read, /*read */ + H5FD_mpiposix_write, /*write */ + H5FD_mpiposix_flush, /*flush */ + H5FD_FLMAP_SINGLE, /*fl_map */ +}; + +/* Global var to allow elimination of redundant metadata writes + * to be controlled by the value of an environment variable. */ +/* Use the elimination by default unless this is the Intel Red machine */ +#ifndef __PUMAGON__ +hbool_t H5_mpiposix_1_metawrite_g = TRUE; +#else +hbool_t H5_mpiposix_1_metawrite_g = FALSE; +#endif + +/* Interface initialization */ +#define PABLO_MASK H5FD_mpiposix_mask +#define INTERFACE_INIT H5FD_mpiposix_init +static int interface_initialize_g = 0; + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_init + * + * Purpose: Initialize this driver by registering the driver with the + * library. + * + * Return: Success: The driver ID for the mpiposix driver. + * + * Failure: Negative. + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +hid_t +H5FD_mpiposix_init(void) +{ + FUNC_ENTER_NOAPI(H5FD_mpiposix_init, FAIL); + + if (H5I_VFL!=H5Iget_type(H5FD_MPIPOSIX_g)) + H5FD_MPIPOSIX_g = H5FDregister(&H5FD_mpiposix_g); + + FUNC_LEAVE(H5FD_MPIPOSIX_g); +} /* end H5FD_mpiposix_init() */ + + +/*------------------------------------------------------------------------- + * Function: H5Pset_fapl_mpiposix + * + * Purpose: Store the user supplied MPI communicator COMM in + * the file access property list FAPL_ID which can then be used + * to create and/or open the file. This function is available + * only in the parallel HDF5 library and is not collective. + * + * COMM is the MPI communicator to be used for file open as + * defined in MPI_FILE_OPEN of MPI-2. This function does not + * make a duplicated communicator. Any modification to COMM + * after this function call returns may have undetermined effect + * on the access property list. Users should not modify the + * communicator while it is defined in a property list. + * + * Return: Success: Non-negative + * Failure: Negative + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +herr_t +H5Pset_fapl_mpiposix(hid_t fapl_id, MPI_Comm comm) +{ + H5FD_mpiposix_fapl_t fa; + H5P_genplist_t *plist; /* Property list pointer */ + herr_t ret_value=FAIL; + + FUNC_ENTER_API(H5Pset_fapl_mpiposix, FAIL); + H5TRACE2("e","iMc",fapl_id,comm); + + /* Check arguments */ + if(TRUE!=H5P_isa_class(fapl_id,H5P_FILE_ACCESS) || NULL == (plist = H5I_object(fapl_id))) + HRETURN_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list"); +#ifdef LATER +#warning "We need to verify that COMM contains sensible information." +#endif + + /* Initialize driver specific properties */ + fa.comm = comm; + + ret_value= H5P_set_driver(plist, H5FD_MPIPOSIX, &fa); + + FUNC_LEAVE(ret_value); +} /* end H5Pset_fapl_mpiposix() */ + + +/*------------------------------------------------------------------------- + * Function: H5Pget_fapl_mpiposix + * + * Purpose: If the file access property list is set to the H5FD_MPIPOSIX + * driver then this function returns the MPI communicator and + * information through the COMM pointer. + * + * Return: Success: Non-negative with the communicator and + * information returned through the COMM + * argument if non-null. This piece of + * information is copied and is therefore + * valid only until the file access property + * list is modified or closed. + * + * Failure: Negative + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +herr_t +H5Pget_fapl_mpiposix(hid_t fapl_id, MPI_Comm *comm/*out*/) +{ + H5FD_mpiposix_fapl_t *fa; + H5P_genplist_t *plist; /* Property list pointer */ + + FUNC_ENTER_API(H5Pget_fapl_mpiposix, FAIL); + H5TRACE2("e","ix",fapl_id,comm); + + if(TRUE!=H5P_isa_class(fapl_id,H5P_FILE_ACCESS) || NULL == (plist = H5I_object(fapl_id))) + HRETURN_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list"); + if (H5FD_MPIPOSIX!=H5P_get_driver(plist)) + HRETURN_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "incorrect VFL driver"); + if (NULL==(fa=H5P_get_driver_info(plist))) + HRETURN_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "bad VFL driver info"); + + /* Get MPI Communicator */ + if (comm) + *comm = fa->comm; + + FUNC_LEAVE(SUCCEED); +} /* end H5Pget_fapl_mpiposix() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_communicator + * + * Purpose: Returns the MPI communicator for the file. + * + * Return: Success: The communicator + * + * Failure: NULL + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +MPI_Comm +H5FD_mpiposix_communicator(H5FD_t *_file) +{ + H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; + + FUNC_ENTER_NOAPI(H5FD_mpiposix_communicator, MPI_COMM_NULL); + + assert(file); + assert(H5FD_MPIPOSIX==file->pub.driver_id); + + FUNC_LEAVE(file->comm); +} /* end H5FD_mpi_posix_communicator() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_mpi_rank + * + * Purpose: Returns the MPI rank for a process + * + * Return: Success: non-negative + * Failure: negative + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +int +H5FD_mpiposix_mpi_rank(H5FD_t *_file) +{ + H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; + + FUNC_ENTER_NOAPI(H5FD_mpiposix_mpi_rank, FAIL); + + assert(file); + assert(H5FD_MPIPOSIX==file->pub.driver_id); + + FUNC_LEAVE(file->mpi_rank); +} /* end H5FD_mpiposix_mpi_rank() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_mpi_size + * + * Purpose: Returns the number of MPI processes + * + * Return: Success: non-negative + * Failure: negative + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +int +H5FD_mpiposix_mpi_size(H5FD_t *_file) +{ + H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; + + FUNC_ENTER_NOAPI(H5FD_mpiposix_mpi_rank, FAIL); + + assert(file); + assert(H5FD_MPIPOSIX==file->pub.driver_id); + + FUNC_LEAVE(file->mpi_size); +} /* end H5FD_mpiposix_mpi_size() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_fapl_get + * + * Purpose: Returns a file access property list which could be used to + * create another file the same as this one. + * + * Return: Success: Ptr to new file access property list with all + * fields copied from the file pointer. + * + * Failure: NULL + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static void * +H5FD_mpiposix_fapl_get(H5FD_t *_file) +{ + H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; + H5FD_mpiposix_fapl_t *fa = NULL; + + FUNC_ENTER_NOAPI(H5FD_mpiposix_fapl_get, NULL); + + assert(file); + assert(H5FD_MPIPOSIX==file->pub.driver_id); + + if (NULL==(fa=H5MM_calloc(sizeof(H5FD_mpiposix_fapl_t)))) + HRETURN_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed"); + + /* These should be copied. --QAK, 2002-07-11 */ + fa->comm = file->comm; + + FUNC_LEAVE(fa); +} /* end H5FD_mpiposix_fapl_get() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_open + * + * Purpose: Opens a file with name NAME. The FLAGS are a bit field with + * purpose similar to the second argument of open(2) and which + * are defined in H5Fpublic.h. The file access property list + * FAPL_ID contains the properties driver properties and MAXADDR + * is the largest address which this file will be expected to + * access. This is collective. + * + * Return: Success: A new file pointer. + * Failure: NULL + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static H5FD_t * +H5FD_mpiposix_open(const char *name, unsigned flags, hid_t fapl_id, + haddr_t maxaddr) +{ + H5FD_mpiposix_t *file=NULL; /* New MPIPOSIX file struct */ + int o_flags; /* Flags for file open call */ + int fd=(-1); /* File handle for file opened */ + int mpi_rank; /* MPI rank of this process */ + int mpi_size; /* Total number of MPI processes */ + int mpi_code; /* mpi return code */ + const H5FD_mpiposix_fapl_t *fa=NULL; /* MPIPOSIX file access property list information */ + H5FD_mpiposix_fapl_t _fa; /* Private copy of default file access property list information */ + H5P_genplist_t *plist; /* Property list pointer */ + h5_stat_t sb; /* Portable 'stat' struct */ +#ifdef WIN32 + HFILE filehandle; + struct _BY_HANDLE_FILE_INFORMATION fileinfo; + int results; +#endif + H5FD_t *ret_value=NULL; /* Return value */ + + FUNC_ENTER_NOAPI(H5FD_mpiposix_open, NULL); + + /* Check arguments */ + if (!name || !*name) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, NULL, "invalid file name"); + if (0==maxaddr || HADDR_UNDEF==maxaddr) + HGOTO_ERROR(H5E_ARGS, H5E_BADRANGE, NULL, "bogus maxaddr"); + if (ADDR_OVERFLOW(maxaddr)) + HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, NULL, "bogus maxaddr"); + + /* Obtain a pointer to mpiposix-specific file access properties */ + if(TRUE!=H5P_isa_class(fapl_id,H5P_FILE_ACCESS) || NULL == (plist = H5I_object(fapl_id))) + HRETURN_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list"); + if (H5P_DEFAULT==fapl_id || H5FD_MPIPOSIX!=H5P_get_driver(plist)) { + _fa.comm = MPI_COMM_SELF; /*default*/ + fa = &_fa; + } /* end if */ + else { + fa = H5P_get_driver_info(plist); + assert(fa); + } /* end else */ + + /* Get the MPI rank of this process and the total number of processes */ + if (MPI_SUCCESS != (mpi_code=MPI_Comm_rank (fa->comm, &mpi_rank))) + HMPI_GOTO_ERROR(NULL, "MPI_Comm_rank failed", mpi_code); + if (MPI_SUCCESS != (mpi_code=MPI_Comm_size (fa->comm, &mpi_size))) + HMPI_GOTO_ERROR(NULL, "MPI_Comm_size failed", mpi_code); + + /* Build the open flags */ + o_flags = (H5F_ACC_RDWR & flags) ? O_RDWR : O_RDONLY; + + /* Only set the creation flag(s) for process 0 */ + if(mpi_rank==0) { + if (H5F_ACC_TRUNC & flags) + o_flags |= O_TRUNC; + if (H5F_ACC_CREAT & flags) + o_flags |= O_CREAT; + if (H5F_ACC_EXCL & flags) + o_flags |= O_EXCL; + } /* end if */ + + /* Process 0 opens (or creates) the file while the rest of the + * processes wait. Then process 0 signals the other processes and they + * open (never create) the file and all processes proceed. + */ + /* Process 0 opens (or creates) file and broadcasts result to other processes */ + if(mpi_rank==0) { + /* Open the file */ + fd=HDopen(name, o_flags, 0666); + } /* end if */ + + /* Broadcast the results of the open() from process 0 */ + /* This is necessary because of the "tentative open" code in H5F_open() + * where the file is attempted to be opened with different flags from the + * user's, in order to check for the file's existence, etc. Here, process 0 + * gets different flags from the other processes (since it is in charge of + * creating the file, if necessary) and can fail in situations where the + * other process's file opens would succeed, so allow the other processes + * to check for that situation and bail out now also. - QAK + */ + if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&fd, sizeof(int), MPI_BYTE, 0, fa->comm))) + HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code); + + /* If the file open on process 0 failed, bail out on all processes now */ + if(fd<0) + HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file"); + + /* Other processes (non 0) wait for broadcast result from process 0 and then open file */ + if(mpi_rank!=0) { + /* Open the file */ + if ((fd=HDopen(name, o_flags, 0666))<0) + HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file"); + } /* end if */ + + /* Process 0 fstat()s the file and broadcasts the results to the other processes */ + if(mpi_rank==0) { + /* Get the stat information */ + if (HDfstat(fd, &sb)<0) + HGOTO_ERROR(H5E_FILE, H5E_BADFILE, NULL, "unable to fstat file"); + } /* end if */ + + /* Broadcast the results of the fstat() from process 0 */ + if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&sb, sizeof(h5_stat_t), MPI_BYTE, 0, fa->comm))) + HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code); + + /* Build the file struct and initialize it */ + if (NULL==(file=H5MM_calloc(sizeof(H5FD_mpiposix_t)))) + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed"); + + /* Set the general file information */ + file->fd = fd; + file->eof = sb.st_size; + + /* Set the MPI information */ + file->comm = fa->comm; + file->mpi_rank = mpi_rank; + file->mpi_size = mpi_size; + file->mpi_round = 0; /* Start metadata writes with process 0 */ + + /* Reset the last file I/O operation */ + file->pos = HADDR_UNDEF; + file->op = OP_UNKNOWN; + + /* Set the information for the file's device and inode */ +#ifdef WIN32 + filehandle = _get_osfhandle(fd); + results = GetFileInformationByHandle((HANDLE)filehandle, &fileinfo); + file->fileindexhi = fileinfo.nFileIndexHigh; + file->fileindexlo = fileinfo.nFileIndexLow; +#else + file->device = sb.st_dev; + file->inode = sb.st_ino; +#endif + + /* Indicate success */ + ret_value=(H5FD_t *)file; + +done: + /* Error cleanup */ + if(ret_value==NULL) { + /* Close the file if it was left open */ + if(fd!=(-1)) + HDclose(fd); + } /* end if */ + + FUNC_LEAVE(ret_value); +} /* end H5FD_mpiposix_open() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_close + * + * Purpose: Closes a file. + * + * Return: Success: Non-negative + * Failure: Negative + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD_mpiposix_close(H5FD_t *_file) +{ + H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; + + FUNC_ENTER_NOAPI(H5FD_mpiposix_close, FAIL); + + assert(file); + assert(H5FD_MPIPOSIX==file->pub.driver_id); + + /* Close the unix file */ + if (HDclose(file->fd)<0) + HRETURN_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close file"); + + H5MM_xfree(file); + + FUNC_LEAVE(SUCCEED); +} /* end H5FD_mpiposix_close() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_cmp + * + * Purpose: Compares two files belonging to this driver using an + * arbitrary (but consistent) ordering. + * + * Return: Success: A value like strcmp() + * Failure: never fails (arguments were checked by the + * caller). + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static int +H5FD_mpiposix_cmp(const H5FD_t *_f1, const H5FD_t *_f2) +{ + const H5FD_mpiposix_t *f1 = (const H5FD_mpiposix_t*)_f1; + const H5FD_mpiposix_t *f2 = (const H5FD_mpiposix_t*)_f2; + int ret_value=0; + + FUNC_ENTER_NOAPI(H5FD_mpiposix_cmp, H5FD_VFD_DEFAULT); + +#ifdef WIN32 + if (f1->fileindexhi < f2->fileindexhi) ret_value= -1; + if (f1->fileindexhi > f2->fileindexhi) ret_value= 1; + + if (f1->fileindexlo < f2->fileindexlo) ret_value= -1; + if (f1->fileindexlo > f2->fileindexlo) ret_value= 1; + +#else + if (f1->device < f2->device) ret_value= -1; + if (f1->device > f2->device) ret_value= 1; + + if (f1->inode < f2->inode) ret_value= -1; + if (f1->inode > f2->inode) ret_value= 1; +#endif + + FUNC_LEAVE(ret_value); +} /* end H5FD_mpiposix_cmp() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_query + * + * Purpose: Set the flags that this VFL driver is capable of supporting. + * (listed in H5FDpublic.h) + * + * Return: Success: non-negative + * Failure: negative + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD_mpiposix_query(const H5FD_t UNUSED *_file, unsigned long *flags /* out */) +{ + herr_t ret_value=SUCCEED; + + FUNC_ENTER_NOAPI(H5FD_mpiposix_query, FAIL); + + /* Set the VFL feature flags that this driver supports */ + if(flags) { + *flags=0; + *flags|=H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */ + + /* Distinguish between updating the metadata accumulator on writes and + * reads. This is particularly (perhaps only, even) important for MPI-I/O + * where we guarantee that writes are collective, but reads may not be. + * If we were to allow the metadata accumulator to be written during a + * read operation, the application would hang. + */ + *flags|=H5FD_FEAT_ACCUMULATE_METADATA_WRITE; /* OK to accumulate metadata for faster writes */ + + *flags|=H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */ + } /* end if */ + + FUNC_LEAVE(ret_value); +} /* end H5FD_mpiposix_query() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_get_eoa + * + * Purpose: Gets the end-of-address marker for the file. The EOA marker + * is the first address past the last byte allocated in the + * format address space. + * + * Return: Success: The end-of-address marker. + * Failure: HADDR_UNDEF + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static haddr_t +H5FD_mpiposix_get_eoa(H5FD_t *_file) +{ + H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; + + FUNC_ENTER_NOAPI(H5FD_mpiposix_get_eoa, HADDR_UNDEF); + + assert(file); + assert(H5FD_MPIPOSIX==file->pub.driver_id); + + FUNC_LEAVE(file->eoa); +} /* end H5FD_mpiposix_get_eoa() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_set_eoa + * + * Purpose: Set the end-of-address marker for the file. This function is + * called shortly after an existing HDF5 file is opened in order + * to tell the driver where the end of the HDF5 data is located. + * + * Return: Success: non-negative + * Failure: negative + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD_mpiposix_set_eoa(H5FD_t *_file, haddr_t addr) +{ + H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; + + FUNC_ENTER_NOAPI(H5FD_mpiposix_set_eoa, FAIL); + + assert(file); + assert(H5FD_MPIPOSIX==file->pub.driver_id); + + file->eoa = addr; + + FUNC_LEAVE(SUCCEED); +} /* end H5FD_mpi_posix_set_eoa() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_get_eof + * + * Purpose: Gets the end-of-file marker for the file. The EOF marker + * is the real size of the file. + * + * The MPIPOSIX driver doesn't bother keeping this field updated + * since that's a relatively expensive operation. Fortunately + * the library only needs the EOF just after the file is opened + * in order to determine whether the file is empty, truncated, + * or okay. + * + * Return: Success: The end-of-address marker. + * Failure: HADDR_UNDEF + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static haddr_t +H5FD_mpiposix_get_eof(H5FD_t *_file) +{ + H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; + + FUNC_ENTER_NOAPI(H5FD_mpiposix_get_eof, HADDR_UNDEF); + + assert(file); + assert(H5FD_MPIPOSIX==file->pub.driver_id); + + FUNC_LEAVE(MAX(file->eof,file->eoa)); +} /* end H5FD_mpiposix_get_eof() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_read + * + * Purpose: Reads SIZE bytes of data from FILE beginning at address ADDR + * into buffer BUF according to data transfer properties in + * DXPL_ID using potentially complex file and buffer types to + * effect the transfer. + * + * Reading past the end of the file returns zeros instead of + * failing. + * + * Return: Success: Non-negative. Result is stored in caller-supplied + * buffer BUF. + * Failure: Negative, Contents of buffer BUF are undefined. + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD_mpiposix_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t UNUSED dxpl_id, haddr_t addr, size_t size, + void *buf/*out*/) +{ + H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; + ssize_t nbytes; /* Number of bytes read each I/O call */ + herr_t ret_value=SUCCEED; + + FUNC_ENTER_NOAPI(H5FD_mpiposix_read, FAIL); + + assert(file); + assert(H5FD_MPIPOSIX==file->pub.driver_id); + assert(buf); + + /* Check for overflow conditions */ + if (HADDR_UNDEF==addr) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined"); + if (REGION_OVERFLOW(addr, size)) + HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow"); + if (addr+size>file->eoa) + HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow"); + + /* Seek to the correct location */ + if ((addr!=file->pos || OP_READ!=file->op) && + file_seek(file->fd, (file_offset_t)addr, SEEK_SET)<0) + HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position"); + + /* + * Read data, being careful of interrupted system calls, partial results, + * and the end of the file. + */ + while (size>0) { + do { + nbytes = HDread(file->fd, buf, size); + } while (-1==nbytes && EINTR==errno); + if (-1==nbytes) + HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed"); + if (0==nbytes) { + /* end of file but not end of format address space */ + HDmemset(buf, 0, size); + size = 0; + } /* end if */ + assert(nbytes>=0); + assert((size_t)nbytes<=size); + size -= nbytes; + addr += (haddr_t)nbytes; + buf = (char*)buf + nbytes; + } + + /* Update current position */ + file->pos = addr; + file->op = OP_READ; + +done: + /* Check for error */ + if(ret_value<0) { + /* Reset last file I/O information */ + file->pos = HADDR_UNDEF; + file->op = OP_UNKNOWN; + } /* end if */ + + FUNC_LEAVE(ret_value); +} /* end H5FD_mpiposix_read() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_write + * + * Purpose: Writes SIZE bytes of data to FILE beginning at address ADDR + * from buffer BUF according to data transfer properties in + * DXPL_ID using potentially complex file and buffer types to + * effect the transfer. + * + * Return: Success: non-negative + * Failure: negative + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t UNUSED dxpl_id, haddr_t addr, + size_t size, const void *buf) +{ + H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; + int mpi_code; /* MPI return code */ + ssize_t nbytes; /* Number of bytes written each I/O call */ + herr_t ret_value=SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(H5FD_mpiposix_write, FAIL); + + assert(file); + assert(H5FD_MPIPOSIX==file->pub.driver_id); + assert(buf); + + /* Check for overflow conditions */ + if (HADDR_UNDEF==addr) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined"); + if (REGION_OVERFLOW(addr, size)) + HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow"); + if (addr+size>file->eoa) + HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow"); + + /* Only p<round> will do the actual write if all procs in comm write same data */ + if ((type!=H5FD_MEM_DRAW) && H5_mpiposix_1_metawrite_g) { + if (file->mpi_rank != file->mpi_round) + HGOTO_DONE(SUCCEED) /* skip the actual write */ + } /* end if */ + + /* Seek to the correct location */ + if ((addr!=file->pos || OP_WRITE!=file->op) && + file_seek(file->fd, (file_offset_t)addr, SEEK_SET)<0) + HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, + "unable to seek to proper position"); + + /* + * Write the data, being careful of interrupted system calls and partial + * results + */ + while (size>0) { + do { + nbytes = HDwrite(file->fd, buf, size); + } while (-1==nbytes && EINTR==errno); + if (-1==nbytes) + HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed"); + assert(nbytes>0); + assert((size_t)nbytes<=size); + size -= nbytes; + addr += (haddr_t)nbytes; + buf = (const char*)buf + nbytes; + } /* end while */ + + /* Update current last file I/O information */ + file->pos = addr; + file->op = OP_WRITE; + +done: + /* Check for error */ + if(ret_value<0) { + /* Reset last file I/O information */ + file->pos = HADDR_UNDEF; + file->op = OP_UNKNOWN; + } /* end if */ + /* Guard against getting into metadata broadcast in failure cases */ + else { + /* if only p<round> writes, need to broadcast the ret_value to other processes */ + if ((type!=H5FD_MEM_DRAW) && H5_mpiposix_1_metawrite_g) { + if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, file->mpi_round, file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code); + + /* Round-robin rotate to the next process */ + file->mpi_round = (++file->mpi_round)%file->mpi_size; +#ifdef QAK + { + int max,min; + + MPI_Allreduce(&file->mpi_round, &max, 1, MPI_INT, MPI_MAX, file->comm); + MPI_Allreduce(&file->mpi_round, &min, 1, MPI_INT, MPI_MIN, file->comm); + if(max!=file->mpi_round) + printf("%s: rank=%d, round=%d, max=%d\n",FUNC,file->mpi_rank,file->mpi_round,max); + if(min!=file->mpi_round) + printf("%s: rank=%d, round=%d, min=%d\n",FUNC,file->mpi_rank,file->mpi_round,min); + } +#endif /* QAK */ + } /* end if */ + } /* end else */ + + FUNC_LEAVE(ret_value); +} /* end H5FD_mpiposix_write() */ + + +/*------------------------------------------------------------------------- + * Function: H5FD_mpiposix_flush + * + * Purpose: Makes sure that all data is on disk. This is collective. + * + * Return: Success: Non-negative + * Failure: Negative + * + * Programmer: Quincey Koziol + * Thursday, July 11, 2002 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD_mpiposix_flush(H5FD_t *_file, unsigned UNUSED closing) +{ + H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file; +#ifdef WIN32 + HFILE filehandle; /* Windows file handle */ + LARGE_INTEGER li; /* 64-bit integer for SetFilePointer() call */ +#endif /* WIN32 */ + int mpi_code; /* MPI return code */ + herr_t ret_value=SUCCEED; + + FUNC_ENTER_NOAPI(H5FD_mpiposix_flush, FAIL); + + assert(file); + assert(H5FD_MPIPOSIX==file->pub.driver_id); + + /* Extend the file to make sure it's large enough */ + if(file->eoa>file->last_eoa) { + /* Use the round-robin process to truncate (extend) the file */ + if(file->mpi_rank == file->mpi_round) { +#ifdef WIN32 + /* Map the posix file handle to a Windows file handle */ + filehandle = _get_osfhandle(fd); + + /* Translate 64-bit integers into form Windows wants */ + /* [This algorithm is from the Windows documentation for SetFilePointer()] */ + li.QuadPart = file->eoa; + SetFilePointer((HANDLE)filehandle,li.LowPart,&li.HighPart,FILE_BEGIN); + if(SetEndOfFile((HANDLE)filehandle)==0) + HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to extend file properly"); +#else /* WIN32 */ + if(-1==file_truncate(file->fd, (file_offset_t)file->eoa)) + HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to extend file properly"); +#endif /* WIN32 */ + } /* end if */ + + /* Don't let any proc return until all have extended the file. + * (Prevents race condition where some processes go ahead and write + * more data to the file before all the processes have finished making + * it the shorter length, potentially truncating the file and dropping + * the new data written) + */ + if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm))) + HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); + + /* Update the 'last' eoa and eof values */ + file->last_eoa=file->eoa; + file->eof = file->eoa; + + /* Reset last file I/O information */ + file->pos = HADDR_UNDEF; + file->op = OP_UNKNOWN; + } /* end if */ + +done: + FUNC_LEAVE(ret_value); +} /* end H5FD_mpiposix_flush() */ + +#endif /*H5_HAVE_PARALLEL*/ + diff --git a/src/H5FDmpiposix.h b/src/H5FDmpiposix.h new file mode 100644 index 0000000..81d05d4 --- /dev/null +++ b/src/H5FDmpiposix.h @@ -0,0 +1,62 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by the Board of Trustees of the University of Illinois. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the files COPYING and Copyright.html. COPYING can be found at the root * + * of the source code distribution tree; Copyright.html can be found at the * + * root level of an installed copy of the electronic HDF5 document set and * + * is linked from the top-level documents page. It can also be found at * + * http://hdf.ncsa.uiuc.edu/HDF5/doc/Copyright.html. If you do not have * + * access to either file, you may request a copy from hdfhelp@ncsa.uiuc.edu. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* + * Programmer: Quincey Koziol <koziol@ncsa.uiuc.edu> + * Thursday, July 11, 2002 + * + * Purpose: The public header file for the mpiposix driver. + */ + +#ifndef __H5FDmpiposix_H +#define __H5FDmpiposix_H + +#include "H5FDpublic.h" +#include "H5Ipublic.h" + +#ifdef H5_HAVE_PARALLEL +# define H5FD_MPIPOSIX (H5FD_mpiposix_init()) +#else +# define H5FD_MPIPOSIX (-1) +#endif + +#ifdef H5_HAVE_PARALLEL + +/* Macros */ + +#define IS_H5FD_MPIPOSIX(f) /* (H5F_t *f) */ \ + (H5FD_MPIPOSIX==H5F_get_driver_id(f)) + +/* Function prototypes */ +#ifdef __cplusplus +extern "C" { +#endif + +__DLL__ hid_t H5FD_mpiposix_init(void); +__DLL__ herr_t H5Pset_fapl_mpiposix(hid_t fapl_id, MPI_Comm comm); +__DLL__ herr_t H5Pget_fapl_mpiposix(hid_t fapl_id, MPI_Comm *comm/*out*/); +__DLL__ MPI_Comm H5FD_mpiposix_communicator(H5FD_t *_file); +__DLL__ herr_t H5FD_mpiposix_closing(H5FD_t *file); +__DLL__ int H5FD_mpiposix_mpi_rank(H5FD_t *_file); +__DLL__ int H5FD_mpiposix_mpi_size(H5FD_t *_file); + +#ifdef __cplusplus +} +#endif + +#endif /*H5_HAVE_PARALLEL*/ + +#endif /* __H5FDmpiposix_H */ + + diff --git a/src/H5Farray.c b/src/H5Farray.c index cefed52..d2f84be 100644 --- a/src/H5Farray.c +++ b/src/H5Farray.c @@ -171,27 +171,29 @@ H5F_arr_read(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, #ifdef H5_HAVE_PARALLEL { - /* Get the transfer mode */ H5FD_mpio_dxpl_t *dx; hid_t driver_id; /* VFL driver ID */ - /* Get the plist structure */ - if(NULL == (plist = H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID"); - - /* Get the driver ID */ - if(H5P_get(plist, H5D_XFER_VFL_ID_NAME, &driver_id)<0) - HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver ID"); - - /* Check if we are using the MPIO driver */ - if(H5FD_MPIO==driver_id) { - /* Get the driver information */ - if(H5P_get(plist, H5D_XFER_VFL_INFO_NAME, &dx)<0) - HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver info"); - - /* Check if we are not using independent I/O */ - if(H5FD_MPIO_INDEPENDENT!=dx->xfer_mode) - xfer_mode = dx->xfer_mode; + /* Get the transfer mode for MPIO transfers */ + if(IS_H5FD_MPIO(f)) { + /* Get the plist structure */ + if(NULL == (plist = H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID"); + + /* Get the driver ID */ + if(H5P_get(plist, H5D_XFER_VFL_ID_NAME, &driver_id)<0) + HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver ID"); + + /* Check if we are using the MPIO driver (for the DXPL) */ + if(H5FD_MPIO==driver_id) { + /* Get the driver information */ + if(H5P_get(plist, H5D_XFER_VFL_INFO_NAME, &dx)<0) + HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver info"); + + /* Check if we are not using independent I/O */ + if(H5FD_MPIO_INDEPENDENT!=dx->xfer_mode) + xfer_mode = dx->xfer_mode; + } /* end if */ } /* end if */ } @@ -412,27 +414,29 @@ H5F_arr_write(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, #ifdef H5_HAVE_PARALLEL { - /* Get the transfer mode */ H5FD_mpio_dxpl_t *dx; hid_t driver_id; /* VFL driver ID */ - /* Get the plist structure */ - if(NULL == (plist = H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID"); - - /* Get the driver ID */ - if(H5P_get(plist, H5D_XFER_VFL_ID_NAME, &driver_id)<0) - HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver ID"); - - /* Check if we are using the MPIO driver */ - if(H5FD_MPIO==driver_id) { - /* Get the driver information */ - if(H5P_get(plist, H5D_XFER_VFL_INFO_NAME, &dx)<0) - HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver info"); - - /* Check if we are not using independent I/O */ - if(H5FD_MPIO_INDEPENDENT!=dx->xfer_mode) - xfer_mode = dx->xfer_mode; + /* Get the transfer mode for MPIO transfers */ + if(IS_H5FD_MPIO(f)) { + /* Get the plist structure */ + if(NULL == (plist = H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID"); + + /* Get the driver ID */ + if(H5P_get(plist, H5D_XFER_VFL_ID_NAME, &driver_id)<0) + HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver ID"); + + /* Check if we are using the MPIO driver (for the DXPL) */ + if(H5FD_MPIO==driver_id) { + /* Get the driver information */ + if(H5P_get(plist, H5D_XFER_VFL_INFO_NAME, &dx)<0) + HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver info"); + + /* Check if we are not using independent I/O */ + if(H5FD_MPIO_INDEPENDENT!=dx->xfer_mode) + xfer_mode = dx->xfer_mode; + } /* end if */ } /* end if */ } diff --git a/src/H5Fistore.c b/src/H5Fistore.c index a44ce5a..a9f7f56 100644 --- a/src/H5Fistore.c +++ b/src/H5Fistore.c @@ -45,8 +45,9 @@ #include "H5Sprivate.h" /* Dataspaces */ #include "H5Vprivate.h" -/* MPIO driver needed for special checks */ +/* MPIO & MPIPOSIX drivers needed for special checks */ #include "H5FDmpio.h" +#include "H5FDmpiposix.h" /* * Feature: If this constant is defined then every cache preemption and load @@ -1781,12 +1782,12 @@ H5F_istore_read(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, #ifdef H5_HAVE_PARALLEL /* - * If MPIO is used and file can be written to, we must bypass the + * If MPIO or MPIPOSIX is used and file can be written to, we must bypass the * chunk-cache scheme because other MPI processes could be writing to * other elements in the same chunk. * Do a direct write-through of only the elements requested. */ - || (IS_H5FD_MPIO(f) && (H5F_ACC_RDWR & f->shared->flags)) + || ((IS_H5FD_MPIO(f) ||IS_H5FD_MPIPOSIX(f)) && (H5F_ACC_RDWR & f->shared->flags)) #endif /* H5_HAVE_PARALLEL */ ) { H5O_layout_t l; /* temporary layout */ @@ -1965,11 +1966,11 @@ H5F_istore_write(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, #ifdef H5_HAVE_PARALLEL /* - * If MPIO is used, must bypass the chunk-cache scheme because other + * If MPIO or MPIPOSIX is used, must bypass the chunk-cache scheme because other * MPI processes could be writing to other elements in the same chunk. * Do a direct write-through of only the elements requested. */ - || (IS_H5FD_MPIO(f) && (H5F_ACC_RDWR & f->shared->flags)) + || ((IS_H5FD_MPIO(f) ||IS_H5FD_MPIPOSIX(f)) && (H5F_ACC_RDWR & f->shared->flags)) #endif /* H5_HAVE_PARALLEL */ ) { H5O_layout_t l; /* temporary layout */ @@ -2416,10 +2417,22 @@ H5F_istore_allocate(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, } /* end if */ /* Retrieve up MPI parameters */ - if ((mpi_rank=H5FD_mpio_mpi_rank(f->shared->lf))<0) - HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI rank"); - if ((mpi_size=H5FD_mpio_mpi_size(f->shared->lf))<0) - HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI size"); + if(IS_H5FD_MPIO(f)) { + if ((mpi_rank=H5FD_mpio_mpi_rank(f->shared->lf))<0) + HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI rank"); + if ((mpi_size=H5FD_mpio_mpi_size(f->shared->lf))<0) + HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI size"); + } /* end if */ + else { + /* Sanity Check */ + assert(IS_H5FD_MPIPOSIX(f)); + + /* Get the MPI rank & size */ + if ((mpi_rank=H5FD_mpiposix_mpi_rank(f->shared->lf))<0) + HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI rank"); + if ((mpi_size=H5FD_mpiposix_mpi_size(f->shared->lf))<0) + HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "Can't retrieve MPI size"); + } /* end else */ /* Loop over all chunks */ carry=0; @@ -2469,8 +2482,17 @@ H5F_istore_allocate(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, * still writing out chunks and other processes race ahead to read * them in, getting bogus data. */ - if (MPI_Barrier(H5FD_mpio_communicator(f->shared->lf))) - HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_Barrier failed"); + if(IS_H5FD_MPIO(f)) { + if (MPI_Barrier(H5FD_mpio_communicator(f->shared->lf))) + HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_Barrier failed"); + } /* end if */ + else { + /* Sanity Check */ + assert(IS_H5FD_MPIPOSIX(f)); + + if (MPI_Barrier(H5FD_mpiposix_communicator(f->shared->lf))) + HGOTO_ERROR(H5E_INTERNAL, H5E_MPI, FAIL, "MPI_Barrier failed"); + } /* end else */ } /* end if */ done: diff --git a/src/H5Fpkg.h b/src/H5Fpkg.h index 10ed39b..189cbc7 100644 --- a/src/H5Fpkg.h +++ b/src/H5Fpkg.h @@ -161,6 +161,7 @@ struct H5F_t { #ifdef H5_HAVE_PARALLEL __DLLVAR__ hbool_t H5_mpi_1_metawrite_g; +__DLLVAR__ hbool_t H5_mpiposix_1_metawrite_g; #endif /* H5_HAVE_PARALLEL */ /* Private functions, not part of the publicly documented API */ diff --git a/src/H5Fseq.c b/src/H5Fseq.c index 0a4da84..8f988ff 100644 --- a/src/H5Fseq.c +++ b/src/H5Fseq.c @@ -26,8 +26,9 @@ #include "H5Pprivate.h" #include "H5Vprivate.h" -/* MPIO driver functions are needed for some special checks */ +/* MPIO & MPIPOSIX driver functions are needed for some special checks */ #include "H5FDmpio.h" +#include "H5FDmpiposix.h" /* Interface initialization */ #define PABLO_MASK H5Fseq_mask @@ -182,27 +183,29 @@ H5F_seq_readv(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, #ifdef H5_HAVE_PARALLEL { - /* Get the transfer mode */ H5FD_mpio_dxpl_t *dx; hid_t driver_id; /* VFL driver ID */ - /* Get the plist structure */ - if(NULL == (plist = H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID"); - - /* Get the driver ID */ - if(H5P_get(plist, H5D_XFER_VFL_ID_NAME, &driver_id)<0) - HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver ID"); - - /* Check if we are using the MPIO driver */ - if(H5FD_MPIO==driver_id) { - /* Get the driver information */ - if(H5P_get(plist, H5D_XFER_VFL_INFO_NAME, &dx)<0) - HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver info"); - - /* Check if we are not using independent I/O */ - if(H5FD_MPIO_INDEPENDENT!=dx->xfer_mode) - xfer_mode = dx->xfer_mode; + /* Get the transfer mode for MPIO transfers */ + if(IS_H5FD_MPIO(f)) { + /* Get the plist structure */ + if(NULL == (plist = H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID"); + + /* Get the driver ID */ + if(H5P_get(plist, H5D_XFER_VFL_ID_NAME, &driver_id)<0) + HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver ID"); + + /* Check if we are using the MPIO driver (for the DXPL) */ + if(H5FD_MPIO==driver_id) { + /* Get the driver information */ + if(H5P_get(plist, H5D_XFER_VFL_INFO_NAME, &dx)<0) + HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver info"); + + /* Check if we are not using independent I/O */ + if(H5FD_MPIO_INDEPENDENT!=dx->xfer_mode) + xfer_mode = dx->xfer_mode; + } /* end if */ } /* end if */ } @@ -564,27 +567,29 @@ H5F_seq_writev(H5F_t *f, hid_t dxpl_id, const H5O_layout_t *layout, #ifdef H5_HAVE_PARALLEL { - /* Get the transfer mode */ H5FD_mpio_dxpl_t *dx; hid_t driver_id; /* VFL driver ID */ - /* Get the plist structure */ - if(NULL == (plist = H5I_object(dxpl_id))) - HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID"); - - /* Get the driver ID */ - if(H5P_get(plist, H5D_XFER_VFL_ID_NAME, &driver_id)<0) - HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver ID"); - - /* Check if we are using the MPIO driver */ - if(H5FD_MPIO==driver_id) { - /* Get the driver information */ - if(H5P_get(plist, H5D_XFER_VFL_INFO_NAME, &dx)<0) - HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver info"); - - /* Check if we are not using independent I/O */ - if(H5FD_MPIO_INDEPENDENT!=dx->xfer_mode) - xfer_mode = dx->xfer_mode; + /* Get the transfer mode for MPIO transfers */ + if(IS_H5FD_MPIO(f)) { + /* Get the plist structure */ + if(NULL == (plist = H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ATOM, H5E_BADATOM, FAIL, "can't find object for ID"); + + /* Get the driver ID */ + if(H5P_get(plist, H5D_XFER_VFL_ID_NAME, &driver_id)<0) + HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver ID"); + + /* Check if we are using the MPIO driver (for the DXPL) */ + if(H5FD_MPIO==driver_id) { + /* Get the driver information */ + if(H5P_get(plist, H5D_XFER_VFL_INFO_NAME, &dx)<0) + HGOTO_ERROR (H5E_PLIST, H5E_CANTGET, FAIL, "Can't retrieve VFL driver info"); + + /* Check if we are not using independent I/O */ + if(H5FD_MPIO_INDEPENDENT!=dx->xfer_mode) + xfer_mode = dx->xfer_mode; + } /* end if */ } /* end if */ } diff --git a/src/Makefile.in b/src/Makefile.in index 8875f15..78aef12 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -20,8 +20,8 @@ CLEAN=libhdf5.settings ## Source and object files for the library (lexicographically)... LIB_SRC=H5.c H5A.c H5AC.c H5B.c H5D.c H5E.c H5F.c H5Farray.c H5Fcontig.c \ - H5Fistore.c H5Fseq.c H5FD.c H5FDsec2.c H5FDfamily.c H5FDmpio.c H5FDcore.c \ - H5FDmulti.c H5FDgass.c H5FDlog.c H5FDsrb.c H5FDstdio.c \ + H5Fistore.c H5Fseq.c H5FD.c H5FDcore.c H5FDfamily.c H5FDgass.c H5FDlog.c \ + H5FDmpio.c H5FDmpiposix.c H5FDmulti.c H5FDsec2.c H5FDsrb.c H5FDstdio.c \ H5FDstream.c H5FL.c H5G.c H5Gent.c H5Gnode.c H5Gstab.c H5HG.c H5HL.c H5I.c \ H5MF.c H5MM.c H5O.c H5Oattr.c H5Ocomp.c H5Ocont.c H5Odtype.c H5Oefl.c \ H5Ofill.c H5Olayout.c H5Omtime.c H5Oname.c H5Onull.c H5Osdspace.c \ @@ -37,11 +37,11 @@ MOSTLYCLEAN=H5detect.o H5detect.lo H5detect H5Tinit.o H5Tinit.lo H5Tinit.c ## Public header files (to be installed)... PUB_HDR=H5public.h H5Apublic.h H5ACpublic.h H5Bpublic.h H5Dpublic.h H5Epublic.h \ - H5Fpublic.h H5FDpublic.h H5FDfamily.h H5FDgass.h H5FDmpio.h \ - H5FDlog.h H5FDsec2.h H5FDsrb.h H5FDstream.h H5FDcore.h H5FDmulti.h \ - H5FDstdio.h H5Gpublic.h H5HGpublic.h H5HLpublic.h H5Ipublic.h H5MMpublic.h \ - H5Opublic.h H5Ppublic.h H5Rpublic.h H5Spublic.h H5Tpublic.h H5Zpublic.h \ - H5pubconf.h hdf5.h H5api_adpt.h + H5Fpublic.h H5FDpublic.h H5FDcore.h H5FDfamily.h H5FDgass.h H5FDlog.h \ + H5FDmpio.h H5FDmpiposix.h H5FDmulti.h H5FDsec2.h H5FDsrb.h H5FDstdio.h \ + H5FDstream.h H5Gpublic.h H5HGpublic.h H5HLpublic.h H5Ipublic.h \ + H5MMpublic.h H5Opublic.h H5Ppublic.h H5Rpublic.h H5Spublic.h H5Tpublic.h \ + H5Zpublic.h H5pubconf.h hdf5.h H5api_adpt.h ## Other header files (not to be installed)... PRIVATE_HDR=H5private.h H5Aprivate.h H5Apkg.h H5ACprivate.h H5Bprivate.h \ @@ -41,13 +41,14 @@ /* Predefined file drivers */ #include "H5FDcore.h" /* Files stored entirely in memory */ #include "H5FDfamily.h" /* File families */ +#include "H5FDgass.h" /* Remote files using GASS I/O */ +#include "H5FDlog.h" /* sec2 driver with I/O logging (for debugging) */ #include "H5FDmpio.h" /* Parallel files using MPI-2 I/O */ +#include "H5FDmpiposix.h" /* Parallel files using combination MPI-2 & posix I/O */ +#include "H5FDmulti.h" /* Usage-partitioned file family */ #include "H5FDsec2.h" /* POSIX unbuffered file I/O */ -#include "H5FDstdio.h" /* Standard C buffered I/O */ #include "H5FDsrb.h" /* Remote access using SRB */ -#include "H5FDgass.h" /* Remote files using GASS I/O */ -#include "H5FDstream.h" /* in-memory files streamed via sockets */ -#include "H5FDmulti.h" /* Usage-partitioned file family */ -#include "H5FDlog.h" /* sec2 driver with I/O logging (for debugging) */ +#include "H5FDstdio.h" /* Standard C buffered I/O */ +#include "H5FDstream.h" /* In-memory files streamed via sockets */ #endif |