/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * Copyright by The HDF Group.                                               *
 * Copyright by the Board of Trustees of the University of Illinois.         *
 * All rights reserved.                                                      *
 *                                                                           *
 * This file is part of HDF5.  The full HDF5 copyright notice, including     *
 * terms governing use, modification, and redistribution, is contained in    *
 * the files COPYING and Copyright.html.  COPYING can be found at the root   *
 * of the source code distribution tree; Copyright.html can be found at the  *
 * root level of an installed copy of the electronic HDF5 document set and   *
 * is linked from the top-level documents page.  It can also be found at     *
 * http://hdfgroup.org/HDF5/doc/Copyright.html.  If you do not have          *
 * access to either file, you may request a copy from help@hdfgroup.org.     *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/*
 * Programmer:  Quincey Koziol <koziol@hdfgroup.org>
 *              Thursday, July 11, 2002
 *
 * Purpose:     This is a "combination" MPI-2 and posix I/O driver.
 *              It uses MPI for coordinating the actions of several processes
 *              and posix I/O calls to do the actual I/O to the disk.
 *
 *              This driver was derived from the H5FDmpio.c driver and may
 *              share bugs/quirks/etc.
 *
 * Limitations:
 *              There is no "collective" I/O mode with this driver.
 *
 *              This will almost certainly _not_ work correctly for files
 *              accessed on distributed parallel systems with the file located
 *              on a non-parallel filesystem.
 *
 */

/* Interface initialization */
#define H5_INTERFACE_INIT_FUNC  H5FD_mpiposix_init_interface


#include "H5private.h"      /* Generic Functions            */
#include "H5ACprivate.h"    /* Metadata cache               */
#include "H5Eprivate.h"     /* Error handling               */
#include "H5Fprivate.h"     /* File access                  */
#include "H5FDprivate.h"    /* File drivers                 */
#include "H5FDmpi.h"        /* MPI-based file drivers       */
#include "H5Iprivate.h"     /* IDs                          */
#include "H5MMprivate.h"    /* Memory management            */
#include "H5Pprivate.h"     /* Property lists               */

/* Features:
 *   H5_HAVE_GPFS   -- issue gpfs_fcntl() calls to hopefully improve
 *                     performance when accessing files on a GPFS
 *                     file system.
 *
 *   REPORT_IO      -- if set then report all POSIX file calls to stderr.
 *
 */
/* #define REPORT_IO */

#ifdef H5_HAVE_GPFS
#   include <gpfs_fcntl.h>
#endif

#ifdef H5_HAVE_PARALLEL

/*
 * The driver identification number, initialized at runtime if H5_HAVE_PARALLEL
 * is defined. This allows applications to still have the H5FD_MPIPOSIX
 * "constants" in their source code (it also makes this file strictly ANSI
 * compliant when H5_HAVE_PARALLEL isn't defined)
 */
static hid_t H5FD_MPIPOSIX_g = 0;

/*
 * The description of a file belonging to this driver.
 * The EOF value is only used just after the file is opened in order for the
 * library to determine whether the file is empty, truncated, or okay. The
 * MPIPOSIX driver doesn't bother to keep it updated since it's an expensive
 * operation.
 */
typedef struct H5FD_mpiposix_t {
    H5FD_t          pub;            /* public stuff, must be first          */
    int             fd;             /* the unix file handle                 */
    MPI_Comm        comm;           /* communicator                         */
    int             mpi_rank;       /* This process's rank                  */
    int             mpi_size;       /* Total number of processes            */
    haddr_t         eof;            /* end-of-file marker                   */
    haddr_t         eoa;            /* end-of-address marker                */
    haddr_t         last_eoa;       /* Last known end-of-address marker     */
    haddr_t         pos;            /* Current file I/O position            */
    H5FD_file_op_t  op;             /* Last file I/O operation              */
    hsize_t         naccess;        /* Number of (write) accesses to file   */
#ifdef H5_HAVE_GPFS
    size_t          blksize;        /* Block size of file system            */
#endif
    hbool_t         use_gpfs;       /* Use GPFS to write things             */
#ifndef H5_HAVE_WIN32_API
    /* On most systems the combination of device and i-node number uniquely
     * identify a file.  Note that Cygwin, MinGW and other Windows POSIX
     * environments have the stat function (which fakes inodes)
     * and will use the 'device + inodes' scheme as opposed to the
     * Windows code further below.
     */
    dev_t           device;     /* file device number   */
#ifdef H5_VMS
    ino_t           inode[3];   /* file i-node number   */
#else
    ino_t           inode;      /* file i-node number   */
#endif /* H5_VMS */
#else
    /* Files in windows are uniquely identified by the volume serial
     * number and the file index (both low and high parts).
     *
     * There are caveats where these numbers can change, especially
     * on FAT file systems.  On NTFS, however, a file should keep
     * those numbers the same until renamed or deleted (though you
     * can use ReplaceFile() on NTFS to keep the numbers the same
     * while renaming).
     *
     * See the MSDN "BY_HANDLE_FILE_INFORMATION Structure" entry for
     * more information.
     *
     * http://msdn.microsoft.com/en-us/library/aa363788(v=VS.85).aspx
     */
    DWORD           nFileIndexLow;
    DWORD           nFileIndexHigh;
    DWORD           dwVolumeSerialNumber;
    
    HANDLE          hFile;      /* Native windows file handle */
#endif  /* H5_HAVE_WIN32_API */
} H5FD_mpiposix_t;

/*
 * These macros check for overflow of various quantities.  These macros
 * assume that HDoff_t is signed and haddr_t and size_t are unsigned.
 *
 * ADDR_OVERFLOW:   Checks whether a file address of type `haddr_t'
 *                  is too large to be represented by the second argument
 *                  of the file seek function.
 *
 * SIZE_OVERFLOW:   Checks whether a buffer size of type `hsize_t' is too
 *                  large to be represented by the `size_t' type.
 *
 * REGION_OVERFLOW: Checks whether an address and size pair describe data
 *                  which can be addressed entirely by the second
 *                  argument of the file seek function.
 */
#define MAXADDR                 (((haddr_t)1 << (8*sizeof(HDoff_t) - 1)) - 1)
#define ADDR_OVERFLOW(A)        (HADDR_UNDEF == (A) || ((A) & ~(haddr_t)MAXADDR))
#define SIZE_OVERFLOW(Z)        ((Z) & ~(hsize_t)MAXADDR)
#define REGION_OVERFLOW(A,Z)    (ADDR_OVERFLOW(A) || SIZE_OVERFLOW(Z) ||    \
                                    HADDR_UNDEF == (A) + (Z) ||             \
                                    (HDoff_t)((A) + (Z)) < (HDoff_t)(A))

/* Callbacks */
static herr_t H5FD_mpiposix_term(void);
static void *H5FD_mpiposix_fapl_get(H5FD_t *_file);
static void *H5FD_mpiposix_fapl_copy(const void *_old_fa);
static herr_t H5FD_mpiposix_fapl_free(void *_fa);
static H5FD_t *H5FD_mpiposix_open(const char *name, unsigned flags, hid_t fapl_id,
            haddr_t maxaddr);
static herr_t H5FD_mpiposix_close(H5FD_t *_file);
static int H5FD_mpiposix_cmp(const H5FD_t *_f1, const H5FD_t *_f2);
static herr_t H5FD_mpiposix_query(const H5FD_t *_f1, unsigned long *flags);
static haddr_t H5FD_mpiposix_get_eoa(const H5FD_t *_file, H5FD_mem_t UNUSED type);
static herr_t H5FD_mpiposix_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr);
static haddr_t H5FD_mpiposix_get_eof(const H5FD_t *_file);
static herr_t  H5FD_mpiposix_get_handle(H5FD_t *_file, hid_t fapl, void** file_handle);
static herr_t H5FD_mpiposix_read(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr,
        size_t size, void *buf);
static herr_t H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr,
        size_t size, const void *buf);
static herr_t H5FD_mpiposix_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
static int H5FD_mpiposix_mpi_rank(const H5FD_t *_file);
static int H5FD_mpiposix_mpi_size(const H5FD_t *_file);
static MPI_Comm H5FD_mpiposix_communicator(const H5FD_t *_file);

/* MPIPOSIX-specific file access properties */
typedef struct H5FD_mpiposix_fapl_t {
    hbool_t     use_gpfs;   /* use GPFS hints       */
    MPI_Comm    comm;       /* communicator         */
} H5FD_mpiposix_fapl_t;

/* The MPIPOSIX file driver information */
static const H5FD_class_mpi_t H5FD_mpiposix_g = {
    {   /* Start of superclass information */
    "mpiposix",                     /* name             */
    MAXADDR,                        /* maxaddr          */
    H5F_CLOSE_SEMI,                 /* fc_degree        */
    H5FD_mpiposix_term,             /* terminate        */
    NULL,                           /* sb_size          */
    NULL,                           /* sb_encode        */
    NULL,                           /* sb_decode        */
    sizeof(H5FD_mpiposix_fapl_t),   /* fapl_size        */
    H5FD_mpiposix_fapl_get,         /* fapl_get         */
    H5FD_mpiposix_fapl_copy,        /* fapl_copy        */
    H5FD_mpiposix_fapl_free,        /* fapl_free        */
    0,                              /* dxpl_size        */
    NULL,                           /* dxpl_copy        */
    NULL,                           /* dxpl_free        */
    H5FD_mpiposix_open,             /* open             */
    H5FD_mpiposix_close,            /* close            */
    H5FD_mpiposix_cmp,              /* cmp              */
    H5FD_mpiposix_query,            /* query            */
    NULL,                           /* get_type_map     */
    NULL,                           /* alloc            */
    NULL,                           /* free             */
    H5FD_mpiposix_get_eoa,          /* get_eoa          */
    H5FD_mpiposix_set_eoa,          /* set_eoa          */
    H5FD_mpiposix_get_eof,          /* get_eof          */
    H5FD_mpiposix_get_handle,       /* get_handle       */
    H5FD_mpiposix_read,             /* read             */
    H5FD_mpiposix_write,            /* write            */
    NULL,                           /* flush            */
    H5FD_mpiposix_truncate,         /* truncate         */
    NULL,                           /* lock             */
    NULL,                           /* unlock           */
    H5FD_FLMAP_DICHOTOMY            /* fl_map           */
    },  /* End of superclass information */
    H5FD_mpiposix_mpi_rank,         /* get_rank         */
    H5FD_mpiposix_mpi_size,         /* get_size         */
    H5FD_mpiposix_communicator      /* get_comm         */
};


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_init_interface
 *
 * Purpose:     Initializes any interface-specific data or routines. 
 *
 * Return:      Success:    The driver ID for the mpiposix driver.
 *              Failure:    Negative.
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_init_interface(void)
{
    FUNC_ENTER_NOAPI_NOINIT_NOERR

    FUNC_LEAVE_NOAPI(H5FD_mpiposix_init())
} /* H5FD_mpiposix_init_interface() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_init
 *
 * Purpose:     Initialize this driver by registering the driver with the
 *              library.
 *
 * Return:      Success:    The driver ID for the mpiposix driver.
 *              Failure:    Negative.
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
hid_t
H5FD_mpiposix_init(void)
{
    hid_t ret_value = H5FD_MPIPOSIX_g;    /* Return value */

    FUNC_ENTER_NOAPI(FAIL)

    if (H5I_VFL != H5Iget_type(H5FD_MPIPOSIX_g))
        H5FD_MPIPOSIX_g = H5FD_register((const H5FD_class_t *)&H5FD_mpiposix_g,sizeof(H5FD_class_mpi_t),FALSE);

    /* Set return value */
    ret_value = H5FD_MPIPOSIX_g;

done:
    FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpiposix_init() */


/*---------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_term
 *
 * Purpose:     Shut down the VFD
 *
 * Returns:     SUCCEED (can't fail)
 *
 * Programmer:  Quincey Koziol
 *              Friday, Jan 30, 2004
 *
 *---------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_term(void)
{
    FUNC_ENTER_NOAPI_NOINIT_NOERR

    /* Reset VFL ID */
    H5FD_MPIPOSIX_g = 0;

    FUNC_LEAVE_NOAPI(SUCCEED)
} /* end H5FD_mpiposix_term() */


/*-------------------------------------------------------------------------
 * Function:    H5Pset_fapl_mpiposix
 *
 * Purpose:     Store the user supplied MPI communicator COMM in
 *              the file access property list FAPL_ID which can then be used
 *              to create and/or open the file.  This function is available
 *              only in the parallel HDF5 library and is not collective.
 *
 *              comm is the MPI communicator to be used for file open as
 *              defined in MPI_FILE_OPEN of MPI-2. This function makes a
 *              duplicate of comm. Any modification to comm after this function
 *              call returns has no effect on the access property list.
 *
 *              If fapl_id has previously set comm value, it will be replaced
 *              and the old communicator is freed.
 *
 * Return:      SUCCEED/FAIL
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
herr_t
H5Pset_fapl_mpiposix(hid_t fapl_id, MPI_Comm comm, hbool_t use_gpfs)
{
    H5FD_mpiposix_fapl_t    fa;
    H5P_genplist_t          *plist;      /* Property list pointer */
    herr_t                  ret_value;

    FUNC_ENTER_API(FAIL)
    H5TRACE3("e", "iMcb", fapl_id, comm, use_gpfs);

    /* Check arguments */
    if(NULL == (plist = H5P_object_verify(fapl_id,H5P_FILE_ACCESS)))
        HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list")
    if (MPI_COMM_NULL == comm)
        HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a valid communicator")

    /* Initialize driver specific properties */
    fa.comm = comm;
    fa.use_gpfs = use_gpfs;

    /* duplication is done during driver setting. */
    ret_value = H5P_set_driver(plist, H5FD_MPIPOSIX, &fa);

done:
    FUNC_LEAVE_API(ret_value)
} /* end H5Pset_fapl_mpiposix() */


/*-------------------------------------------------------------------------
 * Function:    H5Pget_fapl_mpiposix
 *
 * Purpose:     If the file access property list is set to the H5FD_MPIPOSIX
 *              driver then this function returns a duplicate of the MPI
 *              communicator through the comm pointer. It is the responsibility
 *              of the application to free the returned communicator.
 *
 * Return:      Success:    Non-negative with the communicator and
 *                          information returned through the COMM
 *                          argument if non-null.  Since it is a duplicate
 *                          of the stored object, future modifications to
 *                          the access property list do not affect it and
 *                          it is the responsibility of the application to
 *                          free it.
 *              Failure:    Negative
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
herr_t
H5Pget_fapl_mpiposix(hid_t fapl_id, MPI_Comm *comm/*out*/, hbool_t *use_gpfs/*out*/)
{
    H5FD_mpiposix_fapl_t    *fa;
    H5P_genplist_t          *plist;                 /* Property list pointer    */
    int                     mpi_code;               /* mpi return code          */
    herr_t                  ret_value = SUCCEED;    /* Return value             */

    FUNC_ENTER_API(FAIL)
    H5TRACE3("e", "ixx", fapl_id, comm, use_gpfs);

    if(NULL == (plist = H5P_object_verify(fapl_id,H5P_FILE_ACCESS)))
        HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list")
    if (H5FD_MPIPOSIX != H5P_get_driver(plist))
        HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "incorrect VFL driver")
    if (NULL == (fa = H5P_get_driver_info(plist)))
        HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "bad VFL driver info")

    /* Get MPI Communicator */
    if (comm){
        if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(fa->comm, comm)))
            HMPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code)
    }

    if (use_gpfs)
        *use_gpfs = fa->use_gpfs;

done:
    FUNC_LEAVE_API(ret_value)
} /* end H5Pget_fapl_mpiposix() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_fapl_get
 *
 * Purpose:     Returns a file access property list which could be used to
 *              create another file the same as this one.
 *
 * Return:      Success:    Ptr to new file access property list with all
 *                          fields copied from the file pointer.
 *              Failure:    NULL
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static void *
H5FD_mpiposix_fapl_get(H5FD_t *_file)
{
    H5FD_mpiposix_t         *file = (H5FD_mpiposix_t*)_file;
    H5FD_mpiposix_fapl_t    *fa = NULL;
    int                     mpi_code;       /* MPI return code */
    void                    *ret_value;     /* Return value */

    FUNC_ENTER_NOAPI_NOINIT

    HDassert(file);
    HDassert(H5FD_MPIPOSIX == file->pub.driver_id);

    if (NULL == (fa = H5MM_calloc(sizeof(H5FD_mpiposix_fapl_t))))
        HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed")

    /* Duplicate the communicator. */
    if (MPI_SUCCESS != (mpi_code=MPI_Comm_dup(file->comm, &fa->comm)))
        HMPI_GOTO_ERROR(NULL, "MPI_Comm_dup failed", mpi_code)

    fa->use_gpfs = file->use_gpfs;

    /* Set return value */
    ret_value=fa;

done:
    FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpiposix_fapl_get() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_fapl_copy
 *
 * Purpose:     Copies the mpiposix-specific file access properties.
 *
 * Return:      Success:    Ptr to a new property list
 *              Failure:    NULL
 *
 * Programmer:  Albert Cheng
 *              Apr 24, 2003
 *
 *-------------------------------------------------------------------------
 */
static void *
H5FD_mpiposix_fapl_copy(const void *_old_fa)
{
    void    *ret_value = NULL;
    const H5FD_mpiposix_fapl_t *old_fa = (const H5FD_mpiposix_fapl_t*)_old_fa;
    H5FD_mpiposix_fapl_t  *new_fa = NULL;
    int    mpi_code;  /* MPI return code */

    FUNC_ENTER_NOAPI_NOINIT

    if (NULL == (new_fa = H5MM_malloc(sizeof(H5FD_mpiposix_fapl_t))))
        HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed")

    /* Copy the general information */
    HDmemcpy(new_fa, old_fa, sizeof(H5FD_mpiposix_fapl_t));

    /* Duplicate communicator. */
    if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(old_fa->comm, &new_fa->comm)))
        HMPI_GOTO_ERROR(NULL, "MPI_Comm_dup failed", mpi_code)

    new_fa->use_gpfs = old_fa->use_gpfs;
    ret_value = new_fa;

done:
    if (NULL == ret_value){
        /* cleanup */
        if (new_fa)
            H5MM_xfree(new_fa);
    }

    FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpiposix_fapl_copy() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_fapl_free
 *
 * Purpose:     Frees the mpiposix-specific file access properties.
 *
 * Return:      SUCCEED (can't fail)
 *
 * Programmer:  Albert Cheng
 *              Apr 24, 2003
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_fapl_free(void *_fa)
{
    H5FD_mpiposix_fapl_t  *fa = (H5FD_mpiposix_fapl_t*)_fa;

    FUNC_ENTER_NOAPI_NOINIT_NOERR

    HDassert(fa);

    /* Free the internal communicator */
    HDassert(MPI_COMM_NULL != fa->comm);
    MPI_Comm_free(&fa->comm);
    H5MM_xfree(fa);

    FUNC_LEAVE_NOAPI(SUCCEED)
} /* end H5FD_mpiposix_fapl_free() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_open
 *
 * Purpose:     Opens a file with name NAME.  The FLAGS are a bit field with
 *              purpose similar to the second argument of open(2) and which
 *              are defined in H5Fpublic.h. The file access property list
 *              FAPL_ID contains the properties driver properties and MAXADDR
 *              is the largest address which this file will be expected to
 *              access.  This is collective.
 *
 * Return:      Success:    A new file pointer.
 *              Failure:    NULL
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static H5FD_t *
H5FD_mpiposix_open(const char *name, unsigned flags, hid_t fapl_id,
         haddr_t maxaddr)
{
    H5FD_mpiposix_t *file = NULL;   /* New MPIPOSIX file struct         */
    int             o_flags;        /* Flags for file open call         */
    int             fd = -1;        /* File handle for file opened      */
    int             mpi_rank;       /* MPI rank of this process         */
    int             mpi_size;       /* Total number of MPI processes    */
    int             mpi_code;       /* mpi return code                  */
    const H5FD_mpiposix_fapl_t  *fa = NULL;     /* MPIPOSIX file access property list information                   */
    H5FD_mpiposix_fapl_t        _fa;            /* Private copy of default file access property list information    */
    H5P_genplist_t  *plist;         /* Property list pointer            */
    h5_stat_t       sb;             /* Portable 'stat' struct           */
#ifdef H5_HAVE_WIN32_API
    struct _BY_HANDLE_FILE_INFORMATION fileinfo;
#endif
    H5FD_t          *ret_value = NULL; /* Return value */
    MPI_Comm        comm_dup = MPI_COMM_NULL;

    FUNC_ENTER_NOAPI_NOINIT

    /* Check arguments */
    if (!name || !*name)
        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, NULL, "invalid file name")
    if (0 == maxaddr || HADDR_UNDEF == maxaddr)
        HGOTO_ERROR(H5E_ARGS, H5E_BADRANGE, NULL, "bogus maxaddr")
    if (ADDR_OVERFLOW(maxaddr))
        HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, NULL, "bogus maxaddr")

    /* Obtain a pointer to mpiposix-specific file access properties */
    if(NULL == (plist = H5P_object_verify(fapl_id,H5P_FILE_ACCESS)))
        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list")
    if (H5P_FILE_ACCESS_DEFAULT == fapl_id || H5FD_MPIPOSIX != H5P_get_driver(plist)) {
        _fa.comm = MPI_COMM_SELF; /*default*/
        _fa.use_gpfs = FALSE;
        fa = &_fa;
    } /* end if */
    else {
        if(NULL == (fa = (const H5FD_mpiposix_fapl_t *)H5P_get_driver_info(plist)))
            HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, NULL, "bad VFL driver info")
    } /* end else */

    /* Duplicate the communicator for use by this file. */
    if (MPI_SUCCESS != (mpi_code = MPI_Comm_dup(fa->comm, &comm_dup)))
        HMPI_GOTO_ERROR(NULL, "MPI_Comm_dup failed", mpi_code)

    /* Get the MPI rank of this process and the total number of processes */
    if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank (comm_dup, &mpi_rank)))
        HMPI_GOTO_ERROR(NULL, "MPI_Comm_rank failed", mpi_code)
    if (MPI_SUCCESS != (mpi_code = MPI_Comm_size (comm_dup, &mpi_size)))
        HMPI_GOTO_ERROR(NULL, "MPI_Comm_size failed", mpi_code)

    /* Build the open flags */
    o_flags = (H5F_ACC_RDWR & flags) ? O_RDWR : O_RDONLY;

    /* Only set the creation flag(s) for process 0 */
    if(0 == mpi_rank) {
        if (H5F_ACC_TRUNC & flags)
            o_flags |= O_TRUNC;
        if (H5F_ACC_CREAT & flags)
            o_flags |= O_CREAT;
        if (H5F_ACC_EXCL & flags)
            o_flags |= O_EXCL;
    } /* end if */

    /* Process 0 opens (or creates) the file while the rest of the
     * processes wait.  Then process 0 signals the other processes and they
     * open (never create) the file and all processes proceed.
     */
    /* Process 0 opens (or creates) file and broadcasts result to other processes */
    if(0 == mpi_rank) {
        /* Open the file */
        fd = HDopen(name, o_flags, 0666);
    } /* end if */

    /* Broadcast the results of the open() from process 0
     *
     * This is necessary because of the "tentative open" code in H5F_open()
     * where the file is attempted to be opened with different flags from the
     * user's, in order to check for the file's existence, etc.  Here, process 0
     * gets different flags from the other processes (since it is in charge of
     * creating the file, if necessary) and can fail in situations where the
     * other process's file opens would succeed, so allow the other processes
     * to check for that situation and bail out now also. - QAK
     */
    if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&fd, sizeof(int), MPI_BYTE, 0, comm_dup)))
        HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code)

    /* If the file open on process 0 failed, bail out on all processes now */
    if(fd < 0)
        HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file")

    /* Other processes (non 0) wait for broadcast result from process 0 and then open file */
    if(mpi_rank != 0) {
        /* Open the file */
        if ((fd = HDopen(name, o_flags, 0666)) < 0)
            HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file")
    } /* end if */

    /* Process 0 fstat()s the file and broadcasts the results to the other processes */
    if(0 == mpi_rank) {
        /* Get the stat information */
        if (HDfstat(fd, &sb) < 0)
            HGOTO_ERROR(H5E_FILE, H5E_BADFILE, NULL, "unable to fstat file")
    } /* end if */

    /* Broadcast the results of the fstat() from process 0 */
    if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&sb, sizeof(h5_stat_t), MPI_BYTE, 0, comm_dup)))
        HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code)

#ifdef H5_HAVE_GPFS
    if (fa->use_gpfs) {
        /*
         * Free all byte range tokens. This is a good thing to do if raw data
         * is aligned on 256kB boundaries (a GPFS page is 256kB). Care should
         * be taken that there aren't too many sub-page writes, or the mmfsd
         * may become overwhelmed.  This should probably eventually be passed
         * down here as a property. The gpfs_fcntl() will most likely fail if
         * 'fd' isn't on a GPFS file system. */
        struct {
            gpfsFcntlHeader_t   hdr;
            gpfsFreeRange_t     fr;
        } hint;
        HDmemset(&hint, 0, sizeof hint);
        hint.hdr.totalLength = sizeof hint;
        hint.hdr.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
        hint.fr.structLen = sizeof hint.fr;
        hint.fr.structType = GPFS_FREE_RANGE;
        hint.fr.start = 0;
        hint.fr.length = 0;

        if (gpfs_fcntl(fd, &hint) < 0)
            HGOTO_ERROR(H5E_FILE, H5E_FCNTL, NULL, "failed to send hints to GPFS")
    }
#endif  /* H5_HAVE_GPFS */

    /* Build the file struct and initialize it */
    if (NULL == (file=H5MM_calloc(sizeof(H5FD_mpiposix_t))))
        HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed")

#ifdef REPORT_IO
    HDfprintf(stderr, "open:  rank=%d name=%s file=0x%08lx\n", mpi_rank, name, (unsigned long)file);
#endif

    /* Set the general file information */
    file->fd = fd;
    file->eof = sb.st_size;

    /* for H5_HAVE_WIN32_API support. H5_HAVE_WIN32_API 'stat' does not have
     * st_blksize and st_blksize is only used for the H5_HAVE_GPFS case.
     */
#ifdef H5_HAVE_GPFS
    file->blksize = sb.st_blksize;
#endif

    /* Set this field in the H5FD_mpiposix_t struct for later use */
    file->use_gpfs = fa->use_gpfs;

    /* Set the MPI information */
    file->comm = comm_dup;
    file->mpi_rank = mpi_rank;
    file->mpi_size = mpi_size;

    /* Reset the last file I/O operation */
    file->pos = HADDR_UNDEF;
    file->op = OP_UNKNOWN;

    /* Set the information for the file's device and inode */
#ifdef H5_HAVE_WIN32_API
    file->hFile = (HANDLE)_get_osfhandle(fd);
    if(INVALID_HANDLE_VALUE == file->hFile)
        HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to get Windows file handle")

    if(!GetFileInformationByHandle((HANDLE)file->hFile, &fileinfo))
        HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to get Windows file information")

    file->nFileIndexHigh = fileinfo.nFileIndexHigh;
    file->nFileIndexLow = fileinfo.nFileIndexLow;
    file->dwVolumeSerialNumber = fileinfo.dwVolumeSerialNumber;
#else /* H5_HAVE_WIN32_API */
    file->device = sb.st_dev;
#ifdef H5_VMS
    file->inode[0] = sb.st_ino[0];
    file->inode[1] = sb.st_ino[1];
    file->inode[2] = sb.st_ino[2];
#else /* H5_VMS */
    file->inode = sb.st_ino;
#endif /* H5_VMS */
#endif /* H5_HAVE_WIN32_API */

    /* Indicate success */
    ret_value = (H5FD_t *)file;

done:
    /* Error cleanup */
    if(NULL == ret_value) {
        /* Close the file if it was left open */
        if(fd != -1)
            HDclose(fd);
        if (MPI_COMM_NULL != comm_dup)
            MPI_Comm_free(&comm_dup);
    } /* end if */

    FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpiposix_open() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_close
 *
 * Purpose:     Closes a file.
 *
 * Return:      SUCCEED/FAIL
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_close(H5FD_t *_file)
{
    H5FD_mpiposix_t     *file = (H5FD_mpiposix_t*)_file;
    herr_t              ret_value = SUCCEED;       /* Return value */

    FUNC_ENTER_NOAPI_NOINIT

    HDassert(file);
    HDassert(H5FD_MPIPOSIX == file->pub.driver_id);

    /* Close the unix file */
    if(HDclose(file->fd) < 0)
        HGOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close file")

    /* make sure all processes have closed the file before returning. */
    MPI_Barrier(file->comm);
    /* Clean up other stuff */
    MPI_Comm_free(&file->comm);
    H5MM_xfree(file);

done:
    FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpiposix_close() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_cmp
 *
 * Purpose:     Compares two files belonging to this driver using an
 *              arbitrary (but consistent) ordering.
 *
 * Return:      Success:    A value like strcmp()
 *              Failure:    never fails (arguments were checked by the caller).
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static int
H5FD_mpiposix_cmp(const H5FD_t *_f1, const H5FD_t *_f2)
{
    const H5FD_mpiposix_t  *f1 = (const H5FD_mpiposix_t*)_f1;
    const H5FD_mpiposix_t  *f2 = (const H5FD_mpiposix_t*)_f2;
    int ret_value = 0;

    FUNC_ENTER_NOAPI_NOINIT_NOERR

#ifdef H5_HAVE_WIN32_API
    if(f1->dwVolumeSerialNumber < f2->dwVolumeSerialNumber) HGOTO_DONE(-1)
    if(f1->dwVolumeSerialNumber > f2->dwVolumeSerialNumber) HGOTO_DONE(1)

    if(f1->nFileIndexHigh < f2->nFileIndexHigh) HGOTO_DONE(-1)
    if(f1->nFileIndexHigh > f2->nFileIndexHigh) HGOTO_DONE(1)

    if(f1->nFileIndexLow < f2->nFileIndexLow) HGOTO_DONE(-1)
    if(f1->nFileIndexLow > f2->nFileIndexLow) HGOTO_DONE(1)
#else /* H5_HAVE_WIN32_API */
#ifdef H5_DEV_T_IS_SCALAR
    if(f1->device < f2->device) HGOTO_DONE(-1)
    if(f1->device > f2->device) HGOTO_DONE(1)
#else /* H5_DEV_T_IS_SCALAR */
    /* If dev_t isn't a scalar value on this system, just use memcmp to
     * determine if the values are the same or not.  The actual return value
     * shouldn't really matter...
     */
    if(HDmemcmp(&(f1->device),&(f2->device),sizeof(dev_t)) < 0) HGOTO_DONE(-1)
    if(HDmemcmp(&(f1->device),&(f2->device),sizeof(dev_t)) > 0) HGOTO_DONE(1)
#endif /* H5_DEV_T_IS_SCALAR */
#ifdef H5_VMS
    if(HDmemcmp(&(f1->inode), &(f2->inode), 3 * sizeof(ino_t)) < 0) HGOTO_DONE(-1)
    if(HDmemcmp(&(f1->inode), &(f2->inode), 3 * sizeof(ino_t)) > 0) HGOTO_DONE(1)
#else /* H5_VMS */
    if(f1->inode < f2->inode) HGOTO_DONE(-1)
    if(f1->inode > f2->inode) HGOTO_DONE(1)
#endif /* H5_VMS */
#endif /* H5_HAVE_WIN32_API */

done:
    FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpiposix_cmp() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_query
 *
 * Purpose:     Set the flags that this VFL driver is capable of supporting.
 *              (listed in H5FDpublic.h)
 *
 * Return:      SUCCEED (can't fail)
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_query(const H5FD_t UNUSED *_file, unsigned long *flags /* out */)
{
    FUNC_ENTER_NOAPI_NOINIT_NOERR

    /* Set the VFL feature flags that this driver supports */
    if(flags) {
        *flags=0;
        *flags |= H5FD_FEAT_AGGREGATE_METADATA;  /* OK to aggregate metadata allocations */
        *flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
        *flags |= H5FD_FEAT_HAS_MPI;             /* This driver uses MPI */
        *flags |= H5FD_FEAT_ALLOCATE_EARLY;      /* Allocate space early instead of late */
    } /* end if */

    FUNC_LEAVE_NOAPI(SUCCEED)
} /* end H5FD_mpiposix_query() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_get_eoa
 *
 * Purpose:     Gets the end-of-address marker for the file. The EOA marker
 *              is the first address past the last byte allocated in the
 *              format address space.
 *
 * Return:      Success:    The end-of-address marker.
 *              Failure:    HADDR_UNDEF
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static haddr_t
H5FD_mpiposix_get_eoa(const H5FD_t *_file, H5FD_mem_t UNUSED type)
{
    const H5FD_mpiposix_t *file = (const H5FD_mpiposix_t*)_file;

    FUNC_ENTER_NOAPI_NOINIT_NOERR

    HDassert(file);
    HDassert(H5FD_MPIPOSIX == file->pub.driver_id);

    FUNC_LEAVE_NOAPI(file->eoa)
} /* end H5FD_mpiposix_get_eoa() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_set_eoa
 *
 * Purpose:     Set the end-of-address marker for the file. This function is
 *              called shortly after an existing HDF5 file is opened in order
 *              to tell the driver where the end of the HDF5 data is located.
 *
 * Return:      SUCCEED (can't fail)
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_set_eoa(H5FD_t *_file, H5FD_mem_t UNUSED type, haddr_t addr)
{
    H5FD_mpiposix_t  *file = (H5FD_mpiposix_t*)_file;

    FUNC_ENTER_NOAPI_NOINIT_NOERR

    HDassert(file);
    HDassert(H5FD_MPIPOSIX == file->pub.driver_id);

    file->eoa = addr;

    FUNC_LEAVE_NOAPI(SUCCEED)
} /* end H5FD_mpi_posix_set_eoa() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_get_eof
 *
 * Purpose:     Gets the end-of-file marker for the file. The EOF marker
 *              is the real size of the file.
 *
 *              The MPIPOSIX driver doesn't bother keeping this field updated
 *              since that's a relatively expensive operation. Fortunately
 *              the library only needs the EOF just after the file is opened
 *              in order to determine whether the file is empty, truncated,
 *              or okay.
 *
 * Return:      Success:    The end-of-address marker.
 *              Failure:    HADDR_UNDEF
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static haddr_t
H5FD_mpiposix_get_eof(const H5FD_t *_file)
{
    const H5FD_mpiposix_t  *file = (const H5FD_mpiposix_t*)_file;

    FUNC_ENTER_NOAPI_NOINIT_NOERR

    HDassert(file);
    HDassert(H5FD_MPIPOSIX == file->pub.driver_id);

    FUNC_LEAVE_NOAPI(MAX(file->eof, file->eoa))
} /* end H5FD_mpiposix_get_eof() */


/*-------------------------------------------------------------------------
 * Function:       H5FD_mpiposix_get_handle
 *
 * Purpose:        Returns the file handle of MPI-POSIX file driver.
 *
 * Returns:        SUCCEED/FAIL
 *
 * Programmer:     Raymond Lu
 *                 Sept. 16, 2002
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_get_handle(H5FD_t *_file, hid_t UNUSED fapl, void** file_handle)
{
    H5FD_mpiposix_t       *file = (H5FD_mpiposix_t *)_file;
    herr_t                ret_value = SUCCEED;

    FUNC_ENTER_NOAPI_NOINIT

    if(!file_handle)
        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file handle not valid")

    *file_handle = &(file->fd);

done:
    FUNC_LEAVE_NOAPI(ret_value)
}


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_read
 *
 * Purpose:     Reads SIZE bytes of data from FILE beginning at address ADDR
 *              into buffer BUF according to data transfer properties in
 *              DXPL_ID using potentially complex file and buffer types to
 *              effect the transfer.
 *
 *              Reading past the end of the file returns zeros instead of
 *              failing.
 *
 * Return:      Success:    Non-negative. Result is stored in caller-supplied
 *                          buffer BUF.
 *              Failure:    Negative, Contents of buffer BUF are undefined.
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t UNUSED dxpl_id,
        haddr_t addr, size_t size, void *buf/*out*/)
{
    H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
    ssize_t         nbytes;         /* Number of bytes read each I/O call */
    herr_t          ret_value = SUCCEED;

    FUNC_ENTER_NOAPI_NOINIT

    HDassert(file);
    HDassert(H5FD_MPIPOSIX == file->pub.driver_id);
    HDassert(buf);

    /* Check for overflow conditions */
    if (HADDR_UNDEF == addr)
        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined")
    if (REGION_OVERFLOW(addr, size))
        HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow")
    if((addr + size) > file->eoa)
        HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow")

#ifdef REPORT_IO
    {
        int commrank;
        MPI_Comm_rank(MPI_COMM_WORLD, &commrank);
        HDfprintf(stderr, "read:  rank=%d file=0x%08lx type=%d, addr=%a size=%Zu\n",
                commrank, (unsigned long)file, (int)type, addr, size);
    }
#endif

    /* Seek to the correct location */
    if(addr != file->pos || OP_READ != file->op) {
        if(HDlseek(file->fd, (HDoff_t)addr, SEEK_SET) < 0)
            HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position")
    } /* end if */

    /* Read data, being careful of interrupted system calls, partial results,
     * and the end of the file.
     */
    while(size > 0) {

        h5_posix_io_t           bytes_in    = 0;    /* # of bytes to read       */
        h5_posix_io_ret_t       bytes_read  = -1;   /* # of bytes actually read */ 

        /* Trying to read more bytes than the return type can handle is
         * undefined behavior in POSIX.
         */
        if(size > H5_POSIX_MAX_IO_BYTES)
            bytes_in = H5_POSIX_MAX_IO_BYTES;
        else
            bytes_in = (h5_posix_io_t)size;

        do {
            bytes_read = HDread(file->fd, buf, bytes_in);
        } while(-1 == bytes_read && EINTR == errno);
        
        if(-1 == bytes_read) { /* error */
            int myerrno = errno;
            time_t mytime = HDtime(NULL);
            HDoff_t myoffset = HDlseek(file->fd, (HDoff_t)0, SEEK_CUR);

            HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed: time = %s, file descriptor = %d, errno = %d, error message = '%s', buf = %p, total read size = %llu, bytes this sub-read = %llu, bytes actually read = %llu, offset = %llu", HDctime(&mytime), file->fd, myerrno, HDstrerror(myerrno), buf, (unsigned long long)size, (unsigned long long)bytes_in, (unsigned long long)bytes_read, (unsigned long long)myoffset);
        } /* end if */
        
        if(0 == bytes_read) {
            /* end of file but not end of format address space */
            HDmemset(buf, 0, size);
            break;
        } /* end if */
        
        HDassert(bytes_read >= 0);
        HDassert((size_t)bytes_read <= size);
        
        size -= (size_t)bytes_read;
        addr += (haddr_t)bytes_read;
        buf = (char *)buf + bytes_read;
    } /* end while */

    /* Update current position */
    file->pos = addr;
    file->op = OP_READ;

done:
    /* Check for error */
    if(ret_value < 0) {
        /* Reset last file I/O information */
        file->pos = HADDR_UNDEF;
        file->op = OP_UNKNOWN;
    } /* end if */

    FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpiposix_read() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_write
 *
 * Purpose:     Writes SIZE bytes of data to FILE beginning at address ADDR
 *              from buffer BUF according to data transfer properties in
 *              DXPL_ID using potentially complex file and buffer types to
 *              effect the transfer.
 *
 * Return:      SUCCEED/FAIL
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
    size_t size, const void *buf)
{
    H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
#if 0 /* JRM */
    int      mpi_code;  /* MPI return code */
#endif /* JRM */
    ssize_t         nbytes;                 /* Number of bytes written each I/O call    */
    H5P_genplist_t  *plist;                 /* Property list pointer                    */
    herr_t          ret_value = SUCCEED;    /* Return value                             */

    FUNC_ENTER_NOAPI_NOINIT

    HDassert(file);
    HDassert(H5FD_MPIPOSIX == file->pub.driver_id);
    HDassert(H5I_GENPROP_LST == H5I_get_type(dxpl_id));
    HDassert(TRUE == H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
    HDassert(buf);

    /* Check for overflow conditions */
    if (HADDR_UNDEF == addr)
        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined")
    if (REGION_OVERFLOW(addr, size))
        HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow")
    if ((addr + size) > file->eoa)
        HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow")

    /* Obtain the data transfer properties */
    if(NULL == (plist = H5I_object(dxpl_id)))
        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")

    /* Metadata specific actions */
    /* All metadata is now written from process 0 -- thus this function
     * needs to be re-written to reflect this.  For now I have simply
     * commented out the code that attempts to synchronize metadata
     * writes between processes, but we should really just flag an error
     * whenever any process other than process 0 attempts to write
     * metadata.
     *             -- JRM 9/1/05
     */
    if(type != H5FD_MEM_DRAW) {
        unsigned    block_before_meta_write = 0;    /* Whether to block before a metadata write */

        /* Check if we need to synchronize all processes before attempting
         * metadata write (Prevents race condition where the process writing
         * the metadata goes ahead and writes the metadata to the file before
         * all the processes have read the data, "transmitting" data from the
         * "future" to the reading process. -QAK )
         *
         * The only time we don't want to block before a metadata write is when
         * we are flushing out a bunch of metadata.  Then, we block before the
         * first write and don't block for further writes in the sequence.
         */
        if(H5P_exist_plist(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME) > 0)
            if(H5P_get(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME,&block_before_meta_write) < 0)
                HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get H5AC property")

#if 0 /* JRM */
        if(block_before_meta_write)
            if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
                HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
#endif /* JRM */

#if 0 /* JRM -- 3/23/10 */ /* this is no longer always the case */
        /* Only one process will do the actual write if all procs in comm write same metadata */
        if (file->mpi_rank != H5_PAR_META_WRITE)
            HGOTO_DONE(SUCCEED) /* skip the actual write */
#endif /* JRM */
    } /* end if */

#ifdef REPORT_IO
    {
        int commrank;
        MPI_Comm_rank(MPI_COMM_WORLD, &commrank);
        HDfprintf(stderr, "write: rank=%d file=0x%08lx type=%d, addr=%a size=%Zu %s\n",
                commrank, (unsigned long)file, (int)type, addr, size,
                0 == file->naccess ? "(FIRST ACCESS)" : "");
    }
#endif

    if (0 == file->naccess++) {
        /* First write access to this file */
#ifdef H5_HAVE_GPFS
        if (file->use_gpfs) {
            struct {
                gpfsFcntlHeader_t           hdr;
                gpfsMultipleAccessRange_t   mar;
            } hint;
            HDmemset(&hint, 0, sizeof hint);
            hint.hdr.totalLength = sizeof hint;
            hint.hdr.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
            hint.mar.structLen = sizeof hint.mar;
            hint.mar.structType = GPFS_MULTIPLE_ACCESS_RANGE;
            hint.mar.accRangeCnt = 1;
            hint.mar.accRangeArray[0].blockNumber = addr / file->blksize;
            hint.mar.accRangeArray[0].start = addr % file->blksize;
            hint.mar.accRangeArray[0].length = MIN(file->blksize-hint.mar.accRangeArray[0].start, size);
            hint.mar.accRangeArray[0].isWrite = 1;
            if (gpfs_fcntl(file->fd, &hint)<0)
                HGOTO_ERROR(H5E_FILE, H5E_FCNTL, NULL, "failed to send hints to GPFS")
        }
#endif  /* H5_HAVE_GPFS */
    }

    /* Seek to the correct location */
    if(addr != file->pos || OP_WRITE != file->op) {
        if(HDlseek(file->fd, (HDoff_t)addr, SEEK_SET) < 0)
            HSYS_GOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position")
    } /* end if */

    /* Write data, being careful of interrupted system calls, partial results,
     * and the end of the file.
     */
    while(size > 0) {

        h5_posix_io_t           bytes_in    = 0;    /* # of bytes to write          */
        h5_posix_io_ret_t       bytes_wrote = -1;   /* # of bytes actually written  */ 

        /* Trying to write more bytes than the return type can handle is
         * undefined behavior in POSIX.
         */
        if(size > H5_POSIX_MAX_IO_BYTES)
            bytes_in = H5_POSIX_MAX_IO_BYTES;
        else
            bytes_in = (h5_posix_io_t)size;

        do {
            bytes_wrote = HDwrite(file->fd, buf, bytes_in);
        } while(-1 == bytes_wrote && EINTR == errno);
        
        if(-1 == bytes_wrote) { /* error */
            int myerrno = errno;
            time_t mytime = HDtime(NULL);
            HDoff_t myoffset = HDlseek(file->fd, (HDoff_t)0, SEEK_CUR);

            HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file write failed: time = %s, file descriptor = %d, errno = %d, error message = '%s', buf = %p, total write size = %llu, bytes this sub-write = %llu, bytes actually written = %llu, offset = %llu", HDctime(&mytime), file->fd, myerrno, HDstrerror(myerrno), buf, (unsigned long long)size, (unsigned long long)bytes_in, (unsigned long long)bytes_wrote, (unsigned long long)myoffset);
        } /* end if */
        
        if(0 == bytes_wrote) {
            /* end of file but not end of format address space */
            HDmemset(buf, 0, size);
            break;
        } /* end if */
        
        HDassert(bytes_wrote >= 0);
        HDassert((size_t)bytes_wrote <= size);
        
        size -= (size_t)bytes_wrote;
        addr += (haddr_t)bytes_wrote;
        buf = (char *)buf + bytes_wrote;
    } /* end while */

    /* Update current position */
    file->pos = addr;
    file->op = OP_WRITE;

done:
    /* Check for error */
    if(ret_value < 0) {
        /* Reset last file I/O information */
        file->pos = HADDR_UNDEF;
        file->op = OP_UNKNOWN;
    } /* end if */
#if 0 /* JRM */
    /* Since metadata writes are now done by process 0 only, this broadcast
     * is no longer needed.  I leave it in and commented out to remind us
     * that we need to re-work this function to reflect this reallity.
     *
     *                                          -- JRM 9/1/05
     */

    /* Guard against getting into metadata broadcast in failure cases */
    else {
        /* when only one process writes, need to broadcast the ret_value to other processes */
        if (type != H5FD_MEM_DRAW) {
            if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, H5_PAR_META_WRITE, file->comm)))
                HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
        } /* end if */
    } /* end else */
#endif /* JRM */

    FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpiposix_write() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_truncate
 *
 * Purpose:     Makes sure that the true file size is the same (or larger)
 *              than the end-of-address.
 *
 * Return:      SUCCEED/FAIL
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_truncate(H5FD_t *_file, hid_t UNUSED dxpl_id, hbool_t UNUSED closing)
{
    H5FD_mpiposix_t  *file = (H5FD_mpiposix_t*)_file;
    herr_t          ret_value = SUCCEED;                /* Return value     */
    int             mpi_code;                           /* MPI return code  */

    FUNC_ENTER_NOAPI_NOINIT

    HDassert(file);
    HDassert(H5FD_MPIPOSIX == file->pub.driver_id);

    /* Extend the file to make sure it's large enough */
    if(file->eoa > file->last_eoa) {
        /* Use the round-robin process to truncate (extend) the file */
        if(file->mpi_rank == H5_PAR_META_WRITE) {

#ifdef H5_HAVE_WIN32_API
            LARGE_INTEGER   li;         /* 64-bit (union) integer for SetFilePointer() call */
            DWORD           dwPtrLow;   /* Low-order pointer bits from SetFilePointer()
                                         * Only used as an error code here.
                                         */
            DWORD           dwError;    /* DWORD error code from GetLastError() */
            BOOL            bError;     /* Boolean error flag */

            /* Windows uses this odd QuadPart union for 32/64-bit portability */
            li.QuadPart = (__int64)file->eoa;

            /* Extend the file to make sure it's large enough.
             *
             * Since INVALID_SET_FILE_POINTER can technically be a valid return value
             * from SetFilePointer(), we also need to check GetLastError().
             */
            dwPtrLow = SetFilePointer(file->hFile, li.LowPart, &li.HighPart, FILE_BEGIN);
            if(INVALID_SET_FILE_POINTER == dwPtrLow) {
                dwError = GetLastError();
                if(dwError != NO_ERROR )
                    HGOTO_ERROR(H5E_FILE, H5E_FILEOPEN, FAIL, "unable to set file pointer")
            }

            bError = SetEndOfFile(file->hFile);
            if(0 == bError)
                HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to extend file properly")
#else /* H5_HAVE_WIN32_API */
#ifdef H5_VMS
            /* Reset seek offset to the beginning of the file, so that the file isn't
             * re-extended later.  This may happen on Open VMS. */
            if(-1 == HDlseek(file->fd, (HDoff_t)0, SEEK_SET))
                HSYS_GOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position")
#endif /* H5_VMS */
            if(-1 == HDftruncate(file->fd, (HDoff_t)file->eoa))
                HSYS_GOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to extend file properly")
#endif /* H5_HAVE_WIN32_API */
        } /* end if */

        /* Don't let any proc return until all have extended the file.
         * (Prevents race condition where some processes go ahead and write
         * more data to the file before all the processes have finished making
         * it the shorter length, potentially truncating the file and dropping
         * the new data written)
         */
        if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
            HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)

        /* Update the 'last' eoa and eof values */
        file->last_eoa = file->eoa;
        file->eof = file->eoa;

        /* Reset last file I/O information */
        file->pos = HADDR_UNDEF;
        file->op = OP_UNKNOWN;
    } /* end if */

done:
    FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpiposix_truncate() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_mpi_rank
 *
 * Purpose:     Returns the MPI rank for a process
 *
 * Return:      MPI rank.  Cannot report failure.
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static int
H5FD_mpiposix_mpi_rank(const H5FD_t *_file)
{
    const H5FD_mpiposix_t *file = (const H5FD_mpiposix_t*)_file;

    FUNC_ENTER_NOAPI_NOINIT_NOERR

    HDassert(file);
    HDassert(H5FD_MPIPOSIX == file->pub.driver_id);

    FUNC_LEAVE_NOAPI(file->mpi_rank)
} /* end H5FD_mpiposix_mpi_rank() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_mpi_size
 *
 * Purpose:     Returns the number of MPI processes
 *
 * Return:      The number of MPI processes.  Cannot report failure.
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static int
H5FD_mpiposix_mpi_size(const H5FD_t *_file)
{
    const H5FD_mpiposix_t *file = (const H5FD_mpiposix_t*)_file;

    FUNC_ENTER_NOAPI_NOINIT_NOERR

    HDassert(file);
    HDassert(H5FD_MPIPOSIX == file->pub.driver_id);

    FUNC_LEAVE_NOAPI(file->mpi_size)
} /* end H5FD_mpiposix_mpi_size() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_communicator
 *
 * Purpose:     Returns the MPI communicator for the file.
 *
 * Return:      The MPI communicator.  Cannot report failure.
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 *-------------------------------------------------------------------------
 */
static MPI_Comm
H5FD_mpiposix_communicator(const H5FD_t *_file)
{
    const H5FD_mpiposix_t *file = (const H5FD_mpiposix_t*)_file;

    FUNC_ENTER_NOAPI_NOINIT_NOERR

    HDassert(file);
    HDassert(H5FD_MPIPOSIX == file->pub.driver_id);

    FUNC_LEAVE_NOAPI(file->comm)
} /* end H5FD_mpi_posix_communicator() */

#endif /*H5_HAVE_PARALLEL*/