summaryrefslogtreecommitdiffstats
path: root/src/H5FDmpiposix.c
diff options
context:
space:
mode:
authorQuincey Koziol <koziol@hdfgroup.org>2002-07-15 15:21:32 (GMT)
committerQuincey Koziol <koziol@hdfgroup.org>2002-07-15 15:21:32 (GMT)
commit363ec52b7cab5638e3d1479b67068754a2e10011 (patch)
tree23fb21609fb9a9bbadfee4c13d27326df0413d4b /src/H5FDmpiposix.c
parentc3b0c0f3c4ed622450862fdfd2ade5eb96d4ef1a (diff)
downloadhdf5-363ec52b7cab5638e3d1479b67068754a2e10011.zip
hdf5-363ec52b7cab5638e3d1479b67068754a2e10011.tar.gz
hdf5-363ec52b7cab5638e3d1479b67068754a2e10011.tar.bz2
[svn-r5799] Purpose:
New feature. Description: Added MPI-posix VFL driver. This driver uses MPI to coordinate actions, but performs I/O directly with posix sec(2) I/O functions. This driver should _NOT_ be used if the file accessed is not on a parallel filesystem. Platforms tested: FreeBSD 4.6 (sleipnir) w/parallel & IRIX64 6.5 (modi4) w/parallel
Diffstat (limited to 'src/H5FDmpiposix.c')
-rw-r--r--src/H5FDmpiposix.c1135
1 files changed, 1135 insertions, 0 deletions
diff --git a/src/H5FDmpiposix.c b/src/H5FDmpiposix.c
new file mode 100644
index 0000000..82533d0
--- /dev/null
+++ b/src/H5FDmpiposix.c
@@ -0,0 +1,1135 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by the Board of Trustees of the University of Illinois. *
+ * All rights reserved. *
+ * *
+ * This file is part of HDF5. The full HDF5 copyright notice, including *
+ * terms governing use, modification, and redistribution, is contained in *
+ * the files COPYING and Copyright.html. COPYING can be found at the root *
+ * of the source code distribution tree; Copyright.html can be found at the *
+ * root level of an installed copy of the electronic HDF5 document set and *
+ * is linked from the top-level documents page. It can also be found at *
+ * http://hdf.ncsa.uiuc.edu/HDF5/doc/Copyright.html. If you do not have *
+ * access to either file, you may request a copy from hdfhelp@ncsa.uiuc.edu. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*
+ * Programmer: Quincey Koziol <koziol@ncsa.uiuc.ed>
+ * Thursday, July 11, 2002
+ *
+ * Purpose: This is a "combination" MPI-2 and posix I/O driver.
+ * It uses MPI for coodinating the actions of several processes
+ * and posix I/O calls to do the actual I/O to the disk.
+ *
+ * This driver was derived from the H5FDmpio.c driver and may
+ * share bugs/quirks/etc.
+ *
+ * Limitations:
+ * There is no "collective" I/O mode with this driver.
+ *
+ * This will almost certainly _not_ work correctly for files
+ * accessed on distributed parallel systems with the file located
+ * on a non-parallel filesystem.
+ *
+ */
+#include "H5private.h" /*library functions */
+#include "H5Eprivate.h" /*error handling */
+#include "H5Fprivate.h" /*files */
+#include "H5FDprivate.h" /*file driver */
+#include "H5FDmpiposix.h" /* MPI/posix I/O file driver */
+#include "H5Iprivate.h" /* IDs */
+#include "H5MMprivate.h" /*memory allocation */
+#include "H5Pprivate.h" /*property lists */
+
+/*
+ * The driver identification number, initialized at runtime if H5_HAVE_PARALLEL
+ * is defined. This allows applications to still have the H5FD_MPIPOSIX
+ * "constants" in their source code (it also makes this file strictly ANSI
+ * compliant when H5_HAVE_PARALLEL isn't defined)
+ */
+static hid_t H5FD_MPIPOSIX_g = 0;
+
+#ifdef H5_HAVE_PARALLEL
+
+/* File operations */
+#define OP_UNKNOWN 0
+#define OP_READ 1
+#define OP_WRITE 2
+
+/*
+ * The description of a file belonging to this driver.
+ * The EOF value
+ * is only used just after the file is opened in order for the library to
+ * determine whether the file is empty, truncated, or okay. The MPIPOSIX driver
+ * doesn't bother to keep it updated since it's an expensive operation.
+ */
+typedef struct H5FD_mpiposix_t {
+ H5FD_t pub; /*public stuff, must be first */
+ int fd; /*the unix file handle */
+ MPI_Comm comm; /*communicator */
+ int mpi_rank; /* This process's rank */
+ int mpi_size; /* Total number of processes */
+ int mpi_round; /* Current round robin process (for metadata I/O) */
+ haddr_t eof; /*end-of-file marker */
+ haddr_t eoa; /*end-of-address marker */
+ haddr_t last_eoa; /* Last known end-of-address marker */
+ haddr_t pos; /* Current file I/O position */
+ int op; /* Last file I/O operation */
+#ifndef WIN32
+ /*
+ * On most systems the combination of device and i-node number uniquely
+ * identify a file.
+ */
+ dev_t device; /*file device number */
+ ino_t inode; /*file i-node number */
+#else
+ /*
+ * On WIN32 the low-order word of a unique identifier associated with the
+ * file and the volume serial number uniquely identify a file. This number
+ * (which, both? -rpm) may change when the system is restarted or when the
+ * file is opened. After a process opens a file, the identifier is
+ * constant until the file is closed. An application can use this
+ * identifier and the volume serial number to determine whether two
+ * handles refer to the same file.
+ */
+ int fileindexlo;
+ int fileindexhi;
+#endif
+} H5FD_mpiposix_t;
+
+/*
+ * This driver supports systems that have the lseek64() function by defining
+ * some macros here so we don't have to have conditional compilations later
+ * throughout the code.
+ *
+ * file_offset_t: The datatype for file offsets, the second argument of
+ * the lseek() or lseek64() call.
+ *
+ * file_seek: The function which adjusts the current file position,
+ * either lseek() or lseek64().
+ */
+/* adding for windows NT file system support. */
+/* pvn: added __MWERKS__ support. */
+
+#ifdef H5_HAVE_LSEEK64
+# define file_offset_t off64_t
+# define file_seek lseek64
+# define file_truncate ftruncate64
+#elif defined (WIN32) && !defined(__MWERKS__)
+# /*MSVC*/
+# define file_offset_t __int64
+# define file_seek _lseeki64
+# define file_truncate _ftruncatei64
+#else
+# define file_offset_t off_t
+# define file_seek HDlseek
+# define file_truncate HDftruncate
+#endif
+
+/*
+ * These macros check for overflow of various quantities. These macros
+ * assume that file_offset_t is signed and haddr_t and size_t are unsigned.
+ *
+ * ADDR_OVERFLOW: Checks whether a file address of type `haddr_t'
+ * is too large to be represented by the second argument
+ * of the file seek function.
+ *
+ * SIZE_OVERFLOW: Checks whether a buffer size of type `hsize_t' is too
+ * large to be represented by the `size_t' type.
+ *
+ * REGION_OVERFLOW: Checks whether an address and size pair describe data
+ * which can be addressed entirely by the second
+ * argument of the file seek function.
+ */
+#define MAXADDR (((haddr_t)1<<(8*sizeof(file_offset_t)-1))-1)
+#define ADDR_OVERFLOW(A) (HADDR_UNDEF==(A) || \
+ ((A) & ~(haddr_t)MAXADDR))
+#define SIZE_OVERFLOW(Z) ((Z) & ~(hsize_t)MAXADDR)
+#define REGION_OVERFLOW(A,Z) (ADDR_OVERFLOW(A) || SIZE_OVERFLOW(Z) || \
+ sizeof(file_offset_t)<sizeof(size_t) || \
+ HADDR_UNDEF==(A)+(Z) || \
+ (file_offset_t)((A)+(Z))<(file_offset_t)(A))
+
+/* Callbacks */
+static void *H5FD_mpiposix_fapl_get(H5FD_t *_file);
+static H5FD_t *H5FD_mpiposix_open(const char *name, unsigned flags, hid_t fapl_id,
+ haddr_t maxaddr);
+static herr_t H5FD_mpiposix_close(H5FD_t *_file);
+static int H5FD_mpiposix_cmp(const H5FD_t *_f1, const H5FD_t *_f2);
+static herr_t H5FD_mpiposix_query(const H5FD_t *_f1, unsigned long *flags);
+static haddr_t H5FD_mpiposix_get_eoa(H5FD_t *_file);
+static herr_t H5FD_mpiposix_set_eoa(H5FD_t *_file, haddr_t addr);
+static haddr_t H5FD_mpiposix_get_eof(H5FD_t *_file);
+static herr_t H5FD_mpiposix_read(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr,
+ size_t size, void *buf);
+static herr_t H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr,
+ size_t size, const void *buf);
+static herr_t H5FD_mpiposix_flush(H5FD_t *_file, unsigned closing);
+
+/* MPIPOSIX-specific file access properties */
+typedef struct H5FD_mpiposix_fapl_t {
+ MPI_Comm comm; /*communicator */
+} H5FD_mpiposix_fapl_t;
+
+/* The MPIPOSIX file driver information */
+static const H5FD_class_t H5FD_mpiposix_g = {
+ "mpiposix", /*name */
+ MAXADDR, /*maxaddr */
+ H5F_CLOSE_SEMI, /* fc_degree */
+ NULL, /*sb_size */
+ NULL, /*sb_encode */
+ NULL, /*sb_decode */
+ sizeof(H5FD_mpiposix_fapl_t), /*fapl_size */
+ H5FD_mpiposix_fapl_get, /*fapl_get */
+ NULL, /*fapl_copy */
+ NULL, /*fapl_free */
+ 0, /*dxpl_size */
+ NULL, /*dxpl_copy */
+ NULL, /*dxpl_free */
+ H5FD_mpiposix_open, /*open */
+ H5FD_mpiposix_close, /*close */
+ H5FD_mpiposix_cmp, /*cmp */
+ H5FD_mpiposix_query, /*query */
+ NULL, /*alloc */
+ NULL, /*free */
+ H5FD_mpiposix_get_eoa, /*get_eoa */
+ H5FD_mpiposix_set_eoa, /*set_eoa */
+ H5FD_mpiposix_get_eof, /*get_eof */
+ H5FD_mpiposix_read, /*read */
+ H5FD_mpiposix_write, /*write */
+ H5FD_mpiposix_flush, /*flush */
+ H5FD_FLMAP_SINGLE, /*fl_map */
+};
+
+/* Global var to allow elimination of redundant metadata writes
+ * to be controlled by the value of an environment variable. */
+/* Use the elimination by default unless this is the Intel Red machine */
+#ifndef __PUMAGON__
+hbool_t H5_mpiposix_1_metawrite_g = TRUE;
+#else
+hbool_t H5_mpiposix_1_metawrite_g = FALSE;
+#endif
+
+/* Interface initialization */
+#define PABLO_MASK H5FD_mpiposix_mask
+#define INTERFACE_INIT H5FD_mpiposix_init
+static int interface_initialize_g = 0;
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_init
+ *
+ * Purpose: Initialize this driver by registering the driver with the
+ * library.
+ *
+ * Return: Success: The driver ID for the mpiposix driver.
+ *
+ * Failure: Negative.
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+hid_t
+H5FD_mpiposix_init(void)
+{
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_init, FAIL);
+
+ if (H5I_VFL!=H5Iget_type(H5FD_MPIPOSIX_g))
+ H5FD_MPIPOSIX_g = H5FDregister(&H5FD_mpiposix_g);
+
+ FUNC_LEAVE(H5FD_MPIPOSIX_g);
+} /* end H5FD_mpiposix_init() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5Pset_fapl_mpiposix
+ *
+ * Purpose: Store the user supplied MPI communicator COMM in
+ * the file access property list FAPL_ID which can then be used
+ * to create and/or open the file. This function is available
+ * only in the parallel HDF5 library and is not collective.
+ *
+ * COMM is the MPI communicator to be used for file open as
+ * defined in MPI_FILE_OPEN of MPI-2. This function does not
+ * make a duplicated communicator. Any modification to COMM
+ * after this function call returns may have undetermined effect
+ * on the access property list. Users should not modify the
+ * communicator while it is defined in a property list.
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Pset_fapl_mpiposix(hid_t fapl_id, MPI_Comm comm)
+{
+ H5FD_mpiposix_fapl_t fa;
+ H5P_genplist_t *plist; /* Property list pointer */
+ herr_t ret_value=FAIL;
+
+ FUNC_ENTER_API(H5Pset_fapl_mpiposix, FAIL);
+ H5TRACE2("e","iMc",fapl_id,comm);
+
+ /* Check arguments */
+ if(TRUE!=H5P_isa_class(fapl_id,H5P_FILE_ACCESS) || NULL == (plist = H5I_object(fapl_id)))
+ HRETURN_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list");
+#ifdef LATER
+#warning "We need to verify that COMM contains sensible information."
+#endif
+
+ /* Initialize driver specific properties */
+ fa.comm = comm;
+
+ ret_value= H5P_set_driver(plist, H5FD_MPIPOSIX, &fa);
+
+ FUNC_LEAVE(ret_value);
+} /* end H5Pset_fapl_mpiposix() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5Pget_fapl_mpiposix
+ *
+ * Purpose: If the file access property list is set to the H5FD_MPIPOSIX
+ * driver then this function returns the MPI communicator and
+ * information through the COMM pointer.
+ *
+ * Return: Success: Non-negative with the communicator and
+ * information returned through the COMM
+ * argument if non-null. This piece of
+ * information is copied and is therefore
+ * valid only until the file access property
+ * list is modified or closed.
+ *
+ * Failure: Negative
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5Pget_fapl_mpiposix(hid_t fapl_id, MPI_Comm *comm/*out*/)
+{
+ H5FD_mpiposix_fapl_t *fa;
+ H5P_genplist_t *plist; /* Property list pointer */
+
+ FUNC_ENTER_API(H5Pget_fapl_mpiposix, FAIL);
+ H5TRACE2("e","ix",fapl_id,comm);
+
+ if(TRUE!=H5P_isa_class(fapl_id,H5P_FILE_ACCESS) || NULL == (plist = H5I_object(fapl_id)))
+ HRETURN_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list");
+ if (H5FD_MPIPOSIX!=H5P_get_driver(plist))
+ HRETURN_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "incorrect VFL driver");
+ if (NULL==(fa=H5P_get_driver_info(plist)))
+ HRETURN_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "bad VFL driver info");
+
+ /* Get MPI Communicator */
+ if (comm)
+ *comm = fa->comm;
+
+ FUNC_LEAVE(SUCCEED);
+} /* end H5Pget_fapl_mpiposix() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_communicator
+ *
+ * Purpose: Returns the MPI communicator for the file.
+ *
+ * Return: Success: The communicator
+ *
+ * Failure: NULL
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+MPI_Comm
+H5FD_mpiposix_communicator(H5FD_t *_file)
+{
+ H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_communicator, MPI_COMM_NULL);
+
+ assert(file);
+ assert(H5FD_MPIPOSIX==file->pub.driver_id);
+
+ FUNC_LEAVE(file->comm);
+} /* end H5FD_mpi_posix_communicator() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_mpi_rank
+ *
+ * Purpose: Returns the MPI rank for a process
+ *
+ * Return: Success: non-negative
+ * Failure: negative
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+H5FD_mpiposix_mpi_rank(H5FD_t *_file)
+{
+ H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_mpi_rank, FAIL);
+
+ assert(file);
+ assert(H5FD_MPIPOSIX==file->pub.driver_id);
+
+ FUNC_LEAVE(file->mpi_rank);
+} /* end H5FD_mpiposix_mpi_rank() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_mpi_size
+ *
+ * Purpose: Returns the number of MPI processes
+ *
+ * Return: Success: non-negative
+ * Failure: negative
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+H5FD_mpiposix_mpi_size(H5FD_t *_file)
+{
+ H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_mpi_rank, FAIL);
+
+ assert(file);
+ assert(H5FD_MPIPOSIX==file->pub.driver_id);
+
+ FUNC_LEAVE(file->mpi_size);
+} /* end H5FD_mpiposix_mpi_size() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_fapl_get
+ *
+ * Purpose: Returns a file access property list which could be used to
+ * create another file the same as this one.
+ *
+ * Return: Success: Ptr to new file access property list with all
+ * fields copied from the file pointer.
+ *
+ * Failure: NULL
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static void *
+H5FD_mpiposix_fapl_get(H5FD_t *_file)
+{
+ H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
+ H5FD_mpiposix_fapl_t *fa = NULL;
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_fapl_get, NULL);
+
+ assert(file);
+ assert(H5FD_MPIPOSIX==file->pub.driver_id);
+
+ if (NULL==(fa=H5MM_calloc(sizeof(H5FD_mpiposix_fapl_t))))
+ HRETURN_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed");
+
+ /* These should be copied. --QAK, 2002-07-11 */
+ fa->comm = file->comm;
+
+ FUNC_LEAVE(fa);
+} /* end H5FD_mpiposix_fapl_get() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_open
+ *
+ * Purpose: Opens a file with name NAME. The FLAGS are a bit field with
+ * purpose similar to the second argument of open(2) and which
+ * are defined in H5Fpublic.h. The file access property list
+ * FAPL_ID contains the properties driver properties and MAXADDR
+ * is the largest address which this file will be expected to
+ * access. This is collective.
+ *
+ * Return: Success: A new file pointer.
+ * Failure: NULL
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static H5FD_t *
+H5FD_mpiposix_open(const char *name, unsigned flags, hid_t fapl_id,
+ haddr_t maxaddr)
+{
+ H5FD_mpiposix_t *file=NULL; /* New MPIPOSIX file struct */
+ int o_flags; /* Flags for file open call */
+ int fd=(-1); /* File handle for file opened */
+ int mpi_rank; /* MPI rank of this process */
+ int mpi_size; /* Total number of MPI processes */
+ int mpi_code; /* mpi return code */
+ const H5FD_mpiposix_fapl_t *fa=NULL; /* MPIPOSIX file access property list information */
+ H5FD_mpiposix_fapl_t _fa; /* Private copy of default file access property list information */
+ H5P_genplist_t *plist; /* Property list pointer */
+ h5_stat_t sb; /* Portable 'stat' struct */
+#ifdef WIN32
+ HFILE filehandle;
+ struct _BY_HANDLE_FILE_INFORMATION fileinfo;
+ int results;
+#endif
+ H5FD_t *ret_value=NULL; /* Return value */
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_open, NULL);
+
+ /* Check arguments */
+ if (!name || !*name)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, NULL, "invalid file name");
+ if (0==maxaddr || HADDR_UNDEF==maxaddr)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADRANGE, NULL, "bogus maxaddr");
+ if (ADDR_OVERFLOW(maxaddr))
+ HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, NULL, "bogus maxaddr");
+
+ /* Obtain a pointer to mpiposix-specific file access properties */
+ if(TRUE!=H5P_isa_class(fapl_id,H5P_FILE_ACCESS) || NULL == (plist = H5I_object(fapl_id)))
+ HRETURN_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list");
+ if (H5P_DEFAULT==fapl_id || H5FD_MPIPOSIX!=H5P_get_driver(plist)) {
+ _fa.comm = MPI_COMM_SELF; /*default*/
+ fa = &_fa;
+ } /* end if */
+ else {
+ fa = H5P_get_driver_info(plist);
+ assert(fa);
+ } /* end else */
+
+ /* Get the MPI rank of this process and the total number of processes */
+ if (MPI_SUCCESS != (mpi_code=MPI_Comm_rank (fa->comm, &mpi_rank)))
+ HMPI_GOTO_ERROR(NULL, "MPI_Comm_rank failed", mpi_code);
+ if (MPI_SUCCESS != (mpi_code=MPI_Comm_size (fa->comm, &mpi_size)))
+ HMPI_GOTO_ERROR(NULL, "MPI_Comm_size failed", mpi_code);
+
+ /* Build the open flags */
+ o_flags = (H5F_ACC_RDWR & flags) ? O_RDWR : O_RDONLY;
+
+ /* Only set the creation flag(s) for process 0 */
+ if(mpi_rank==0) {
+ if (H5F_ACC_TRUNC & flags)
+ o_flags |= O_TRUNC;
+ if (H5F_ACC_CREAT & flags)
+ o_flags |= O_CREAT;
+ if (H5F_ACC_EXCL & flags)
+ o_flags |= O_EXCL;
+ } /* end if */
+
+ /* Process 0 opens (or creates) the file while the rest of the
+ * processes wait. Then process 0 signals the other processes and they
+ * open (never create) the file and all processes proceed.
+ */
+ /* Process 0 opens (or creates) file and broadcasts result to other processes */
+ if(mpi_rank==0) {
+ /* Open the file */
+ fd=HDopen(name, o_flags, 0666);
+ } /* end if */
+
+ /* Broadcast the results of the open() from process 0 */
+ /* This is necessary because of the "tentative open" code in H5F_open()
+ * where the file is attempted to be opened with different flags from the
+ * user's, in order to check for the file's existence, etc. Here, process 0
+ * gets different flags from the other processes (since it is in charge of
+ * creating the file, if necessary) and can fail in situations where the
+ * other process's file opens would succeed, so allow the other processes
+ * to check for that situation and bail out now also. - QAK
+ */
+ if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&fd, sizeof(int), MPI_BYTE, 0, fa->comm)))
+ HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code);
+
+ /* If the file open on process 0 failed, bail out on all processes now */
+ if(fd<0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file");
+
+ /* Other processes (non 0) wait for broadcast result from process 0 and then open file */
+ if(mpi_rank!=0) {
+ /* Open the file */
+ if ((fd=HDopen(name, o_flags, 0666))<0)
+ HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file");
+ } /* end if */
+
+ /* Process 0 fstat()s the file and broadcasts the results to the other processes */
+ if(mpi_rank==0) {
+ /* Get the stat information */
+ if (HDfstat(fd, &sb)<0)
+ HGOTO_ERROR(H5E_FILE, H5E_BADFILE, NULL, "unable to fstat file");
+ } /* end if */
+
+ /* Broadcast the results of the fstat() from process 0 */
+ if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&sb, sizeof(h5_stat_t), MPI_BYTE, 0, fa->comm)))
+ HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code);
+
+ /* Build the file struct and initialize it */
+ if (NULL==(file=H5MM_calloc(sizeof(H5FD_mpiposix_t))))
+ HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed");
+
+ /* Set the general file information */
+ file->fd = fd;
+ file->eof = sb.st_size;
+
+ /* Set the MPI information */
+ file->comm = fa->comm;
+ file->mpi_rank = mpi_rank;
+ file->mpi_size = mpi_size;
+ file->mpi_round = 0; /* Start metadata writes with process 0 */
+
+ /* Reset the last file I/O operation */
+ file->pos = HADDR_UNDEF;
+ file->op = OP_UNKNOWN;
+
+ /* Set the information for the file's device and inode */
+#ifdef WIN32
+ filehandle = _get_osfhandle(fd);
+ results = GetFileInformationByHandle((HANDLE)filehandle, &fileinfo);
+ file->fileindexhi = fileinfo.nFileIndexHigh;
+ file->fileindexlo = fileinfo.nFileIndexLow;
+#else
+ file->device = sb.st_dev;
+ file->inode = sb.st_ino;
+#endif
+
+ /* Indicate success */
+ ret_value=(H5FD_t *)file;
+
+done:
+ /* Error cleanup */
+ if(ret_value==NULL) {
+ /* Close the file if it was left open */
+ if(fd!=(-1))
+ HDclose(fd);
+ } /* end if */
+
+ FUNC_LEAVE(ret_value);
+} /* end H5FD_mpiposix_open() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_close
+ *
+ * Purpose: Closes a file.
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_mpiposix_close(H5FD_t *_file)
+{
+ H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_close, FAIL);
+
+ assert(file);
+ assert(H5FD_MPIPOSIX==file->pub.driver_id);
+
+ /* Close the unix file */
+ if (HDclose(file->fd)<0)
+ HRETURN_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close file");
+
+ H5MM_xfree(file);
+
+ FUNC_LEAVE(SUCCEED);
+} /* end H5FD_mpiposix_close() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_cmp
+ *
+ * Purpose: Compares two files belonging to this driver using an
+ * arbitrary (but consistent) ordering.
+ *
+ * Return: Success: A value like strcmp()
+ * Failure: never fails (arguments were checked by the
+ * caller).
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+H5FD_mpiposix_cmp(const H5FD_t *_f1, const H5FD_t *_f2)
+{
+ const H5FD_mpiposix_t *f1 = (const H5FD_mpiposix_t*)_f1;
+ const H5FD_mpiposix_t *f2 = (const H5FD_mpiposix_t*)_f2;
+ int ret_value=0;
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_cmp, H5FD_VFD_DEFAULT);
+
+#ifdef WIN32
+ if (f1->fileindexhi < f2->fileindexhi) ret_value= -1;
+ if (f1->fileindexhi > f2->fileindexhi) ret_value= 1;
+
+ if (f1->fileindexlo < f2->fileindexlo) ret_value= -1;
+ if (f1->fileindexlo > f2->fileindexlo) ret_value= 1;
+
+#else
+ if (f1->device < f2->device) ret_value= -1;
+ if (f1->device > f2->device) ret_value= 1;
+
+ if (f1->inode < f2->inode) ret_value= -1;
+ if (f1->inode > f2->inode) ret_value= 1;
+#endif
+
+ FUNC_LEAVE(ret_value);
+} /* end H5FD_mpiposix_cmp() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_query
+ *
+ * Purpose: Set the flags that this VFL driver is capable of supporting.
+ * (listed in H5FDpublic.h)
+ *
+ * Return: Success: non-negative
+ * Failure: negative
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_mpiposix_query(const H5FD_t UNUSED *_file, unsigned long *flags /* out */)
+{
+ herr_t ret_value=SUCCEED;
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_query, FAIL);
+
+ /* Set the VFL feature flags that this driver supports */
+ if(flags) {
+ *flags=0;
+ *flags|=H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */
+
+ /* Distinguish between updating the metadata accumulator on writes and
+ * reads. This is particularly (perhaps only, even) important for MPI-I/O
+ * where we guarantee that writes are collective, but reads may not be.
+ * If we were to allow the metadata accumulator to be written during a
+ * read operation, the application would hang.
+ */
+ *flags|=H5FD_FEAT_ACCUMULATE_METADATA_WRITE; /* OK to accumulate metadata for faster writes */
+
+ *flags|=H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
+ } /* end if */
+
+ FUNC_LEAVE(ret_value);
+} /* end H5FD_mpiposix_query() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_get_eoa
+ *
+ * Purpose: Gets the end-of-address marker for the file. The EOA marker
+ * is the first address past the last byte allocated in the
+ * format address space.
+ *
+ * Return: Success: The end-of-address marker.
+ * Failure: HADDR_UNDEF
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static haddr_t
+H5FD_mpiposix_get_eoa(H5FD_t *_file)
+{
+ H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_get_eoa, HADDR_UNDEF);
+
+ assert(file);
+ assert(H5FD_MPIPOSIX==file->pub.driver_id);
+
+ FUNC_LEAVE(file->eoa);
+} /* end H5FD_mpiposix_get_eoa() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_set_eoa
+ *
+ * Purpose: Set the end-of-address marker for the file. This function is
+ * called shortly after an existing HDF5 file is opened in order
+ * to tell the driver where the end of the HDF5 data is located.
+ *
+ * Return: Success: non-negative
+ * Failure: negative
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_mpiposix_set_eoa(H5FD_t *_file, haddr_t addr)
+{
+ H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_set_eoa, FAIL);
+
+ assert(file);
+ assert(H5FD_MPIPOSIX==file->pub.driver_id);
+
+ file->eoa = addr;
+
+ FUNC_LEAVE(SUCCEED);
+} /* end H5FD_mpi_posix_set_eoa() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_get_eof
+ *
+ * Purpose: Gets the end-of-file marker for the file. The EOF marker
+ * is the real size of the file.
+ *
+ * The MPIPOSIX driver doesn't bother keeping this field updated
+ * since that's a relatively expensive operation. Fortunately
+ * the library only needs the EOF just after the file is opened
+ * in order to determine whether the file is empty, truncated,
+ * or okay.
+ *
+ * Return: Success: The end-of-address marker.
+ * Failure: HADDR_UNDEF
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static haddr_t
+H5FD_mpiposix_get_eof(H5FD_t *_file)
+{
+ H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_get_eof, HADDR_UNDEF);
+
+ assert(file);
+ assert(H5FD_MPIPOSIX==file->pub.driver_id);
+
+ FUNC_LEAVE(MAX(file->eof,file->eoa));
+} /* end H5FD_mpiposix_get_eof() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_read
+ *
+ * Purpose: Reads SIZE bytes of data from FILE beginning at address ADDR
+ * into buffer BUF according to data transfer properties in
+ * DXPL_ID using potentially complex file and buffer types to
+ * effect the transfer.
+ *
+ * Reading past the end of the file returns zeros instead of
+ * failing.
+ *
+ * Return: Success: Non-negative. Result is stored in caller-supplied
+ * buffer BUF.
+ * Failure: Negative, Contents of buffer BUF are undefined.
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_mpiposix_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t UNUSED dxpl_id, haddr_t addr, size_t size,
+ void *buf/*out*/)
+{
+ H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
+ ssize_t nbytes; /* Number of bytes read each I/O call */
+ herr_t ret_value=SUCCEED;
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_read, FAIL);
+
+ assert(file);
+ assert(H5FD_MPIPOSIX==file->pub.driver_id);
+ assert(buf);
+
+ /* Check for overflow conditions */
+ if (HADDR_UNDEF==addr)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined");
+ if (REGION_OVERFLOW(addr, size))
+ HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow");
+ if (addr+size>file->eoa)
+ HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow");
+
+ /* Seek to the correct location */
+ if ((addr!=file->pos || OP_READ!=file->op) &&
+ file_seek(file->fd, (file_offset_t)addr, SEEK_SET)<0)
+ HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position");
+
+ /*
+ * Read data, being careful of interrupted system calls, partial results,
+ * and the end of the file.
+ */
+ while (size>0) {
+ do {
+ nbytes = HDread(file->fd, buf, size);
+ } while (-1==nbytes && EINTR==errno);
+ if (-1==nbytes)
+ HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed");
+ if (0==nbytes) {
+ /* end of file but not end of format address space */
+ HDmemset(buf, 0, size);
+ size = 0;
+ } /* end if */
+ assert(nbytes>=0);
+ assert((size_t)nbytes<=size);
+ size -= nbytes;
+ addr += (haddr_t)nbytes;
+ buf = (char*)buf + nbytes;
+ }
+
+ /* Update current position */
+ file->pos = addr;
+ file->op = OP_READ;
+
+done:
+ /* Check for error */
+ if(ret_value<0) {
+ /* Reset last file I/O information */
+ file->pos = HADDR_UNDEF;
+ file->op = OP_UNKNOWN;
+ } /* end if */
+
+ FUNC_LEAVE(ret_value);
+} /* end H5FD_mpiposix_read() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_write
+ *
+ * Purpose: Writes SIZE bytes of data to FILE beginning at address ADDR
+ * from buffer BUF according to data transfer properties in
+ * DXPL_ID using potentially complex file and buffer types to
+ * effect the transfer.
+ *
+ * Return: Success: non-negative
+ * Failure: negative
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t UNUSED dxpl_id, haddr_t addr,
+ size_t size, const void *buf)
+{
+ H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
+ int mpi_code; /* MPI return code */
+ ssize_t nbytes; /* Number of bytes written each I/O call */
+ herr_t ret_value=SUCCEED; /* Return value */
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_write, FAIL);
+
+ assert(file);
+ assert(H5FD_MPIPOSIX==file->pub.driver_id);
+ assert(buf);
+
+ /* Check for overflow conditions */
+ if (HADDR_UNDEF==addr)
+ HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined");
+ if (REGION_OVERFLOW(addr, size))
+ HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow");
+ if (addr+size>file->eoa)
+ HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow");
+
+ /* Only p<round> will do the actual write if all procs in comm write same data */
+ if ((type!=H5FD_MEM_DRAW) && H5_mpiposix_1_metawrite_g) {
+ if (file->mpi_rank != file->mpi_round)
+ HGOTO_DONE(SUCCEED) /* skip the actual write */
+ } /* end if */
+
+ /* Seek to the correct location */
+ if ((addr!=file->pos || OP_WRITE!=file->op) &&
+ file_seek(file->fd, (file_offset_t)addr, SEEK_SET)<0)
+ HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL,
+ "unable to seek to proper position");
+
+ /*
+ * Write the data, being careful of interrupted system calls and partial
+ * results
+ */
+ while (size>0) {
+ do {
+ nbytes = HDwrite(file->fd, buf, size);
+ } while (-1==nbytes && EINTR==errno);
+ if (-1==nbytes)
+ HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed");
+ assert(nbytes>0);
+ assert((size_t)nbytes<=size);
+ size -= nbytes;
+ addr += (haddr_t)nbytes;
+ buf = (const char*)buf + nbytes;
+ } /* end while */
+
+ /* Update current last file I/O information */
+ file->pos = addr;
+ file->op = OP_WRITE;
+
+done:
+ /* Check for error */
+ if(ret_value<0) {
+ /* Reset last file I/O information */
+ file->pos = HADDR_UNDEF;
+ file->op = OP_UNKNOWN;
+ } /* end if */
+ /* Guard against getting into metadata broadcast in failure cases */
+ else {
+ /* if only p<round> writes, need to broadcast the ret_value to other processes */
+ if ((type!=H5FD_MEM_DRAW) && H5_mpiposix_1_metawrite_g) {
+ if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, file->mpi_round, file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
+
+ /* Round-robin rotate to the next process */
+ file->mpi_round = (++file->mpi_round)%file->mpi_size;
+#ifdef QAK
+ {
+ int max,min;
+
+ MPI_Allreduce(&file->mpi_round, &max, 1, MPI_INT, MPI_MAX, file->comm);
+ MPI_Allreduce(&file->mpi_round, &min, 1, MPI_INT, MPI_MIN, file->comm);
+ if(max!=file->mpi_round)
+ printf("%s: rank=%d, round=%d, max=%d\n",FUNC,file->mpi_rank,file->mpi_round,max);
+ if(min!=file->mpi_round)
+ printf("%s: rank=%d, round=%d, min=%d\n",FUNC,file->mpi_rank,file->mpi_round,min);
+ }
+#endif /* QAK */
+ } /* end if */
+ } /* end else */
+
+ FUNC_LEAVE(ret_value);
+} /* end H5FD_mpiposix_write() */
+
+
+/*-------------------------------------------------------------------------
+ * Function: H5FD_mpiposix_flush
+ *
+ * Purpose: Makes sure that all data is on disk. This is collective.
+ *
+ * Return: Success: Non-negative
+ * Failure: Negative
+ *
+ * Programmer: Quincey Koziol
+ * Thursday, July 11, 2002
+ *
+ * Modifications:
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD_mpiposix_flush(H5FD_t *_file, unsigned UNUSED closing)
+{
+ H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
+#ifdef WIN32
+ HFILE filehandle; /* Windows file handle */
+ LARGE_INTEGER li; /* 64-bit integer for SetFilePointer() call */
+#endif /* WIN32 */
+ int mpi_code; /* MPI return code */
+ herr_t ret_value=SUCCEED;
+
+ FUNC_ENTER_NOAPI(H5FD_mpiposix_flush, FAIL);
+
+ assert(file);
+ assert(H5FD_MPIPOSIX==file->pub.driver_id);
+
+ /* Extend the file to make sure it's large enough */
+ if(file->eoa>file->last_eoa) {
+ /* Use the round-robin process to truncate (extend) the file */
+ if(file->mpi_rank == file->mpi_round) {
+#ifdef WIN32
+ /* Map the posix file handle to a Windows file handle */
+ filehandle = _get_osfhandle(fd);
+
+ /* Translate 64-bit integers into form Windows wants */
+ /* [This algorithm is from the Windows documentation for SetFilePointer()] */
+ li.QuadPart = file->eoa;
+ SetFilePointer((HANDLE)filehandle,li.LowPart,&li.HighPart,FILE_BEGIN);
+ if(SetEndOfFile((HANDLE)filehandle)==0)
+ HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to extend file properly");
+#else /* WIN32 */
+ if(-1==file_truncate(file->fd, (file_offset_t)file->eoa))
+ HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to extend file properly");
+#endif /* WIN32 */
+ } /* end if */
+
+ /* Don't let any proc return until all have extended the file.
+ * (Prevents race condition where some processes go ahead and write
+ * more data to the file before all the processes have finished making
+ * it the shorter length, potentially truncating the file and dropping
+ * the new data written)
+ */
+ if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm)))
+ HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
+
+ /* Update the 'last' eoa and eof values */
+ file->last_eoa=file->eoa;
+ file->eof = file->eoa;
+
+ /* Reset last file I/O information */
+ file->pos = HADDR_UNDEF;
+ file->op = OP_UNKNOWN;
+ } /* end if */
+
+done:
+ FUNC_LEAVE(ret_value);
+} /* end H5FD_mpiposix_flush() */
+
+#endif /*H5_HAVE_PARALLEL*/
+