summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRobb Matzke <matzke@llnl.gov>2002-12-04 00:43:24 (GMT)
committerRobb Matzke <matzke@llnl.gov>2002-12-04 00:43:24 (GMT)
commitbfe92300fb50413b363835fdc4a4e6a8bd5ff8ee (patch)
treeae04221d68bbcce9b67a972a30936b0159c811ee
parent1a9fa4def1102afe9fe0e1770db474019fea77cc (diff)
downloadhdf5-bfe92300fb50413b363835fdc4a4e6a8bd5ff8ee.zip
hdf5-bfe92300fb50413b363835fdc4a4e6a8bd5ff8ee.tar.gz
hdf5-bfe92300fb50413b363835fdc4a4e6a8bd5ff8ee.tar.bz2
[svn-r6148] ./hdf5-devel/src/H5FDmpiposix.c
Purpose: Feature; Optimization. Description: Clients pass `-1' or make their own #define for HDF5 functions that take an optional object ID. Blue's GPFS is slow for typical SAF restart dumps. Solution: Added a #define for H5I_INVALID_HID. Added GPFS-specific code to H5FDmpiposix.c that tells mmfsd to forego byte range token prefetching. This code can be compiled into the library by defining USE_GPFS_HINTS. The plan is either to generalize this so that it is detected during configure and turned on/off at runtime, or to move it up into DSL/SAF using the new HDF5 functions that return the low-level file handle. Platforms tested: SuSE Linux (arborea), gcc and mpich-1.2.4; SunOS (baldric), gcc.
-rw-r--r--src/H5FDmpiposix.c68
1 files changed, 58 insertions, 10 deletions
diff --git a/src/H5FDmpiposix.c b/src/H5FDmpiposix.c
index 4b5a26c..e50afb2 100644
--- a/src/H5FDmpiposix.c
+++ b/src/H5FDmpiposix.c
@@ -41,6 +41,14 @@
#include "H5MMprivate.h" /*memory allocation */
#include "H5Pprivate.h" /*property lists */
+/* Features:
+ * USE_GPFS_HINTS -- issue gpfs_fcntl() calls to hopefully improve
+ * performance when accessing files on a GPFS
+ * file system.
+ *
+ * REPORT_IO -- if set then report all POSIX file calls to stderr.
+ *
+ */
#ifdef USE_GPFS_HINTS
# include <gpfs_fcntl.h>
#endif
@@ -632,12 +640,14 @@ H5FD_mpiposix_open(const char *name, unsigned flags, hid_t fapl_id,
HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code);
#ifdef USE_GPFS_HINTS
- /* Prevent GPFS from prefetching byte range (BR) tokens */
{
+ /* Free all byte range tokens. This is a good thing to do if raw data is aligned on 256kB boundaries (a GPFS page is
+ * 256kB). Care should be taken that there aren't too many sub-page writes, or the mmfsd may become overwhelmed. This
+ * should probably eventually be passed down here as a property. The gpfs_fcntl() will most likely fail if `fd' isn't
+ * on a GPFS file system. */
struct {
gpfsFcntlHeader_t hdr;
gpfsFreeRange_t fr;
- gpfsMultipleAccessRange_t mar;
} hint;
memset(&hint, 0, sizeof hint);
hint.hdr.totalLength = sizeof hint;
@@ -646,15 +656,8 @@ H5FD_mpiposix_open(const char *name, unsigned flags, hid_t fapl_id,
hint.fr.structType = GPFS_FREE_RANGE;
hint.fr.start = 0;
hint.fr.length = 0;
- hint.mar.structLen = sizeof hint.mar;
- hint.mar.structType = GPFS_MULTIPLE_ACCESS_RANGE;
- hint.mar.accRangeCnt = 1;
- hint.mar.accRangeArray[0].blockNumber = 1 + mpi_rank;
- hint.mar.accRangeArray[0].start = 0;
- hint.mar.accRangeArray[0].length = sb.st_blksize;
- hint.mar.accRangeArray[0].isWrite = true;
- if (gpfs_fcntl(f->fd, &hint)<0)
+ if (gpfs_fcntl(fd, &hint)<0)
HGOTO_ERROR(H5E_FILE, H5E_FCNTL, NULL, "failed to send hints to GPFS");
if (0==mpi_rank)
@@ -666,6 +669,10 @@ H5FD_mpiposix_open(const char *name, unsigned flags, hid_t fapl_id,
if (NULL==(file=H5MM_calloc(sizeof(H5FD_mpiposix_t))))
HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed");
+#ifdef REPORT_IO
+ fprintf(stderr, "open: rank=%d name=%s file=0x%08lx\n", mpi_rank, name, (unsigned long)file);
+#endif
+
/* Set the general file information */
file->fd = fd;
file->eof = sb.st_size;
@@ -1030,6 +1037,15 @@ H5FD_mpiposix_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t UNUSED dxpl_id,
if (addr+size>file->eoa)
HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow");
+#ifdef REPORT_IO
+ {
+ int commrank;
+ MPI_Comm_rank(MPI_COMM_WORLD, &commrank);
+ fprintf(stderr, "read: rank=%d file=0x%08lx type=%d, addr=%lu size=%lu\n",
+ commrank, (unsigned long)file, (int)type, (unsigned long)addr, (unsigned long)size);
+ }
+#endif
+
/* Seek to the correct location */
if ((addr!=file->pos || OP_READ!=file->op) &&
file_seek(file->fd, (file_offset_t)addr, SEEK_SET)<0)
@@ -1150,6 +1166,38 @@ H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
HGOTO_DONE(SUCCEED) /* skip the actual write */
} /* end if */
+#ifdef REPORT_IO
+ {
+ int commrank;
+ MPI_Comm_rank(MPI_COMM_WORLD, &commrank);
+ fprintf(stderr, "write: rank=%d file=0x%08lx type=%d, addr=%lu size=%lu %s\n",
+ commrank, (unsigned long)file, (int)type, (unsigned long)addr, (unsigned long)size,
+ 0==file->naccess?"(FIRST ACCESS)":"");
+ }
+#endif
+
+ if (0==file->naccess++) {
+ /* First write access to this file: with USE_GPFS_HINTS, declare to GPFS the byte range about to be written, clipped so it does not extend past the end of the file->blksize-sized block containing addr */
+#ifdef USE_GPFS_HINTS
+ struct {
+ gpfsFcntlHeader_t hdr;
+ gpfsMultipleAccessRange_t mar;
+ } hint;
+ memset(&hint, 0, sizeof hint);
+ hint.hdr.totalLength = sizeof hint;
+ hint.hdr.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
+ hint.mar.structLen = sizeof hint.mar;
+ hint.mar.structType = GPFS_MULTIPLE_ACCESS_RANGE;
+ hint.mar.accRangeCnt = 1;
+ hint.mar.accRangeArray[0].blockNumber = addr / file->blksize;
+ hint.mar.accRangeArray[0].start = addr % file->blksize;
+ hint.mar.accRangeArray[0].length = MIN(file->blksize-hint.mar.accRangeArray[0].start, size);
+ hint.mar.accRangeArray[0].isWrite = 1;
+ if (gpfs_fcntl(file->fd, &hint)<0)
+ HGOTO_ERROR(H5E_FILE, H5E_FCNTL, FAIL, "failed to send hints to GPFS"); /* FAIL, not NULL: this routine reports status via SUCCEED/FAIL */
+#endif
+ }
+
/* Seek to the correct location */
if ((addr!=file->pos || OP_WRITE!=file->op) &&
file_seek(file->fd, (file_offset_t)addr, SEEK_SET)<0)