summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlbert Cheng <acheng@hdfgroup.org>2002-06-02 00:27:08 (GMT)
committerAlbert Cheng <acheng@hdfgroup.org>2002-06-02 00:27:08 (GMT)
commitb84e706de98a6541f29e6ce47dc87d895fdeacc0 (patch)
treea9b32f2d962b909ccdedaa517386a3a54bf972bf
parentccfb48ff4a6a3baa517774392d9dca2b15ca733a (diff)
downloadhdf5-b84e706de98a6541f29e6ce47dc87d895fdeacc0.zip
hdf5-b84e706de98a6541f29e6ce47dc87d895fdeacc0.tar.gz
hdf5-b84e706de98a6541f29e6ce47dc87d895fdeacc0.tar.bz2
[svn-r5504] Purpose:
New feature Description: pio_perf.c: pio_engine.c: folded in the new feature from v1.5. The new feature added is the interleaved I/O performance test. Only the POSIX interface is implemented for now; the MPIO and PHDF5 interfaces will be added later. pio_perf.h: Added a few macros to print the rank and size of MPI_COMM_WORLD in a unified style. Platforms tested: modi4, eirene, burrwhite
-rw-r--r--perform/pio_engine.c581
-rw-r--r--perform/pio_perf.c56
-rw-r--r--perform/pio_perf.h8
3 files changed, 426 insertions, 219 deletions
diff --git a/perform/pio_engine.c b/perform/pio_engine.c
index fabb1ac..928129c 100644
--- a/perform/pio_engine.c
+++ b/perform/pio_engine.c
@@ -34,7 +34,7 @@
/* Macro definitions */
/* sizes of various items. these sizes won't change during program execution */
-#define ELMT_SIZE ((int)sizeof(int)) /* we're doing ints */
+#define ELMT_SIZE (sizeof(int)) /* we're doing ints */
#define GOTOERROR(errcode) { ret_code = errcode; goto done; }
#define GOTODONE { goto done; }
@@ -108,9 +108,9 @@ typedef union _file_descr {
static char *pio_create_filename(iotype iot, const char *base_name,
char *fullname, size_t size);
static herr_t do_write(results *res, file_descr *fd, parameters *parms,
- long ndsets, off_t nelmts, size_t buf_size, void *buffer);
+ long ndsets, off_t nelmts, size_t blk_size, size_t buf_size, void *buffer);
static herr_t do_read(results *res, file_descr *fd, parameters *parms,
- long ndsets, off_t nelmts, size_t buf_size, void *buffer /*out*/);
+ long ndsets, off_t nelmts, size_t blk_size, size_t buf_size, void *buffer /*out*/);
static herr_t do_fopen(parameters *param, char *fname, file_descr *fd /*out*/,
int flags);
static herr_t do_fclose(iotype iot, file_descr *fd);
@@ -144,17 +144,13 @@ do_pio(parameters param)
off_t nelmts;
char *buffer = NULL; /*data buffer pointer */
size_t buf_size; /*data buffer size in bytes */
+ size_t blk_size; /*interleaved I/O block size */
/* HDF5 variables */
herr_t hrc; /*HDF5 return code */
/* Sanity check parameters */
- /* debug */
- if (pio_debug_level>=4) {
- h5_dump_info_object(h5_io_info_g);
- }
-
/* IO type */
iot = param.io_type;
@@ -182,6 +178,7 @@ do_pio(parameters param)
nelmts = param.num_elmts; /* number of elements per dataset */
maxprocs = param.num_procs; /* max number of mpi-processes to use */
buf_size = param.buf_size;
+ blk_size = param.block_size; /* interleaved IO block size */
if (nfiles < 0 ) {
fprintf(stderr,
@@ -204,33 +201,31 @@ do_pio(parameters param)
GOTOERROR(FAIL);
}
-
-#if akcdebug
-/* debug*/
-fprintf(stderr, "nfiles=%d\n", nfiles);
-fprintf(stderr, "ndsets=%ld\n", ndsets);
-fprintf(stderr, "nelmts=%ld\n", nelmts);
-fprintf(stderr, "maxprocs=%d\n", maxprocs);
-fprintf(stderr, "buffer size=%ld\n", buf_size);
-fprintf(stderr, "total data size=%ld\n", ndsets*nelmts*sizeof(int));
-nfiles=MIN(3, nfiles);
-/*ndsets=MIN(5, ndsets);*/
-/*nelmts=MIN(1000, nelmts);*/
-buf_size=MIN(1024*1024, buf_size);
-/* DEBUG END */
-#endif
-
- /* allocate data buffer */
- if(buf_size>0) {
+ /* allocate transfer buffer */
+ if(buf_size<=0) {
+ HDfprintf(stderr,
+ "Transfer buffer size (%Hd) must be > 0\n", (long_long)buf_size);
+ GOTOERROR(FAIL);
+ }else{
buffer = malloc(buf_size);
if (buffer == NULL){
- fprintf(stderr, "malloc for data buffer size (%ld) failed\n",
- buf_size);
+ HDfprintf(stderr, "malloc for transfer buffer size (%Hd) failed\n",
+ (long_long)buf_size);
GOTOERROR(FAIL);
}
}
+ /* Should only need blk_size <= buf_size. */
+ /* More restrictive condition for easier implementation for now. */
+ if (blk_size > 0 && (buf_size % blk_size)){
+ HDfprintf(stderr,
+ "Transfer buffer size (%Hd) must be a multiple of the "
+ "interleaved I/O block size (%Hd)\n",
+ (long_long)buf_size, (long_long)blk_size);
+ GOTOERROR(FAIL);
+ }
+
if (pio_debug_level >= 4) {
int myrank;
@@ -252,9 +247,6 @@ buf_size=MIN(1024*1024, buf_size);
sprintf(base_name, "#pio_tmp_%u", nf);
pio_create_filename(iot, base_name, fname, sizeof(fname));
-#if AKCDEBUG
-fprintf(stderr, "filename=%s\n", fname);
-#endif
set_time(res.timers, HDF5_GROSS_WRITE_FIXED_DIMS, START);
hrc = do_fopen(&param, fname, &fd, PIO_CREATE | PIO_WRITE);
@@ -262,7 +254,7 @@ fprintf(stderr, "filename=%s\n", fname);
VRFY((hrc == SUCCESS), "do_fopen failed");
set_time(res.timers, HDF5_FINE_WRITE_FIXED_DIMS, START);
- hrc = do_write(&res, &fd, &param, ndsets, nelmts, buf_size, buffer);
+ hrc = do_write(&res, &fd, &param, ndsets, nelmts, blk_size, buf_size, buffer);
set_time(res.timers, HDF5_FINE_WRITE_FIXED_DIMS, STOP);
VRFY((hrc == SUCCESS), "do_write failed");
@@ -285,7 +277,7 @@ fprintf(stderr, "filename=%s\n", fname);
VRFY((hrc == SUCCESS), "do_fopen failed");
set_time(res.timers, HDF5_FINE_READ_FIXED_DIMS, START);
- hrc = do_read(&res, &fd, &param, ndsets, nelmts, buf_size, buffer);
+ hrc = do_read(&res, &fd, &param, ndsets, nelmts, blk_size, buf_size, buffer);
set_time(res.timers, HDF5_FINE_READ_FIXED_DIMS, STOP);
VRFY((hrc == SUCCESS), "do_read failed");
@@ -448,7 +440,7 @@ pio_create_filename(iotype iot, const char *base_name, char *fullname, size_t si
*/
static herr_t
do_write(results *res, file_descr *fd, parameters *parms, long ndsets,
- off_t nelmts, size_t buf_size, void *buffer)
+ off_t nelmts, size_t blk_size, size_t buf_size, void *buffer)
{
int ret_code = SUCCESS;
int rc; /*routine return code */
@@ -456,29 +448,34 @@ do_write(results *res, file_descr *fd, parameters *parms, long ndsets,
MPI_Offset mpi_offset;
MPI_Status mpi_status;
long ndset;
- off_t nelmts_written;
- size_t nelmts_towrite;
+ off_t nelmts_xfer;
+ size_t nelmts_toxfer;
char dname[64];
off_t dset_offset; /*dataset offset in a file */
off_t file_offset; /*file offset of the next transfer */
off_t dset_size; /*one dataset size in bytes */
- size_t nelmts_in_buf;
+ size_t nelmts_in_buf; /*how many element the buffer holds */
+ size_t nelmts_in_blk; /*how many element a block holds */
off_t elmts_begin; /*first elmt this process transfer */
off_t elmts_count; /*number of elmts this process transfer */
hid_t dcpl = -1; /* Dataset creation property list */
/* HDF5 variables */
- herr_t hrc; /*HDF5 return code */
- hsize_t h5dims[1]; /*dataset dim sizes */
- hid_t h5dset_space_id = -1; /*dataset space ID */
- hid_t h5mem_space_id = -1; /*memory dataspace ID */
- hid_t h5ds_id = -1; /*dataset handle */
-
-#if AKCDEBUG
-fprintf(stderr, "In do_write\n");
-fprintf(stderr, "ndsets=%ld\n", ndsets);
-fprintf(stderr, "nelmts=%ld\n", nelmts);
-fprintf(stderr, "buffer size=%ld\n", buf_size);
+ herr_t hrc; /*HDF5 return code */
+ hsize_t h5dims[1]; /*dataset dim sizes */
+ hid_t h5dset_space_id = -1; /*dataset space ID */
+ hid_t h5mem_space_id = -1; /*memory dataspace ID */
+ hid_t h5ds_id = -1; /*dataset handle */
+ hsize_t h5mem_block[1]; /*memory space selection */
+ hsize_t h5mem_stride[1];
+ hsize_t h5mem_count[1];
+ hssize_t h5mem_start[1];
+#if 0
+ /* for future implementation */
+ hsize_t h5dset_block[1]; /*dset space selection */
+ hsize_t h5dset_stride[1];
+ hsize_t h5dset_count[1];
+ hssize_t h5dset_start[1];
#endif
/* calculate dataset parameters. data type is always native C int */
@@ -570,48 +567,113 @@ fprintf(stderr, "buffer size=%ld\n", buf_size);
break;
}
- /* Calculate the first element and how many elements this process
- * transfer. First calculate the beginning element of this process
- * and the next process. Count of elements is the difference between
- * these two beginnings. This way, it avoids any rounding errors.
- */
- elmts_begin = (long)(((double)nelmts)/pio_mpi_nprocs_g*pio_mpi_rank_g);
-
- if (pio_mpi_rank_g < (pio_mpi_nprocs_g - 1))
- elmts_count = (long)((((double)nelmts) / pio_mpi_nprocs_g * (pio_mpi_rank_g + 1))
- - (double)elmts_begin);
- else
- /* last process. Take whatever are left */
- elmts_count = nelmts - elmts_begin;
-
-#if AKCDEBUG
-fprintf(stderr, "proc %d: elmts_begin=%ld, elmts_count=%ld\n",
- pio_mpi_rank_g, elmts_begin, elmts_count);
-#endif
-
- nelmts_written = 0 ;
+ /* There are two kinds of transfer patterns, contiguous and interleaved.
+ * Let 0,1,2,...,n be data accessed by process 0,1,2,...,n
+ * where n is rank of the last process.
+ * In contiguous pattern, data are accessed as
+ * 000...111...222...nnn...
+ * In interleaved pattern, data are accessed as
+ * 012...n012...n...
+ * These are all in the scope of one dataset.
+ */
+ /* Calculate the total number of elements (elmts_count) to be
+ * transferred by this process. It may be different for different
+ * transfer pattern due to rounding to integral values.
+ */
+ if (blk_size==0){
+ /* Contiguous Pattern:
+ * Calculate the beginning element of this process and the next.
+ * elmts_count is the difference between these two beginnings.
+ * This way, it eliminates any rounding errors.
+ */
+ elmts_begin = (off_t)(((double)nelmts)/pio_mpi_nprocs_g*pio_mpi_rank_g);
+
+ /* Do not cast elmts_begin to other types, especially non-integral
+ * types, else it may introduce rounding discrepancy. */
+ if (pio_mpi_rank_g < (pio_mpi_nprocs_g - 1))
+ elmts_count = (off_t)(((double)nelmts) / pio_mpi_nprocs_g * (pio_mpi_rank_g + 1))
+ - elmts_begin;
+ else
+ /* last process. Take whatever are left */
+ elmts_count = nelmts - elmts_begin;
+ }else{
+ /* Interleaved Pattern:
+ * Each process takes blk_size of elements, starting with the first
+ * process. So, the last process may have fewer or even none.
+ * Calculate the beginning element of this process and the next.
+ * The elmts_begin here marks only the beginning of the first
+ * block accessed by this process.
+ */
+ /* Algorithm:
+ * First allocate equal blocks per process, i.e. one block each
+ * process for every block_size*nprocs.
+ * If there is remaining unallocated, give a block each to process
+ * starting at proc 0. The last process may get a partial block.
+ */
+ off_t remain_nelmts, remain_begin; /* unallocated remaining*/
+
+ nelmts_in_blk = blk_size/ELMT_SIZE;
+ elmts_begin = (off_t)(nelmts_in_blk*pio_mpi_rank_g);
+
+ /* must use integer calculation next */
+ /* allocate equal blocks per process */
+ elmts_count = (nelmts / (off_t)(nelmts_in_blk*pio_mpi_nprocs_g)) *
+ (off_t)nelmts_in_blk;
+ remain_nelmts = nelmts % ((off_t)(nelmts_in_blk*pio_mpi_nprocs_g));
+
+ /* allocate any remaining */
+ remain_begin = (off_t)(nelmts_in_blk*pio_mpi_rank_g);
+ if (remain_nelmts > remain_begin){
+ /* it gets something */
+ if (remain_nelmts > (remain_begin+(off_t)nelmts_in_blk)){
+ /* one full block */
+ elmts_count += nelmts_in_blk;
+ }else{
+ /* only a partial block */
+ elmts_count += remain_nelmts - remain_begin;
+ }
+ }
+ }
+ /* debug */
+ if (pio_debug_level >= 4) {
+ HDprint_rank(output);
+ HDfprintf(output, "Debug(do_write): "
+ "nelmts_in_blk=%Hd, elmts_begin=%Hd, elmts_count=%Hd\n",
+ (long_long)nelmts_in_blk, (long_long)elmts_begin,
+ (long_long)elmts_count);
+ }
+
+
+ /* The task is to transfer elmts_count elements, starting at
+ * elmts_begin position, using transfer buffer of buf_size bytes.
+ * If blk_size > 0, select blk_size at a time, in round robin
+ * fashion, according to number of process. Otherwise, select
+ * all elmt_count in contiguous.
+ */
+ nelmts_xfer = 0 ;
/* Start "raw data" write timer */
set_time(res->timers, HDF5_RAW_WRITE_FIXED_DIMS, START);
- while (nelmts_written < elmts_count){
- nelmts_towrite = elmts_count - nelmts_written;
-
- if ((elmts_count - nelmts_written) >= nelmts_in_buf) {
- nelmts_towrite = nelmts_in_buf;
+ while (nelmts_xfer < elmts_count){
+ /* transfer one buffer of data each round */
+ /* Note: because size_t is unsigned, avoid expressions that */
+ /* can be negative. */
+ if ((nelmts_xfer + (off_t)nelmts_in_buf) <= elmts_count) {
+ nelmts_toxfer = nelmts_in_buf;
} else {
- /* last write of a partial buffer */
- nelmts_towrite = elmts_count - nelmts_written;
+ /* last transfer of a partial buffer */
+ nelmts_toxfer = elmts_count - nelmts_xfer;
}
-#if AKCDEBUG
+#ifdef AKCDEBUG
/*Prepare write data*/
{
int *intptr = (int *)buffer;
register int i;
- for (i = 0; i < nelmts_towrite; ++i)
- *intptr++ = nelmts_towrite + i;
+ for (i = 0; i < nelmts_toxfer; ++i)
+ *intptr++ = nelmts_toxfer + i;
}
#endif
@@ -619,54 +681,87 @@ fprintf(stderr, "proc %d: elmts_begin=%ld, elmts_count=%ld\n",
/* Calculate offset of write within a dataset/file */
switch (parms->io_type) {
case POSIXIO:
- /* need to (off_t) the elmnts_begin expression because they */
- /* may be of smaller sized integer types */
- file_offset = dset_offset + (off_t)(elmts_begin + nelmts_written)*ELMT_SIZE;
-
-#if AKCDEBUG
-HDfprintf(stderr, "proc %d: write %Hd bytes at file-offset %Hd\n",
- pio_mpi_rank_g, (long_long)nelmts_towrite*ELMT_SIZE, (long_long)file_offset);
-#endif
-
- /* only care if seek returns error */
- rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
- VRFY((rc==0), "POSIXSEEK");
- /* check if all bytes are written */
- rc = ((nelmts_towrite*ELMT_SIZE) ==
- POSIXWRITE(fd->posixfd, buffer, nelmts_towrite*ELMT_SIZE));
- VRFY((rc != 0), "POSIXWRITE");
+ if (blk_size==0){
+ /* Contiguous pattern */
+ /* need to (off_t) the elmnts_begin expression because they */
+ /* may be of smaller sized integer types */
+ file_offset = dset_offset + (off_t)(elmts_begin + nelmts_xfer)*ELMT_SIZE;
+
+ /* only care if seek returns error */
+ rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
+ VRFY((rc==0), "POSIXSEEK");
+ /* check if all bytes are transferred */
+ rc = ((ssize_t)(nelmts_toxfer*ELMT_SIZE) ==
+ POSIXWRITE(fd->posixfd, buffer, nelmts_toxfer*ELMT_SIZE));
+ VRFY((rc != 0), "POSIXWRITE");
+ }else{
+ /* interleaved access pattern */
+ char *buf_p=buffer;
+ size_t xferred=0;
+ size_t toxfer=0;
+
+ file_offset = dset_offset + (off_t)(elmts_begin + nelmts_xfer)*ELMT_SIZE;
+ if (pio_debug_level >= 4) {
+HDprint_rank(output);
+HDfprintf(output,
+"Debug(do_write): "
+"nelmts_toxfer=%Hd, nelmts_xfer=%Hd\n"
+,
+(long_long)nelmts_toxfer, (long_long)nelmts_xfer);
+}
+ while (xferred < nelmts_toxfer){
+ if ((nelmts_toxfer - xferred) >= nelmts_in_blk)
+ toxfer = nelmts_in_blk;
+ else
+ toxfer = nelmts_toxfer - xferred;
+ /* Skip offset over blocks of other processes */
+ file_offset = dset_offset +
+ (off_t)(elmts_begin + (nelmts_xfer+xferred)*pio_mpi_nprocs_g)*ELMT_SIZE;
+ if (pio_debug_level >= 4) {
+HDprint_rank(output);
+HDfprintf(output,
+"Debug(do_write): "
+"nelmts_toxfer=%Hd, nelmts_xfer=%Hd"
+", toxfer=%Hd, xferred=%Hd"
+", file_offset=%Hd"
+"\n",
+(long_long)nelmts_toxfer, (long_long)nelmts_xfer,
+(long_long)toxfer, (long_long)xferred,
+(long_long)file_offset);
+}
+ /* only care if seek returns error */
+ rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
+ VRFY((rc==0), "POSIXSEEK");
+ /* check if all bytes are written */
+ rc = ((ssize_t)(toxfer*ELMT_SIZE) ==
+ POSIXWRITE(fd->posixfd, buf_p, toxfer*ELMT_SIZE));
+ VRFY((rc != 0), "POSIXWRITE");
+ xferred += toxfer;
+ }
+ }
break;
case MPIO:
- mpi_offset = dset_offset + (elmts_begin + nelmts_written)*ELMT_SIZE;
-
-#if AKCDEBUG
-fprintf(stderr, "proc %d: writes %ld bytes at mpi-offset %ld\n",
- pio_mpi_rank_g, nelmts_towrite*ELMT_SIZE, mpi_offset);
-#endif
-
+ mpi_offset = dset_offset + (elmts_begin + nelmts_xfer)*ELMT_SIZE;
mrc = MPI_File_write_at(fd->mpifd, mpi_offset, buffer,
- (int)(nelmts_towrite*ELMT_SIZE), MPI_CHAR,
+ (int)(nelmts_toxfer*ELMT_SIZE), MPI_CHAR,
&mpi_status);
VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE");
break;
case PHDF5:
/*set up the dset space id to select the segment to process */
{
- hsize_t block[1], stride[1], count[1];
- hssize_t start[1];
-
- start[0] = elmts_begin + nelmts_written;
- stride[0] = block[0] = nelmts_towrite;
- count[0] = 1;
+ h5mem_start[0] = elmts_begin + nelmts_xfer;
+ h5mem_stride[0] = h5mem_block[0] = nelmts_toxfer;
+ h5mem_count[0] = 1;
hrc = H5Sselect_hyperslab(h5dset_space_id, H5S_SELECT_SET,
- start, stride, count, block);
+ h5mem_start, h5mem_stride, h5mem_count, h5mem_block);
VRFY((hrc >= 0), "H5Sset_hyperslab");
/*setup the memory space id too. Only start is different */
- start[0] = 0;
+ h5mem_start[0] = 0;
hrc = H5Sselect_hyperslab(h5mem_space_id, H5S_SELECT_SET,
- start, stride, count, block);
+ h5mem_start, h5mem_stride, h5mem_count, h5mem_block);
VRFY((hrc >= 0), "H5Sset_hyperslab");
}
@@ -677,7 +772,7 @@ fprintf(stderr, "proc %d: writes %ld bytes at mpi-offset %ld\n",
break;
}
- nelmts_written += nelmts_towrite;
+ nelmts_xfer += nelmts_toxfer;
}
/* Stop "raw data" write timer */
@@ -732,7 +827,7 @@ done:
*/
static herr_t
do_read(results *res, file_descr *fd, parameters *parms, long ndsets,
- off_t nelmts, size_t buf_size, void *buffer /*out*/)
+ off_t nelmts, size_t blk_size, size_t buf_size, void *buffer /*out*/)
{
int ret_code = SUCCESS;
int rc; /*routine return code */
@@ -740,28 +835,33 @@ do_read(results *res, file_descr *fd, parameters *parms, long ndsets,
MPI_Offset mpi_offset;
MPI_Status mpi_status;
long ndset;
- size_t nelmts_toread;
- off_t nelmts_read;
+ off_t nelmts_xfer;
+ size_t nelmts_toxfer;
char dname[64];
off_t dset_offset; /*dataset offset in a file */
off_t file_offset; /*file offset of the next transfer */
off_t dset_size; /*one dataset size in bytes */
- size_t nelmts_in_buf;
+ size_t nelmts_in_buf; /*how many element the buffer holds */
+ size_t nelmts_in_blk; /*how many element a block holds */
off_t elmts_begin; /*first elmt this process transfer */
off_t elmts_count; /*number of elmts this process transfer */
/* HDF5 variables */
- herr_t hrc; /*HDF5 return code */
- hsize_t h5dims[1]; /*dataset dim sizes */
+ herr_t hrc; /*HDF5 return code */
+ hsize_t h5dims[1]; /*dataset dim sizes */
hid_t h5dset_space_id = -1; /*dataset space ID */
hid_t h5mem_space_id = -1; /*memory dataspace ID */
- hid_t h5ds_id = -1; /*dataset handle */
-
-#if AKCDEBUG
-fprintf(stderr, "In do_read\n");
-fprintf(stderr, "ndsets=%ld\n", ndsets);
-fprintf(stderr, "nelmts=%ld\n", nelmts);
-fprintf(stderr, "buffer size=%ld\n", buf_size);
+ hid_t h5ds_id = -1; /*dataset handle */
+ hsize_t h5mem_block[1]; /*memory space selection */
+ hsize_t h5mem_stride[1];
+ hsize_t h5mem_count[1];
+ hssize_t h5mem_start[1];
+#if 0
+ /* for future implementation */
+ hsize_t h5dset_block[1]; /*dset space selection */
+ hsize_t h5dset_stride[1];
+ hsize_t h5dset_count[1];
+ hssize_t h5dset_start[1];
#endif
/* calculate dataset parameters. data type is always native C int */
@@ -815,72 +915,174 @@ fprintf(stderr, "buffer size=%ld\n", buf_size);
break;
}
- /*
- * Calculate the first element and how many elements this process
- * transfer. First calculate the beginning element of this process
- * and the next process. Count of elements is the difference between
- * these two beginnings. This way, it avoids any rounding errors.
- */
- elmts_begin = (long)(((double)nelmts)/pio_mpi_nprocs_g*pio_mpi_rank_g);
-
- if (pio_mpi_rank_g < (pio_mpi_nprocs_g - 1))
- elmts_count = (long)((((double)nelmts) / pio_mpi_nprocs_g * (pio_mpi_rank_g + 1)) -
- (double)elmts_begin);
- else
- /* last process. Take whatever are left */
- elmts_count = nelmts - elmts_begin;
-
-#if AKCDEBUG
-fprintf(stderr, "proc %d: elmts_begin=%ld, elmts_count=%ld\n",
- pio_mpi_rank_g, elmts_begin, elmts_count);
-#endif
-
- nelmts_read = 0 ;
+ /* There are two kinds of transfer patterns, contiguous and interleaved.
+ * Let 0,1,2,...,n be data accessed by process 0,1,2,...,n
+ * where n is rank of the last process.
+ * In contiguous pattern, data are accessed as
+ * 000...111...222...nnn...
+ * In interleaved pattern, data are accessed as
+ * 012...n012...n...
+ * These are all in the scope of one dataset.
+ */
+ /* Calculate the total number of elements (elmts_count) to be
+ * transferred by this process. It may be different for different
+ * transfer pattern due to rounding to integral values.
+ */
+ if (blk_size==0){
+ /* Contiguous Pattern:
+ * Calculate the beginning element of this process and the next.
+ * elmts_count is the difference between these two beginnings.
+ * This way, it eliminates any rounding errors.
+ */
+ elmts_begin = (off_t)(((double)nelmts)/pio_mpi_nprocs_g*pio_mpi_rank_g);
+
+ /* Do not cast elmts_begin to other types, especially non-integral
+ * types, else it may introduce rounding discrepancy. */
+ if (pio_mpi_rank_g < (pio_mpi_nprocs_g - 1))
+ elmts_count = (off_t)(((double)nelmts) / pio_mpi_nprocs_g * (pio_mpi_rank_g + 1))
+ - elmts_begin;
+ else
+ /* last process. Take whatever are left */
+ elmts_count = nelmts - elmts_begin;
+ }else{
+ /* Interleaved Pattern:
+ * Each process takes blk_size of elements, starting with the first
+ * process. So, the last process may have fewer or even none.
+ * Calculate the beginning element of this process and the next.
+ * The elmts_begin here marks only the beginning of the first
+ * block accessed by this process.
+ */
+ /* Algorithm:
+ * First allocate equal blocks per process, i.e. one block each
+ * process for every block_size*nprocs.
+ * If there is remaining unallocated, give a block each to process
+ * starting at proc 0. The last process may get a partial block.
+ */
+ off_t remain_nelmts, remain_begin; /* unallocated remaining*/
+
+ nelmts_in_blk = blk_size/ELMT_SIZE;
+ elmts_begin = (off_t)(nelmts_in_blk*pio_mpi_rank_g);
+
+ /* must use integer calculation next */
+ /* allocate equal blocks per process */
+ elmts_count = (nelmts / (off_t)(nelmts_in_blk*pio_mpi_nprocs_g)) *
+ (off_t)nelmts_in_blk;
+ remain_nelmts = nelmts % ((off_t)(nelmts_in_blk*pio_mpi_nprocs_g));
+
+ /* allocate any remaining */
+ remain_begin = (off_t)(nelmts_in_blk*pio_mpi_rank_g);
+ if (remain_nelmts > remain_begin){
+ /* it gets something */
+ if (remain_nelmts > (remain_begin+(off_t)nelmts_in_blk)){
+ /* one full block */
+ elmts_count += nelmts_in_blk;
+ }else{
+ /* only a partial block */
+ elmts_count += remain_nelmts - remain_begin;
+ }
+ }
+ }
+ /* debug */
+ if (pio_debug_level >= 4) {
+ HDprint_rank(output);
+ HDfprintf(output, "Debug(do_read): "
+ "nelmts_in_blk=%Hd, elmts_begin=%Hd, elmts_count=%Hd\n",
+ (long_long)nelmts_in_blk, (long_long)elmts_begin,
+ (long_long)elmts_count);
+ }
+
+
+ /* The task is to transfer elmts_count elements, starting at
+ * elmts_begin position, using transfer buffer of buf_size bytes.
+ * If blk_size > 0, select blk_size at a time, in round robin
+ * fashion, according to number of process. Otherwise, select
+ * all elmt_count in contiguous.
+ */
+ nelmts_xfer = 0 ;
/* Start "raw data" read timer */
set_time(res->timers, HDF5_RAW_READ_FIXED_DIMS, START);
- while (nelmts_read < elmts_count){
- nelmts_toread = elmts_count - nelmts_read;
-
- if ((elmts_count - nelmts_read) >= nelmts_in_buf)
- nelmts_toread = nelmts_in_buf;
- else
- /* last read of a partial buffer */
- nelmts_toread = elmts_count - nelmts_read;
+ while (nelmts_xfer < elmts_count){
+ /* transfer one buffer of data each round */
+ /* Note: because size_t is unsigned, avoid expressions that */
+ /* can be negative. */
+ if ((nelmts_xfer + (off_t)nelmts_in_buf) <= elmts_count) {
+ nelmts_toxfer = nelmts_in_buf;
+ } else {
+ /* last transfer of a partial buffer */
+ nelmts_toxfer = elmts_count - nelmts_xfer;
+ }
/* read */
/* Calculate offset of read within a dataset/file */
switch (parms->io_type){
case POSIXIO:
- /* need to (off_t) the elmnts_begin expression because they */
- /* may be of smaller sized integer types */
- file_offset = dset_offset + (off_t)(elmts_begin + nelmts_read)*ELMT_SIZE;
-
-#if AKCDEBUG
-HDfprintf(stderr, "proc %d: read %Hd bytes at file-offset %Hd\n",
- pio_mpi_rank_g, (long_long)nelmts_towrite*ELMT_SIZE, (long_long)file_offset);
-#endif
-
- /* only care if seek returns error */
- rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
- VRFY((rc==0), "POSIXSEEK");
- /* check if all bytes are read */
- rc = ((nelmts_toread*ELMT_SIZE) ==
- POSIXREAD(fd->posixfd, buffer, nelmts_toread*ELMT_SIZE));
- VRFY((rc != 0), "POSIXREAD");
+ if (blk_size==0){
+ /* Contiguous pattern */
+ /* need to (off_t) the elmnts_begin expression because they */
+ /* may be of smaller sized integer types */
+ file_offset = dset_offset + (off_t)(elmts_begin + nelmts_xfer)*ELMT_SIZE;
+
+ /* only care if seek returns error */
+ rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
+ VRFY((rc==0), "POSIXSEEK");
+ /* check if all bytes are transferred */
+ rc = ((ssize_t)(nelmts_toxfer*ELMT_SIZE) ==
+ POSIXREAD(fd->posixfd, buffer, nelmts_toxfer*ELMT_SIZE));
+ VRFY((rc != 0), "POSIXREAD");
+ }else{
+ /* interleaved access pattern */
+ char *buf_p=buffer;
+ size_t xferred=0;
+ size_t toxfer=0;
+
+ file_offset = dset_offset + (off_t)(elmts_begin + nelmts_xfer)*ELMT_SIZE;
+ if (pio_debug_level >= 4) {
+HDprint_rank(output);
+HDfprintf(output,
+"Debug(do_read): "
+"nelmts_toxfer=%Hd, nelmts_xfer=%Hd\n"
+,
+(long_long)nelmts_toxfer, (long_long)nelmts_xfer);
+}
+ while (xferred < nelmts_toxfer){
+ if ((nelmts_toxfer - xferred) >= nelmts_in_blk)
+ toxfer = nelmts_in_blk;
+ else
+ toxfer = nelmts_toxfer - xferred;
+ /* Skip offset over blocks of other processes */
+ file_offset = dset_offset +
+ (off_t)(elmts_begin + (nelmts_xfer+xferred)*pio_mpi_nprocs_g)*ELMT_SIZE;
+ if (pio_debug_level >= 4) {
+HDprint_rank(output);
+HDfprintf(output,
+"Debug(do_read):"
+"nelmts_toxfer=%Hd, nelmts_xfer=%Hd"
+", toxfer=%Hd, xferred=%Hd"
+", file_offset=%Hd"
+"\n",
+(long_long)nelmts_toxfer, (long_long)nelmts_xfer,
+(long_long)toxfer, (long_long)xferred,
+(long_long)file_offset);
+}
+ /* only care if seek returns error */
+ rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
+ VRFY((rc==0), "POSIXSEEK");
+ /* check if all bytes are transferred */
+ rc = ((ssize_t)(toxfer*ELMT_SIZE) ==
+ POSIXREAD(fd->posixfd, buf_p, toxfer*ELMT_SIZE));
+ VRFY((rc != 0), "POSIXREAD");
+ xferred += toxfer;
+ }
+ }
break;
case MPIO:
- mpi_offset = dset_offset + (elmts_begin + nelmts_read)*ELMT_SIZE;
-
-#if AKCDEBUG
-fprintf(stderr, "proc %d: read %ld bytes at mpi-offset %ld\n",
- pio_mpi_rank_g, nelmts_toread*ELMT_SIZE, mpi_offset);
-#endif
+ mpi_offset = dset_offset + (elmts_begin + nelmts_xfer)*ELMT_SIZE;
mrc = MPI_File_read_at(fd->mpifd, mpi_offset, buffer,
- (int)(nelmts_toread*ELMT_SIZE), MPI_CHAR,
+ (int)(nelmts_toxfer*ELMT_SIZE), MPI_CHAR,
&mpi_status);
VRFY((mrc==MPI_SUCCESS), "MPIO_read");
break;
@@ -888,20 +1090,17 @@ fprintf(stderr, "proc %d: read %ld bytes at mpi-offset %ld\n",
case PHDF5:
/*set up the dset space id to select the segment to process */
{
- hsize_t block[1], stride[1], count[1];
- hssize_t start[1];
-
- start[0] = elmts_begin + nelmts_read;
- stride[0] = block[0] = nelmts_toread;
- count[0] = 1;
+ h5mem_start[0] = elmts_begin + nelmts_xfer;
+ h5mem_stride[0] = h5mem_block[0] = nelmts_toxfer;
+ h5mem_count[0] = 1;
hrc = H5Sselect_hyperslab(h5dset_space_id, H5S_SELECT_SET,
- start, stride, count, block);
+ h5mem_start, h5mem_stride, h5mem_count, h5mem_block);
VRFY((hrc >= 0), "H5Sset_hyperslab");
/*setup the memory space id too. Only start is different */
- start[0] = 0;
+ h5mem_start[0] = 0;
hrc = H5Sselect_hyperslab(h5mem_space_id, H5S_SELECT_SET,
- start, stride, count, block);
+ h5mem_start, h5mem_stride, h5mem_count, h5mem_block);
VRFY((hrc >= 0), "H5Sset_hyperslab");
}
@@ -912,19 +1111,20 @@ fprintf(stderr, "proc %d: read %ld bytes at mpi-offset %ld\n",
break;
}
-#if AKCDEBUG & 0
+#ifdef AKCDEBUG
/*verify read data*/
{
int *intptr = (int *)buffer;
register int i;
- for (i = 0; i < nelmts_towrite; ++i)
+ for (i = 0; i < nelmts_toxfer; ++i)
/* TO BE IMPLEMENTED */
+ #error "NOT IMPLEMENTED YET"
;
}
#endif
- nelmts_read += nelmts_toread;
+ nelmts_xfer += nelmts_toxfer;
}
/* Stop "raw data" read timer */
@@ -1254,9 +1454,6 @@ stop_data_shipping(int handle)
#endif /* H5_HAVE_GPFS */
-#ifndef TIME_MPI
-#define TIME_MPI
-#endif
#ifdef TIME_MPI
/* instrument the MPI_File_write_xxx and read_xxx calls to measure
* pure time spent in MPI_File code.
diff --git a/perform/pio_perf.c b/perform/pio_perf.c
index e43e32d..73ff4eb 100644
--- a/perform/pio_perf.c
+++ b/perform/pio_perf.c
@@ -72,9 +72,9 @@
#define ONE_MB (ONE_KB * ONE_KB)
#define ONE_GB (ONE_MB * ONE_KB)
-#define PIO_POSIX 0x10
-#define PIO_MPI 0x20
-#define PIO_HDF5 0x40
+#define PIO_POSIX 0x1
+#define PIO_MPI 0x2
+#define PIO_HDF5 0x4
/* report 0.0 in case t is zero too */
#define MB_PER_SEC(bytes,t) (((t)==0.0) ? 0.0 : ((((double)bytes) / ONE_MB) / (t)))
@@ -103,9 +103,9 @@ static const char *progname = "pio_perf";
* adding more, make sure that they don't clash with each other.
*/
#if 1
-static const char *s_opts = "ha:A:cD:f:P:p:X:x:nd:F:i:o:stT:";
+static const char *s_opts = "ha:A:B:cD:f:P:p:X:x:nd:F:i:o:stT:";
#else
-static const char *s_opts = "ha:A:bcD:f:P:p:X:x:nd:F:i:o:stT:";
+static const char *s_opts = "ha:A:bB:cD:f:P:p:X:x:nd:F:i:o:stT:";
#endif /* 1 */
static struct long_options l_opts[] = {
{ "help", no_arg, 'h' },
@@ -125,6 +125,15 @@ static struct long_options l_opts[] = {
{ "bin", no_arg, 'b' },
{ "bi", no_arg, 'b' },
#endif /* 0 */
+ { "block-size", require_arg, 'B' },
+ { "block-siz", require_arg, 'B' },
+ { "block-si", require_arg, 'B' },
+ { "block-s", require_arg, 'B' },
+ { "block-", require_arg, 'B' },
+ { "block", require_arg, 'B' },
+ { "bloc", require_arg, 'B' },
+ { "blo", require_arg, 'B' },
+ { "bl", require_arg, 'B' },
{ "chunk", no_arg, 'c' },
{ "chun", no_arg, 'c' },
{ "chu", no_arg, 'c' },
@@ -227,6 +236,7 @@ struct options {
int min_num_procs; /* minimum number of processes to use */
size_t max_xfer_size; /* maximum transfer buffer size */
size_t min_xfer_size; /* minimum transfer buffer size */
+ size_t block_size; /* interleaved block size */
int print_times; /* print times as well as throughputs */
int print_raw; /* print raw data throughput info */
off_t h5_alignment; /* alignment in HDF5 file */
@@ -361,25 +371,11 @@ run_test_loop(struct options *opts)
parameters parms;
int num_procs;
int doing_pio; /* if this process is doing PIO */
- int io_runs = PIO_HDF5 | PIO_MPI | PIO_POSIX; /* default to run all tests */
-
- if (opts->io_types & ~0x7) {
- /* we want to run only a select subset of these tests */
- io_runs = 0;
-
- if (opts->io_types & PIO_HDF5)
- io_runs |= PIO_HDF5;
-
- if (opts->io_types & PIO_MPI)
- io_runs |= PIO_MPI;
-
- if (opts->io_types & PIO_POSIX)
- io_runs |= PIO_POSIX;
- }
parms.num_files = opts->num_files;
parms.num_dsets = opts->num_dsets;
parms.num_iters = opts->num_iters;
+ parms.block_size = opts->block_size;
parms.h5_align = opts->h5_alignment;
parms.h5_thresh = opts->h5_threshold;
parms.h5_use_chunks = opts->h5_use_chunks;
@@ -417,13 +413,13 @@ run_test_loop(struct options *opts)
output_report(" # of files: %ld, # of dsets: %ld, # of elmts per dset: %ld\n",
parms.num_files, parms.num_dsets, parms.num_elmts);
- if (io_runs & PIO_POSIX)
+ if (opts->io_types & PIO_POSIX)
run_test(POSIXIO, parms, opts);
- if (io_runs & PIO_MPI)
+ if (opts->io_types & PIO_MPI)
run_test(MPIO, parms, opts);
- if (io_runs & PIO_HDF5)
+ if (opts->io_types & PIO_HDF5)
run_test(PHDF5, parms, opts);
/* Run the tests once if buf_size==0, but then break out */
@@ -965,6 +961,9 @@ report_parameters(struct options *opts)
recover_size_and_print((long_long)opts->min_xfer_size, ":");
recover_size_and_print((long_long)opts->max_xfer_size, "\n");
+ HDfprintf(output, "rank %d: Interleaved block size=", rank);
+ recover_size_and_print((long_long)opts->block_size, "\n");
+
{
char *prefix = getenv("HDF5_PARAPREFIX");
@@ -1005,6 +1004,7 @@ parse_command_line(int argc, char *argv[])
cl_opts->min_num_procs = 1;
cl_opts->max_xfer_size = 1 * ONE_MB;
cl_opts->min_xfer_size = 128 * ONE_KB;
+ cl_opts->block_size = 0; /* no interleaved I/O */
cl_opts->print_times = 0; /* Printing times is off by default */
cl_opts->print_raw = 0; /* Printing raw data throughput is off by default */
cl_opts->h5_alignment = 1; /* No alignment for HDF5 objects by default */
@@ -1018,8 +1018,6 @@ parse_command_line(int argc, char *argv[])
cl_opts->h5_alignment = parse_size_directive(opt_arg);
break;
case 'A':
- cl_opts->io_types &= ~0x7;
-
{
const char *end = opt_arg;
@@ -1058,6 +1056,9 @@ parse_command_line(int argc, char *argv[])
/* the future "binary" option */
break;
#endif /* 0 */
+ case 'B':
+ cl_opts->block_size = parse_size_directive(opt_arg);
+ break;
case 'c': /* Turn on chunked HDF5 dataset creation */
cl_opts->h5_use_chunks = 1;
break;
@@ -1164,9 +1165,8 @@ parse_command_line(int argc, char *argv[])
}
/* set default if none specified yet */
- if (!cl_opts->io_types){
+ if (!cl_opts->io_types)
cl_opts->io_types = PIO_HDF5 | PIO_MPI | PIO_POSIX; /* run all API */
- }
return cl_opts;
}
@@ -1244,6 +1244,8 @@ usage(const char *prog)
#if 0
printf(" -b, --binary The elusive binary option\n");
#endif /* 0 */
+ printf(" -B S, --block-size=S Interleaved block size\n");
+ printf(" [default: 0 no interleaved IO]\n");
printf(" -c, --chunk Create HDF5 datasets chunked [default: off]\n");
printf(" -d N, --num-dsets=N Number of datasets per file [default:1]\n");
printf(" -D DL, --debug=DL Indicate the debugging level\n");
diff --git a/perform/pio_perf.h b/perform/pio_perf.h
index b36dd82..89713c1 100644
--- a/perform/pio_perf.h
+++ b/perform/pio_perf.h
@@ -31,6 +31,7 @@ typedef struct parameters_ {
off_t num_elmts; /* Number of native ints in each dset */
int num_iters; /* Number of times to loop doing the IO */
size_t buf_size; /* Buffer size */
+ size_t block_size; /* interleaved block size */
hsize_t h5_align; /* HDF5 object alignment */
hsize_t h5_thresh; /* HDF5 object alignment threshold */
unsigned h5_use_chunks; /* Make HDF5 dataset chunked */
@@ -65,6 +66,13 @@ extern int pio_debug_level; /* The debug level:
* 4 - Even More Debugging (timer stuff)
*/
+#define HDprint_rank(f) /* print rank in MPI_COMM_WORLD */ \
+ HDfprintf(f, "%d: ", comm_world_rank_g);
+#define HDprint_size(f) /* print size of MPI_COMM_WORLD */ \
+ HDfprintf(f, "%d", comm_world_nprocs_g);
+#define HDprint_rank_size(f) /* print rank/size of MPI_COMM_WORLD */ \
+ HDfprintf(f, "%d/%d: ", comm_world_rank_g, comm_world_nprocs_g);
+
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */