From e858a1310df466b41c2e743915d8b57f999aa4ad Mon Sep 17 00:00:00 2001 From: Quincey Koziol Date: Tue, 2 Jul 2002 15:06:22 -0500 Subject: [svn-r5760] Purpose: New features, etc. Description: Bring over all the recent changes from the release branch. Platforms tested: IRIX64 6.5 (modi4) w/parallel --- perform/pio_engine.c | 1180 ++++++++++++++++++++++++++++++-------------------- perform/pio_perf.c | 127 ++++-- perform/pio_perf.h | 3 +- 3 files changed, 803 insertions(+), 507 deletions(-) diff --git a/perform/pio_engine.c b/perform/pio_engine.c index a9499d3..126d88a 100644 --- a/perform/pio_engine.c +++ b/perform/pio_engine.c @@ -45,9 +45,9 @@ /* sizes of various items. these sizes won't change during program execution */ /* The following three must have the same type */ -#define ELMT_SIZE (sizeof(int)) /* we're doing ints */ -#define ELMT_MPI_TYPE MPI_INT -#define ELMT_H5_TYPE H5T_NATIVE_INT +#define ELMT_SIZE (sizeof(unsigned char)) /* we're doing bytes */ +#define ELMT_MPI_TYPE MPI_BYTE +#define ELMT_H5_TYPE H5T_NATIVE_UCHAR #define GOTOERROR(errcode) { ret_code = errcode; goto done; } #define GOTODONE { goto done; } @@ -130,6 +130,7 @@ static herr_t do_fclose(iotype iot, file_descr *fd); static void do_cleanupfile(iotype iot, char *fname); /* GPFS-specific functions */ +#ifdef H5_HAVE_GPFS static void access_range(int handle, off_t start, off_t length, int is_write); static void free_range(int handle, off_t start, off_t length); static void clear_file_cache(int handle); @@ -137,6 +138,7 @@ static void cancel_hints(int handle); static void start_data_shipping(int handle, int num_insts); static void stop_data_shipping(int handle); static void invalidate_file_cache(const char *filename); +#endif /* H5_HAVE_GPFS */ /* * Function: do_pio @@ -156,12 +158,12 @@ do_pio(parameters param) iotype iot; char fname[FILENAME_MAX]; - int maxprocs; - long nfiles, nf; + long nf; long ndsets; - off_t nelmts; + off_t nbytes; /* Number of bytes per dataset */ char *buffer = NULL; 
/*data buffer pointer */ size_t buf_size; /*data buffer size in bytes */ + size_t blk_size; /*data block size in bytes */ /* HDF5 variables */ herr_t hrc; /*HDF5 return code */ @@ -190,16 +192,15 @@ do_pio(parameters param) GOTOERROR(FAIL); } - nfiles = param.num_files; /* number of files */ ndsets = param.num_dsets; /* number of datasets per file */ - nelmts = param.num_elmts; /* number of elements per dataset */ - maxprocs = param.num_procs; /* max number of mpi-processes to use */ + nbytes = param.num_bytes; /* number of bytes per dataset */ buf_size = param.buf_size; + blk_size = param.blk_size; - if (nfiles < 0 ) { + if (param.num_files < 0 ) { fprintf(stderr, "number of files must be >= 0 (%ld)\n", - nfiles); + param.num_files); GOTOERROR(FAIL); } @@ -210,26 +211,51 @@ do_pio(parameters param) GOTOERROR(FAIL); } - if (maxprocs <= 0 ) { + if (param.num_procs <= 0 ) { fprintf(stderr, "maximum number of process to use must be > 0 (%d)\n", - maxprocs); + param.num_procs); GOTOERROR(FAIL); } - /* allocate transfer buffer */ + /* Validate transfer buffer size & block size*/ + if(blk_size<=0) { + HDfprintf(stderr, + "Transfer block size (%Hd) must be > 0\n", (long_long)blk_size); + GOTOERROR(FAIL); + } if(buf_size<=0) { HDfprintf(stderr, "Transfer buffer size (%Hd) must be > 0\n", (long_long)buf_size); GOTOERROR(FAIL); - }else{ - buffer = malloc(buf_size); + } + if ((buf_size % blk_size) != 0){ + HDfprintf(stderr, + "Transfer buffer size (%Hd) must be a multiple of the " + "interleaved I/O block size (%Hd)\n", + (long_long)buf_size, (long_long)blk_size); + GOTOERROR(FAIL); + } + if((nbytes%pio_mpi_nprocs_g)!=0) { + HDfprintf(stderr, + "Dataset size (%Hd) must be a multiple of the " + "number of processes (%d)\n", + (long_long)nbytes, pio_mpi_nprocs_g); + GOTOERROR(FAIL); + } + if(((nbytes/pio_mpi_nprocs_g)%buf_size)!=0) { + HDfprintf(stderr, + "Dataset size/process (%Hd) must be a multiple of the " + "trasfer buffer size (%Hd)\n", + 
(long_long)(nbytes/pio_mpi_nprocs_g), (long_long)buf_size); + GOTOERROR(FAIL); + } - if (buffer == NULL){ - HDfprintf(stderr, "malloc for transfer buffer size (%Hd) failed\n", - (long_long)buf_size); - GOTOERROR(FAIL); - } + /* Allocate transfer buffer */ + if ((buffer = malloc(buf_size)) == NULL){ + HDfprintf(stderr, "malloc for transfer buffer size (%Hd) failed\n", + (long_long)(buf_size)); + GOTOERROR(FAIL); } if (pio_debug_level >= 4) { @@ -242,25 +268,26 @@ do_pio(parameters param) fprintf(output, "Timer details:\n"); } - for (nf = 1; nf <= nfiles; nf++) { + for (nf = 1; nf <= param.num_files; nf++) { /* * Write performance measurement */ /* Open file for write */ char base_name[256]; - MPI_Barrier(pio_comm_g); - sprintf(base_name, "#pio_tmp_%lu", nf); pio_create_filename(iot, base_name, fname, sizeof(fname)); + /* Need barrier to make sure everyone starts at the same time */ + MPI_Barrier(pio_comm_g); + set_time(res.timers, HDF5_GROSS_WRITE_FIXED_DIMS, START); hrc = do_fopen(&param, fname, &fd, PIO_CREATE | PIO_WRITE); VRFY((hrc == SUCCESS), "do_fopen failed"); set_time(res.timers, HDF5_FINE_WRITE_FIXED_DIMS, START); - hrc = do_write(&res, &fd, &param, ndsets, nelmts, buf_size, buffer); + hrc = do_write(&res, &fd, &param, ndsets, nbytes, buf_size, buffer); set_time(res.timers, HDF5_FINE_WRITE_FIXED_DIMS, STOP); VRFY((hrc == SUCCESS), "do_write failed"); @@ -275,6 +302,10 @@ do_pio(parameters param) /* * Read performance measurement */ + /* Need barrier to make sure everyone is done writing and has + * closed the file. Also to make sure everyone starts reading + * at the same time. 
+ */ MPI_Barrier(pio_comm_g); /* Open file for read */ @@ -284,7 +315,7 @@ do_pio(parameters param) VRFY((hrc == SUCCESS), "do_fopen failed"); set_time(res.timers, HDF5_FINE_READ_FIXED_DIMS, START); - hrc = do_read(&res, &fd, &param, ndsets, nelmts, buf_size, buffer); + hrc = do_read(&res, &fd, &param, ndsets, nbytes, buf_size, buffer); set_time(res.timers, HDF5_FINE_READ_FIXED_DIMS, STOP); VRFY((hrc == SUCCESS), "do_read failed"); @@ -295,6 +326,8 @@ do_pio(parameters param) VRFY((hrc == SUCCESS), "do_fclose failed"); } + /* Need barrier to make sure everyone is done with the file */ + /* before it may be removed by do_cleanupfile */ MPI_Barrier(pio_comm_g); do_cleanupfile(iot, fname); } @@ -447,23 +480,31 @@ pio_create_filename(iotype iot, const char *base_name, char *fullname, size_t si */ static herr_t do_write(results *res, file_descr *fd, parameters *parms, long ndsets, - off_t nelmts, size_t buf_size, void *buffer) + off_t nbytes, size_t buf_size, void *buffer) { int ret_code = SUCCESS; int rc; /*routine return code */ - int mrc; /*MPI return code */ - MPI_Offset mpi_offset; - MPI_Status mpi_status; long ndset; - off_t nelmts_xfer; - size_t nelmts_toxfer; + size_t blk_size; /* The block size to subdivide the xfer buffer into */ + off_t nbytes_xfer; /* Total number of bytes transferred so far */ + size_t nbytes_toxfer; /* Number of bytes to transfer a particular time */ char dname[64]; - off_t dset_offset=0; /*dataset offset in a file */ - off_t file_offset; /*file offset of the next transfer */ - off_t dset_size; /*one dataset size in bytes */ - size_t nelmts_in_buf; /*how many element the buffer holds */ - off_t elmts_begin; /*first elmt this process transfer */ - off_t elmts_count; /*number of elmts this process transfer */ + off_t dset_offset=0; /*dataset offset in a file */ + off_t bytes_begin; /*first elmt this process transfer */ + off_t bytes_count; /*number of elmts this process transfer */ + unsigned char *buf_p; /* Current buffer pointer */ + + /* POSIX 
variables */ + off_t file_offset; /* File offset of the next transfer */ + off_t posix_file_offset; /* Base file offset of the next transfer */ + + /* MPI variables */ + MPI_Offset mpi_file_offset;/* Base file offset of the next transfer*/ + MPI_Offset mpi_offset; /* Offset in MPI file */ + MPI_Datatype mpi_file_type; /* MPI derived type for file */ + MPI_Datatype mpi_blk_type; /* MPI derived type for buffer */ + MPI_Status mpi_status; + int mrc; /* MPI return code */ /* HDF5 variables */ herr_t hrc; /*HDF5 return code */ @@ -471,57 +512,146 @@ do_write(results *res, file_descr *fd, parameters *parms, long ndsets, hid_t h5dset_space_id = -1; /*dataset space ID */ hid_t h5mem_space_id = -1; /*memory dataspace ID */ hid_t h5ds_id = -1; /*dataset handle */ - hsize_t h5block[1]; /*dataspace selection */ + hsize_t h5block[1]; /*dataspace selection */ hsize_t h5stride[1]; hsize_t h5count[1]; hssize_t h5start[1]; + hssize_t h5offset[1]; /* Selection offset within dataspace */ hid_t h5dcpl = -1; /* Dataset creation property list */ hid_t h5dxpl = -1; /* Dataset transfer property list */ - /* calculate dataset parameters. 
data type is always native C int */ - dset_size = nelmts * (off_t)ELMT_SIZE; - nelmts_in_buf = buf_size/ELMT_SIZE; - - /* hdf5 data space setup */ - if (parms->io_type == PHDF5){ - if(nelmts>0) { - /* define a contiquous dataset of nelmts native ints */ - h5dims[0] = nelmts; - h5dset_space_id = H5Screate_simple(1, h5dims, NULL); - VRFY((h5dset_space_id >= 0), "H5Screate_simple"); - } /* end if */ - else { - h5dset_space_id = H5Screate(H5S_SCALAR); - VRFY((h5dset_space_id >= 0), "H5Screate"); - } /* end else */ - - /* Create the memory dataspace that corresponds to the xfer buffer */ - if(nelmts_in_buf>0) { - h5dims[0] = nelmts_in_buf; - h5mem_space_id = H5Screate_simple(1, h5dims, NULL); - VRFY((h5mem_space_id >= 0), "H5Screate_simple"); - } /* end if */ - else { - h5mem_space_id = H5Screate(H5S_SCALAR); - VRFY((h5mem_space_id >= 0), "H5Screate"); - } /* end else */ - - /* Create the dataset transfer property list */ - h5dxpl = H5Pcreate(H5P_DATASET_XFER); - if (h5dxpl < 0) { - fprintf(stderr, "HDF5 Property List Create failed\n"); - GOTOERROR(FAIL); - } + /* Get the parameters from the parameter block */ + blk_size=parms->blk_size; + + /* Prepare buffer for verifying data */ + if (parms->verify) + memset(buffer,pio_mpi_rank_g,buf_size); + + /* There are two kinds of transfer patterns, contiguous and interleaved. + * Let 0,1,2,...,n be data accessed by process 0,1,2,...,n + * where n is rank of the last process. + * In contiguous pattern, data are accessed as + * 000...111...222...nnn... + * In interleaved pattern, data are accessed as + * 012...n012...n... + * These are all in the scope of one dataset. + */ + if (parms->interleaved==0) { + /* Contiguous Pattern: */ + bytes_begin = (off_t)(((double)nbytes*pio_mpi_rank_g)/pio_mpi_nprocs_g); + } /* end if */ + else { + /* Interleaved Pattern: */ + bytes_begin = (off_t)(blk_size*pio_mpi_rank_g); + } /* end else */ + + /* Calculate the total number of bytes (bytes_count) to be + * transferred by this process. 
It may be different for different + * transfer pattern due to rounding to integral values. + */ + /* + * Calculate the beginning bytes of this process and the next. + * bytes_count is the difference between these two beginnings. + * This way, it eliminates any rounding errors. + * (This is tricky, don't mess with the formula, rounding errors + * can easily get introduced) */ + bytes_count = (off_t)(((double)nbytes*(pio_mpi_rank_g+1)) / pio_mpi_nprocs_g) + - (off_t)(((double)nbytes*pio_mpi_rank_g) / pio_mpi_nprocs_g); + + /* debug */ + if (pio_debug_level >= 4) { + HDprint_rank(output); + HDfprintf(output, "Debug(do_write): " + "buf_size=%Hd, bytes_begin=%Hd, bytes_count=%Hd\n", + (long_long)buf_size, (long_long)bytes_begin, + (long_long)bytes_count); + } - /* Change to collective I/O, if asked */ - if(parms->collective) { - hrc = H5Pset_dxpl_mpio(h5dxpl, H5FD_MPIO_COLLECTIVE); - if (hrc < 0) { - fprintf(stderr, "HDF5 Property List Set failed\n"); + /* I/O Access specific setup */ + switch (parms->io_type) { + case POSIXIO: + /* No extra setup */ + break; + + case MPIO: /* MPI-I/O setup */ + /* Build block's derived type */ + mrc = MPI_Type_contiguous((int)blk_size, + MPI_BYTE, &mpi_blk_type); + VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE"); + + /* Build file's derived type */ + mrc = MPI_Type_vector((int)(buf_size/blk_size), (int)1, + (int)pio_mpi_nprocs_g, mpi_blk_type, &mpi_file_type); + VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE"); + + /* Commit file type */ + mrc = MPI_Type_commit( &mpi_file_type ); + VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT"); + + /* Commit buffer type */ + mrc = MPI_Type_commit( &mpi_blk_type ); + VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT"); + break; + + case PHDF5: /* HDF5 setup */ + if(nbytes>0) { + /* define a contiquous dataset of nbytes native bytes */ + h5dims[0] = nbytes; + h5dset_space_id = H5Screate_simple(1, h5dims, NULL); + VRFY((h5dset_space_id >= 0), "H5Screate_simple"); + + /* Set up the file dset space id to select the 
pattern to access */ + if (parms->interleaved==0){ + /* Contiguous pattern */ + h5start[0] = bytes_begin; + h5stride[0] = h5block[0] = blk_size; + h5count[0] = buf_size/blk_size; + } /* end if */ + else { + /* Interleaved access pattern */ + /* Skip offset over blocks of other processes */ + h5start[0] = bytes_begin; + h5stride[0] = blk_size*pio_mpi_nprocs_g; + h5block[0] = blk_size; + h5count[0] = buf_size/blk_size; + } /* end else */ + hrc = H5Sselect_hyperslab(h5dset_space_id, H5S_SELECT_SET, + h5start, h5stride, h5count, h5block); + VRFY((hrc >= 0), "H5Sselect_hyperslab"); + } /* end if */ + else { + h5dset_space_id = H5Screate(H5S_SCALAR); + VRFY((h5dset_space_id >= 0), "H5Screate"); + } /* end else */ + + /* Create the memory dataspace that corresponds to the xfer buffer */ + if(buf_size>0) { + h5dims[0] = buf_size; + h5mem_space_id = H5Screate_simple(1, h5dims, NULL); + VRFY((h5mem_space_id >= 0), "H5Screate_simple"); + } /* end if */ + else { + h5mem_space_id = H5Screate(H5S_SCALAR); + VRFY((h5mem_space_id >= 0), "H5Screate"); + } /* end else */ + + /* Create the dataset transfer property list */ + h5dxpl = H5Pcreate(H5P_DATASET_XFER); + if (h5dxpl < 0) { + fprintf(stderr, "HDF5 Property List Create failed\n"); GOTOERROR(FAIL); + } + + /* Change to collective I/O, if asked */ + if(parms->collective) { + hrc = H5Pset_dxpl_mpio(h5dxpl, H5FD_MPIO_COLLECTIVE); + if (hrc < 0) { + fprintf(stderr, "HDF5 Property List Set failed\n"); + GOTOERROR(FAIL); + } /* end if */ } /* end if */ - } /* end if */ - } + break; + } /* end switch */ for (ndset = 1; ndset <= ndsets; ++ndset) { @@ -532,7 +662,7 @@ do_write(results *res, file_descr *fd, parameters *parms, long ndsets, case POSIXIO: case MPIO: /* both posix and mpi io just need dataset offset in file*/ - dset_offset = (ndset - 1) * dset_size; + dset_offset = (ndset - 1) * nbytes; break; case PHDF5: @@ -545,7 +675,7 @@ do_write(results *res, file_descr *fd, parameters *parms, long ndsets, /* Make the dataset chunked if 
asked */ if(parms->h5_use_chunks) { /* Set the chunk size to be the same as the buffer size */ - h5dims[0] = nelmts_in_buf; + h5dims[0] = buf_size; hrc = H5Pset_chunk(h5dcpl, 1, h5dims); if (hrc < 0) { fprintf(stderr, "HDF5 Property List Set failed\n"); @@ -583,201 +713,191 @@ do_write(results *res, file_descr *fd, parameters *parms, long ndsets, break; } - /* There are two kinds of transfer patterns, contiguous and interleaved. - * Let 0,1,2,...,n be data accessed by process 0,1,2,...,n - * where n is rank of the last process. - * In contiguous pattern, data are accessed as - * 000...111...222...nnn... - * In interleaved pattern, data are accessed as - * 012...n012...n... - * These are all in the scope of one dataset. - */ - /* Calculate the total number of elements (elmts_count) to be - * transferred by this process. It may be different for different - * transfer pattern due to rounding to integral values. - */ - if (parms->interleaved==0) { - /* Contiguous Pattern: - * Calculate the beginning element of this process and the next. - * elmts_count is the difference between these two beginnings. - * This way, it eliminates any rounding errors. - */ - elmts_begin = (off_t)(((double)nelmts)/pio_mpi_nprocs_g*pio_mpi_rank_g); - - /* Do not cast elmt_begin to other types, especially non-integral - * types, else it may introduce rounding discrepency. */ - if (pio_mpi_rank_g < (pio_mpi_nprocs_g - 1)) - elmts_count = (off_t)(((double)nelmts) / pio_mpi_nprocs_g * (pio_mpi_rank_g + 1)) - - elmts_begin; - else - /* last process. Take whatever are left */ - elmts_count = nelmts - elmts_begin; - } /* end if */ - else { - /* Interleaved Pattern: - * Each process takes buf_size of elements, starting with the first - * process. So, the last process may have fewer or even none. - * Calculate the beginning element of this process. - * The elmnts_begin here marks only the beginning of the first - * block accessed by this process. 
- */ - /* Algorithm: - * First allocate equal blocks per process, i.e. one block each - * process for every block_size*nprocs. - * If there is remaining unallocated, give a block each to process - * starting at proc 0. The last process may get a partial block. - */ - off_t remain_nelmts, remain_begin; /* unallocated remaining*/ - - elmts_begin = (off_t)(nelmts_in_buf*pio_mpi_rank_g); - - /* must use integer calculation next */ - /* allocate equal blocks per process */ - elmts_count = (nelmts / (off_t)(nelmts_in_buf*pio_mpi_nprocs_g)) * - (off_t)nelmts_in_buf; - remain_nelmts = nelmts % (off_t)(nelmts_in_buf*pio_mpi_nprocs_g); - - /* allocate any remaining */ - remain_begin = (off_t)(nelmts_in_buf*pio_mpi_rank_g); - if (remain_nelmts > remain_begin){ - /* it gets something */ - if (remain_nelmts > (remain_begin+(off_t)nelmts_in_buf)){ - /* one full block */ - elmts_count += nelmts_in_buf; - }else{ - /* only a partial block */ - elmts_count += remain_nelmts - remain_begin; - } - } - } - /* debug */ - if (pio_debug_level >= 4) { - HDprint_rank(output); - HDfprintf(output, "Debug(do_write): " - "nelmts_in_buf=%Hd, elmts_begin=%Hd, elmts_count=%Hd\n", - (long_long)nelmts_in_buf, (long_long)elmts_begin, - (long_long)elmts_count); - } - - - /* The task is to transfer elmts_count elements, starting at - * elmts_begin position, using transfer buffer of buf_size bytes. + /* The task is to transfer bytes_count bytes, starting at + * bytes_begin position, using transfer buffer of buf_size bytes. * If interleaved, select buf_size at a time, in round robin * fashion, according to number of process. Otherwise, select - * all elmt_count in contiguous. + * all bytes_count in contiguous. 
*/ - nelmts_xfer = 0 ; - - /* Start "raw data" write timer */ - set_time(res->timers, HDF5_RAW_WRITE_FIXED_DIMS, START); + nbytes_xfer = 0 ; - while (nelmts_xfer < elmts_count){ - /* transfer one buffer of data each round */ - /* Note: because size_t is unsigned, avoid expressions that */ - /* can be negative. */ - if ((nelmts_xfer + (off_t)nelmts_in_buf) <= elmts_count) { - nelmts_toxfer = nelmts_in_buf; - } else { - /* last transfer of a partial buffer */ - nelmts_toxfer = elmts_count - nelmts_xfer; - } + /* Set base file offset for all I/O patterns and POSIX access */ + posix_file_offset = dset_offset + bytes_begin; - if (parms->verify) { - /*Prepare write data for verify later*/ - int *intptr = (int *)buffer; - size_t i; + /* Set base file offset for all I/O patterns and MPI access */ + mpi_file_offset = (MPI_Offset)(dset_offset + bytes_begin); - for (i = 0; i < nelmts_toxfer; ++i) - *intptr++ = pio_mpi_rank_g; - } + /* Start "raw data" write timer */ + set_time(res->timers, HDF5_RAW_WRITE_FIXED_DIMS, START); + while (nbytes_xfer < bytes_count){ /* Write */ /* Calculate offset of write within a dataset/file */ switch (parms->io_type) { case POSIXIO: - if (parms->interleaved==0) { - /* Contiguous pattern */ - /* need to (off_t) the elmnts_begin expression because they */ - /* may be of smaller sized integer types */ - file_offset = dset_offset + (off_t)(elmts_begin + nelmts_xfer)*(off_t)ELMT_SIZE; - } /* end if */ - else { - /* Interleaved access pattern */ - /* Skip offset over blocks of other processes */ - file_offset = dset_offset + - (off_t)(elmts_begin + (nelmts_xfer*pio_mpi_nprocs_g))*(off_t)ELMT_SIZE; - } /* end else */ - - /* only care if seek returns error */ - rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? 
-1 : 0; - VRFY((rc==0), "POSIXSEEK"); - /* check if all bytes are transferred */ - rc = ((ssize_t)(nelmts_toxfer*ELMT_SIZE) == - POSIXWRITE(fd->posixfd, buffer, nelmts_toxfer*ELMT_SIZE)); - VRFY((rc != 0), "POSIXWRITE"); - break; - - case MPIO: - if (parms->interleaved==0){ - /* Contiguous pattern */ - mpi_offset = dset_offset + (elmts_begin + nelmts_xfer)*(off_t)ELMT_SIZE; + /* Contiguous pattern */ + if (parms->interleaved==0) { + /* Compute file offset */ + file_offset = posix_file_offset + (off_t)nbytes_xfer; + + /* only care if seek returns error */ + rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0; + VRFY((rc==0), "POSIXSEEK"); + + /* check if all bytes are written */ + rc = ((ssize_t)buf_size == + POSIXWRITE(fd->posixfd, buffer, buf_size)); + VRFY((rc != 0), "POSIXWRITE"); + + /* Advance global offset in dataset */ + nbytes_xfer+=buf_size; } /* end if */ + /* Interleaved access pattern */ else { - /* Interleaved access pattern */ - /* Skip offset over blocks of other processes */ - mpi_offset = dset_offset + (elmts_begin + (nelmts_xfer*pio_mpi_nprocs_g))*(off_t)ELMT_SIZE; + /* Set the base of user's buffer */ + buf_p=(unsigned char *)buffer; + + /* Set the number of bytes to transfer this time */ + nbytes_toxfer = buf_size; + + /* Loop over the buffers to write */ + while(nbytes_toxfer>0) { + /* Skip offset over blocks of other processes */ + file_offset = posix_file_offset + + (off_t)(nbytes_xfer*pio_mpi_nprocs_g); + + /* only care if seek returns error */ + rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? 
-1 : 0; + VRFY((rc==0), "POSIXSEEK"); + + /* check if all bytes are written */ + rc = ((ssize_t)blk_size == + POSIXWRITE(fd->posixfd, buf_p, blk_size)); + VRFY((rc != 0), "POSIXWRITE"); + + /* Advance location in buffer */ + buf_p+=blk_size; + + /* Advance global offset in dataset */ + nbytes_xfer+=blk_size; + + /* Decrement number of bytes left this time */ + nbytes_toxfer-=blk_size; + } /* end while */ } /* end else */ + break; + case MPIO: + /* Independent file access */ if(parms->collective==0) { - mrc = MPI_File_write_at(fd->mpifd, mpi_offset, buffer, - (int)nelmts_toxfer, ELMT_MPI_TYPE, - &mpi_status); - VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE"); + /* Contiguous pattern */ + if (parms->interleaved==0){ + /* Compute offset in file */ + mpi_offset = mpi_file_offset + + nbytes_xfer; + + /* Perform independent write */ + mrc = MPI_File_write_at(fd->mpifd, mpi_offset, buffer, + (int)(buf_size/blk_size), mpi_blk_type, + &mpi_status); + VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE"); + + /* Advance global offset in dataset */ + nbytes_xfer+=buf_size; + } /* end if */ + /* Interleaved access pattern */ + else { + /* Set the base of user's buffer */ + buf_p=(unsigned char *)buffer; + + /* Set the number of bytes to transfer this time */ + nbytes_toxfer = buf_size; + + /* Loop over the buffers to write */ + while(nbytes_toxfer>0) { + /* Skip offset over blocks of other processes */ + mpi_offset = mpi_file_offset + + (nbytes_xfer*pio_mpi_nprocs_g); + + /* Perform independent write */ + mrc = MPI_File_write_at(fd->mpifd, mpi_offset, buf_p, + (int)1, mpi_blk_type, &mpi_status); + VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE"); + + /* Advance location in buffer */ + buf_p+=blk_size; + + /* Advance global offset in dataset */ + nbytes_xfer+=blk_size; + + /* Decrement number of bytes left this time */ + nbytes_toxfer-=blk_size; + } /* end while */ + } /* end else */ } /* end if */ + /* Collective file access */ else { - mrc = MPI_File_write_at_all(fd->mpifd, mpi_offset, buffer, - 
(int)nelmts_toxfer, ELMT_MPI_TYPE, - &mpi_status); - VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE"); + /* Contiguous access pattern */ + if (parms->interleaved==0){ + /* Compute offset in file */ + mpi_offset = mpi_file_offset + + nbytes_xfer; + + /* Perform independent write */ + mrc = MPI_File_write_at_all(fd->mpifd, mpi_offset, buffer, + (int)(buf_size/blk_size), mpi_blk_type, &mpi_status); + VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE"); + + /* Advance global offset in dataset */ + nbytes_xfer+=buf_size; + } /* end if */ + /* Interleaved access pattern */ + else { + /* Compute offset in file */ + mpi_offset = mpi_file_offset + + (nbytes_xfer*pio_mpi_nprocs_g); + + /* Set the file view */ + mrc = MPI_File_set_view(fd->mpifd, mpi_offset, mpi_blk_type, + mpi_file_type, (char*)"native", h5_io_info_g); + VRFY((mrc==MPI_SUCCESS), "MPIO_VIEW"); + + /* Perform write */ + mrc = MPI_File_write_at_all(fd->mpifd, 0, buffer, + (int)(buf_size/blk_size), mpi_blk_type, &mpi_status); + VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE"); + + /* Advance global offset in dataset */ + nbytes_xfer+=buf_size; + } /* end else */ } /* end else */ break; case PHDF5: - /* Set up the file dset space id to select the segment to process */ + /* Set up the file dset space id to move the selection to process */ if (parms->interleaved==0){ /* Contiguous pattern */ - h5start[0] = elmts_begin + nelmts_xfer; + h5offset[0] = nbytes_xfer; } /* end if */ else { /* Interleaved access pattern */ /* Skip offset over blocks of other processes */ - h5start[0] = elmts_begin + (nelmts_xfer*pio_mpi_nprocs_g); + h5offset[0] = (nbytes_xfer*pio_mpi_nprocs_g); } /* end else */ - h5stride[0] = h5block[0] = nelmts_toxfer; - h5count[0] = 1; - hrc = H5Sselect_hyperslab(h5dset_space_id, H5S_SELECT_SET, - h5start, h5stride, h5count, h5block); - VRFY((hrc >= 0), "H5Sset_hyperslab"); - - /* Only need selection in memory dataset if it is smaller than the whole buffer */ - if(nelmts_toxfer= 0), "H5Sset_hyperslab"); - } /* end if */ + hrc = 
H5Soffset_simple(h5dset_space_id, h5offset); + VRFY((hrc >= 0), "H5Soffset_simple"); - /* set write time here */ + /* Write the buffer out */ hrc = H5Dwrite(h5ds_id, ELMT_H5_TYPE, h5mem_space_id, h5dset_space_id, h5dxpl, buffer); VRFY((hrc >= 0), "H5Dwrite"); + + /* Increment number of bytes transferred */ + nbytes_xfer += buf_size; + break; } /* switch (parms->io_type) */ - - /* Increment number of elements transferred */ - nelmts_xfer += nelmts_toxfer; - } + } /* end while */ /* Stop "raw data" write timer */ set_time(res->timers, HDF5_RAW_WRITE_FIXED_DIMS, STOP); @@ -785,7 +905,7 @@ do_write(results *res, file_descr *fd, parameters *parms, long ndsets, /* Calculate write time */ /* Close dataset. Only HDF5 needs to do an explicit close. */ - if (parms->io_type == PHDF5){ + if (parms->io_type == PHDF5) { hrc = H5Dclose(h5ds_id); if (hrc < 0) { @@ -794,10 +914,21 @@ do_write(results *res, file_descr *fd, parameters *parms, long ndsets, } h5ds_id = -1; - } - } + } /* end if */ + } /* end for */ done: + /* release MPI-I/O objects */ + if (parms->io_type == MPIO) { + /* Free file type */ + mrc = MPI_Type_free( &mpi_file_type ); + VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE"); + + /* Free buffer type */ + mrc = MPI_Type_free( &mpi_blk_type ); + VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE"); + } /* end if */ + /* release HDF5 objects */ if (h5dset_space_id != -1) { hrc = H5Sclose(h5dset_space_id); @@ -841,82 +972,176 @@ done: */ static herr_t do_read(results *res, file_descr *fd, parameters *parms, long ndsets, - off_t nelmts, size_t buf_size, void *buffer /*out*/) + off_t nbytes, size_t buf_size, void *buffer /*out*/) { int ret_code = SUCCESS; int rc; /*routine return code */ - int mrc; /*MPI return code */ - MPI_Offset mpi_offset; - MPI_Status mpi_status; long ndset; - off_t nelmts_xfer; - size_t nelmts_toxfer; + size_t blk_size; /* The block size to subdivide the xfer buffer into */ + off_t nbytes_xfer; /* Total number of bytes transferred so far */ + size_t 
nbytes_toxfer; /* Number of bytes to transfer a particular time */ char dname[64]; - off_t dset_offset=0; /*dataset offset in a file */ - off_t file_offset; /*file offset of the next transfer */ - off_t dset_size; /*one dataset size in bytes */ - size_t nelmts_in_buf; /*how many element the buffer holds */ - off_t elmts_begin; /*first elmt this process transfer */ - off_t elmts_count; /*number of elmts this process transfer */ + off_t dset_offset=0; /*dataset offset in a file */ + off_t bytes_begin; /*first elmt this process transfer */ + off_t bytes_count; /*number of elmts this process transfer */ + unsigned char *buf_p; /* Current buffer pointer */ + + /* POSIX variables */ + off_t file_offset; /* File offset of the next transfer */ + off_t posix_file_offset; /* Base file offset of the next transfer */ + + /* MPI variables */ + MPI_Offset mpi_file_offset;/* Base file offset of the next transfer*/ + MPI_Offset mpi_offset; /* Offset in MPI file */ + MPI_Datatype mpi_file_type; /* MPI derived type for file */ + MPI_Datatype mpi_blk_type; /* MPI derived type for buffer */ + MPI_Status mpi_status; + int mrc; /* MPI return code */ /* HDF5 variables */ - herr_t hrc; /*HDF5 return code */ - hsize_t h5dims[1]; /*dataset dim sizes */ + herr_t hrc; /*HDF5 return code */ + hsize_t h5dims[1]; /*dataset dim sizes */ hid_t h5dset_space_id = -1; /*dataset space ID */ hid_t h5mem_space_id = -1; /*memory dataspace ID */ - hid_t h5ds_id = -1; /*dataset handle */ - hsize_t h5block[1]; /*dataspace selection */ + hid_t h5ds_id = -1; /*dataset handle */ + hsize_t h5block[1]; /*dataspace selection */ hsize_t h5stride[1]; hsize_t h5count[1]; hssize_t h5start[1]; + hssize_t h5offset[1]; /* Selection offset within dataspace */ hid_t h5dxpl = -1; /* Dataset transfer property list */ - /* calculate dataset parameters. 
data type is always native C int */ - dset_size = nelmts * (off_t)ELMT_SIZE; - nelmts_in_buf = buf_size/ELMT_SIZE; - - /* hdf5 data space setup */ - if (parms->io_type == PHDF5){ - if(nelmts>0) { - /* define a contiquous dataset of nelmts native ints */ - h5dims[0] = nelmts; - h5dset_space_id = H5Screate_simple(1, h5dims, NULL); - VRFY((h5dset_space_id >= 0), "H5Screate_simple"); - } /* end if */ - else { - h5dset_space_id = H5Screate(H5S_SCALAR); - VRFY((h5dset_space_id >= 0), "H5Screate"); - } /* end else */ - - /* Create the memory dataspace that corresponds to the xfer buffer */ - if(nelmts_in_buf>0) { - h5dims[0] = nelmts_in_buf; - h5mem_space_id = H5Screate_simple(1, h5dims, NULL); - VRFY((h5mem_space_id >= 0), "H5Screate_simple"); - } /* end if */ - else { - h5mem_space_id = H5Screate(H5S_SCALAR); - VRFY((h5mem_space_id >= 0), "H5Screate"); - } /* end else */ - - /* Create the dataset transfer property list */ - h5dxpl = H5Pcreate(H5P_DATASET_XFER); - if (h5dxpl < 0) { - fprintf(stderr, "HDF5 Property List Create failed\n"); - GOTOERROR(FAIL); - } + /* Get the parameters from the parameter block */ + blk_size=parms->blk_size; + + /* There are two kinds of transfer patterns, contiguous and interleaved. + * Let 0,1,2,...,n be data accessed by process 0,1,2,...,n + * where n is rank of the last process. + * In contiguous pattern, data are accessed as + * 000...111...222...nnn... + * In interleaved pattern, data are accessed as + * 012...n012...n... + * These are all in the scope of one dataset. + */ + if (parms->interleaved==0) { + /* Contiguous Pattern: */ + bytes_begin = (off_t)(((double)nbytes*pio_mpi_rank_g)/pio_mpi_nprocs_g); + } /* end if */ + else { + /* Interleaved Pattern: */ + bytes_begin = (off_t)(blk_size*pio_mpi_rank_g); + } /* end else */ + + /* Calculate the total number of bytes (bytes_count) to be + * transferred by this process. It may be different for different + * transfer pattern due to rounding to integral values. 
+ */ + /* + * Calculate the beginning bytes of this process and the next. + * bytes_count is the difference between these two beginnings. + * This way, it eliminates any rounding errors. + * (This is tricky, don't mess with the formula, rounding errors + * can easily get introduced) */ + bytes_count = (off_t)(((double)nbytes*(pio_mpi_rank_g+1)) / pio_mpi_nprocs_g) + - (off_t)(((double)nbytes*pio_mpi_rank_g) / pio_mpi_nprocs_g); + + /* debug */ + if (pio_debug_level >= 4) { + HDprint_rank(output); + HDfprintf(output, "Debug(do_read): " + "buf_size=%Hd, bytes_begin=%Hd, bytes_count=%Hd\n", + (long_long)buf_size, (long_long)bytes_begin, + (long_long)bytes_count); + } - /* Change to collective I/O, if asked */ - if(parms->collective) { - hrc = H5Pset_dxpl_mpio(h5dxpl, H5FD_MPIO_COLLECTIVE); - if (hrc < 0) { - fprintf(stderr, "HDF5 Property List Set failed\n"); + /* I/O Access specific setup */ + switch (parms->io_type) { + case POSIXIO: + /* No extra setup */ + break; + + case MPIO: /* MPI-I/O setup */ + /* Build block's derived type */ + mrc = MPI_Type_contiguous((int)blk_size, + MPI_BYTE, &mpi_blk_type); + VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE"); + + /* Build file's derived type */ + mrc = MPI_Type_vector((int)(buf_size/blk_size), (int)1, + (int)pio_mpi_nprocs_g, mpi_blk_type, &mpi_file_type); + VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE"); + + /* Commit file type */ + mrc = MPI_Type_commit( &mpi_file_type ); + VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT"); + + /* Commit buffer type */ + mrc = MPI_Type_commit( &mpi_blk_type ); + VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT"); + break; + + case PHDF5: /* HDF5 setup */ + if(nbytes>0) { + /* define a contiquous dataset of nbytes native bytes */ + h5dims[0] = nbytes; + h5dset_space_id = H5Screate_simple(1, h5dims, NULL); + VRFY((h5dset_space_id >= 0), "H5Screate_simple"); + + /* Set up the file dset space id to select the pattern to access */ + if (parms->interleaved==0){ + /* Contiguous pattern */ + h5start[0] = 
bytes_begin; + h5stride[0] = h5block[0] = blk_size; + h5count[0] = buf_size/blk_size; + } /* end if */ + else { + /* Interleaved access pattern */ + /* Skip offset over blocks of other processes */ + h5start[0] = bytes_begin; + h5stride[0] = blk_size*pio_mpi_nprocs_g; + h5block[0] = blk_size; + h5count[0] = buf_size/blk_size; + } /* end else */ + hrc = H5Sselect_hyperslab(h5dset_space_id, H5S_SELECT_SET, + h5start, h5stride, h5count, h5block); + VRFY((hrc >= 0), "H5Sselect_hyperslab"); + } /* end if */ + else { + h5dset_space_id = H5Screate(H5S_SCALAR); + VRFY((h5dset_space_id >= 0), "H5Screate"); + } /* end else */ + + /* Create the memory dataspace that corresponds to the xfer buffer */ + if(buf_size>0) { + h5dims[0] = buf_size; + h5mem_space_id = H5Screate_simple(1, h5dims, NULL); + VRFY((h5mem_space_id >= 0), "H5Screate_simple"); + } /* end if */ + else { + h5mem_space_id = H5Screate(H5S_SCALAR); + VRFY((h5mem_space_id >= 0), "H5Screate"); + } /* end else */ + + /* Create the dataset transfer property list */ + h5dxpl = H5Pcreate(H5P_DATASET_XFER); + if (h5dxpl < 0) { + fprintf(stderr, "HDF5 Property List Create failed\n"); GOTOERROR(FAIL); + } + + /* Change to collective I/O, if asked */ + if(parms->collective) { + hrc = H5Pset_dxpl_mpio(h5dxpl, H5FD_MPIO_COLLECTIVE); + if (hrc < 0) { + fprintf(stderr, "HDF5 Property List Set failed\n"); + GOTOERROR(FAIL); + } /* end if */ } /* end if */ - } /* end if */ - } /* end if */ + break; + } /* end switch */ for (ndset = 1; ndset <= ndsets; ++ndset) { + /* Calculate dataset offset within a file */ /* create dataset */ @@ -924,7 +1149,7 @@ do_read(results *res, file_descr *fd, parameters *parms, long ndsets, case POSIXIO: case MPIO: /* both posix and mpi io just need dataset offset in file*/ - dset_offset = (ndset - 1) * dset_size; + dset_offset = (ndset - 1) * nbytes; break; case PHDF5: @@ -938,219 +1163,219 @@ do_read(results *res, file_descr *fd, parameters *parms, long ndsets, break; } - /* There are two kinds of 
transfer patterns, contiguous and interleaved. - * Let 0,1,2,...,n be data accessed by process 0,1,2,...,n - * where n is rank of the last process. - * In contiguous pattern, data are accessed as - * 000...111...222...nnn... - * In interleaved pattern, data are accessed as - * 012...n012...n... - * These are all in the scope of one dataset. - */ - /* Calculate the total number of elements (elmts_count) to be - * transferred by this process. It may be different for different - * transfer pattern due to rounding to integral values. - */ - if (parms->interleaved==0){ - /* Contiguous Pattern: - * Calculate the beginning element of this process and the next. - * elmts_count is the difference between these two beginnings. - * This way, it eliminates any rounding errors. - */ - elmts_begin = (off_t)(((double)nelmts)/pio_mpi_nprocs_g*pio_mpi_rank_g); - - /* Do not cast elmt_begin to other types, especially non-integral - * types, else it may introduce rounding discrepency. */ - if (pio_mpi_rank_g < (pio_mpi_nprocs_g - 1)) - elmts_count = (off_t)(((double)nelmts) / pio_mpi_nprocs_g * (pio_mpi_rank_g + 1)) - - elmts_begin; - else - /* last process. Take whatever are left */ - elmts_count = nelmts - elmts_begin; - } /* end if */ - else { - /* Interleaved Pattern: - * Each process takes buf_size of elements, starting with the first - * process. So, the last process may have fewer or even none. - * Calculate the beginning element of this process. - * The elmnts_begin here marks only the beginning of the first - * block accessed by this process. - */ - /* Algorithm: - * First allocate equal blocks per process, i.e. one block each - * process for every block_size*nprocs. - * If there is remaining unallocated, give a block each to process - * starting at proc 0. The last process may get a partial block. 
- */ - off_t remain_nelmts, remain_begin; /* unallocated remaining*/ - - elmts_begin = (off_t)(nelmts_in_buf*pio_mpi_rank_g); - - /* must use integer calculation next */ - /* allocate equal blocks per process */ - elmts_count = (nelmts / (off_t)(nelmts_in_buf*pio_mpi_nprocs_g)) * - (off_t)nelmts_in_buf; - remain_nelmts = nelmts % ((off_t)(nelmts_in_buf*pio_mpi_nprocs_g)); - - /* allocate any remaining */ - remain_begin = (off_t)(nelmts_in_buf*pio_mpi_rank_g); - if (remain_nelmts > remain_begin) { - /* it gets something */ - if (remain_nelmts > (remain_begin+(off_t)nelmts_in_buf)) { - /* one full block */ - elmts_count += nelmts_in_buf; - } /* end if */ - else { - /* only a partial block */ - elmts_count += remain_nelmts - remain_begin; - } /* end else */ - } /* end if */ - } /* end else */ - /* debug */ - if (pio_debug_level >= 4) { - HDprint_rank(output); - HDfprintf(output, "Debug(do_read): " - "nelmts_in_buf=%Hd, elmts_begin=%Hd, elmts_count=%Hd\n", - (long_long)nelmts_in_buf, (long_long)elmts_begin, - (long_long)elmts_count); - } - - - /* The task is to transfer elmts_count elements, starting at - * elmts_begin position, using transfer buffer of buf_size bytes. + /* The task is to transfer bytes_count bytes, starting at + * bytes_begin position, using transfer buffer of buf_size bytes. * If interleaved, select buf_size at a time, in round robin * fashion, according to number of process. Otherwise, select - * all elmt_count in contiguous. + * all bytes_count in contiguous. 
*/ - nelmts_xfer = 0 ; + nbytes_xfer = 0 ; + + /* Set base file offset for all I/O patterns and POSIX access */ + posix_file_offset = dset_offset + bytes_begin; + + /* Set base file offset for all I/O patterns and MPI access */ + mpi_file_offset = (MPI_Offset)(dset_offset + bytes_begin); /* Start "raw data" read timer */ set_time(res->timers, HDF5_RAW_READ_FIXED_DIMS, START); - while (nelmts_xfer < elmts_count){ - /* transfer one buffer of data each round */ - /* Note: because size_t is unsigned, avoid expressions that */ - /* can be negative. */ - if ((nelmts_xfer + (off_t)nelmts_in_buf) <= elmts_count) { - nelmts_toxfer = nelmts_in_buf; - } else { - /* last transfer of a partial buffer */ - nelmts_toxfer = elmts_count - nelmts_xfer; - } - - /* read */ + while (nbytes_xfer < bytes_count){ + /* Read */ /* Calculate offset of read within a dataset/file */ - switch (parms->io_type){ + switch (parms->io_type) { case POSIXIO: - if (parms->interleaved==0){ - /* Contiguous pattern */ - /* need to (off_t) the elmnts_begin expression because they */ - /* may be of smaller sized integer types */ - file_offset = dset_offset + (off_t)(elmts_begin + nelmts_xfer)*(off_t)ELMT_SIZE; - } /* end if */ - else { - /* Interleaved access pattern */ - /* Skip offset over blocks of other processes */ - file_offset = dset_offset + - (off_t)(elmts_begin + (nelmts_xfer*pio_mpi_nprocs_g))*(off_t)ELMT_SIZE; - } /* end else */ - - /* only care if seek returns error */ - rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? 
-1 : 0; - VRFY((rc==0), "POSIXSEEK"); - /* check if all bytes are transferred */ - rc = ((ssize_t)(nelmts_toxfer*ELMT_SIZE) == - POSIXREAD(fd->posixfd, buffer, nelmts_toxfer*ELMT_SIZE)); - VRFY((rc != 0), "POSIXREAD"); - break; - - case MPIO: - if (parms->interleaved==0){ - /* Contiguous pattern */ - mpi_offset = dset_offset + (elmts_begin + nelmts_xfer)*(off_t)ELMT_SIZE; + /* Contiguous pattern */ + if (parms->interleaved==0) { + /* Compute file offset */ + file_offset = posix_file_offset + (off_t)nbytes_xfer; + + /* only care if seek returns error */ + rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0; + VRFY((rc==0), "POSIXSEEK"); + + /* check if all bytes are written */ + rc = ((ssize_t)buf_size == + POSIXREAD(fd->posixfd, buffer, buf_size)); + VRFY((rc != 0), "POSIXREAD"); + + /* Advance global offset in dataset */ + nbytes_xfer+=buf_size; } /* end if */ + /* Interleaved access pattern */ else { - /* Interleaved access pattern */ - /* Skip offset over blocks of other processes */ - mpi_offset = dset_offset + (elmts_begin + (nelmts_xfer*pio_mpi_nprocs_g))*(off_t)ELMT_SIZE; + /* Set the base of user's buffer */ + buf_p=(unsigned char *)buffer; + + /* Set the number of bytes to transfer this time */ + nbytes_toxfer = buf_size; + + /* Loop over the buffers to read */ + while(nbytes_toxfer>0) { + /* Skip offset over blocks of other processes */ + file_offset = posix_file_offset + + (off_t)(nbytes_xfer*pio_mpi_nprocs_g); + + /* only care if seek returns error */ + rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? 
-1 : 0; + VRFY((rc==0), "POSIXSEEK"); + + /* check if all bytes are written */ + rc = ((ssize_t)blk_size == + POSIXREAD(fd->posixfd, buf_p, blk_size)); + VRFY((rc != 0), "POSIXREAD"); + + /* Advance location in buffer */ + buf_p+=blk_size; + + /* Advance global offset in dataset */ + nbytes_xfer+=blk_size; + + /* Decrement number of bytes left this time */ + nbytes_toxfer-=blk_size; + } /* end while */ } /* end else */ + break; + case MPIO: + /* Independent file access */ if(parms->collective==0) { - mrc = MPI_File_read_at(fd->mpifd, mpi_offset, buffer, - (int)nelmts_toxfer, ELMT_MPI_TYPE, - &mpi_status); - VRFY((mrc==MPI_SUCCESS), "MPIO_read"); + /* Contiguous pattern */ + if (parms->interleaved==0){ + /* Compute offset in file */ + mpi_offset = mpi_file_offset + + nbytes_xfer; + + /* Perform independent read */ + mrc = MPI_File_read_at(fd->mpifd, mpi_offset, buffer, + (int)(buf_size/blk_size), mpi_blk_type, + &mpi_status); + VRFY((mrc==MPI_SUCCESS), "MPIO_READ"); + + /* Advance global offset in dataset */ + nbytes_xfer+=buf_size; + } /* end if */ + /* Interleaved access pattern */ + else { + /* Set the base of user's buffer */ + buf_p=(unsigned char *)buffer; + + /* Set the number of bytes to transfer this time */ + nbytes_toxfer = buf_size; + + /* Loop over the buffers to read */ + while(nbytes_toxfer>0) { + /* Skip offset over blocks of other processes */ + mpi_offset = mpi_file_offset + + (nbytes_xfer*pio_mpi_nprocs_g); + + /* Perform independent read */ + mrc = MPI_File_read_at(fd->mpifd, mpi_offset, buf_p, + (int)1, mpi_blk_type, &mpi_status); + VRFY((mrc==MPI_SUCCESS), "MPIO_READ"); + + /* Advance location in buffer */ + buf_p+=blk_size; + + /* Advance global offset in dataset */ + nbytes_xfer+=blk_size; + + /* Decrement number of bytes left this time */ + nbytes_toxfer-=blk_size; + } /* end while */ + } /* end else */ } /* end if */ + /* Collective file access */ else { - mrc = MPI_File_read_at_all(fd->mpifd, mpi_offset, buffer, - (int)nelmts_toxfer, 
ELMT_MPI_TYPE, - &mpi_status); - VRFY((mrc==MPI_SUCCESS), "MPIO_read"); + /* Contiguous access pattern */ + if (parms->interleaved==0){ + /* Compute offset in file */ + mpi_offset = mpi_file_offset + + nbytes_xfer; + + /* Perform collective read */ + mrc = MPI_File_read_at_all(fd->mpifd, mpi_offset, buffer, + (int)(buf_size/blk_size), mpi_blk_type, &mpi_status); + VRFY((mrc==MPI_SUCCESS), "MPIO_READ"); + + /* Advance global offset in dataset */ + nbytes_xfer+=buf_size; + } /* end if */ + /* Interleaved access pattern */ + else { + /* Compute offset in file */ + mpi_offset = mpi_file_offset + + (nbytes_xfer*pio_mpi_nprocs_g); + + /* Set the file view */ + mrc = MPI_File_set_view(fd->mpifd, mpi_offset, mpi_blk_type, + mpi_file_type, (char*)"native", h5_io_info_g); + VRFY((mrc==MPI_SUCCESS), "MPIO_VIEW"); + + /* Perform collective read */ + mrc = MPI_File_read_at_all(fd->mpifd, 0, buffer, + (int)(buf_size/blk_size), mpi_blk_type, &mpi_status); + VRFY((mrc==MPI_SUCCESS), "MPIO_READ"); + + /* Advance global offset in dataset */ + nbytes_xfer+=buf_size; + } /* end else */ } /* end else */ break; case PHDF5: - /* Set up the dset space id to select the segment to process */ + /* Set up the file dset space id to move the selection to process */ if (parms->interleaved==0){ /* Contiguous pattern */ - h5start[0] = elmts_begin + nelmts_xfer; + h5offset[0] = nbytes_xfer; } /* end if */ else { /* Interleaved access pattern */ /* Skip offset over blocks of other processes */ - h5start[0] = elmts_begin + (nelmts_xfer*pio_mpi_nprocs_g); + h5offset[0] = (nbytes_xfer*pio_mpi_nprocs_g); } /* end else */ - h5stride[0] = h5block[0] = nelmts_toxfer; - h5count[0] = 1; - hrc = H5Sselect_hyperslab(h5dset_space_id, H5S_SELECT_SET, - h5start, h5stride, h5count, h5block); - VRFY((hrc >= 0), "H5Sset_hyperslab"); - - /* Only need selection in memory dataset if it is smaller than the whole buffer */ - if(nelmts_toxfer= 0), "H5Sset_hyperslab"); - } /* end if */ + hrc = 
H5Soffset_simple(h5dset_space_id, h5offset); + VRFY((hrc >= 0), "H5Soffset_simple"); - /* set read time here */ + /* Read the buffer in */ hrc = H5Dread(h5ds_id, ELMT_H5_TYPE, h5mem_space_id, - h5dset_space_id, h5dxpl, buffer); + h5dset_space_id, h5dxpl, buffer); VRFY((hrc >= 0), "H5Dread"); + + /* Increment number of bytes transferred */ + nbytes_xfer += buf_size; + break; } /* switch (parms->io_type) */ + /* Verify raw data, if asked */ if (parms->verify) { - /*verify read data*/ - int *intptr = (int *)buffer; + /* Verify data read */ + unsigned char *ucharptr = (unsigned char *)buffer; size_t i; int nerror=0; - for (i = 0; i < nelmts_toxfer; ++i){ - if (*intptr++ != pio_mpi_rank_g){ + for (i = 0; i < buf_size; ++i){ + if (*ucharptr++ != pio_mpi_rank_g) { if (++nerror < 20){ /* report at most 20 errors */ HDprint_rank(output); HDfprintf(output, "read data error, expected (%Hd), " "got (%Hd)\n", (long_long)pio_mpi_rank_g, - (long_long)*(intptr-1)); - } - } - } + (long_long)*(ucharptr-1)); + } /* end if */ + } /* end if */ + } /* end for */ if (nerror >= 20) { HDprint_rank(output); HDfprintf(output, "..."); - HDfprintf(output, "total read data errors=%Hd\n", + HDfprintf(output, "total read data errors=%d\n", nerror); - } + } /* end if */ } /* if (parms->verify) */ - /* Increment number of elements transferred */ - nelmts_xfer += nelmts_toxfer; - } + } /* end while */ /* Stop "raw data" read timer */ set_time(res->timers, HDF5_RAW_READ_FIXED_DIMS, STOP); @@ -1158,7 +1383,7 @@ do_read(results *res, file_descr *fd, parameters *parms, long ndsets, /* Calculate read time */ /* Close dataset. Only HDF5 needs to do an explicit close. 
*/ - if (parms->io_type == PHDF5){ + if (parms->io_type == PHDF5) { hrc = H5Dclose(h5ds_id); if (hrc < 0) { @@ -1167,10 +1392,21 @@ do_read(results *res, file_descr *fd, parameters *parms, long ndsets, } h5ds_id = -1; - } - } + } /* end if */ + } /* end for */ done: + /* release MPI-I/O objects */ + if (parms->io_type == MPIO) { + /* Free file type */ + mrc = MPI_Type_free( &mpi_file_type ); + VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE"); + + /* Free buffer type */ + mrc = MPI_Type_free( &mpi_blk_type ); + VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE"); + } /* end if */ + /* release HDF5 objects */ if (h5dset_space_id != -1) { hrc = H5Sclose(h5dset_space_id); @@ -1713,6 +1949,8 @@ invalidate_file_cache(const char *filename) #else +/* turn the stubs off since some compilers are warning they are not used */ +#if 0 /* H5_HAVE_GPFS isn't defined...stub functions */ static void @@ -1757,6 +1995,8 @@ invalidate_file_cache(const char UNUSED *filename) return; } +#endif /* 0 */ + #endif /* H5_HAVE_GPFS */ #ifdef TIME_MPI diff --git a/perform/pio_perf.c b/perform/pio_perf.c index 0f43f15..5ddac67 100644 --- a/perform/pio_perf.c +++ b/perform/pio_perf.c @@ -109,7 +109,7 @@ int pio_debug_level = 0;/* The debug level: */ /* local variables */ -static const char *progname = "pio_perf"; +static const char *progname = "h5perf"; /* * Command-line options: The user can specify short or long-named @@ -117,14 +117,11 @@ static const char *progname = "pio_perf"; * adding more, make sure that they don't clash with each other. 
*/ #if 1 -static const char *s_opts = "ha:A:cCD:f:P:p:X:x:nd:F:i:Io:stT:w"; +static const char *s_opts = "a:A:B:cCd:D:e:F:hi:Ino:p:P:stT:wx:X:"; #else -static const char *s_opts = "ha:A:bcCD:f:P:p:X:x:nd:F:i:Io:stT:w"; +static const char *s_opts = "a:A:bB:cCd:D:e:F:hi:Ino:p:P:stT:wx:X:"; #endif /* 1 */ static struct long_options l_opts[] = { - { "help", no_arg, 'h' }, - { "hel", no_arg, 'h' }, - { "he", no_arg, 'h' }, { "align", require_arg, 'a' }, { "alig", require_arg, 'a' }, { "ali", require_arg, 'a' }, @@ -139,6 +136,15 @@ static struct long_options l_opts[] = { { "bin", no_arg, 'b' }, { "bi", no_arg, 'b' }, #endif /* 0 */ + { "block-size", require_arg, 'B' }, + { "block-siz", require_arg, 'B' }, + { "block-si", require_arg, 'B' }, + { "block-s", require_arg, 'B' }, + { "block-", require_arg, 'B' }, + { "block", require_arg, 'B' }, + { "bloc", require_arg, 'B' }, + { "blo", require_arg, 'B' }, + { "bl", require_arg, 'B' }, { "chunk", no_arg, 'c' }, { "chun", no_arg, 'c' }, { "chu", no_arg, 'c' }, @@ -156,13 +162,9 @@ static struct long_options l_opts[] = { { "debu", require_arg, 'D' }, { "deb", require_arg, 'D' }, { "de", require_arg, 'D' }, - { "file-size", require_arg, 'f' }, - { "file-siz", require_arg, 'f' }, - { "file-si", require_arg, 'f' }, - { "file-s", require_arg, 'f' }, - { "file", require_arg, 'f' }, - { "fil", require_arg, 'f' }, - { "fi", require_arg, 'f' }, + { "help", no_arg, 'h' }, + { "hel", no_arg, 'h' }, + { "he", no_arg, 'h' }, { "interleaved", require_arg, 'I' }, { "interleave", require_arg, 'I' }, { "interleav", require_arg, 'I' }, @@ -213,6 +215,11 @@ static struct long_options l_opts[] = { { "no-f", no_arg, 'n' }, { "no-", no_arg, 'n' }, { "no", no_arg, 'n' }, + { "num-bytes", require_arg, 'e' }, + { "num-byte", require_arg, 'e' }, + { "num-byt", require_arg, 'e' }, + { "num-by", require_arg, 'e' }, + { "num-b", require_arg, 'e' }, { "num-dsets", require_arg, 'd' }, { "num-dset", require_arg, 'd' }, { "num-dse", require_arg, 'd' }, @@ 
-260,14 +267,15 @@ static struct long_options l_opts[] = { struct options { long io_types; /* bitmask of which I/O types to test */ const char *output_file; /* file to print report to */ - off_t file_size; /* size of file */ long num_dsets; /* number of datasets */ long num_files; /* number of files */ + size_t num_bpp; /* number of bytes per proc per dset */ int num_iters; /* number of iterations */ int max_num_procs; /* maximum number of processes to use */ int min_num_procs; /* minimum number of processes to use */ size_t max_xfer_size; /* maximum transfer buffer size */ size_t min_xfer_size; /* minimum transfer buffer size */ + size_t blk_size; /* Block size */ unsigned interleaved; /* Interleaved vs. contiguous blocks */ unsigned collective; /* Collective vs. independent I/O */ int print_times; /* print times as well as throughputs */ @@ -410,6 +418,7 @@ run_test_loop(struct options *opts) parms.num_files = opts->num_files; parms.num_dsets = opts->num_dsets; parms.num_iters = opts->num_iters; + parms.blk_size = opts->blk_size; parms.interleaved = opts->interleaved; parms.collective = opts->collective; parms.h5_align = opts->h5_alignment; @@ -439,17 +448,16 @@ run_test_loop(struct options *opts) for (buf_size = opts->min_xfer_size; buf_size <= opts->max_xfer_size; buf_size <<= 1) { parms.buf_size = buf_size; - parms.num_elmts = opts->file_size / - (off_t)(parms.num_dsets * sizeof(int)); + parms.num_bytes = (off_t)opts->num_bpp*parms.num_procs; print_indent(1); output_report("Transfer Buffer Size: %ld bytes, File size: %.2f MBs\n", buf_size, - ((double)parms.num_dsets * (double)parms.num_elmts * - (double)sizeof(int)) / ONE_MB); + ((double)parms.num_dsets * (double)parms.num_bytes) + / ONE_MB); print_indent(1); - output_report(" # of files: %ld, # of dsets: %ld, # of elmts per dset: %ld\n", - parms.num_files, parms.num_dsets, parms.num_elmts); + output_report(" # of files: %ld, # of datasets: %ld, dataset size: %.2f MBs\n", + parms.num_files, parms.num_dsets, 
(double)parms.num_bytes/ONE_MB); if (opts->io_types & PIO_POSIX) run_test(POSIXIO, parms, opts); @@ -503,7 +511,7 @@ run_test(iotype iot, parameters parms, struct options *opts) minmax read_gross_mm = {0.0, 0.0, 0.0, 0}; minmax read_raw_mm = {0.0, 0.0, 0.0, 0}; - raw_size = (off_t)parms.num_dsets * (off_t)parms.num_elmts * (off_t)sizeof(int); + raw_size = (off_t)parms.num_dsets * (off_t)parms.num_bytes; parms.io_type = iot; print_indent(2); output_report("IO API = "); @@ -996,8 +1004,8 @@ report_parameters(struct options *opts) HDfprintf(output, "rank %d: IO API=", rank); print_io_api(opts->io_types); - HDfprintf(output, "rank %d: File size=", rank); - recover_size_and_print((long_long)opts->file_size, "\n"); + HDfprintf(output, "rank %d: Number of bytes per process per dataset=", rank); + recover_size_and_print((long_long)opts->num_bpp, "\n"); HDfprintf(output, "rank %d: Number of files=%Hd\n", rank, (long_long)opts->num_files); @@ -1008,9 +1016,21 @@ report_parameters(struct options *opts) HDfprintf(output, "rank %d: Number of processes=%d:%d\n", rank, opts->min_num_procs, opts->max_num_procs); + HDfprintf(output, "rank %d: Size of dataset(s)=", rank); + recover_size_and_print((long_long)(opts->num_bpp * opts->min_num_procs), ":"); + recover_size_and_print((long_long)(opts->num_bpp * opts->max_num_procs), "\n"); + + HDfprintf(output, "rank %d: File size=", rank); + recover_size_and_print((long_long)(opts->num_bpp * opts->min_num_procs + * opts->num_dsets), ":"); + recover_size_and_print((long_long)(opts->num_bpp * opts->max_num_procs + * opts->num_dsets), "\n"); + HDfprintf(output, "rank %d: Transfer buffer size=", rank); recover_size_and_print((long_long)opts->min_xfer_size, ":"); recover_size_and_print((long_long)opts->max_xfer_size, "\n"); + HDfprintf(output, "rank %d: Block size=", rank); + recover_size_and_print((long_long)opts->blk_size, "\n"); HDfprintf(output, "rank %d: Block Pattern in Dataset=", rank); if(opts->interleaved) @@ -1055,15 +1075,16 @@ 
parse_command_line(int argc, char *argv[]) cl_opts = (struct options *)malloc(sizeof(struct options)); cl_opts->output_file = NULL; - cl_opts->file_size = 64 * ONE_MB; cl_opts->io_types = 0; /* will set default after parsing options */ cl_opts->num_dsets = 1; cl_opts->num_files = 1; + cl_opts->num_bpp = 256 * ONE_KB; cl_opts->num_iters = 1; cl_opts->max_num_procs = comm_world_nprocs_g; cl_opts->min_num_procs = 1; cl_opts->max_xfer_size = 1 * ONE_MB; cl_opts->min_xfer_size = 128 * ONE_KB; + cl_opts->blk_size = 128 * ONE_KB; /* Default to writing 128K per block */ cl_opts->interleaved = 0; /* Default to contiguous blocks in dataset */ cl_opts->collective = 0; /* Default to independent I/O access */ cl_opts->print_times = FALSE; /* Printing times is off by default */ @@ -1119,6 +1140,9 @@ parse_command_line(int argc, char *argv[]) /* the future "binary" option */ break; #endif /* 0 */ + case 'B': + cl_opts->blk_size = parse_size_directive(opt_arg); + break; case 'c': /* Turn on chunked HDF5 dataset creation */ cl_opts->h5_use_chunks = TRUE; @@ -1187,8 +1211,8 @@ parse_command_line(int argc, char *argv[]) } break; - case 'f': - cl_opts->file_size = parse_size_directive(opt_arg); + case 'e': + cl_opts->num_bpp = parse_size_directive(opt_arg); break; case 'F': cl_opts->num_files = atoi(opt_arg); @@ -1318,15 +1342,20 @@ usage(const char *prog) #if 0 printf(" -b, --binary The elusive binary option\n"); #endif /* 0 */ + printf(" -B S, --block-size=S Block size within transfer buffer\n"); + printf(" (see below for description)\n"); + printf(" [default:128K]\n"); printf(" -c, --chunk Create HDF5 datasets chunked [default: off]\n"); - printf(" -C, --collective Use collective I/O for MPI and HDF5 APIs [default: off (i.e. independent I/O)]\n"); + printf(" -C, --collective Use collective I/O for MPI and HDF5 APIs\n"); + printf(" [default: off (i.e. 
independent I/O)]\n"); printf(" -d N, --num-dsets=N Number of datasets per file [default:1]\n"); printf(" -D DL, --debug=DL Indicate the debugging level\n"); printf(" [default: no debugging]\n"); - printf(" -f S, --file-size=S Size of a single file [default: 64M]\n"); + printf(" -e S, --num-bytes=S Number of bytes per process per dataset\n"); + printf(" [default: 256K]\n"); printf(" -F N, --num-files=N Number of files [default: 1]\n"); printf(" -i, --num-iterations Number of iterations to perform [default: 1]\n"); - printf(" -I --interleaved Interleaved block I/O (see below for example)\n"); + printf(" -I, --interleaved Interleaved block I/O (see below for example)\n"); printf(" [default: Contiguous block I/O]\n"); printf(" -n, --no-fill Don't write fill values to HDF5 dataset\n"); printf(" (Supported in HDF5 library v1.5 only)\n"); @@ -1348,7 +1377,7 @@ usage(const char *prog) printf(" M - Megabyte (%d)\n", ONE_MB); printf(" G - Gigabyte (%d)\n", ONE_GB); printf("\n"); - printf(" Example: 37M = 37 Megabytes = %d bytes\n", 37*ONE_MB); + printf(" Example: '37M' is 37 megabytes or %d bytes\n", 37*ONE_MB); printf("\n"); printf(" AL - is an API list. Valid values are:\n"); printf(" phdf5 - Parallel HDF5\n"); @@ -1357,24 +1386,50 @@ usage(const char *prog) printf("\n"); printf(" Example: --api=mpiio,phdf5\n"); printf("\n"); + printf(" Block size vs. Transfer buffer size:\n"); + printf(" The transfer buffer size is the size of a buffer in memory, which is\n"); + printf(" broken into 'block size' pieces and written to the file. The pattern\n"); + printf(" of the blocks in the file is described below in the 'Interleaved vs.\n"); + printf(" Contiguous blocks' example.\n"); + printf("\n"); + printf(" If the collective I/O option is given, the blocks in each transfer buffer\n"); + printf(" are written at once with an MPI derived type, for the MPI-I/O and PHDF5\n"); + printf(" APIs.\n"); + printf("\n"); printf(" Interleaved vs. 
Contiguous blocks:\n"); - printf(" For example, with a 4 process run,\n"); - printf(" Contiguous blocks are written to the file like so:\n"); - printf(" 1111222233334444\n"); - printf(" Interleaved blocks are written to the file like so:\n"); - printf(" 1234123412341234\n"); + printf(" When contiguous blocks are written to a dataset, the dataset is divided\n"); + printf(" into '# processes' regions and each process writes data to its own region.\n"); + printf(" When interleaved blocks are written to a dataset, space for the first\n"); + printf(" block of the first process is allocated in the dataset, then space is\n"); + printf(" allocated for the first block of the second process, etc. until space is\n"); + printf(" allocated for the first block of each process, then space is allocated for\n"); + printf(" the second block of the first process, the second block of the second\n"); + printf(" process, etc.\n"); + printf("\n"); + printf(" For example, with a 4 process run, 1MB bytes-per-process, 256KB transfer\n"); + printf(" buffer size, and 64KB block size,\n"); + printf(" 16 contiguous blocks per process are written to the file like so:\n"); + printf(" 1111111111111111222222222222222233333333333333334444444444444444\n"); + printf(" 16 interleaved blocks per process are written to the file like so:\n"); + printf(" 1234123412341234123412341234123412341234123412341234123412341234\n"); + printf(" If collective I/O is turned on, all of the four blocks per transfer\n"); + printf(" buffer will be written in one collective I/O call.\n"); printf("\n"); printf(" DL - is a list of debugging flags. 
Valid values are:\n"); printf(" 1 - Minimal\n"); printf(" 2 - Not quite everything\n"); printf(" 3 - Everything\n"); - printf(" 4 - Everything and the kitchen sink\n"); + printf(" 4 - The kitchen sink\n"); printf(" r - Raw data I/O throughput information\n"); printf(" t - Times as well as throughputs\n"); printf(" v - Verify data correctness\n"); printf("\n"); printf(" Example: --debug=2,r,t\n"); printf("\n"); + printf(" Environment variables:\n"); + printf(" HDF5_NOCLEANUP Do not remove data files if set [default remove]\n"); + printf(" HDF5_MPI_INFO MPI INFO object key=value separated by ;\n"); + printf(" HDF5_PARAPREFIX Paralllel data files prefix\n"); fflush(stdout); } } diff --git a/perform/pio_perf.h b/perform/pio_perf.h index e245e8a..5053eb2 100644 --- a/perform/pio_perf.h +++ b/perform/pio_perf.h @@ -36,9 +36,10 @@ typedef struct parameters_ { int num_procs; /* Maximum number of processes to use */ long num_files; /* Number of files to create */ long num_dsets; /* Number of datasets to create */ - off_t num_elmts; /* Number of native ints in each dset */ + off_t num_bytes; /* Number of bytes in each dset */ int num_iters; /* Number of times to loop doing the IO */ size_t buf_size; /* Buffer size */ + size_t blk_size; /* Block size */ unsigned interleaved; /* Interleaved vs. contiguous blocks */ unsigned collective; /* Collective vs. independent I/O */ hsize_t h5_align; /* HDF5 object alignment */ -- cgit v0.12