/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * Copyright by The HDF Group. * * Copyright by the Board of Trustees of the University of Illinois. * * All rights reserved. * * * * This file is part of HDF5. The full HDF5 copyright notice, including * * terms governing use, modification, and redistribution, is contained in * * the files COPYING and Copyright.html. COPYING can be found at the root * * of the source code distribution tree; Copyright.html can be found at the * * root level of an installed copy of the electronic HDF5 document set and * * is linked from the top-level documents page. It can also be found at * * http://hdfgroup.org/HDF5/doc/Copyright.html. If you do not have * * access to either file, you may request a copy from help@hdfgroup.org. * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ /* * (C) 1995-2001 Clemson University and Argonne National Laboratory. * * See COPYING in top-level directory. * * This is contributed by Robert Ross to the HDF5 software. * and was called mpi-io-test.c */ #include "hdf5.h" #include "H5private.h" #ifdef H5_HAVE_PARALLEL /* mpi-perf.c * * This is derived from code given to me by Rajeev Thakur. Dunno where * it originated. * * It's purpose is to produce aggregate bandwidth numbers for varying * block sizes, number of processors, an number of iterations. * * This is strictly an mpi program - it is used to test the MPI I/O * functionality implemented by Romio. * * Compiling is usually easiest with something like: * mpicc -Wall -Wstrict-prototypes mpi-io-test.c -o mpi-io-test * * NOTE: This code assumes that all command line arguments make it out to all * the processes that make up the parallel job, which isn't always the case. * So if it doesn't work on some platform, that might be why. */ /* Modifications: * Albert Cheng, Apr 30, 20001 * Changed MPI_File_open to use MPI_COMM_WORLD (was MPI_COMM_SELF). * Albert Cheng, May 5, 20001 * Changed MPI_File_seek then MPI_File_write or MPI_File_read to just * MPI_File_write_at and MPI_File_read_at. Some compiler, e.g., IBM * mpcc_r does not support MPI_File_seek and MPI_File_read or MPI_File_write. */ #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <unistd.h> #include <errno.h> #include <string.h> #include <sys/time.h> #include <mpi.h> #ifndef MPI_FILE_NULL /*MPIO may be defined in mpi.h already */ # include <mpio.h> #endif /* DEFAULT VALUES FOR OPTIONS */ int64_t opt_block = 1048576*16; int opt_iter = 1; int opt_stripe = -1; int opt_correct = 0; int amode = O_RDWR | O_CREAT; char opt_file[256] = "/tmp/test.out\0"; char opt_pvfstab[256] = "notset\0"; int opt_pvfstab_set = 0; /* function prototypes */ int parse_args(int argc, char **argv); double Wtime(void); extern int errno; extern int debug_on; /* globals needed for getopt */ extern char *optarg; extern int optind, opterr; int main(int argc, char **argv) { char *buf, *tmp, *buf2, *tmp2, *check; int i, j, mynod=0, nprocs=1, err, my_correct = 1, correct, myerrno; double stim, etim; double write_tim = 0; double read_tim = 0; double read_bw, write_bw; double max_read_tim, max_write_tim; double min_read_tim, min_write_tim; double ave_read_tim, ave_write_tim; int64_t iter_jump = 0; int64_t seek_position = 0; MPI_File fh; MPI_Status status; int nchars; /* startup MPI and determine the rank of this process */ MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &mynod); /* parse the command line arguments */ parse_args(argc, argv); if (mynod == 0) printf("# Using mpi-io calls.\n"); /* kindof a weird hack- if the location of the pvfstab file was * specified on the command line, then spit out this location into * the appropriate environment variable: */ #if H5_HAVE_SETENV /* no setenv or unsetenv */ if (opt_pvfstab_set) { if((setenv("PVFSTAB_FILE", opt_pvfstab, 1)) < 0){ perror("setenv"); goto die_jar_jar_die; } } #endif /* this is how much of the file data is covered on each iteration of * the test. used to help determine the seek offset on each * iteration */ iter_jump = nprocs * opt_block; /* setup a buffer of data to write */ if (!(tmp = (char *) malloc(opt_block + 256))) { perror("malloc"); goto die_jar_jar_die; } buf = tmp + 128 - (((long)tmp) % 128); /* align buffer */ if (opt_correct) { /* do the same buffer setup for verifiable data */ if (!(tmp2 = (char *) malloc(opt_block + 256))) { perror("malloc2"); goto die_jar_jar_die; } buf2 = tmp + 128 - (((long)tmp) % 128); } /* open the file for writing */ err = MPI_File_open(MPI_COMM_WORLD, opt_file, MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh); if (err < 0) { fprintf(stderr, "node %d, open error: %s\n", mynod, strerror(errno)); goto die_jar_jar_die; } /* now repeat the write operations the number of times * specified on the command line */ for (j=0; j < opt_iter; j++) { /* calculate the appropriate position depending on the iteration * and rank of the current process */ seek_position = (j*iter_jump)+(mynod*opt_block); if (opt_correct) /* fill in buffer for iteration */ { for (i=mynod+j, check=buf; i<opt_block; i++,check++) *check=(char)i; } /* discover the starting time of the operation */ MPI_Barrier(MPI_COMM_WORLD); stim = MPI_Wtime(); /* write out the data */ nchars = opt_block/sizeof(char); err = MPI_File_write_at(fh, seek_position, buf, nchars, MPI_CHAR, &status); if(err){ fprintf(stderr, "node %d, write error: %s\n", mynod, strerror(errno)); } /* discover the ending time of the operation */ etim = MPI_Wtime(); write_tim += (etim - stim); /* we are done with this "write" iteration */ } err = MPI_File_close(&fh); if(err){ fprintf(stderr, "node %d, close error after write\n", mynod); } /* wait for everyone to synchronize at this point */ MPI_Barrier(MPI_COMM_WORLD); /* reopen the file to read the data back out */ err = MPI_File_open(MPI_COMM_WORLD, opt_file, MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh); if (err < 0) { fprintf(stderr, "node %d, open error: %s\n", mynod, strerror(errno)); goto die_jar_jar_die; } /* we are going to repeat the read operation the number of iterations * specified */ for (j=0; j < opt_iter; j++) { /* calculate the appropriate spot give the current iteration and * rank within the MPI processes */ seek_position = (j*iter_jump)+(mynod*opt_block); /* discover the start time */ MPI_Barrier(MPI_COMM_WORLD); stim = MPI_Wtime(); /* read in the file data */ if (!opt_correct){ err = MPI_File_read_at(fh, seek_position, buf, nchars, MPI_CHAR, &status); } else{ err = MPI_File_read_at(fh, seek_position, buf2, nchars, MPI_CHAR, &status); } myerrno = errno; /* discover the end time */ etim = MPI_Wtime(); read_tim += (etim - stim); if (err < 0) fprintf(stderr, "node %d, read error, loc = %Ld: %s\n", mynod, mynod*opt_block, strerror(myerrno)); /* if the user wanted to check correctness, compare the write * buffer to the read buffer */ if (opt_correct && memcmp(buf, buf2, opt_block)) { fprintf(stderr, "node %d, correctness test failed\n", mynod); my_correct = 0; MPI_Allreduce(&my_correct, &correct, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); } /* we are done with this read iteration */ } /* close the file */ err = MPI_File_close(&fh); if(err){ fprintf(stderr, "node %d, close error after write\n", mynod); } /* compute the read and write times */ MPI_Allreduce(&read_tim, &max_read_tim, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&read_tim, &min_read_tim, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&read_tim, &ave_read_tim, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); /* calculate the average from the sum */ ave_read_tim = ave_read_tim / nprocs; MPI_Allreduce(&write_tim, &max_write_tim, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&write_tim, &min_write_tim, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&write_tim, &ave_write_tim, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); /* calculate the average from the sum */ ave_write_tim = ave_write_tim / nprocs; /* print out the results on one node */ if (mynod == 0) { read_bw = ((int64_t)(opt_block*nprocs*opt_iter))/(max_read_tim*1000000.0); write_bw = ((int64_t)(opt_block*nprocs*opt_iter))/(max_write_tim*1000000.0); printf("nr_procs = %d, nr_iter = %d, blk_sz = %ld\n", nprocs, opt_iter, (long)opt_block); printf("# total_size = %ld\n", (long)(opt_block*nprocs*opt_iter)); printf("# Write: min_time = %f, max_time = %f, mean_time = %f\n", min_write_tim, max_write_tim, ave_write_tim); printf("# Read: min_time = %f, max_time = %f, mean_time = %f\n", min_read_tim, max_read_tim, ave_read_tim); printf("Write bandwidth = %f Mbytes/sec\n", write_bw); printf("Read bandwidth = %f Mbytes/sec\n", read_bw); if (opt_correct) { printf("Correctness test %s.\n", correct ? "passed" : "failed"); } } die_jar_jar_die: #if H5_HAVE_SETENV /* no setenv or unsetenv */ /* clear the environment variable if it was set earlier */ if (opt_pvfstab_set){ unsetenv("PVFSTAB_FILE"); } #endif free(tmp); if (opt_correct) free(tmp2); MPI_Finalize(); return(0); } int parse_args(int argc, char **argv) { int c; while ((c = getopt(argc, argv, "s:b:i:f:p:c")) != EOF) { switch (c) { case 's': /* stripe */ opt_stripe = atoi(optarg); break; case 'b': /* block size */ opt_block = atoi(optarg); break; case 'i': /* iterations */ opt_iter = atoi(optarg); break; case 'f': /* filename */ strncpy(opt_file, optarg, 255); break; case 'p': /* pvfstab file */ strncpy(opt_pvfstab, optarg, 255); opt_pvfstab_set = 1; break; case 'c': /* correctness */ opt_correct = 1; break; case '?': /* unknown */ default: break; } } return(0); } /* Wtime() - returns current time in sec., in a double */ double Wtime() { struct timeval t; gettimeofday(&t, NULL); return((double)t.tv_sec + (double)t.tv_usec / 1000000); } /* * Local variables: * c-indent-level: 3 * c-basic-offset: 3 * tab-width: 3 * End: */ #else /* H5_HAVE_PARALLEL */ /* dummy program since H5_HAVE_PARALLE is not configured in */ int main() { printf("No parallel performance because parallel is not configured in\n"); return(0); } #endif /* H5_HAVE_PARALLEL */