/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * Copyright by The HDF Group.                                               *
 * Copyright by the Board of Trustees of the University of Illinois.         *
 * All rights reserved.                                                      *
 *                                                                           *
 * This file is part of HDF5.  The full HDF5 copyright notice, including     *
 * terms governing use, modification, and redistribution, is contained in    *
 * the files COPYING and Copyright.html.  COPYING can be found at the root   *
 * of the source code distribution tree; Copyright.html can be found at the  *
 * root level of an installed copy of the electronic HDF5 document set and   *
 * is linked from the top-level documents page.  It can also be found at     *
 * http://hdfgroup.org/HDF5/doc/Copyright.html.  If you do not have          *
 * access to either file, you may request a copy from help@hdfgroup.org.     *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/*
 * Author: Albert Cheng of NCSA, May 1, 2001.
 * This is derived from code given to me by Robert Ross.
 *
 * NOTE: This code assumes that the command line arguments reach all of
 * the processes that make up the parallel job, which is not guaranteed
 * by MPI.  If the program misbehaves on some platform, that may be why.
 */
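
/*
 * Usage (options are parsed in parse_args() below):
 *  -b <bytes>      bytes written per process per iteration (default 16 MB)
 *  -i <count>      number of iterations (default 1)
 *  -f <filename>   output file name (default "perftest.out")
 *  -p <pvfstab>    location of the PVFS tab file
 *  -s <stripe>     stripe size (parsed but not otherwise used here)
 *  -a <alignment>[/<threshold>]   values passed to H5Pset_alignment()
 *  -2 <meta_ext>,<raw_ext>        use the split file driver
 *  -c              verify the data read back against the data written
 */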

#include "hdf5.h"
#include "H5private.h"
#ifdef H5_HAVE_PARALLEL
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#ifdef H5_HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <errno.h>
#include <string.h>
#if defined(H5_TIME_WITH_SYS_TIME)
#   include <sys/time.h>
#   include <time.h>
#elif defined(H5_HAVE_SYS_TIME_H)
#   include <sys/time.h>
#else
#   include <time.h>
#endif
#include <mpi.h>
#ifndef MPI_FILE_NULL           /*MPIO may be defined in mpi.h already       */
#   include <mpio.h>
#endif


/* Macro definitions */
/* Verify:
 * If val is false (0), print mesg; if fatal is also true (non-zero),
 * jump to the cleanup code at the end of main().
 */
#define H5FATAL 1
#define VRFY(val, mesg, fatal) do {                                           \
    if (!(val)) {                                                             \
        printf("Proc %d: ", mynod);                                           \
        printf("*** Assertion failed (%s) at line %4d in %s\n",               \
               mesg, (int)__LINE__, __FILE__);                                \
        if (fatal) {                                                          \
            fflush(stdout);                                                   \
            goto die_jar_jar_die;                                             \
        }                                                                     \
    }                                                                         \
} while(0)
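/* Typical use, as in the calls below:
 *     VRFY((fid >= 0), "H5Fcreate succeeded", H5FATAL);
 * evaluates the condition, reports mesg on failure, and, because
 * H5FATAL is non-zero, jumps to the cleanup code.
 */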
#define RANK 1
hsize_t dims[RANK];             /* dataset dim sizes */
hsize_t block[RANK], stride[RANK], count[RANK];
hsize_t start[RANK];
hid_t fid;                      /* HDF5 file ID */
hid_t acc_tpl;                  /* File access template */
hid_t sid;                      /* Dataspace ID */
hid_t file_dataspace;           /* File dataspace ID */
hid_t mem_dataspace;            /* Memory dataspace ID */
hid_t dataset;                  /* Dataset ID */
hsize_t opt_alignment = 1;
hsize_t opt_threshold = 1;
int   opt_split_vfd  = 0;
char  *meta_ext, *raw_ext;      /* meta and raw file extensions when
                                 * opt_split_vfd is set */


/* DEFAULT VALUES FOR OPTIONS */
int64_t opt_block     = 1048576*16;
int     opt_iter      = 1;
int     opt_stripe    = -1;
int     opt_correct   = 0;
int     amode         = O_RDWR | O_CREAT;
char    opt_file[256] = "perftest.out";
char    opt_pvfstab[256] = "notset";
int     opt_pvfstab_set = 0;

/* function prototypes */
static int parse_args(int argc, char **argv);


/* globals needed for getopt */
extern char *optarg;

int main(int argc, char **argv)
{
    char *buf = NULL, *tmp = NULL, *buf2 = NULL, *tmp2 = NULL, *check;
    int i, j, mynod=0, nprocs=1, err, my_correct = 1, correct, myerrno;
    double stim, etim;
    double write_tim = 0;
    double read_tim = 0;
    double read_bw, write_bw;
    double max_read_tim, max_write_tim;
    double min_read_tim, min_write_tim;
    double ave_read_tim, ave_write_tim;
    int64_t iter_jump = 0;
    int64_t seek_position = 0;
    MPI_File fh;
    MPI_Status status;
    int nchars;
    herr_t ret;           /* Generic return value */

    /* startup MPI and determine the rank of this process */
    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &mynod);

    /* parse the command line arguments */
    parse_args(argc, argv);

    if (mynod == 0) printf("# Using hdf5-io calls.\n");


    /* If the location of the pvfstab file was specified on the command
     * line, pass it along to the I/O layers through the corresponding
     * environment variable. */

#if H5_HAVE_SETENV
    /* setenv()/unsetenv() are available on this platform */
    if (opt_pvfstab_set) {
            if((setenv("PVFSTAB_FILE", opt_pvfstab, 1)) < 0){
                    perror("setenv");
                    goto die_jar_jar_die;
            }
    }
#endif

    /* this is how much of the file data is covered on each iteration of
     * the test.  used to help determine the seek offset on each
     * iteration */
    iter_jump = nprocs * opt_block;
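
    /* For example, with 4 processes and the default 16 MB block size,
     * iter_jump is 64 MB; on iteration j, process r starts its access at
     * byte offset j*iter_jump + r*opt_block. */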

    /* setup a buffer of data to write */
    if (!(tmp = (char *) malloc(opt_block + 256))) {
            perror("malloc");
            goto die_jar_jar_die;
    }
    buf = tmp + 128 - (((long)tmp) % 128);  /* align buffer */
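    /* e.g., if malloc returns address 0x1008, buf becomes
     * 0x1008 + 128 - 8 = 0x1080, a 128-byte-aligned address within the
     * 256 bytes of slack allocated above. */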

    if (opt_correct) {
            /* do the same buffer setup for verifiable data */
            if (!(tmp2 = (char *) malloc(opt_block + 256))) {
                    perror("malloc2");
                    goto die_jar_jar_die;
            }
            buf2 = tmp2 + 128 - (((long)tmp2) % 128);   /* align buffer */
    }

    /* setup file access template with parallel IO access. */
    if (opt_split_vfd){
        hid_t mpio_pl;

        mpio_pl = H5Pcreate (H5P_FILE_ACCESS);
        VRFY((mpio_pl >= 0), "H5Pcreate mpio_pl succeeded", H5FATAL);
        ret = H5Pset_fapl_mpio(mpio_pl, MPI_COMM_WORLD, MPI_INFO_NULL);
        VRFY((ret >= 0), "H5Pset_fapl_mpio succeeded", H5FATAL);

        /* setup file access template */
        acc_tpl = H5Pcreate (H5P_FILE_ACCESS);
        VRFY((acc_tpl >= 0), "H5Pcreate acc_tpl succeeded", H5FATAL);

        /* set optional allocation alignment */
        if (opt_alignment*opt_threshold != 1){
            ret = H5Pset_alignment(acc_tpl, opt_threshold, opt_alignment);
            VRFY((ret >= 0), "H5Pset_alignment succeeded", !H5FATAL);
        }

        ret = H5Pset_fapl_split(acc_tpl, meta_ext, mpio_pl, raw_ext, mpio_pl);
        VRFY((ret >= 0), "H5Pset_fapl_split succeeded", H5FATAL);
        ret = H5Pclose(mpio_pl);
        VRFY((ret >= 0), "H5Pclose mpio_pl succeeded", H5FATAL);
    }else{
        /* setup file access template */
        acc_tpl = H5Pcreate (H5P_FILE_ACCESS);
        VRFY((acc_tpl >= 0), "H5Pcreate acc_tpl succeeded", H5FATAL);
        ret = H5Pset_fapl_mpio(acc_tpl, MPI_COMM_WORLD, MPI_INFO_NULL);
        VRFY((ret >= 0), "H5Pset_fapl_mpio succeeded", H5FATAL);

        /* set optional allocation alignment */
        if (opt_alignment*opt_threshold != 1){
            ret = H5Pset_alignment(acc_tpl, opt_threshold, opt_alignment);
            VRFY((ret >= 0), "H5Pset_alignment succeeded", !H5FATAL);
        }
    }

    /* create the parallel file */
    fid = H5Fcreate(opt_file, H5F_ACC_TRUNC, H5P_DEFAULT, acc_tpl);
    VRFY((fid >= 0), "H5Fcreate succeeded", H5FATAL);

    /* define a contiguous dataset of opt_iter*nprocs*opt_block chars */
    dims[0] = opt_iter * nprocs * opt_block;
    sid = H5Screate_simple(RANK, dims, NULL);
    VRFY((sid >= 0), "H5Screate_simple succeeded", H5FATAL);
    dataset = H5Dcreate2(fid, "Dataset1", H5T_NATIVE_CHAR, sid,
      		H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
    VRFY((dataset >= 0), "H5Dcreate2 succeeded", H5FATAL);

    /* create the memory dataspace and the file dataspace */
    dims[0] = opt_block;
    mem_dataspace = H5Screate_simple(RANK, dims, NULL);
    VRFY((mem_dataspace >= 0), "", H5FATAL);
    file_dataspace = H5Dget_space(dataset);
    VRFY((file_dataspace >= 0), "H5Dget_space succeeded", H5FATAL);

    /* now each process writes a block of opt_block chars in round-robin
     * fashion until the whole dataset is covered.
     */
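    /* Each pass selects one contiguous run of opt_block elements in the
     * file dataspace: count[0] = 1 block of block[0] = opt_block elements
     * beginning at start[0].  With a single block the stride has no
     * effect; it is set to opt_block only for clarity. */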
    for(j=0; j < opt_iter; j++) {
        /* setup a file dataspace selection */
        start[0] = (j*iter_jump)+(mynod*opt_block);
        stride[0] = block[0] = opt_block;
        count[0]= 1;
        ret=H5Sselect_hyperslab(file_dataspace, H5S_SELECT_SET, start, stride, count, block);
        VRFY((ret >= 0), "H5Sset_hyperslab succeeded", H5FATAL);

        if (opt_correct) {
            /* fill in the write buffer for this iteration */
            for (i = mynod + j, check = buf; i < opt_block; i++, check++)
                *check = (char)i;
        }

            /* discover the starting time of the operation */
       MPI_Barrier(MPI_COMM_WORLD);
       stim = MPI_Wtime();

        /* write data */
        ret = H5Dwrite(dataset, H5T_NATIVE_CHAR, mem_dataspace, file_dataspace,
                H5P_DEFAULT, buf);
        VRFY((ret >= 0), "H5Dwrite dataset1 succeeded", !H5FATAL);

            /* discover the ending time of the operation */
       etim = MPI_Wtime();

       write_tim += (etim - stim);

            /* we are done with this "write" iteration */
    }

    /* close dataset and file */
    ret=H5Dclose(dataset);
    VRFY((ret >= 0), "H5Dclose succeeded", H5FATAL);
    ret=H5Fclose(fid);
    VRFY((ret >= 0), "H5Fclose succeeded", H5FATAL);



    /* wait for everyone to synchronize at this point */
    MPI_Barrier(MPI_COMM_WORLD);

    /* reopen the file for reading */
    fid=H5Fopen(opt_file,H5F_ACC_RDONLY,acc_tpl);
    VRFY((fid >= 0), "", H5FATAL);

    /* open the dataset */
    dataset = H5Dopen2(fid, "Dataset1", H5P_DEFAULT);
    VRFY((dataset >= 0), "H5Dopen succeeded", H5FATAL);

    /* we can re-use the same mem_dataspace and file_dataspace
     * the H5Dwrite used since the dimension size is the same.
     */

    /* we are going to repeat the read the same pattern the write used */
    for (j=0; j < opt_iter; j++) {
        /* setup a file dataspace selection */
        start[0] = (j*iter_jump)+(mynod*opt_block);
        stride[0] = block[0] = opt_block;
        count[0]= 1;
        ret=H5Sselect_hyperslab(file_dataspace, H5S_SELECT_SET, start, stride, count, block);
        VRFY((ret >= 0), "H5Sselect_hyperslab succeeded", H5FATAL);

        /* the hyperslab selection above takes the place of a seek to the
         * appropriate spot, given the current iteration and the rank of
         * this process */

            /* discover the start time */
       MPI_Barrier(MPI_COMM_WORLD);
       stim = MPI_Wtime();

        /* read in the file data */
        if (!opt_correct){
            ret = H5Dread(dataset, H5T_NATIVE_CHAR, mem_dataspace, file_dataspace, H5P_DEFAULT, buf);
        }
        else{
            ret = H5Dread(dataset, H5T_NATIVE_CHAR, mem_dataspace, file_dataspace, H5P_DEFAULT, buf2);
        }
        myerrno = errno;

        /* discover the end time */
       etim = MPI_Wtime();
       read_tim += (etim - stim);
        VRFY((ret >= 0), "H5Dread dataset1 succeeded", !H5FATAL);


       if (ret < 0) fprintf(stderr, "node %d, read error, loc = %lld: %s\n",
                    mynod, (long long)(mynod*opt_block), strerror(myerrno));

        /* if the user wanted to check correctness, compare the write
         * buffer to the read buffer */
        if (opt_correct && memcmp(buf, buf2, opt_block)) {
            fprintf(stderr, "node %d, correctness test failed\n", mynod);
            my_correct = 0;
        }

        /* we are done with this read iteration */
    }

    /* combine the per-process correctness results; MPI_Allreduce is a
     * collective call, so every process must make it, not just the ones
     * that detected a mismatch */
    if (opt_correct) {
        MPI_Allreduce(&my_correct, &correct, 1, MPI_INT, MPI_MIN,
                MPI_COMM_WORLD);
    }

    /* close dataset and file */
    ret=H5Dclose(dataset);
    VRFY((ret >= 0), "H5Dclose succeeded", H5FATAL);
    ret=H5Fclose(fid);
    VRFY((ret >= 0), "H5Fclose succeeded", H5FATAL);
    ret=H5Pclose(acc_tpl);
    VRFY((ret >= 0), "H5Pclose succeeded", H5FATAL);

    /* compute the read and write times */
    MPI_Allreduce(&read_tim, &max_read_tim, 1, MPI_DOUBLE, MPI_MAX,
            MPI_COMM_WORLD);
    MPI_Allreduce(&read_tim, &min_read_tim, 1, MPI_DOUBLE, MPI_MIN,
            MPI_COMM_WORLD);
    MPI_Allreduce(&read_tim, &ave_read_tim, 1, MPI_DOUBLE, MPI_SUM,
            MPI_COMM_WORLD);

    /* calculate the average from the sum */
    ave_read_tim = ave_read_tim / nprocs;

    MPI_Allreduce(&write_tim, &max_write_tim, 1, MPI_DOUBLE, MPI_MAX,
            MPI_COMM_WORLD);
    MPI_Allreduce(&write_tim, &min_write_tim, 1, MPI_DOUBLE, MPI_MIN,
            MPI_COMM_WORLD);
    MPI_Allreduce(&write_tim, &ave_write_tim, 1, MPI_DOUBLE, MPI_SUM,
            MPI_COMM_WORLD);

    /* calculate the average from the sum */
    ave_write_tim = ave_write_tim / nprocs;

    /* print out the results on one node */
    if (mynod == 0) {
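       /* Report aggregate bandwidth as seen by the slowest process:
        * total bytes moved divided by the maximum elapsed time across
        * ranks, converted to Mbytes/sec. */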
       read_bw = ((int64_t)(opt_block*nprocs*opt_iter))/(max_read_tim*1000000.0);
       write_bw = ((int64_t)(opt_block*nprocs*opt_iter))/(max_write_tim*1000000.0);

                    printf("nr_procs = %d, nr_iter = %d, blk_sz = %ld\n", nprocs,
            opt_iter, (long)opt_block);

                    printf("# total_size = %ld\n", (long)(opt_block*nprocs*opt_iter));

                    printf("# Write:  min_time = %f, max_time = %f, mean_time = %f\n",
                            min_write_tim, max_write_tim, ave_write_tim);
                    printf("# Read:  min_time = %f, max_time = %f, mean_time = %f\n",
                            min_read_tim, max_read_tim, ave_read_tim);

       printf("Write bandwidth = %f Mbytes/sec\n", write_bw);
       printf("Read bandwidth = %f Mbytes/sec\n", read_bw);

            if (opt_correct) {
                    printf("Correctness test %s.\n", correct ? "passed" : "failed");
            }
    }


die_jar_jar_die:

#if H5_HAVE_SETENV
    /* clear the environment variable if it was set earlier */
    if  (opt_pvfstab_set){
            unsetenv("PVFSTAB_FILE");
    }
#endif

    free(tmp);
    if (opt_correct) free(tmp2);
    MPI_Finalize();

    return(0);
}

static int
parse_args(int argc, char **argv)
{
    int c;

    while ((c = getopt(argc, argv, "s:b:i:f:p:a:2:c")) != EOF) {
        switch (c) {
            case 's': /* stripe */
                opt_stripe = atoi(optarg);
                break;
            case 'b': /* block size */
                opt_block = strtoll(optarg, NULL, 10); /* allow > INT_MAX */
                break;
            case 'i': /* iterations */
                opt_iter = atoi(optarg);
                break;
            case 'f': /* filename */
                strncpy(opt_file, optarg, 255);
                break;
            case 'p': /* pvfstab file */
                strncpy(opt_pvfstab, optarg, 255);
                opt_pvfstab_set = 1;
                break;
            case 'a': /* aligned allocation.
                       * syntax: -a<alignment>/<threshold>
                       * e.g., -a4096/512 allocates on 4096-byte
                       * boundaries for requests of 512 bytes or more.
                       */
                {
                    char *p;

                    opt_alignment = atoi(optarg);
                    if ((p = strchr(optarg, '/')) != NULL)
                        opt_threshold = atoi(p + 1);
                }
                HDfprintf(stdout, "alignment/threshold=%Hu/%Hu\n",
                          opt_alignment, opt_threshold);
                break;
            case '2': /* use 2-files, i.e., split file driver */
                opt_split_vfd=1;
                /* get the meta and raw file extensions. */
                /* syntax is <meta_ext>,<raw_ext> */
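                /* e.g., "-2 h5meta,h5raw" leaves meta_ext pointing at
                 * "h5meta" and raw_ext at "h5raw"; the ',' is overwritten
                 * with a terminating NUL in place. */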
                meta_ext = raw_ext = optarg;
                while (*raw_ext != '\0'){
                    if (*raw_ext == ','){
                        *raw_ext = '\0';
                        raw_ext++;
                        break;
                    }
                    raw_ext++;
                }
                printf("split-file-vfd used: %s,%s\n",
                    meta_ext, raw_ext);
                break;
            case 'c': /* correctness */
                opt_correct = 1;
                break;
            case '?': /* unknown */
            default:
                break;
        }
    }

    return(0);
}

/*
 * Local variables:
 *  c-indent-level: 3
 *  c-basic-offset: 3
 *  tab-width: 3
 * End:
 */

#else /* H5_HAVE_PARALLEL */
/* dummy program since H5_HAVE_PARALLEL is not configured in */
int
main(int UNUSED argc, char UNUSED **argv)
{
    printf("No parallel performance because parallel is not configured in\n");
    return(0);
}
#endif /* H5_HAVE_PARALLEL */