/*
 * MPIO independent overlapping writes.
 *
 * First n-1 processes open 1 file.
 * Each of the n-1 process writes chunks of data to the file in round-robin
 * fashion, in a interleaved but not overlapped fashion.  Using increasing
 * chunk sizes for the benefits of testing different write sizes and also
 * reducing the numbers of writes.
 *
 * Last process (n-1) just waits.
 * First n-1 processes finish writing and cloose the file.
 * Last process opens the same file and verifies the data.
 */

#include "testphdf5.h"

/* FILENAME and filenames must have the same number of names */
const char *FILENAME[2]={
	    "MPItest",
	    NULL};
char	filenames[2][200];
int	nerrors = 0;
int	verbose = 0;
hid_t	fapl;				/* file access property list */

#define MPIO_TEST_WRITE_SIZE 1024*1024     /* 1 MB */

void
test_mpio_overlap_writes(char *filename)
{
    int mpi_size, mpi_rank;
    MPI_Comm comm;
    MPI_Info info = MPI_INFO_NULL;
    int color, mrc;
    MPI_File	fh;
    int i;
    int vrfyerrs;
    char  buf[4093];		/* use some prime number for size */
    int bufsize = sizeof(buf);
    int stride;
    MPI_Offset  mpi_off;
    MPI_Status  mpi_stat;


    if (verbose)
	printf("MPIO independent overlapping writes test on file %s\n",
	    filename);

    /* set up MPI parameters */
    MPI_Comm_size(MPI_COMM_WORLD,&mpi_size);
    MPI_Comm_rank(MPI_COMM_WORLD,&mpi_rank);

    /* Need at least 2 processes */
    if (mpi_size < 2) {
	if (MAINPROCESS)
	    printf("Need at least 2 processes to run MPIO test.\n");
	    printf(" -SKIP- \n");
	return;
    }

    /* splits processes 0 to n-2 into one comm. and the last one into another */
    color = ((mpi_rank < (mpi_size - 1)) ? 0 : 1);
    mrc = MPI_Comm_split (MPI_COMM_WORLD, color, mpi_rank, &comm);
    VRFY((mrc==MPI_SUCCESS), "Comm_split succeeded");

    if (color==0){
	/* First n-1 processes (color==0) open a file and write it */
	mrc = MPI_File_open(comm, filename, MPI_MODE_CREATE|MPI_MODE_RDWR,
		info, &fh);
	VRFY((mrc==MPI_SUCCESS), "");

	stride = 1;
	mpi_off = mpi_rank*stride;
	while (mpi_off < MPIO_TEST_WRITE_SIZE){
	    /* make sure the write does not exceed the TEST_WRITE_SIZE */
	    if (mpi_off+stride > MPIO_TEST_WRITE_SIZE)
		stride = MPIO_TEST_WRITE_SIZE - mpi_off;

	    /* set data to some trivial pattern for easy verification */
	    for (i=0; i<stride; i++)
		buf[i] = (mpi_off+i) & 0x7f;
	    mrc = MPI_File_write_at(fh, mpi_off, buf, stride, MPI_BYTE,
		    &mpi_stat);
	    VRFY((mrc==MPI_SUCCESS), "");
	    
	    /* move the offset pointer to last byte written by all processes */
	    mpi_off += (mpi_size - 1 - mpi_rank) * stride;

	    /* Increase chunk size without exceeding buffer size. */
	    /* Then move the starting offset for next write. */
	    stride *= 2;
	    if (stride > bufsize)
		stride = bufsize;
	    mpi_off += mpi_rank*stride;
	}

	/* close file and free the communicator */
	mrc = MPI_File_close(&fh);
	VRFY((mrc==MPI_SUCCESS), "MPI_FILE_CLOSE");
	mrc = MPI_Comm_free(&comm);
	VRFY((mrc==MPI_SUCCESS), "MPI_Comm_free");
    
	/* sync with the other waiting processes */
	mrc = MPI_Barrier(MPI_COMM_WORLD);
	VRFY((mrc==MPI_SUCCESS), "Sync after writes");
    }else{
	/* last process waits till writes are done,
	 * then opens file to verify data.
	 */
	mrc = MPI_Barrier(MPI_COMM_WORLD);
	VRFY((mrc==MPI_SUCCESS), "Sync after writes");

	mrc = MPI_File_open(comm, filename, MPI_MODE_RDONLY,
		info, &fh);
	VRFY((mrc==MPI_SUCCESS), "");

	stride = bufsize;
	for (mpi_off=0; mpi_off < MPIO_TEST_WRITE_SIZE; mpi_off += bufsize){
	    /* make sure it does not read beyond end of data */
	    if (mpi_off+stride > MPIO_TEST_WRITE_SIZE)
		stride = MPIO_TEST_WRITE_SIZE - mpi_off;
	    mrc = MPI_File_read_at(fh, mpi_off, buf, stride, MPI_BYTE,
		    &mpi_stat);
	    VRFY((mrc==MPI_SUCCESS), "");
	    vrfyerrs=0;
	    for (i=0; i<stride; i++){
		char expected;
		expected = (mpi_off+i) & 0x7f;
		if ((buf[i] != expected) &&
		    (vrfyerrs++ < MAX_ERR_REPORT || verbose))
			printf("proc %d: found data error at [%ld], expect %d, got %d\n",
			    mpi_rank, mpi_off+i, expected, buf[i]);
	    }
	    if (vrfyerrs > MAX_ERR_REPORT && !verbose)
		printf("proc %d: [more errors ...]\n", mpi_rank);
	}

	/* close file and free the communicator */
	mrc = MPI_File_close(&fh);
	VRFY((mrc==MPI_SUCCESS), "MPI_FILE_CLOSE");
	mrc = MPI_Comm_free(&comm);
	VRFY((mrc==MPI_SUCCESS), "MPI_Comm_free");
    }

    /*
     * one more sync to ensure all processes have done reading
     * before ending this test.
     */
    mrc = MPI_Barrier(MPI_COMM_WORLD);
    VRFY((mrc==MPI_SUCCESS), "Sync before leaving test");
}


#define MB      1048576        /* 1024*1024 == 2**20 */
#define GB      1073741824     /* 1024**3 == 2**30 */
#define TWO_GB_LESS1    2147483647     /* 2**31 - 1 */
#define FOUR_GB_LESS1   4294967295L     /* 2**32 - 1 */
/*
 * Verify that MPI_Offset exceeding 2**31 can be computed correctly.
 * Print any failure as information only, not as an error so that this
 * won't abort the remaining test or other separated tests.
 */
void
test_mpio_offset(void)
{
    int mpi_size, mpi_rank;
    MPI_Offset  mpi_off;
    MPI_Offset  mpi_off_old;
    int i;

    /* set up MPI parameters */
    MPI_Comm_size(MPI_COMM_WORLD,&mpi_size);
    MPI_Comm_rank(MPI_COMM_WORLD,&mpi_rank);

    if (verbose)
        printf("MPIO OFFSET test\n");

    /* verify correctness of assigning 2GB sizes */
    mpi_off = 2 * 1024 * (MPI_Offset)MB;
    INFO((mpi_off>0), "2GB OFFSET assignment no overflow");
    INFO((mpi_off-1)==TWO_GB_LESS1, "2GB OFFSET assignment succeed");

    /* verify correctness of increasing from below 2 GB to above 2GB */
    mpi_off = TWO_GB_LESS1;
    for (i=0; i < 3; i++){
        mpi_off_old = mpi_off;
        mpi_off = mpi_off + 1;
	/* no overflow */
        INFO((mpi_off>0), "2GB OFFSET increment no overflow");
	/* correct inc. */
        INFO((mpi_off-1)==mpi_off_old, "2GB OFFSET increment succeed");
    }

    /* verify correctness of assigning 4GB sizes */
    mpi_off = 4 * 1024 * (MPI_Offset)MB;
    INFO((mpi_off>0), "4GB OFFSET assignment no overflow");
    INFO((mpi_off-1)==FOUR_GB_LESS1, "4GB OFFSET assignment succeed");

    /* verify correctness of increasing from below 4 GB to above 4 GB */
    mpi_off = FOUR_GB_LESS1;
    for (i=0; i < 3; i++){
        mpi_off_old = mpi_off;
        mpi_off = mpi_off + 1;
	/* no overflow */
        INFO((mpi_off>0), "4GB OFFSET increment no overflow");
	/* correct inc. */
	INFO((mpi_off-1)==mpi_off_old, "4GB OFFSET increment succeed");
    }
}


/*
 * Test if MPIO can write file from under 2GB to over 2GB and then 
 * from under 4GB to over 4GB.
 * Each process writes 1MB in round robin fashion.
 * Then reads the file back in by reverse order, that is process 0 
 * reads the data of process n-1 and vice versa.
 */
void
test_mpio_gb_file(char *filename)
{
    int mpi_size, mpi_rank;
    MPI_Info info = MPI_INFO_NULL;
    int mrc;
    MPI_File	fh;
    int i, j, n;
    int vrfyerrs;
    int writerrs;		/* write errors */
    int ntimes;			/* how many times */
    char  *buf;
    char  expected;
    MPI_Offset  mpi_off;
    MPI_Status  mpi_stat;


    /* set up MPI parameters */
    MPI_Comm_size(MPI_COMM_WORLD,&mpi_size);
    MPI_Comm_rank(MPI_COMM_WORLD,&mpi_rank);

    if (verbose)
	printf("MPIO GB file test %s\n", filename);

    buf = malloc(MB);
    VRFY((buf!=NULL), "malloc succeed");

    /* open a new file. Remove it first in case it exists. */
    if (MAINPROCESS)
	remove(filename);
    MPI_Barrier(MPI_COMM_WORLD);	/* prevent racing condition */

    mrc = MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE|MPI_MODE_RDWR,
		info, &fh);
    VRFY((mrc==MPI_SUCCESS), "");

    printf("MPIO GB file write test %s\n", filename);

    /* instead of writing every bytes of the file, we will just write
     * some data around the 2 and 4 GB boundaries.  That should cover
     * potential integer overflow and filesystem size limits.
     */
    writerrs = 0;
    for (n=2; n <= 4; n+=2){
	ntimes = GB/MB*n/mpi_size + 1;
	for (i=ntimes-2; i <= ntimes; i++){
	    mpi_off = (i*mpi_size + mpi_rank)*(MPI_Offset)MB;
	    if (verbose)
		printf("proc %d: write to mpi_off=%016llx, %lld\n",
		    mpi_rank, mpi_off, mpi_off);
	    /* set data to some trivial pattern for easy verification */
	    for (j=0; j<MB; j++)
		*(buf+j) = i*mpi_size + mpi_rank;
	    if (verbose)
		printf("proc %d: writing %d bytes at offset %lld\n",
		    mpi_rank, MB, mpi_off);
	    mrc = MPI_File_write_at(fh, mpi_off, buf, MB, MPI_BYTE, &mpi_stat);
	    INFO((mrc==MPI_SUCCESS), "GB size file write");
	    if (mrc!=MPI_SUCCESS)
		writerrs++;
	}
    }

    /* close file and free the communicator */
    mrc = MPI_File_close(&fh);
    VRFY((mrc==MPI_SUCCESS), "MPI_FILE_CLOSE");
    
    mrc = MPI_Barrier(MPI_COMM_WORLD);
    VRFY((mrc==MPI_SUCCESS), "Sync after writes");

    /* open it again to verify the data written */
    /* but only if there was no write errors */
    printf("MPIO GB file read test %s\n", filename);
    if (writerrs){
	printf("proc %d: Skip read test due to previous write errors\n",
	    mpi_rank);
	return;
    }
    mrc = MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY, info, &fh);
    VRFY((mrc==MPI_SUCCESS), "");

    /* Only read back parts of the file that have been written. */
    for (n=2; n <= 4; n+=2){
	ntimes = GB/MB*n/mpi_size + 1;
	for (i=ntimes-2; i <= ntimes; i++){
	    mpi_off = (i*mpi_size + (mpi_size - mpi_rank - 1))*(MPI_Offset)MB;
	    if (verbose)
		printf("proc %d: read from mpi_off=%016llx, %lld\n",
		    mpi_rank, mpi_off, mpi_off);
	    mrc = MPI_File_read_at(fh, mpi_off, buf, MB, MPI_BYTE, &mpi_stat);
	    INFO((mrc==MPI_SUCCESS), "GB size file read");
	    expected = i*mpi_size + (mpi_size - mpi_rank - 1);
	    vrfyerrs=0;
	    for (j=0; j<MB; j++){
		if ((*(buf+j) != expected) &&
		    (vrfyerrs++ < MAX_ERR_REPORT || verbose))
			printf("proc %d: found data error at [%ld+%d], expect %d, got %d\n",
			    mpi_rank, mpi_off, j, expected, *(buf+j));
	    }
	    if (vrfyerrs > MAX_ERR_REPORT && !verbose)
		printf("proc %d: [more errors ...]\n", mpi_rank);

	}
    }

    /* close file and free the communicator */
    mrc = MPI_File_close(&fh);
    VRFY((mrc==MPI_SUCCESS), "MPI_FILE_CLOSE");

    /*
     * one more sync to ensure all processes have done reading
     * before ending this test.
     */
    mrc = MPI_Barrier(MPI_COMM_WORLD);
    VRFY((mrc==MPI_SUCCESS), "Sync before leaving test");
}


/*
 * parse the command line options
 */
int
parse_options(int argc, char **argv)
{
    while (--argc){
	if (**(++argv) != '-'){
	    break;
	}else{
	    switch(*(*argv+1)){
		case 'v':   verbose = 1;
			    break;
		case 'f':   if (--argc < 1) {
				nerrors++;
				return(1);
			    }
			    if (**(++argv) == '-') {
				nerrors++;
				return(1);
			    }
			    paraprefix = *argv;
			    break;
		case 'h':   /* print help message--return with nerrors set */
			    return(1);
		default:    nerrors++;
			    return(1);
	    }
	}
    } /*while*/

    /* compose the test filenames */
    {
	int i, n;
	hid_t plist;

	plist = H5Pcreate (H5P_FILE_ACCESS);
	H5Pset_fapl_mpio(plist, MPI_COMM_WORLD, MPI_INFO_NULL);
	n = sizeof(FILENAME)/sizeof(FILENAME[0]) - 1;	/* exclude the NULL */

	for (i=0; i < n; i++)
	    if (h5_fixname(FILENAME[i],plist,filenames[i],sizeof(filenames[i]))
		== NULL){
		printf("h5_fixname failed\n");
		nerrors++;
		return(1);
	    }
	H5Pclose(plist);
	printf("Test filenames are:\n");
	for (i=0; i < n; i++)
	    printf("    %s\n", filenames[i]);
    }

    return(0);
}


/*
 * Show command usage
 */
void
usage(void)
{
    printf("Usage: t_mpi [-v] [-f <prefix>]\n");
    printf("\t-v\t\tverbose on\n");
    printf("\t-f <prefix>\tfilename prefix\n");
    printf("\n");
}


int
main(int argc, char **argv)
{
    int mpi_size, mpi_rank;				/* mpi variables */

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

    if (MAINPROCESS){
	printf("===================================\n");
	printf("MPI functionality tests\n");
	printf("===================================\n");
    }
    fapl = H5Pcreate (H5P_FILE_ACCESS);
    H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);

    if (parse_options(argc, argv) != 0){
	if (MAINPROCESS)
	    usage();
	goto finish;
    }

	MPI_BANNER("MPIO OFFSET overflow test...");
	test_mpio_offset();
	MPI_BANNER("MPIO GB size file test...");
	test_mpio_gb_file(filenames[0]);
	MPI_BANNER("MPIO independent overlapping writes...");
	test_mpio_overlap_writes(filenames[0]);

finish:
    if (MAINPROCESS){		/* only process 0 reports */
	printf("===================================\n");
	if (nerrors){
	    printf("***MPI tests detected %d errors***\n", nerrors);
	}
	else{
	    printf("MPI tests finished with no errors\n");
	}
	printf("===================================\n");
    }
    MPI_Finalize();
    h5_cleanup(FILENAME, fapl);
    /* always return 0 as this test is informational only. */
    return(0);
}