#! /bin/sh
# runs the hdf5/bin/snapshots
#
# $Id$

# local setup
# A non-empty DEBUGMODE is announced here; it also turns the snapshot
# command below into an "echo" of that command and selects the debug
# recipient for TOWHOM.
DEBUGMODE=""
test -n "$DEBUGMODE" && echo "******** DEBUGMODE is $DEBUGMODE ************"

# the name of this program
PROGNAME=$HOME/bin-sys/runsnap

# Email any errors to whom
TOWHOM=${DEBUGMODE:+acheng}
TOWHOM=${TOWHOM:-hdf5lib@ncsa.uiuc.edu}

# Setup
# Short host name: drop the NCSA domain suffix.  The dots are escaped and
# the pattern is anchored at the end, so only a literal trailing
# ".ncsa.uiuc.edu" is removed.
HOSTNAME=`hostname | sed -e 's/\.ncsa\.uiuc\.edu$//'`
H5DIR=$HOME/HDF5/v_1_3/hdf5
TODAY=`date +%y%m%d`
H5VER=			# default to current CVS version
H5VERSTR=		# default to current CVS version

# Default to do checkout (only once) and test, no release.
# If srcdir is not used, don't launch multiple tests
SNAPSHOT="${DEBUGMODE:+echo }bin/snapshot"
SRCDIR="srcdir"
SNAPCMD="$SRCDIR op-configure --enable-stream-vfd op-configure --enable-static-exec test clean"
ENABLE_PARALLEL="op-configure --enable-parallel"

# various hosts
# DEC
# Gondolin: DEC
# No DEC host is listed at present: skydive is being upgraded and
# gondolin rsh/ssh don't work.  DECHOST is still assigned (empty) so that
# ALLHOSTS below cannot pick up a DECHOST value from the environment.
#DECHOST="skydive"
DECHOST=""
# HP
# sangamon: HPUX 10
# opus: HPUX 11
HPHOST="sangamon"	# HPUX 10 & 11 # opus removed because both its
			# NFS and AFS filesystems have problems.
# Linux
# Dangermouse, eirene: Linux
# Dangermouse used to die if gmake -j is used.
LINUXHOST=eirene

# SGI O2K
# modi4:	R10K,  IRIX64 6.5, default to -64,-mips4
#		Testing {parallel,serial}x{-64,-n32}x
O2KHOST=modi4
# regular, unleaded, premium:
#		R10K,  IRIX64 6.5, default to -n32,-mips4
# impact7:	R4400, IRIX 6.5,   default to -n32,-mips3
# o2-N:		R10K,  IRIX 6.5,   default to -n32,-mips4
# paz:		R4400, IRIX 6.5,   default to -n32,-mips3
SGIHOST="regular impact7 o2-3"

# Sun
SUNHOST="arabica baldric"	# solaris 2.6 and 2.7

# FreeBSD
# AFS does not work well in hawkwind.  Use NFS/local space for its
# test directory.  ssh does not work for it either.
FREEBSDHOST="hawkwind"

# run both serial and parallel for PARALLELHOST
PARALLELHOST="modi4"

# set up default all hosts to test
# (a blank-separated list; an empty group simply contributes no host)
ALLHOSTS="$O2KHOST $SUNHOST $SGIHOST $HPHOST $LINUXHOST $DECHOST $FREEBSDHOST"

# test hosts
TESTHOST=""

#################################
# Function definitions
#################################
# Print the current time of day as a number of seconds since midnight.
SecOfDay()
{
    # hour, minute and second become $1, $2 and $3 of this function
    set `date '+%H %M %S'`
    # (hours * 60 + minutes) * 60 + seconds
    t_sec=`expr \( $1 \* 60 + $2 \) \* 60 + $3`
    echo "$t_sec"
}

# Print the elapsed time between two second-of-day values as "<M>m <S>s".
# $1--start time, $2--end time (both in seconds)
# When the end time is smaller than the start time, the clock is assumed
# to have passed midnight once in between.
ElapsedTime()
{
    t_sec=`expr $2 - $1`
    if [ $t_sec -lt 0 ]; then
	# wrapped around midnight: add one whole day (24 * 3600 seconds)
	t_sec=`expr 86400 + $t_sec`
    fi
    echo "`expr $t_sec / 60`m `expr $t_sec % 60`s"
}

# Report errors
# $1--an error message to be printed
# The message is printed on stdout inside a banner and is also appended,
# preceded by the output of date, to the file named by $FAILEDLOG.
# The message is left in the global ERRMSG.
REPORT_ERR()
{
    ERRMSG=$1
    # print it with a banner shifted right a bit
    echo "		*************************************"
    echo "		$ERRMSG"
    echo "		*************************************"
    # report it in the FAILED-LOG file too
    # (the name is quoted so that a path containing blanks is still
    # treated as one file)
    (date; echo "$ERRMSG") >> "$FAILEDLOG"
}

#
# Report the outcome of the last test done.
# Reads $retcode (0 means success), $TEST_TYPE and $HOSTNAME.
REPORT_RESULT()
{
    if [ $retcode -eq 0 ]; then
	echo "$TEST_TYPE tests succeeded in $HOSTNAME"
	return
    fi
    # anything else counts as a failed test
    REPORT_ERR "****$TEST_TYPE tests FAILED in $HOSTNAME****"
}

# Write one empty line to stdout
PRINT_BLANK()
{
    echo ""
}

# Print the trailer summary of the tests just finished: a "finished"
# line, the current date, the time elapsed since $StartTime and a blank
# line.  $EndTime is set to the current time and $StartTime is restarted.
PRINT_TRAILER()
{
    echo "*** finished $TEST_TYPE tests in $HOSTNAME ***"
    date
    EndTime=`SecOfDay`
    echo "Total time = `ElapsedTime $StartTime $EndTime`"
    PRINT_BLANK
    # the next elapsed time report is measured from this point on
    StartTime=`SecOfDay`
}

# Find a remote command that can reach a host and leave its name in $RSH.
# rsh is tried first, then ssh.  $RSH is set to "false" when the host does
# not answer ping or when neither command can run "hostname" on it.
# $1--hostname to reach.
CHECK_RSH()
{
    # The way to call ping is worked out only once; afterwards $PING
    # (and $PINGCOUNT) are simply reused.
    # Some hosts use "ping host count", some use "ping -c count host".
    # The "ping -c ..." style is probed first because some '-c' machines
    # take the command 'ping localhost 3' to mean: ping host '3'.
    if [ -z "$PING" ]; then
	PINGCOUNT=
	if ping -c 3 localhost >/dev/null 2>&1; then
	    PING='ping -c 3'
	elif ping localhost 3 >/dev/null 2>&1; then
	    PING=ping
	    PINGCOUNT=3
	else
	    # don't know how to use ping; "false" makes every host look down
	    PING=false
	fi
    fi

    host=$1
    # stays "false" unless one of the remote commands below works
    RSH="false"
    $PING $host $PINGCOUNT >/dev/null 2>&1 || {
	echo $host is down
	return 0
    }
    if rsh $host -n hostname >/dev/null 2>&1; then
	RSH=rsh
    elif ssh $host -n hostname >/dev/null 2>&1; then
	RSH=ssh
    else
	echo cannot remote command with $host
    fi
}

# Try to locate the HDF4 library
# This is a hack because there is no consistent place to find
# the valid HDF library.
#
# H4_SW is set to "<dir>/include,<dir>/lib" for the first candidate
# directory that holds both lib/libdf.a and include/hdf.h, and H4_BIN to
# "<dir>/bin" when that directory also has bin/hdp.  Both stay empty when
# no candidate qualifies.  The candidates are chosen from `uname -s` and,
# on IRIX64, from $CC (or from the default abi reported by cc).
LOCATE_HDF4()
{
    H4_SW=
    H4_BIN=
    OS=`uname -s`
    # this default is the best guess of locating hdf4 software
    h4paths_defaults="/usr/ncsa /usr/sdt /usr/local"

    # start from the generic guess; the known platforms override it
    h4paths="$h4paths_defaults"
    case "$OS" in
	HP-UX)
	    h4paths="/afs/ncsa/packages/hdf/HPUX_10.20"
	    ;;
	IRIX)
	    h4paths="/afs/ncsa/packages/hdf/4.1r3_irix"
	    ;;
	IRIX64)
	    # the plain IRIX64 directory is used unless -n32 is detected
	    h4paths="/afs/ncsa/packages/hdf/IRIX64_6.5"
	    case "$CC" in
	    cc|"")	# default cc: ask it which abi it defaults to
		abi=`cc -show_defaults 2>&1 | grep 'default abi'`
		case $abi in
		*-n32)
		    h4paths="/afs/ncsa/packages/hdf/IRIX64-n32_6.5"
		    ;;
		esac	# $abi
		;;
	    *-n32)
		h4paths="/afs/ncsa/packages/hdf/IRIX64-n32_6.5"
		;;
	    esac
	    ;;
	Linux)
	    h4paths="/afs/ncsa/packages/hdf/linux"
	    ;;
	OSF1)
	    h4paths="/afs/ncsa/packages/hdf/OSF1_V4.0"
	    ;;
    esac

    # take the first candidate where the hdf4 software is actually available
    for h4 in $h4paths; do
	test -f $h4/lib/libdf.a && test -f $h4/include/hdf.h || continue
	H4_SW="$h4/include,$h4/lib"
	if [ -f $h4/bin/hdp ]; then
	    H4_BIN=$h4/bin
	fi
	break
    done
}


# Run one snapshot test
# $*--Types of test being run: any of "standard", "parallel" and "-n32"
#
# The outcome is left in globals: $retcode is the status of this run and
# $errcode is set to it whenever it is not zero.  The output of the
# snapshot command is appended to $LOGFILE.  Nothing is run (retcode=1)
# for an unknown test type or when $LOGFILE already exists.
# $CC may be changed while the test runs; it is put back to its previous
# value on every way out of this function.
RUNSNAPTEST()
{
    SNAPCMD_OPT=""		# snapshot test option
    SRCDIRNAME=""
    CC_SAVED=$CC
    TEST_TYPE=$*
    retcode=0
    date
    echo "*** starting $TEST_TYPE tests in $HOSTNAME ***"
    echo "Uname -a: `uname -a`"

    # parse the test type and set options accordingly
    while [ $# -gt 0 ]; do
	case $1 in
	    -n32) # want -n32 option
		SRCDIRNAME=${SRCDIRNAME}-n32
		CC="cc -n32"
		export CC
		;;
	    parallel) # want parallel test
		SNAPCMD_OPT="$SNAPCMD_OPT $ENABLE_PARALLEL"
		SRCDIRNAME=${SRCDIRNAME}-pp
		;;
	    standard) # standard test
		;;
	    *) # unknown test
		echo "$0: unknown type of test ($1)"
		retcode=1
		;;
	esac
	shift
    done
    if [ $retcode -ne 0 ]; then
	# restore CC: an earlier -n32 argument may have changed it
	CC=$CC_SAVED
	errcode=$retcode
	return $retcode
    fi

    # Track down the HDF4 software
    LOCATE_HDF4
    if [ -n "$H4_SW" ]; then
	SNAPCMD_OPT="$SNAPCMD_OPT hdf4 $H4_SW"
    fi
    if [ -n "$H4_BIN" ]; then
	# append the HDF4 tools directory only if PATH does not have it
	# yet, so that repeated calls do not keep growing PATH
	case ":${PATH}:" in
	    *":${H4_BIN}:"*)
		;;
	    *)
		PATH=${PATH}:${H4_BIN}
		;;
	esac
    fi

    if [ -n "${SRCDIRNAME}" ]; then
	SNAPCMD_OPT="$SNAPCMD_OPT srcdirname ${SRCDIRNAME}"
    fi

    # If LOGFILE already exists, it means this host has been tested today.
    # Do at most one run per day.
    LOGFILE=${LOGBASENAME}${SRCDIRNAME}_${TODAY}
    if [ -f "$LOGFILE" ]; then
	echo LOGFILE $LOGFILE exists.  No more run today.
	CC=$CC_SAVED
	retcode=1
	errcode=$retcode
	return $retcode
    fi

    echo Running snapshot with output saved in $LOGFILE
    (date; echo Hostname=$HOSTNAME) >> "$LOGFILE"

    (
    # never start the snapshot command from the wrong directory
    cd "$H5DIR" || exit 1
    # left unquoted on purpose: each of these holds several words
    $SNAPSHOT $SNAPCMD $SNAPCMD_OPT
    ) >> "$LOGFILE" 2>&1
    retcode=$?
    [ $retcode -ne 0 ] && errcode=$retcode

    date >> "$LOGFILE"

    # restore CC
    CC=$CC_SAVED
}

# Flush the AFS files if applicable.
# Hopefully the flushing is done when the tests of this
# host are done rather than when the launching site try
# to pull them in at the same time.  This way, the afs
# server updates are spread out.
#
# The AFS "fs" command is taken from $AFS_FS when that is set, otherwise
# from /usr/afsws/bin/fs.  On a host where that command is not installed
# nothing is done and nothing is printed: this function is called after
# the last trailer, and the launching host looks for the "Total time"
# line in the last two lines of the output.
FLUSH_FILES()
{
    fs_cmd=${AFS_FS:-/usr/afsws/bin/fs}
    if [ -x "$fs_cmd" ]; then
	"$fs_cmd" flush $SNAPYARD
    fi
}


#################################
# Main
#################################
#################################
# Set up global variables
#################################
retcode=0			# error code of individual task
errcode=0			# error code of the whole test


#################################
# Parse options
#################################
# Arguments understood:
#   -r<version>	use the source tree $HOME/HDF5/v_<version>/hdf5 (dots in
#		<version> become underscores); the option is also added
#		to PROGNAME
#   -all	test all hosts in $ALLHOSTS
#   anything else ends the option parsing: it and all arguments after it
#		become the list of hosts to test
until [ $# -eq 0 ]; do
    case "$1" in
	-all)
	    TESTHOST=$ALLHOSTS
	    shift
	    ;;
	-r*)
	    # the version string has a leading _ but not for H5DIR name
	    H5VER="$1"
	    H5VERSTR=_`echo $H5VER | sed -e s/-r// -e s/\\\./_/g`
	    H5DIR=$HOME/HDF5/v$H5VERSTR/hdf5
	    PROGNAME="$PROGNAME $H5VER"
	    shift
	    ;;
	*)
	    TESTHOST=$*
	    break
	    ;;
    esac
done

#################################
# Setup snapshot test directories
#################################
# Show the real physical path rather than the symbolic path
SNAPYARD=`cd $HOME/snapshots-hdf5${H5VERSTR} && /bin/pwd`
# An empty SNAPYARD means the snapshot directory could not be entered.
# All the log names below would then start at "/log", so stop here with
# a clear message instead.
if [ -z "$SNAPYARD" ]; then
    echo "$0: cannot access $HOME/snapshots-hdf5${H5VERSTR}" >&2
    exit 1
fi
# Log file basename
LOGBASENAME=${SNAPYARD}/log/${HOSTNAME}
FAILEDLOG=${SNAPYARD}/log/FAILED_LOG_${TODAY}
CVSLOG=${SNAPYARD}/log/CVS_LOG_${TODAY}


#################################
# Setup to print a trailer summary when exiting not via
# the normal end of the script.
#################################
trap PRINT_TRAILER 0

# starting point of the elapsed time reported by PRINT_TRAILER
StartTime=`SecOfDay`

# Do a checkout if one has not been done today
# Also check MANIFEST file
# The existence of $CVSLOG is what marks the checkout as done for today.
if [ ! -f "$CVSLOG" ]; then
    echo Running CVS checkout with output saved in $CVSLOG
    # "cd ... &&": do not run the checkout from some other directory
    # when the source directory cannot be entered
    (cd "$H5DIR" && $SNAPSHOT checkout ) >> "$CVSLOG" 2>&1
    errcode=$?
    if [ $errcode -ne 0 ]; then
	# checkout failed; nothing can be tested without the source.
	REPORT_ERR "****CVS checkout FAILED in $HOSTNAME****"
	exit $errcode
    fi
    echo Checking MANIFEST file ...
    (cd "$H5DIR" && bin/chkmanifest)
    errcode=$?
    if [ $errcode -ne 0 ]; then
	# check failed.  This is reported but is not fatal: the script
	# goes on with errcode left non-zero.
	REPORT_ERR "****MANIFEST check FAILED****"
    fi
    PRINT_BLANK
fi

# Decide to do test for the local host or for remote hosts
# Tests are launched on other hosts when a host list was given, unless
# that list is exactly the name of this host.  Each host runs $PROGNAME
# through the remote command chosen by CHECK_RSH; its output is collected
# in a file "#<host>.out" and printed once all hosts are done.
if [ -n "$TESTHOST" ] && [ "$HOSTNAME" != "$TESTHOST" ]; then
    date
    echo "*** launching tests from $HOSTNAME ***"
    PRINT_BLANK
    TEST_TYPE="launching"
    # the per-host output files are created in, and removed from, the
    # log directory; do not go on from some other directory
    cd "${SNAPYARD}/log" || exit 1
    for h in $TESTHOST; do
	TMP_OUTPUT="#$h.out"
	CHECK_RSH $h
	# launch concurrent tests only if srcdir is used
	if [ -n "$SRCDIR" ]; then
	    (echo $RSH $h -n $PROGNAME;
	    $RSH $h -n $PROGNAME) > "$TMP_OUTPUT" 2>&1 &
	else
	    (echo $RSH $h -n $PROGNAME;
	    $RSH $h -n $PROGNAME) > "$TMP_OUTPUT" 2>&1
	fi
    done
    # wait for all launched tests to finish, then cat them back out.
    wait
    for h in $TESTHOST; do
	TMP_OUTPUT="#$h.out"
	cat "$TMP_OUTPUT"
	# Verify test script did complete by checking the last lines:
	# a "Total time" line must be among the last two lines
	(tail -2 "$TMP_OUTPUT" | grep -s '^Total time' > /dev/null 2>&1) ||
	    REPORT_ERR "****snaptest FAILED to complete in $h****"
	rm "$TMP_OUTPUT"
    done
    # NOTE(review): the exit status is 0 even when a host did not
    # complete; such failures are only reported through REPORT_ERR.
    exit 0
fi

# Running the standard test in this host.
RUNSNAPTEST standard
REPORT_RESULT
PRINT_TRAILER

#
# Running the parallel test if this is parallel host too
# PARALLELHOST is a blank-separated list.  The host name must match one
# of its words exactly; being part of a listed name is not enough.
case " $PARALLELHOST " in
    *" $HOSTNAME "*)
	RUNSNAPTEST parallel
	REPORT_RESULT
	PRINT_TRAILER
	;;
esac

#
# If this is modi4, run -n32 tests too.
if [ "$HOSTNAME" = modi4 ]; then
    #
    # Serial test
    RUNSNAPTEST -n32 standard
    REPORT_RESULT
    PRINT_TRAILER
    # parallel test
    RUNSNAPTEST -n32 parallel
    REPORT_RESULT
    PRINT_TRAILER
fi

FLUSH_FILES

# disable trailer summary printing since all trailers have been
# printed and we are exiting normally.
trap 0
exit $errcode