#!/bin/ksh
#**********************************************************************#
#*                                                                    *#
#* Copyright (c) 2002 by Sun Microsystems, Inc.                       *#
#* All rights reserved.                                               *#
#*                                                                    *#
#**********************************************************************#

# Sun Cluster Data Services Builder template version 1.0
#
# Probe method for scmbm.
#
# This method checks the health of the data service.

###############################################################################
# Parse program arguments.
#
function parse_args # [args ...]
{
	# Parse the -R, -G and -T options given to this method.
	#
	# Arguments: the command-line words of the method ("$@").
	# Sets:      RESOURCE_NAME, RESOURCEGROUP_NAME, RESOURCETYPE_NAME.
	# Exits:     with status 1, after logging an error, when an unknown
	#            option is seen or an option is missing its argument.
	typeset opt

	# The leading ':' puts getopts in silent mode: on an error OPTARG then
	# holds the offending option letter, so the log message can name it.
	# (Without it OPTARG is unset for a bad option and the message would
	# read "Option  unknown".)
	while getopts ':R:G:T:' opt
	do
		case "$opt" in

		R)
		# Name of the scmbm resource.
		RESOURCE_NAME=$OPTARG
		;;

		G)
		# Name of the resource group in which the resource is
		# configured.
		RESOURCEGROUP_NAME=$OPTARG
		;;

		T)
		# Name of the resource type.
		RESOURCETYPE_NAME=$OPTARG
		;;

		:)
		# A known option was given without its argument.
		logger -p "${SYSLOG_FACILITY}.err" \
		-t "[$RESOURCETYPE_NAME,$RESOURCEGROUP_NAME,$RESOURCE_NAME]" \
		"ERROR: Option $OPTARG requires an argument"
		exit 1
		;;

		*)
		# Anything else is an option this method does not know.
		logger -p "${SYSLOG_FACILITY}.err" \
		-t "[$RESOURCETYPE_NAME,$RESOURCEGROUP_NAME,$RESOURCE_NAME]" \
		"ERROR: Option $OPTARG unknown"
		exit 1
		;;

		esac
	done

}

###############################################################################
# restart_service ()
#
# This function tries to restart the dataservice by calling the STOP method
# followed by the START method of the dataservice. However, if the dataservice
# has already died and there is no tag registered for the dataservice under
# PMF, then we fail over the service to another node in the cluster.
#
function restart_service
{
	# Restart the data service on this node, or request a giveover of the
	# resource group when a local restart is not attempted.
	#
	# A local restart (the resource's STOP method followed by its START
	# method, each run under hatimerun with the matching timeout) is done
	# when the PMF tag is still registered or when PMF_RETRY_CNT is 0.
	# In every other case scha_control is asked for a GIVEOVER.
	#
	# Globals read: EMP_DEBUG, SYSLOG_FACILITY, SYSLOG_TAG, ARGV0, PMF_TAG,
	#               PMF_RETRY_CNT, RESOURCE_NAME, RESOURCEGROUP_NAME,
	#               RESOURCETYPE_NAME, RT_BASEDIR
	# Globals set:  STOP_TIMEOUT, STOP_METHOD, START_TIMEOUT, START_METHOD
	# Returns:      0 on success; 1 when the STOP method, the START method
	#               or the giveover request fails.
	typeset pmfstatus

	if [ "$EMP_DEBUG" = "ON" ]; then
		logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" \
			"${ARGV0} Probe starting function restart_service"
	fi

	# In order to restart the dataservice, first make sure that the
	# dataservice itself is still registered under PMF.
	pmfadm -q "$PMF_TAG" >/dev/null 2>&1
	pmfstatus=$?

	if [ "$EMP_DEBUG" = "ON" ]; then
		logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" \
			"${ARGV0} Probe: restart_service: PMF status=$pmfstatus, PMF Retry Count=$PMF_RETRY_CNT"
	fi

	# The two conditions are tested separately so that an empty
	# PMF_RETRY_CNT cannot turn the whole test into a syntax error (which
	# used to force the failover branch even for a registered tag). An
	# empty value is treated as "not zero".
	if [ "$pmfstatus" -eq 0 ] || [ "${PMF_RETRY_CNT:-1}" -eq 0 ]; then
		# Either the tag is still registered under PMF, or the PMF retry
		# count is 0: stop the dataservice and start it back again.
		if [ "$EMP_DEBUG" = "ON" ]; then
			logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Probe decided to restart scmbm service"
		fi

		# Obtain the STOP method name and the STOP_TIMEOUT value for
		# this resource.
		STOP_TIMEOUT=$(scha_resource_get -O STOP_TIMEOUT \
			-R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")
		STOP_METHOD=$(scha_resource_get -O STOP \
			-R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")

		if [ "$EMP_DEBUG" = "ON" ]; then
			logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Probe executes hatimerun -t $STOP_TIMEOUT $RT_BASEDIR/$STOP_METHOD -R $RESOURCE_NAME -G $RESOURCEGROUP_NAME -T $RESOURCETYPE_NAME"
		fi

		hatimerun -t "$STOP_TIMEOUT" "$RT_BASEDIR/$STOP_METHOD" \
			-R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME" \
			-T "$RESOURCETYPE_NAME" >/dev/null 2>&1
		if [ $? -ne 0 ]; then
			logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Stop method failed."
			return 1
		fi

		# Obtain the START method name and the START_TIMEOUT value for
		# this resource.
		START_TIMEOUT=$(scha_resource_get -O START_TIMEOUT \
			-R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")
		START_METHOD=$(scha_resource_get -O START \
			-R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")

		if [ "$EMP_DEBUG" = "ON" ]; then
			logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Probe executes hatimerun -t $START_TIMEOUT $RT_BASEDIR/$START_METHOD -R $RESOURCE_NAME -G $RESOURCEGROUP_NAME -T $RESOURCETYPE_NAME"
		fi

		hatimerun -t "$START_TIMEOUT" "$RT_BASEDIR/$START_METHOD" \
			-R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME" \
			-T "$RESOURCETYPE_NAME" >/dev/null 2>&1
		if [ $? -ne 0 ]; then
			logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Start method failed."
			return 1
		fi
	else
		# The TAG for the dataservice is not present, which implies that
		# the dataservice has already passed the max no of retries
		# allowed under PMF. There is no point in trying to restart it
		# again, so try to fail over to another node in the cluster.
		if [ "$EMP_DEBUG" = "ON" ]; then
			logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Probe decided to failover scmbm service"
		fi

		scha_control -O GIVEOVER -G "$RESOURCEGROUP_NAME" \
			-R "$RESOURCE_NAME"
		# Report a rejected giveover instead of claiming success, so the
		# caller can log that the service was neither restarted nor moved.
		if [ $? -ne 0 ]; then
			logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Failover attempt failed."
			return 1
		fi
	fi

	return 0
}

###############################################################################
# decide_restart_or_failover ()
#
# This function decides the action to be taken upon the failure of a probe.
# The action could be to restart the data service locally or it could be to
# fail over the data service to another node in the cluster.
#
function decide_restart_or_failover
{
	# Decide, after a failed probe, between a local restart and a giveover
	# of the resource group, and carry out that decision.
	#
	# Rules, in order:
	#   - RETRY_COUNT is 0: request a giveover, never restart locally.
	#   - retries is 0 (first failure): note the time, restart locally.
	#   - RETRY_INTERVAL has elapsed since start_time: start a new window
	#     and restart locally.
	#   - retries has reached RETRY_COUNT inside the window: giveover.
	#   - otherwise: count one more retry and restart locally.
	#
	# Globals read:    EMP_DEBUG, SYSLOG_FACILITY, SYSLOG_TAG, ARGV0,
	#                  RETRY_COUNT, RETRY_INTERVAL, RT_BASEDIR,
	#                  RESOURCE_NAME, RESOURCEGROUP_NAME
	# Globals updated: retries, start_time (they persist between calls)
	# Returns:         0 when the chosen action succeeded, 1 when
	#                  restart_service failed.
	# Exits:           with status 1 when a giveover request fails.
	typeset current_time time_diff

	if [ "$EMP_DEBUG" = "ON" ]; then
		logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" \
			"${ARGV0} Probe starting function decide_restart_or_failover"
	fi

	if [ "$RETRY_COUNT" -eq 0 ]; then
		# If Retry_Count is set to zero then do not try to restart on
		# this node. Just try to fail over to a different node.
		if [ "$EMP_DEBUG" = "ON" ]; then
			logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Probe decided to failover because retry count is set to $RETRY_COUNT, retries=$retries"
		fi
		retries=0
		scha_control -O GIVEOVER -G "$RESOURCEGROUP_NAME" \
			-R "$RESOURCE_NAME"
		if [ $? -ne 0 ]; then
			logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Failover attempt failed."
			exit 1
		fi
		# Stop here. Falling through with retries=0 would enter the
		# "first failure" branch below and restart the service locally
		# although a giveover has just been requested.
		return 0
	fi

	# Check if this is the first time we are trying to restart.
	if [ "$retries" -eq 0 ]; then
		# This is the first failure. Note the time of this first
		# attempt; it opens the retry window.
		if [ "$EMP_DEBUG" = "ON" ]; then
			logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Probe decided to restart scmbm service because of first failure retries=$retries"
		fi
		start_time=$("$RT_BASEDIR/gettime")
		retries=$((retries + 1))

		# Since this is the first ever failure, we shall try to restart
		# the dataservice.
		restart_service
		if [ $? -ne 0 ]; then
			logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Could not restart dataservice."
			return 1
		fi
		return 0
	fi

	# This is not the first failure.
	# expr (rather than shell arithmetic) is kept for the time difference:
	# the operands come from an external program, and with expr a
	# malformed value only yields an empty time_diff.
	current_time=$("$RT_BASEDIR/gettime")
	time_diff=$(expr "$current_time" - "$start_time")

	if [ "$time_diff" -ge "$RETRY_INTERVAL" ]; then
		# This failure happened after the time window elapsed, so we
		# reset the retries counter, slide the window, and do a retry.
		retries=1
		start_time=$current_time
		if [ "$EMP_DEBUG" = "ON" ]; then
			logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Probe decided to restart scmbm service because of failure after time window time_diff=$time_diff"
		fi
		restart_service
		if [ $? -ne 0 ]; then
			logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Could not restart dataservice"
			return 1
		fi
	elif [ "$retries" -ge "$RETRY_COUNT" ]; then
		# We are still within the time window, and the retry counter
		# expired. We have to fail over.
		if [ "$EMP_DEBUG" = "ON" ]; then
			logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Probe decided to failover because of failure within time window and retry counter expired, retries=$retries, time_diff=$time_diff"
		fi
		retries=0
		scha_control -O GIVEOVER -G "$RESOURCEGROUP_NAME" \
			-R "$RESOURCE_NAME"
		if [ $? -ne 0 ]; then
			logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Failover attempt failed."
			exit 1
		fi
	else
		# We are still within the time window, and the retry counter
		# has not expired, so do another retry.
		if [ "$EMP_DEBUG" = "ON" ]; then
			logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Probe decided to restart scmbm service because of failure within time window but retry counter not expired, retries=$retries, time_diff=$time_diff"
		fi
		retries=$((retries + 1))
		restart_service
		if [ $? -ne 0 ]; then
			logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Could not restart dataservice"
			return 1
		fi
	fi

	return 0
}


###############################################################################
# MAIN
###############################################################################

export PATH=/bin:/usr/bin:/usr/cluster/bin:/usr/sbin:/usr/proc/bin:$PATH

# Every log message in this script is prefixed with ${ARGV0}, but nothing
# else in the script assigns it, so the prefix would be empty. Default it to
# the name this method was invoked as; a value inherited from the environment
# is kept.
ARGV0=${ARGV0:-${0##*/}}

# Obtain the syslog facility to use. This will be used to log the messages.
SYSLOG_FACILITY=$(scha_cluster_get -O SYSLOG_FACILITY)

# Keep the raw argument list for the debug message further down.
DEBUG_PARMS="$*"

# Parse the arguments that have been passed to this method
parse_args "$@"

SYSLOG_TAG=$RESOURCETYPE_NAME,$RESOURCEGROUP_NAME,$RESOURCE_NAME
PMF_TAG=$RESOURCEGROUP_NAME,$RESOURCE_NAME,0.svc

# NOTE on the extension-property lookups below: each one takes the SECOND
# whitespace-separated word of the scha_resource_get output. The "echo $var"
# is deliberately left unquoted so that the output is word-split onto a
# single line before awk sees it (presumably the command prints the property
# type and its value on separate lines -- confirm against the man page).
emp_debug_info="$(scha_resource_get -O Extension -R "$RESOURCE_NAME" \
	-G "$RESOURCEGROUP_NAME" Debug)"
EMP_DEBUG=$(echo $emp_debug_info | awk '{print $2}')
export EMP_DEBUG

# The interval at which probing is to be done is set in the system defined
# property THOROUGH_PROBE_INTERVAL. Obtain this information using
# scha_resource_get
PROBE_INTERVAL=$(scha_resource_get -O THOROUGH_PROBE_INTERVAL \
	-R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")

# Obtain the time-out value allowed for the probe. This value is set in the
# extension property Probe_timeout of the data service.
probe_timeout_info=$(scha_resource_get -O Extension -R "$RESOURCE_NAME" \
	-G "$RESOURCEGROUP_NAME" Probe_timeout)
PROBE_TIMEOUT=$(echo $probe_timeout_info | awk '{print $2}')

# We need to know the full path for the gettime utility which resides in the
# directory <RT_BASEDIR>. Get this from the RT_BASEDIR property of the
# resource type.
RT_BASEDIR=$(scha_resource_get -O RT_BASEDIR -R "$RESOURCE_NAME" \
	-G "$RESOURCEGROUP_NAME")

# We need to know the full path for the mbm utilities which reside in the
# directory <EMP_BASEDIR>. Get this from the EMP_BASEDIR extension property.
emp_basedir_info="$(scha_resource_get -O Extension -R "$RESOURCE_NAME" \
	-G "$RESOURCEGROUP_NAME" EMP_BASEDIR)"
EMP_BASEDIR=$(echo $emp_basedir_info | awk '{print $2}')

EMP_SETUP=$EMP_BASEDIR/batchenv

# We need to know the mbm administrator name to start the mbm utilities
# with his/her user id. Get this from the EMP_ADMIN extension property.
emp_admin_info="$(scha_resource_get -O Extension -R "$RESOURCE_NAME" \
	-G "$RESOURCEGROUP_NAME" EMP_ADMIN)"
EMP_ADMIN=$(echo $emp_admin_info | awk '{print $2}')

# Get the Retry count value from the system defined property Retry_count
RETRY_COUNT=$(scha_resource_get -O RETRY_COUNT -R "$RESOURCE_NAME" \
	-G "$RESOURCEGROUP_NAME")

# Get the Retry Interval value from the system defined property Retry_interval
RETRY_INTERVAL=$(scha_resource_get -O RETRY_INTERVAL -R "$RESOURCE_NAME" \
	-G "$RESOURCEGROUP_NAME")

# PMF retry count of the service; restart_service treats 0 as a special case.
emp_pmf_retry_cnt=$(scha_resource_get -O Extension -R "$RESOURCE_NAME" \
	-G "$RESOURCEGROUP_NAME" PMF_Retry_Count)
PMF_RETRY_CNT=$(echo $emp_pmf_retry_cnt | awk '{print $2}')

# Delay, in seconds, before the first probe is attempted.
emp_start_interval_info="$(scha_resource_get -O Extension -R "$RESOURCE_NAME" \
	-G "$RESOURCEGROUP_NAME" Probe_Start_Interval)"
EMP_PROBE_START_INTERVAL=$(echo $emp_start_interval_info | awk '{print $2}')

# Number of restarts done in the current retry window; maintained by
# decide_restart_or_failover.
typeset -i retries=0

hostnames=$("$RT_BASEDIR/gethostnames" -R "$RESOURCE_NAME" \
	-G "$RESOURCEGROUP_NAME" -T "$RESOURCETYPE_NAME")

# Command line handed to "su ... -c" in the probe loop. The embedded quotes
# and the redirection are interpreted by the shell that su starts, not here.
probe_cmd_args="$EMP_BASEDIR/pack/scbin/mbm_probe -S $EMP_SETUP -D $EMP_DEBUG -I \"$PMF_TAG\" -L \"$SYSLOG_TAG\" -F \"$SYSLOG_FACILITY\" >/dev/null 2>&1"

probe_cmd_prog="$EMP_BASEDIR/pack/scbin/mbm_probe"

if [ "$EMP_DEBUG" = "ON" ]
then
	logger -p "${SYSLOG_FACILITY}.notice" \
		-t "[$SYSLOG_TAG]" \
		"${ARGV0} Starting Probe command for scmbm with parameters $DEBUG_PARMS"
fi

# User added code -- BEGIN vvvvvvvvvvvvvvv
# User added code -- END   ^^^^^^^^^^^^^^^

if [ "$EMP_DEBUG" = "ON" ]
then
	logger -p "${SYSLOG_FACILITY}.notice" \
		-t "[$SYSLOG_TAG]" \
		"${ARGV0} Probe command for scmbm waiting before going in loop $EMP_PROBE_START_INTERVAL seconds"
fi
# An empty Probe_Start_Interval means "no initial delay" rather than a sleep
# usage error.
sleep "${EMP_PROBE_START_INTERVAL:-0}"

# Log "$1" to syslog at notice level, but only when the Debug extension
# property (EMP_DEBUG) is ON. Used by the probe loop below.
function probe_debug # message
{
	if [ "$EMP_DEBUG" = "ON" ]; then
		logger -p "${SYSLOG_FACILITY}.notice" -t "[$SYSLOG_TAG]" "$1"
	fi
}

# Endless probe loop: sleep, run the probe, and on failure let
# decide_restart_or_failover pick between a local restart and a giveover.
while :
do
	probe_debug "${ARGV0} Probe command for scmbm at begin of loop waiting $PROBE_INTERVAL seconds"

	# The interval at which the probe needs to run is specified
	# in the property THOROUGH_PROBE_INTERVAL. So we need to
	# sleep for a duration of <THOROUGH_PROBE_INTERVAL>.
	# If the property could not be read, fall back to 60 seconds instead
	# of letting sleep fail at once, which would turn this loop into a
	# busy loop of probes.
	# NOTE(review): 60 is believed to be the documented default of
	# Thorough_probe_interval -- confirm.
	sleep "${PROBE_INTERVAL:-60}"

	# User added code -- BEGIN vvvvvvvvvvvvvvv
	# User added code -- END   ^^^^^^^^^^^^^^^

	if [[ -f $probe_cmd_prog && -x $probe_cmd_prog ]]; then
		probe_debug "${ARGV0} Probe executes su $EMP_ADMIN -c hatimerun -t $PROBE_TIMEOUT $probe_cmd_args"

		# Run the mbm probe as the mbm administrator, bounded by the
		# probe timeout.
		su "$EMP_ADMIN" -c "hatimerun -t $PROBE_TIMEOUT $probe_cmd_args" >/dev/null 2>&1
		ret_status=$?

		probe_debug "${ARGV0} Probe $probe_cmd_args returns $ret_status"

		# User added code -- BEGIN vvvvvvvvvvvvvvv
		# User added code -- END   ^^^^^^^^^^^^^^^
	else
		# The mbm_probe program is missing or not executable. Report
		# that and fall back to the simple_probe utility found in
		# RT_BASEDIR.
		logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
			"${ARGV0} Could not find or execute $probe_cmd_prog. "

		probe_debug "${ARGV0} Probe executes simple probe $RT_BASEDIR/simple_probe"

		hatimerun -t "$PROBE_TIMEOUT" "$RT_BASEDIR/simple_probe" \
			-R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME" \
			-T "$RESOURCETYPE_NAME" >/dev/null 2>&1
		ret_status=$?

		probe_debug "${ARGV0} Probe simple probe returns $ret_status"

		# User added code -- BEGIN vvvvvvvvvvvvvvv
		# User added code -- END   ^^^^^^^^^^^^^^^
	fi

	if [ "$ret_status" -ne 0 ]; then
		# User added code -- BEGIN vvvvvvvvvvvvvvv
		# User added code -- END   ^^^^^^^^^^^^^^^

		probe_debug "${ARGV0} Probe to decide to restart or failover"

		decide_restart_or_failover
		if [ $? -ne 0 ]; then
			logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
				"${ARGV0} Could not Restart/Failover the dataservice."
		fi
	else
		probe_debug "${ARGV0} Probe for resource scmbm successful"
	fi

	# User added code -- BEGIN vvvvvvvvvvvvvvv
	# User added code -- END   ^^^^^^^^^^^^^^^
done
