#!/bin/ksh
#
#pragma	ident	"@(#)monitor_rpcbind.sh 1.3 00/01/10"
#Copyright (C)	1997 Sun Microsystems, Inc.
#

##########################################################################
# monitor_rpcbind.sh							 #
#                                                                        #
# Check whether the rpcbind daemon is running.  If not, restart it.      #
# We can attempt to restart rpcbind however only if it aborted or was    #
# killed with a SIGINT or SIGTERM. In this case it creates the files     #
# /tmp/portmap.file and /tmp/rpcbind.file. Starting rpcbind with -w      #
# (warm start) allows it to pickup from where it left off. Here is       #
# the man page snippet on this:					         #
#     -w   Do a warm start.  If rpcbind aborts  or  terminates  on       #
#          SIGINT  or  SIGTERM,  it will write the current list of       #
#          registered   services    to    /tmp/portmap.file    and       #
#          /tmp/rpcbind.file.  Starting rpcbind with the -w option       #
#          instructs it to look for these files and  start  opera-       #
#          tion  with the registrations found in them. This allows       #
#          rpcbind to resume operation without requiring  all  RPC       #
#          services to be restarted.       				 #
#									 #
# We may abort the node if rpcbind is not warm startable (no file in	 #
# /tmp, thus implying that rpcbind was killed with SIGKILL)		 #
#                                                                        #
# This program has its tunable values in the cdb. These control		 #
# the number of allowed retries, the interval between polling and	 #
# the behaviour on failure - to abort the node or just warn and exit	 #
#                                                                        #
# This program is mostly an infinite loop.  It exits only when it        #
# gets killed by the sc2.X framework.					 #
# The exit code is uninteresting.                                        #
##########################################################################

#
# cleanup()
#
# Invoked from the signal trap installed by the main program.
# Terminates the monitor right away with exit status 0.
#
cleanup() {
	exit 0
}

# What we really want to do here is just abort the node from
# the cluster membership. However, due to some interactions
# between some of the cluster components and rpcbind, what really
# happens is that we will failfast the node. I guess this would
# be ok since upon the reboot rpcbind will be restarted by the
# system. It would be nice, however, if we could exit the cluster
# membership without a panic.
#
# relinquish()
#
# Last-resort handler used when rpcbind is down and cannot be restarted.
# This function never returns to its caller:
#   ACTION is "abort" - logs message 5010, syncs, sends SIGKILL to the
#                       process(es) reported by "getpid clustd", exit 0.
#   anything else     - logs message 5011 and exits with status 1.
#
# Globals read: ACTION, DEBUG, DEBUGLOG, pre
#
relinquish()
{
	typeset clpid

	# Quoted so that an empty/unset ACTION is simply "not abort" instead
	# of a test(1) syntax error.
	if [ "${ACTION}" = "abort" ] ; then
		clpid=$(getpid clustd)
		if [ "${DEBUG}" -eq 1 ] ; then
			/usr/bin/date >> "${DEBUGLOG}"
			echo "rpcbind not running - aborting the node." >> "${DEBUGLOG}"
		fi
		log_info "${pre}.5010" "rpcbind is not running on this node and \
cannot be restarted. This node will be aborted."
		sync;sync;sync
		# Only signal when a clustd PID was actually found; a bare
		# "kill -9" merely prints a usage error.
		if [ -n "${clpid}" ] ; then
			# Deliberately unquoted: getpid may report several
			# PIDs separated by blanks.
			kill -9 ${clpid}
		fi
		exit 0
	else
		log_info "${pre}.5011" "rpcbind is not running on this node and \
cannot be restarted. Selected action is to continue operation."
		if [ "${DEBUG}" -eq 1 ] ; then
			/usr/bin/date >> "${DEBUGLOG}"
			echo "rpcbind not running - alerting only." >> "${DEBUGLOG}"
		fi
		exit 1
	fi
}

#
# check_rpcbind()
#
# Probes the local rpcbind by running rpcinfo against program 100000
# over UDP on 127.0.0.1.
#
# Returns 0 when the probe succeeds (NUM_NORESPONSE is reset to 0).
# Returns 1 when the probe fails; a timestamp and the rpcinfo output are
# appended to DEBUGLOG in that case, whatever the DEBUG setting is.
#
# Globals read:    DEBUG, DEBUGLOG
# Globals written: NUM_NORESPONSE
#
check_rpcbind()
{
	typeset probe_output

	if [ "${DEBUG}" -eq 1 ] ; then
		echo "$(/usr/bin/date) - rpcinfo probe started." >> "${DEBUGLOG}"
	fi

	# The assignment carries the exit status of rpcinfo.
	if probe_output=$(rpcinfo -T udp 127.0.0.1 100000 2>&1) ; then
		NUM_NORESPONSE=0
		return 0
	fi

	/usr/bin/date >> "${DEBUGLOG}"
	# Quoted so the diagnostic is logged verbatim (no globbing and no
	# collapsing of whitespace).
	echo "${probe_output}" >> "${DEBUGLOG}"
	return 1
}
#
# restart_rpcbind()
#
# Attempts a warm restart of rpcbind ("rpcbind -w").  A warm restart is
# only tried when at least one of /tmp/portmap.file or /tmp/rpcbind.file
# exists; otherwise, or when "rpcbind -w" itself fails, relinquish is
# called to take the configured action.
#
# Increments NUM_RESTARTS on every call and returns 0.
#
# Globals read:    DEBUG, DEBUGLOG, pre
# Globals written: NUM_RESTARTS
#
restart_rpcbind()
{
	let NUM_RESTARTS=NUM_RESTARTS+1

	if [[ ! -f /tmp/portmap.file && ! -f /tmp/rpcbind.file ]] ; then
		# Neither state file exists, so no warm start is attempted.
		log_info "${pre}.4502" \
		    "rpcbind is not running -- manual reboot may be needed"
		relinquish
		return 0
	fi

	# The continuation line starts in column 0 on purpose: anything
	# placed before "be possible" would end up inside the message.
	log_info "${pre}.5001" "rpcbind is not running but warm restart seems to \
be possible. Will attempt to restart. Restart ${NUM_RESTARTS}."
	if [ "${DEBUG}" -eq 1 ] ; then
		/usr/bin/date >> "${DEBUGLOG}"
		echo "Attempting to warm start rpcbind." >> "${DEBUGLOG}"
	fi

	/usr/sbin/rpcbind -w
	if [ $? -ne 0 ] ; then
		log_info "${pre}.3001" "Failed to restart rpcbind -w."
		relinquish
	fi

	# Everything is ok. Get rid of the /tmp state files in case rpcbind
	# dies again: "rpcbind -w" does not remove them itself and stale
	# files might create problems later.
	/bin/rm -rf /tmp/portmap.file /tmp/rpcbind.file
	return 0
}
#
# report_status()
#
# Appends a start-up banner, the current date and the values of the
# rpcmon.* operating parameters (ACTION, RETRIES, IVAL, NORESPONSE)
# to the debug log file.
#
report_status()
{
	# One grouped redirection instead of one per line; the text that
	# reaches the log is the same.
	{
		echo "monitor_rpcbind starting."
		/usr/bin/date
		echo "rpcmon.action : ${ACTION}"
		echo "rpcmon.retries : ${RETRIES}"
		echo "rpcmon.ival : ${IVAL}"
		echo "rpcmon.noresponse : ${NORESPONSE}"
	} >> ${DEBUGLOG}
}

#
# getpid()
#
# Prints the first column (the PID) of every "ps -e" line that contains
# $1 as a whole word.  Several matches are printed on one line separated
# by blanks; an empty line is printed when nothing matches or when $1 is
# empty.
#
getpid()  {
	typeset pid

	pid=
	# Skip the lookup for an empty name: grep would otherwise be run
	# without a pattern and only produce a usage error.
	if [ -n "$1" ] ; then
		pid=$(/usr/bin/ps -e | /usr/bin/grep -w "$1" | \
			/usr/bin/sed -e 's/^  *//' -e 's/ .*//')
	fi
	# Deliberately unquoted: folds one-PID-per-line into a single
	# blank-separated line.
	echo ${pid}
}

#
# Main program
#

# include common utilities.
FMBIN=/opt/SUNWcluster/ha/nfs
# NOTE(review): log_info and initnfsenv are not defined in this file;
# presumably they come from nfs_common_util - confirm.
. "${FMBIN}/nfs_common_util"
ECH_TRAPSIGNALS="1 2 3 15"
# $ECH_TRAPSIGNALS is deliberately unquoted so that every signal number
# becomes a separate argument to trap.
trap "cleanup ; trap 0 ; exit 1" $ECH_TRAPSIGNALS
initnfsenv

prog=$(/bin/basename "$0")
pre="SUNWcluster.monitor_rpcbind"

# NOTE(review): CLUSTERVAR and CLUSTERETC are not set in this file;
# presumably the sourced utilities / initnfsenv provide them - confirm.
DEBUGLOG="${CLUSTERVAR}/monitor_rpcbind.log"
clustname=$(/bin/cat "${CLUSTERETC}/conf/default_clustername")
NODE=$(/usr/bin/uname -n)
CDBFILE="/etc/opt/SUNWcluster/conf/${clustname}.cdb"
#
# Set the Debug flag off as a default. Change to a "1" to enable it.
#
DEBUG=0
#
# Read in the parameters from the cdb.
#
# NOTE(review): the values are used as returned by cdbmatch; nothing here
# checks that rpcmon.retries, rpcmon.ival and rpcmon.noresponse are set
# and numeric before they are used in numeric tests and as the sleep
# interval further down.
#
ACTION=$(/opt/SUNWcluster/bin/cdbmatch rpcmon.action "${CDBFILE}")

RETRIES=$(/opt/SUNWcluster/bin/cdbmatch rpcmon.retries "${CDBFILE}")

IVAL=$(/opt/SUNWcluster/bin/cdbmatch rpcmon.ival "${CDBFILE}")

NORESPONSE=$(/opt/SUNWcluster/bin/cdbmatch rpcmon.noresponse "${CDBFILE}")
#
# Initialise retry and restart counters.
#
NUM_RETRIES=0

NUM_RESTARTS=0

NUM_NORESPONSE=0
#
# Record the operating parameters in the debug log when debugging is on.
#
if [ "${DEBUG}" -eq 1 ] ; then
	report_status
fi
#
# start to monitor the process. should only exit the loop through an abort or
# explicit decision to exit elsewhere.
#
while true; do
	# Probe every IVAL seconds until RETRIES probes in a row have failed.
	# A successful probe clears the failure count, so isolated failures
	# spread over a long uptime do not add up to a false alarm.
	while [ "${NUM_RETRIES}" -lt "${RETRIES}" ] ; do
		if check_rpcbind ; then
			NUM_RETRIES=0
		else
			let NUM_RETRIES=NUM_RETRIES+1
		fi
		sleep "${IVAL}"
	done
	NUM_RETRIES=0

	RPC_PID=$(getpid rpcbind)
	# -n with quotes: getpid may report several PIDs separated by blanks,
	# which a bare "[ ${RPC_PID} ]" cannot handle.
	if [ -n "${RPC_PID}" ] ; then
		# rpcbind is in the process list but did not answer.  Tolerate
		# this for up to NORESPONSE passes before trying a restart.
		if [ "${NUM_NORESPONSE}" -lt "${NORESPONSE}" ] ; then
			let NUM_NORESPONSE=NUM_NORESPONSE+1
			log_info "${pre}.5003" "rpcbind in process list but has not responded."
			if [ "${DEBUG}" -eq 1 ] ; then
				/usr/bin/date >> "${DEBUGLOG}"
				echo "rpcbind non-responsive but in proc table. Pass ${NUM_NORESPONSE}." >> "${DEBUGLOG}"
			fi
		else
			NUM_NORESPONSE=0
			restart_rpcbind
		fi
	else
		# rpcbind is not in the process list at all.
		restart_rpcbind
	fi
done
