#!/bin/sh
#
#pragma ident	"@(#)initrgm	1.15	03/11/07 SMI"
#
# Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
# Use is subject to license terms.
#

# Start/stop resource group manager daemon

# Directories holding the cluster library programs, the cluster
# commands and the standard system utilities used by this script.
LIBSCDIR="/usr/cluster/lib/sc"
BINDIR="/usr/cluster/bin"
USRBIN="/usr/bin"

# Name of the resource group manager daemon; the start action runs it
# as ${LIBSCDIR}/${SERVER}.
SERVER="rgmd"

# The following three variables are combined to build the name of the
# file created by libsecurity.  The start action polls for that file
# to decide that the server is ready.
TRANSPORT="ticotsord"
PROGNUM="100141"
VERSNUM="1"
FILE="/var/run/scrpc/${PROGNUM}.${VERSNUM}.${TRANSPORT}"
#
# daemon_exists daemonname
#
# Tests whether a process named exactly daemonname exists, running
# as root (uid 0) and with a parent process id of 1.  We don't need
# the actual pid, just the boolean status of whether it exists, so
# the output of pgrep is discarded.
#
# Returns 0 for exists, non-zero otherwise (including when
# daemonname is missing or empty).
#
daemon_exists()
{
	# pgrep treats the pattern as optional when other selection
	# criteria (-u, -P) are given, so an empty name must be rejected
	# here instead of being reported as an existing daemon.
	if [ -z "$1" ]
	then
		return 2
	fi

	# The function's status is the status of pgrep itself.
	"${USRBIN}/pgrep" -u 0 -P 1 -x "$1" >/dev/null 2>&1
}


#
# daemon_kills daemonname
#
# Sends SIGTERM to the daemon named exactly daemonname that runs as
# root with a parent process id of 1, and prints a notice on stdout
# when such a process was found.
# (If several processes are named daemonname then all of them are
# signalled; however this should not happen.)
#
# Returns non-zero, without signalling anything, when daemonname is
# missing or empty.  Sets the global variable PIDS as a side effect.
#
daemon_kills()
{
	# pgrep treats the pattern as optional when other selection
	# criteria (-u, -P) are given; refusing an empty name keeps us
	# from signalling processes that were never asked for.
	if [ -z "$1" ]
	then
		return 2
	fi

	PIDS=`"${USRBIN}/pgrep" -u 0 -P 1 -x "$1"`
	if [ -n "${PIDS}" ]
	then
		# PIDS is deliberately left unquoted: it may hold several
		# whitespace-separated pids and each one must reach kill
		# as a separate argument.
		kill -TERM ${PIDS}
		echo "Notice: ${1} is being stopped."
	fi
}

#
# Dispatch on the requested action:
#   start - start ${SERVER} and wait until it is ready (or has failed)
#   stop  - evacuate this node; halt on unsupported run-level changes
#   other - print the usage message and exit 1
#
case "$1" in
'start')
	# test whether we are a cluster and exit if not a cluster
	/usr/sbin/clinfo > /dev/null 2>&1
	if [ $? -ne 0 ]
	then
		exit 0
	fi

	# Check if an instance of the server is already running and if yes exit
	# We only allow one rgmd at a time.
	if daemon_exists "${SERVER}"
	then
		${USRBIN}/logger -p local0.err -t INITRGM "Error: ${SERVER} is \
already running."
		exit 0
	fi

	# Check that the servers on which rgm depends (rpc.pmfd, rpc.fed
	# and pnmd) are up.  No need to loop since we looped when we
	# started the other servers.
	for PROCESS in rpc.pmfd rpc.fed pnmd
	do
		daemon_exists "${PROCESS}"
		if [ $? -ne 0 ]
		then
			${USRBIN}/logger -p local0.err -t INITRGM "Error: ${PROCESS} \
is not started."
			exit 1
		fi
	done

	"${LIBSCDIR}/${SERVER}"
	if [ $? -ne 0 ]
	then
		${USRBIN}/logger -p local0.err -t INITRGM "Error: Can't \
start ${SERVER}."
		exit 1
	fi

	# Loop for 2 min (120 iterations, one second apart) checking
	# that the server is up.
	#
	# We need to wait until the server is up and registered before
	# moving on, so that init scripts that follow us will be able
	# to succeed if they depend on us being up.
	# We rely on the file created by libsecurity to detect if the
	# server is ready
	COUNTER=120
	while [ ${COUNTER} -gt 0 ]
	do
		COUNTER=`expr ${COUNTER} - 1`

		daemon_exists "${SERVER}"
		if [ $? -ne 0 ]
		then
			# if the process disappeared there is no need to
			# wait for the file to be created. startup failed
			${USRBIN}/logger -p local0.err -t INITRGM "Error: Startup of ${SERVER} failed."
			exit 1
		fi

		# As soon as the file is created we assume that the server
		# is up and running.  FILE is quoted so that an empty value
		# can never turn this into an always-true one-argument test.
		if [ -f "${FILE}" ]
		then
			exit 0
		fi

		# we log a warning every 30 seconds in case of failure
		if [ `expr ${COUNTER} % 30` -eq 0 ]
		then
			${USRBIN}/logger -p local0.warning -t INITRGM "Waiting for ${SERVER} to be ready."
		fi

		${USRBIN}/sleep 1

	done

	# The server never became ready within the allotted time:
	# startup failed.  Stop the daemon and log an error.
	daemon_kills "${SERVER}"
	${USRBIN}/logger -p local0.err -t INITRGM "Error: Startup of ${SERVER} failed."
	exit 1
	;;

'stop')
	if [ -z "${_INIT_RUN_LEVEL}" ]
	then
		# If the environment variable _INIT_RUN_LEVEL is not set
		# it means that the command is called outside of run
		# control environment
		${USRBIN}/logger -p local0.err -t INITRGM "Error: Can't stop \
${SERVER} outside of run control environment."
		exit 1
	fi

	# Extra sync's are to increase our chances of getting
	# stuff written out to disk.
	/sbin/sync &

	# If the rgmd is running on this node, attempt the
	# scswitch evacuate, in case the human admin forgot
	# to do it.
	if daemon_exists "${SERVER}"
	then
		echo "$0: Calling scswitch -S (evacuate)"

		# Ask for this node's id once and reuse it below.
		NODEID=`/usr/sbin/clinfo -n`

		# -K 300 flag keeps RGs off the node for five minutes
		${BINDIR}/scswitch -S -h "${NODEID}" -K 300
		# Capture the status right away so that nothing added later
		# between here and the test can clobber $?.
		EVAC_STATUS=$?
		# We proceed with the rest of the init transition
		# regardless of whether scswitch -S worked or not.

		# If all resources have been successfully evacuated
		# then failfasts can be disabled.
		if [ ${EVAC_STATUS} -eq 0 ]
		then
			${LIBSCDIR}/cmm_ctl -n "${NODEID}"
		fi
	fi
	/sbin/sync &

	# If any of the following user-space daemons that are
	# part of the rgm world exist, then it means that this
	# node had made it up as far as run-level 3.  It is beyond
	# the ability of the Sun Cluster algorithms to support
	# transitioning to run levels that are less fully up than
	# level 3 but where the host stays up indefinitely.  For
	# attempts to transition to those levels, we halt instead.
	if daemon_exists rpc.pmfd || \
	    daemon_exists rpc.fed || \
	    daemon_exists "${SERVER}" || \
	    daemon_exists ucmmd
	then
		case "${_INIT_RUN_LEVEL}" in
		S|s|1|2)
			${USRBIN}/logger -p local0.emerg -t INITRGM "Error: Sun \
Cluster does not support transitioning from run-level 3 to levels S \
(single-user), 1, or 2, halting"
			/sbin/sync &
			/usr/sbin/halt
			# In case halt fails call uadmin:
			/sbin/uadmin 2 0
			;;
		esac
	fi
	;;

*)
	# Unknown or missing action: report it as a failure instead of
	# silently exiting with status 0.
	echo "Usage: /etc/init.d/initrgm { start | stop }"
	exit 1
	;;
esac
