#! /usr/bin/sh
#
# ident	"@(#)sms.sh	1.26	01/09/28 SMI"
#
# Copyright (c) 1999-2001 by Sun Microsystems, Inc.
# All rights reserved.
#
# Start-up script to start-up and shut-down the sms software. The 'start'
# option starts a background process which starts and monitors the ssd. The ssd
# in turn starts and monitors all the sms daemons and servers. The 'stop'
# option kills the background loop that starts and monitors the ssd and then
# signals the ssd to shut itself and all the sms daemons and servers down.
#
# exit codes
# --------------------------------------------------------------------------
# 0 - operation was successful
# 1 - environment file missing or not readable
# 2 - usage error
# 3 - can not "start" sms, it is already running (also used by sms_start
#     when the ssd can not be executed)
# 4 - can not "stop" sms, it is not running
# 5 - SSC not yet configured as "MAIN" or "SPARE"
# 6 - smsconfig has not been run

# Use a fixed search path so the system utilities called below are found
# regardless of the caller's environment.
PATH=/sbin:/usr/sbin:/usr/bin; export PATH

#
# GLOBAL DEFINITIONS
#
# command     - name this script was invoked as (used in messages)
# sub_command - first argument, expected to be "start" or "stop"
# SMS_ENV     - environment file sourced before start/stop processing
# SSD_CNFG    - ssd start-up configuration file
# SSD_LOOP    - file holding the pid of the ssd monitor loop
#
# $0 is quoted so that an invocation path containing white space is still
# handed to basename as a single argument.
command=`/usr/bin/basename "$0"`
sub_command=$1
SMS_VAR=/var/opt/SUNWSMS
SMS_ETC=/etc/opt/SUNWSMS
SMS_ENV=$SMS_ETC/startup/sms_env.sh
SSD_CNFG=$SMS_ETC/startup/ssd_start
SSD_LOOP=$SMS_VAR/data/ssd_loop.pid


############################################################################
#
# usage_error
#
# Prints a 'sms' script usage error message.
#
############################################################################
usage_error ()
{
	# Look the message up first, then print it. The expansion is left
	# unquoted on purpose so the words are re-joined by echo exactly as
	# they would be from an inline command substitution.
	usage_text=`gettext "Usage: $command { start | stop }"`
	echo $usage_text
}


############################################################################
#
# process_exists
#
# Returns 0 if the process exists, 1 if it does not.
#
# $1 - name of process to look for (must be short name as in ps)
# $2 - owner of process (defaults to root)
#
############################################################################
process_exists ()
{
	# owner defaults to root when no second argument is supplied
	proc_owner=${2:-root}

	# exact-name match restricted to that owner; only the exit status
	# of pgrep is of interest, so all of its output is discarded
	/usr/bin/pgrep -x -U$proc_owner $1 >/dev/null 2>&1
	return $?
}


############################################################################
#
# 4490854 PCI bridge secondary clocks sometimes come up disabled from power on
#
# check_sscpost_status()
#
# This function checks the SSCPOST test results for a specific failure which
# indicates that the PCI secondary clocks have failed.  If they have, then the
# appropriate action is taken otherwise the function just returns.
#
# Returns 0 for success, 1 for failure
#
############################################################################
check_sscpost_status()
{
	# Variables assigned here are not declared local, so they remain
	# visible to the rest of the script after the function returns.
	date_format="date +'%b %e %T %Y'"
	localschost=`/bin/hostname`
	log_format="$localschost sms[]: [NOTICE check_sscpost_status]"
	man_config=$SMS_ETC/config/MAN.cf
	flag_dir=$SMS_VAR/data/.failover/local
	platform_log=/var/opt/SUNWSMS/adm/platform/messages
	flag_file=.scpost_status
	failed_status_string="Failed"
	scpost_status="FAILED"
	# third field of the uncommented SC0-I2 / SC1-I2 lines in MAN.cf
	i2sc0host=`egrep SC0-I2 $man_config | egrep -v "^#" | awk '{print $3}'`
	i2sc1host=`egrep SC1-I2 $man_config | egrep -v "^#" | awk '{print $3}'`
	remoteI2host=$i2sc1host
	eaddr=
	rstatus=0

	#
	# get SSCPOST status from prtconf; there is nothing to do unless the
	# ssc-post-results property contains the failure string
	#
	prtconf -pv | grep ssc-post-results | cut -d':' -f2- | egrep "$failed_status_string" >/dev/null 2>&1
	if [ $? -ne 0 ]; then
		return 0
	fi

	echo "`eval $date_format` $log_format Attempting to transfer SSCPOST results to other SC" >>$platform_log

	#
	# Use the SC0 I2 host as the remote side when it is listed in
	# /.rhosts, otherwise stay with the SC1 I2 host. The lookup is
	# skipped when no SC0 host was found: with an empty, unquoted pattern
	# egrep would take "/.rhosts" as the pattern and read standard input.
	#
	if [ -n "$i2sc0host" ] && egrep "$i2sc0host" /.rhosts >/dev/null 2>&1; then
		remoteI2host=$i2sc0host
	fi

	#
	# Without a remote host the transfer can not succeed, so give up
	# before the network interfaces are reconfigured.
	#
	if [ -z "$remoteI2host" ]; then
		echo "`eval $date_format` $log_format Unable to transfer SSCPOST results, exiting" >>$platform_log
		return 1
	fi

	#
	# test for scman1 (I2 net) connectivity & create file
	#
	# We need to ifconfig hme1 up using parameters from /etc/hostname.scman1
	#
	if [ -f "/etc/hostname.scman1" ]; then
		eaddr=`ifconfig scman1 | egrep ether | cut -d" "  -f2`
		ifconfig scman1 down
		ifconfig hme1 plumb `cat /etc/hostname.scman1` broadcast +
		ifconfig hme1 ether $eaddr
	elif [ -f "/etc/hostname6.scman1" ]; then
		ifconfig scman1 inet6 down
		ifconfig hme1 inet6 plumb
		ifconfig hme1 inet6 `cat /etc/hostname6.scman1`
	else
		return 1
	fi
	# $? here is the status of the last ifconfig run in the branch taken
	# above; earlier ifconfig failures in that branch are not examined.
	if [ $? -ne 0 ]; then
		echo "`eval $date_format` $log_format Unable to ifconfig hme1" >>$platform_log
		# put scman1 back the way it was
		if [ -f "/etc/hostname.scman1" ]; then
			ifconfig hme1 unplumb
			ifconfig scman1 up
		elif [ -f "/etc/hostname6.scman1" ]; then
			ifconfig hme1 inet6 unplumb
			ifconfig scman1 inet6 up
		else
			return 1
		fi
		echo "`eval $date_format` $log_format Unable to transfer SSCPOST results, exiting" >>$platform_log

		return 1
	fi

	echo "`eval $date_format` $log_format hme1 successfully ifconfig'd" >>$platform_log

	# write the status flag file on the remote SC
	rsh -n $remoteI2host sh -c "'/bin/echo $scpost_status' > $flag_dir/$flag_file"
	if [ $? -eq 0 ]; then
		echo "`eval $date_format` $log_format Completed transfer of SSCPOST results to other SC" >>$platform_log
	else
		echo "`eval $date_format` $log_format Unable to transfer SSCPOST results, exiting" >>$platform_log
		rstatus=1
	fi

	# restore scman1 whether or not the transfer worked
	if [ -f "/etc/hostname.scman1" ]; then
		ifconfig hme1 unplumb
		ifconfig scman1 up
	elif [ -f "/etc/hostname6.scman1" ]; then
		ifconfig hme1 inet6 unplumb
		ifconfig scman1 inet6 up
	else
		return 1
	fi

	return $rstatus
}


############################################################################
#
# sms_start
#
# Starts-up all of sms by starting the ssd.
#
############################################################################
sms_start ()
{
	# This function always terminates the script with "exit $ecode".

	# initialize ecode to zero
	ecode=0

	# check to see if sms is already running
	if [ -s "$SSD_LOOP" ]; then
		# indicate an error
		ecode=3

		# print message to indicate that sms is already running
		echo `gettext "$command: SMS is already running"`

		# check the health of the ssd start loop; if it is okay then
		# we are done. if not, fall through and re-start it (ecode
		# stays at 3 in that case).
		if /usr/bin/kill -0 `cat "$SSD_LOOP"` >/dev/null 2>&1; then
			exit $ecode
		fi
	fi

	# MAN.cf must contain at least one non-comment line
	grep -v "^\#" "$SMS_ETC/config/MAN.cf" > /dev/null 2>&1
	if [ $? -ne 0 ]; then
		# indicate an error
		ecode=6

		# print message to indicate smsconfig not yet run
		echo `gettext "$command: smsconfig(1M) has not been run. Unable to start sms services."`
		exit $ecode
	fi

	# setup the core size limit and name. The limit is only applied when
	# a value was actually found; the exit status of the pipeline is that
	# of cut and says nothing about whether core_max_size was present.
	coresize=`grep \^" " "$SMS_ETC/config/mld_tuning" | grep core_max_size | cut -d "=" -f 2`
	if [ -n "$coresize" ]; then
		if [ -x "/usr/bin/ulimit" ]; then
			ulimit -HSc $coresize >/dev/null 2>&1
		fi
	fi
	if [ -x "/usr/bin/coreadm" ]; then
		/usr/bin/coreadm -p /var/tmp/sms_core.%f.new $$ >/dev/null 2>&1
	fi

	# start the mld prior to starting the ssd so that the ssd can have
	# logging capabilities. Once the ssd starts it will connect with
	# this running mld and begin monitoring it.
	process_exists mld
	if [ $? -ne 0 ]; then
		# get the command line options from the ssd_start config file
		# (second colon-separated field of the mld entry)
		MLD_CNFG=`grep '^[ 	]*mld[ 	]*:' "$SSD_CNFG" 2>/dev/null`
		MLD_OPTS=`expr "$MLD_CNFG" : '^[^:]*:\([^:]*\):.*$'`

		# start the mld
		mld ${MLD_OPTS} >/dev/null 2>&1;
	fi

	# make sure we can execute the ssd
	ssd_path=`which ssd`
	if [ ! -x "$ssd_path" ]; then
		# indicate an error (this reuses exit code 3)
		ecode=3

		# print message to indicate that the ssd can not be executed
		echo `gettext "$command: Unable to execute ssd"`

		exit $ecode
	fi

	SSD_START_MSG1="SMS software start-up initiated"
	SSD_START_MSG2="SC POST results:"`/usr/sbin/prtconf -pv | grep ssc-post-results | cut -d':' -f2-`
	ssd $SSD_OPTIONS -i "$SSD_START_MSG1" -i"$SSD_START_MSG2"

	# start the background loop to monitor and start the ssd when and
	# if it dies. The loop records its own pid in $SSD_LOOP, the same
	# file that the "already running" check above and sms_stop read; the
	# path is handed to the child shell through its environment.
	# NOTE(review): $SSD_OPTIONS inside the quoted script is expanded by
	# the child shell, so it is presumably exported by sms_env.sh -
	# confirm.
	SSD_LOOP="$SSD_LOOP" sh -c 'echo $$ >"$SSD_LOOP";
	while true; do
		# if the ssd is not running then start it
		/usr/bin/pgrep -x -U0 ssd >/dev/null 2>&1
		if [ $? -ne 0 ]; then
			ssd $SSD_OPTIONS;
		fi
		sleep 3;
	done' >/dev/null 2>&1 &

	exit $ecode
}

############################################################################
#
# sms_stop
#
# Terminates all of sms by signaling the ssd.
#
############################################################################
sms_stop ()
{
	# This function always terminates the script with "exit $ecode", so
	# that the "not running" status (4) is reported even when all the
	# daemons go away promptly.

	# initialize ecode to zero
	ecode=0

	# kill the background process that starts and monitors the ssd
	if [ -s "$SSD_LOOP" ]; then
		/usr/bin/kill `cat "$SSD_LOOP"` >/dev/null 2>&1
		rm -f "$SSD_LOOP" >/dev/null 2>&1
	else
		# inform user of error
		echo `gettext "$command: SMS is not running"`

		# indicate an error
		ecode=4
	fi

	# determine from the ssd_start file the longest we should have to
	# wait for all the daemons to shut down (largest value found in
	# field 8 of the non-comment lines)
	SHUTDOWN_MAX=`/usr/bin/awk -F: '
	BEGIN                                     { shutdown_max = 0 }
	$0 ~ /^[ \t]*[^#]/ && $8 > shutdown_max   { shutdown_max = $8 }
	END                                       { print shutdown_max }' < "$SSD_CNFG"`

	# build list of args to use with pgrep/pkill that contains all the
	# daemon and server names (field 1) and the user ids that they run
	# under (field 9). The result is two words, "-U<uid,...>" and
	# "<name|name|...>", and is expanded unquoted below so that the shell
	# splits it into those two arguments. No quote characters are
	# emitted around the name pattern: the shell would pass them through
	# to pgrep/pkill literally, and with -x that keeps the first and last
	# names in the list from ever matching.
	SIG_TARGETS=`/usr/bin/awk -F: '
	BEGIN { uid_list = "-Uroot"
	        comp_names = "ssd"
	      }
	$0 ~ /^[ \t]*[^#]/ { split($9, temp, " \t")
	                     uid_list = uid_list "," temp[1]
	                     split($1, temp, " \t")
	                     comp_names = comp_names "|" temp[1]
	                   }
	END { print uid_list " " comp_names }' < "$SSD_CNFG"`

	# SSD_USED records whether the ssd was asked to do the shut-down
	SSD_USED=no

	# if the ssd exists, use it to shut-down sms
	if process_exists ssd; then
		SSD_USED=yes

		# signal the ssd to shut-down itself and all of sms
		/usr/bin/pkill -USR2 -x -U0 ssd >/dev/null 2>&1

	# if the ssd doesn't exist, then signal the SW components manually
	else
		# signal the daemons and servers based on their user ids
		/usr/bin/pkill -TERM -x $SIG_TARGETS >/dev/null 2>&1
	fi

	# wait upto SHUTDOWN_MAX seconds for the daemons to shutdown; we are
	# done if they all terminated
	if _sms_wait_for_shutdown; then
		exit $ecode
	fi

	# if the ssd was used to shutdown the other daemons
	if [ "$SSD_USED" = "yes" ]; then
		# if the ssd is running at this point it must be sick, kill it
		/usr/bin/pkill -KILL -x -U0 ssd >/dev/null 2>&1

		# the daemons are still around and we can not be sure if it
		# is because the ssd was sick or the daemons are sick. signal
		# the daemons manually and wait for shutdown once more
		/usr/bin/pkill -TERM -x $SIG_TARGETS >/dev/null 2>&1
		if _sms_wait_for_shutdown; then
			exit $ecode
		fi
	fi

	# at this point regardless of how the daemons were signaled (via the
	# ssd or manually) we can be sure that they received the SIGTERM and
	# were given time to shutdown. if they still persist kill them
	/usr/bin/pkill -KILL -x $SIG_TARGETS >/dev/null 2>&1

	exit $ecode
}

############################################################################
#
# _sms_wait_for_shutdown
#
# Helper for sms_stop. Checks once a second, for at most SHUTDOWN_MAX
# seconds, whether pgrep still finds a process matching SIG_TARGETS.
#
# Returns 0 as soon as pgrep finds none, 1 if the time runs out (or if
# SHUTDOWN_MAX is empty or zero, in which case no check is made).
#
############################################################################
_sms_wait_for_shutdown ()
{
	# The counter is deliberately not called SECONDS: ksh and bash treat
	# that name as a running timer, which would keep a countdown held in
	# it from reaching zero under those shells.
	secs_left=${SHUTDOWN_MAX:-0}
	while [ $secs_left -gt 0 ]; do
		# check to see if any of the daemons are still running
		/usr/bin/pgrep -x $SIG_TARGETS >/dev/null 2>&1
		if [ $? -ne 0 ]; then
			return 0
		fi

		# sleep for a second
		sleep 1

		# decrement the number of seconds left to sleep
		secs_left=`expr $secs_left - 1`
	done

	return 1
}


############################################################################
#
# Execution begins here.
#
############################################################################

#
# 4490854 PCI bridge secondary clocks sometimes come up disabled from
#	  power on
#
# The SSCPOST check is made when "start" is requested while init is
# entering run level 3.
#
# NOTE(review): this used to be wrapped in "if [ $?SMS_4490854 ]", which is
# csh syntax. In sh it expands to the previous exit status followed by the
# literal text SMS_4490854, a non-empty string, so the test was always true.
# The check is therefore kept unconditional here. If it was meant to depend
# on SMS_4490854 being set, guard it with
# [ -n "${SMS_4490854+set}" ] instead - confirm the intent first.
#
# The run level is compared as a string so that non-numeric levels do not
# cause a test error.
#
if [ "$sub_command" = "start" ] && [ "$_INIT_RUN_LEVEL" = "3" ]; then
	check_sscpost_status
fi

# silently exit if we are not running on a SSC: each of these device names
# must show up (in single quotes) in the prtconf output
for ssc_dev in gchip echip consbus; do
	dev_count=`prtconf -p | grep -c "'$ssc_dev'"`
	if [ "$dev_count" -eq 0 ] ; then
		exit 0
	fi
done

# source the sms environment file
if [ -r "$SMS_ENV" ]; then
	. "$SMS_ENV"
else
	echo `gettext "$command: sms_env.sh file missing or not readable"`
	exit 1
fi

# CD to a harmless directory first
cd /

# ignore HUP and QUIT signals
trap '' HUP QUIT

case "$sub_command" in

'start')
	sms_start
	;;

'stop')
	sms_stop
	;;

*)
	usage_error
	exit 2
	;;
esac

exit 0
