#!/bin/ksh  
#Copyright 03/17/99 Sun Microsystems, Inc. All rights reserved.
#pragma ident "@(#)em_monitor	1.2 99/03/17 Sun Microsystems"
#


###############################################################################
# errmsg
###############################################################################
#
# Echo arguments.
#
###############################################################################

errmsg()
{
    # Print the arguments, joined into one string, through gettext.
    # A leading -n suppresses the trailing newline.
    case "$1" in
        -n)
            shift
            gettext "$*"            # no newline
            ;;
        *)
            # a literal backslash-n is appended; presumably gettext
            # expands it into the newline
            gettext "$*\\n"
            ;;
    esac
}


###############################################################################
# get_pid
###############################################################################
#
# Return the pid(s) of the processes with the given name.  A null string is
# returned if the process isn't running.
#
###############################################################################

get_pid()
{
   # $1 - daemon name.  Writes the matching pid(s) to stdout, or nothing
   # useful when no matching process is found.
   case $1 in
      em_ob_corba_rgw | em_ob_corba_mgw | em_ob_corba_epr | em_ob_corba_eds |\
      em_vb_corba_rgw | em_vb_corba_mgw | em_vb_corba_epr | em_vb_corba_eds |\
      em_io_corba_rgw | em_io_corba_mgw | em_io_corba_epr | em_io_corba_eds )
         # CORBA gateways are looked up in the command-name column of ps;
         # sed keeps only the first (pid) column.
         short_daemon=$1
         ps -eo pid,comm  | grep "$short_daemon" | sed -e 's/^  *//' -e 's/ .*//'
         ;;

      em_mpa_sunmc | em_jdmkfwd)
         # These two are looked up through a pid file.
         case $1 in
            em_mpa_sunmc) pid_file=/var/tmp/em_mpa_sunmc.pid ;;
            em_jdmkfwd)   pid_file=/tmp/jdmk_fwd.pid ;;
         esac

         # Report the recorded pid only if the file is readable and ps
         # still finds that process.
         process_pid=
         if [ -r "$pid_file" ] ; then
            process_pid=$(cat "$pid_file")
            ps -p $process_pid >/dev/null 2>&1 || process_pid=
         fi
         echo $process_pid
         ;;

      *)
         # Everything else: whole-word match against the full argument
         # list, leaving out any line that mentions grep.
         ps -e -o pid,args | /usr/bin/grep -v "grep"| /usr/bin/grep -w $1 |  sed -e 's/^  *//' -e 's/ .*//'
         ;;
   esac
}

###############################################################################
# escalation_level
###############################################################################
escalation_level() {
   # Print the current escalation level: the highest N in 3..1 for which
   # the marker file /tmp/emha_escalateN exists, or 0 when there is none.
   typeset lvl

   for lvl in 3 2 1 ; do
      if [[ -f /tmp/emha_escalate$lvl ]] ; then
         echo $lvl
         return
      fi
   done
   echo 0
}

###############################################################################
#
# monitor()
#
# Monitors daemon processes.  Never returns unless a dead process
# is detected.
#
###############################################################################

monitor() {

typeset d p pid_list pid
typeset -i i db

# Get list of daemons started by em_services.  The file holds alternating
# words: a daemon name followed by its number of processes; $col tracks
# which of the two comes next.
list=$(</tmp/emha_daemon_list)
col=1
i=1
for x in $list ; do
   case $col in
      1)
         # read daemon name
         daemon_list[$i]=$x
         col=0
         ;;
      *)
         # read number of daemon processes
         number[$i]=$x
         col=1
         i=$(($i+1))
         ;;
   esac
done

Daemons=$(($i-1))

# Check if all daemons started by em_services are still running, and
# collect every pid found for the watch loop below.
db=0
pid_list=
i=1
while [[ $i -le $Daemons ]]; do
   d=${daemon_list[$i]}
   n=${number[$i]}
   pid=$(get_pid $d)
   pid_list="$pid_list $pid"
   if [[ $n -gt 0 ]] ; then
      # daemon is not running
      [[ -z $pid ]] && return 1
      # count the pids via the positional parameters:
      # fewer than recorded means not all of its processes are running
      set $pid
      [[ $# -lt $n ]] && return 1
   fi
   i=$(($i+1))
done

# platform is OK
# reset the escalation status
rm -f /tmp/emha_escalate*

# start the endless monitor loop
errmsg "monitor: ($(date)) started at escalation level: $(escalation_level)" > $MON_DISPLAY

while : ; do
    for pid in $pid_list; do
        p=$(ps -os -p $pid)
        # echo collapses the ps output onto one line: a bare header "S"
        # means no such pid, "S Z" means the process is a zombie
        case $(echo $p) in
            "S" | "S Z")
                errmsg "monitor: ($(date)): $pid dead" > $MON_DISPLAY
                return 1
                ;;
        esac
    done
    sleep $sleep_interval
done
}

###############################################################################
# cleanup
###############################################################################
cleanup() {
# Kill every other instance of this program and of fixit, then remove
# the emha state files under /tmp.
typeset f

this_pid=$$
pid="$(get_pid $PROGNAME) $(get_pid fixit)"
for p in $pid ; do
   # never kill ourselves
   [[ $p = $this_pid ]] && continue
   kill -KILL $p > /dev/null 2>&1
done

for f in /tmp/emha_escalate* /tmp/emha_startup /tmp/emha_hold /tmp/emha_progress ; do
   rm -f "$f"
done
}

###############################################################################
# main
###############################################################################
#
# The main part of this script.
#
###############################################################################

# Names of all daemons the monitor knows about.  Once the platform is up,
# each name is looked up with get_pid and the number of processes found is
# written to /tmp/emha_daemon_list.  The value is only ever expanded
# unquoted, so the whitespace between names is not significant.
all_daemon_list="oninit em_auxdb em_autoexd em_ncam em_mis em_login \
             em_ns_server em_eds em_log2hist em_toposrv \
             em_log em_snmp-trap em_snmfwd em_nnmpa em_mpa_snmp em_mpa_sunmc \
             em_mpa_rpc em_cmip em_purged em_mpa_jdmk em_jdmkfwd jme_jre em_srm \
             em_autod em_datad \
	     em_ob_corba_rgw em_ob_corba_mgw em_ob_corba_epr em_ob_corba_eds \
	     em_vb_corba_rgw em_vb_corba_mgw em_vb_corba_epr em_vb_corba_eds \
	     em_io_corba_rgw em_io_corba_mgw em_io_corba_epr em_io_corba_eds" 

# Fail if the required EM package is not installed.
pkginfo -q SUNWemalb || exit 1

# Locate the EM bin directory below the package's BASEDIR, remember it in
# em_dir and append it to PATH.
em_dir=$(pkgparam SUNWemalb BASEDIR)/SUNWconn/em/bin
PATH=${PATH}:$em_dir

PROGNAME=$(basename $0)

# place to send messages: a preset MON_DISPLAY wins; otherwise use the
# terminal when tty -s succeeds and the console when it does not
if [[ -z $MON_DISPLAY ]]; then
   MON_DISPLAY=/dev/console
   tty -s && MON_DISPLAY=/dev/tty
fi
export MON_DISPLAY

# current esc. level
typeset -i esc_level

# User settable delay for monitor timeout; defaults to 20 and is reset to
# 20 when outside 1..1000
: "${MONITOR_DELAY:=20}"
if [[ $MONITOR_DELAY -lt 1 ]] || [[ $MONITOR_DELAY -gt 1000 ]] ; then
   MONITOR_DELAY=20
fi

# How long to sleep between ticks
typeset -i sleep_interval=10

# TIMEOUT = time (units of sleep_interval secs) between
# successive calls to "progress"
EM_START_TIMEOUT=$(($MONITOR_DELAY*1))

typeset -i i ticks

# Handle the single optional command-line argument.
case $1 in
   -stop | -restart)
      # both options first kill any running monitor/fixit and clear state
      cleanup
      if [[ $1 = -stop ]] ; then
         errmsg "monitor: (`date`) stopped by user" > $MON_DISPLAY
      else
         # -restart: launch a fresh monitor in the background
         $em_dir/$PROGNAME &
      fi
      exit 0
      ;;
   "")
      # no option: carry on and start monitoring
      :
      ;;
   *)
      errmsg "unknown option: $1"
      errmsg "valid options: -stop (stop monitoring); -restart (start monitoring)"
      exit 1
      ;;
esac

# check if monitor is already running
this_pid=$$
pid=$(get_pid $PROGNAME)
for p in $pid ; do
   pa=$(ps -os -p $p)
   # echo collapses the ps output onto one line
   case $(echo $pa) in
      "S" | "S Z")
         # header only (no such pid) or zombie state: ignore these pids
         continue
         ;;
   esac
   # our own pid does not count as another instance
   [[ $p = $this_pid ]] && continue
   errmsg "Another instance of the monitor is already running."
   exit 1
done

# Directory holding the EM binaries; fixit is run from here.
em_bin="$(pkgparam SUNWemalb BASEDIR)/SUNWconn/em/bin"

# Record whether the SUNWemmis package is installed.  The trailing ".*" is
# meant for pkginfo (instance wildcard), so the argument is quoted to keep
# the shell from expanding it against files in the current directory.
if pkginfo -q 'SUNWemmis.*' ; then
    mis_installed=TRUE
else
    mis_installed=FALSE
fi


#
# Start em_services and wait for it to remove lockfile (emha_startup)
#

# Start from a zero tick count and the escalation level currently on record.
ticks=0
esc_level=$(escalation_level)

# clean up dead files
rm -f /tmp/emha_hold /tmp/emha_progress

# The lockfile tells us who started us: if it exists the monitor is being
# called from em_services, otherwise it is being started by a user.
in_em_services=FALSE
[[ -f /tmp/emha_startup ]] && in_em_services=TRUE

# Now start monitoring progress of em_services
#
# While the lockfile /tmp/emha_startup exists, once per $sleep_interval:
#   - if more than $EM_START_TIMEOUT ticks passed without progress, raise
#     the escalation level (capped at 4), record it and run fixit;
#   - if the MIS package is installed but em_mis is not running, raise the
#     escalation level by 2 (capped at 4), record it and run fixit;
#   - do not count the tick while /tmp/emha_hold exists;
#   - reset the tick count when /tmp/emha_progress exists.

while [[ -f /tmp/emha_startup ]] ; do
   if [[ $ticks -gt $EM_START_TIMEOUT ]]; then
      errmsg "monitor: em_services timed out" > $MON_DISPLAY
      esc_level=$(($esc_level+1))
      if [[ $esc_level -gt 4 ]]; then
         esc_level=4
      fi
      # remove lock so that this monitor instance can be killed
      rm -f /tmp/emha_startup
      # record the esc. level
      cp /dev/null /tmp/emha_escalate$esc_level
      $em_bin/fixit
      # wait to be killed
      sleep 30
   fi

   d=em_mis
   if [ "$mis_installed" = TRUE ] ; then
      mispid=`get_pid $d`
      # check if em_mis started; the value is quoted so the test stays
      # well-formed whether get_pid returned no pid or several
      if [ -z "$mispid" ] ; then
         errmsg '\nem_mis FAILED to start...Aborting em_services...\n'
         esc_level=$(($esc_level+2))
         if [[ $esc_level -gt 4 ]]; then
            esc_level=4
         fi
         # remove lock so that this monitor inst. can be killed
         rm -f /tmp/emha_startup
         errmsg "monitor: platform failed, escalation level: $esc_level" > $MON_DISPLAY
         cp /dev/null /tmp/emha_escalate$esc_level
         $em_bin/fixit
         errmsg "monitor: giving up" > $MON_DISPLAY
         # wait to be killed
         sleep 60
      fi
   fi

   sleep $sleep_interval
   if [[ -f /tmp/emha_hold ]] ; then
      #  placed on hold by interrupt in em_services;
      #  so dont increment the timer
      continue
   fi
   if [[ -f /tmp/emha_progress ]] ; then
      # there is progress, so zero out the timer
      ticks=0
      # (a stray "]]" used to be passed to rm here as a second file name)
      rm -f /tmp/emha_progress
      continue
   fi
   ticks=$(($ticks+1))
done

# check if db and mis are up
# (oninit must be running; em_mis only counts when the MIS package is
# installed)
failed=FALSE
for d in oninit em_mis ; do
   pid=$(get_pid $d)
   # a pid was found, so this daemon started
   [[ -n $pid ]] && continue
   case $d in
      oninit)
         failed=TRUE
         break
         ;;
      em_mis)
         if [[ $mis_installed = TRUE ]] ; then
            failed=TRUE
            break
         fi
         ;;
   esac
done

if [[ $failed = TRUE ]]; then
   if [[ $in_em_services != TRUE ]] ; then
      # being called by user
      errmsg "monitor: platform not up" > $MON_DISPLAY
      exit 1
   fi

   # being called from em_services and the platform did not come up:
   # raise the escalation level (capped at 4)
   esc_level=$(($esc_level+1))
   if [[ $esc_level -gt 4 ]]; then
      esc_level=4
   fi
   # remove lock so that this monitor inst. can be killed
   rm -f /tmp/emha_startup
   errmsg "monitor: platform failed, escalation level: $esc_level" > $MON_DISPLAY
   # record the esc. level, then run fixit
   cp /dev/null /tmp/emha_escalate$esc_level
   $em_bin/fixit
   # wait to be killed
   sleep 60
   errmsg "monitor: giving up" > $MON_DISPLAY
   exit 1
fi

# If we are here, then platform is up

rm -f /tmp/emha_escalate*

# find out which daemons were started by em_services: rebuild
# /tmp/emha_daemon_list with one "<daemon> <number of pids found>" line
# per known daemon
rm -f /tmp/emha_daemon_list
for d in $all_daemon_list ; do
   pid=$(get_pid $d)
   # count the pids returned for this daemon
   i=0
   for p in $pid ; do
      (( i += 1 ))
   done
   echo $d $i >> /tmp/emha_daemon_list
done

# Start the main monitor loop
while : ; do

   # monitor only comes back because of a problem situation
   monitor

   # If fixit succeeds it was able to restart the dead process, so resume
   # monitoring; otherwise fixit is going to restart the platform.
   $em_bin/fixit || break

done

# wait to be killed by em_services
# (sleeps 10 seconds at a time until the tick count exceeds TIMEOUT)
TIMEOUT=20
ticks=0
sleep 10
while [[ $ticks -le $TIMEOUT ]]; do
   ticks=$(($ticks+1))
   sleep 10
done

# fixit seems to have failed.  Restart platform
# remove lock so that this monitor inst. can be killed
rm -f /tmp/emha_startup
em_services -start  &
sleep 60
errmsg "monitor: giving up" > $MON_DISPLAY
exit 1
