#!/bin/ksh  
#Copyright 03/17/99 Sun Microsystems, Inc. All rights reserved.
#pragma ident "@(#)em_monitor	1.2 99/03/17 Sun Microsystems"
#


###############################################################################
# errmsg
###############################################################################
#
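# Print the arguments, translated through gettext, and copy them to the
# em_monitor trace log.  A leading -n suppresses the trailing newline.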
#
###############################################################################

errmsg()
{
    # Print the message through gettext.  A leading "-n" is consumed and
    # means "do not append a newline"; otherwise "\n" is tacked onto the text.
    case "$1" in
        -n)
            shift
            gettext "$*"
            ;;
        *)
            gettext "$*\\n"
            ;;
    esac

    # Whatever was printed is also recorded in the trace log.
    printTrcMsg "$*"
}


###############################################################################
# get_pid
###############################################################################
#
# Return the pid(s) of the processes with the given name.  A null string is
# returned if the process isn't running.
#
###############################################################################

get_pid()
{
   # The lookup strategy depends on the daemon name in $1: a scan of the
   # process table by command name, a pid file, or a scan of the full
   # argument list.  The matching pid(s) are written to stdout.
   case $1 in
      em_ob_corba_rgw | em_ob_corba_mgw | em_ob_corba_epr | em_ob_corba_eds |\
      em_vb_corba_rgw | em_vb_corba_mgw | em_vb_corba_epr | em_vb_corba_eds |\
      em_io_corba_rgw | em_io_corba_mgw | em_io_corba_epr | em_io_corba_eds )
         # CORBA gateways: substring match on the "pid comm" lines, then
         # strip leading blanks and everything after the pid.
         short_daemon=$1
         ps -eo pid,comm |
            grep "$short_daemon" |
            sed -e 's/^  *//' -e 's/ .*//'
         ;;

      em_mpa_sunmc | em_jdmkfwd)
         # These two are looked up through a pid file.
         case $1 in
            em_mpa_sunmc) pid_file=/var/tmp/em_mpa_sunmc.pid ;;
            em_jdmkfwd)   pid_file=/tmp/jdmk_fwd.pid ;;
         esac
         process_pid=
         if [ -r $pid_file ] ; then
            process_pid=$(cat $pid_file)
            # a recorded pid that ps cannot find is reported as "not running"
            ps -p $process_pid >/dev/null 2>&1 || process_pid=
         fi
         echo $process_pid
         ;;

      jme_jre)
         # Matched on the full argument list: lines naming jme_jre and "sh".
         ps -e -o pid,args |
            /usr/bin/grep -v "grep" |
            /usr/bin/grep "jme_jre" |
            /usr/bin/grep "sh" |
            /usr/bin/awk '{ print $1}'
         ;;

      em_monitor)
         # Only lines whose argument list ends in "em_monitor" and mention "sh".
         ps -e -o pid,args |
            /usr/bin/grep -v "grep" |
            /usr/bin/grep "em_monitor$" |
            /usr/bin/grep "sh" |
            /usr/bin/awk '{ print $1}'
         ;;

      *)
         # Everything else: whole-word match on the command name.
         ps -e -o pid,comm |
            /usr/bin/grep -v "grep" |
            /usr/bin/grep -w $1 |
            sed -e 's/^  *//' -e 's/ .*//'
         ;;
   esac
}


###############################################################################
# print date
###############################################################################
printDate()
{
/usr/bin/date '+20%y-%m-%d %H:%M:%S'
}

###############################################################################
# print message to em_monitor logfile
###############################################################################
printTrcMsg(){
/usr/bin/echo "`printDate`: em_monitor: $*" >> $em_monitor_logfile
}

###############################################################################
# print status for given PID list
###############################################################################
printProcessStatus()
{
# Write a timestamped heading followed by a ps listing for the given pids.
# All arguments are joined into one string and handed to ps as a single
# -p process list.
/usr/bin/echo "$(printDate): process status"
actualPidList="$*"
/usr/bin/ps -p "${actualPidList}" \
	-o user -o pid -o ppid -o rss -o pmem \
	-o vsz -o s -o stime -o time -o args
}


###############################################################################
# get status for monitored PID list
###############################################################################
get_monitored_pidlist(){
# Print, one per line and sorted, the pids from the list in $1 that ps still
# reports and that are not in zombie state (state column "Z").
monitoredPidList=$1
/usr/bin/ps -p "${monitoredPidList}" -os -o pid -o args |
/usr/bin/nawk '
{
# Skip the ps header line.  The third comparison used to read
# $3 "COMMAND" (a string concatenation, hence always true); it now really
# compares the column heading.
if ( $1 == "S" && $2 == "PID" && $3 == "COMMAND" )
   {
   next
   }
# Zombies count as dead, so they are left out of the list.
if (  $1 == "Z" )
   {
   next
   }
printf "%s\n" , $2
}' | /usr/bin/sort 

}


###############################################################################
# escalation_level
###############################################################################
escalation_level() {
   # Report the highest escalation marker file present in /tmp (3, 2 or 1),
   # or 0 when none of them exists.
   if [[ -f /tmp/emha_escalate3 ]] ; then
      echo 3
   elif [[ -f /tmp/emha_escalate2 ]] ; then
      echo 2
   elif [[ -f /tmp/emha_escalate1 ]] ; then
      echo 1
   else
      echo 0
   fi
}

###############################################################################
#
# monitor()
#
# Monitors daemon processes.  Never returns unless a dead process
# is detected.
#
###############################################################################

monitor() {

# Check that every daemon recorded in /tmp/emha_daemon_list is running, then
# poll the collected pids every $sleep_interval seconds.
# Returns 1 as soon as a recorded daemon is missing or has fewer processes
# than recorded, or when the set of live pids changes.  There is no other
# return path: while everything stays alive the function keeps looping.

typeset d
typeset p
typeset pid_list
typeset pid
typeset -i i
typeset -i db

# Get list of daemons started by em_services
# The file is read as a flat list of words that alternate between a daemon
# name and its process count; col tracks which of the two comes next.
list=$(</tmp/emha_daemon_list)
col=1
i=1
for x in $list ; do
   if [[ col -eq 1 ]] ; then
      # read daemon name
      daemon_list[$i]=$x
      col=0
   else
      # read number of daemon processes
      number[$i]=$x
      col=1
      i=$(($i+1))
   fi
done
  
# i points one past the last complete name/count pair
Daemons=$(($i-1))

# Check if all daemons started by em_services are still running
i=1
db=0
pid_list=
while [[ $i -le $Daemons ]]; do
   d=${daemon_list[$i]}
   n=${number[$i]}
   pid=`get_pid $d`
   # every pid found is remembered for the polling loop below
   pid_list="$pid_list $pid"
   # daemons recorded with a count of 0 are not required to be running
   if [[ $n -gt 0 ]] ; then
      if [[ -z $pid ]] ; then
         # daemon is not running
         return 1
      fi
      # load the pids into the positional parameters so $# counts them
      set $pid
      if [[ $# -lt $n ]] ; then
         # all daemon's processes are not running
         return 1
      fi
   fi
   i=$(($i+1))
done

# platform is OK
# reset the escalation status
rm -f /tmp/emha_escalate*

# start the endless monitor loop
errmsg "monitor: (`date`) started at escalation level: `escalation_level`" > $MON_DISPLAY

# get sorted pids to be monitored
ini_pidlist=$(
for xpid in $pid_list
do
	/usr/bin/echo $xpid
done | /usr/bin/sort 
)

# the unquoted expansion joins the sorted pids with single spaces
ini_pidlist=`/usr/bin/echo ${ini_pidlist}`

printTrcMsg    "initial process status:"
printProcessStatus "${ini_pidlist}" >>	$em_monitor_logfile

# Poll: rebuild the list of live pids in the same sorted, space-separated
# form and compare it with the initial list as a plain string.
while true
do
	act_pidlist=`get_monitored_pidlist "${ini_pidlist}"`
	act_pidlist=`/usr/bin/echo ${act_pidlist}`
	if [ "${act_pidlist}" != "${ini_pidlist}" ]
	then
		# log what is still alive and hand control back to the caller
		errmsg "monitor: some process died, actual process status:"
		printProcessStatus "${act_pidlist}" >>  $em_monitor_logfile
	return 1
	fi	
	sleep $sleep_interval
done


}

###############################################################################
# cleanup
###############################################################################
cleanup() {
# Kill every process reported for $PROGNAME and for fixit, except this shell
# itself, then remove the emha_* state files from /tmp.
typeset stale
this_pid=$$
pid="$(get_pid $PROGNAME) $(get_pid fixit)"
for p in $pid ; do
   # never signal ourselves
   [[ $p = $this_pid ]] && continue
   kill -KILL $p > /dev/null 2>&1
done
# escalation markers first, then the startup lock, hold and progress flags
for stale in /tmp/emha_escalate* /tmp/emha_startup /tmp/emha_hold /tmp/emha_progress ; do
   rm -f $stale
done
}

###############################################################################
# main
###############################################################################
#
# The main part of this script.
#
###############################################################################

# File that printTrcMsg appends its timestamped trace lines to; the process
# status listings are appended here as well.
em_monitor_logfile=${EM_RUNTIME}/debug/em_monitor.output

# Every daemon name the monitor looks for.  After startup each name is passed
# to get_pid and the number of pids found is recorded in /tmp/emha_daemon_list.
# The list is only ever expanded unquoted, so the embedded line breaks and
# indentation act as plain word separators.
all_daemon_list="oninit em_auxdb em_autoexd em_ncam em_mis em_login \
             em_ns_server em_eds em_log2hist em_toposrv \
             em_log em_snmp-trap em_snmfwd em_nnmpa em_mpa_snmp em_mpa_sunmc \
             em_mpa_rpc em_cmip em_purged em_mpa_jdmk em_jdmkfwd jme_jre em_srm \
             em_autod em_datad \
	     em_ob_corba_rgw em_ob_corba_mgw em_ob_corba_epr em_ob_corba_eds \
	     em_vb_corba_rgw em_vb_corba_mgw em_vb_corba_epr em_vb_corba_eds \
	     em_io_corba_rgw em_io_corba_mgw em_io_corba_epr em_io_corba_eds" 

# Fail if the required EM package is not installed.
pkginfo -q SUNWemalb || exit 1

# The EM binaries live under the package base directory; put them on PATH.
em_dir=$(pkgparam SUNWemalb BASEDIR)/SUNWconn/em/bin
PATH=${PATH}:$em_dir

PROGNAME=$(basename $0)

# MON_DISPLAY is the place to send messages.  Unless the caller already chose
# one, use the controlling terminal when there is one and the console otherwise.
if [[ -z $MON_DISPLAY ]] ; then
   MON_DISPLAY=/dev/console
   tty -s && MON_DISPLAY=/dev/tty
fi
export MON_DISPLAY

# Current escalation level.
typeset -i esc_level

# User-settable delay for the monitor timeout; anything outside 1..1000
# falls back to the default of 20.
MONITOR_DELAY=${MONITOR_DELAY:-20}
if [[ $MONITOR_DELAY -lt 1 || $MONITOR_DELAY -gt 1000 ]] ; then
   MONITOR_DELAY=20
fi

# How long to sleep between ticks.
typeset -i sleep_interval
sleep_interval=10

# TIMEOUT = time (units of sleep_interval secs) between
# successive calls to "progress"
EM_START_TIMEOUT=$(( $MONITOR_DELAY * 1 ))

# Integer loop counters used by the main section.
typeset -i i ticks

# Record how this instance was invoked.
errmsg "invoked: par = $*, ppid=${PPID} pid=$$"

# Act on the optional command-line option.
case $1 in
   "")
      # no option: carry on and start monitoring
      :
      ;;
   -stop)
      cleanup
      errmsg "monitor: ($(date)) stopped by user" > $MON_DISPLAY
      exit 0
      ;;
   -restart)
      # unlike -stop, no "stopped by user" message is sent to MON_DISPLAY
      cleanup
      $em_dir/$PROGNAME &
      exit 0
      ;;
   *)
      errmsg "unknown option: $1"
      errmsg "valid options: -stop (stop monitoring); -restart (start monitoring)"
      exit 1
      ;;
esac

# check if monitor is already running
this_pid=$$
pid=$(get_pid $PROGNAME)
for p in $pid ; do
   pa=$(ps -os -p $p)
   # the unquoted echo squeezes the ps output down to single-spaced words
   case "$(echo $pa)" in
      "S" | "S Z")
         # header only (no such pid) OR zombie state:
         # ignore these pids
         continue
         ;;
   esac
   # any live pid other than our own means a second instance
   if [[ $p != $this_pid ]] ; then
      errmsg "Another instance of the monitor is already running: p=${p} this_pid=${this_pid}."
      exit 1
   fi
done

# Directory holding the EM binaries (fixit is started from here).
em_bin="$(pkgparam SUNWemalb BASEDIR)/SUNWconn/em/bin"

# Remember whether any instance of the MIS package is installed.  The
# pattern is quoted so that pkginfo receives it literally; unquoted, the shell
# could expand it against matching file names in the current directory.
if pkginfo -q 'SUNWemmis.*' ; then
    mis_installed=TRUE
else
    mis_installed=FALSE
fi


#
# Start em_services and wait for it to remove lockfile (emha_startup)
#

ticks=0
esc_level=$(escalation_level)

# clean up dead files
rm -f /tmp/emha_hold /tmp/emha_progress

# The startup lockfile tells who launched us: present means the monitor is
# being called from em_services, absent means it was started by a user.
in_em_services=FALSE
if [[ -f /tmp/emha_startup ]] ; then
   in_em_services=TRUE
fi

# Now start monitoring progress of em_services
#
# The loop runs for as long as the startup lockfile exists.  Each pass:
#   1. escalates and calls fixit once more than EM_START_TIMEOUT ticks have
#      gone by without progress,
#   2. escalates and calls fixit if em_mis is installed but not running,
#   3. sleeps one tick, then either leaves the timer alone (hold), resets it
#      (progress) or advances it.

while [[ -f /tmp/emha_startup ]] ; do
   if [[ $ticks -gt $EM_START_TIMEOUT ]]; then
      errmsg "monitor: em_services timed out" > $MON_DISPLAY
      esc_level=$(($esc_level+1))
      if [[ $esc_level -gt 4 ]]; then
         esc_level=4
      fi
      # remove lock so that this monitor instance can be killed
      rm -f /tmp/emha_startup
      # record the esc. level
      cp /dev/null /tmp/emha_escalate$esc_level
      $em_bin/fixit
      # wait to be killed
      sleep 30
   fi
   d=em_mis
   if [ "$mis_installed" = TRUE ] ; then
      mispid=`get_pid $d`
      # check if em_mis started
      # (quoted: get_pid may report several pids, or none at all)
      if [ -z "$mispid" ] ; then
         errmsg '\nem_mis FAILED to start...Aborting em_services...\n'
         esc_level=$(($esc_level+2))
         if [[ $esc_level -gt 4 ]]; then
            esc_level=4
         fi
         # remove lock so that this monitor inst. can be killed
         rm -f /tmp/emha_startup
         errmsg "monitor: platform failed, escalation level: $esc_level" > $MON_DISPLAY
         cp /dev/null /tmp/emha_escalate$esc_level
         $em_bin/fixit
         errmsg "monitor: giving up" > $MON_DISPLAY
         # wait to be killed
         sleep 60
      fi
   fi
   sleep $sleep_interval
   if [[ -f /tmp/emha_hold ]] ; then
      #  placed on hold by interrupt in em_services;
      #  so dont increment the timer
      continue
   fi
   if [[ -f /tmp/emha_progress ]] ; then
      # there is progress, so zero out the timer
      ticks=0
      # (a stray "]]" used to follow the file name here and was handed to rm
      # as a second file to delete)
      rm -f /tmp/emha_progress
      continue
   fi
   ticks=$(($ticks+1))
done

# check if db and mis are up
failed=FALSE
for d in oninit em_mis ; do
   pid=$(get_pid $d)
   # at least one pid: this daemon started
   [[ -z $pid ]] || continue
   # a missing em_mis only matters when the MIS package is installed
   if [[ $d = "em_mis" ]] && [[ $mis_installed != TRUE ]] ; then
      continue
   fi
   failed=TRUE
   break
done

if [[ $failed = TRUE ]] ; then
   if [[ $in_em_services != TRUE ]] ; then
      # being called by user
      errmsg "monitor: platform not up" > $MON_DISPLAY
      exit 1
   fi

   # being called from em_services: platform did not come up
   esc_level=$(($esc_level+1))
   [[ $esc_level -gt 4 ]] && esc_level=4
   # remove lock so that this monitor inst. can be killed
   rm -f /tmp/emha_startup
   errmsg "monitor: platform failed, escalation level: $esc_level" > $MON_DISPLAY
   cp /dev/null /tmp/emha_escalate$esc_level
   $em_bin/fixit
   # wait to be killed
   sleep 60
   errmsg "monitor: giving up" > $MON_DISPLAY
   exit 1
fi

# If we are here, then platform is up

# Drop the escalation markers and any daemon list left from an earlier run.
rm -f /tmp/emha_escalate* /tmp/emha_daemon_list

# find out which daemons were started by em_services:
# one "name count" line per known daemon, count = number of pids found
for d in $all_daemon_list ; do
   i=0
   pid=$(get_pid $d)
   for p in $pid ; do
      i=$(( $i + 1 ))
   done
   echo $d $i >> /tmp/emha_daemon_list
done

# Start the main monitor loop
while true ; do
   # monitor only comes back because of a problem situation
   monitor
   # If fixit succeeds it was able to restart the dead process, so go round
   # again.  If it fails, fixit is going to restart the platform: leave the loop.
   $em_bin/fixit || break
done

# wait to be killed by em_services
TIMEOUT=20
ticks=0
# Nap 10 seconds per tick until more than TIMEOUT ticks have been counted.
while
   sleep 10
   [[ $ticks -le $TIMEOUT ]]
do
   ticks=$(($ticks+1))
done

# Still alive: fixit seems to have failed.  Restart platform
# remove lock so that this monitor inst. can be killed
rm -f /tmp/emha_startup
em_services -start  &
sleep 60
errmsg "monitor: giving up" > $MON_DISPLAY
exit 1
