#
# ident	"@(#)actionreboot.tcl	1.95	00/01/11 SMI"
#
# Copyright (c) 1996-2000 by Sun Microsystems, Inc.
# All rights reserved.
#
# These tcl action scripts are used to respond to edd events generated
# by the following event traps:
# 
#	The following events will cause a reboot
#	heartbeat failure
#	panic (1,2)
#	reboot trap
#	panic reboot
#	watchdog timeout
#	arbstop  - special case where a post dump will be taken first.
#

#
# global vars
#
# Polling interval in seconds; postdump_init sleeps this long between
# checks for a running hpost.
set sec 10
# Processor configuration string for the current domain; filled in by
# set_procnums and consumed by procstatedump and hostresetdump.
set procnums ""
# Domains of the arbstopped cluster; set by postdump_cluster and read
# afterwards by Arbstopact.
set clusterDomain {}
# Array of boot processors indexed by domain name; populated by
# postdump_cluster from the MIB.
# NOTE(review): the quotes here are part of the element name - confirm
# that is intended.
set bootCpu("unknown") -1

#
# bringup_active()
#
# determine if a bringup is active on the current domain.  this
# is required for the fix for 4103410.
#
# Returns 1 if a bringup is in process, 0 otherwise.
#
proc bringup_active {} {
    # Determine whether a bringup is in progress for the domain named by
    # env(SUNW_HOSTNAME) (required for the fix for 4103410).
    #
    # The first word of the first line of the domain's bringup pid file
    # is taken as a process id and probed with "kill -0".  The pid file
    # channel is always closed before returning, so repeated calls do
    # not accumulate open file descriptors.
    #
    # Returns 1 if a bringup is in process, 0 otherwise.
    global env

    set pidfile "/var/opt/SUNWssp/pids/bringup-$env(SUNW_HOSTNAME).pid"
    if { ! [ file readable $pidfile ] } {
	# either unable to read pidfile, or it doesn't exist
	return 0
    }

    # a bringup may be currently running on the domain
    if { [ catch { set fid [ open $pidfile {RDONLY } ] } ] != 0 } {
	# the file could not be opened after all: no process id
	return 0
    }

    # Read the first line, then release the channel whatever happened.
    set pid ""
    set readfailed [ catch { gets $fid pid } ]
    catch { close $fid }
    if { $readfailed } {
	return 0
    }

    # An empty or malformed line means there is no process id to probe.
    if { [ catch { lindex $pid 0 } pid ] != 0 || [ string length $pid ] == 0 } {
	return 0
    }

    if { [ catch { exec kill -0 $pid } ] == 0 } {
	# bringup process active
	return 1
    }

    # process id not active, ignore it
    return 0
}

#
# postdump_init()
#
# postdump initialization:
# lock all relevant resources so dump can be taken without any
# interference from other SSP processes. Also, verify that no
# other hpost is running, otherwise wait for it to complete.
#
proc postdump_init {} \
{
	# Prepare for a post dump: take the eddpostdump and bringup lock
	# files, then wait for any hpost recorded in a domain's hpost.lock
	# file to finish.
	#
	# The wait is bounded: at most hloop polls of sec seconds each are
	# spent in total across all domains, after which the wait is
	# abandoned with a logged error and the dump proceeds.
	#
	# Returns 0 when the dump may proceed.  Returns -1, after releasing
	# both lock files again, when the domain configuration cannot be
	# read.
	global env sec

	# this will give a 30 minute wait for hpost
	set hloop 180

	#
	# get the postdump lock file
	#
	set lock [ lock_ssp_file "eddpostdump.lock 0" ]

	#
	# get the bringup lock file
	#
	set lock [ lock_ssp_file "bringup.lock 0" ]

	#
	# Next check if hpost is running and wait for that to finish
	# before starting the dump; in the past too many problems
	# occurred when a dump was running with a bringup hpost.
	# Find the hpost lock file and check for it in every domain.
	#
	if {[catch { get_domain_config } domaininfo ] != 0} {
		logMessage "No domains found. EXITING " postdump
		set lock [ lock_ssp_file "eddpostdump.lock 1" ]
		set lock [ lock_ssp_file "bringup.lock 1" ]
		return -1
	}

	set hpath $env(SSPVAR)/adm/
	set i 0

	foreach domain $domaininfo {
		set name [ lindex $domain 0 ]
		set postlock $hpath$name/hpost.lock

		# The open result is checked per domain, so a channel left
		# over from an earlier domain is never mistaken for this
		# domain's lock file.
		if { [ catch { set fid [ open $postlock {RDONLY } ] } ] != 0 } {
			# no hpost lock file for this domain
			continue
		}

		# Only the first line (the pid) is needed; close the channel
		# right away so it is not leaked.
		set pid ""
		catch { gets $fid pid }
		catch { close $fid }
		if { [ string length $pid ] == 0 } {
			continue
		}

		#
		# A hpost is running in one of the
		# domain wait for it to finish.
		#
		while { [ catch { exec ps -p $pid } ] == 0 } {
			sleep $sec
			incr i
			# ">=" rather than "==": once the budget is used up
			# by one domain, later domains must still time out.
			if { $i >= $hloop } {
				set sleeptotal [ expr { $sec * $i } ]
				logMessage "Error: Domain \
$env(SUNW_HOSTNAME) Waited for $sleeptotal seconds for hpost in $name \
to complete, continue" postdump
				break
			}
		}
	}

	return 0
}

#
# postdump()
#
# do a post dump with all the boards in that domain
# First we must find which boards are in the domain by walking the MIB.
# This version still needs to be upgraded to handle domain clusters.
#
proc postdump mes \
{
	# Take an hpost dump of all boards in the domain named by
	# env(SUNW_HOSTNAME).
	#
	# mes - dump tag such as "Edd-Arbstop" or "Edd-Record-Stop"; it is
	#       used in the dump file names and in the hpost -d argument.
	#
	# The board list and boot processor are read from the MIB domain
	# table.  The eddpostdump and bringup lock files taken by
	# postdump_init are released on every exit path of this proc.
	#
	# Returns nothing.  Raises an error, after releasing the locks,
	# when no snmp session can be opened or when the domain is not
	# present in the MIB domain table.

	# initial postdump setup
	if { [ postdump_init ] != 0 } {
		return
	}

	global env
	cd $env(HOME)

	set D_TABLE	"domainName domainSysBrdList domainBootProc"
	set domainname  $env(SUNW_HOSTNAME)
	set date 	[ exec date +%m.%d.%H:%M ]

	# First kill obphelper and netcon if they are running,
	# except for record stop dumps

	set obpfile $env(SSPVAR)/pids/obp_helper-$env(SUNW_HOSTNAME).pid
	set netfile $env(SSPVAR)/pids/netcon_server-$env(SUNW_HOSTNAME).pid

	if { $mes != "Edd-Record-Stop" } {
		if {[ tokenizeFile $obpfile tok " " ] == 1 } {
			set obpid [lindex $tok(1) 0]
			catch { exec kill -s KILL $obpid }
		}
		if {[ tokenizeFile $netfile tok " " ] == 1 } {
			set netpid [lindex $tok(1) 0]
			catch { exec kill -s KILL $netpid }
		}
	}

	if { [ catch {set handle [ snmp session -community edd ]} errmsg ] } {
		set lock [ lock_ssp_file "eddpostdump.lock 1" ]
		set lock [ lock_ssp_file "bringup.lock 1" ]
		logDomainMessage " snmp session error $errmsg" postdump
		return -code error
	}

	$handle configure -timeout 100 -retries 1

	$handle walk x "$D_TABLE" {
		set host [lindex [lindex $x 0] 2]
		if { $host == $env(SUNW_HOSTNAME) } {
			# Got our domain name
			set boards [lindex [lindex $x 1] 2]
			set bproc  [lindex [lindex $x 2] 2]
			break
		}
	}

	$handle destroy

	# Without a board list there is nothing to dump.  Release the
	# locks before failing so later dumps and bringups are not blocked.
	if { ! [ info exists boards ] } {
		set lock [ lock_ssp_file "eddpostdump.lock 1" ]
		set lock [ lock_ssp_file "bringup.lock 1" ]
		logDomainMessage " domain $domainname not found in the MIB domain table, no dump taken" postdump
		return -code error "postdump: domain $domainname not found in the MIB domain table"
	}

	#
	# Save the bbsram info
	#
	if { ($mes == "Edd-Arbstop") && [info exists bproc] } {
		set pathname 	$env(SSPLOGGER)/$domainname/$mes-bbsram-$date
		set cmdstr "proc $bproc; bba 7000 1024"
		logDomainMessage "Save BBSRAM information in $pathname" postdump
		catch { eval "exec redx -x \"$cmdstr\" > $pathname" }
	}

	set boards [split $boards "-" ]

	# turn the board numbers into a mask.
	set brdmask 0

	foreach brd $boards {
		set brdmask [ expr ( $brdmask | ( 1 << $brd)) ]
	}

	set pathname 	$env(SSPLOGGER)/$domainname/$mes-Dump-$date
	set postcmd [ format "hpost -s -d \"$mes-Dump\" -D3%04x,%s" $brdmask $pathname ]

	# now exec the hpost to do the dump for the present domain;
	# the file is expected to be 4-5k per board

	logDomainMessage " $postcmd for domain $domainname and boards $boards, \
being written to $pathname"  postdump

	catch { eval "exec $postcmd" }

	set lock [ lock_ssp_file "eddpostdump.lock 1" ]
	set lock [ lock_ssp_file "bringup.lock 1" ]
}

#
# postdump_cluster()
#
# Do a post dump on an MDC (MultiDomain Cluster),
# the boardmask is obtained from the MIB
#
proc postdump_cluster { mes arbdomain } \
{
	# Take an hpost dump for a multi-domain cluster.
	#
	# mes       - dump tag, e.g. "Edd-MD-Arbstop" or "Edd-MD-Recordstop";
	#             used in the dump file names and the hpost -d argument.
	# arbdomain - domain whose SMD board mask selects the boards to dump.
	#
	# Side effects visible in this proc: for "Edd-MD-Arbstop" the globals
	# clusterDomain and bootCpu are refreshed, and BBSRAM of the cluster
	# boot processors is saved via redx.  The eddpostdump and bringup
	# lock files taken by postdump_init are released before returning.

	# initial postdump setup
	if { [ postdump_init ] != 0 } {
		return
	}

	global env clusterDomain bootCpu
	cd $env(HOME)

	set procname	"postdump_cluster"
	set domainname  $env(SUNW_HOSTNAME)
	set date 	[ exec date +%m.%d.%H:%M ]

	# First kill obphelper and netcon if they are running,
	# except for record stop dumps

	set obpfile $env(SSPVAR)/pids/obp_helper-$env(SUNW_HOSTNAME).pid
	set netfile $env(SSPVAR)/pids/netcon_server-$env(SUNW_HOSTNAME).pid

	if { $mes != "Edd-MD-Recordstop" } { 
		if {[ catch { exec cat $obpfile } obpid ] == 0 } {
			catch { exec kill -s KILL $obpid }
		}
		if {[ catch { exec cat $netfile } netpid ] == 0 } {
			catch { exec kill -s KILL $netpid }
		}
	}

	# Without an snmp session nothing can be read from the MIB: release
	# the locks taken by postdump_init and fail.
	if { [ catch {set handle [ snmp session -community edd ]} errmsg ] } {
		set lock [ lock_ssp_file "eddpostdump.lock 1" ]
		set lock [ lock_ssp_file "bringup.lock 1" ]
		logDomainMessage " snmp session error $errmsg" $procname
		return -code error
	}

	$handle configure -timeout 100 -retries 1

	if { $mes == "Edd-MD-Arbstop" } {
		#
		# A set of IDN domains based on the SMD regs
		#
		set clusterDomain [get_smd_idnnet $arbdomain]

		#
		# Read from the MIB the bootprocs of all existing domains
		#
		logMessage "Reading the boot procs from the MIB" $procname
		$handle walk x "domainName domainBootProc" {
			# Got our domain name
			set host [lindex [lindex $x 0] 2]
			set bootCpu($host)  [lindex [lindex $x 1] 2]
		}

		#
		# Set of boot procs
		# (only values in the range 0..63 are accepted)
		#
		set plist {}
		foreach idnDomain $clusterDomain {
			if { ([info exists bootCpu($idnDomain) ] == 1) && \
			     ($bootCpu($idnDomain) >= 0) && \
			     ($bootCpu($idnDomain) <= 63) } {
				set plist [concat $plist $bootCpu($idnDomain)]
			}
		}
		#
		# compose the redx cmd string
		#
		set cmdstr ""
		foreach p [lsort -int $plist]  {
			set cmdstr "$cmdstr proc $p; bba 7000 1024;"
		}
		#
		# Save the bbsram info
		#
		set pathname 	$env(SSPLOGGER)/$domainname/$mes-bbsram-$date
		logMessage "Save BBSRAM information in $pathname" $procname
		if { $cmdstr != "" } {
			catch { eval "exec redx -x \"$cmdstr\" > $pathname" }
		}
	} 

	# Find the boardmask with respect to the actual
	# arbstopped domain and _not_ the domainname
	# as they may be different.  domainname is a logical
	# representative for the entire IDNnet containing
	# both domains.

	# The mask is treated as hex digits: it is prefixed with 0x below.
	set brdmask [ lindex [ get_smd_mask $arbdomain ] 0 ]

	# exec hpost to do the dump. Dump is stored under this
	# domain's name file is expected to be 4-5k per board

	set pathname 	$env(SSPLOGGER)/$domainname/$mes-Dump-$date
	set postcmd \
		[ format "hpost -s -d \"$mes-Dump\" -D3%04x,%s" \
						[ expr 0x$brdmask ] $pathname ]
	set Message \
		[ format "$postcmd for MultiDomain (clustermask=0x%04x) \
being written to $pathname" [ expr 0x$brdmask ] ]

	logMessage "$Message" $procname

	# The dump is best effort: a failing hpost is ignored.
	catch { eval "exec $postcmd" }

	$handle destroy	
	set lock [ lock_ssp_file "eddpostdump.lock 1" ]
	set lock [ lock_ssp_file "bringup.lock 1" ]
}

proc Bringup mode {
    # Run an edd-initiated bringup of the domain named by
    # env(SUNW_HOSTNAME), with output redirected to the domain's
    # last_edd_bringup.out file under env(SSPLOGGER).
    #
    # mode selects the bringup arguments and what happens on failure:
    #
    #   Quick          -L -F -Q -A on     on failure retry as Long
    #   Reset          -L -F -Q           on failure retry as ResetLong
    #   ResetLong      -L -F -a           no retry
    #   Long           -L -F -a -A on     no retry
    #   HeartbeatLong  -L -F -l24 -A on   no retry
    #   anything else  -L -F -A on        on failure retry as Long
    #
    # A failure is always logged through logDomainMessage; when the
    # bringup command produced error text it is included in the message.
    global env errorCode
    cd $env(HOME)
    set domainname $env(SUNW_HOSTNAME)
    set pathname $env(SSPLOGGER)
    append pathname /$domainname/last_edd_bringup.out

    # An empty retry mode means a failure is final.
    switch -exact -- $mode {
	Quick {
	    set arglist "-L -F -Q -A on"
	    set retry Long
	}
	Reset {
	    set arglist "-L -F -Q"
	    set retry ResetLong
	}
	ResetLong {
	    set arglist "-L -F -a"
	    set retry ""
	}
	Long {
	    set arglist "-L -F -a -A on"
	    set retry ""
	}
	HeartbeatLong {
	    set arglist "-L -F -l24 -A on"
	    set retry ""
	}
	default {
	    set arglist "-L -F -A on"
	    set retry Long
	}
    }

    if { [ catch { eval "exec bringup $arglist >$pathname" } err ] == 0 } {
	return
    }

    # Only decorate the error text when there is some.
    if { [ string length $err ] != 0 } {
	set err "Error = $err. "
    }

    if { [ string length $retry ] != 0 } {
	logDomainMessage "ERROR: edd-initiated bringup $arglist failed for domain $domainname. ${err}Retry started" Bringup
	Bringup $retry
    } else {
	logDomainMessage "ERROR: edd-initiated bringup $arglist failed for domain $domainname. ${err}See messages and $pathname file for details." Bringup
    }
}

#
# Recordstopact()
#
#  -e { error number }
# valid e values are : 57 for record stop, 75 for cluster_recordstop
#			
# For record stop do the dump and clear the record stop
#
proc Recordstopact { args } \
{
	# Handle a record stop trap: take a post dump, clear the record
	# stop with hpost -W, and log the outcome.
	#
	# args - trap arguments as passed by edd.  When the second argument
	#        holds more than two words the event is treated as a
	#        multi-domain (cluster) record stop and the fourth word is
	#        the domain that reported it.
	#
	# The platform-wide bringup_dr.lock is held for the duration of the
	# dump and clear, and is released before returning.
	global env errorCode
	sethostname $args

	#
	# "try lock" the high-level bringup_dr.lock. 
	# (see bugid 4045478).  Note - this lock is platform wide.
	#
	set lock [ lock_ssp_file "bringup_dr.lock 0" ]
	
	if { ( [ llength [ lindex $args 1 ] ] > 2 ) } {
		#
		# Domain cluster recordstop
		#

		set MD 1

		set recDomain [ lindex [ lindex $args 1 ] 3 ]
		set idnnet [ get_idn_config $recDomain ]

		# Both the cluster id and its member list are needed by the
		# log messages below, so both get a defined fallback.
		if { [ llength $idnnet ] > 0 } {
			set idnid  [ lindex [ lindex $idnnet 0 ] 0 ]
			set idnnet [ lindex [ lindex $idnnet 0 ] 1 ]
		} else {
			set idnid  -1
			set idnnet "<unknown>"
		}
		logMessage "ERROR: MultiDomain Recordstop detected:\
(domain $recDomain, cluster $idnnet)" Recordstopact

		postdump_cluster "Edd-MD-Recordstop" "$recDomain"

		# Now clear out the recordstop

		if { [catch { exec hpost -s -Wc }] != 0 } {
			set hpostcode [ lindex $errorCode 2 ]
		}
	} else {
		set MD 0

		logDomainMessage "ERROR: Record stop detected in \
domain $env(SUNW_HOSTNAME)" Recordstopact
		postdump "Edd-Record-Stop"
	
		# Now clear out the recordstop

		if { [catch { exec hpost -s -W }] != 0 } {
			set hpostcode [ lindex $errorCode 2 ]
		}
	}

	# If -W is successful an exit code of 85 is returned; hpostcode
	# stays unset when hpost exits with status 0.

	# Log the hpost error
	if { [info exists hpostcode] && $hpostcode != 85 } {
		switch -exact -- $hpostcode {
		79 {
			set mess \
" 79 -W Unable to determine the current centerplane bus configuration."
		   }
		86 {
			set mess \
" 86 -W successful, except unable to reenable centerplane xmux recording,\
presumably because of a recordstop in a different domain."
		   }
		87 {
			set mess " 87 -W unsuccessful "
		   }
		88 {
			set mess " 88 -W found no recordstop "
		   }
		89 {
			set mess \
" 89 -W found an arbstop or some other error it cannot handle. "
		   }
		default {
			set mess \
" Unknown hpost -W exit code returned $hpostcode"
		   }
		}

		if { $MD == 0 } {
			logDomainMessage "ERROR: Record stop clear \
error $mess, in domain $env(SUNW_HOSTNAME)" Recordstopact
		} else {
			logMessage "ERROR: Record stop clear error \
$mess, in multi-domain $idnnet" Recordstopact
		}
	} else {
		if { $MD == 0 } {
			logMessage "Cleared recordstop state in domain \
$env(SUNW_HOSTNAME)" Recordstopact
		} else {
			logMessage "Cleared recordstop states in the \
cluster($idnid) $idnnet" Recordstopact
		}
	}

	#
	# Release bringup/dr lock
	#
	set lock [ lock_ssp_file "bringup_dr.lock 1" ]
}

#
# sethostname()
#
# The routine used to check the arguments for the action scripts
# checks the board number and sends the message to the log file.
#
proc sethostname { args } \
{
	# Parse the action script arguments and select the domain.
	#
	# The argument list is flattened to blank-separated words.  When a
	# -t option is present, the word after it is used as a sleep time.
	# The second word after "domainName" becomes the domain: it is
	# stored in the global domain and in env(SUNW_HOSTNAME).
	#
	# Returns 0 on success; raises an error, after logging, when no
	# domainName word is present.
	global env domain

	# Braces become blanks and "domainName.<c>" becomes "domainName ".
	regsub -all "\{" $args " " args
	regsub -all "\}" $args " " args
	regsub "\domainName." $args "domainName " args
	set args [ split $args " " ]

	#
	# -t is used by the EDD state machine: it needs to sleep for
	# panic 1 and 2 in order to complete the panic flow.
	#
	set tpos [ lsearch $args -t ]
	if { $tpos != -1 } {
		set delay [ lindex $args [ expr $tpos + 1 ] ]
		catch { exec sleep $delay }
	}

	set npos [ lsearch $args {domainName} ]
	if { $npos == -1 } {
		logMessage "Error : domainName not sent to action script:\
$args"  sethostname
		return -code error
	}

	set domain [ lindex $args [ expr $npos + 2 ] ]
	set env(SUNW_HOSTNAME) $domain
	return 0
}

#
# Arbstopact()
#
# arguments include the e number and the domain name
#  -e { error number } and -d { domain name }
# valid e values are : 56 for arbstop, 74 for cluster_arbstop
#
# For Regular dumps, args are:
#	-d { domainName.0 domainA }
#
# For Cluster dumps, args are:
#	-d { domainName.0 domainA confStarfireIDNnetResidents.0 domainB }
#
# For a arbstop do a dump and reboot the system
# for hpost pass argument -D3 xxxx,<pathname>
#  where xxx are the board numbers.
#
proc Arbstopact { args } \
{
	# Handle an arbstop trap: take a post dump and reboot.
	#
	# args - trap arguments as passed by edd.  When the second argument
	#        holds more than two words the event is handled as a cluster
	#        arbstop and the fourth word is the arbstopped domain;
	#        otherwise it is a single-domain arbstop.
	#
	# Cluster path, in order: dump the cluster, unlink its domains,
	# clear the arbstop state per domain with hpost -Z, find the domains
	# check_host reports as not up, then bring those up again with the
	# arbstopped domain last.  The bringup_dr.lock is held from before
	# the dump until after the check_host pass.
	#
	# Single-domain path: dump under the bringup_dr.lock, then Bringup
	# Long.
	global env clusterDomain bootCpu errorCode
	sethostname $args

	if { ( [ llength [ lindex $args 1 ] ] > 2 ) } {
		#
		# Cluster arbstop
		# Unlink the idnnet, clear ARBSTOP state and reboot all domains

		set arbDomain [ lindex [ lindex $args 1 ] 3 ]
		set idnnet [ get_idn_config $arbDomain ]

		#
		# "try lock" the high-level bringup_dr.lock.
		# (see bugid 4045478).  Note - this lock is platform wide.
		#
		set lock [ lock_ssp_file "bringup_dr.lock 0 " ]

		# Domain cluster arbstop
		# set mibName [ lindex [ lindex $args 1 ] 2 ]

		if { [ llength $idnnet ] > 0 } {
			set idnid  [ lindex [ lindex $idnnet 0 ] 0 ]
			set idnnet [ lindex [ lindex $idnnet 0 ] 1 ]
		} else {
			set idnid  -1
			set idnnet "<unknown>"
		}

		logMessage "MultiDomain Arbstop reboot :\
(domain $arbDomain, cluster($idnid) $idnnet) started" Arbstopact

		# Take a dump of the entire cluster of IDN domains
		# (postdump_cluster also refreshes clusterDomain and bootCpu)
		postdump_cluster "Edd-MD-Arbstop" "$arbDomain"

		#
		# A set of IDN domains based on the SMD regs
		# It should be equal or a subset of $idnnet
		#
		if { [llength $clusterDomain] <= 0 } {
			set arbnet $idnnet
		} else {
			set arbnet $clusterDomain
		}

		#
		# Unlink all domains in one cmd
		# Separate the domains in the SMD register, so the command 
		#"hpost -Z" can succeed.
		# EBUSY=16
		# The loop repeats only while domain_unlink exits with 16.
		set rv 16
		set arglst "-f -E -B -XM"
		while { $rv == 16 } {
			logMessage "Unlink $arbnet of cluster($idnid)" \
								Arbstopact
			if { [catch \
				{eval "exec domain_unlink $arglst $arbnet"} \
				err ] } {
				set rv [lindex $errorCode 2]
				catch { exec sleep 5 }
			} else {
				# unlink all is successful.
				set rv 0
			}
		}
		# unlink2 marks that the combined unlink failed, which enables
		# the per-domain unlink retries below.
		if { $rv != 0 } {
			set unlink2 1
		}

		#
		# Clear Arbstop state.
		# Reset the localDom last
		#
		logMessage "Clearing ARBSTOPS in $arbnet of cluster($idnid)" \
							Arbstopact
		foreach idnDomain $arbnet {

			set env(SUNW_HOSTNAME) $idnDomain

			# If parallel unlink fails, unlink each domain again
			# Separate the domains in the SMD register, so the
			# next command "hpost -Z" can succeed.
			# EBUSY=16
			set rv 16		
			set arglst "-F -E -B -XM"
			while { [info exists unlink2] && ($rv == 16) && \
			   [catch \
			      {eval "exec domain_unlink $arglst $idnDomain"} \
			      err ] } {
				set rv [lindex $errorCode 2]
				catch { exec sleep 5 }
				logMessage "Retry unlink $idnDomain" Arbstopact 
			}

			#
			# Now clear the arbstop state of this domain
			# This also prevents the heartbeat failure
			#
			if { ([info exists bootCpu($idnDomain) ] == 1) && \
			     ($bootCpu($idnDomain) >= 0) && \
			     ($bootCpu($idnDomain) <= 63) } {
				logMessage "Clear ARBSTOPS of $idnDomain" \
					Arbstopact
				catch { exec hpost -Z$bootCpu($idnDomain) }
			}
		}

		#
		# Reboot the domain which is not UP
		# check_host should be within the bringup.dr section. Otherwise,
		# it may be blocked by other domain dump.
		#
		set downnet {}
		foreach idnDomain $arbnet {
			set env(SUNW_HOSTNAME) $idnDomain
			if { [catch {exec check_host -q} err] } {
				set downnet [concat $downnet $idnDomain]
			}
		}

		#
		# Release bringup/dr lock
		#
		set lock [ lock_ssp_file "bringup_dr.lock 1" ]

		#
		# Reboot (Long) all the domains
		# reboot the arbDomain last
		#
		set i [lsearch $downnet $arbDomain]
		if  { $i >= 0 } {
			set idnnet0 [lreplace $downnet $i $i]
			set downnet [concat $idnnet0 $arbDomain]
			unset idnnet0
		}
		logMessage "Rebooting domains $downnet in cluster($idnid)" \
			Arbstopact
		set max_brup 2;  	# Max number of bringup commands .
		set cnt $max_brup;	# counter of bringup cmds
		set bg	"&"; 		# bringup in parallel
		# A batch size outside 2..16 falls back to serial bringups.
		if { $max_brup < 2  || $max_brup > 16 } {
			set max_brup 1
			set cnt 1
			set bg " "	
		}
		foreach idnDomain $downnet {
			logMessage "Reboot domain $idnDomain" Arbstopact

			# Select the domain
			set env(SUNW_HOSTNAME) $idnDomain

			# command args for Bringup Long
			set arglist "-L -F -A on"
			set pathname $env(SSPLOGGER)
			append pathname /$idnDomain/last_edd_bringup.out

			# Number of bringup in batch
			# (bg is "&" for a background bringup, " " for a
			# foreground one through the Bringup proc)
			if { ($idnDomain == $arbDomain) && ($max_brup > 1) } {
				set bg " "
			} elseif { [set cnt [expr $cnt - 1]] > 0 } {
				set bg "&"
			} else {
				set bg " "
			}

			if { $bg != "&" } {
				Bringup Long
				set cnt $max_brup
			} elseif { [ catch \
				{eval "exec bringup $arglist >$pathname $bg"} \
				err ] != 0 } {
				set err [lindex $errorCode 2]
				if { [string length $err] != 0 } {
					set err "Error = $err. "
				}

				logDomainMessage "ERROR: edd-initiated bringup $arglist failed for domain $idnDomain. ${err}" Bringup
			} else {
				# This sleep is optional
				catch { exec sleep 64 }
			}
		}
	} else {
		#
		# Domain arbstop
		#
		logDomainMessage \
		    "Arbstop reboot of domain $env(SUNW_HOSTNAME) started" \
		    Arbstopact
		
		#
		# "try lock" the high-level bringup_dr.lock.
		# (see bugid 4045478).  Note - this lock is platform wide.
		#
		set lock [ lock_ssp_file "bringup_dr.lock 0 " ]

		# Single domain arbstop: dump this domain only

		postdump "Edd-Arbstop"

		#
		# Release bringup/dr lock
		#
		set lock [ lock_ssp_file "bringup_dr.lock 1" ]

		Bringup Long
	}
}

proc Rebootact { args } {
    # Action for a user-requested reboot (reboot trap, e value 60).
    #
    # args - the e number and the domain name:
    #        -e error-number and -d domain-name
    #
    # Selects the domain from args, logs the start of the reboot and
    # performs a fast (Quick) bringup.
    global env

    sethostname $args

    set who rebootact
    logDomainMessage "reboot of domain $env(SUNW_HOSTNAME) started" $who

    Bringup Quick
}


proc ObpResetact { args } {
    # Action for a user-requested OBP reset.
    #
    # args - includes the domain name: -d domain-name
    #
    # Selects the domain from args, logs the start of the reset and
    # performs a fast bringup in Reset mode.
    global env

    sethostname $args

    set who obpresetact
    logDomainMessage "reset of domain $env(SUNW_HOSTNAME) started" $who

    Bringup Reset
}

proc Panicact { args } {
    # Action for a panic timeout: reboot the domain with Bringup Long,
    # unless the domain turns out to be up or a bringup is already
    # running.
    #
    # args - trap arguments; sethostname handles the sleep time and the
    #        domain, and the word following -e is the event type.
    #
    # Event type 61 (panic1): exit if check_host reports 0 or a bringup
    # is active, otherwise take a hostreset dump first.
    # Event type 62 (panic2): exit if check_host -b reports 0, otherwise
    # take a hostreset dump first.
    # Any other or missing event type goes straight to the bringup.
    global env
    set panic1Trap 61
    set panic2Trap 62

    sethostname $args

    # Make sure this domain is down, this is used to cover a special
    # case where a manual bringup might have been started.
    set epos [ lsearch $args -e ]
    if { $epos == -1 } {
	# no event number: handled like an unrecognised event below
	set event_type ""
    } else {
	set event_type [ lindex $args [ expr $epos + 1 ] ]
    }

    if { $event_type == $panic1Trap } {
	logDomainMessage "panic1 timed out. Panic reboot of domain $env(SUNW_HOSTNAME) started" Panicact
	if { [check_host -q] == 0 } {
	    logDomainMessage "check_host says the domain is still up, exiting, $env(SUNW_HOSTNAME)" Panicact
	    return
	}
	if { [ bringup_active ] != 0 } {
	    logDomainMessage "bringup already in progress, exiting, $env(SUNW_HOSTNAME)" Panicact
	    return
	}
	# we are stuck in panic 1, save off what we can
	hostresetdump Panicact
    } elseif { $event_type == $panic2Trap } {
	logDomainMessage "panic2 timed out. Panic reboot of domain $env(SUNW_HOSTNAME) started" Panicact
	if { [check_host -b -q] == 0 } {
	    logDomainMessage "Domain $env(SUNW_HOSTNAME) is up, exiting"  Panicact
	    return
	}
	hostresetdump Panicact
    } else {
	logDomainMessage "Panic reboot of domain $env(SUNW_HOSTNAME) started" Panicact
    }

    Bringup Long
}

proc PanicRebootact { args } {
    # Action for a panic reboot.
    #
    # args - includes -t followed by the amount of time to sleep for,
    #        plus -e error-number and -d domain-name
    #
    # Related trap numbers, kept for reference only:
    #   panicRebootTrap    63
    #   heartBeatFailTrap  64
    #
    # Selects the domain from args, logs the start of the reboot and
    # performs a Long bringup.
    global env

    sethostname $args

    set who PanicRebootact
    logDomainMessage "Panic reboot of domain $env(SUNW_HOSTNAME) started" $who

    Bringup Long
}


proc Environmentact { args } {
    # Action for an environment trap.
    #
    # args - the e number, the domain name and the boards:
    #        -e error-number, -d domain-name and -b boards
    #
    # The trap means that the OS has shut down, so the boards in that
    # domain could be powered off; the board numbers would determine
    # which boards to power down, and whether the whole system should
    # be powered down would also need to be determined.  At present the
    # power-off is disabled and the trap is only logged.
    global env
    set environmentTrap 51

    sethostname $args

    set who environmentact
    logDomainMessage "Environment Trap of domain $env(SUNW_HOSTNAME) received" $who

    # disabled: exec power -B -off
}


proc WatchDogRebootact { args } {
    # Action for a watchdog timeout or redmode event.
    #
    # Steps, in order: select the domain from args, collect reset
    # information through hostresetdump, take a post dump while holding
    # the platform-wide bringup_dr.lock (see bugid 4045478), and finally
    # reboot the domain with a Long bringup.
    global env domain

    sethostname $args

    set who WatchDogRebootact
    logDomainMessage "WatchDog or Redmode reboot of domain $env(SUNW_HOSTNAME) started" $who

    hostresetdump $who

    # "try lock" the high-level bringup_dr.lock around the dump
    set lock [ lock_ssp_file "bringup_dr.lock 0 " ]
    postdump "Edd-WatchDog-Redmode"
    # Release bringup/dr lock
    set lock [ lock_ssp_file "bringup_dr.lock 1 " ]

    Bringup Long
}


#
# PowerFailRebootact is called on SSP system start up and on cbs connect to determine if any
# domains need to be rebooted due to a power failure.
#
# Check if each domain is powered up and booted, exit if this is true.
# If a domain is powered up and not booted exit.
#
# If a domain is not powered up check the ssp_resource file entry to 
# determine if the domain had been booted, power on only those boards in
# the domain and start a reboot.
# 
#
proc PowerFailRebootact { args } {
    # Decide, per monitored domain, whether a power-fail recovery is
    # needed and start it.
    #
    # args - unused.
    #
    # Steps:
    #   1. Read the domain configuration; the platform name is the third
    #      word of its first entry.
    #   2. Find the first line of the platform's edd.erc file that
    #      mentions PowerFailRebootact.  Recovery is skipped when that
    #      line starts with "#cbe_connected" or "#", or contains the
    #      word disable or disabled after its first word.
    #   3. For every domain listed in the <plat>.eddDomainMonitor
    #      resource for which check_host -b -q returns 1, call
    #      powerupandboot.
    #
    # The edd.erc channel is closed as soon as the scan is over, on
    # every path.  Raises an error when the resource cannot be read.
    global env

    #
    # determine how many domains are running
    #
    if { [ catch { get_domain_config } domaininfo ] != 0 } {
	logMessage "No domains found. EXITING " PowerFailRebootact
	return
    }
    set plat [ lindex [ lindex $domaininfo 0 ] 2 ]

    # check if PowerFailRebootact is enabled or disabled in the erc file
    set pathname $env(SSPVAR)
    append pathname /etc/$plat/edd.erc
    set fid [ open $pathname {RDONLY } ]

    set found 0
    set data ""
    set scanfailed [ catch {
	while { [ gets $fid data ] >= 0 } {
	    if { [ lsearch $data PowerFailRebootact ] != -1 } {
		set found 1
		break
	    }
	}
    } scanerr ]
    # Close before acting on the outcome so no path leaks the channel.
    close $fid
    if { $scanfailed } {
	error $scanerr
    }

    if { ! $found } {
	logMessage "ERROR: PowerFailRebootact not found in edd.erc file" PowerFailRebootact
	return
    }

    if { [ string match "#cbe_connected" [ lindex $data 0 ] ] == 1
	    || [ lsearch $data "#" ] == 0
	    || [ lsearch $data disable ] > 0
	    || [ lsearch $data disabled ] > 0 } {
	logMessage "PowerFail recovery is disabled in the edd.erc file" PowerFailRebootact
	return
    }

    set resname $plat.eddDomainMonitor
    if { [ catch { ssp_get_resource $resname } domains ] != 0 } {
	logMessage "Error: Could not find $resname in ssp_resource file " PowerFailRebootact
	return -code error
    }

    if { [ llength $domains ] == 0 } {
	logMessage "None of the domains were booted, Power Fail recovery skipped." PowerFailRebootact  
	return
    }

    # check which domains are booted and powered up.
    foreach domain $domains {
	set env(SUNW_HOSTNAME) $domain
	set host_down [check_host -b -q]
	if { $host_down == 1 } {
	    # This domain was up when we lost power, reboot it
	    logDomainMessage "Domain $domain is being checked for PowerFailReboot " PowerFailRebootact 
	    powerupandboot $domain
	}
    }
    return
}


proc powerupandboot { domain } {
    # Power on a domain that lost power and start its bringup.
    #
    # domain - name of the domain to recover.
    #
    # If "power -q -v -sb" succeeds for at least one board of the
    # domain, this is not treated as a power-fail condition and nothing
    # is done.  Otherwise the domain is powered on, its obp_helper and
    # netcon_server processes are killed, the clock source is set and a
    # bringup is started.  Raises an error when the power on fails.
    global env

    set PWR_SYS_BRD_TABLE "sysBrdGenNum sysBrdGenPower"

    # board list of the domain: second word of the first get_sysbds entry
    set boards [ lindex [ lindex [ get_sysbds $domain ] 0 ] 1 ]

    # use power -v to validate each board's power status
    set powered 0
    foreach board $boards {
	if { [ catch { exec power -q -v -sb $board } ] == 0 } {
	    incr powered
	}
    }
    if { $powered >= 1 } {
	logDomainMessage "Domain $domain powered on, PowerFail recovery skipped"  powerupandboot
	return
    }

    set env(SUNW_HOSTNAME) $domain

    if { [ catch { set handle [ exec power -on -q ] } errmsg ] } {
	logDomainMessage "Power on of domain $domain failed $errmsg" powerupandboot
	return -code error
    }
    logDomainMessage "Power on of domain $domain has been completed, starting bringup" powerupandboot

    #
    # kill obp_helper and netcon_server of this domain
    #
    foreach daemon { obp_helper netcon_server } {
	set pidfile $env(SSPVAR)/pids/$daemon-$env(SUNW_HOSTNAME).pid
	if { [ catch { exec cat $pidfile } daemonpid ] == 0 } {
	    catch { exec kill -s KILL $daemonpid }
	}
    }

    # do sys_clock -m to set the clk source
    # This is a precaution in case the primary control board is board 1
    # and yet sys_clock had a problem setting the clock source when
    # cbs/cbe connected.
    catch { exec sys_clock -m }

    Bringup Standard
}

#
# proc  set_procnums
# Query snmpd for the processors and set the global procnums.  Data format:
# {1.3.6.1.4.1.34.2.1.3.1.9.0 {OCTET STRING} 4.3C-4.2C-4.1C-4.0C-
#  6.3C-6.2C-6.1C-6.0C}
#
proc set_procnums { program } {
    # Query snmpd for the processors of the domain named by
    # env(SUNW_HOSTNAME) and store the raw configuration string in the
    # global procnums.
    #
    # program - name of the calling action, used as the log tag.  For
    #           "HeartBeatFailact" the processors with status G are
    #           additionally logged (Bug 4291084).
    #
    # The configuration string is a dash-separated list of entries of
    # the form board.proc followed by a status letter.  Entries with
    # status G are numbered board*4+proc for the log message.
    #
    # The snmp session is destroyed once the MIB has been read, so
    # calls do not accumulate open sessions.
    #
    # Returns 1 when processors were found; returns 0, after logging an
    # error, when none were found.
    global env
    global procnums

    set procnums ""
    set proc_strg ""
    set handle [ snmp session ]
    $handle walk x "domainIndex domainName" {
	set idx [lindex [lindex $x 0] 2]
	set host [lindex [lindex $x 1] 2]
	if { $host == $env(SUNW_HOSTNAME) } {
	    set procnums [lindex [$handle get domainProcConfig.$idx ] 0]
	    set procnums [lindex $procnums 2]

	    # Only procs with status 'G' can be in this domain
	    set domain_proc_list [split $procnums -]
	    set first_time 0
	    set proc_strg ""
	    foreach i $domain_proc_list {
		set b 0
		set p 0
		set s "R"
		scan $i "%d.%d%s" b p s

		if { $s == "G" } {
		    set good_proc [expr $b*4+$p]
		    if { $first_time != 0 } {
			set proc_strg [format "%d-%s" $good_proc $proc_strg]
		    } else {
			set first_time 1
			set proc_strg [format "%d" $good_proc]
		    }
		}
	    }

	    break
	}
    }

    # The session is no longer needed once the walk is over.
    $handle destroy

    if { $procnums == "" } {
	logDomainMessage "ERROR: can't find processors for $env(SUNW_HOSTNAME)" $program
	return 0
    }

    # log the cpu's on heartbeat failure - Bug 4291084
    if { $program == "HeartBeatFailact" } {
	logDomainMessage "ERROR: heartbeat failure processor list: $proc_strg " $program
    }
    return 1
}

#
# procstatedump
# cause a kernel/processor dump
#
proc procstatedump { program } {
    # Write the processor reset information and the sigblock of the
    # domain's processors to a hostresetdump file.
    #
    # program - name of the calling action, used as the log tag.
    #
    # The dump file lives under env(SSPLOGGER) in the directory of the
    # domain named by env(SUNW_HOSTNAME) and carries a date stamp in its
    # name.  Nothing is dumped when set_procnums finds no processors.
    global env
    global procnums

    # date stamp, with any blank replaced by a dash
    set date [ exec date +%m.%d.%H:%M ]
    regsub -all " " $date "-" date
    set pathname "$env(SSPLOGGER)/$env(SUNW_HOSTNAME)/hostresetdump-$date"

    logDomainMessage "Hostreset of domain $env(SUNW_HOSTNAME) dump file = $pathname" $program

    # Set procnums with processor numbers from the MIB
    if { [ set_procnums $program ] == 0 } {
	# message already logged
	return
    }

    # resetinfo starts the file, the sigblock is appended to it
    if { [ catch { eval "exec sigbcmd -R -P $procnums > $pathname" } ] } {
	logDomainMessage "ERROR: Unable to take dump resetinfo of processors $procnums " $program
    }
    if { [ catch { eval "exec sigbcmd -B -P $procnums >> $pathname" } ] } {
	logDomainMessage "ERROR: Unable to take dump sigblock of processors $procnums " $program
    }
}

#
# HeartBeatFailact is called when a heartbeat failure is detected.
# It will try to cause a kernel/processor dump before bringup happens.
#
proc HeartBeatFailact { args } {
    # Action for a heartbeat failure.
    #
    # Takes a processor state dump, then interrupts the domain with
    # hostint.  When hostint succeeds and check_host afterwards returns
    # non-zero, the proc returns and leaves the domain to reboot.  In
    # every other case the processors are reset through hostresetdump
    # and a HeartbeatLong bringup is started.
    global env

    sethostname $args
    set who HeartBeatFailact

    logDomainMessage "Heartbeat failure detected for domain $env(SUNW_HOSTNAME)" $who

    # Generates dumps before hostint
    procstatedump $who

    logDomainMessage "Interrupting domain $env(SUNW_HOSTNAME) ..." $who
    set hostint_ok [ expr { [ catch { exec hostint } ] == 0 } ]
    if { $hostint_ok } {
	# wait and check if hostint succeeds
	exec sleep 1
	if { [check_host -q] != 0 } {
	    # host is down, waiting to reboot
	    return
	}
    }

    logDomainMessage "ERROR: hostint failed for domain $env(SUNW_HOSTNAME). Attempting hostreset ..." $who
    hostresetdump $who
    logDomainMessage "reboot of domain $env(SUNW_HOSTNAME) started" $who
    Bringup HeartbeatLong
}


proc hostresetdump { program } {
    # Reset the domain's processors where required, then gather the
    # processor states.
    #
    # program - name of the calling action, used as the log tag.
    #
    # A hostreset is forced only for "HeartBeatFailact" and "Panicact",
    # and only on processors whose status letter is G.  The processor
    # dump is skipped for "HeartBeatFailact", which takes it itself
    # before trying hostint.
    global env
    global procnums

    # Set the global variable procnums with processor numbers from the MIB
    if { [ set_procnums $program ] == 0 } {
	# message already logged
	return
    }

    if { $program == "HeartBeatFailact" || $program == "Panicact" } {
	foreach entry [ split $procnums - ] {
	    # defaults used when the entry does not scan completely
	    set b 0
	    set p 0
	    set s "R"
	    scan $entry "%d.%d%s" b p s

	    # Only hostreset a proc if the status is G, that is the
	    # proc is configured as part of the domain
	    if { $s != "G" } {
		continue
	    }

	    set pnum [ expr $b * 4 + $p ]
	    if { [ catch { exec hostreset -p $pnum } ] != 0 } {
		logDomainMessage "ERROR: hostreset failed for processor $pnum" $program
	    }
	    exec sleep 1
	}
    }

    if { $program != "HeartBeatFailact" } {
	procstatedump $program
    }
}

#
# get_smd_idnnet
# Find a list of domains linked with a given domain based on the SMD board mask
#
proc get_smd_idnnet { domain_name } {
    # Find the domains linked with a given domain, based on the SMD
    # board mask.
    #
    # domain_name - domain whose SMD board mask is read.
    #
    # Every domain in the MIB that owns at least one board contained in
    # that mask is added once to the result.
    #
    # Returns the list of domain names; the list is empty when no snmp
    # session can be opened, and holds whatever was collected so far
    # when a MIB read fails.  The snmp session is destroyed on every
    # path once it has been opened.
    #
    # NOTE(review): this file defines get_smd_idnnet a second time
    # further down; Tcl keeps the last definition, so that one is the
    # one in effect after the file is loaded.
    global env

	# default return list
	set smdidnnet {}

	# read the board mask of the domain and convert it from hex to decimal
	set brdmask [ lindex [ get_smd_mask $domain_name ] 0 ]
	set brdmask [ expr 0 + "0x$brdmask" ]

	if { [ catch {set handle [snmp session -community edd ]} errmsg]} {
		logMessage "Unable to acquire snmpd handle: $errmsg" \
						get_smd_idnnet
		return $smdidnnet
	}
	if { [catch {set dnum [ $handle get confNumDomain.0 ] } errmsg ] } {
		logMessage "Error: unable to get confNumDomain from MIB: $errmsg" get_smd_idnnet
		$handle destroy
		return $smdidnnet
	}
	set dnum [lindex [ lindex $dnum 0 ] 2]

	for {set inst 0} {$inst < $dnum} {incr inst} {
		# domain name and board list
		if { [ catch { set dname \
			[ $handle get domainName.$inst ] } errmsg ] } {
			logMessage "Error: unable to get domainName.$inst from MIB: $errmsg" get_smd_idnnet
			$handle destroy
			return $smdidnnet
		}
		if { [ catch { set data \
			[ $handle get domainSysBrdConfig.$inst ] } errmsg ] } {
			logMessage "Error: unable to get domainSysBrdConfig.$inst from MIB: $errmsg" get_smd_idnnet
			$handle destroy
			return $smdidnnet
		}
		set dname [lindex [lindex $dname 0] 2]
		set data [lindex [lindex $data 0] 2]
		set data [split $data -]

		# if brdmask contains any brd of domain, add this domain
		# to the idnnet, unless it is already listed
		foreach brd $data {
			if { [expr $brdmask & [expr 1 << $brd]] != 0 } {
				if { [lsearch $smdidnnet $dname] == -1 } {
					set smdidnnet [concat $smdidnnet $dname]
				}
				break;
			}
		}
	}
	$handle destroy

	return $smdidnnet
}

#
# get_smd_idnnet
# Find a list of domains linked with a given domain based on the SMD board mask
#
proc get_smd_idnnet { domain_name } {
	# Find the domains linked with a given domain, based on the SMD
	# board mask.
	#
	# domain_name - domain whose SMD board mask is read.
	#
	# Every domain in the MIB that owns at least one board contained
	# in that mask is added once to the result.
	#
	# Returns the list of domain names; the list is empty when no snmp
	# session can be opened, and holds whatever was collected so far
	# when a MIB read fails.
	global env

	# result list, returned as is on every error path
	set members {}

	# the mask is read as hex digits and converted to a number
	set hexmask [ lindex [ get_smd_mask $domain_name ] 0 ]
	set mask [ expr 0 + "0x$hexmask" ]

	if { [ catch { set handle [ snmp session -community edd ] } errmsg ] } {
		logMessage "Unable to acquire snmpd handle: $errmsg" \
						get_smd_idnnet
		return $members
	}

	if { [ catch { set reply [ $handle get confNumDomain.0 ] } errmsg ] } {
		logMessage "Error: unable to get confNumDomain from MIB: $errmsg" get_smd_idnnet
		$handle destroy
		return $members
	}
	set ndomains [ lindex [ lindex $reply 0 ] 2 ]

	for { set inst 0 } { $inst < $ndomains } { incr inst } {
		# domain name of this instance
		if { [ catch { set namereply \
			[ $handle get domainName.$inst ] } errmsg ] } {
			logMessage "Error: unable to get domainName.$inst from MIB: $errmsg" get_smd_idnnet
			$handle destroy
			return $members
		}
		# board list of this instance
		if { [ catch { set brdreply \
			[ $handle get domainSysBrdConfig.$inst ] } errmsg ] } {
			logMessage "Error: unable to get domainSysBrdConfig.$inst from MIB: $errmsg" get_smd_idnnet
			$handle destroy
			return $members
		}
		set dname [ lindex [ lindex $namereply 0 ] 2 ]
		set brdlist [ split [ lindex [ lindex $brdreply 0 ] 2 ] - ]

		# the first board found in the mask decides: add the domain
		# unless it is already listed, then stop looking
		foreach brd $brdlist {
			if { [ expr $mask & [ expr 1 << $brd ] ] == 0 } {
				continue
			}
			if { [ lsearch $members $dname ] == -1 } {
				set members [ concat $members $dname ]
			}
			break
		}
	}
	$handle destroy

	return $members
}
