#! /usr/bin/ksh
#
# Copyright 2003-2004 Sun Microsystems, Inc.  All rights reserved.
# Use is subject to license terms.
#
# ident	"@(#)run_reserve.ksh	1.61	04/04/26 SMI"
#
#
# the following invocations are evaluated:
#
# run_reserve	-c node_join
#		-c release_shared_scsi2 -n joining_node
#		-c make_primary -s service_name -C service_class
#		-c make_exclusive -s service_name -C service_class
#		-c primary_to_secondary -s service_name -C service_class
#		-c fence_node -f fenced_node
#		-c fence_node_ng -f fenced_node
#		-c enfailfast_all
#		-c fence_all_nodes
#		-c reset_shared_bus
#
# the following invocations currently require no disk fencing:
#
# run_reserve	-c make_secondary
#		-c add_secondary
#		-c remove_secondary
#
# The options -o -d -a are also currently ignored.
#
# The following invocations are also available:
#
# run_reserve	-c node_join -x
#		(Prevents running of 'release_shared_scsi2' on other cluster
#                nodes.  This is used by scgdevs since scgdevs executes on all
#                nodes.)
#		-c node_join -S
#		(Replaces running 'release_shared_scsi2' on other cluster nodes
#                with running 'node_join -x' on other cluster nodes.  This is
#                used by 'scdidadm -C' to ensure scrubbing may be done when only
#                some nodes are still connected to the device.)
#
#
# This script is called from the rc script /etc/rc2.d/S75MOUNTGFSYS when a node
# is joining the cluster to ensure that the node has access to all attached
# disks.  It is also called during cluster membership reconfigurations in order
# to fence non-cluster nodes from shared disks.  When a node is made the primary
# node, for a device group, this script is responsible for importing the VxVM
# disk group or taking ownership of the SDS diskset.  When a node is demoted to
# being a secondary node, this script deports the VxVM disk group / release the
# SDS diskset.
# 
#
# Transitions:
#
# node_join - Invoked when a node is joining the cluster to ensure that it has
#		access to all attached disks.  The reservation program called by
#		this script will spawn a 'release_shared_scsi2' invocation of
#		this script on all other cluster nodes, so that they may release
#		scsi-2 reservations on disks they share with the joining node.
#		Enables failfast on all devices.
# release_shared_scsi2 - Invoked on all cluster nodes (except the joining node)
# 		by the reservation program during node_join transition. Makes
#		use of clexecd to accomplish this. Responsible for dropping
#		scsi-2 reservations held on disks shared with the joining node.
# make_primary / make_exclusive - Invoked through the HA device services
#		framework when a node is promoted to being the primary node for
#		an HA device group.  Performs sds take and vxvm import
#               operations.
# primary_to_secondary - Invoked through the HA device services framework when a
#		node is demoted from primary to secondary (potential primary).
#		Performs sds release and vxvm deport operations.
# fence_node_ng - Fence all devices shared between this node and the specified
#               node.
# fence_all_nodes - Fence all non-cluster nodes from devices shared with this
#		node.  Not currently used.
# reset_shared_bus - Initiate a scsi-bus reset for all scsi buses shared with a
#		non-cluster node.  This is to ensure that the bus is not wedged
#		attempting to contact a node which has just died.
#
# fence_node -  This option is kept in case a customer upgraded their
#		cluster nodes with the core patch and not the OPS patch. 
#		In that case they are driving fencing from both CMM and 
#		cvmreconfig.
#		cvmreconfig will call this script from step 1, expecting to
#		start the fencing. In this implementation we call reserve -l
#		which will wait for the CMM fencing to complete.
#		
# enfailfast_all - Another left over from cvmreconfig driven fencing. This
#		option is mapped to a no-op operation, and we do a 
#		clean_exit. We can do this because fencing is now driven
#		out of CMM. The reason we need this is for the scenario where
#		the customer upgraded the corepatch but not the OPS patch.
#		


# scratch file used to hold a snapshot of the mount table
mount_file=/tmp/reservation_mountfile_$$

# helper programs
resv_prog=/usr/cluster/lib/sc/reserve
NAS_dir=/usr/cluster/lib/sc/nas
NWS_RECONFIG=/usr/opt/SUNWesm/cluster/sbin/reconfig

# state parsed from the command line
command=
service_name=
service_class=
joining_node=
fenced_node=
opt_arg=

# presence of this file means a boot during upgrade; fencing is skipped
upgrade_flag_file=/scnoreservedisks

# retry policy for vxdg / metaset operations
retry_num=3
retry_interval=5

# flags telling us to back out a VxVM import / SDS take if a later
# step of the transition fails
needVxVMdeport_on_error=0
needSDSrelease_on_error=0

# locate the metaset binary; stays NOT_CONFIGURED when SDS is absent
metaset_path=NOT_CONFIGURED
if [ -x /usr/sbin/metaset ]
then
	# Solaris 8 location
	metaset_path=/usr/sbin/metaset
elif [ -x /usr/opt/SUNWmd/sbin/metaset ]
then
	# Solaris 7 location
	metaset_path=/usr/opt/SUNWmd/sbin/metaset
fi

###########
# functions
###########
# Common exit path: discard the scratch copy of the mount table, then
# terminate the script with the status given in $1.
clean_exit ()
{
	rm -f ${mount_file}
	exit ${1}
}

# check return value of a command, exiting if it is non-zero
#
# Globals read:	retval  - exit status of the command just run
#		command - transition name, used in the error message
#		service_name, metaset_path - used by the backout logic
#		needVxVMdeport_on_error    - non-zero after a VxVM import
#		needSDSrelease_on_error    - non-zero after an SDS take
# Arguments:	$1 - name of the failed command (for the error message)
#
# If $retval is non-zero: report the failure, back out any VxVM import or
# SDS take done earlier in this transition, and exit the script with
# $retval.  If the backout itself fails, the node is halted (see below).
# If $retval is zero this function is a no-op.
chk_retval_exit ()
{
	if [ $retval != 0 ]
	then
		echo `gettext "$0:  $1 failed during $command, returned $retval"`

		# backout VxVM import / SDS take if needed
		if [ $needVxVMdeport_on_error != 0 ]
		then
			#
			# If deport/release fails, the node is halted lest the
			# disk group should end up being imported on two nodes.
			# If failback is not enabled, we should reboot the node
			# instead. Filing a separate bug for that.
			#

			/usr/sbin/vxdg deport $service_name
			retval1=$?
			if [ $retval1 != 0 ]
			then
				echo `gettext "Fatal error: could not deport VxVM diskgroup $service_name. Halting node."`
				echo `gettext "Fatal error: could not deport VxVM diskgroup $service_name. Halting node."` >> /dev/console
				halt
			fi
		fi
		if [ $needSDSrelease_on_error != 0 ]
		then
			# same rationale as the VxVM case above: halt rather
			# than risk the diskset being owned by two nodes
			$metaset_path -C release -s $service_name
			retval1=$?
			if [ $retval1 != 0 ]
			then
				echo `gettext "Fatal error: could not release SDS diskset $service_name. Halting node "`
				echo `gettext "Fatal error: could not release SDS diskset $service_name. Halting node "` >> /dev/console
				halt
			fi
		fi

		clean_exit $retval
	fi
}



#
# Check mount command to see if there are any local filesystems still
# mounted on the device group.  Return 0 if mounted local
# filesystems are found, 1 otherwise.  An argument is passed in,
# specifying which class of device group to check for, /dev/global/
# (rawdisk), /dev/vx/ (SUNWvxvm), or /dev/md/ (SUNWmd).
#
# Globals read:	service_name - the device group being checked
#		mount_file   - scratch file for the mount table snapshot
# Globals set:	retval       - checked via chk_retval_exit on each step
#
chk_for_mounted_filesystems()
{
	# snapshot the mount table; Solaris format is
	# "<mntpoint> on <device> <options>" per line
	mount > $mount_file
	retval=$?
	chk_retval_exit mount

	while read mntpoint on device options
	do
		# look for mounts on specified device group class
		echo $device | grep $1 > /dev/null 2>&1
		retval=$?
		if [ $retval = 1 ]
		then
			# not the type we are looking for
			continue
		fi
		chk_retval_exit grep

		# map the mounted device onto its device group name
		case $1 in
		"/dev/global/")
			# grab d- part of /dev/global/dsk/d-s-
			device=`basename $device`
			device=`echo $device | sed 's/s.//'`
			# see which rawdisk device group this device belongs to
			# (bug fix: a stray '"' used to break this substitution)
			mountdg=`/usr/cluster/lib/dcs/dgconv -d $device`
			retval=$?
			chk_retval_exit dgconv
			;;
		"/dev/vx/")
			mountdg=`dirname $device`
			mountdg=`basename $mountdg`
			;;
		"/dev/md/")
			mountdg=`dirname $device`
			mountdg=`dirname $mountdg`
			mountdg=`basename $mountdg`
			;;
		*)
			echo Illegal input to chk_for_mounted_filesystems - $1
			retval=1
			chk_retval_exit chk_for_mounted_filesystems
			;;
		esac

		if [ $mountdg != $service_name ]
		then
			# not our device group
			continue
		fi

		# skip global mounts; only local mounts block the transition
		echo $options | grep -v global > /dev/null 2>&1
		retval=$?
		if [ $retval = 1 ]
		then
			# is a global mount
			continue
		fi
		chk_retval_exit grep

		# found a local filesystem mounted on this device group
		return 0
	done < $mount_file

	return 1
}



# Shared helper for make_primary / primary_to_secondary: refuse to shut
# down the device group $service_name while local (non-global) filesystems
# of class $1 are still mounted on it.  Exits the script with status 1 if
# any such mounts are found; otherwise returns quietly.
do_local_mount_chk ()
{
	chk_for_mounted_filesystems $1
	retval=$?
	if [ $retval -eq 0 ]
	then
		echo File systems still mounted on device group $service_name, unable to shutdown device group.

		echo File systems still mounted on device group $service_name, unable to shutdown device group. >> /dev/console
		echo Please unmount file systems and re-try operation. >> /dev/console
		clean_exit 1
	fi
}



##############
# script start
#####################################################
#
# This script depends on an ASCII collating
# sequence for checking things like legal node names,
# adapter names, and junction names.  There are no dependencies
# on the collating sequence for inspecting, sorting, or massaging any
# data which might be internationalized.  Therefore,
# the collating sequence locale is forced to the 'C' locale.
#
#####################################################
typeset -x TEXTDOMAIN=SUNW_SC_CMD
typeset -x TEXTDOMAINDIR=/usr/cluster/lib/locale

LC_COLLATE=C
export LC_COLLATE

##########################
# get command line options
##########################
while getopts c:s:C:o:d:a:n:f:xmS name
do
	case $name in
	c)	command="$OPTARG"
		;;
	s)	service_name="$OPTARG"
		;;
	C)	service_class="$OPTARG"
		;;
	n)	joining_node="$OPTARG"
		;;
	f)	fenced_node="$OPTARG"
		;;
	x)	# suppress calling of release_shared_scsi2 during
		# calls from scgdevs
		opt_arg="-x "$opt_arg
		;;
	m)	# turn on debug messages
		opt_arg="-m "$opt_arg
		;;
	S)	# replace calling of release_shared_scsi2 with
		# node_join -x during calls from scdidadm -C
		opt_arg="-S "$opt_arg
		;;
	o|d|a)	# passed in by the dcs but currently unused; may be
		# used later to optimize some things
		;;
	?)	echo $0:  `gettext "illegal command line option"`
		clean_exit 1
		;;
	esac
done

############################
# check command line options
############################
if [ -z "$command" ]
then
	echo $0:  `gettext "command not specified"`
	clean_exit 1
fi

# for these state transitions the reservation program need do nothing
case $command in
make_secondary|add_secondary|remove_secondary)
	clean_exit 0
	;;
esac

# make_exclusive is handled exactly like make_primary
if [ "$command" = make_exclusive ]
then
	command=make_primary
fi

# make_primary / primary_to_secondary need a valid class and a name
if [ "$command" = make_primary ] || [ "$command" = primary_to_secondary ]
then
	if [ -z "$service_class" ]
	then
		echo $0:  `gettext "service_class not specified"`
		clean_exit 1
	else
		case $service_class in
		DISK|SUNWmd|SUNWvxvm)
			;;
		*)
			echo $0:  `gettext "illegal service_class:"`  $service_class
			clean_exit 1
			;;
		esac
	fi

	if [ -z "$service_name" ]
	then
		echo $0:  `gettext "service_name not specified"`
		clean_exit 1
	fi
fi

# release_shared_scsi2 must be told which node is joining
if [ "$command" = release_shared_scsi2 ] && [ -z "$joining_node" ]
then
	echo $0:  `gettext "joining_node not specified"`
	clean_exit 1
fi

# fence_node_ng must be told which node to fence
if [ "$command" = fence_node_ng ] && [ -z "$fenced_node" ]
then
	echo $0:  `gettext "fenced_node not specified"`
	clean_exit 1
fi

#################
# perform command
#################
# Each branch below handles one -c transition; unknown commands fall
# through to the error arm at the bottom.

######################
# release_shared_scsi2
######################
if [ $command = release_shared_scsi2 ]
then
	echo `gettext "releasing reservations for scsi-2 disks shared with"` $joining_node

	#
	# Wait for any fencing to complete in case a node has rejoined
	# the cluster before fencing against it has completed.  No need
	# to check the return value here, if fencing really is still in
	# progress, the worst that can happen is the node gets fenced
	# again after we give him access.  If fencing is no longer
	# running, then we want to go ahead and clear any devices that
	# were fenced before fencing ran into problems.
	#
	$resv_prog -c check_lock -n FENCE_LOCK

	# call reservation program to drop the scsi-2 reservations
	$resv_prog -c $command -j $joining_node $opt_arg -h `hostname`
	retval=$?
	chk_retval_exit reservation_program
###########
# node_join
###########
elif [ $command = node_join ]
then
	echo `gettext "obtaining access to all attached disks"`

	#
	# If this is a boot during upgrade simply return success
	# we will be run later when the system is ready.
	# This prevents us from fencing off active SC2.2 nodes.
	#
	if [ -f $upgrade_flag_file ]
	then
		clean_exit 0
	fi

	#
	# No need to check to make sure fencing is complete here, since we did
	# that during release_shared_scsi2 and node_join will not get called
	# until release_shared_scsi2 has completed on all nodes.
	#

	# unfence SCSI devices (in the background, in parallel with NAS)
	$resv_prog -c $command -h `hostname` $opt_arg &

	# unfence NAS devices
	for nas_exec in "$NAS_dir"/*
	do
		if [ -x $nas_exec ]
		then
			$nas_exec -c node_join -h `hostname` &
		fi
	done

	# wait for unfencing to complete
	wait
##############
# make_primary
##############
elif [ $command = make_primary ]
then
	echo `gettext "becoming primary for"` $service_name

	#
	# check for fencing-in-progress - this will return when all fencing is
	# complete or when the CMM timeout has been met, at which point the
	# node(s) being fenced will have brought themselves down and it is safe
	# to proceed
	#
	$resv_prog -c check_lock -n FENCE_LOCK
	retval=$?
	if [[ $retval != 0 && $retval != 1 ]]
	then
		chk_retval_exit reservation_program_check_lock
	fi

	if [ $service_class = SUNWmd ]
	then
		# tell SUNWmd to take this volume
		$metaset_path -C take -f -s $service_name
		retval=$?
		if [ $retval != 0 ]
		then
			# these failures result in read-only ownership
			if [ $retval = 66 ]
			then
				echo `gettext "Stale database for diskset"` $service_name
			elif [ $retval = 2 ]
			then
				echo `gettext "Tagged data encountered for diskset"` $service_name
			elif [ $retval = 3 ]
			then
				echo `gettext "Only 50% replicas and 50% mediator hosts available for diskset"` $service_name
			fi

			#
			# XXX if it's not one of the above errors, we should
			# really retry the metaset -C take
			#

			# make sure we haven't taken read-only ownership
			$metaset_path -s $service_name -C release

			# exit using the failed take's return value
			clean_exit $retval
		fi

		# at this point we have taken the diskset; arrange for the
		# take to be backed out if a later step fails
	       	needSDSrelease_on_error=1
	elif [ $service_class = SUNWvxvm ]
	then
		/usr/sbin/vxdg -C -t -f import $service_name
		retval=$?

		#
		# error 12 = diskgroup already imported, this is not an error.
		# This can occur if this is a make_primary after a
		# primary_to_secondary failure during which the disk group was
		# not deported.
		#
		if [ $retval = 12 ]
		then
			retval=0
		fi

		# retry the import up to $retry_num times before giving up
		how_many=0
		while [ $retval != 0 ]
			do
		       	if [ $how_many = $retry_num ]
		       	then
			       	chk_retval_exit /usr/sbin/vxdg
		       	fi
		       	echo `gettext "$0:  /usr/sbin/vxdg failed during $command, returned $retval, will retry in $retry_interval seconds"`
		       	sleep $retry_interval
		       	/usr/sbin/vxdg -C -t -f import $service_name
		       	retval=$?
		       	how_many=$(($how_many + 1))
	       	done

	       	# at this point we have imported disk group; arrange for the
	       	# import to be backed out if a later step fails
	       	needVxVMdeport_on_error=1

		# enable all volumes for this disk group
		/usr/sbin/vxrecover -g $service_name -s -b
		retval=$?
		how_many=0
		while [ $retval != 0 ]
		do
			if [ $how_many = $retry_num ]
			then
				chk_retval_exit /usr/sbin/vxrecover
			fi
			echo `gettext "$0:  /usr/sbin/vxrecover failed during $command, returned $retval, will retry in $retry_interval seconds"`
			sleep $retry_interval
			/usr/sbin/vxrecover -g $service_name -s -b
			retval=$?
			how_many=$(($how_many + 1))
		done
	fi
	# notify the Network Storage reconfig hook, if installed
	if [ -x "$NWS_RECONFIG" ]
	then
		$NWS_RECONFIG start $service_name
	fi

######################
# primary_to_secondary
######################
elif [ $command = primary_to_secondary ]
then
	echo `gettext "no longer primary for"` $service_name

	# notify the Network Storage reconfig hook, if installed
	if [ -x "$NWS_RECONFIG" ]
	then
		$NWS_RECONFIG stop $service_name
	fi

	if [ $service_class = DISK ]
	then
		# check for locally mounted filesystems
		do_local_mount_chk /dev/global/
	elif [ $service_class = SUNWmd ]
	then
		# check for locally mounted filesystems
		do_local_mount_chk /dev/md/

		# tell SUNWmd to release this volume (retried up to
		# $retry_num times before giving up)
		$metaset_path -C release -s $service_name
		retval=$?
		how_many=0
		while [ $retval != 0 ]
		do
			if [ $how_many = $retry_num ]
			then
				chk_retval_exit $metaset_path
			fi
			echo `gettext "$0:  $metaset_path failed during $command, returned $retval, will retry in $retry_interval seconds"`
			sleep $retry_interval
			$metaset_path -C release -s $service_name
			retval=$?
			how_many=$(($how_many + 1))
		done
	elif [ $service_class = SUNWvxvm ]
	then
		# check for locally mounted filesystems
		do_local_mount_chk /dev/vx/

		# deport the disk group (retried up to $retry_num times)
		/usr/sbin/vxdg deport $service_name
		retval=$?
		how_many=0
		while [ $retval != 0 ]
		do
			if [ $how_many = $retry_num ]
			then
				chk_retval_exit /usr/sbin/vxdg
			fi
			echo `gettext "$0:  /usr/sbin/vxdg failed during $command, returned $retval, will retry in $retry_interval seconds"`
			sleep $retry_interval
			/usr/sbin/vxdg deport $service_name
			retval=$?
			how_many=$(($how_many + 1))
		done
	fi
###########
# fence_node_ng
###########
elif [ $command = fence_node_ng ]
then
	echo `gettext "fencing node $fenced_node from shared devices"`

	# fence SCSI devices (in the background, in parallel with NAS)
	$resv_prog -c fence_node -h `hostname` -f $fenced_node $opt_arg &

	# fence NAS devices
	for nas_exec in "$NAS_dir"/*
	do
		if [ -x $nas_exec ]
		then
			$nas_exec -c fence_node -h `hostname` -f $fenced_node &
		fi
	done

	# wait for fencing to complete
	wait

	# release fencing lock
	$resv_prog -c release_lock -n FENCE_LOCK.$fenced_node
	retval=$?
	if [[ $retval != 0 && $retval != 1 ]]
	then
		#
		# Halt the node since not being able to release the lock is
		# probably a bug or an indication of a serious cluster problem.
		#
		echo `gettext "$0:  failed to release fencing lock during $command, returned $retval, halting node"`
		echo `gettext "$0:  failed to release fencing lock during $command, returned $retval, halting node"` >> /dev/console
		halt
	fi
###########
# fence_node (legacy: cvmreconfig-driven fencing, see header comments)
###########
elif [ $command = fence_node ]
then
	echo `gettext "fencing node $fenced_node from shared devices"`

	# call reservation program; -l waits for CMM-driven fencing
	$resv_prog -l $opt_arg
	retval=$?
	chk_retval_exit reservation_program
###########
# reset_shared_bus
###########
elif [ $command = reset_shared_bus ]
then
	echo `gettext "resetting scsi buses shared with non-cluster nodes"`

	# call reservation program
	$resv_prog -c $command -h `hostname` $opt_arg
	retval=$?
	chk_retval_exit reservation_program
###########
# fence_all_nodes
###########
elif [ $command = fence_all_nodes ]
then
	echo `gettext "fencing shared devices from non-cluster nodes"`

	# call reservation program
	$resv_prog -c $command -h `hostname` $opt_arg
	retval=$?
	chk_retval_exit reservation_program
###########
# enfailfast_all, provided for backward compatibility (no-op; fencing is
# now driven out of CMM, see header comments).
###########
elif [ $command = enfailfast_all ]
then
	clean_exit 0
###########
# illegal command
###########
else
	echo $0:  `gettext "illegal command specification:"`  -c $command
	clean_exit 1
fi

clean_exit 0
