#! /bin/ksh

#
# Copyright (c) 2000 by Sun Microsystems, Inc.
#

#
# @(#)dsmcmn.sh 1.15 00/06/08 SMI
#

#
# Script file which provides common functions used by the other scripts
#

umask 177;			# set only user's rw permissions

DEBUG=0;			# set this to 1 here for debugging

PKG_NAME=SUNWbdm;

#
# Store only the script name, not the complete path or the trailing .sh.
# Only a literal ".sh" suffix is stripped, so a name that merely contains
# "sh" somewhere (flush.sh, dsmsh_x.sh, ...) is left intact.
#
SCRIPT=${0##*/};
SCRIPT=${SCRIPT%.sh};

#
# Derive and store the DSM_DIR: $0 with a trailing "scripts/<SCRIPT>.sh"
# removed. The "/" in front of "scripts" is kept, so the value normally
# ends in "/". If $0 does not end in "scripts/<SCRIPT>.sh" it is stored
# unchanged.
#
DSM_DIR=${0%scripts/${SCRIPT}.sh};

#
# L10N directory of bdm messages
#
# bugID:4343375
# gettext looks up the TEXTDOMAINDIR environment variable to find the
# localized messages. It is expected to have been set already (in the
# init_i18n() routine) to the localization path of the bdm messages; keep a
# copy here so that it can be put back after the SDS lookups below.
#
L10N_DSM_DIR="${TEXTDOMAINDIR}";

#
# DSM log directory (for dsmlog, deleted_metadevices, and *tmp* files)
#
DSM_LOG_DIR="/var/opt/${PKG_NAME}";

#
# DSM config directory (for dsm.cf, md.cf, disk.cf, dsmdisk.inf)
#
DSM_CFG_DIR="/etc/opt/${PKG_NAME}";

#
# Message domain of the DSM scripts; hardcoded for lack of a better way.
#
# bugID:4343375
# fix the DSM domain name
#
DSM_DOMAIN="SUNW_NSP_NETRA_BDM";

#
# Fetch the 'Maintenance' and 'Last Erred' strings from SDS's msg files.
#
# bugID:4343375
# The SDS localized msgs live in the default /usr/lib/locale while the bdm
# messages live in the L10N bdm directory, so TEXTDOMAINDIR is pointed at the
# default location for the two SUNW_MD lookups and then restored to the bdm
# L10N directory saved above.
#
TEXTDOMAINDIR="/usr/lib/locale";
_maintenance=$(gettext SUNW_MD "Maintenance");
_last_erred=$(gettext SUNW_MD "Last Erred");
TEXTDOMAINDIR="${L10N_DSM_DIR}";

#
# State strings that the functions in this file search for in
# ${DSM_LOG_DIR}/dsmlog.
#
_compensating_actions_initiated="compensating actions initiated";
_compensating_actions_completed="compensating actions completed";
_reintegrating_actions_initiated="reintegrating actions initiated";
_reintegrating_actions_not_completed="reintegrating actions not completed";
_reintegrating_actions_completed="reintegrating actions completed";

#
# bugID: 4469335
# Default interval, in seconds, for the status checking done by dsmadd.
# The automated functions that accept an interval on their command line
# overwrite this variable; when dsmadd is run standalone this default is
# the value it will use.
#
_interval=60;


# ********************** FUNCTION DEFINITIONS *******************************

#
# Purpose:	Write a message to the terminal and forward the same text to
#		the system log through logger(1).
# Invocation:	Echo <level> <message string>
# Assumptions:	/etc/syslog.conf file has a (selector,action) entry for
#		<facility>.<level>. See syslog.conf(4) for more details.
# Returns:	Nothing meaningful; the exit status is that of logger(1).
#
Echo()
{
	# terminal copy
	echo "$2";

	#
	# syslog copy, tagged with the package name. The facility is taken
	# from DSMFACILITY; when that is unset or empty, daemon is used.
	#
	logger -p "${DSMFACILITY:-daemon}.$1" -t ${PKG_NAME} "$2";
}


#
# Purpose:	Emit a debug message through Echo() at level "debug", but
#		only while the DEBUG variable is equal to 1.
# Invocation:	debug_echo <message string>
# Assumptions:	None.
# Returns:	Nothing.
#
debug_echo()
{
	# debugging switched off: stay silent
	if (( DEBUG != 1 ))
	then
		return 0;
	fi

	Echo "debug" "$1";
}


#
# Purpose:	check the present working directory, and if not equal to the
#		scripts directory below DSM_DIR, change to it
# Invocation:	check_cur_dir;
# Assumptions:	DSM_DIR should have been set earlier.
# Returns:	exit status = 0 if the working directory already was, or now
#		is, the scripts directory.
#		exit status = 1 if the cd to the scripts directory failed.
#
check_cur_dir()
{
	typeset _target _rval;

	#
	# DSM_DIR normally ends in "/"; drop one trailing slash so that the
	# comparison with PWD can actually match.
	#
	_target="${DSM_DIR%/}/scripts";
	_rval=0;

	# both sides quoted: a directory name may contain whitespace
	if [ "$PWD" != "$_target" ]
	then
		cd "$_target" || _rval=1;
	fi
	debug_echo "${SCRIPT}: DEBUG: Working directory: `pwd`";

	return $_rval;
}



#
# Purpose:	to update the disk configuration file created earlier with the
#		new serial number for the replaced disk.
# Invocation:	update_disk_config <disk> <disk config file>.
# Assumptions:	No error/range checking is done on the arg passed, as such
#		they are assumed to be correct.
#		The serial number is the third blank-separated word of the
#		'Serial Number' line printed by luxadm inquiry.
# Returns:	nothing
#
update_disk_config()
{
	typeset _disk _file _new_serialnum _entry _tmpfile;

	_disk="$1";
	_file="$2";
	_tmpfile="${DSM_LOG_DIR}/udctmp1";

	_new_serialnum=`luxadm inquiry "/dev/rdsk/${_disk}s2" | \
			grep 'Serial Number' | sed 's/  */ /g' | \
			cut -d' ' -f3`;

	# first two fields of the disk's current entry are carried over
	_entry=`grep "$_disk" "$_file" | cut -d ' ' -f1,2`;

	#
	# replace the serial number entry in the file specified with
	# the new serial number: every other line is copied unchanged and
	# the rebuilt entry is appended at the end. All paths are quoted so
	# that a file or directory name containing whitespace still works.
	#
	grep -v "$_disk" "$_file" > "$_tmpfile";
	echo "$_entry $_new_serialnum" >> "$_tmpfile";
	mv "$_tmpfile" "$_file";
}


#
# Purpose:	verify that both md.cf and disk.cf exist in DSM_CFG_DIR and
#		are non-empty.
# Invocation:	check_config_files
# Assumptions:	none.
# Returns:	0 if both files exist and are non-empty. Otherwise an error
#		is reported through Echo() and the script exits with
#		status 1.
#
check_config_files()
{
	typeset _cf;

	for _cf in md.cf disk.cf
	do
		if [[ ! -s ${DSM_CFG_DIR}/${_cf} ]]
		then
			# one missing or empty file is enough to give up
			Echo "err" "${SCRIPT}: `gettext $DSM_DOMAIN 'Configuration files have not been created or are empty. Run dsmcfg.'`";
			exit 1;
		fi
	done

	return 0;
}



#
# Purpose:	check state database replicas on the specified disk by
#		invoking metadb(1M). A replica counts as bad when its
#		metadb line for this disk contains one of the letters
#		W, M, D, R or F.
# Invocation:	check_sdr <disk>
# Assumptions:	None
# Returns:	exit status = 0 if all sdr's are ok
#		exit status = 1 if atleast one sdr is bad on this disk
#
check_sdr()
{
	typeset _disk _bad;

	_disk="$1";

	debug_echo "${SCRIPT}: DEBUG: checking state database replicas for $_disk";

	# number of metadb lines for this disk that carry one of the letters
	_bad=`metadb | grep $_disk | grep -c '[WMDRF]'`;
	if (( _bad > 0 ))
	then
		return 1;	# sdr(s) on this disk is/are bad
	fi

	return 0;		# all sdr's on this disk are ok
}


#
# Purpose:	check the submirrors on the specified disk by invoking
#		metastat(1M) on every metadevice that md.cf lists for the
#		disk. The metastat output is searched (ignoring case) for
#		the 'Last Erred' and 'Maintenance' strings.
# Invocation:	check_sm <disk>
# Assumptions:	None
# Returns:	exit status = 0 if all submirrors are ok
#		exit status = 1 if atleast one submirror is in maintenance state
#		on this disk, or if all submirrors look ok but dsmlog
#		mentions the disk on exactly one line.
#		exit status = 2 if atleast one submirror is in Last Erred state
#		on this disk.
#		Checking stops at the first submirror that is not ok.
#
check_sm()
{
	typeset _disk _sm_list _name _report _hits _status;

	_disk="$1";
	_status=0;			# nothing wrong found so far

	# first field of every md.cf line that mentions the disk
	_sm_list=`grep $_disk ${DSM_CFG_DIR}/md.cf | cut -d' ' -f1`;

	for _name in $_sm_list
	do
		debug_echo "${SCRIPT}: DEBUG: checking $_name on $_disk";
		_report=`metastat $_name`;

		# Last Erred is looked for first
		_hits=`echo "$_report" | grep -ci "$_last_erred"`;
		if (( _hits > 0 ))
		then
			_status=2;
			break;
		fi

		_hits=`echo "$_report" | grep -ci "$_maintenance"`;
		if (( _hits > 0 ))
		then
			_status=1;
			break;
		fi
	done

	#
	# Compensating actions may not have completed in a previous
	# invocation: when everything looks ok but dsmlog is non-empty and
	# mentions the disk on exactly one line, report maintenance.
	#
	if (( _status == 0 )) && [[ -s ${DSM_LOG_DIR}/dsmlog ]]
	then
		_hits=`grep -c $_disk ${DSM_LOG_DIR}/dsmlog`;
		if (( _hits == 1 ))
		then
			_status=1;
		fi
	fi

	return $_status;
}


#
# Purpose:	check if state database replicas and submirrors on the
#		specified disk are ok, if not invoke dsmrm.sh script
#		which will initiate compensating actions.
# Invocation:	check_and_compensate_sdrsm <disk to be checked>;
# Assumptions:	No error/range checking is done on the arg passed, as such
#		they are assumed to be correct.
# Returns:	exit status = 0, if all replicas and submirrors are ok.
#		exit status = 1, if replicas and/or submirrors are not ok, BUT
#		dsmrm.sh, which was invoked, completed successfully.
#		exit status = 2, if the replicas and/or submirrors are not ok,
#		and dsmrm.sh, which was invoked, terminated with errors.
#		exit status = 3, if one of the submirrors is in Last Erred
#		state (and the replicas are ok); dsmrm.sh is not invoked.
#		In every case control returns to the calling code.
#
check_and_compensate_sdrsm()
{
	typeset _disk _sdr_stat _sm_stat;

	_disk="$1";

	check_sdr $_disk;		# replicas:   0 => ok, 1 => bad
	_sdr_stat="$?";

	check_sm $_disk;		# submirrors: 0 => ok, 1 => maintenance,
	_sm_stat="$?";			#             2 => Last Erred

	#
	# dsmrm.sh is only for bad replicas or submirrors in maintenance
	# state. Deal with the cases that do not need it first.
	#
	if (( _sdr_stat != 1  &&  _sm_stat != 1 ))
	then
		if (( _sm_stat == 2 ))
		then
			# Last Erred is a double fault, not handled here
			debug_echo "${SCRIPT}: DEBUG: At least one submirror on $_disk is in Last Erred state. Will not do anything in this function."
			Echo "crit" "${SCRIPT}: `gettext $DSM_DOMAIN 'Double fault condition - both disks show failure.'`";
			return 3;
		fi

		debug_echo "${SCRIPT}: DEBUG: state database replicas and submirrors are ok on disk $_disk";
		return 0;
	fi

	#
	# execute the dsmrm.sh script with the failed disk as the arg
	#
	debug_echo "${SCRIPT}: DEBUG: At least one state database replica and/or submirror on $_disk is bad, going to invoke dsmrm.sh";
	Echo "crit" "${SCRIPT}: ${_disk}: `gettext $DSM_DOMAIN 'Disk failure detected.'`";
	if dsmrm.sh -d $_disk
	then
		return 1;		# dsmrm.sh completed successfully
	fi

	return 2;			# dsmrm.sh terminated with error
}



#
# Purpose:	poll the state of the state database replicas and submirrors
#		of the two disks listed in disk.cf by calling
#		check_and_compensate_sdrsm() for each disk in turn. As long
#		as both disks are ok the function sleeps for the poll
#		interval and checks again; it stops at the first non-zero
#		status (the called function has then already dealt with
#		invoking the compensating action script).
# Invocation:	check_and_compensate_metadevices <poll interval>;
# Assumptions:	No error/range checking is done on the arg passed, as such
#		they are assumed to be correct.
# Returns:	exit status = 0, if check_and_compensate_sdrsm() returned 1,
#		i.e. replica(s) or submirrors on a disk were bad, and
#		dsmrm.sh completed successfully.
#		exit status = 1, if check_and_compensate_sdrsm() returned
#		any other non-zero status, e.g. dsmrm.sh terminated with
#		error(s).
#
check_and_compensate_metadevices()
{
	typeset _first_disk _second_disk _interval _disk _exit_stat;

	_interval="$1";

	# first field of the first and of the second line of disk.cf
	_first_disk=`cut -d' ' -f1 ${DSM_CFG_DIR}/disk.cf | paste -d' ' -s - | \
		     cut -d' ' -f1`;
	_second_disk=`cut -d' ' -f1 ${DSM_CFG_DIR}/disk.cf | paste -d' ' -s - | \
		     cut -d' ' -f2`;

	debug_echo "${SCRIPT}: DEBUG: Will start checking metadb(1M) for state database replicas, and metastat(1M) for each submirror";

	while true
	do
		for _disk in "$_first_disk" "$_second_disk"
		do
			check_and_compensate_sdrsm $_disk;
			_exit_stat="$?";
			if (( _exit_stat != 0 ))
			then
				# replica(s) and/or submirror(s) not ok:
				# leave both loops
				break 2;
			fi
		done

		debug_echo "${SCRIPT}: DEBUG: `date` sleeping for $_interval seconds";

		sleep "$_interval";
	done

	if (( _exit_stat != 1 ))
	then
		return 1;	# compensating action script terminated with
				# error(s)
	fi

	return 0;		# compensating action script completed
				# successfully
}


#
# Purpose:	to determine if compensating actions state (and if applicable,
#		monitor state) should indeed be entered. The decision is
#		taken from the contents of ${DSM_LOG_DIR}/dsmlog: if any one
#		of the four conditions listed in the code holds, the state
#		can be entered, else no.
# Invocation:	enter_compensating_actions
# Assumptions:	none
# Returns:	exit status = 0 implies the compensating actions (preceded by
#		monitor code, if applicable) can be done.
#		exit status = 1, no - just skip this state
#
enter_compensating_actions()
{
	typeset _log _ca_begun _ca_done _ra_failed _ra_done;

	_log=${DSM_LOG_DIR}/dsmlog;

	#
	# a. dsmlog doesn't exist or is empty (this is the first time we are
	#    compensating)
	#
	if [[ ! -s $_log ]]
	then
		return 0;
	fi

	#
	# b. compensating started (in a previous invocation) but not completed
	#
	_ca_begun=`grep -c "$_compensating_actions_initiated" $_log`;
	_ca_done=`grep -c "$_compensating_actions_completed" $_log`;
	if (( _ca_begun > 0 && _ca_done == 0 ))
	then
		return 0;
	fi

	#
	# c. reintegrating actions not completed (due to bad submirror/replica)
	# d. reintegrating actions completed (this is a new C.A. cycle)
	#
	# XXX: try to simplify the conditions (negate the ones for R.A)
	#
	_ra_failed=`grep -c "$_reintegrating_actions_not_completed" $_log`;
	_ra_done=`grep -c "$_reintegrating_actions_completed" $_log`;
	if (( _ra_failed > 0 || _ra_done > 0 ))
	then
		return 0;
	fi

	return 1;
}



#
# Purpose:	check if a new disk has been inserted. The serial number is
#		read with luxadm(1M) and must be valid, i.e. neither empty
#		nor a string of 0's. When the first reading is invalid,
#		prtvtoc is run on the disk, the function sleeps for 5
#		seconds and reads the serial number one more time.
# Invocation:	check_new_disk <disk> <disk's serialnum>
# Assumptions:	No error/range checking is done on the args passed, as such
#		they are assumed to be correct.
# Returns:	exit status = 0 implies a new disk.
#		exit status = 1 implies either the old disk, OR invalid serial
#		number (null or a string of 0's).
#
check_new_disk()
{
	typeset _disk _old_serialnum _new_serialnum _retried;

	_disk="$1";
	_old_serialnum="$2";
	_retried=0;

	while true
	do
		_new_serialnum=`luxadm inquiry /dev/rdsk/${_disk}s2 | \
				grep 'Serial Number' | sed 's/  */ /g' | \
				cut -d' ' -f3`;

		# valid: not a null and not a string of 0's
		if [ -n "$_new_serialnum" ] && \
		   [[ "$_new_serialnum" != +([0]) ]]
		then
			break;
		fi

		if (( _retried == 1 ))
		then
			return 1;	# still invalid on the second reading
		fi

		#
		# sometimes the disk doesn't spin (a common problem), if so,
		# perhaps this might help
		#
		prtvtoc /dev/rdsk/${_disk}s2 > /dev/null;
		debug_echo "${SCRIPT}: DEBUG: need to check again, will sleep for 5s";
		sleep 5;
		_retried=1;
	done

	if [ "$_new_serialnum" = "$_old_serialnum" ]
	then
		return 1;		# same old disk
	fi

	return 0;			# new disk inserted
}



#
# Purpose:	wait for a new disk to be inserted by calling
#		check_new_disk() once a minute. When a new disk is seen,
#		integration is initiated by invoking dsmadd.sh. If dsmadd.sh
#		exits with status 3 the polling simply continues.
# Invocation:	check_and_integrate_disk <disk> <disk config file>
# Assumptions:	No error/range checking is done on the args passed, as such
#		they are assumed to be correct.
#		The disk's old serial number is the third field of its line
#		in the disk config file.
# Returns:	exit status as returned from dsmadd.sh.
#
check_and_integrate_disk()
{
	typeset _disk _old_serialnum _file _exit_stat;

	_disk="$1";
	_file="$2";

	_old_serialnum=`grep $_disk $_file | cut -d' ' -f3`;

	debug_echo "${SCRIPT}: DEBUG: Starting polling for insertion of a new disk";

	while true
	do
		debug_echo "${SCRIPT}: DEBUG: `date` Sleeping for 1 min in check_and_integrate_disk()";
		sleep 60;

		if check_new_disk $_disk $_old_serialnum
		then
			#
			# => new disk!
			# bugID 4469335:
			# pass the status checking interval time in seconds
			# to dsmadd
			#
			SB_DSMADD_INTERVAL="$_interval" dsmadd.sh -n;
			_exit_stat="$?";
			if (( _exit_stat != 3 ))
			then
				return $_exit_stat;
			fi
			debug_echo "${SCRIPT}: DEBUG: had a fmthard failure";
		fi
		# otherwise => old disk, keep polling
	done
}



#
# Purpose:	To check that the metadevices have been deleted after dsmrm.sh
#		and the metadevices have been created after dsmadd.sh. The
#		action to be performed is specified by the second arg.
# Invocation:	check_stat <config file> <action>; where action can be one of
#		these two strings - deleted, created. As an example, 
#		check_stat deleted_metadevices deleted;
# Assumptions:	No error/range checking is done on the args passed, as such
#		they are assumed to be correct.
#		The replica slice is the second field of the config file
#		line(s) that do not contain "-m"; the submirrors are the
#		third field of every line after the first.
# Returns:	exit status = 0 when the submirrors and the replicas which
#		were to be deleted have been deleted OR when the submirrors
#		and the replicas which were to be created have been created.
#		exit status = 1 when the submirrors and the replicas which
#		were to be deleted have NOT been deleted OR when the
#		submirrors and the replicas which were to be created have NOT
#		been created.
#
check_stat()
{
	typeset _file _action _slice _submirrors _num _sm;

	_file="$1";
	_action="$2";
	
	_slice=`grep -v '\-m' "$_file" | cut -d' ' -f2`;
	_submirrors=`sed '1d' "$_file" | cut -d' ' -f3 | paste -d' ' -s -`;

	# replicas: count the metadb lines that mention the slice
	_num=`metadb | grep -c "$_slice"`;
	if [[ $_action = deleted ]] && (( _num > 0 ))
	then
		return 1;
	elif [[ $_action = created ]] && (( _num == 0 ))
	then
		return 1;
	fi

	for _sm in $_submirrors
	do
		#
		# verify the number of occurances of a submirror in metastat
		# -p; if deleted, number should be 0, else if created, number
		# should be 2
		#
		# Every line is prefixed with a blank so that the name can be
		# matched as a whole word (" name "); a plain "name " pattern
		# would also count other metadevices whose name merely ends
		# in the same characters (d1 inside d21).
		#
		_num=`metastat -p | sed 's/^/ /' | grep -c " $_sm "`;
		if [[ $_action = deleted ]] && (( _num != 0 ))
		then
			return 1;
		elif [[ $_action = created ]] && (( _num != 2 ))
		then
			return 1;
		fi
	done

	return 0;
}
