#!/bin/sh
#
#+-#+-#+-#-+-#+-#+-#-+-#+-#+-#-+-#+-#+-#-+-#+-#+-#-+-#+-#+-#-+-#+-#+-#-+-#
#
# $SGE_ROOT/util/master_template
#
# DO NOT EDIT THIS FILE - this file is used as an template
# Don't change the markers #+-#+-#+-# and "#-#-#-#" , they will be removed
#
#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-
#
# SGE/SGEEE startup script
#
# (c) 2004 Sun Microsystems, Inc. Use is subject to license terms.  

#
# This script can be called with the following arguments:
#
#       start       start qmaster, scheduler or shadowd 
#       stop        Terminates scheduler and qmaster if we are on the master
#                   machine.
#       -qmaster    only starts qmaster and scheduler
#       -shadowd    start shadwod if found in the "shadow_masters" file
#       -migrate    shuts down qmaster and scheduler if they are running
#                   on another host and start the daemons on this host
#
# If the file "primary_qmaster" in the $SGE_ROOT/$SGE_CELL/common
# exists and it contains the hostname of the current machine and qmaster
# is running on another host it will be shut down and started on this host
#
# Unix commands which may be used in this script:
#    cat cut tr ls grep awk sed basename
#
# This script requires the script $SGE_ROOT/util/arch
#

PATH=/bin:/usr/bin:/sbin:/usr/sbin

#---------------------------------------------------------------------------
# The following lines provide the necessary info for adding a startup script
# according to the Linux Standard Base Specification (LSB) 1.0.0 which can
# be found at:
#
#    http://www.linuxbase.org/spec/gLSB/gLSB/sysinit.html
#
# See also
#
#    http://www.suse.de/~grimmer/Package-Conventions/index.html
#
### BEGIN INIT INFO
# Provides:       sgemaster 
# Required-Start: $network
# Required-Stop:
# Default-Start:  3 5
# Default-Stop: 0 1 2 6
# Description:  start Grid Engine qmaster, schedd, shadowd
### END INIT INFO
#---------------------------------------------------------------------------

SGE_ROOT=GENROOT; export SGE_ROOT
SGE_CELL=GENCELL; export SGE_CELL
SGE_QMASTER_PORT=GENSGE_QMASTER_PORT; export SGE_QMASTER_PORT 
SGE_EXECD_PORT=GENSGE_EXECD_PORT; export SGE_EXECD_PORT

unset CODINE_ROOT GRD_ROOT COD_CELL GRD_CELL

ARCH=`$SGE_ROOT/util/arch`
shlib_path_name=`$SGE_ROOT/util/arch -lib`
old_value=`eval echo '$'$shlib_path_name`
if [ x$old_value = x ]; then
   eval $shlib_path_name=$SGE_ROOT/lib/$ARCH
else
   eval $shlib_path_name=$old_value:$SGE_ROOT/lib/$ARCH
fi
export $shlib_path_name


#---------------------------------------------------------------------------
# Shutdown
# Send SIGTERM to process name $1 with pid in file $2
#
Shutdown()
{
   name=$1
   pidfile=$2
   if [ -f $pidfile ]; then
      pid=`cat $pidfile`
      maxretries=6
      i=0
      while [ $i -lt $maxretries ]; do
         $utilbin_dir/checkprog $pid $name > /dev/null
         if [ "$?" = 0 ]; then
            kill $pid
         else
            return 0
         fi
         sleep `expr 2 + $i`
         i=`expr $i + 1`

      done
      kill -9 $pid
      return $?
   fi
}


#---------------------------------------------------------------------------
# QmasterSpoolDir
#    Return qmasters spool directory
#
QmasterSpoolDir()
{
   qma_spool_dir=`grep qmaster_spool_dir \
                      $SGE_ROOT/$SGE_CELL/common/bootstrap | \
                      awk '{ print $2 }'`
   echo $qma_spool_dir
}

#---------------------------------------------------------------------------
# CheckIfQmasterHost
#    If our hostname given in $1 is the same as in the "act_qmaster" file
#    echo "true" else echo "false"
#
CheckIfQmasterHost()
{
   host=$1

   if [ "$host" = "`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`" ]; then
      echo true
   else
      echo false
   fi
}

#---------------------------------------------------------------------------
# CheckIfPrimaryQmasterHost
#    Check if our hostname given in $1 is the same as in the
#    "primary_qmaster" file
#    echo true if there is our hostname else echo false
#
CheckIfPrimaryQmasterHost()
{
   host=$1

   fname=$SGE_ROOT/$SGE_CELL/common/primary_qmaster

   if [ -f $fname ]; then
      if [ "$host" = "`cat $fname`" ]; then
         echo true
      else
         echo false
      fi
   else
      echo false
   fi
}


#---------------------------------------------------------------------------
# CheckIfShadowMasterHost
#    Check if our hostname given in $1 is contained in the
#    "shadow_masters" file
#    echo true if there is our hostname else echo false
#
CheckIfShadowMasterHost()
{
   host=$1

   fname=$SGE_ROOT/$SGE_CELL/common/shadow_masters

   if [ -f $fname ]; then
      grep -i $host $fname 2>&1 > /dev/null
      if [ $? = 0 ]; then
         shadow_host="true"
      else
         shadow_host="false"
      fi
   else
      shadow_host="false"
   fi
}

#---------------------------------------------------------------------------
# GetPathToBinaries
#    echo the name of the bin_dir on this system
#    The check is fullfilled if we can access the qstat binary
#    echo "none" if we can't determine the binary path
GetPathToBinaries()
{
   cfgname=$SGE_ROOT/$SGE_CELL/common/bootstrap

   base=none

   if [ -f $cfgname ]; then
      base=`grep binary_path $cfgname | awk '{ print $2 }'`
      if [ -f $base/qstat ]; then
         :
      elif [ -f $SGE_ROOT/util/arch ]; then
         arch=`$SGE_ROOT/util/arch`
         if [ -f $base/$arch/qstat ]; then
               base=$base/$arch
         fi
      fi
   fi

   echo $base
}


#---------------------------------------------------------------------------
# GetAdminUser
#    echo the name of the admin user on this system
#    echo "root" if admin user retrieval fails
GetAdminUser()
{
   cfgname=$SGE_ROOT/$SGE_CELL/common/bootstrap
   user=none

   if [ -f $cfgname ]; then
      user=`grep admin_user $cfgname | awk '{ print $2 }'`
   fi

   if [ `echo $user|tr "A-Z" "a-z"` = "none" ]; then
      user=root
   fi
   echo $user
}

#---------------------------------------------------------------------------
# GetPathToUtilbin
#    echo the path to the binaires in utilbin
#    The check is fullfilled if we can access the "gethostname" binary
#    echo "none" if we can't determine the binary path
#
GetPathToUtilbin()
{
   base=none

   if [ -f $SGE_ROOT/util/arch ]; then
      utilbindir=$SGE_ROOT/utilbin

      arch=`$SGE_ROOT/util/arch`
      if [ -f $utilbindir/$arch/gethostname ]; then
         base=$utilbindir/$arch
      fi
   fi

   echo $base
}

#---------------------------------------------------------------------------
# CheckRunningQmaster
# checks, if sge_qmaster is running
# In error case the sge_qmaster didn't start, silently
#
CheckRunningQmaster()
{
   masterhost=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`
   running=false
   loop=0

   if [ "$SGE_QMASTER_PORT" = "" ]; then
      SGE_QMASTER_PORT=`$utilbin_dir/getservbyname -number sge_qmaster`
   fi

   while [ $running = "false" -a $loop -ne 10 ]; do 
      $bin_dir/qping -info $masterhost $SGE_QMASTER_PORT qmaster 1 > /dev/null 2>&1

      if [ "$?" = 0 ]; then
         running=true
      else
         sleep 3 
         masterhost=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`
         loop=`expr $loop + 1`
      fi
   done

   if [ $running = "false" ]; then
      echo
      echo "sge_qmaster didn't start!"
      echo "Please check the messages file"
      echo
   fi
}

#---------------------------------------------------------------------------
usage()
{
   echo "Grid Engine start/stop script. Valid parameters are:"
   echo ""
   echo "   (no parameters): start qmaster and execution daemon if applicable"
   echo "   \"start\"        dto."
   echo "   \"stop\"         shutdown local Grid Engine processes and jobs"
   echo "   \"-qmaster\"     only start/stop qmaster and scheduler (if applicable)"
   echo "   \"-shadowd\"     only start/stop shadowd (if applicable)"   
   echo "   \"-migrate\"     shutdown qmaster/scheduler if it's running on another"
   echo "                    host and restart it on this host"
   echo "                    Migration only works if this host is an admin host"
   echo ""
   echo "Only one of the parameters \"start\", \"stop\" or \"softstop\" is allowed."
   echo "Only one of the parameters beginning  with \"-\" is allowed." 
   echo
   echo "Default argument is \"start\" for all components."
   echo "Default for \"stop\" is shutting down all components."
   echo
   exit 1
}


#---------------------------------------------------------------------------
# MAIN Procedure
#

if [ "$#" -gt 2 -o "$1" = "-h" -o "$1" = "help" ]; then
   usage
fi

startup=true
qmaster=true
shadowd=true
qstd=false
migrate_qmaster=false
softstop=false

for i in $*; do
   if [ "$i" = start ]; then
      startup=true
   elif [ "$i" = stop ]; then
      startup=false
   elif [ "$i" = softstop ]; then
      startup=false
      softstop=true
   elif [ "$i" = -qmaster ]; then
      qmaster=true
      shadowd=false
   elif [ "$i" = -shadowd ]; then
      qmaster=false
      shadowd=true
   elif [ "$i" = -migrate ]; then
      migrate_qmaster=true
      qmaster=true
      shadowd=false
   else
      usage
   fi
done

bin_dir=`GetPathToBinaries`
if [ "$bin_dir" = "none" ]; then
   echo "can't determine path to Grid Engine binaries"
   exit 1
fi

utilbin_dir=`GetPathToUtilbin`
if [ "$utilbin_dir" = "none" ]; then
   echo "can't determine path to Grid Engine utility binaries"
   exit 1
fi

HOST=`$utilbin_dir/gethostname -aname`
UQHOST=`$utilbin_dir/gethostname -aname | cut -f1 -d.`
qmaster_spool_dir=`QmasterSpoolDir`
CheckIfShadowMasterHost $HOST

if [ "$startup" = true ]; then

   # qmaster_host=true if qmaster was running on this host the last time
   #                   this host is an execution host

   qmaster_host=`CheckIfQmasterHost $HOST`
   primary_qmaster_host=`CheckIfPrimaryQmasterHost $HOST`

   if [ $qmaster = true -a $qmaster_host = true -a $migrate_qmaster = true ]; then
      echo "   qmaster and scheduler running on this host. Will not migrate qmaster."
      exit 1
   fi

   if [ $qmaster = true -a $qmaster_host = false -a  \
        \( $primary_qmaster_host = true -o $migrate_qmaster = true \) ]; then
       actual_qmaster_host=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`
       echo "   shutting down qmaster and scheduler on host \"$actual_qmaster_host\" ..."
       qconf_output=`$bin_dir/qconf -ks 2>&1 | grep "denied"`
       if [ "$qconf_output" != "" ]; then
          echo "   denied: host \"$HOST\" is no admin host."
          exit 1
       fi
       $bin_dir/qconf -km > /dev/null 2>&1
       
       qping_count=0
       qping_retries=10
       qping_exit_state=0
       while [ $qping_count -lt $qping_retries ]; do
          $bin_dir/qping -info $actual_qmaster_host $SGE_QMASTER_PORT qmaster 1  > /dev/null 2>&1
          qping_exit_state=$?
          if [ $qping_exit_state -ne 0 ]; then
             break
          fi
          sleep 3
          qping_count=`expr $qping_count + 1`
       done

       if [ $qping_exit_state -eq 0 ]; then
       #  qmaster is still running
          echo "   qmaster on host $actual_qmaster_host still alive. Cannot migrate qmaster."
          exit 1
       fi

       lock_file_read_retries=10
       lock_file_read_count=0
       lock_file_found=0
       while [ $lock_file_read_count -lt $lock_file_read_retries ]; do
          if [ -f $qmaster_spool_dir/lock ]; then
             lock_file_found=1
             break
          fi
          sleep 3
          lock_file_read_count=`expr $lock_file_read_count + 1`
       done

       if [ $lock_file_found -eq 0 ]; then
       #  old qmaster did not write lock file 
          echo "   old qmaster did not write lock file. Cannot migrate qmaster."
          echo "   Please verify that qmaster on host $actual_qmaster_host is down"
          echo "   and make sure that the lock file in qmaster spool directory is"
          echo "   read-able."
          exit 1
       fi

       qmaster_host=true
   fi

   if [ $qmaster = true -a $qmaster_host = true ]; then
      echo "   starting sge_qmaster"
      $bin_dir/sge_qmaster
      [ $? -eq 0 -a -d /var/lock/subsys ] && touch /var/lock/subsys/sgemaster >/dev/null 2>&1
      CheckRunningQmaster
      echo "   starting sge_schedd"
      $bin_dir/sge_schedd
   elif [ $qmaster = true -a $qmaster_host = false ]; then
      echo
      echo "sge_qmaster didn't start!"
      echo "This is not a qmaster host!"
      echo "Please, check your act_qmaster file!" 
      echo
   fi
      
   if [ $shadowd = true -a $shadow_host = true ]; then
      start_shadowd=true
      UQpidfile=$qmaster_spool_dir/shadowd_$UQHOST.pid
      pidfile=$qmaster_spool_dir/shadowd_$HOST.pid

      if [ -f $pidfile ]; then
         pid=`cat $pidfile`
         $utilbin_dir/checkprog $pid sge_shadowd > /dev/null
         if [ "$?" = 0 ]; then
            echo "   found running sge_shadowd - not starting"
            start_shadowd=false 
         fi
      fi

      if [ -f $UQpidfile ]; then
         pid=`cat $UQpidfile`
         $utilbin_dir/checkprog $pid sge_shadowd > /dev/null
         if [ "$?" = 0 ]; then
            echo "   found running sge_shadowd - not starting"
            start_shadowd=false
         fi
      fi

      if [ $start_shadowd = true ]; then
         echo "   starting sge_shadowd"
         $bin_dir/sge_shadowd
      else
         exit 1
      fi
   fi
else
   if [ $shadow_host = true ]; then
      # Send SIGTERM to shadowd
      echo "   Shutting down Grid Engine shadowd"
      if [ -f $qmaster_spool_dir/shadowd_$UQHOST.pid ]; then
         Shutdown sge_shadowd $qmaster_spool_dir/shadowd_$UQHOST.pid
      elif [ -f $qmaster_spool_dir/shadowd_$HOST.pid ]; then
         Shutdown sge_shadowd $qmaster_spool_dir/shadowd_$HOST.pid
      fi
   fi

   if [ $qmaster = true ]; then
      if [ `CheckIfQmasterHost $HOST` = true ]; then
         # Send SIGTERM to scheduler
         echo "   Shutting down Grid Engine scheduler"
         Shutdown sge_schedd $qmaster_spool_dir/schedd/schedd.pid

         # Send SIGTERM to qmaster
         echo "   Shutting down Grid Engine qmaster"
         Shutdown sge_qmaster $qmaster_spool_dir/qmaster.pid
         ret=$?
         if [ -f /var/lock/subsys/sgemaster ]; then
            uid=`$utilbin_dir/uidgid -uid`
            if [ "$uid" = "0" -a "$ret" = "0" ]; then            
               rm -f /var/lock/subsys/sgemaster >/dev/null 2>&1
            else
               echo "Can't shut down qmaster!"
               exit 1
            fi
         fi
      fi
   fi

fi
