#################################################################################
##################################### Microway Cluster Management Software (MCMS)
#################################################################################
#
# Configuration for SLURM Resource Manager
#
#
# This file must be present on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
#################################################################################

# CLUSTER IDENTITY AND DAEMON SETTINGS

# Name used to identify this cluster
ClusterName=microway

# Host on which the slurmctld (controller) daemon runs
ControlMachine=master
#ControlAddr=
#BackupController=
#BackupAddr=

# TCP ports on which the slurmd (compute node) and slurmctld (controller)
# daemons listen
SlurmdPort=6818
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmctldPidFile=/var/run/slurmctld.pid
# Node-local working directory of each slurmd daemon
SlurmdSpoolDir=/var/spool/slurmd
# Directory in which the controller (slurmctld) saves its state. Despite the
# "slurmd" in the path, this setting belongs to the controller daemon.
StateSaveLocation=/var/lib/slurmd
# Unprivileged account that slurmctld runs as
SlurmUser=slurm
#SlurmdUser=root


# PROLOG, EPILOG AND HEALTH SCRIPTS

# Prepare for user jobs (must run very quickly)
#Prolog=
#
# Clean up after user jobs
#Epilog=/etc/slurm/scripts/slurm.epilog

#SrunProlog=
#SrunEpilog=

#TaskProlog=
#TaskEpilog=

# Prepare nodes for use
#PrologSlurmctld=
#
#EpilogSlurmctld=

# Check health of all nodes in the cluster. This program must run very
# quickly, because it is automatically terminated after 60 seconds.
#HealthCheckProgram=/etc/slurm/scripts/slurm.healthcheck
#
# Run health check every 15 minutes (900 seconds)
#HealthCheckInterval=900


# AUTHENTICATION AND JOB LIMITS

# Authenticate communication between Slurm components with MUNGE
AuthType=auth/munge
CacheGroups=0
#GroupUpdateForce=0
#GroupUpdateTime=600
# Sign job step credentials with MUNGE
CryptoType=crypto/munge
#DisableRootJobs=NO
#EnforcePartLimits=NO
#FirstJobId=1
# Maximum number of job records the controller keeps in memory at one time
MaxJobCount=25000
#MaxJobId=999999
# Generic resource (GRES) types that node definitions may reference via Gres=
GresTypes=gpu
#CheckpointType=checkpoint/none
#JobCheckpointDir=/var/slurm/checkpoint
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=1
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxStepCount=40000
#MaxTasksPerNode=128
# MPI, PROCESS TRACKING AND TASK PLACEMENT

# Default MPI plugin used by srun when --mpi is not given
MpiDefault=none
#MpiParams=ports=12000-12999
#PluginDir=
# Location of the SPANK plugin stack configuration
PlugStackConfig=/etc/slurm/plugstack.conf
#PrivateData=jobs
# Track the processes of a job through the Linux process tree
ProctrackType=proctrack/linuxproc
#PropagatePrioProcess=0
#PropagateResourceLimits=
# Propagate all of the submitting user's resource limits to the job except
# the locked-memory limit (MEMLOCK)
PropagateResourceLimitsExcept=MEMLOCK
# Program executed on a compute node when a reboot is requested through Slurm
RebootProgram=/sbin/reboot
# 2 = a DOWN node is returned to service automatically once it registers
# with a valid configuration
ReturnToService=2
#SallocDefaultCommand=
SwitchType=switch/none
# Bind tasks to CPUs using the task/affinity plugin
TaskPlugin=task/affinity
TaskPluginParam=Sched
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0


# TIMERS
# Values are in seconds unless noted otherwise.
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
# 0 = never terminate a job because its srun/salloc has become unresponsive
InactiveLimit=0
# Delay between SIGTERM and SIGKILL when a job reaches its time limit
KillWait=30
# Time allowed for a round-trip communication to complete
MessageTimeout=30
# How long the record of a completed job is kept before being purged
MinJobAge=300
# Number of MINUTES a job may run past its time limit before being cancelled
OverTimeLimit=10
#ResvOverRun=0
# How long a backup controller waits for the primary before taking over
SlurmctldTimeout=120
# How long the controller waits for a slurmd to respond before marking the
# node DOWN
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0


# SCHEDULING
# NOTE(review): memory is configured as a consumable resource below
# (CR_Core_Memory) while DefMemPerCPU is left unset. Jobs that do not request
# memory explicitly may then be allocated all of a node's memory -- confirm
# this is intended.
#DefMemPerCPU=0
FastSchedule=0
#MaxMemPerCPU=0
#SchedulerRootFilter=1
#SchedulerTimeSlice=30
# Backfill scheduler: lower-priority jobs may start early when doing so does
# not delay the expected start of higher-priority jobs
SchedulerType=sched/backfill
SchedulerPort=7321
#SelectType=select/linear
# Allocate individual cores and memory (consumable resources) rather than
# whole nodes
SelectType=select/cons_res
SelectTypeParameters=CR_Core_Memory,CR_ONE_TASK_PER_CORE

# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=


# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=limits
# NOTE(review): AccountingStorageHost is not set although accounting goes
# through slurmdbd -- confirm the built-in default points at the host that
# actually runs slurmdbd.
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=/var/run/munge/munge.socket.2
#AccountingStoragePort=
# Send accounting records to the Slurm database daemon (slurmdbd)
AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageUser=
#AccountingStoreJobComment=YES
#DebugFlags=
#JobCompHost=
#JobCompLoc=
#JobCompPass=
#JobCompPort=
# No separate job-completion log is written
JobCompType=jobcomp/none
#JobCompUser=
# Sample the resource usage of running jobs every 30 seconds
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
# Log verbosity of the controller and compute node daemons
SlurmctldDebug=3
#SlurmctldLogFile=
SlurmdDebug=3
#SlurmdLogFile=
#SlurmSchedLogFile=
#SlurmSchedLogLevel=


# POWER SAVE SUPPORT FOR IDLE NODES
#SuspendProgram=
#ResumeProgram=
# How long a node must be idle before it will be powered off (in seconds)
#SuspendTime=14400   # Four hours
#SuspendTimeout=30   # Number of seconds we expect the node shutdown to take
#ResumeTimeout=300   # Number of seconds we expect the node boot process to take
#ResumeRate=100      # Number of nodes we're willing to turn on at a time
#SuspendRate=100     # Number of nodes we're willing to power off at a time
#SuspendExcNodes=
#SuspendExcParts=


# COMPUTE NODES
# The DEFAULT entry supplies the hardware description inherited by every node
# line that follows; a node line only lists the values that differ.
# RealMemory is in megabytes.
NodeName=DEFAULT Sockets=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=385383
NodeName=master
NodeName=node1
# node2 overrides the core count and memory size, and adds two GPUs of type
# P100_SXM plus the "Tesla" feature tag that jobs can request as a constraint
NodeName=node2 CoresPerSocket=18 RealMemory=191883 Gres=gpu:P100_SXM:2 Feature=Tesla


# QUEUE DEFINITIONS

### CPU Queues ###
# The shorter a queue's MaxTime, the higher its Priority.
# MaxTime is written as days-hours:minutes:seconds.

# Month-long jobs must be sent here (shorter jobs can be assigned higher priorities below)
PartitionName=month-long-cpu Priority=5000 Default=NO MaxTime=31-0:00:00 State=UP Nodes=node[1-2] MaxNodes=2

# Week-long jobs must be sent here (shorter jobs can be assigned higher priorities below)
PartitionName=week-long-cpu Priority=10000 Default=NO MaxTime=7-0:00:00 State=UP Nodes=node[1-2] MaxNodes=2

# Day-long jobs must be sent here (shorter jobs can be assigned higher priorities below)
PartitionName=day-long-cpu Priority=20000 Default=NO MaxTime=1-0:00:00 State=UP Nodes=node[1-2]

# 30-Minute short, high-priority jobs may be sent here.
# This is the cluster's default partition: jobs submitted without an explicit
# partition are placed here. Only ONE partition may be the default; if several
# set Default=YES, the one defined last in this file silently wins.
PartitionName=short-cpu Priority=40000 Default=YES MaxTime=30:00 State=UP Nodes=node[1-2]

# Administrative queue, usable only by members of the "slurmadmin" group.
# It must be requested explicitly, so it is not the default partition.
PartitionName=admin     Priority=65535 Default=NO MaxTime=3000000:00 State=UP Nodes=node[1-2] AllowGroups=slurmadmin

# Queue on node2 reserved for members of the "physics" group.
# It must be requested explicitly. Leaving Default=YES here would make this
# group-restricted partition the cluster default.
PartitionName=PHYSICS     Priority=10000 Default=NO MaxTime=3000000:00 State=UP Nodes=node2 AllowGroups=physics


# Interactive sessions are considered higher priority than batch jobs.

# Specify up to half a node per interactive session:
# Shared=YES:2 lets up to two jobs share the same resources when the jobs allow it.
# NOTE(review): MaxCPUsPerNode=14 and MaxMemPerNode=128834 do not equal half of
# either node as defined under COMPUTE NODES (node1: 2x16 cores, 385383 MB;
# node2: 2x18 cores, 191883 MB). These limits look carried over from different
# hardware -- confirm or recompute.
PartitionName=interactive-cpu Priority=50000 Shared=YES:2 DefaultTime=8:00:00 MaxTime=48:00:00 State=UP Nodes=node[1-2] MaxNodes=1 MaxCPUsPerNode=14 MaxMemPerNode=128834