#################################################################################
##################################### Microway Cluster Management Software (MCMS)
#################################################################################
#
# Configuration for SLURM Resource Manager
#
#
# This file must be present on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
# Format: one Key=Value directive per line. A line that starts with '#' is a
# comment, so each active directive must begin its own line.
#
#################################################################################

# Cluster identity and controller host
ClusterName=microway
ControlMachine=master
#ControlAddr=
#BackupController=
#BackupAddr=

# Daemon ports, PID files and state directories
SlurmdPort=6818
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdSpoolDir=/var/spool/slurmd
StateSaveLocation=/var/lib/slurmd

# Accounts used by the Slurm daemons
SlurmUser=slurm
#SlurmdUser=root


# PROLOG, EPILOG AND HEALTH SCRIPTS

# Prepare for user jobs (must run very quickly)
#Prolog=
#
# Clean up after user jobs
#Epilog=/etc/slurm/scripts/slurm.epilog

#SrunProlog=
#SrunEpilog=

#TaskProlog=
#TaskEpilog=

# Prepare nodes for use
#PrologSlurmctld=
#
#EpilogSlurmctld=

# Check health of all nodes in the cluster. This program must run very
# quickly, because it is automatically terminated after 60 seconds.
#HealthCheckProgram=/etc/slurm/scripts/slurm.healthcheck
#
# Run health check every 15 minutes (900 seconds)
#HealthCheckInterval=900


# AUTHENTICATION, JOB LIMITS AND PLUGINS
AuthType=auth/munge
CacheGroups=0
#GroupUpdateForce=0
#GroupUpdateTime=600
CryptoType=crypto/munge
#DisableRootJobs=NO
#EnforcePartLimits=NO
#FirstJobId=1
MaxJobCount=25000
#MaxJobId=999999
GresTypes=gpu
#CheckpointType=checkpoint/none
#JobCheckpointDir=/var/slurm/checkpoint
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=1
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxStepCount=40000
#MaxTasksPerNode=128
MpiDefault=none
#MpiParams=ports=12000-12999
#PluginDir=
PlugStackConfig=/etc/slurm/plugstack.conf
#PrivateData=jobs
ProctrackType=proctrack/linuxproc
#PropagatePrioProcess=0
#PropagateResourceLimits=
PropagateResourceLimitsExcept=MEMLOCK
RebootProgram=/sbin/reboot
ReturnToService=2
#SallocDefaultCommand=
SwitchType=switch/none
TaskPlugin=task/affinity
TaskPluginParam=Sched
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0


# TIMERS
# Values are in seconds unless noted otherwise (see the slurm.conf man page).
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
InactiveLimit=0
KillWait=30
MessageTimeout=30
MinJobAge=300
# OverTimeLimit is expressed in minutes
OverTimeLimit=10
#ResvOverRun=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0


# SCHEDULING
#DefMemPerCPU=0
FastSchedule=0
#MaxMemPerCPU=0
#SchedulerRootFilter=1
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SchedulerPort=7321
#SelectType=select/linear
SelectType=select/cons_res
SelectTypeParameters=CR_Core_Memory,CR_ONE_TASK_PER_CORE


# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=


# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=limits
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=/var/run/munge/munge.socket.2
#AccountingStoragePort=
AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageUser=
#AccountingStoreJobComment=YES
#DebugFlags=
#JobCompHost=
#JobCompLoc=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
SlurmctldDebug=3
#SlurmctldLogFile=
SlurmdDebug=3
#SlurmdLogFile=
#SlurmSchedLogFile=
#SlurmSchedLogLevel=


# POWER SAVE SUPPORT FOR IDLE NODES
#SuspendProgram=
#ResumeProgram=

# How long a node must be idle before it will be powered off (in seconds)
# 14400 seconds = four hours
#SuspendTime=14400

# Number of seconds we expect the node shutdown to take
#SuspendTimeout=30
# Number of seconds we expect the node boot process to take
#ResumeTimeout=300
# Number of nodes we're willing to turn on at a time
#ResumeRate=100
# Number of nodes we're willing to power off at a time
#SuspendRate=100
#SuspendExcNodes=
#SuspendExcParts=


# COMPUTE NODES
# NodeName=DEFAULT supplies the values used by the node lines that follow;
# node2 overrides CoresPerSocket and RealMemory and adds its GPUs.
NodeName=DEFAULT Sockets=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=385383
NodeName=master
NodeName=node1
NodeName=node2 CoresPerSocket=18 RealMemory=191883 Gres=gpu:P100_SXM:2 Feature=Tesla


# QUEUE DEFINITIONS

### CPU Queues ###

# Month-long jobs must be sent here (shorter jobs can be assigned higher priorities below)
PartitionName=month-long-cpu Priority=5000 Default=NO MaxTime=31-0:00:00 State=UP Nodes=node[1-2] MaxNodes=2

# Week-long jobs must be sent here (shorter jobs can be assigned higher priorities below)
PartitionName=week-long-cpu Priority=10000 Default=NO MaxTime=7-0:00:00 State=UP Nodes=node[1-2] MaxNodes=2

# Day-long jobs must be sent here (shorter jobs can be assigned higher priorities below)
PartitionName=day-long-cpu Priority=20000 Default=NO MaxTime=1-0:00:00 State=UP Nodes=node[1-2]

# 30-Minute short, high-priority jobs may be sent here
PartitionName=short-cpu Priority=40000 Default=YES MaxTime=30:00 State=UP Nodes=node[1-2]
# Group-restricted partitions (access limited through AllowGroups).
# NOTE(review): both partitions below set Default=YES, but the slurm.conf
# documentation describes a single default partition. Confirm which partition
# is meant to be the default and set Default=NO on the others.
PartitionName=admin Priority=65535 Default=YES MaxTime=3000000:00 State=UP Nodes=node[1-2] AllowGroups=slurmadmin
PartitionName=PHYSICS Priority=10000 Default=YES MaxTime=3000000:00 State=UP Nodes=node2 AllowGroups=physics

# Interactive sessions are considered higher priority than batch jobs.
# Specify up to half a node per interactive session:
# NOTE(review): MaxCPUsPerNode=14 and MaxMemPerNode=128834 do not obviously
# match half of either node definition in this file - confirm these limits.
PartitionName=interactive-cpu Priority=50000 Shared=YES:2 DefaultTime=8:00:00 MaxTime=48:00:00 State=UP Nodes=node[1-2] MaxNodes=1 MaxCPUsPerNode=14 MaxMemPerNode=128834