# Slurm Test for Sparky

This test requires the Sparky setup for Rocky: https://git.resf.org/testing/Sparky_Rocky . Please see the getting started guide: https://git.resf.org/testing/Sparky_Getting_Started .

# Report example

```
[task run: task.bash - files/tasks/slurm-verify]
[task stdout]
16:47:58 :: PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
16:47:58 :: normal* up 7-00:00:00 1 idle master
[task check]
stdout match (s) True
stdout match (s) True
```

# files/tasks/slurm-install/task.bash

```bash
#!/bin/bash -
set -e

# Super simple check. In theory, this should work on 8 & 9. In reality, 8 will
# fail with errors on libsz, which is provided by the missing libaec package.
#
# Using parts of this guide to fill out the test:
# https://rpa.st/BZMQ

sudo hostnamectl set-hostname master

sudo dnf -y install rocky-release-hpc epel-release
sudo /usr/bin/dnf config-manager --set-enabled crb
sudo dnf -y install slurm23.11-* mariadb-server

sudo /usr/sbin/create-munge-key
sudo systemctl enable --now munge

sudo bash -c 'cat << EOF > /etc/my.cnf.d/innodb.cnf
[mysqld]
innodb_buffer_pool_size=1024M
innodb_log_file_size=64M
innodb_lock_wait_timeout=900
EOF'

sudo systemctl enable --now mariadb

sudo mysql_secure_installation << EOD
y
n
y
y
y
y
EOD

sudo mysql << EOD
create user 'slurm' identified by 'slurmpassword';
grant all on slurm_acct_db.* TO 'slurm';
create database slurm_acct_db;
EOD

sudo adduser slurm

sudo bash -c 'cat << EOD > /etc/slurm/slurmdbd.conf
PurgeEventAfter=24   # these values can be tweaked but as they are they keep accounting data in the DB for 2y
PurgeJobAfter=24
PurgeResvAfter=24
PurgeStepAfter=24
PurgeSuspendAfter=24
AuthType=auth/munge
DbdHost=master
SlurmUser=slurm
DebugLevel=4
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurm/slurmdbd.pid
PrivateData=usage,users,jobs   # here users can only retrieve their own accounting data
StorageType=accounting_storage/mysql
StorageHost=master
StoragePass=slurmpassword
StorageUser=slurm
StorageLoc=slurm_acct_db
EOD'

sudo chmod 600 /etc/slurm/slurmdbd.conf
sudo chown slurm:slurm /etc/slurm/slurmdbd.conf
sudo systemctl enable slurmdbd
sudo systemctl start slurmdbd
sleep 5 # Needed to give the db time to sort life out...
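# --- Optional sanity check (illustrative addition, not part of the original task) ---
# Confirm slurmdbd came up and can reach MariaDB before the controller is configured.
# An empty cluster list is expected at this point; a connection error is not.
systemctl is-active slurmdbd
sacctmgr --noheader list cluster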
sudo bash -c 'cat << EOD > /etc/slurm/slurm.conf
AuthType=auth/munge
CryptoType=crypto/munge
EnforcePartLimits=yes
Epilog=/etc/slurm/epilog.sh   # set up epilog script
MaxTasksPerNode=2   # set this to the nr of physical cores your compute nodes have
MpiDefault=pmix_v3   # we use pmix_v3 by default so openmpi hooks into SLURM's pmix_v3 plugin so srun works nicely
CpuFreqDef=Performance
PrivateData=usage,users
ProctrackType=proctrack/cgroup   # we use cgroups to track and isolate jobs
Prolog=/etc/slurm/prolog.sh   # set up prolog script
RebootProgram=/usr/sbin/reboot   # make sure we can reboot nodes via scontrol
ReturnToService=1
SlurmctldHost=master
SlurmctldParameters=enable_configless   # we will use the config-less method to get config to compute nodes from master
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurm/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurm/slurmd
SlurmUser=slurm
StateSaveLocation=/var/spool/slurm/state
SwitchType=switch/none
TaskPlugin=task/affinity,task/cgroup
TaskPluginParam=Cores
TaskProlog=/etc/slurm/prolog.sh
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0
DefMemPerCPU=1024   # set this to the RAM of the node [minus a few gig] divided by the number of physical cores on nodes
MaxMemPerNode=2048   # set this to the RAM of the node minus a few gig for OS etc
SchedulerType=sched/backfill   # use backfill
SelectType=select/cons_tres   # schedule cores and RAM
SelectTypeParameters=CR_Core_Memory
PriorityType=priority/multifactor   # use multifactor for fairshare, size and QOS
PriorityDecayHalfLife=14-0   # decay time of fairshare data is 2w cause users seem to remember who did what last week on clusters but not last month :)
PriorityFavorSmall=yes
PriorityWeightAge=10000
PriorityWeightFairshare=800000   # mostly priority wrt fairshare and within user jobs wrt size
PriorityWeightJobSize=1000
PriorityWeightQOS=1000000   # so we can have "ASAP" QOS for urgent "this needs to run now" jobs from professors
AccountingStorageEnforce=qos,limits   # enforce limits
AccountingStorageHost=master
AccountingStorageType=accounting_storage/slurmdbd
AccountingStoreFlags=job_comment
ClusterName=master
DebugFlags=NO_CONF_HASH
JobCompLoc=/var/spool/slurm/accounting/jobs.txt   # we keep track of finished jobs via text files, log rotated daily and kept forever
JobCompType=jobcomp/filetxt
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm/slurmd.log
SlurmSchedLogFile=/var/log/slurm/slurmsched.log
SlurmSchedLogLevel=1
NodeName=master RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN
#NodeName=compute[01-02] RealMemory=2048 Sockets=1 CoresPerSocket=2 ThreadsPerCore=1 State=UNKNOWN
#PartitionName=normal Nodes=master,compute[101-130] Default=YES MaxTime=7-0 State=UP ExclusiveUser=NO OverSubscribe=NO
PartitionName=normal Nodes=master Default=YES MaxTime=7-0 State=UP ExclusiveUser=NO OverSubscribe=NO
EOD'

sudo chmod 644 /etc/slurm/slurm.conf
sudo chown slurm:slurm /etc/slurm/slurm.conf

sudo bash -c 'cat << EOD > /etc/slurm/cgroup.conf
ConstrainCores=yes
ConstrainRAMSpace=yes
MaxRAMPercent=93
EOD'

sudo chmod 644 /etc/slurm/cgroup.conf
sudo chown slurm:slurm /etc/slurm/cgroup.conf

sudo bash -c 'cat << EOD > /etc/slurm/plugstack.conf
optional /usr/lib64/slurm/spank_pbs.so
EOD'

sudo chmod 644 /etc/slurm/plugstack.conf
sudo chown slurm:slurm /etc/slurm/plugstack.conf

sudo chown slurm:slurm /var/log/slurm
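# --- Illustrative addition, not part of the original task ---
# On real hardware the NodeName line in slurm.conf can be cross-checked against the
# topology Slurm detects on the node itself: "slurmd -C" prints the sockets, cores,
# threads and RealMemory in slurm.conf format.
sudo slurmd -C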
sudo mkdir /var/spool/slurm/accounting
sudo chown -R slurm:slurm /var/spool/slurm/

sudo systemctl enable --now slurmctld slurmd
```

# files/tasks/slurm-verify/task.bash

```bash
#!/bin/bash -

# Sleep to give time for the services to start up
sleep 20

sinfo
```

# files/tasks/slurm-verify/task.check

```
begin:
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
normal* up 7-00:00:00 1 idle master
end:
```

# main.raku

```raku
#!raku

task-run "files/tasks/slurm-install";
task-run "files/tasks/slurm-verify";
```
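A natural extension (not part of this repository) would be a third task that pushes a trivial job through the scheduler, so the controller, slurmd, and the accounting path are exercised end to end. A minimal sketch, assuming the single `master` node and `normal` partition defined in the install task; the path `files/tasks/slurm-run/task.bash` is hypothetical:

```bash
#!/bin/bash -
# Hypothetical files/tasks/slurm-run/task.bash: submit a one-task job and show its
# accounting record. srun blocks until the job finishes, so set -e catches failures.
set -e

srun -N1 -n1 -p normal hostname
sacct -X --noheader --format=JobID,JobName,State | tail -n 5
```

A matching `task.check` would then look for the node name (`master`) from `srun` and a `COMPLETED` state from `sacct`, and `main.raku` would gain a third `task-run` line.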