xenon-middleware / xenon

A middleware abstraction library that provides a simple programming interface to various compute and storage resources.
http://xenon-middleware.github.io/xenon/
Apache License 2.0
33 stars 17 forks source link

Add support for slurm 19 #669

Closed jmaassen closed 4 years ago

jmaassen commented 4 years ago

Apparently slurm has a different output format than the older versions. See https://github.com/xenon-middleware/xenon-cli/issues/72

jmaassen commented 4 years ago

The parsing issue is fixed in the jobstatus-bug branch. Needs more testing, preferably against a Slurm 19 Docker image

sverhoeven commented 4 years ago

I created a Slurm 19 Docker image which can be built with boatswain build -vv slurm:19

I tried to run the live test against it with

docker run --detach --publish 10022:22 xenonmiddleware/slurm:19
./gradlew liveTest -Dxenon.scheduler=slurm -Dxenon.scheduler.location=ssh://localhost:10022 -Dxenon.adaptors.slurm.strictHostKeyChecking=false  -Dxenon.username=xenon -Dxenon.password=javagat

All tests fail with Got invalid key/value pair in output: Slurmctld(primary) at 9a0b4eaf438e is UP.

scontrol show config
Configuration data as of 2020-03-13T15:18:39
AccountingStorageBackupHost = (null)
AccountingStorageEnforce = none
AccountingStorageHost   = localhost
AccountingStorageLoc    = /var/log/slurm/accounting
AccountingStoragePort   = 0
AccountingStorageTRES   = cpu,mem,energy,node,billing,fs/disk,vmem,pages
AccountingStorageType   = accounting_storage/filetxt
AccountingStorageUser   = root
AccountingStoreJobComment = Yes
AcctGatherEnergyType    = acct_gather_energy/none
AcctGatherFilesystemType = acct_gather_filesystem/none
AcctGatherInterconnectType = acct_gather_interconnect/none
AcctGatherNodeFreq      = 0 sec
AcctGatherProfileType   = acct_gather_profile/none
AllowSpecResourcesUsage = 0
AuthAltTypes            = (null)
AuthInfo                = (null)
AuthType                = auth/munge
BatchStartTimeout       = 2 sec
BOOT_TIME               = 2020-03-13T15:10:13
BurstBufferType         = (null)
CheckpointType          = checkpoint/none
CliFilterPlugins        = (null)
ClusterName             = mycluster
CommunicationParameters = (null)
CompleteWait            = 0 sec
CoreSpecPlugin          = core_spec/none
CpuFreqDef              = Unknown
CpuFreqGovernors        = Performance,OnDemand,UserSpace
CredType                = cred/munge
DebugFlags              = (null)
DefMemPerNode           = UNLIMITED
DisableRootJobs         = No
EioTimeout              = 60
EnforcePartLimits       = NO
Epilog                  = (null)
EpilogMsgTime           = 1 usec
EpilogSlurmctld         = (null)
ExtSensorsType          = ext_sensors/none
ExtSensorsFreq          = 0 sec
FastSchedule            = 1
FederationParameters    = (null)
FirstJobId              = 1
GetEnvTimeout           = 2 sec
GresTypes               = (null)
GpuFreqDef              = high,memory=high
GroupUpdateForce        = 1
GroupUpdateTime         = 2 sec
HASH_VAL                = Match
HealthCheckInterval     = 0 sec
HealthCheckNodeState    = ANY
HealthCheckProgram      = (null)
InactiveLimit           = 0 sec
JobAcctGatherFrequency  = 2
JobAcctGatherType       = jobacct_gather/linux
JobAcctGatherParams     = (null)
JobCheckpointDir        = /var/slurm/checkpoint
JobCompHost             = localhost
JobCompLoc              = /var/log/slurm/job_completions
JobCompPort             = 0
JobCompType             = jobcomp/filetxt
JobCompUser             = root
JobContainerType        = job_container/none
JobCredentialPrivateKey = /usr/local/etc/slurm/slurm.key
JobCredentialPublicCertificate = /usr/local/etc/slurm/slurm.cert
JobDefaults             = (null)
JobFileAppend           = 0
JobRequeue              = 1
JobSubmitPlugins        = (null)
KeepAliveTime           = SYSTEM_DEFAULT
KillOnBadExit           = 0
KillWait                = 2 sec
LaunchParameters        = (null)
LaunchType              = launch/slurm
Layouts                 = 
Licenses                = (null)
LicensesUsed            = (null)
LogTimeFormat           = iso8601_ms
MailDomain              = (null)
MailProg                = /bin/true
MaxArraySize            = 1001
MaxJobCount             = 10000
MaxJobId                = 67043328
MaxMemPerNode           = UNLIMITED
MaxStepCount            = 40000
MaxTasksPerNode         = 512
MCSPlugin               = mcs/none
MCSParameters           = (null)
MessageTimeout          = 2 sec
MinJobAge               = 2 sec
MpiDefault              = none
MpiParams               = (null)
MsgAggregationParams    = (null)
NEXT_JOB_ID             = 3
NodeFeaturesPlugins     = (null)
OverTimeLimit           = 0 min
PluginDir               = /usr/local/lib/slurm
PlugStackConfig         = /usr/local/etc/slurm/plugstack.conf
PowerParameters         = (null)
PowerPlugin             = 
PreemptMode             = OFF
PreemptType             = preempt/none
PreemptExemptTime       = 00:00:00
PriorityParameters      = (null)
PrioritySiteFactorParameters = (null)
PrioritySiteFactorPlugin = (null)
PriorityType            = priority/basic
PrivateData             = none
ProctrackType           = proctrack/linuxproc
Prolog                  = (null)
PrologEpilogTimeout     = 65534
PrologSlurmctld         = (null)
PrologFlags             = (null)
PropagatePrioProcess    = 0
PropagateResourceLimits = ALL
PropagateResourceLimitsExcept = (null)
RebootProgram           = (null)
ReconfigFlags           = (null)
RequeueExit             = (null)
RequeueExitHold         = (null)
ResumeFailProgram       = (null)
ResumeProgram           = (null)
ResumeRate              = 300 nodes/min
ResumeTimeout           = 60 sec
ResvEpilog              = (null)
ResvOverRun             = 0 min
ResvProlog              = (null)
ReturnToService         = 1
RoutePlugin             = route/default
SallocDefaultCommand    = (null)
SbcastParameters        = (null)
SchedulerParameters     = (null)
SchedulerTimeSlice      = 5 sec
SchedulerType           = sched/backfill
SelectType              = select/linear
SlurmUser               = root(0)
SlurmctldAddr           = (null)
SlurmctldDebug          = debug3
SlurmctldHost[0]        = 9a0b4eaf438e(localhost)
SlurmctldLogFile        = (null)
SlurmctldPort           = 6817
SlurmctldSyslogDebug    = unknown
SlurmctldPrimaryOffProg = (null)
SlurmctldPrimaryOnProg  = (null)
SlurmctldTimeout        = 2 sec
SlurmctldParameters     = (null)
SlurmdDebug             = info
SlurmdLogFile           = /var/log/slurm-llnl/slurmd.%n.log
SlurmdParameters        = (null)
SlurmdPidFile           = /var/run/slurmd.%n.pid
SlurmdPort              = 6818
SlurmdSpoolDir          = /var/spool/slurmd.%n
SlurmdSyslogDebug       = unknown
SlurmdTimeout           = 2 sec
SlurmdUser              = root(0)
SlurmSchedLogFile       = (null)
SlurmSchedLogLevel      = 0
SlurmctldPidFile        = /var/run/slurmctld.pid
SlurmctldPlugstack      = (null)
SLURM_CONF              = /usr/local/etc/slurm/slurm.conf
SLURM_VERSION           = 19.05.5
SrunEpilog              = (null)
SrunPortRange           = 0-0
SrunProlog              = (null)
StateSaveLocation       = /var/spool/slurmctld/state
SuspendExcNodes         = (null)
SuspendExcParts         = (null)
SuspendProgram          = (null)
SuspendRate             = 60 nodes/min
SuspendTime             = NONE
SuspendTimeout          = 30 sec
SwitchType              = switch/none
TaskEpilog              = (null)
TaskPlugin              = task/none
TaskPluginParam         = (null type)
TaskProlog              = (null)
TCPTimeout              = 2 sec
TmpFS                   = /tmp
TopologyParam           = (null)
TopologyPlugin          = topology/none
TrackWCKey              = No
TreeWidth               = 50
UsePam                  = 0
UnkillableStepProgram   = (null)
UnkillableStepTimeout   = 60 sec
VSizeFactor             = 0 percent
WaitTime                = 0 sec
X11Parameters           = (null)

Slurmctld(primary) at 9a0b4eaf438e is UP

sverhoeven commented 4 years ago

Seems in Slurm 19 the --workdir option was renamed to --chdir.

jmaassen commented 4 years ago

Yes. Fixed it in the latest version

jmaassen commented 4 years ago

Passed tests successfully, but I need to remove some debug statements here and there ;-)

jmaassen commented 4 years ago

I'm preparing a release 3.1.0