Using the `fix-spin-up` branch and the .tfvars file below, all nodes get created and the cluster is working with 3 CPU nodes, but the GPU node does not join the cluster.
One is that the `native.cgroupdriver` is set to `systemd` for CPU nodes but is `cgroupfs` for GPU nodes.
The error in `/var/log/cloud-init-output.log` is:
[ERROR SystemVerification]: could not unmarshal the JSON output of 'docker info':
WARNING: Error loading config file: .dockercfg: $HOME is not defined
{"ID":"OA3Z:JNDT:ZKPI:MYMH:OD3J:ULME:62R3:U2FD:OKF6:H7QE:M2MF:FTDX","Containers":0,"ContainersRunning":0,"ContainersPaused":0,"ContainersStopped":0,"Images":0,"Driver":"overlay2","DriverStatus":[["Backing Filesystem","extfs"],["Supports d_type","true"],["Native Overlay Diff","true"]],"Plugins":{"Volume":["local"],"Network":["bridge","host","ipvlan","macvlan","null","overlay"],"Authorization":null,"Log":["awslogs","fluentd","gcplogs","gelf","journald","json-file","local","logentries","splunk","syslog"]},"MemoryLimit":true,"SwapLimit":false,"KernelMemory":true,"KernelMemoryTCP":true,"CpuCfsPeriod":true,"CpuCfsQuota":true,"CPUShares":true,"CPUSet":true,"PidsLimit":true,"IPv4Forwarding":true,"BridgeNfIptables":true,"BridgeNfIp6tables":true,"Debug":false,"NFd":22,"OomKillDisable":true,"NGoroutines":33,"SystemTime":"2021-04-27T18:58:49.684520859Z","LoggingDriver":"json-file","CgroupDriver":"cgroupfs","CgroupVersion":"1","NEventsListener":0,"KernelVersion":"4.15.0-50-generic","OperatingSystem":"Ubuntu 18.04.2 
LTS","OSVersion":"18.04","OSType":"linux","Architecture":"x86_64","IndexServerAddress":"https://index.docker.io/v1/","RegistryConfig":{"AllowNondistributableArtifactsCIDRs":[],"AllowNondistributableArtifactsHostnames":[],"InsecureRegistryCIDRs":["127.0.0.0/8"],"IndexConfigs":{"docker.io":{"Name":"docker.io","Mirrors":[],"Secure":true,"Official":true}},"Mirrors":[]},"NCPU":48,"MemTotal":201428475904,"GenericResources":null,"DockerRootDir":"/var/lib/docker","HttpProxy":"","HttpsProxy":"","NoProxy":"","Name":"metal-multiarch-k8s-gpu-gpu-green-00","Labels":[],"ExperimentalBuild":false,"ServerVersion":"20.10.2","Runtimes":{"io.containerd.runc.v2":{"path":"runc"},"io.containerd.runtime.v1.linux":{"path":"runc"},"nvidia":{"path":"/usr/bin/nvidia-container-runtime"},"runc":{"path":"runc"}},"DefaultRuntime":"nvidia","Swarm":{"NodeID":"","NodeAddr":"","LocalNodeState":"inactive","ControlAvailable":false,"Error":"","RemoteManagers":null},"LiveRestoreEnabled":false,"Isolation":"","InitBinary":"docker-init","ContainerdCommit":{"ID":"","Expected":""},"RuncCommit":{"ID":"N/A","Expected":"N/A"},"InitCommit":{"ID":"","Expected":""},"SecurityOptions":["name=apparmor","name=seccomp,profile=default"],"Warnings":["WARNING: No swap limit support"],"ClientInfo":{"Debug":false,"Context":"default","Plugins":[],"Warnings":null}}
Using the `fix-spin-up` branch and the .tfvars file below, all nodes get created and the cluster is working with 3 CPU nodes, but the GPU node does not join the cluster. I see 2 issues:
One is that the `native.cgroupdriver` is set to `systemd` for CPU nodes but is `cgroupfs` for GPU nodes.
The error in `/var/log/cloud-init-output.log` is