Open noranraskin opened 2 years ago
[ debug ] 2022/03/08 14:04:41 routes.go:160: /gpushare-scheduler/filter request body = &{0xc42065bc80 <nil> <nil> false true {0 0} false false false 0x69bfd0}
[ debug ] 2022/03/08 14:04:41 routes.go:81: gpusharingfilter ExtenderArgs ={&Pod{ObjectMeta:k8s_io_apimachinery_pkg_apis_meta_v1.ObjectMeta{Name:gpu-share-pod1,GenerateName:,Namespace:default,S
elfLink:,UID:2ba05d9f-a682-4339-acec-b8d65609736a,ResourceVersion:196958,Generation:0,CreationTimestamp:2022-03-08 13:00:19 +0000 UTC,DeletionTimestamp:<nil>,DeletionGracePeriodSeconds:nil,Labels:map[string]stri
ng{},Annotations:map[string]string{kubectl.kubernetes.io/last-applied-configuration: {"apiVersion":"v1","kind":"Pod","metadata":{"annotations":{},"name":"gpu-share-pod1","namespace":"default"},"spec":{"container
s":[{"env":[{"name":"NVIDIA_VISIBLE_DEVICES","value":"all"}],"image":"cheyang/gpu-player:v2","name":"gpu-share-pod1","resources":{"limits":{"aliyun.com/gpu-mem":3}}}],"restartPolicy":"OnFailure"}}
,},OwnerReferences:[],Finalizers:[],ClusterName:,Initializers:nil,},Spec:PodSpec{Volumes:[{kube-api-access-7dp6x {nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil Proje
ctedVolumeSource{Sources:[{nil nil nil ServiceAccountTokenProjection{Audience:,ExpirationSeconds:*3607,Path:token,}} {nil nil &ConfigMapProjection{LocalObjectReference:LocalObjectReference{Name:kube-root-ca.crt,
},Items:[{ca.crt ca.crt <nil>}],Optional:nil,} nil} {nil &DownwardAPIProjection{Items:[{namespace ObjectFieldSelector{APIVersion:v1,FieldPath:metadata.namespace,} nil <nil>}],} nil nil}],DefaultMode:*420,} nil n
il nil}}],Containers:[{gpu-share-pod1 cheyang/gpu-player:v2 [] [] [] [] [{NVIDIA_VISIBLE_DEVICES all nil}] {map[aliyun.com/gpu-mem:{{3 0} {<nil>} 3 DecimalSI}] map[aliyun.com/gpu-mem:{{3 0} {<nil>} 3 DecimalSI}
]} [{kube-api-access-7dp6x true /var/run/secrets/kubernetes.io/serviceaccount <nil>}] [] nil nil nil /dev/termination-log File IfNotPresent nil false false false}],RestartPolicy:OnFailure,TerminationGracePeriod
Seconds:*30,ActiveDeadlineSeconds:nil,DNSPolicy:ClusterFirst,NodeSelector:map[string]string{},ServiceAccountName:default,DeprecatedServiceAccount:default,NodeName:,HostNetwork:false,HostPID:false,HostIPC:false,S
ecurityContext:&PodSecurityContext{SELinuxOptions:nil,RunAsUser:nil,RunAsNonRoot:nil,SupplementalGroups:[],FSGroup:nil,RunAsGroup:nil,Sysctls:[],},ImagePullSecrets:[],Hostname:,Subdomain:,Affinity:nil,SchedulerN
ame:default-scheduler,InitContainers:[],AutomountServiceAccountToken:nil,Tolerations:[{node.kubernetes.io/not-ready Exists NoExecute 0xc420603380} {node.kubernetes.io/unreachable Exists NoExecute 0xc4206033a0}
],HostAliases:[],PriorityClassName:,Priority:*0,DNSConfig:nil,ShareProcessNamespace:nil,ReadinessGates:[],},Status:PodStatus{Phase:Pending,Conditions:[{PodScheduled False 0001-01-01 00:00:00 +0000 UTC 2022-03-08
13:00:19 +0000 UTC Unschedulable Post "http://172.26.1.11:32766/gpushare-scheduler/filter": EOF}],Message:,Reason:,HostIP:,PodIP:,StartTime:<nil>,ContainerStatuses:[],QOSClass:BestEffort,InitContainerStatuses:[
],NominatedNodeName:,},} &NodeList{ListMeta:k8s_io_apimachinery_pkg_apis_meta_v1.ListMeta{SelfLink:,ResourceVersion:,Continue:,},Items:[{{ } {172.26.1.151 2220814c-e29c-4310-9f62-0396d19583d5 198797 0 2022-03
-07 08:31:54 +0000 UTC <nil> <nil> map[gpushare:true kubernetes.io/arch:amd64 kubernetes.io/hostname:172.26.1.151 kubernetes.io/os:linux kubernetes.io/role:node beta.kubernetes.io/arch:amd64 beta.kubernetes.io/o
s:linux] map[volumes.kubernetes.io/controller-managed-attach-detach:true csi.volume.kubernetes.io/nodeid:{"nasplugin.csi.alibabacloud.com":"i-k1a2lgjh2ac8kiphf04m"} node.alpha.kubernetes.io/ttl:0] [] nil [] } {1
72.28.2.0/24 false [] nil } {map[hugepages-2Mi:{{0 0} {<nil>} 0 DecimalSI} memory:{{15729852416 0} {<nil>} BinarySI} pods:{{250 0} {<nil>} 250 DecimalSI} aliyun.com/gpu-count:{{1 0} {<nil>} 1 DecimalSI} aliyun
.com/gpu-mem:{{15109 0} {<nil>} 15109 DecimalSI} cpu:{{4 0} {<nil>} 4 DecimalSI} ephemeral-storage:{{211243667456 0} {<nil>} 206292644Ki BinarySI} hugepages-1Gi:{{0 0} {<nil>} 0 DecimalSI}] map[memory:{{15415279
616 0} {<nil>} BinarySI} pods:{{250 0} {<nil>} 250 DecimalSI} aliyun.com/gpu-count:{{1 0} {<nil>} 1 DecimalSI} aliyun.com/gpu-mem:{{15109 0} {<nil>} 15109 DecimalSI} cpu:{{4 0} {<nil>} 4 DecimalSI} ephemeral-st
orage:{{190119300396 0} {<nil>} 190119300396 DecimalSI} hugepages-1Gi:{{0 0} {<nil>} 0 DecimalSI} hugepages-2Mi:{{0 0} {<nil>} 0 DecimalSI}] [{NetworkUnavailable False 2022-03-07 08:33:03 +0000 UTC 2022-03-07 0
8:33:03 +0000 UTC CalicoIsUp Calico is running on this node} {MemoryPressure False 2022-03-08 14:04:17 +0000 UTC 2022-03-07 08:31:54 +0000 UTC KubeletHasSufficientMemory kubelet has sufficient memory available}
{DiskPressure False 2022-03-08 14:04:17 +0000 UTC 2022-03-07 08:31:54 +0000 UTC KubeletHasNoDiskPressure kubelet has no disk pressure} {PIDPressure False 2022-03-08 14:04:17 +0000 UTC 2022-03-07 08:31:54 +0000 U
TC KubeletHasSufficientPID kubelet has sufficient PID available} {Ready True 2022-03-08 14:04:17 +0000 UTC 2022-03-07 08:31:54 +0000 UTC KubeletReady kubelet is posting ready status. AppArmor enabled}] [{Interna
lIP 172.26.1.151} {Hostname 172.26.1.151}] {{10250}} {20220215152909852048611999857181 a682fbc8-9b7f-4816-8186-e378b5bc8827 1e88609c-b1ca-4980-bb49-31205abcdd7f 5.4.0-100-generic Ubuntu 20.04.3 LTS containerd://
1.5.8 v1.23.1 v1.23.1 linux amd64} [{[docker.io/kubernetesui/dashboard:v2.4.0] 224447837} {[registry.cn-hangzhou.aliyuncs.com/acs/csi-plugin@sha256:a90d8f40842d25768107bf627d3a5947503ceab05a17d4d0846783a94d3fc21
7 registry.cn-hangzhou.aliyuncs.com/acs/csi-plugin:v1.18.8.47-906bd535-aliyun] 168746392} {[docker.io/calico/node:v3.19.3] 155951331} {[docker.io/calico/cni:v3.19.3] 145881091} {[docker.io/easzlab/k8s-dns-node-c
ache:1.21.1] 106161195} {[docker.io/easzlab/metrics-server:v0.5.2] 65673656} {[docker.io/calico/kube-controllers:v3.19.3] 60621887} {[docker.io/coredns/coredns:1.8.6] 46957023} {[registry.cn-hangzhou.aliyuncs.co
m/acs/k8s-gpushare-plugin@sha256:76769d69f5a5b24cbe117f8ac83a0ff7409fda6108ca982c8f3b8f763e016100 registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.11-aff8a23] 37903785} {[docker.io/kubernetesui/met
rics-scraper:v1.0.7] 34453097} {[registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-schd-extender@sha256:03c869664a7aaa5baa5d8c5e8cb77360a8773ee80e19128b67059ca9e5ce4552 registry.cn-hangzhou.aliyuncs.com/acs/k8s
-gpushare-schd-extender:1.11-d170d8a] 32274301} {[registry.cn-hangzhou.aliyuncs.com/acs/csi-provisioner@sha256:3571de46b199fd1271561d2f0d5480af736d472e8e6584d8b7e668e9e1f4eba1 registry.cn-hangzhou.aliyuncs.com/a
cs/csi-provisioner:v3.0.0-3f86569-aliyun] 22653597} {[docker.io/calico/pod2daemon-flexvol:v3.19.3] 21840185} {[registry.cn-hangzhou.aliyuncs.com/acs/csi-node-driver-registrar@sha256:273175c272162d480d06849e09e6e
3cdb0245239e3a82df6630df3bc059c6571 registry.cn-hangzhou.aliyuncs.com/acs/csi-node-driver-registrar:v1.2.0] 7676865} {[docker.io/easzlab/pause:3.6] 685866}] [] [] nil}}],} <nil>}
[ info ] 2022/03/08 14:04:41 server.go:2926: http: panic serving 172.26.1.11:46078: runtime error: invalid memory address or nil pointer dereference
goroutine 49 [running]:
net/http.(*conn).serve.func1(0xc42034e0a0)
/usr/local/go/src/net/http/server.go:1726 +0xd0
panic(0x1098dc0, 0x1af3170)
/usr/local/go/src/runtime/panic.go:502 +0x229
github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler.Predicate.Handler(0x12060a2, 0x10, 0x127b7e8, 0xc420336c80, 0xc420439880, 0xc420149960, 0x0, 0x0)
/go/src/github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/scheduler/predicate.go:17 +0x37
github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/routes.PredicateRoute.func1(0x130e8e0, 0xc4206941c0, 0xc420358300, 0x0, 0x0, 0x0)
/go/src/github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/routes/routes.go:82 +0x7b0
github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/routes.DebugLogging.func1(0x130e8e0, 0xc4206941c0, 0xc420358300, 0x0, 0x0, 0x0)
/go/src/github.com/AliyunContainerService/gpushare-scheduler-extender/pkg/routes/routes.go:161 +0x197
github.com/AliyunContainerService/gpushare-scheduler-extender/vendor/github.com/julienschmidt/httprouter.(*Router).ServeHTTP(0xc4204e2040, 0x130e8e0, 0xc4206941c0, 0xc420358300)
/go/src/github.com/AliyunContainerService/gpushare-scheduler-extender/vendor/github.com/julienschmidt/httprouter/router.go:334 +0x79c
net/http.serverHandler.ServeHTTP(0xc4200b2680, 0x130e8e0, 0xc4206941c0, 0xc420358300)
/usr/local/go/src/net/http/server.go:2697 +0xbc
net/http.(*conn).serve(0xc42034e0a0, 0x130f7a0, 0xc420620640)
/usr/local/go/src/net/http/server.go:1830 +0x651
created by net/http.(*Server).Serve
/usr/local/go/src/net/http/server.go:2798 +0x27b
The problem has been solved. It was caused by a wrong parameter setting: nodeCacheCapable must be set to true:
nodeCacheCapable: true
The problem has been solved. It was caused by a wrong parameter setting: nodeCacheCapable must be set to true:
nodeCacheCapable: true
@geyong91 can you give more details on where exactly you had to change that parameter?
Sure, the configuration is the same as in issue #166, but with nodeCacheCapable changed to true.
That did the trick, thanks!
Deployments using the GPU fail to start with the above message. Everything else seems to be working fine.
The gpushare pods are running:
And I can run `inspect gpushare` successfully.
This issue seems to be somewhat related to #22, but the error message is a little different. If I change the service type of gpushare-schd-extender to ClusterIP, I get the same error message as in #22. Right now, though, it is set to NodePort as per the default install instructions.
I'm running
Kubernetes 1.23.3
on a single-node cluster set up using kubeadm. I don't know whether kube-proxy is running in IPVS or iptables mode. Any direction on how I can check this is greatly appreciated.