kleveross / ormb

Docker for Your ML/DL Models Based on OCI Artifacts
Apache License 2.0
459 stars 61 forks source link

您好，在部署“使用 Seldon Core 启动模型服务”时遇到问题 #185

Open VincentWei2021 opened 3 years ago

VincentWei2021 commented 3 years ago

{"level":"info","ts":1629958732.6136754,"logger":"controllers.SeldonDeployment","msg":"Scheme","SeldonDeployment":"kleveross-system/onnx-service-test002","r.scheme":{}} {"level":"info","ts":1629958732.6136875,"logger":"controllers.SeldonDeployment","msg":"createDeployments","SeldonDeployment":"kleveross-system/onnx-service-test002","deploy":{"namespace":"kleveross-system","name":"onnx-service-test002-onnx-0-onnx"}} {"level":"info","ts":1629958732.6138568,"logger":"controllers.SeldonDeployment","msg":"Updating Deployment","SeldonDeployment":"kleveross-system/onnx-service-test002","namespace":"kleveross-system","name":"onnx-service-test002-onnx-0-onnx"} {"level":"info","ts":1629958732.6246343,"logger":"controllers.SeldonDeployment","msg":"The deployments are the same - api server defaults ignored","SeldonDeployment":"kleveross-system/onnx-service-test002"} {"level":"info","ts":1629958732.6246703,"logger":"controllers.SeldonDeployment","msg":"Found identical deployment","SeldonDeployment":"kleveross-system/onnx-service-test002","namespace":"kleveross-system","name":"onnx-service-test002-onnx-0-onnx","status":{"observedGeneration":1,"replicas":1,"updatedReplicas":1,"unavailableReplicas":1,"conditions":[{"type":"Available","status":"False","lastUpdateTime":"2021-08-26T06:18:51Z","lastTransitionTime":"2021-08-26T06:18:51Z","reason":"MinimumReplicasUnavailable","message":"Deployment does not have minimum availability."},{"type":"Progressing","status":"True","lastUpdateTime":"2021-08-26T06:18:51Z","lastTransitionTime":"2021-08-26T06:18:51Z","reason":"ReplicaSetUpdated","message":"ReplicaSet \"onnx-service-test002-onnx-0-onnx-85465b6bf8\" is progressing."}]}} {"level":"info","ts":1629958732.6247447,"logger":"controllers.SeldonDeployment","msg":"Deployment status 
","SeldonDeployment":"kleveross-system/onnx-service-test002","name":"onnx-service-test002-onnx-0-onnx","status":{"observedGeneration":1,"replicas":1,"updatedReplicas":1,"unavailableReplicas":1,"conditions":[{"type":"Available","status":"False","lastUpdateTime":"2021-08-26T06:18:51Z","lastTransitionTime":"2021-08-26T06:18:51Z","reason":"MinimumReplicasUnavailable","message":"Deployment does not have minimum availability."},{"type":"Progressing","status":"True","lastUpdateTime":"2021-08-26T06:18:51Z","lastTransitionTime":"2021-08-26T06:18:51Z","reason":"ReplicaSetUpdated","message":"ReplicaSet \"onnx-service-test002-onnx-0-onnx-85465b6bf8\" is progressing."}]}} {"level":"info","ts":1629958732.6375253,"logger":"controllers.SeldonDeployment","msg":"Reconcile called","SeldonDeployment":"kleveross-system/onnx-service-test002"} {"level":"info","ts":1629958732.637599,"logger":"seldondeployment","msg":"Defaulting Seldon Deployment called","name":"onnx-service-test002"} {"level":"info","ts":1629958732.637626,"logger":"controllers.SeldonDeployment","msg":"pSvcName","SeldonDeployment":"kleveross-system/onnx-service-test002","val":"onnx-service-test002-onnx"} {"level":"info","ts":1629958732.6378431,"logger":"controllers.SeldonDeployment","msg":"Found identical Service","SeldonDeployment":"kleveross-system/onnx-service-test002","all":false,"namespace":"kleveross-system","name":"onnx-service-test002-onnx-onnx","status":{"loadBalancer":{}}} {"level":"info","ts":1629958732.6379495,"logger":"controllers.SeldonDeployment","msg":"Found identical Service","SeldonDeployment":"kleveross-system/onnx-service-test002","all":false,"namespace":"kleveross-system","name":"onnx-service-test002-onnx","status":{"loadBalancer":{}}} {"level":"info","ts":1629958732.6380079,"logger":"controllers.SeldonDeployment","msg":"Scheme","SeldonDeployment":"kleveross-system/onnx-service-test002","r.scheme":{}} 
{"level":"info","ts":1629958732.638027,"logger":"controllers.SeldonDeployment","msg":"createDeployments","SeldonDeployment":"kleveross-system/onnx-service-test002","deploy":{"namespace":"kleveross-system","name":"onnx-service-test002-onnx-0-onnx"}} {"level":"info","ts":1629958732.6382554,"logger":"controllers.SeldonDeployment","msg":"Updating Deployment","SeldonDeployment":"kleveross-system/onnx-service-test002","namespace":"kleveross-system","name":"onnx-service-test002-onnx-0-onnx"} {"level":"info","ts":1629958732.6485512,"logger":"controllers.SeldonDeployment","msg":"The deployments are the same - api server defaults ignored","SeldonDeployment":"kleveross-system/onnx-service-test002"} {"level":"info","ts":1629958732.6485822,"logger":"controllers.SeldonDeployment","msg":"Found identical deployment","SeldonDeployment":"kleveross-system/onnx-service-test002","namespace":"kleveross-system","name":"onnx-service-test002-onnx-0-onnx","status":{"observedGeneration":1,"replicas":1,"updatedReplicas":1,"unavailableReplicas":1,"conditions":[{"type":"Available","status":"False","lastUpdateTime":"2021-08-26T06:18:51Z","lastTransitionTime":"2021-08-26T06:18:51Z","reason":"MinimumReplicasUnavailable","message":"Deployment does not have minimum availability."},{"type":"Progressing","status":"True","lastUpdateTime":"2021-08-26T06:18:51Z","lastTransitionTime":"2021-08-26T06:18:51Z","reason":"ReplicaSetUpdated","message":"ReplicaSet \"onnx-service-test002-onnx-0-onnx-85465b6bf8\" is progressing."}]}} {"level":"info","ts":1629958732.6486323,"logger":"controllers.SeldonDeployment","msg":"Deployment status 
","SeldonDeployment":"kleveross-system/onnx-service-test002","name":"onnx-service-test002-onnx-0-onnx","status":{"observedGeneration":1,"replicas":1,"updatedReplicas":1,"unavailableReplicas":1,"conditions":[{"type":"Available","status":"False","lastUpdateTime":"2021-08-26T06:18:51Z","lastTransitionTime":"2021-08-26T06:18:51Z","reason":"MinimumReplicasUnavailable","message":"Deployment does not have minimum availability."},{"type":"Progressing","status":"True","lastUpdateTime":"2021-08-26T06:18:51Z","lastTransitionTime":"2021-08-26T06:18:51Z","reason":"ReplicaSetUpdated","message":"ReplicaSet \"onnx-service-test002-onnx-0-onnx-85465b6bf8\" is progressing."}]}}

gaocegege commented 3 years ago

Could you please share the output of `kubectl describe pod <target pod>`?

VincentWei2021 commented 3 years ago

[root@k8s-master ~]# kubectl describe pod -n seldon-system seldon-controller-manager-6757ccd99-792mb Name: seldon-controller-manager-6757ccd99-792mb Namespace: seldon-system Priority: 0 Node: k8s-master/192.168.100.48 Start Time: Thu, 26 Aug 2021 07:29:53 +0000 Labels: app=seldon app.kubernetes.io/instance=seldon1 app.kubernetes.io/name=seldon app.kubernetes.io/version=v0.5 control-plane=seldon-controller-manager pod-template-hash=6757ccd99 Annotations: prometheus.io/scrape: true sidecar.istio.io/inject: false Status: Running IP: 10.244.0.181 IPs: IP: 10.244.0.181 Controlled By: ReplicaSet/seldon-controller-manager-6757ccd99 Containers: manager: Container ID: docker://47eae4b2a7a32104217cde505d693cb0a68fbdbf557e07630fc041ed765be11d Image: ghcr.io/kleveross/seldon-core-operator:v1.5.0-alpha.3 Image ID: docker-pullable://ghcr.io/kleveross/seldon-core-operator@sha256:6d305105b68c4f86fe66b6eafeed07132976512ff60944cb29df489b028dfb25 Ports: 4443/TCP, 8080/TCP Host Ports: 0/TCP, 0/TCP Command: /manager Args: --enable-leader-election --webhook-port=4443 --create-resources=$(MANAGER_CREATE_RESOURCES) --log-level=$(MANAGER_LOG_LEVEL)

State:          Running
  Started:      Thu, 26 Aug 2021 07:29:55 +0000
Ready:          True
Restart Count:  0
Limits:
  cpu:     500m
  memory:  300Mi
Requests:
  cpu:     100m
  memory:  200Mi
Environment:
  MANAGER_LOG_LEVEL:                            INFO
  WATCH_NAMESPACE:                              
  RELATED_IMAGE_EXECUTOR:                       
  RELATED_IMAGE_ENGINE:                         
  RELATED_IMAGE_STORAGE_INITIALIZER:            
  RELATED_IMAGE_SKLEARNSERVER:                  
  RELATED_IMAGE_XGBOOSTSERVER:                  
  RELATED_IMAGE_MLFLOWSERVER:                   
  RELATED_IMAGE_TFPROXY:                        
  RELATED_IMAGE_TENSORFLOW:                     
  RELATED_IMAGE_EXPLAINER:                      
  RELATED_IMAGE_MOCK_CLASSIFIER:                
  MANAGER_CREATE_RESOURCES:                     false
  POD_NAMESPACE:                                seldon-system (v1:metadata.namespace)
  CONTROLLER_ID:                                
  AMBASSADOR_ENABLED:                           false
  AMBASSADOR_SINGLE_NAMESPACE:                  false
  ENGINE_CONTAINER_IMAGE_AND_VERSION:           docker.io/seldonio/engine:1.10.0
  ENGINE_CONTAINER_IMAGE_PULL_POLICY:           IfNotPresent
  ENGINE_CONTAINER_SERVICE_ACCOUNT_NAME:        default
  ENGINE_CONTAINER_USER:                        8888
  ENGINE_LOG_MESSAGES_EXTERNALLY:               false
  PREDICTIVE_UNIT_HTTP_SERVICE_PORT:            9000
  PREDICTIVE_UNIT_GRPC_SERVICE_PORT:            9500
  PREDICTIVE_UNIT_DEFAULT_ENV_SECRET_REF_NAME:  
  PREDICTIVE_UNIT_METRICS_PORT_NAME:            metrics
  ENGINE_SERVER_GRPC_PORT:                      5001
  ENGINE_SERVER_PORT:                           8000
  ENGINE_PROMETHEUS_PATH:                       /prometheus
  ISTIO_ENABLED:                                true
  KEDA_ENABLED:                                 false
  ISTIO_GATEWAY:                                istio-system/kleveross-gateway
  ISTIO_TLS_MODE:                               
  USE_EXECUTOR:                                 true
  EXECUTOR_CONTAINER_IMAGE_AND_VERSION:         docker.io/seldonio/seldon-core-executor:1.10.0
  EXECUTOR_CONTAINER_IMAGE_PULL_POLICY:         IfNotPresent
  EXECUTOR_PROMETHEUS_PATH:                     /prometheus
  EXECUTOR_SERVER_PORT:                         8000
  EXECUTOR_CONTAINER_USER:                      8888
  EXECUTOR_CONTAINER_SERVICE_ACCOUNT_NAME:      default
  EXECUTOR_SERVER_METRICS_PORT_NAME:            metrics
  EXECUTOR_REQUEST_LOGGER_DEFAULT_ENDPOINT:     http://default-broker
  DEFAULT_USER_ID:                              0
  EXECUTOR_DEFAULT_CPU_REQUEST:                 500m
  EXECUTOR_DEFAULT_MEMORY_REQUEST:              512Mi
  EXECUTOR_DEFAULT_CPU_LIMIT:                   500m
  EXECUTOR_DEFAULT_MEMORY_LIMIT:                512Mi
  ENGINE_DEFAULT_CPU_REQUEST:                   500m
  ENGINE_DEFAULT_MEMORY_REQUEST:                512Mi
  ENGINE_DEFAULT_CPU_LIMIT:                     500m
  ENGINE_DEFAULT_MEMORY_LIMIT:                  512Mi
Mounts:
  /tmp/k8s-webhook-server/serving-certs from cert (ro)
  /var/run/secrets/kubernetes.io/serviceaccount from seldon-manager-token-m6nl4 (ro)

Conditions: Type Status Initialized True Ready True ContainersReady True PodScheduled True Volumes: cert: Type: Secret (a volume populated by a Secret) SecretName: seldon-webhook-server-cert Optional: false seldon-manager-token-m6nl4: Type: Secret (a volume populated by a Secret) SecretName: seldon-manager-token-m6nl4 Optional: false QoS Class: Burstable Node-Selectors: Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s node.kubernetes.io/unreachable:NoExecute op=Exists for 300s Events: Type Reason Age From Message


Normal Scheduled 96s default-scheduler Successfully assigned seldon-system/seldon-controller-manager-6757ccd99-792mb to k8s-master Normal Pulled 95s kubelet Container image "ghcr.io/kleveross/seldon-core-operator:v1.5.0-alpha.3" already present on machine Normal Created 94s kubelet Created container manager Normal Started 93s kubelet Started container manager

gaocegege commented 3 years ago

Sorry, I didn't mean the Seldon controller manager; I meant the model server pod.

VincentWei2021 commented 3 years ago

模型服务（model server）的 pod 没有起来。

VincentWei2021 commented 3 years ago

我现在换了 seldon-controller-manager 的版本后，出现了如下 panic： 2021/08/26 07:36:18 http: panic serving 192.168.100.48:31324: runtime error: invalid memory address or nil pointer dereference goroutine 1429 [running]: net/http.(*conn).serve.func1(0xc0004739a0) /usr/local/go/src/net/http/server.go:1800 +0x139 panic(0x1736ec0, 0x28002c0) /usr/local/go/src/runtime/panic.go:975 +0x3e3 github.com/seldonio/seldon-core/operator/apis/machinelearning.seldon.io/v1.(*SeldonDeploymentSpec).checkPredictiveUnits(0xc000df2518, 0xc0005da6a0, 0xc0005da690, 0xc000f305d0, 0x0, 0x0, 0x0, 0xc0006b5ce0, 0xc00009a100, 0x901) /workspace/apis/machinelearning.seldon.io/v1/seldondeployment_webhook.go:338 +0x40 github.com/seldonio/seldon-core/operator/apis/machinelearning.seldon.io/v1.(*SeldonDeploymentSpec).ValidateSeldonDeployment(0xc000df2518, 0x1949373, 0x27) /workspace/apis/machinelearning.seldon.io/v1/seldondeployment_webhook.go:524 +0x590 github.com/seldonio/seldon-core/operator/apis/machinelearning.seldon.io/v1.(*SeldonDeployment).ValidateCreate(0xc000df2400, 0xc00062b0b0, 0x24) /workspace/apis/machinelearning.seldon.io/v1/seldondeployment_webhook.go:576 +0xde sigs.k8s.io/controller-runtime/pkg/webhook/admission.(*validatingHandler).Handle(0xc000335fa0, 0x1b9e5c0, 0xc001494e00, 0xc00062b0b0, 0x24, 0xc00098b380, 0x19, 0xc001158418, 0x2, 0xc001158420, ...) 
/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.6.4/pkg/webhook/admission/webhook.go:135 +0xb3 sigs.k8s.io/controller-runtime/pkg/webhook/admission.(*Webhook).ServeHTTP(0xc00011e450, 0x7f2460700018, 0xc001480690, 0xc00067e000) /go/pkg/mod/sigs.k8s.io/controller-runtime@v0.6.4/pkg/webhook/admission/http.go:87 +0xb61 github.com/prometheus/client_golang/prometheus/promhttp.InstrumentHandlerInFlight.func1(0x7f2460700018, 0xc001480690, 0xc00067e000) /go/pkg/mod/github.com/prometheus/client_golang@v1.7.1/prometheus/promhttp/instrument_server.go:40 +0xab net/http.HandlerFunc.ServeHTTP(0xc00011e6f0, 0x7f2460700018, 0xc001480690, 0xc00067e000) /usr/local/go/src/net/http/server.go:2041 +0x44 github.com/prometheus/client_golang/prometheus/promhttp.InstrumentHandlerCounter.func1(0x1b99940, 0xc000cde000, 0xc00067e000) /go/pkg/mod/github.com/prometheus/client_golang@v1.7.1/prometheus/promhttp/instrument_server.go:100 +0xda net/http.HandlerFunc.ServeHTTP(0xc00011e840, 0x1b99940, 0xc000cde000, 0xc00067e000) /usr/local/go/src/net/http/server.go:2041 +0x44 github.com/prometheus/client_golang/prometheus/promhttp.InstrumentHandlerDuration.func2(0x1b99940, 0xc000cde000, 0xc00067e000) /go/pkg/mod/github.com/prometheus/client_golang@v1.7.1/prometheus/promhttp/instrument_server.go:76 +0xb2 net/http.HandlerFunc.ServeHTTP(0xc00011e930, 0x1b99940, 0xc000cde000, 0xc00067e000) /usr/local/go/src/net/http/server.go:2041 +0x44 net/http.(*ServeMux).ServeHTTP(0xc000993700, 0x1b99940, 0xc000cde000, 0xc00067e000) /usr/local/go/src/net/http/server.go:2416 +0x1a5 net/http.serverHandler.ServeHTTP(0xc0009b1ea0, 0x1b99940, 0xc000cde000, 0xc00067e000) /usr/local/go/src/net/http/server.go:2836 +0xa3 net/http.(*conn).serve(0xc0004739a0, 0x1b9e5c0, 0xc001494d00) /usr/local/go/src/net/http/server.go:1924 +0x86c created by net/http.(*Server).Serve /usr/local/go/src/net/http/server.go:2962 +0x35c

VincentWei2021 commented 3 years ago

http: panic serving 192.168.100.48:31324: runtime error: invalid memory address or nil pointer dereference

gaocegege commented 3 years ago

看起来是 Seldon 的 validating webhook 这边出现了空指针异常（nil pointer dereference），后来有定位到原因吗？