Closed: nmehlei closed this issue 2 months ago.
Description

I'm facing the same problem: both the hcloud-csi-node instances and the hcloud-csi-controller are constantly being restarted. This is a relatively fresh cluster, created around a week ago, with just a few simple services running.
Logs

hcloud-csi-node:
I0824 01:30:58.006872       1 main.go:150] "Version" version="v2.11.1"
I0824 01:30:58.007058       1 main.go:151] "Running node-driver-registrar" mode=""
I0824 01:30:59.018946       1 node_register.go:56] "Starting Registration Server" socketPath="/registration/csi.hetzner.cloud-reg.sock"
I0824 01:30:59.020808       1 node_register.go:66] "Registration Server started" socketPath="/registration/csi.hetzner.cloud-reg.sock"
I0824 01:30:59.020899       1 node_register.go:96] "Skipping HTTP server"
I0824 01:30:59.640445       1 main.go:96] "Received GetInfo call" request="&InfoRequest{}"
I0824 01:30:59.760491       1 main.go:108] "Received NotifyRegistrationStatus call" status="&RegistrationStatus{PluginRegistered:true,Error:,}"
I0825 00:27:11.763976       1 main.go:96] "Received GetInfo call" request="&InfoRequest{}"
I0825 00:27:12.281916       1 main.go:108] "Received NotifyRegistrationStatus call" status="&RegistrationStatus{PluginRegistered:true,Error:,}"
Logs for liveness probe within hcloud-csi-node:
E0902 15:43:59.466673       1 main.go:67] "Failed to establish connection to CSI driver" err="context deadline exceeded"
E0902 15:44:10.074620       1 main.go:67] "Failed to establish connection to CSI driver" err="context deadline exceeded"
E0902 15:44:13.382138       1 main.go:67] "Failed to establish connection to CSI driver" err="context deadline exceeded"
E0902 15:44:13.389440       1 main.go:67] "Failed to establish connection to CSI driver" err="context deadline exceeded"
E0902 15:44:13.390346       1 main.go:67] "Failed to establish connection to CSI driver" err="context canceled"
E0902 15:44:26.310868       1 main.go:67] "Failed to establish connection to CSI driver" err="context deadline exceeded"
E0902 15:44:29.704050       1 main.go:67] "Failed to establish connection to CSI driver" err="context deadline exceeded"
E0902 15:44:31.974671       1 main.go:77] "Health check failed" err="rpc error: code = DeadlineExceeded desc = context deadline exceeded"
E0902 15:44:44.586302       1 main.go:77] "Health check failed" err="rpc error: code = DeadlineExceeded desc = context deadline exceeded"
E0902 15:44:53.939488       1 main.go:77] "Health check failed" err="rpc error: code = DeadlineExceeded desc = context deadline exceeded"
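Since the liveness probe sidecar is what reports these failures, the restart cause can be confirmed from the pod events and from the logs of the previous (killed) container instance. A minimal sketch using standard kubectl commands; the "app=hcloud-csi" label selector and the "liveness-probe" container name are assumptions and may differ between chart versions:

# List the CSI pods with their restart counts (label selector is an assumption)
kubectl -n kube-system get pods -l app=hcloud-csi

# The Events section of the describe output shows whether the kubelet
# killed the container after repeated probe failures
kubectl -n kube-system describe pod <hcloud-csi-node-pod>

# Logs of the previous, killed container instance (container name is an assumption)
kubectl -n kube-system logs <hcloud-csi-node-pod> -c liveness-probe --previous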
hcloud-csi-controller:
I0902 15:32:19.560366       1 trace.go:236] Trace[824572742]: "Reflector ListAndWatch" name:k8s.io/client-go/informers/factory.go:160 (02-Sep-2024 15:31:59.638) (total time: 19826ms):
Trace[824572742]: ---"Objects listed" error:<nil> 15403ms (15:32:15.041)
Trace[824572742]: ---"Resource version extracted" 2341ms (15:32:17.382)
Trace[824572742]: ---"SyncWith done" 1674ms (15:32:19.232)
Trace[824572742]: [19.826579496s] [19.826579496s] END
I0902 15:32:37.881333       1 trace.go:236] Trace[1413627624]: "Reflector ListAndWatch" name:k8s.io/client-go/informers/factory.go:160 (02-Sep-2024 15:31:51.642) (total time: 43422ms):
Trace[1413627624]: ---"Objects listed" error:<nil> 43399ms (15:32:35.041)
Trace[1413627624]: [43.422928717s] [43.422928717s] END
W0902 15:45:27.368751       1 reflector.go:470] k8s.io/client-go/informers/factory.go:160: watch of *v1.CSINode ended with: an error on the server ("unable to decode an event from the watch stream: http2: client connection lost") has prevented the request from succeeding
W0902 15:45:30.001733       1 reflector.go:470] k8s.io/client-go/informers/factory.go:160: watch of *v1.VolumeAttachment ended with: an error on the server ("unable to decode an event from the watch stream: http2: client connection lost") has prevented the request from succeeding
W0902 15:47:30.000819       1 reflector.go:470] k8s.io/client-go/informers/factory.go:160: watch of *v1.PersistentVolume ended with: an error on the server ("unable to decode an event from the watch stream: http2: client connection lost") has prevented the request from succeeding
I0902 15:47:48.853225       1 trace.go:236] Trace[847951132]: "Reflector ListAndWatch" name:k8s.io/client-go/informers/factory.go:160 (02-Sep-2024 15:46:41.046) (total time: 62670ms):
Trace[847951132]: ---"Objects listed" error:<nil> 59844ms (15:47:40.890)
Trace[847951132]: ---"Resource version extracted" 1950ms (15:47:42.841)
Trace[847951132]: [1m2.670380599s] [1m2.670380599s] END
I0902 15:47:48.871428       1 trace.go:236] Trace[1806878113]: "Reflector ListAndWatch" name:k8s.io/client-go/informers/factory.go:160 (02-Sep-2024 15:45:33.917) (total time: 134687ms):
Trace[1806878113]: ---"Objects listed" error:<nil> 129517ms (15:47:43.434)
Trace[1806878113]: ---"Resource version extracted" 2965ms (15:47:46.400)
Trace[1806878113]: [2m14.687636424s] [2m14.687636424s] END
I0902 15:48:10.705429       1 trace.go:236] Trace[2102806860]: "Reflector ListAndWatch" name:k8s.io/client-go/informers/factory.go:160 (02-Sep-2024 15:47:34.577) (total time: 35980ms):
Trace[2102806860]: ---"Objects listed" error:<nil> 35895ms (15:48:10.472)
Trace[2102806860]: [35.980470108s] [35.980470108s] END
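The repeated "http2: client connection lost" watch errors and the multi-minute ListAndWatch traces suggest the controller is struggling to reach the Kubernetes API server, rather than a fault inside the CSI driver itself. Assuming working kubectl access, the API server's own health endpoints can be queried directly to see whether it is the slow side:

# Query the aggregated readiness and liveness checks of the API server
kubectl get --raw='/readyz?verbose'
kubectl get --raw='/livez?verbose'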
This may be somewhat related to this discussion: https://github.com/kube-hetzner/terraform-hcloud-kube-hetzner/discussions/1292
Kube.tf file

locals {
  hcloud_token = "redacted"
}

module "kube-hetzner" {
  providers = {
    hcloud = hcloud
  }
  hcloud_token = var.hcloud_token != "" ? var.hcloud_token : local.hcloud_token
  source       = "kube-hetzner/kube-hetzner/hcloud"

  ssh_public_key  = file("./mycluster_id_ed25555.pub")
  ssh_private_key = file("./mycluster_id_ed25555")
  network_region  = "eu-central" # change to `us-east` if location is ash

  control_plane_nodepools = [
    {
      name        = "control-plane-fsn1",
      server_type = "cx22",
      location    = "fsn1",
      labels      = [],
      taints      = [],
      count       = 1
    },
    {
      name        = "control-plane-nbg1",
      server_type = "cx22",
      location    = "nbg1",
      labels      = [],
      taints      = [],
      count       = 1
    },
    {
      name        = "control-plane-hel1",
      server_type = "cx22",
      location    = "hel1",
      labels      = [],
      taints      = [],
      count       = 1
    }
  ]

  agent_nodepools = [
    {
      name        = "agent-small",
      server_type = "cx22",
      location    = "fsn1",
      labels      = [],
      taints      = [],
      count       = 0
    },
    {
      name        = "agent-large",
      server_type = "cx32",
      location    = "nbg1",
      labels      = [],
      taints      = [],
      count       = 0
    },
    {
      name        = "storage",
      server_type = "cx32",
      location    = "fsn1",
      # Fully optional, just a demo.
      labels = [
        "node.kubernetes.io/server-usage=storage"
      ],
      taints = [],
      count  = 0
    },
    {
      name        = "egress",
      server_type = "cx22",
      location    = "fsn1",
      labels = [
        "node.kubernetes.io/role=egress"
      ],
      taints = [
        "node.kubernetes.io/role=egress:NoSchedule"
      ],
      floating_ip = true
      count       = 0
    },
    # Arm based nodes
    {
      name        = "agent-arm-small",
      server_type = "cax11",
      location    = "fsn1",
      labels      = [],
      taints      = [],
      count       = 0
    },
  ]

  # * LB location and type, the latter will depend on how much load you want it to handle, see https://www.hetzner.com/cloud/load-balancer
  load_balancer_type     = "lb11"
  load_balancer_location = "fsn1"

  ingress_controller                = "nginx"
  ingress_target_namespace          = "nginx"
  enable_klipper_metal_lb           = "false"
  enable_metrics_server             = true
  allow_scheduling_on_control_plane = true
  system_upgrade_use_drain          = true
  automatically_upgrade_os          = true
  cluster_name                      = "my-cluster"

  k3s_registries = <<-EOT
    configs:
      myclusterregistry.azurecr.io:
        auth:
          username: myclusterregistry
          password: redacted
  EOT

  extra_firewall_rules = [
    {
      description     = "HTTP"
      direction       = "in"
      protocol        = "tcp"
      port            = "80"
      source_ips      = ["0.0.0.0/0", "::/0"] # Allow from any IP address
      destination_ips = []                    # Won't be used for this rule
    },
    {
      description     = "HTTPS"
      direction       = "in"
      protocol        = "tcp"
      port            = "443"
      source_ips      = ["0.0.0.0/0", "::/0"] # Allow from any IP address
      destination_ips = []                    # Won't be used for this rule
    },
    {
      description     = "Outgoing all"
      direction       = "out"
      protocol        = "tcp"
      port            = "any"
      source_ips      = []                    # Won't be used for this rule
      destination_ips = ["0.0.0.0/0", "::/0"]
    }
  ]

  enable_cert_manager = true

  dns_servers = [
    "1.1.1.1",
    "8.8.8.8",
    "2606:4700:4700::1111",
  ]

  additional_tls_sans       = ["my-cluster.my-hosts.de"]
  kubeconfig_server_address = "my-cluster.my-hosts.de"
  lb_hostname               = "nginx.my-cluster.my-hosts.de"
}

provider "hcloud" {
  token = var.hcloud_token != "" ? var.hcloud_token : local.hcloud_token
}

terraform {
  required_version = ">= 1.5.0"
  required_providers {
    hcloud = {
      source  = "hetznercloud/hcloud"
      version = ">= 1.43.0"
    }
  }
}

output "kubeconfig" {
  value     = module.kube-hetzner.kubeconfig
  sensitive = true
}

variable "hcloud_token" {
  sensitive = true
  default   = ""
}
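For what it's worth, nothing in this config is obviously CSI-related, and it can at least be syntax-checked and diffed against the live cluster state with the standard Terraform workflow before suspecting it:

# Refresh providers/modules, validate syntax, and preview any drift
terraform init -upgrade
terraform validate
terraform plan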
Screenshots

The warning sign in the screenshot shows:

Back-off restarting failed container hcloud-csi-driver in pod hcloud-csi-controller
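The back-off events behind that warning can also be listed without the dashboard; a sketch, assuming the controller pod's generated name is filled in:

# Show the events for the controller pod, newest last (pod name placeholder must be replaced)
kubectl -n kube-system get events --field-selector involvedObject.name=<hcloud-csi-controller-pod> --sort-by=.lastTimestamp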
Platform

Mac as Desktop
Same issue with a little lab cluster (2 workers, 1 control plane) :(
hcloud-csi-node logs...
Any tips?
Regards
Will look into this ASAP.
Must have been an issue on Hetzner's side with their API. Closing for now.
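If this recurs, a direct spot-check of the Hetzner Cloud API from an affected node can confirm or rule out API-side slowness. A sketch, assuming a read-only API token is exported as HCLOUD_TOKEN; the /v1/locations endpoint is from Hetzner's public Cloud API docs:

# Time a simple authenticated request against the Hetzner Cloud API
time curl -s -H "Authorization: Bearer $HCLOUD_TOKEN" https://api.hetzner.cloud/v1/locations > /dev/null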