Same issue here, except it occurs on all nodes (three-node cluster, one as master+worker+etcd, the other two being only worker nodes).
```
$ uname -r
5.14.18-100.fc33.x86_64
```

Docker logs on the master node:

```
$ docker logs kubelet
# ...
E0305 14:04:57.220501 44783 summary_sys_containers.go:82] "Failed to get system container stats" err="failed to get cgroup stats for \"/../docker.service\": failed to get container info for \"/../docker.service\": unknown container \"/../docker.service\"" containerName="/../docker.service"
E0305 14:05:04.343207 44783 summary_sys_containers.go:47] "Failed to get system container stats" err="failed to get cgroup stats for \"/../docker.service\": failed to get container info for \"/../docker.service\": unknown container \"/../docker.service\"" containerName="/../docker.service"
# ...
```
Full cluster.yml:

```yaml
nodes:
- address: 192.168.8.119
  port: "22"
  internal_address: ""
  role:
  - controlplane
  - worker
  - etcd
  hostname_override: kube-1
  user: rke
  docker_socket: /var/run/docker.sock
  ssh_key: ""
  ssh_key_path: ~/.ssh/id_rsa
  ssh_cert: ""
  ssh_cert_path: ""
  labels: {}
  taints: []
- address: 192.168.8.150
  port: "22"
  internal_address: ""
  role:
  - worker
  hostname_override: kube-2
  user: rke
  docker_socket: /var/run/docker.sock
  ssh_key: ""
  ssh_key_path: ~/.ssh/id_rsa
  ssh_cert: ""
  ssh_cert_path: ""
  labels: {}
  taints: []
- address: 192.168.8.120
  port: "22"
  internal_address: ""
  role:
  - worker
  hostname_override: kube-3
  user: rke
  docker_socket: /var/run/docker.sock
  ssh_key: ""
  ssh_key_path: ~/.ssh/id_rsa
  ssh_cert: ""
  ssh_cert_path: ""
  labels: {}
  taints: []
services:
  etcd:
    image: ""
    extra_args: {}
    extra_binds: []
    extra_env: []
    win_extra_args: {}
    win_extra_binds: []
    win_extra_env: []
    external_urls: []
    ca_cert: ""
    cert: ""
    key: ""
    path: ""
    uid: 0
    gid: 0
    snapshot: null
    retention: ""
    creation: ""
    backup_config: null
  kube-api:
    image: ""
    extra_args: {}
    extra_binds: []
    extra_env: []
    win_extra_args: {}
    win_extra_binds: []
    win_extra_env: []
    service_cluster_ip_range: 10.43.0.0/16
    service_node_port_range: ""
    pod_security_policy: false
    always_pull_images: false
    secrets_encryption_config: null
    audit_log: null
    admission_configuration: null
    event_rate_limit: null
  kube-controller:
    image: ""
    extra_args: {}
    extra_binds: []
    extra_env: []
    win_extra_args: {}
    win_extra_binds: []
    win_extra_env: []
    cluster_cidr: 10.42.0.0/16
    service_cluster_ip_range: 10.43.0.0/16
  scheduler:
    image: ""
    extra_args: {}
    extra_binds: []
    extra_env: []
    win_extra_args: {}
    win_extra_binds: []
    win_extra_env: []
  kubelet:
    image: ""
    extra_args: {}
    extra_binds: []
    extra_env: []
    win_extra_args: {}
    win_extra_binds: []
    win_extra_env: []
    cluster_domain: cluster.local
    infra_container_image: ""
    cluster_dns_server: 10.43.0.10
    fail_swap_on: false
    generate_serving_certificate: false
  kubeproxy:
    image: ""
    extra_args: {}
    extra_binds: []
    extra_env: []
    win_extra_args: {}
    win_extra_binds: []
    win_extra_env: []
network:
  plugin: flannel
  options: {}
  mtu: 0
  node_selector: {}
  update_strategy: null
  tolerations: []
authentication:
  strategy: x509
  sans: []
  webhook: null
addons: ""
addons_include: []
system_images:
  etcd: rancher/mirrored-coreos-etcd:v3.5.0
  alpine: rancher/rke-tools:v0.1.78
  nginx_proxy: rancher/rke-tools:v0.1.78
  cert_downloader: rancher/rke-tools:v0.1.78
  kubernetes_services_sidecar: rancher/rke-tools:v0.1.78
  kubedns: rancher/mirrored-k8s-dns-kube-dns:1.17.4
  dnsmasq: rancher/mirrored-k8s-dns-dnsmasq-nanny:1.17.4
  kubedns_sidecar: rancher/mirrored-k8s-dns-sidecar:1.17.4
  kubedns_autoscaler: rancher/mirrored-cluster-proportional-autoscaler:1.8.3
  coredns: rancher/mirrored-coredns-coredns:1.8.6
  coredns_autoscaler: rancher/mirrored-cluster-proportional-autoscaler:1.8.5
  nodelocal: rancher/mirrored-k8s-dns-node-cache:1.21.1
  kubernetes: rancher/hyperkube:v1.22.6-rancher1
  flannel: rancher/mirrored-coreos-flannel:v0.15.1
  flannel_cni: rancher/flannel-cni:v0.3.0-rancher6
  calico_node: rancher/mirrored-calico-node:v3.21.1
  calico_cni: rancher/mirrored-calico-cni:v3.21.1
  calico_controllers: rancher/mirrored-calico-kube-controllers:v3.21.1
  calico_ctl: rancher/mirrored-calico-ctl:v3.21.1
  calico_flexvol: rancher/mirrored-calico-pod2daemon-flexvol:v3.21.1
  canal_node: rancher/mirrored-calico-node:v3.21.1
  canal_cni: rancher/mirrored-calico-cni:v3.21.1
  canal_controllers: rancher/mirrored-calico-kube-controllers:v3.21.1
  canal_flannel: rancher/mirrored-coreos-flannel:v0.15.1
  canal_flexvol: rancher/mirrored-calico-pod2daemon-flexvol:v3.21.1
  weave_node: weaveworks/weave-kube:2.8.1
  weave_cni: weaveworks/weave-npc:2.8.1
  pod_infra_container: rancher/mirrored-pause:3.5
  ingress: rancher/nginx-ingress-controller:nginx-1.1.0-rancher1
  ingress_backend: rancher/mirrored-nginx-ingress-controller-defaultbackend:1.5-rancher1
  ingress_webhook: rancher/mirrored-ingress-nginx-kube-webhook-certgen:v1.1.1
  metrics_server: rancher/mirrored-metrics-server:v0.5.1
  windows_pod_infra_container: rancher/kubelet-pause:v0.1.6
  aci_cni_deploy_container: noiro/cnideploy:5.1.1.0.1ae238a
  aci_host_container: noiro/aci-containers-host:5.1.1.0.1ae238a
  aci_opflex_container: noiro/opflex:5.1.1.0.1ae238a
  aci_mcast_container: noiro/opflex:5.1.1.0.1ae238a
  aci_ovs_container: noiro/openvswitch:5.1.1.0.1ae238a
  aci_controller_container: noiro/aci-containers-controller:5.1.1.0.1ae238a
  aci_gbp_server_container: noiro/gbp-server:5.1.1.0.1ae238a
  aci_opflex_server_container: noiro/opflex-server:5.1.1.0.1ae238a
ssh_key_path: ~/.ssh/id_rsa
ssh_cert_path: ""
ssh_agent_auth: false
authorization:
  mode: rbac
  options: {}
ignore_docker_version: null
enable_cri_dockerd: null
kubernetes_version: ""
private_registries: []
ingress:
  provider: ""
  options: {}
  node_selector: {}
  extra_args: {}
  dns_policy: ""
  extra_envs: []
  extra_volumes: []
  extra_volume_mounts: []
  update_strategy: null
  http_port: 0
  https_port: 0
  network_mode: ""
  tolerations: []
  default_backend: null
  default_http_backend_priority_class_name: ""
  nginx_ingress_controller_priority_class_name: ""
  default_ingress_class: null
cluster_name: ""
cloud_provider:
  name: ""
prefix_path: ""
win_prefix_path: ""
addon_job_timeout: 0
bastion_host:
  address: ""
  port: ""
  user: ""
  ssh_key: ""
  ssh_key_path: ""
  ssh_cert: ""
  ssh_cert_path: ""
  ignore_proxy_env_vars: false
monitoring:
  provider: ""
  options: {}
  node_selector: {}
  update_strategy: null
  replicas: null
  tolerations: []
  metrics_server_priority_class_name: ""
restore:
  restore: false
  snapshot_name: ""
rotate_encryption_key: false
dns: null
```
Is this a configuration problem or a bug somewhere in RKE?
I've got a mixed set of nodes (CentOS 7 and Ubuntu 22.04) and only see this on the Ubuntu 22.04 nodes.
I am facing the same issue. It looks to me as if RKE is not compatible with cgroup v2. Is there any update on this?
I am facing the same issue on Ubuntu 22.04.
Same issue; kubelet containers running on Ubuntu 22.04 nodes report the same error.
Same issue as well, on Ubuntu 22.04 with Rancher 2.6.3. I have set `systemd.unified_cgroup_hierarchy=0` to disable cgroup v2, but it may still be a cgroup-related issue.
I guess this has most likely been happening since the cluster was installed, but I didn't notice until I was digging in to change a self-signed root CA certificate. The cluster is up and connected fine, so I'm assuming this failure is only informational in nature (related just to stats gathering, perhaps)?
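If so, one quick way to see whether stats collection otherwise works end to end is to query the metrics pipeline. A sketch; it assumes metrics-server is deployed (as in the cluster.yml above) and that `kubectl` points at this cluster:

```sh
# If node and pod metrics come back, the summary-stats errors above are
# only hitting the system-container ("docker.service") entries.
kubectl top nodes
kubectl top pods -A | head
```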
```
# docker --version
Docker version 20.10.17, build 100c701
# systemd --version
systemd 249 (249.11-0ubuntu3.4)
+PAM +AUDIT +SELINUX +APPARMOR +IMA +SMACK +SECCOMP +GCRYPT +GNUTLS +OPENSSL +ACL +BLKID +CURL +ELFUTILS +FIDO2 +IDN2 -IDN +IPTC +KMOD +LIBCRYPTSETUP -LIBFDISK +PCRE2 -PWQUALITY -P11KIT -QRENCODE +BZIP2 +LZ4 +XZ +ZLIB +ZSTD -XKBCOMMON +UTMP +SYSVINIT default-hierarchy=unified
```
```
I0822 00:56:34.625086 2073 container_manager_linux.go:510] "Discovered runtime cgroup name" cgroupName="/../../system.slice/docker.service"
E0822 00:56:43.960235 2073 summary_sys_containers.go:47] "Failed to get system container stats" err="failed to get cgroup stats for \"/../../system.slice/docker.service\": failed to get container info for \"/../../system.slice/docker.service\": unknown container \"/../../system.slice/docker.service\"" containerName="/../../system.slice/docker.service"
E0822 00:56:53.966543 2073 summary_sys_containers.go:47] "Failed to get system container stats" err="failed to get cgroup stats for \"/../../system.slice/docker.service\": failed to get container info for \"/../../system.slice/docker.service\": unknown container \"/../../system.slice/docker.service\"" containerName="/../../system.slice/docker.service"
--- repeats ad nauseam, every ~10 seconds ---
```
```
# mount | grep cgroup
tmpfs on /sys/fs/cgroup type tmpfs (ro,nosuid,nodev,noexec,size=4096k,nr_inodes=1024,mode=755,inode64)
cgroup2 on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime)
cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd)
cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer)
cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct)
cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids)
cgroup on /sys/fs/cgroup/misc type cgroup (rw,nosuid,nodev,noexec,relatime,misc)
cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_cls,net_prio)
cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event)
cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices)
cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset)
cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio)
cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory)
cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb)
cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma)
```

```
# cat /proc/cmdline
BOOT_IMAGE=/boot/vmlinuz-5.15.0-47-generic root=/dev/vda1 ro systemd.unified_cgroup_hierarchy=0 console=tty1 console=ttyS0
```
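For reference, a quick way to confirm which hierarchy is actually active on a node (the mount output above is the hybrid v1 layout):

```sh
# "cgroup2fs" = pure cgroup v2; "tmpfs" = v1/hybrid, as shown above
stat -fc %T /sys/fs/cgroup/
# What Docker itself reports (the CgroupVersion field exists on Docker 20.10+)
docker info --format '{{.CgroupDriver}} {{.CgroupVersion}}'
```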
I have the same problem. In a 4-node cluster on Ubuntu 20.04 running Rancher-provisioned k8s (RKE v1.23.10), I upgraded 3 nodes to 22.04. The results were baffling. On one node everything went smoothly. On the other 2 nodes, Rancher began to spawn thousands of rancher-agent containers, so I had to remove them from the cluster, clean up the nodes, and add them again. The nodes look OK now. However, I keep seeing the above error about cgroups, and, possibly related, more instances of unremoved volumes from orphaned pods that have to be removed by hand.
Upon further investigation I found the culprit for the strange cgroup behavior: on one upgraded node the kubelet was finding the correct cgroups, and on the other two it didn't, although the Docker configuration on all hosts was the same. The problem was the `.HostConfig.CgroupnsMode` parameter on the kubelet container.

On Ubuntu 22.04, Docker starts using the systemd cgroup driver with cgroup v2, but on Ubuntu 20.04 it was using cgroupfs with cgroup v1. The `default-cgroupns-mode` option defaults to `host` for v1 and to `private` for v2.

The node that worked without cleanup after the upgrade kept the container configuration from when it was running on Docker with cgroup v1; the kubelet containers on the other two nodes, which had the errors, were created with `private` (the new default) cgroupns mode.
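To compare nodes, that field can be read directly (`kubelet` is the container name RKE creates):

```sh
# Prints "host" on the node that kept working, "private" on the failing ones
docker inspect --format '{{.HostConfig.CgroupnsMode}}' kubelet
```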
Now the question is: which is the best approach to fix this?
1. Edit the `.HostConfig.CgroupnsMode` of the kubelet container and change it to `host`?
2. Change the `default-cgroupns-mode` of the Docker daemon to `host`?
3. Change the Docker cgroup driver back to `cgroupfs` and v1?
I also had this issue after a fresh install on Ubuntu 22.04 and was able to solve it after some inspiration from @ZleFox.
Updated (with feedback from @ZleFox)

1. Edit `/etc/docker/daemon.json` and add `"default-cgroupns-mode": "host"`. E.g.

```json
{
  "default-cgroupns-mode": "host"
}
```

2. Install `jq` if you don't already have it, with `apt-get update && apt-get install jq`.
3. Patch the existing `kubelet` container to change its mode (changing the default above does not affect existing containers):

```sh
KUBELET_HOST_CONFIG=/var/lib/docker/containers/$(docker inspect --format="{{.Id}}" kubelet)/hostconfig.json
sudo cp $KUBELET_HOST_CONFIG $KUBELET_HOST_CONFIG.bak
sudo jq '.CgroupnsMode = "host"' $KUBELET_HOST_CONFIG.bak | sudo tee $KUBELET_HOST_CONFIG > /dev/null
sudo systemctl restart docker
```
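A quick check after the restart (hedged: `kubelet` is the container name RKE creates, and the log message is the one quoted throughout this thread):

```sh
# Confirm the new namespace mode took effect...
docker inspect --format '{{.HostConfig.CgroupnsMode}}' kubelet   # expect: host
# ...and count recent occurrences of the stats error (should be 0)
docker logs --since 10m kubelet 2>&1 | grep -c "Failed to get system container stats"
```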
Original

First, set the `default-cgroupns-mode` of the Docker daemon to `host` on each node. I did this one node at a time, waiting for all pods to come back before moving on to the next node.

~~1. Edit `/lib/systemd/system/docker.service` and append `--default-cgroupns-mode=host --exec-opt native.cgroupdriver=systemd` to the end of `ExecStart`~~
~~2. `sudo systemctl daemon-reload`~~
~~3. `sudo systemctl restart docker`~~

~~This only changes the cgroup for new containers. I couldn't work out how to change this for existing containers, so my solution was simply to delete the `kubelet` container from each node, then run `rke up` again, which re-created it with `CgroupnsMode=host`. (Please let me know if anyone knows a way to edit this on an existing container!)~~

~~1. On the first node, run `docker rm -f kubelet`~~
~~2. Run `rke up`~~
~~3. Verify with `docker inspect kubelet | grep CgroupnsMode`~~
It might be possible to remove the kubelet container from all nodes at once, but I didn't want to push my luck as I was working with a production cluster.
Hope this helps others!
You can actually patch the current kubelet container by stopping Docker (both the service and the socket) and editing its `hostconfig.json` under `/var/lib/docker/containers/`. Also, just add `"default-cgroupns-mode": "host",` to `/etc/docker/daemon.json` and you won't have to edit service files, which would disrupt future upgrades via apt.
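A sketch of that sequence, under the assumption that the RKE-managed container is named `kubelet`; the container ID has to be captured before stopping Docker, since the CLI won't respond afterwards:

```sh
KUBELET_ID=$(docker inspect --format '{{.Id}}' kubelet)
# Stop the service AND the socket, so the daemon can't overwrite the file
sudo systemctl stop docker.service docker.socket
HC=/var/lib/docker/containers/$KUBELET_ID/hostconfig.json
sudo jq '.CgroupnsMode = "host"' "$HC" | sudo tee "$HC.new" > /dev/null
sudo mv "$HC.new" "$HC"
sudo systemctl start docker.socket docker.service
```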
Good tip @ZleFox. I've moved the configuration to `/etc/docker/daemon.json` and removed it from `/lib/systemd/system/docker.service`. Thanks for the tips on patching the `hostconfig.json` file!
I've updated my original comment to reflect the updated changes.
I'm getting the same issue on Rocky Linux 9.
{"log":"E0209 22:24:27.239981 4186647 summary_sys_containers.go:83] \"Failed to get system container stats\" err=\"failed to get cgroup stats for \\"/../docker.service\\": failed to get container info for \\"/../docker.service\\": unknown container \\"/../docker.service\\"\" containerName=\"/../docker.service\"\n","stream":"stderr","time":"2024-02-09T22:24:27.240306186Z"} {"log":"E0209 22:24:35.541261 4186647 summary_sys_containers.go:48] \"Failed to get system container stats\" err=\"failed to get cgroup stats for \\"/../docker.service\\": failed to get container info for \\"/../docker.service\\": unknown container \\"/../docker.service\\"\" containerName=\"/../docker.service\"\n","stream":"stderr","time":"2024-02-09T22:24:35.541394101Z"}
I tried the above steps of updating `default-cgroupns-mode` and `hostconfig.json`. Are there any other steps we can try?
```
Kernel: 5.14.0-362.13.1.el9_3.x86_64
NAME="Rocky Linux"
VERSION="9.3 (Blue Onyx)"

Client: Docker Engine - Community
 Version:           24.0.7
 API version:       1.41 (downgraded from 1.43)
 Go version:        go1.20.10
 Git commit:        afdd53b
 Built:             Thu Oct 26 09:09:13 2023
 OS/Arch:           linux/amd64
 Context:           default

Server: Docker Engine - Community
 Engine:
  Version:          20.10.24
  API version:      1.41 (minimum version 1.12)
  Go version:       go1.19.7
  Git commit:       5d6db84
  Built:            Tue Apr 4 18:18:28 2023
  OS/Arch:          linux/amd64
  Experimental:     false
 containerd:
  Version:          1.6.26
  GitCommit:        3dd1e886e55dd695541fdcd67420c2888645a495
 runc:
  Version:          1.1.10
  GitCommit:        v1.1.10-0-g18a0cb0
 docker-init:
  Version:          0.19.0
  GitCommit:        de40ad0
```
RKE version: v1.3.5
Docker version: (`docker version`, `docker info` preferred) Docker version 20.10.10, build b485636
Operating system and kernel: (`cat /etc/os-release`, `uname -r` preferred) Debian GNU/Linux 11 (bullseye)
Type/provider of hosts: (VirtualBox/Bare-metal/AWS/GCE/DO) Bare-metal
cluster.yml file:
Steps to Reproduce: Run an RKE cluster following the documentation. Log onto the master. Check the `kubelet` logs (e.g. `docker logs kubelet`), and notice the errors quoted at the top of this thread.
Results: It's expected to have no errors in this log. Possibly linked to this issue: https://github.com/kubernetes/kubeadm/issues/2077
Of course, as `kubelet` runs on `docker` with RKE, it's quite impossible to apply the suggested workarounds that rely on `systemd`.
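One thing that can at least be checked from outside the container is which runtime cgroup the containerized kubelet discovered; a relative path like `/../docker.service` in the "Discovered runtime cgroup name" line quoted earlier is the symptom discussed in this thread:

```sh
# Show the discovered runtime cgroup and any resulting stats errors
docker logs kubelet 2>&1 | grep -E 'Discovered runtime cgroup|Failed to get system container stats' | head
```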