sebastienmusso opened this issue 8 months ago
@sebastienmusso Would you be able to provide the (sanitized) yaml for a cluster.provisioning.cattle.io object you are seeing this issue with?
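(Such an object can be exported from the local/management cluster with kubectl; a sketch, using the name and namespace from the manifest posted below. Sanitizing credentials and endpoints before sharing is still a manual step.)

```bash
# Export the provisioning cluster object for sharing (sanitize before posting)
kubectl get clusters.provisioning.cattle.io test-dev-2 -n fleet-kaas -o yaml > cluster-test-dev-2.yaml
```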
Hi @jakefhyde,
here is my cluster provisioning object:
```yaml
apiVersion: provisioning.cattle.io/v1
kind: Cluster
metadata:
  name: test-dev-2
  namespace: fleet-kaas
spec:
  agentEnvVars:
  - name: CATTLE_AGENT_LOGLEVEL
    value: debug
  defaultClusterRoleForProjectMembers: admin
  enableNetworkPolicy: true
  fleetAgentDeploymentCustomization:
    appendTolerations:
    - effect: NoSchedule
      key: CriticalAddonsOnly
      operator: Equal
      value: "true"
    overrideAffinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
          - matchExpressions:
            - key: node-role.kubernetes.io/control-plane
              operator: In
              values:
              - "true"
            - key: node-role.kubernetes.io/master
              operator: In
              values:
              - "true"
  kubernetesVersion: v1.26.10+rke2r2
  localClusterAuthEndpoint: {}
  rkeConfig:
    etcd:
      s3:
        bucket: mybucket
        cloudCredentialName: etcd-s3-backup-creds
        endpoint: mys3endpoint
        folder: rke2
        region: us-east-1
    machineGlobalConfig:
      cluster-cidr: 100.64.64.0/19
      cluster-dns: 100.64.224.10
      cluster-domain: test-dev-2.local
      cni: cilium
      control-plane-resource-requests:
      - kube-apiserver-cpu=200m
      - kube-apiserver-memory=1536M
      - kube-scheduler-cpu=10m
      - kube-scheduler-memory=128M
      - etcd-cpu=100m
      - etcd-memory=512M
      - kube-controller-manager-cpu=10m
      - kube-controller-manager-memory=128M
      - cloud-controller-manager-cpu=10m
      - cloud-controller-manager-memory=128M
      disable:
      - rke2-ingress-nginx
      disable-cloud-controller: false
      disable-kube-proxy: true
      etcd-expose-metrics: true
      kube-apiserver-arg:
      - audit-log-path=/var/log/kubernetes-audit.log
      - tls-min-version=VersionTLS12
      - enable-admission-plugins=CertificateApproval,CertificateSigning,CertificateSubjectRestriction,DefaultIngressClass,DefaultStorageClass,DefaultTolerationSeconds,LimitRanger,MutatingAdmissionWebhook,PersistentVolumeClaimResize,Priority,ResourceQuota,RuntimeClass,StorageObjectInUseProtection,TaintNodesByCondition,ValidatingAdmissionWebhook,NodeRestriction
      - cloud-provider=external
      kube-controller-manager-arg:
      - configure-cloud-routes=false
      - tls-min-version=VersionTLS12
      - bind-address=0.0.0.0
      - secure-port=10257
      - cloud-provider=external
      kube-scheduler-arg:
      - tls-min-version=VersionTLS12
      - bind-address=0.0.0.0
      - secure-port=10259
      kubelet-arg:
      - feature-gates=AdvancedAuditing=true
      - container-log-max-files=4
      - container-log-max-size=50Mi
      - system-reserved=cpu=300m,memory=1Gi
      - kube-reserved=cpu=200m,memory=1Gi
      - cloud-provider=external
      - eviction-hard=memory.available<500Mi,nodefs.available<5%,imagefs.available<5%,nodefs.inodesFree<5%
      - eviction-soft=memory.available<1.5Gi,nodefs.available<20%,imagefs.available<10%,nodefs.inodesFree<10%
      - eviction-soft-grace-period=memory.available=1m30s,nodefs.available=1m30s,imagefs.available=1m30s,nodefs.inodesFree=1m30s
      private-registry: /var/registries.yaml
      protect-kernel-defaults: true
      service-cidr: 100.64.224.0/19
    machinePoolDefaults: {}
    registries:
      mirrors:
        docker.io:
          endpoint:
          - https://docker-io-registry
        ghcr.io:
          endpoint:
          - https://ghcr-registry
        k8s.gcr.io:
          endpoint:
          - https://k8s-gcr-registry
        quay.io:
          endpoint:
          - https://quay-registry
        registry.k8s.io:
          endpoint:
          - https://registry-k8s-registry
    upgradeStrategy:
      controlPlaneConcurrency: 0%
      controlPlaneDrainOptions:
        deleteEmptyDirData: true
        disableEviction: false
        enabled: false
        force: false
        gracePeriod: 300
        ignoreDaemonSets: true
        ignoreErrors: false
        postDrainHooks: null
        preDrainHooks: null
        skipWaitForDeleteTimeoutSeconds: 300
        timeout: 900
      workerConcurrency: 0%
      workerDrainOptions:
        deleteEmptyDirData: true
        disableEviction: false
        enabled: false
        force: false
        gracePeriod: 300
        ignoreDaemonSets: true
        ignoreErrors: false
        postDrainHooks: null
        preDrainHooks: null
        skipWaitForDeleteTimeoutSeconds: 300
        timeout: 900
```
I thought it might come from cloud-provider being set to external, which is not necessary.
What cloud provider are you using? AWS? From the logs you posted, it seems like Rancher is reflecting a deleted v1.Node object. To clarify as well, the corresponding machines.cluster.x-k8s.io object was still around, right? You just saw your v1.Node object in the downstream cluster be deleted?
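One way to check both sides is sketched below; the kubeconfig context names (`management` and `test-dev-2`) are placeholders for your own setup:

```bash
# CAPI machine objects live in the management cluster, in the cluster's namespace
kubectl --context management get machines.cluster.x-k8s.io -n fleet-kaas

# The corresponding v1.Node objects live in the downstream cluster
kubectl --context test-dev-2 get nodes
```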
@guillaumebernard84, since you've 👍 the issue I'm wondering if this is something you also ran into and could share more information with us?
@Oats87 We use the OpenStack out-of-tree cloud-controller-manager, so only the service controller is enabled (load balancer handling). The machines.cluster.x-k8s.io objects were deleted in the RMS as well, as was the v1.Node in the downstream cluster.
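A way to confirm which controllers the out-of-tree CCM is actually running with; the DaemonSet name and namespace here are assumptions based on a typical openstack-cloud-controller-manager install:

```bash
# Print the CCM container args; with only the service controller enabled,
# expect something like --controllers=service among them
kubectl --context test-dev-2 -n kube-system get daemonset openstack-cloud-controller-manager \
  -o jsonpath='{.spec.template.spec.containers[0].args}'
```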
Hello @snasovich, I work with sebastienmusso, we tried to diagnose the problem together, but weren't able to find the root cause.
It seems that when we delete downstream cluster nodes, some nodes.management.cattle.io objects remain stuck in a deleting state, with a non-null deletionTimestamp field.
In order to delete them correctly, we had to remove the finalizer.
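A minimal sketch of that cleanup, assuming the cluster namespace c-m-vfrrp9d2 and an object name taken from the logs further down; clearing finalizers is a last resort since it skips the controller's own cleanup logic:

```bash
# Find management-cluster node objects stuck terminating
kubectl get nodes.management.cattle.io -n c-m-vfrrp9d2 \
  -o custom-columns=NAME:.metadata.name,DELETED:.metadata.deletionTimestamp

# Force a stuck object through deletion by clearing its finalizers
kubectl patch nodes.management.cattle.io machine-cvdrz -n c-m-vfrrp9d2 \
  --type merge -p '{"metadata":{"finalizers":null}}'
```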
Could it come from our node upgrade process?
1. Delete the node from the machine.k8s-api and wait for the node to be deleted from the downstream cluster (sketched below)
2. Delete and recreate the virtual machine with our IaaS (same hostname, same IP)
3. Re-enroll the machine with Rancher
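A sketch of step 1, assuming a machine named custom-00266ec772bf (borrowed from the planner logs below), a worker node name from this cluster, and a downstream kubeconfig context named `test-dev-2`:

```bash
# Delete the CAPI machine object in the management cluster
kubectl delete machines.cluster.x-k8s.io custom-00266ec772bf -n fleet-kaas

# Wait until the corresponding node is gone from the downstream cluster
kubectl --context test-dev-2 wait node/test-dev-2-worker-az1-0 --for=delete --timeout=10m
```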
It feels like it is the upgrade process. I am trying to reproduce it to confirm.
> It seems that when we delete downstream cluster nodes, some nodes.management.cattle.io objects remain stuck in a deleting state, with a non-null deletionTimestamp field.
> In order to delete them correctly, we had to remove the finalizer.
> Could it come from our node upgrade process?
> 1. Delete the node from the machine.k8s-api and wait for the node to be deleted from the downstream cluster
> 2. Delete and recreate the virtual machine with our IaaS (same hostname, same IP)
> 3. Re-enroll the machine with Rancher
To clarify, your node upgrade process is happening independently from your upgrade of Rancher, correct?
@Oats87 Correct.
Rancher Server Setup
- Rancher version: 2.8.1
- Installation option (Docker install/Helm Chart): Helm
- If Helm Chart, Kubernetes Cluster and version (RKE1, RKE2, k3s, EKS, etc): RKE2
- Proxy/Cert Details:

Information about the Cluster
- Kubernetes version: v1.26.10+rke2r2
- Cluster Type (Local/Downstream): Downstream
- If downstream, what type of cluster? (Custom/Imported or specify provider for Hosted/Infrastructure Provider): Custom v1.26.10+rke2r2
- What is the role of the user logged in? (Admin/Cluster Owner/Cluster Member/Project Owner/Project Member/Custom) If custom, define the set of permissions: Admin
Describe the bug
We have a Rancher RMS cluster on v1.26.10+rke2r2 and a downstream RKE2 custom cluster on v1.26.10+rke2r2 with 3 control-plane nodes and 4 workers. The Rancher Helm chart upgrade (2.8.0 -> 2.8.1) caused the deletion of 2 control-plane nodes and 2 workers. I had to remove the Machine.cluster.x-k8s.io objects from the cluster management UI and re-enroll the nodes in order to get all of my nodes fully functional again.
This operation was performed on 8 different Rancher management servers, and the issue appeared on only 3 of them.
Result
Cluster broken

Expected Result
The downstream cluster keeps its nodes

Additional context
Interesting log lines from Rancher v2.8.1 at startup:
```
2024/01/29 16:08:39 [INFO] [planner] rkecluster fleet-kaas/test-dev-2: configuring bootstrap node(s) custom-00266ec772bf: waiting for plan to be applied
2024/01/29 16:08:39 [INFO] [planner] rkecluster fleet-kaas/test-dev-2: configuring bootstrap node(s) custom-00266ec772bf: waiting for plan to be applied
2024/01/29 16:08:39 [INFO] [planner] rkecluster fleet-kaas/test-dev-2: configuring bootstrap node(s) custom-00266ec772bf: waiting for plan to be applied
2024/01/29 16:08:39 [ERROR] error syncing 'system-library': handler catalog: failed to fetch images from system charts: failed to load system charts index: stat management-state/catalog-cache/a67038d6110101d84b823470950db15b8ab6c06fe4e8895bcae7a337c1a8990d: no such file or directory, requeuing
2024/01/29 16:08:38 [INFO] Catalog sync done. 0 templates created, 7 templates updated, 0 templates deleted, 0 templates failed
2024/01/29 16:08:38 [INFO] [planner] rkecluster fleet-kaas/test-dev-2: configuring bootstrap node(s) custom-00266ec772bf: waiting for plan to be applied
2024/01/29 16:08:38 [ERROR] error syncing 'c-m-vfrrp9d2/machine-cvdrz': handler node-controller-sync: Operation cannot be fulfilled on nodes.management.cattle.io "machine-cvdrz": StorageError: invalid object, Code: 4, Key: /registry/management.cattle.io/nodes/c-m-vfrrp9d2/machine-cvdrz, ResourceVersion: 0, AdditionalErrorMsg: Precondition failed: UID in precondition: 8f7f17c4-0329-43a2-aa6e-b20c2cdb3955, UID in object meta: , requeuing
2024/01/29 16:08:38 [ERROR] error syncing 'c-m-vfrrp9d2/machine-gp5rg': handler node-controller-sync: Operation cannot be fulfilled on nodes.management.cattle.io "machine-gp5rg": StorageError: invalid object, Code: 4, Key: /registry/management.cattle.io/nodes/c-m-vfrrp9d2/machine-gp5rg, ResourceVersion: 0, AdditionalErrorMsg: Precondition failed: UID in precondition: 26271033-e747-40a5-8842-c0335d72b412, UID in object meta: , requeuing
2024/01/29 16:08:38 [ERROR] error syncing 'c-m-vfrrp9d2/machine-zxcp8': handler node-controller-sync: Operation cannot be fulfilled on nodes.management.cattle.io "machine-zxcp8": StorageError: invalid object, Code: 4, Key: /registry/management.cattle.io/nodes/c-m-vfrrp9d2/machine-zxcp8, ResourceVersion: 0, AdditionalErrorMsg: Precondition failed: UID in precondition: 43798196-7458-45d6-aa14-fcaf3afaeaac, UID in object meta: , requeuing
2024/01/29 16:08:38 [INFO] Deleted cluster node machine-cvdrz [test-dev-2-worker-az1-0]
2024/01/29 16:08:38 [INFO] Deleted cluster node machine-gp5rg [test-dev-2-worker-az1-3]
2024/01/29 16:08:38 [INFO] [snapshotbackpopulate] rkecluster fleet-kaas/test-dev-2: processing configmap kube-system/rke2-etcd-snapshots
2024/01/29 16:08:38 [INFO] Deleted cluster node machine-zxcp8 [test-dev-2-worker-az1-1]
2024/01/29 16:08:38 [ERROR] error syncing 'c-m-vfrrp9d2/machine-7hgnb': handler node-controller-sync: Operation cannot be fulfilled on nodes.management.cattle.io "machine-7hgnb": StorageError: invalid object, Code: 4, Key: /registry/management.cattle.io/nodes/c-m-vfrrp9d2/machine-7hgnb, ResourceVersion: 0, AdditionalErrorMsg: Precondition failed: UID in precondition: e0c5e1ea-f8e2-48be-9855-4515f44f402c, UID in object meta: , requeuing
2024/01/29 16:08:38 [INFO] [planner] rkecluster fleet-kaas/test-dev-2: configuring bootstrap node(s) custom-00266ec772bf: waiting for plan to be applied
2024/01/29 16:08:38 [INFO] Deleted cluster node machine-7hgnb [test-dev-2-master-az1-1]
2024/01/29 16:08:38 [INFO] [snapshotbackpopulate] rkecluster fleet-kaas/test-dev-2: processing configmap kube-system/rke2-etcd-snapshots
2024/01/29 16:08:37 [INFO] Created machine for node [test-dev-2-worker-az1-3]
2024/01/29 16:08:37 [INFO] [snapshotbackpopulate] rkecluster fleet-kaas/test-dev-2: processing configmap kube-system/rke2-etcd-snapshots
2024/01/29 16:08:37 [INFO] Created machine for node [test-dev-2-master-az1-2]
2024/01/29 16:08:36 [INFO] Starting cluster controllers for c-m-vfrrp9d2
2024/01/29 16:08:36 [INFO] Created machine for node [test-dev-2-worker-az1-1]
2024/01/29 16:08:36 [INFO] Created machine for node [test-dev-2-worker-az1-0]
2024/01/29 16:08:36 [INFO] Created machine for node [test-dev-2-master-az1-1]
2024/01/29 16:08:36 [INFO] Starting cluster controllers for c-m-vfrrp9d2
2024/01/29 16:08:36 [INFO] Starting cluster agent for c-m-vfrrp9d2 [owner=true]
2024/01/29 16:08:36 [INFO] [planner] rkecluster fleet-kaas/test-dev-2: configuring bootstrap node(s) custom-00266ec772bf: waiting for plan to be applied
2024/01/29 16:08:36 [INFO] [planner] rkecluster fleet-kaas/test-dev-2: configuring bootstrap node(s) custom-00266ec772bf: waiting for plan to be applied
2024/01/29 16:08:35 [INFO] [planner] rkecluster fleet-kaas/test-dev-2: configuring bootstrap node(s) custom-00266ec772bf: waiting for plan to be applied
2024/01/29 16:08:35 [ERROR] error syncing 'c-m-vfrrp9d2/machine-dmbxj': handler node-controller-sync: Operation cannot be fulfilled on nodes.management.cattle.io "machine-dmbxj": StorageError: invalid object, Code: 4, Key: /registry/management.cattle.io/nodes/c-m-vfrrp9d2/machine-dmbxj, ResourceVersion: 0, AdditionalErrorMsg: Precondition failed: UID in precondition: 990d27ff-adcb-48d7-92af-213500abf0c7, UID in object meta: , requeuing
2024/01/29 16:08:35 [INFO] Creating system token for u-btd74yald3, token: agent-u-btd74yald3
2024/01/29 16:08:35 [INFO] Redeploy Rancher Agents is needed for c-m-vfrrp9d2: forceDeploy=false, agent/auth image changed=true, private repo changed=false,
2024/01/29 16:08:35 [ERROR] error syncing 'c-m-vfrrp9d2/machine-qfkxx': handler node-controller-sync: Operation cannot be fulfilled on nodes.management.cattle.io "machine-qfkxx": StorageError: invalid object, Code: 4, Key: /registry/management.cattle.io/nodes/c-m-vfrrp9d2/machine-qfkxx, ResourceVersion: 0, AdditionalErrorMsg: Precondition failed: UID in precondition: 17feafdb-a3d0-4d3e-9d84-662488eba0cb, UID in object meta: , requeuing
2024/01/29 16:08:35 [ERROR] error syncing 'c-m-vfrrp9d2/machine-7hz8g': handler node-controller-sync: Operation cannot be fulfilled on nodes.management.cattle.io "machine-7hz8g": StorageError: invalid object, Code: 4, Key: /registry/management.cattle.io/nodes/c-m-vfrrp9d2/machine-7hz8g, ResourceVersion: 0, AdditionalErrorMsg: Precondition failed: UID in precondition: 82710fcb-8d9f-416e-b366-5235a18c15d1, UID in object meta: , requeuing
2024/01/29 16:08:35 [ERROR] error syncing 'c-m-vfrrp9d2/machine-7cz45': handler node-controller-sync: Operation cannot be fulfilled on nodes.management.cattle.io "machine-7cz45": StorageError: invalid object, Code: 4, Key: /registry/management.cattle.io/nodes/c-m-vfrrp9d2/machine-7cz45, ResourceVersion: 0, AdditionalErrorMsg: Precondition failed: UID in precondition: c8e8c68d-fbd8-4ed7-8679-8fd7cfad8a9f, UID in object meta: , requeuing
2024/01/29 16:08:35 [ERROR] error syncing 'c-m-vfrrp9d2/machine-2c74b': handler node-controller-sync: Operation cannot be fulfilled on nodes.management.cattle.io "machine-2c74b": StorageError: invalid object, Code: 4, Key: /registry/management.cattle.io/nodes/c-m-vfrrp9d2/machine-2c74b, ResourceVersion: 0, AdditionalErrorMsg: Precondition failed: UID in precondition: b0de264c-7a7e-4a82-a79f-2a655d796f81, UID in object meta: , requeuing
2024/01/29 16:08:35 [ERROR] error syncing 'c-m-vfrrp9d2/machine-6w5cj': handler node-controller-sync: Operation cannot be fulfilled on nodes.management.cattle.io "machine-6w5cj": StorageError: invalid object, Code: 4, Key: /registry/management.cattle.io/nodes/c-m-vfrrp9d2/machine-6w5cj, ResourceVersion: 0, AdditionalErrorMsg: Precondition failed: UID in precondition: 644ea99a-a3bd-4911-af44-d2fef3220ab3, UID in object meta: , requeuing
2024/01/29 16:08:35 [ERROR] error syncing 'c-m-vfrrp9d2/machine-mhn9q': handler node-controller-sync: Operation cannot be fulfilled on nodes.management.cattle.io "machine-mhn9q": StorageError: invalid object, Code: 4, Key: /registry/management.cattle.io/nodes/c-m-vfrrp9d2/machine-mhn9q, ResourceVersion: 0, AdditionalErrorMsg: Precondition failed: UID in precondition: a3863293-d28d-4404-95b3-d1490a4fec90, UID in object meta: , requeuing
2024/01/29 16:08:35 [INFO] Starting cluster controllers for c-m-vfrrp9d2
2024/01/29 16:08:34 [INFO] Handling backend connection request [c-m-vfrrp9d2]
2024/01/29 16:08:34 [INFO] Handling backend connection request [c-m-vfrrp9d2]
```
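The `Precondition failed: UID in precondition: ..., UID in object meta:` errors suggest the node-controller-sync handler is trying to update nodes.management.cattle.io objects that have already been deleted from storage. One way to compare what is still present against the UIDs in those errors (the namespace is taken from the logs above):

```bash
# List Rancher's per-cluster node objects with their UIDs and deletion timestamps,
# to correlate against the UIDs appearing in the node-controller-sync errors
kubectl get nodes.management.cattle.io -n c-m-vfrrp9d2 \
  -o custom-columns=NAME:.metadata.name,UID:.metadata.uid,DELETED:.metadata.deletionTimestamp
```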