kairos-io / kairos

The immutable Linux meta-distribution for edge Kubernetes.
https://kairos.io
Apache License 2.0
1.16k stars 97 forks source link

The Entangle component fails to achieve control over clusters located in different subnets #2982

Open mmmmyue opened 2 weeks ago

mmmmyue commented 2 weeks ago

I used two single-node clusters as the control cluster and the normal cluster.

Kairos version: control cluster info

cat /etc/os-release
NAME="Ubuntu"
VERSION="20.04.6 LTS (Focal Fossa)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 20.04.6 LTS"
VERSION_ID="20.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=focal
UBUNTU_CODENAME=focal
KAIROS_FLAVOR_RELEASE="20.04"
KAIROS_FAMILY="ubuntu"
KAIROS_VARIANT="standard"
KAIROS_HOME_URL="https://github.com/kairos-io/kairos"
KAIROS_GITHUB_REPO="kairos-io/kairos"
KAIROS_SOFTWARE_VERSION_PREFIX="k3s"
KAIROS_ID_LIKE="kairos-standard-ubuntu-20.04"
KAIROS_ARTIFACT="kairos-ubuntu-20.04-standard-amd64-generic-v3.1.3-k3sv1.31.0+k3s1"
KAIROS_REGISTRY_AND_ORG="quay.io/kairos"
KAIROS_SOFTWARE_VERSION="v1.31.0+k3s1"
KAIROS_ID="kairos"
KAIROS_VERSION="v3.1.3-v1.31.0-k3s1"
KAIROS_PRETTY_NAME="kairos-standard-ubuntu-20.04 v3.1.3-v1.31.0-k3s1"
KAIROS_MODEL="generic"
KAIROS_RELEASE="v3.1.3"
KAIROS_BUG_REPORT_URL="https://github.com/kairos-io/kairos/issues"
KAIROS_NAME="kairos-standard-ubuntu-20.04"
KAIROS_VERSION_ID="v3.1.3-v1.31.0-k3s1"
KAIROS_IMAGE_REPO="quay.io/kairos/ubuntu:20.04-standard-amd64-generic-v3.1.3-k3sv1.31.0-k3s1"
KAIROS_IMAGE_LABEL="20.04-standard-amd64-generic-v3.1.3-k3sv1.31.0-k3s1"
KAIROS_FLAVOR="ubuntu"
KAIROS_TARGETARCH="amd64"

uname -r
5.15.0-121-generic
uname -a
Linux cncp-cs-01 5.15.0-121-generic #131~20.04.1-Ubuntu SMP Mon Aug 12 13:09:56 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux

normal cluster info

cat /etc/os-release
PRETTY_NAME="Ubuntu 24.04 LTS"
NAME="Ubuntu"
VERSION_ID="24.04"
VERSION="24.04 LTS (Noble Numbat)"
VERSION_CODENAME=noble
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=noble
LOGO=ubuntu-logo
KAIROS_VERSION="v3.1.2-v1.30.4-k3s1"
KAIROS_IMAGE_LABEL="24.04-standard-amd64-generic-v3.1.2-k3sv1.30.4-k3s1"
KAIROS_FLAVOR_RELEASE="24.04"
KAIROS_HOME_URL="https://github.com/kairos-io/kairos"
KAIROS_ID="kairos"
KAIROS_ARTIFACT="kairos-ubuntu-24.04-standard-amd64-generic-v3.1.2-k3sv1.30.4+k3s1"
KAIROS_REGISTRY_AND_ORG="quay.io/kairos"
KAIROS_BUG_REPORT_URL="https://github.com/kairos-io/kairos/issues"
KAIROS_SOFTWARE_VERSION_PREFIX="k3s"
KAIROS_NAME="kairos-standard-ubuntu-24.04"
KAIROS_PRETTY_NAME="kairos-standard-ubuntu-24.04 v3.1.2-v1.30.4-k3s1"
KAIROS_IMAGE_REPO="quay.io/kairos/ubuntu:24.04-standard-amd64-generic-v3.1.2-k3sv1.30.4-k3s1"
KAIROS_VARIANT="standard"
KAIROS_MODEL="generic"
KAIROS_TARGETARCH="amd64"
KAIROS_RELEASE="v3.1.2"
KAIROS_GITHUB_REPO="kairos-io/kairos"
KAIROS_ID_LIKE="kairos-standard-ubuntu-24.04"
KAIROS_VERSION_ID="v3.1.2-v1.30.4-k3s1"
KAIROS_FLAVOR="ubuntu"
KAIROS_FAMILY="ubuntu"
KAIROS_SOFTWARE_VERSION="v1.30.4+k3s1"

 uname -r
6.8.0-41-generic
uname -a
Linux cncp-ms-01 6.8.0-41-generic #41-Ubuntu SMP PREEMPT_DYNAMIC Fri Aug  2 20:41:06 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux

The manifest files applied on each of the two clusters are as follows. Normal cluster:

root@cncp-ms-01:/home/kairos# cat entangle/tgjq.yaml
apiVersion: v1
kind: Secret
metadata:
  name: 103secret
  namespace: default
type: Opaque
stringData:
  network_token:  <token_here>
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: entangle
  namespace: default
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: entangle
rules:
- apiGroups:
  - ""
  - "upgrade.cattle.io"
  resources:
  - pods
  - plans
  verbs:
  - create
  - delete
  - get
  - list
  - update
  - watch

- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
---
apiVersion: v1
kind: List
items:
  - apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRoleBinding
    metadata:
      name: entangle
    subjects:
    - kind: ServiceAccount
      name: entangle
      namespace: default
    roleRef:
      kind: ClusterRole
      name: entangle
      apiGroup: rbac.authorization.k8s.io
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: agent-proxy
  name: agent-proxy
  namespace: default
spec:
  selector:
    matchLabels:
      app: agent-proxy
  replicas: 1
  template:
    metadata:
      labels:
        app: agent-proxy
        entanglement.kairos.io/name: "103secret"
        entanglement.kairos.io/service: "foo103"
        entanglement.kairos.io/target_port: "8001"
        entanglement.kairos.io/direction: "entangle"
    spec:
      serviceAccountName: entangle
      containers:
        - name: proxy
          image: "quay.io/kairos/kubectl"
          imagePullPolicy: Always
          command: ["/usr/bin/kubectl"]
          args:
            - "proxy"

root@cncp-ms-01:/home/kairos#

control cluster manifest info

root@cncp-cs-01:/home/kairos/entangle-proxy# cat 103kzjq.yaml
apiVersion: v1
kind: Secret
metadata:
  name: 103secret
  namespace: default
type: Opaque
stringData:
  network_token: <token_here>
---
apiVersion: entangle-proxy.kairos.io/v1alpha1
kind: Manifests
metadata:
  name: 103apply
  namespace: default
  labels:
   entanglement.kairos.io/name: "103secret"
   entanglement.kairos.io/service: "foo103"
   entanglement.kairos.io/target_port: "9092"
spec:
   serviceUUID: "foo103"
   secretRef: "103secret"
   manifests:
   - |
      apiVersion: v1
      kind: Pod
      metadata:
        name: test
        namespace: default
      spec:
            containers:
            - name: hello
              image: busybox:1.28
              command: ['sh', '-c', 'echo "Hello, ssaa!" && sleep 3600']
            restartPolicy: OnFailure
root@cncp-cs-01:/home/kairos/entangle-proxy#

Describe the bug and Expected behavior: After the Manifests resource is applied in the control cluster, a pod named test is expected to be created in the normal cluster, but no such pod appears, and the pod created by the control cluster's Manifests stays stuck in the Running state indefinitely. Following the same steps with both clusters on the same network segment, the pod created by the control cluster reaches the Completed state as expected.

control cluster created pod 103apply-apply-cjwhf

kubectl get pods
NAME                                     READY   STATUS    RESTARTS   AGE
103apply-apply-cjwhf                     2/2     Running   0          14m
kairos-entangle-f9fdd48f7-5kb5c          2/2     Running   0          44m
kairos-entangle-proxy-6f5f545d4b-vjs7j   2/2     Running   0          44m
root@cncp-cs-01:/home/kairos/entangle-proxy#

kubectl logs 103apply-apply-cjwhf
Defaulted container "proxy" out of: proxy, entanglement
Waiting for kubectl get pods
E1104 08:40:09.598216      13 memcache.go:265] couldn't get current server API group list: Get "http://localhost:8080/api?timeout=32s": dial tcp 127.0.0.1:8080: connect: connection refused
E1104 08:40:09.598594      13 memcache.go:265] couldn't get current server API group list: Get "http://localhost:8080/api?timeout=32s": dial tcp 127.0.0.1:8080: connect: connection refused
E1104 08:40:09.600000      13 memcache.go:265] couldn't get current server API group list: Get "http://localhost:8080/api?timeout=32s": dial tcp 127.0.0.1:8080: connect: connection refused
E1104 08:40:09.601473      13 memcache.go:265] couldn't get current server API group list: Get "http://localhost:8080/api?timeout=32s": dial tcp 127.0.0.1:8080: connect: connection refused
E1104 08:40:09.602787      13 memcache.go:265] couldn't get current server API group list: Get "http://localhost:8080/api?timeout=32s": dial tcp 127.0.0.1:8080: connect: connection refused
The connection to the server localhost:8080 was refused - did you specify the right host or port?
E1104 08:40:20.690277      32 memcache.go:265] couldn't get current server API group list: Get "http://localhost:8080/api?timeout=32s": read tcp 127.0.0.1:46266->127.0.0.1:8080: read: connection reset by peer - error from a previous attempt: read tcp 127.0.0.1:46240->127.0.0.1:8080: read: connection reset by peer
E1104 08:40:30.703144      32 memcache.go:265] couldn't get current server API group list: Get "http://localhost:8080/api?timeout=32s": read tcp 127.0.0.1:39506->127.0.0.1:8080: read: connection reset by peer - error from a previous attempt: read tcp 127.0.0.1:39482->127.0.0.1:8080: read: connection reset by peer
E1104 08:40:40.717122      32 memcache.go:265] couldn't get current server API group list: Get "http://localhost:8080/api?timeout=32s": read tcp 127.0.0.1:36798->127.0.0.1:8080: read: connection reset by peer - error from a previous attempt: read tcp 127.0.0.1:36772->127.0.0.1:8080: read: connection reset by peer
E1104 08:40:50.730084      32 memcache.go:265] couldn't get current server API group list: Get "http://localhost:8080/api?timeout=32s": http: server closed idle connection - error from a previous attempt: read tcp 127.0.0.1:56646->127.0.0.1:8080: read: connection reset by peer
E1104 08:41:00.749669      32 memcache.go:265] couldn't get current server API group list: Get "http://localhost:8080/api?timeout=32s": read tcp 127.0.0.1:35144->127.0.0.1:8080: read: connection reset by peer - error from a previous attempt: read tcp 127.0.0.1:35128->127.0.0.1:8080: read: connection reset by peer
error: Get "http://localhost:8080/api?timeout=32s": read tcp 127.0.0.1:35144->127.0.0.1:8080: read: connection reset by peer - error from a previous attempt: read tcp 127.0.0.1:35128->127.0.0.1:8080: read: connection reset by peer

kubectl logs 103apply-apply-cjwhf -c manager
error: container manager is not valid for pod 103apply-apply-cjwhf
root@cncp-cs-01:/home/kairos/entangle-proxy# kubectl logs 103apply-apply-cjwhf -c entanglement
{"level":"INFO","time":"2024-11-04T08:40:09.616Z","caller":"config/config.go:290","message":" go-libp2p resource manager protection disabled"}
{"level":"INFO","time":"2024-11-04T08:40:09.619Z","caller":"cmd/util.go:368","message":" \tedgevpn  Copyright (C) 2021-2022 Ettore Di Giacinto\nThis program comes with ABSOLUTELY NO WARRANTY.\nThis is free software, and you are welcome to redistribute it\nunder certain conditions."}
{"level":"INFO","time":"2024-11-04T08:40:09.619Z","caller":"cmd/util.go:370","message":"Version:  commit: \n"}
{"level":"INFO","time":"2024-11-04T08:40:09.619Z","caller":"node/node.go:118","message":" Starting EdgeVPN network"}
{"level":"DEBUG","time":"2024-11-04T08:40:09.620Z","caller":"node/node.go:154","message":" Generating host data"}
2024/11/04 08:40:09 failed to sufficiently increase send buffer size (was: 208 kiB, wanted: 2048 kiB, got: 416 kiB). See https://github.com/quic-go/quic-go/wiki/UDP-Buffer-Sizes for details.
{"level":"INFO","time":"2024-11-04T08:40:09.647Z","caller":"node/node.go:172","message":" Node ID: 12D3KooWFRHxx7tTasuvgNywwGNb1hkFE6bEGg1cHXWDdEdobTP6"}
{"level":"INFO","time":"2024-11-04T08:40:09.647Z","caller":"node/node.go:173","message":" Node Addresses: [/ip4/10.89.1.137/tcp/33193 /ip4/10.89.1.137/udp/50441/quic-v1 /ip4/10.89.1.137/udp/51024/quic-v1/webtransport/certhash/uEiC2zLNtfRbPBXYrb79fq9z0KUx7cwgYIm03dQxSxc6ojA/certhash/uEiAbAr3gPopikA9cfO0p5k-JrelTvifXR4-Fy44zrJajCg /ip4/127.0.0.1/tcp/33193 /ip4/127.0.0.1/udp/50441/quic-v1 /ip4/127.0.0.1/udp/51024/quic-v1/webtransport/certhash/uEiC2zLNtfRbPBXYrb79fq9z0KUx7cwgYIm03dQxSxc6ojA/certhash/uEiAbAr3gPopikA9cfO0p5k-JrelTvifXR4-Fy44zrJajCg /ip6/::1/tcp/45487 /ip6/::1/udp/56619/quic-v1/webtransport/certhash/uEiC2zLNtfRbPBXYrb79fq9z0KUx7cwgYIm03dQxSxc6ojA/certhash/uEiAbAr3gPopikA9cfO0p5k-JrelTvifXR4-Fy44zrJajCg /ip6/::1/udp/60421/quic-v1]"}
{"level":"INFO","time":"2024-11-04T08:40:09.658Z","caller":"discovery/dht.go:104","message":" Bootstrapping DHT"}
{"level":"DEBUG","time":"2024-11-04T08:40:09.658Z","caller":"node/node.go:195","message":" Network started"}
{"level":"WARN","time":"2024-11-04T08:40:09.665Z","caller":"node/connection.go:221","message":"publish error: no message room available\n"}
{"level":"DEBUG","time":"2024-11-04T08:40:14.665Z","caller":"discovery/dht.go:147","message":" failed to dial: failed to dial QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ: all dials failed\n  * [/ip4/104.131.131.82/tcp/4001] dial tcp4 0.0.0.0:33193->104.131.131.82:4001: i/o timeout"}
{"level":"DEBUG","time":"2024-11-04T08:40:19.671Z","caller":"discovery/dht.go:147","message":" failed to dial: failed to dial QmcZf59bWwK5XFi76CZX8cbJ4BhTzzA3gU1ZjYZcYW3dwt: no good addresses"}
{"level":"DEBUG","time":"2024-11-04T08:40:19.671Z","caller":"discovery/dht.go:147","message":" failed to dial: failed to dial QmNnooDu7bfjPFoTZYxMNLWUQJyrVwtbZg5gBMjTezGAJN: no good addresses"}
{"level":"DEBUG","time":"2024-11-04T08:40:19.671Z","caller":"discovery/dht.go:147","message":" failed to dial: failed to dial QmbLHAnMoJPWSCR5Zhtx6BHJX9KiKNN6tpvbUcqanj75Nb: no good addresses"}
{"level":"DEBUG","time":"2024-11-04T08:40:19.671Z","caller":"discovery/dht.go:147","message":" failed to dial: failed to dial QmQCU2EcMqAqQPR2i9bChDtGNJchTbq5TbXJJ16u19uLTa: no good addresses"}
{"level":"DEBUG","time":"2024-11-04T08:40:19.671Z","caller":"discovery/dht.go:204","message":" Announcing ourselves..."}
{"level":"DEBUG","time":"2024-11-04T08:40:19.671Z","caller":"discovery/dht.go:207","message":" Successfully announced!"}
{"level":"DEBUG","time":"2024-11-04T08:40:19.671Z","caller":"discovery/dht.go:210","message":" Searching for other peers..."}
{"level":"DEBUG","time":"2024-11-04T08:40:24.673Z","caller":"discovery/dht.go:147","message":" failed to dial: failed to dial QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ: all dials failed\n  * [/ip4/104.131.131.82/tcp/4001] dial tcp4 0.0.0.0:33193->104.131.131.82:4001: i/o timeout"}
{"level":"DEBUG","time":"2024-11-04T08:40:29.674Z","caller":"discovery/dht.go:147","message":" failed to dial: failed to dial QmQCU2EcMqAqQPR2i9bChDtGNJchTbq5TbXJJ16u19uLTa: no good addresses"}
{"level":"DEBUG","time":"2024-11-04T08:40:29.674Z","caller":"discovery/dht.go:147","message":" failed to dial: failed to dial QmNnooDu7bfjPFoTZYxMNLWUQJyrVwtbZg5gBMjTezGAJN: no good addresses"}
{"level":"DEBUG","time":"2024-11-04T08:40:29.674Z","caller":"discovery/dht.go:147","message":" failed to dial: failed to dial QmbLHAnMoJPWSCR5Zhtx6BHJX9KiKNN6tpvbUcqanj75Nb: no good addresses"}
{"level":"DEBUG","time":"2024-11-04T08:40:29.675Z","caller":"discovery/dht.go:147","message":" failed to dial: failed to dial QmcZf59bWwK5XFi76CZX8cbJ4BhTzzA3gU1ZjYZcYW3dwt: no good addresses"}
mudler commented 3 days ago

@mmmmyue looks like the controller can't reach libp2p bootstrap servers.

Can you try bumping the edgevpn version to the latest (v0.28.4) when installing the helm charts?

Here is an example of the values configuration: https://github.com/kairos-io/helm-charts/blob/main/charts/entangle/values.yaml#L19