siderolabs / terraform-provider-talos

Mozilla Public License 2.0
141 stars 18 forks source link

Terraform is stuck during creation of `talos_machine_configuration_apply` #52

Closed artuross closed 1 year ago

artuross commented 1 year ago

I'm not sure if I'm doing something incorrectly. Basically, I've copied the full example from the docs. The output that I consistently get is below.

Variables

cluster_name = "local"
cluster_endpoint = "https://cluster.local:6443"

Output

❯ terraform apply

Terraform used the selected providers to generate the following execution plan. Resource actions are indicated with the following symbols:
  + create

Terraform will perform the following actions:

  # talos_client_configuration.talosconfig will be created
  + resource "talos_client_configuration" "talosconfig" {
      + cluster_name    = "local"
      + endpoints       = [
          + "10.5.0.2",
          + "10.5.0.3",
          + "10.5.0.4",
        ]
      + id              = (known after apply)
      + machine_secrets = (sensitive value)
      + talos_config    = (sensitive value)
    }

  # talos_cluster_kubeconfig.kubeconfig will be created
  + resource "talos_cluster_kubeconfig" "kubeconfig" {
      + endpoint     = "10.5.0.2"
      + id           = (known after apply)
      + kube_config  = (sensitive value)
      + node         = "10.5.0.2"
      + talos_config = (sensitive value)
    }

  # talos_machine_bootstrap.bootstrap will be created
  + resource "talos_machine_bootstrap" "bootstrap" {
      + endpoint     = "10.5.0.2"
      + id           = (known after apply)
      + node         = "10.5.0.2"
      + talos_config = (sensitive value)
    }

  # talos_machine_configuration_apply.cp_config_apply["10.5.0.2"] will be created
  + resource "talos_machine_configuration_apply" "cp_config_apply" {
      + config_patches        = [
          + <<-EOT
                machine:
                  install:
                    disk: /dev/sda
                  network:
                    hostname: local-cp-10.5.0.2
            EOT,
          + jsonencode(
                [
                  + {
                      + op    = "add"
                      + path  = "/cluster/allowSchedulingOnControlPlanes"
                      + value = true
                    },
                ]
            ),
        ]
      + endpoint              = "10.5.0.2"
      + id                    = (known after apply)
      + machine_configuration = (sensitive value)
      + mode                  = "auto"
      + node                  = "10.5.0.2"
      + talos_config          = (sensitive value)
    }

  # talos_machine_configuration_apply.cp_config_apply["10.5.0.3"] will be created
  + resource "talos_machine_configuration_apply" "cp_config_apply" {
      + config_patches        = [
          + <<-EOT
                machine:
                  install:
                    disk: /dev/sda
                  network:
                    hostname: local-cp-10.5.0.3
            EOT,
          + jsonencode(
                [
                  + {
                      + op    = "add"
                      + path  = "/cluster/allowSchedulingOnControlPlanes"
                      + value = true
                    },
                ]
            ),
        ]
      + endpoint              = "10.5.0.3"
      + id                    = (known after apply)
      + machine_configuration = (sensitive value)
      + mode                  = "auto"
      + node                  = "10.5.0.3"
      + talos_config          = (sensitive value)
    }

  # talos_machine_configuration_apply.cp_config_apply["10.5.0.4"] will be created
  + resource "talos_machine_configuration_apply" "cp_config_apply" {
      + config_patches        = [
          + <<-EOT
                machine:
                  install:
                    disk: /dev/sda
                  network:
                    hostname: local-cp-10.5.0.4
            EOT,
          + jsonencode(
                [
                  + {
                      + op    = "add"
                      + path  = "/cluster/allowSchedulingOnControlPlanes"
                      + value = true
                    },
                ]
            ),
        ]
      + endpoint              = "10.5.0.4"
      + id                    = (known after apply)
      + machine_configuration = (sensitive value)
      + mode                  = "auto"
      + node                  = "10.5.0.4"
      + talos_config          = (sensitive value)
    }

  # talos_machine_configuration_apply.worker_config_apply["10.5.0.5"] will be created
  + resource "talos_machine_configuration_apply" "worker_config_apply" {
      + config_patches        = [
          + <<-EOT
                machine:
                  install:
                    disk: /dev/nvme0n1
                  network:
                    hostname: worker-1
            EOT,
        ]
      + endpoint              = "10.5.0.5"
      + id                    = (known after apply)
      + machine_configuration = (sensitive value)
      + mode                  = "auto"
      + node                  = "10.5.0.5"
      + talos_config          = (sensitive value)
    }

  # talos_machine_configuration_apply.worker_config_apply["10.5.0.6"] will be created
  + resource "talos_machine_configuration_apply" "worker_config_apply" {
      + config_patches        = [
          + <<-EOT
                machine:
                  install:
                    disk: /dev/nvme0n1
                  network:
                    hostname: worker-2
            EOT,
        ]
      + endpoint              = "10.5.0.6"
      + id                    = (known after apply)
      + machine_configuration = (sensitive value)
      + mode                  = "auto"
      + node                  = "10.5.0.6"
      + talos_config          = (sensitive value)
    }

  # talos_machine_configuration_controlplane.machineconfig_cp will be created
  + resource "talos_machine_configuration_controlplane" "machineconfig_cp" {
      + cluster_endpoint   = "https://cluster.local:6443"
      + cluster_name       = "local"
      + config_version     = "v1alpha1"
      + docs_enabled       = true
      + examples_enabled   = true
      + id                 = (known after apply)
      + kubernetes_version = "1.26.0"
      + machine_config     = (sensitive value)
      + machine_secrets    = (sensitive value)
    }

  # talos_machine_configuration_worker.machineconfig_worker will be created
  + resource "talos_machine_configuration_worker" "machineconfig_worker" {
      + cluster_endpoint   = "https://cluster.local:6443"
      + cluster_name       = "local"
      + config_version     = "v1alpha1"
      + docs_enabled       = true
      + examples_enabled   = true
      + id                 = (known after apply)
      + kubernetes_version = "1.26.0"
      + machine_config     = (sensitive value)
      + machine_secrets    = (sensitive value)
    }

  # talos_machine_secrets.machine_secrets will be created
  + resource "talos_machine_secrets" "machine_secrets" {
      + id              = (known after apply)
      + machine_secrets = (sensitive value)
    }

Plan: 11 to add, 0 to change, 0 to destroy.

Changes to Outputs:
  + kubeconfig                 = (sensitive value)
  + machineconfig_controlplane = (sensitive value)
  + machineconfig_worker       = (sensitive value)
  + talosconfig                = (sensitive value)

Do you want to perform these actions?
  Terraform will perform the actions described above.
  Only 'yes' will be accepted to approve.

  Enter a value: yes

talos_machine_secrets.machine_secrets: Creating...
talos_machine_secrets.machine_secrets: Creation complete after 0s [id=39LEmf4LI-1oBE7x9Oq-oMlUA_MSiFqFNNVr8GxACBg=]
talos_client_configuration.talosconfig: Creating...
talos_machine_configuration_worker.machineconfig_worker: Creating...
talos_machine_configuration_controlplane.machineconfig_cp: Creating...
talos_client_configuration.talosconfig: Creation complete after 0s [id=local]
talos_cluster_kubeconfig.kubeconfig: Creating...
talos_machine_bootstrap.bootstrap: Creating...
talos_machine_configuration_worker.machineconfig_worker: Creation complete after 0s [id=local]
talos_machine_configuration_controlplane.machineconfig_cp: Creation complete after 0s [id=local]
talos_machine_configuration_apply.worker_config_apply["10.5.0.6"]: Creating...
talos_machine_configuration_apply.worker_config_apply["10.5.0.5"]: Creating...
talos_machine_configuration_apply.cp_config_apply["10.5.0.3"]: Creating...
talos_machine_configuration_apply.cp_config_apply["10.5.0.2"]: Creating...
talos_machine_configuration_apply.cp_config_apply["10.5.0.4"]: Creating...
talos_machine_bootstrap.bootstrap: Still creating... [10s elapsed]
talos_cluster_kubeconfig.kubeconfig: Still creating... [10s elapsed]
talos_machine_configuration_apply.worker_config_apply["10.5.0.6"]: Still creating... [10s elapsed]
talos_machine_configuration_apply.worker_config_apply["10.5.0.5"]: Still creating... [10s elapsed]
talos_machine_configuration_apply.cp_config_apply["10.5.0.3"]: Still creating... [10s elapsed]
talos_machine_configuration_apply.cp_config_apply["10.5.0.2"]: Still creating... [10s elapsed]
talos_machine_configuration_apply.cp_config_apply["10.5.0.4"]: Still creating... [10s elapsed]
talos_cluster_kubeconfig.kubeconfig: Still creating... [20s elapsed]
talos_machine_bootstrap.bootstrap: Still creating... [20s elapsed]
talos_machine_configuration_apply.worker_config_apply["10.5.0.6"]: Still creating... [20s elapsed]
talos_machine_configuration_apply.worker_config_apply["10.5.0.5"]: Still creating... [20s elapsed]
talos_machine_configuration_apply.cp_config_apply["10.5.0.3"]: Still creating... [20s elapsed]
talos_machine_configuration_apply.cp_config_apply["10.5.0.2"]: Still creating... [20s elapsed]
talos_machine_configuration_apply.cp_config_apply["10.5.0.4"]: Still creating... [20s elapsed]

cp_config_apply and worker_config_apply keep running forever. I've tested this with various configurations (I initially assumed the cause was the use of an FQDN for the endpoint).

When cancelled, it dumps

│ Error: rpc error: code = Canceled desc = context canceled
│ 
│   with talos_cluster_kubeconfig.kubeconfig,
│   on main.tf line 67, in resource "talos_cluster_kubeconfig" "kubeconfig":
│   67: resource "talos_cluster_kubeconfig" "kubeconfig" {
│ 

However, with the config I am trying to prepare for myself, I get a slightly better error message:

╷
│ Error: rpc error: code = Unavailable desc = connection error: desc = "transport: authentication handshake failed: x509: certificate signed by unknown authority (possibly because of \"x509: Ed25519 verification failure\" while trying to verify candidate authority certificate \"talos\")"
│ 
│   with talos_machine_configuration_apply.cp_config_apply,
│   on main.tf line 131, in resource "talos_machine_configuration_apply" "cp_config_apply":
│  131: resource "talos_machine_configuration_apply" "cp_config_apply" {
│ 
╵

I'm using Terraform v1.3.9 on an M1 iMac and version v0.1.1 of the Talos provider.

artuross commented 1 year ago

OK, it seems like I was doing it incorrectly - I thought that talos_machine_configuration_controlplane generates only the machine part of the config and that talos_machine_configuration_apply prepares the final user data file.

I will prepare a PR to clarify this in the documentation.

rmvangun commented 1 year ago

I'm still encountering this issue on version 0.4.0-alpha.0 of the provider, so I'm using the general talos_machine_configuration, but other than that my configuration is almost identical. Every other resource works as expected, I'm connecting to the cluster, etc. I'm running this in a dev environment on Docker for Mac (Intel) with Talos 1.5.3.

Terraform used the selected providers to generate the following execution plan. Resource actions are indicated with the following symbols:
  + create

Terraform will perform the following actions:

  # talos_machine_configuration_apply.this[0] will be created
  + resource "talos_machine_configuration_apply" "this" {
      + apply_mode                  = "auto"
      + client_configuration        = {
          + ca_certificate     = "some-long-cert"
          + client_certificate = "some-long-cert"
          + client_key         = (sensitive value)
        }
      + config_patches              = [
          + <<-EOT
                "machine":
                  "kubelet":
                    "extraArgs":
                      "rotate-server-certificates": true
            EOT,
        ]
      + endpoint                    = "localhost"
      + id                          = (known after apply)
      + machine_configuration       = (sensitive value)
      + machine_configuration_input = (sensitive value)
      + node                        = "10.5.0.2"
    }

  # talos_machine_configuration_apply.this[1] will be created
  + resource "talos_machine_configuration_apply" "this" {
      + apply_mode                  = "auto"
      + client_configuration        = {
          + ca_certificate     = "some-long-cert"
          + client_certificate = "some-long-cert"
          + client_key         = (sensitive value)
        }
      + config_patches              = [
          + <<-EOT
                "machine":
                  "kubelet":
                    "extraArgs":
                      "rotate-server-certificates": true
            EOT,
        ]
      + endpoint                    = "localhost"
      + id                          = (known after apply)
      + machine_configuration       = (sensitive value)
      + machine_configuration_input = (sensitive value)
      + node                        = "10.5.0.3"
    }

Not a lot to go on there, but this is the piece I'm configuring:

  + config_patches              = [
      + <<-EOT
            "machine":
              "kubelet":
                "extraArgs":
                  "rotate-server-certificates": true
        EOT,
    ]
frezbo commented 1 year ago

It seems you're trying to apply to a cluster created outside Terraform. If you ran `talosctl cluster create`, that machine config is applied differently and the TF provider needs to import the cluster secrets manually.

rmvangun commented 1 year ago

Oh, interesting. Is there a way to create a local Docker-based Talos cluster with Terraform or is that on the roadmap? For example, I know you can create a local kind cluster with Terraform.

frezbo commented 1 year ago

No plans as of now, though you can create a cluster with TF and QEMU: https://github.com/siderolabs/contrib/tree/main/examples/terraform/advanced