rancher / terraform-provider-rancher2

Terraform Rancher2 provider
https://www.terraform.io/docs/providers/rancher2/
Mozilla Public License 2.0
260 stars 223 forks source link

[BUG] Bootstrapping the admin resource times out after Kubernetes v1.23.15. Unable to reach the rancher server anymore. #1094

Open principekiss opened 1 year ago

principekiss commented 1 year ago

Rancher Server Setup

Information about the Cluster

User Information

Provider Information

Describe the bug

On Terraform apply, the rancher2_bootstrap.admin resource times out:

╷
│ Error: Rancher is not ready: Doing get: Get "https://rancher.test.domain.net/ping": context deadline exceeded (Client.Timeout exceeded while awaiting headers)
│ 
│   with rancher2_bootstrap.admin,
│   on provider.tf line 44, in resource "rancher2_bootstrap" "admin":
│   44: resource "rancher2_bootstrap" "admin" {
│ 
╵

Increasing the timeout doesn't change anything.

To Reproduce

provider.tf:

# Reads the AKS cluster's Terraform state from the "azurerm" backend so its
# outputs can be consumed by this configuration.
data "terraform_remote_state" "aks_cluster" {
  backend = "azurerm"

  config = {
    key                  = var.aks_cluster_tfstate_key
    container_name       = var.container_name
    storage_account_name = var.storage_account_name
    resource_group_name  = var.storage_account_resource_group
  }
}

# Shorthand for the `kubeconfig` output of the AKS cluster remote state; the
# helm and kubernetes providers read host, client_certificate, client_key and
# cluster_ca_certificate from it.
locals {
  kubeconfig = data.terraform_remote_state.aks_cluster.outputs.kubeconfig
}

# Helm provider authenticated against the AKS cluster with the client
# certificate credentials taken from local.kubeconfig.
provider "helm" {
  kubernetes {
    host                   = local.kubeconfig.host
    cluster_ca_certificate = local.kubeconfig.cluster_ca_certificate
    client_certificate     = local.kubeconfig.client_certificate
    client_key             = local.kubeconfig.client_key
  }
}

# Kubernetes provider using the same AKS credentials as the helm provider.
provider "kubernetes" {
  host                   = local.kubeconfig.host
  cluster_ca_certificate = local.kubeconfig.cluster_ca_certificate
  client_certificate     = local.kubeconfig.client_certificate
  client_key             = local.kubeconfig.client_key
}

# Rancher2 provider in bootstrap mode; referenced by rancher2_bootstrap.admin
# through the "bootstrap" alias.
provider "rancher2" {
  alias = "bootstrap"

  api_url   = "https://rancher.${var.dns_zone}"
  bootstrap = true
  insecure  = true
  timeout   = "600s"
}

# Bootstraps the Rancher admin user: logs in with the generated bootstrap
# password and replaces it with var.admin_password.
resource "rancher2_bootstrap" "admin" {
  provider = rancher2.bootstrap

  # Rancher must be installed and its DNS record must exist first.
  depends_on = [
    helm_release.rancher,
    azurerm_dns_a_record.rancher,
  ]

  initial_password = random_password.rancher.result
  password         = var.admin_password
  token_update     = false
  telemetry        = false
}

# Rancher2 provider for regular (post-bootstrap) use, configured with the URL
# and token exported by rancher2_bootstrap.admin.
provider "rancher2" {
  alias = "admin"

  api_url   = rancher2_bootstrap.admin.url
  token_key = rancher2_bootstrap.admin.token
  insecure  = true
  timeout   = "600s"
}

rancher.tf:

## Install External Ingress Nginx Controller

# Installs the external ingress-nginx controller from the upstream chart.
#
# Chart values come from config/ingress-external.yaml. In addition, the Azure
# load balancer health-probe request path is pinned to /healthz on the
# controller Service: without this annotation the Rancher endpoint was
# unreachable and rancher2_bootstrap timed out (see the resolution of this
# issue). A `set` value takes precedence over the same key in `values`.
resource "helm_release" "ingress_nginx_external" {
  repository       = "https://kubernetes.github.io/ingress-nginx/"
  name             = "ingress-nginx-external"
  chart            = "ingress-nginx"
  version          = var.ingress_nginx_external.version
  namespace        = var.ingress_nginx_external.namespace
  create_namespace = true
  wait_for_jobs    = true
  timeout          = 500

  values = [
    file("${path.module}/config/ingress-external.yaml")
  ]

  # Dots inside the annotation key are escaped so Helm does not treat them as
  # nested-key separators.
  set {
    name  = "controller.service.annotations.service\\.beta\\.kubernetes\\.io/azure-load-balancer-health-probe-request-path"
    value = "/healthz"
  }
}

## Install Cert Manager

# Installs cert-manager (release name "jetstack") together with its CRDs,
# after the ingress controller and the Rancher DNS record are in place.
resource "helm_release" "cert_manager" {
  name       = "jetstack"
  repository = "https://charts.jetstack.io"
  chart      = "cert-manager"
  version    = var.cert_manager.version

  namespace        = var.cert_manager.namespace
  create_namespace = true

  wait_for_jobs = true
  timeout       = 500

  set {
    name  = "installCRDs"
    value = "true"
  }

  depends_on = [
    helm_release.ingress_nginx_external,
    azurerm_dns_a_record.rancher,
  ]
}

## Install Rancher Server

# Generates the 14-character Rancher bootstrap password, drawing from every
# character class (lowercase, uppercase, digits and special characters).
resource "random_password" "rancher" {
  length = 14

  lower   = true
  upper   = true
  numeric = true
  special = true
}

# Installs the Rancher server chart from the configured release branch.
#
# - hostname is derived from var.dns_zone and must match the api_url used by
#   the rancher2 bootstrap provider.
# - bootstrapPassword is a secret, so it is passed through `set_sensitive`
#   rather than `set` to keep it out of the plan diff.
# - any additional chart values are supplied through var.rancher.chart_set.
resource "helm_release" "rancher" {
  repository       = "https://releases.rancher.com/server-charts/${var.rancher.branch}"
  name             = "rancher"
  chart            = "rancher"
  version          = var.rancher.version
  namespace        = var.rancher.namespace
  create_namespace = true
  wait_for_jobs    = true
  timeout          = 300

  set {
    name  = "hostname"
    value = "rancher.${var.dns_zone}"
  }

  # Secret value: keep it out of plan output.
  set_sensitive {
    name  = "bootstrapPassword"
    value = random_password.rancher.result
  }

  # One `set` block per entry of var.rancher.chart_set.
  dynamic "set" {
    for_each = var.rancher.chart_set
    content {
      name  = set.value.name
      value = set.value.value
    }
  }

  depends_on = [helm_release.cert_manager, random_password.rancher]
}

variables.tf:

## HELM CHARTS

# Namespace and chart version for the external ingress-nginx release.
variable "ingress_nginx_external" {
  description = "Nginx Ingress Helm chart properties."

  type = object({
    version   = string
    namespace = string
  })

  default = {
    version   = "4.2.3"
    namespace = "ingress-nginx-external"
  }
}

# Namespace and chart version for the cert-manager release.
variable "cert_manager" {
  description = "Cert Manager Helm chart properties."

  type = object({
    version   = string
    namespace = string
  })

  default = {
    version   = "1.11.0"
    namespace = "cert-manager"
  }
}

# Namespace, chart version, release branch and extra chart values for the
# Rancher server release.
variable "rancher" {
  description = "Rancher Helm chart properties."

  type = object({
    namespace = string
    version   = string
    branch    = string
    chart_set = list(object({
      name  = string
      value = string
    }))
  })

  default = {
    namespace = "cattle-system"
    branch    = "stable"

    # Destroying cloud credentials is broken from version 2.6.9 through 2.7.1;
    # the fix is expected in the next release, 2.7.2.
    # See https://github.com/rancher/rancher/issues/39300
    # NOTE(review): the pinned 2.7.0 appears to fall inside that range - confirm the intended pin.
    version = "2.7.0"

    chart_set = [
      { name = "replicas", value = "3" },
      { name = "ingress.ingressClassName", value = "nginx-external" },
      { name = "ingress.tls.source", value = "rancher" },

      # Uninstalling Rancher fails when rancher-webhook has no
      # priorityClassName, so it has to be set explicitly.
      # See https://github.com/rancher/rancher/issues/40935
      { name = "priorityClassName", value = "system-node-critical" },
    ]
  }
}

Actual Result

Terraform fails to create the bootstrap admin resource, and the Rancher UI is unreachable as well.

Expected Result

The Rancher UI should be accessible after the Rancher server installation, and the bootstrap admin resource should be created so that it can be used for the provider configuration.

principekiss commented 1 year ago

This issue got resolved after adding the health probe path annotation to the Ingress controller LB service:

  # Configures the ports the nginx-controller listens on
  containerPort:
    http: 80
    https: 443

  # Annotations applied to the ingress controller's LoadBalancer Service.
  service:
    annotations:
      # Request path used by the Azure load balancer health probe. Adding this
      # annotation is what resolved the bootstrap timeout reported in this issue.
      service.beta.kubernetes.io/azure-load-balancer-health-probe-request-path: /healthz   # <=== the fix