kube-hetzner / terraform-hcloud-kube-hetzner

Optimized and Maintenance-free Kubernetes on Hetzner Cloud in one command!
MIT License

[Bug]: Can't restore a copy #1309

Closed: homergleason closed this issue 6 months ago

homergleason commented 7 months ago

Description

I urgently need to restore a cluster from a backup. I am doing exactly what the instructions say, but the cluster is not restored. What am I doing wrong?
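
For reference, a quick way to sanity-check that the snapshot name actually exists in the bucket before attempting the restore is to list the snapshots k3s sees in S3 from a control plane node. A minimal sketch using the k3s etcd-snapshot subcommand, assuming the same endpoint, bucket, and credentials as in the kube.tf below (the angle-bracket values are placeholders, not real values):

# run as root on a control plane node
k3s etcd-snapshot ls \
  --etcd-s3 \
  --etcd-s3-endpoint=<s3-endpoint> \
  --etcd-s3-bucket=k3s \
  --etcd-s3-access-key=<access-key> \
  --etcd-s3-secret-key=<secret-key>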

Kube.tf file

locals {
  hcloud_token = ""

  k3s_token = var.k3s_token  
  etcd_version = "v3.5.9"
  etcd_snapshot_name = "etcd-snapshot-prod-control-plane-hel1-byb-1712404802"
  etcd_s3_endpoint = ""
  etcd_s3_bucket = "k3s"
  etcd_s3_access_key = ""
  etcd_s3_secret_key = var.etcd_s3_secret_key

}

variable "k3s_token" {
  sensitive = true
  type      = string
}

variable "etcd_s3_secret_key" {
  sensitive = true
  type      = string
}

module "kube-hetzner" {

  k3s_token = local.k3s_token

  providers = {
    hcloud = hcloud
  }
  hcloud_token = var.hcloud_token != "" ? var.hcloud_token : local.hcloud_token

  source = "kube-hetzner/kube-hetzner/hcloud"

  cluster_name = "prod"

  automatically_upgrade_os = false
  automatically_upgrade_k3s = false

  ssh_public_key  = file("")
  ssh_private_key = file("")

  network_region = "eu-central"

  enable_rancher = true
  rancher_hostname = "r.domainame.com"
  rancher_install_channel = "stable"
  rancher_bootstrap_password = "password"

  initial_k3s_channel = "v1.27"

  control_plane_nodepools = [
    {
      name        = "control-plane-hel1",
      server_type = "cax31",
      location    = "hel1",
      labels      = [],
      taints      = [],
      count       = 1
      backups     = true
    },
    #{
    #  name        = "storage",
    #  server_type = "cax21",
    #  location    = "hel1",
    #  # Fully optional, just a demo.
    #  labels      = [
    #    "node.kubernetes.io/server-usage=storage"
    # ],
    #  taints      = [],
    #  count       = 1

      # In the case of using Longhorn, you can use Hetzner volumes instead of using the node's own storage by specifying a value from 10 to 10000 (in GB)
      # It will create one volume per node in the nodepool, and configure Longhorn to use them.
      # Worth noting: volume storage is slower than node storage (node storage is what you get by leaving longhorn_volume_size unset or setting it to 0).
      # So for something like DBs you definitely want node storage; for other things like backups, volume storage is fine, and cheaper.
      # longhorn_volume_size = 20

      # Enable automatic backups via Hetzner (default: false)
       # backups = true
    #}
  ]

  agent_nodepools = [
    {
      name        = "agent-heavy",
      server_type = "cax41",
      location    = "hel1",
      labels      = [],
      taints      = [],
      count       = 1
      backups     = false
    }
  ]

  autoscaler_nodepools = [
    {
      name        = "autoscaled-peak-load"
      server_type = "cax41" // Choose a server type with sufficient resources
      location    = "hel1"  // Specify the desired location
      min_nodes   = 0       // Set a minimum number of nodes
      max_nodes   = 100     // Set a maximum number to scale up to during peak load
      #labels      = {
      #  "node.kubernetes.io/role": "peak-workloads"
      #}
      #taints      = [{
      #  key: "node.kubernetes.io/role"
      #  value: "peak-workloads"
      #  effect: "NoExecute"
      #}]
    }
  ]

  load_balancer_type     = "lb21"
  load_balancer_location = "hel1"
  load_balancer_disable_ipv6 = true
  load_balancer_health_check_interval = "5s"
  load_balancer_health_check_timeout = "3s"
  load_balancer_health_check_retries = 3

  etcd_s3_backup = {
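     # passed through to k3s' etcd-s3 options so that etcd snapshots are stored in the S3 bucket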
     etcd-s3-endpoint        = "<redacted>"
     etcd-s3-access-key      = "<redacted>"
     etcd-s3-secret-key      = "<redacted>"
     etcd-s3-bucket          = "k3s"
  }

  cluster_autoscaler_log_level = 4
  cluster_autoscaler_log_to_stderr = true
  cluster_autoscaler_stderr_threshold = "INFO"
  cluster_autoscaler_extra_args = [
    "--skip-nodes-with-system-pods=false",
    "--scale-down-utilization-threshold=0.4"
  ]

  postinstall_exec = [
    (
      local.etcd_snapshot_name == "" ? "" :
      <<-EOF
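      # the restore below must only run on the node that bootstrapped the cluster,
      # i.e. the one whose k3s config contains "cluster-init": true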
      export CLUSTERINIT=$(cat /etc/rancher/k3s/config.yaml | grep -i '"cluster-init": true')
      if [ -n "$CLUSTERINIT" ]; then
        echo indeed this is the first control plane node > /tmp/restorenotes
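        # reset the embedded etcd datastore and restore it from the S3 snapshot named above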
        k3s server \
          --cluster-reset \
          --etcd-s3 \
          --cluster-reset-restore-path=${local.etcd_snapshot_name} \
          --etcd-s3-endpoint=${local.etcd_s3_endpoint} \
          --etcd-s3-bucket=${local.etcd_s3_bucket} \
          --etcd-s3-access-key=${local.etcd_s3_access_key} \
          --etcd-s3-secret-key=${local.etcd_s3_secret_key}
        # renaming the k3s.yaml because it is used as a trigger for further downstream
        # changes. Better to let `k3s server` create it as expected.
        mv /etc/rancher/k3s/k3s.yaml /etc/rancher/k3s/k3s.backup.yaml

        # download etcd/etcdctl for adapting the kubernetes config before starting k3s
        ETCD_VER=${local.etcd_version}
        case "$(uname -m)" in
            aarch64) ETCD_ARCH="arm64" ;;
            x86_64) ETCD_ARCH="amd64" ;;
        esac;
        DOWNLOAD_URL=https://github.com/etcd-io/etcd/releases/download
        rm -f /tmp/etcd-$ETCD_VER-linux-$ETCD_ARCH.tar.gz
        curl -L $DOWNLOAD_URL/$ETCD_VER/etcd-$ETCD_VER-linux-$ETCD_ARCH.tar.gz -o /tmp/etcd-$ETCD_VER-linux-$ETCD_ARCH.tar.gz
        tar xzvf /tmp/etcd-$ETCD_VER-linux-$ETCD_ARCH.tar.gz -C /usr/local/bin --strip-components=1
        rm -f /tmp/etcd-$ETCD_VER-linux-$ETCD_ARCH.tar.gz

        etcd --version
        etcdctl version

        # start etcd server in the background
        nohup etcd --data-dir /var/lib/rancher/k3s/server/db/etcd &
        echo $! > save_pid.txt
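        # the etcdctl commands below talk to this temporary etcd on its default client endpoint (http://127.0.0.1:2379)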

        # delete the traefik service so that no load balancer is accidentally changed
        etcdctl del /registry/services/specs/traefik/traefik
        etcdctl del /registry/services/endpoints/traefik/traefik

        # delete old nodes (they interfere with load balancer)
        # minions is the old name for "nodes"
        OLD_NODES=$(etcdctl get "" --prefix --keys-only | grep /registry/minions/ | cut -c 19-)
        for NODE in $OLD_NODES; do
          for KEY in $(etcdctl get "" --prefix --keys-only | grep $NODE); do
            etcdctl del $KEY
          done
        done
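        # stop the temporary etcd again so that k3s can take over the restored data dir when it starts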

        kill -9 `cat save_pid.txt`
        rm save_pid.txt
      else
        echo this is not the first control plane node > /tmp/restorenotes
      fi
      EOF
    )
  ]

}

provider "hcloud" {
  token = var.hcloud_token != "" ? var.hcloud_token : local.hcloud_token
}

terraform {
  required_version = ">= 1.5.0"
  required_providers {
    hcloud = {
      source  = "hetznercloud/hcloud"
      version = ">= 1.43.0"
    }
  }
}

output "kubeconfig" {
  value     = module.kube-hetzner.kubeconfig
  sensitive = true
}

output "k3s_token" {
  value     = module.kube-hetzner.k3s_token
  sensitive = true
}

variable "hcloud_token" {
  sensitive = true
  default   = ""
}

Screenshots

[screenshots "control" and "terra" attached in the original issue]

Platform

Linux

mysticaltech commented 7 months ago

@homergleason Please see in our readme how to ssh into your node and look at the last error you posted above. This would help a lot.
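
For reference, a minimal sketch of what that looks like, assuming the SSH private key configured in kube.tf, the node's public IP, the default root user on the MicroOS nodes, and the k3s systemd service name on a control plane node:

ssh -i /path/to/ssh_private_key root@<control-plane-public-ip>

# once on the node, check the k3s service logs for the restore error
journalctl -u k3s -e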

@kube-hetzner/core Any other ideas here?

mysticaltech commented 7 months ago

@homergleason Did you manage to debug?