ibm_container_worker_pool_zone_attachment receives 404 "The specified cluster could not be found" on retrieving workers for newly created cluster

pregnor commented 5 years ago

Terraform Version

Terraform v0.11.13 IBM Cloud Terraform provider version: v0.17.3

Affected Resource(s)

Please list the resources as a list, for example:

ibm_container_worker_pool_zone_attachment

Terraform Configuration Files

variable "name" {
  type = "string"
}

variable "public_vlan_id" {}
variable "private_vlan_id" {}
variable "resource_group" {}
variable "default_pool_size" {}
variable "datacenter" {
  type = "string"
}

variable "machine_type" {
  type    = "string"
  default = "b2c.16x64.encrypted"
}

variable "public_name" {
  type = "string"
}

variable "workers" {
  default     = 3
  description = "The number of workers in the Kubernetes cluster"
}

variable "slack_webhook" {
  default = "[redacted]"
}

data "ibm_resource_group" "group" {
  name = "${var.resource_group}"
}

resource "ibm_container_cluster" "k8s" {
  name            = "${var.name}"
  datacenter      = "${var.datacenter}"
  machine_type    = "${var.machine_type}"
  hardware        = "shared"
  public_vlan_id  = "${var.public_vlan_id}"
  private_vlan_id = "${var.private_vlan_id}"
  no_subnet       = false
  default_pool_size = "${var.default_pool_size}"
  update_all_workers = false
  worker_num      = "${var.workers}"
  resource_group_id = "${data.ibm_resource_group.group.id}"
  webhook = [
    {
      level = "Normal"
      type  = "slack"
      url   = "${var.slack_webhook}"
    },
  ]
  billing      = "hourly"
}

# Use data source to get VLANs
data "ibm_container_cluster" "k8s" {
  cluster_name_id = "${ibm_container_cluster.k8s.id}"
  resource_group_id = "${data.ibm_resource_group.group.id}"
}

data "ibm_container_cluster_config" "kubeconfig" {
  cluster_name_id = "${ibm_container_cluster.k8s.name}"
  config_dir      = "/data/k8s"
  admin           = true
  resource_group_id = "${data.ibm_resource_group.group.id}"

  depends_on = ["ibm_container_cluster.k8s"]
}

output "name" {
  value = "${var.name}"
}

output "public_name" {
  value = "${var.public_name}"
}

output "ingress" {
  value = "${ibm_container_cluster.k8s.ingress_hostname}"
}

output "config_path" {
  value = "${data.ibm_container_cluster_config.kubeconfig.config_file_path}"
}

# ---

variable "pool_u2c_2x4_size" {default=0}
variable "pool_b2c_16x64_size" {default=0}
variable "pool_b2c_8x32_size" {default=0}
variable "pool_u2c_2x4_enabled" {default=0}
variable "pool_b2c_16x64_enabled" {default=0}
variable "pool_b2c_8x32_enabled" {default=0}

variable "region" {}

resource "ibm_container_worker_pool" "pool_u2c_2x4" {
  count = "${var.pool_u2c_2x4_enabled}"
  worker_pool_name = "pool_u2c_2x4"
  machine_type     = "u2c.2x4"
  cluster          = "${ibm_container_cluster.k8s.name}"
  size_per_zone    = "${var.pool_u2c_2x4_size}"
  hardware         = "shared"
  disk_encryption  = "true"
  region           = "${var.region}"
  resource_group_id = "${data.ibm_resource_group.group.id}"
}

resource "ibm_container_worker_pool_zone_attachment" "pool_u2c_2x4" {
  count = "${var.pool_u2c_2x4_enabled}"
  cluster         = "${ibm_container_cluster.k8s.name}"
  worker_pool     = "pool_u2c_2x4"
  zone            = "${var.datacenter}"
  private_vlan_id = "${var.private_vlan_id}"
  public_vlan_id  = "${var.public_vlan_id}"
  region          = "${var.region}"
  resource_group_id = "${data.ibm_resource_group.group.id}"
  depends_on      = ["ibm_container_worker_pool.pool_u2c_2x4"]
}

resource "ibm_container_worker_pool" "pool_b2c_16x64" {
  count = "${var.pool_b2c_16x64_enabled}"
  worker_pool_name = "pool_b2c_16x64"
  machine_type     = "b2c.16x64"
  cluster          = "${ibm_container_cluster.k8s.name}"
  size_per_zone    = "${var.pool_b2c_16x64_size}"
  hardware         = "shared"
  disk_encryption  = "true"
  region           = "${var.region}"
  resource_group_id = "${data.ibm_resource_group.group.id}"
}

resource "ibm_container_worker_pool_zone_attachment" "pool_b2c_16x64" {
  count = "${var.pool_b2c_16x64_enabled}"
  cluster         = "${ibm_container_cluster.k8s.name}"
  worker_pool     = "pool_b2c_16x64"
  zone            = "${var.datacenter}"
  private_vlan_id = "${var.private_vlan_id}"
  public_vlan_id  = "${var.public_vlan_id}"
  region          = "${var.region}"
  resource_group_id = "${data.ibm_resource_group.group.id}"
  depends_on      = ["ibm_container_worker_pool.pool_b2c_16x64"]
}

resource "ibm_container_worker_pool" "pool_b2c_8x32" {
  count = "${var.pool_b2c_8x32_enabled}"
  worker_pool_name = "pool_b2c_8x32"
  machine_type     = "b2c.8x32"
  cluster          = "${ibm_container_cluster.k8s.name}"
  size_per_zone    = "${var.pool_b2c_8x32_size}"
  hardware         = "shared"
  disk_encryption  = "true"
  region           = "${var.region}"
  resource_group_id = "${data.ibm_resource_group.group.id}"
}

resource "ibm_container_worker_pool_zone_attachment" "pool_b2c_8x32" {
  count = "${var.pool_b2c_8x32_enabled}"
  cluster         = "${ibm_container_cluster.k8s.name}"
  worker_pool     = "pool_b2c_8x32"
  zone            = "${var.datacenter}"
  private_vlan_id = "${var.private_vlan_id}"
  public_vlan_id  = "${var.public_vlan_id}"
  region          = "${var.region}"
  resource_group_id = "${data.ibm_resource_group.group.id}"
  depends_on      = ["ibm_container_worker_pool.pool_b2c_8x32"]
}

Debug Output

https://gist.github.com/pregnor/cead3f27ed46b15ffbf37f5f4c8bf059

Panic Output

-

Expected Behavior

What should have happened?

No error, successful Terraform apply. As the cluster was created successfully and the worker pool was created successfully we had expected for the worker pool zone attachment to be able to be created successfully as well.

Actual Behavior

What actually happened?

Pool zone attachment creation failed with

* ibm_container_worker_pool_zone_attachment.pool_b2c_8x32: Error waiting for workers of worker pool (pool_b2c_8x32) of cluster ([redacted]) to become ready: Error retrieving workers for cluster: Request failed with status code: 404, ServerErrorResponse: {"incidentID":"51959db082777bbe-PRG","code":"G0004","description":"The specified cluster could not be found. If you use resource groups, make sure that you target the correct resource group.","type":"General","recoveryCLI":"To list the clusters you have access to, run 'ibmcloud ks clusters'. To list the resource groups that you have access to, run 'ibmcloud resource groups'. To target the resource group, run 'ibmcloud target -g \u003cresource_group\u003e'."}

even though the cluster and the worker pool had been created successfully beforehand.

As later on a simple replan + reapply resolved the issue, from a user's perspective it felt like it's a concurrency or redundancy issue - waiting for the workers to be available or retrying sporadic failures. But this is just a wild guess.

Steps to Reproduce

Please list the steps required to reproduce the issue, for example:

Define a new cluster with a worker pool in addition to the default one and a corresponding pool zone attachment.
terraform apply the changes all at once.
Observe the cluster and the additional worker pool being created successfully while the pool zone attachment fails to retrieve the created workers.
Wait a couple of hours.
Check the cluster being functional.
Try terraform apply once again.
Observe the apply output to create the worker pool zone attachment and finish successfully.

Important Factoids

Are there anything atypical about your accounts that we should know? For example: Running in EC2 Classic? Custom version of OpenStack? Tight ACLs?

-

References

Are there any other GitHub issues (open or closed) or Pull Requests that should be linked here? For example:

I had run this search: https://github.com/IBM-Cloud/terraform-provider-ibm/issues?utf8=%E2%9C%93&q=is%3Aissue+pool.

Checked out issues:

343
357
358
405
496
522
532
628
657

but none of those seemed to produce the exact issue I'm having, that's why I'm only mentioning their numbers without referencing them directly with a hashtag.

hkantare commented 5 years ago

Instead of names worker_pool = "pool_b2c_8x32" zone = "${var.datacenter}" We should use the ID of the resources so that resources will wait until all the dependent resources are provisioned and available.

Here is the sample example which list how to use the id's of resources https://github.com/IBM-Cloud/terraform-provider-ibm/blob/master/examples/ibm-cluster/cluster-worker-pool-zone/main.tf

pregnor commented 5 years ago

So if the value of worker_pool in the ibm_container_worker_pool_zone_attachment definition is a worker pool name instead of a worker pool ID then the waiting mechanism doesn't kick in, yet if the value is a worker pool ID then the waiting mechanism works as expected?
In the example you have linked the default pool zone attachment uses pool name instead of pool ID as far as I can see which may cause confusion regarding the pool name being permissible to be used in the zone attachment: https://github.com/IBM-Cloud/terraform-provider-ibm/blob/master/examples/ibm-cluster/cluster-worker-pool-zone/main.tf#L32 I can see what you meant at https://github.com/IBM-Cloud/terraform-provider-ibm/blob/master/examples/ibm-cluster/cluster-worker-pool-zone/main.tf#L53, but still it is not clarified whether there is any difference between pool names and IDs in case of any pool, default or not.
In the documentation (https://ibm-cloud.github.io/tf-ibm-docs/v0.17.3/r/container_worker_pool_zone_attachment.html / Argument Reference / worker_pool) the documentation states worker_pool - (Required, string) The name or id of the worker pool. which doesn't highlight any difference between names and IDs, in case it does matter I would be glad to have this included in the documentation to avoid such easily preventable issues.

IBM-Cloud / terraform-provider-ibm