astronomer / terraform-provider-astro

Astro Terraform Provider
https://registry.terraform.io/providers/astronomer/astro/latest
Other
11 stars 4 forks source link

Cannot create multiple clusters #53

Closed ichung08 closed 6 months ago

ichung08 commented 6 months ago

Describe the bug

Unable to create multiple clusters in the same main.tf file when running terraform apply. Only the first cluster in the main.tf file is created, and the other clusters fail immediately.

Error Message:

Error: Client error
│ 
│   with astro_deployment.team_2_prod_deployment,
│   on make.tf line 242, in resource "astro_deployment" "team_2_prod_deployment":
│  242: resource "astro_deployment" "team_2_prod_deployment" {
│ 
│ Internal server error, status: 500, requestId: ceece673-a6c1-4774-9b4a-95dd1fa9c282
╵
╷
│ Error: Client error
│ 
│   with astro_workspace.team_3_workspace_prod,
│   on make.tf line 346, in resource "astro_workspace" "team_3_workspace_prod":
│  346: resource "astro_workspace" "team_3_workspace_prod" {
│ 
│ failed to perform request, status: 429
╵

What Terraform Provider Version and Terraform Version did you experience this bug? Latest

What Operating System is the above Terraform Provider installed on? macOS

🪜 Steps To Reproduce

Run terraform apply with the following in main.tf:

# Provider requirements: pull the Astro provider from the public registry.
terraform {
  required_providers {
    astro = {
      source = "registry.terraform.io/astronomer/astro"
    }
  }
}

# Provider configuration — replace the placeholder with your Astro organization ID.
provider "astro" {
  organization_id = "YOUR-ORG-ID"
}

# Workspace holding all of team 1's deployments.
resource "astro_workspace" "team_1_workspace" {
  name                  = "team-1-workspace"
  description           = "Team 1 workspace"
  cicd_enforced_default = true
}

# Dedicated AWS cluster for team 1. Cluster provisioning is a long-running
# operation, hence the multi-hour create/update timeouts.
resource "astro_cluster" "team_1_cluster" {
  type             = "DEDICATED"
  name             = "team-1-aws-cluster"
  region           = "us-east-1"
  cloud_provider   = "AWS"
  db_instance_type = "db.m6g.large"
  vpc_subnet_range = "172.20.0.0/20"
  workspace_ids    = []
  timeouts = {
    create = "3h"
    update = "2h"
    # Cluster teardown takes far longer than 1m; 30m matches the
    # delete timeout used in the workaround example later in this thread.
    delete = "30m"
  }
}

# Standard (Astro-hosted) dev deployment in team 1's workspace.
resource "astro_deployment" "team_1_dev_deployment" {
  name                    = "team-1-dev-deployment"
  description             = "Team 1 Dev Deployment"
  type                    = "STANDARD"
  cloud_provider          = "AWS"
  region                  = "us-east-1"
  contact_emails          = []
  default_task_pod_cpu    = "0.25"
  default_task_pod_memory = "0.5Gi"
  executor                = "CELERY"
  is_cicd_enforced        = true
  is_dag_deploy_enabled   = true
  is_development_mode     = false
  is_high_availability    = false
  resource_quota_cpu      = "10"
  resource_quota_memory   = "20Gi"
  scheduler_size          = "SMALL"
  workspace_id            = astro_workspace.team_1_workspace.id
  environment_variables   = []
  worker_queues = [{
    name               = "default"
    is_default         = true
    astro_machine      = "A5"
    max_worker_count   = 10
    min_worker_count   = 0
    worker_concurrency = 1
  }]
  scaling_spec = {
    hibernation_spec = {
      override = {
        is_hibernating = true
        # NOTE(review): fixed end date — this hibernation override expires at
        # the timestamp below; update it if reusing this config.
        override_until = "2025-03-01T13:00:00Z"
      }
    }
  }
}

# Standard stage deployment for team 1 — identical to the dev deployment
# except for its name and description.
resource "astro_deployment" "team_1_stage_deployment" {
  name                    = "team-1-stage-deployment"
  description             = "Team 1 Stage Deployment"
  type                    = "STANDARD"
  cloud_provider          = "AWS"
  region                  = "us-east-1"
  contact_emails          = []
  default_task_pod_cpu    = "0.25"
  default_task_pod_memory = "0.5Gi"
  executor                = "CELERY"
  is_cicd_enforced        = true
  is_dag_deploy_enabled   = true
  is_development_mode     = false
  is_high_availability    = false
  resource_quota_cpu      = "10"
  resource_quota_memory   = "20Gi"
  scheduler_size          = "SMALL"
  workspace_id            = astro_workspace.team_1_workspace.id
  environment_variables   = []
  worker_queues = [{
    name               = "default"
    is_default         = true
    astro_machine      = "A5"
    max_worker_count   = 10
    min_worker_count   = 0
    worker_concurrency = 1
  }]
  scaling_spec = {
    hibernation_spec = {
      override = {
        is_hibernating = true
        override_until = "2025-03-01T13:00:00Z"
      }
    }
  }
}

# Dedicated prod deployment pinned to team 1's dedicated cluster
# (cluster_id replaces the cloud_provider/region pair used by STANDARD deployments).
resource "astro_deployment" "team_1_prod_deployment" {
  name                    = "team-1-prod-deployment"
  description             = "Team 1 Prod Deployment"
  type                    = "DEDICATED"
  cluster_id              = astro_cluster.team_1_cluster.id
  contact_emails          = ["preview@astronomer.test"]
  default_task_pod_cpu    = "0.25"
  default_task_pod_memory = "0.5Gi"
  executor                = "KUBERNETES"
  is_cicd_enforced        = true
  is_dag_deploy_enabled   = true
  is_development_mode     = false
  is_high_availability    = true
  resource_quota_cpu      = "10"
  resource_quota_memory   = "20Gi"
  scheduler_size          = "SMALL"
  workspace_id            = astro_workspace.team_1_workspace.id
  environment_variables = [{
    key       = "key1"
    value     = "value1"
    is_secret = false
  }]
}

# Workspace holding all of team 2's deployments.
resource "astro_workspace" "team_2_workspace" {
  name                  = "team-2-workspace"
  description           = "Team 2 workspace"
  cicd_enforced_default = true
}

# Dedicated AWS cluster for team 2.
# NOTE(review): same vpc_subnet_range as team_1_cluster — acceptable only if
# each dedicated cluster gets its own isolated VPC; confirm before peering.
resource "astro_cluster" "team_2_cluster" {
  type             = "DEDICATED"
  name             = "team-2-aws-cluster"
  region           = "us-east-1"
  cloud_provider   = "AWS"
  db_instance_type = "db.m6g.large"
  vpc_subnet_range = "172.20.0.0/20"
  workspace_ids    = []
  timeouts = {
    create = "3h"
    update = "2h"
    # Cluster teardown takes far longer than 1m; 30m matches the
    # delete timeout used in the workaround example later in this thread.
    delete = "30m"
  }
}

# Standard dev deployment for team 2 — mirrors team 1's dev deployment.
resource "astro_deployment" "team_2_dev_deployment" {
  name                    = "team-2-dev-deployment"
  description             = "Team 2 Dev Deployment"
  type                    = "STANDARD"
  cloud_provider          = "AWS"
  region                  = "us-east-1"
  contact_emails          = []
  default_task_pod_cpu    = "0.25"
  default_task_pod_memory = "0.5Gi"
  executor                = "CELERY"
  is_cicd_enforced        = true
  is_dag_deploy_enabled   = true
  is_development_mode     = false
  is_high_availability    = false
  resource_quota_cpu      = "10"
  resource_quota_memory   = "20Gi"
  scheduler_size          = "SMALL"
  workspace_id            = astro_workspace.team_2_workspace.id
  environment_variables   = []
  worker_queues = [{
    name               = "default"
    is_default         = true
    astro_machine      = "A5"
    max_worker_count   = 10
    min_worker_count   = 0
    worker_concurrency = 1
  }]
  scaling_spec = {
    hibernation_spec = {
      override = {
        is_hibernating = true
        override_until = "2025-03-01T13:00:00Z"
      }
    }
  }
}

# Standard stage deployment for team 2 — mirrors team 1's stage deployment.
resource "astro_deployment" "team_2_stage_deployment" {
  name                    = "team-2-stage-deployment"
  description             = "Team 2 Stage Deployment"
  type                    = "STANDARD"
  cloud_provider          = "AWS"
  region                  = "us-east-1"
  contact_emails          = []
  default_task_pod_cpu    = "0.25"
  default_task_pod_memory = "0.5Gi"
  executor                = "CELERY"
  is_cicd_enforced        = true
  is_dag_deploy_enabled   = true
  is_development_mode     = false
  is_high_availability    = false
  resource_quota_cpu      = "10"
  resource_quota_memory   = "20Gi"
  scheduler_size          = "SMALL"
  workspace_id            = astro_workspace.team_2_workspace.id
  environment_variables   = []
  worker_queues = [{
    name               = "default"
    is_default         = true
    astro_machine      = "A5"
    max_worker_count   = 10
    min_worker_count   = 0
    worker_concurrency = 1
  }]
  scaling_spec = {
    hibernation_spec = {
      override = {
        is_hibernating = true
        override_until = "2025-03-01T13:00:00Z"
      }
    }
  }
}

# Dedicated prod deployment pinned to team 2's dedicated cluster.
# This is the resource the 500 error in the report points at.
resource "astro_deployment" "team_2_prod_deployment" {
  name                    = "team-2-prod-deployment"
  description             = "Team 2 Prod Deployment"
  type                    = "DEDICATED"
  cluster_id              = astro_cluster.team_2_cluster.id
  contact_emails          = ["preview@astronomer.test"]
  default_task_pod_cpu    = "0.25"
  default_task_pod_memory = "0.5Gi"
  executor                = "KUBERNETES"
  is_cicd_enforced        = true
  is_dag_deploy_enabled   = true
  is_development_mode     = false
  is_high_availability    = true
  resource_quota_cpu      = "10"
  resource_quota_memory   = "20Gi"
  scheduler_size          = "SMALL"
  workspace_id            = astro_workspace.team_2_workspace.id
  environment_variables = [{
    key       = "key1"
    value     = "value1"
    is_secret = false
  }]
}

# Workspace holding all of team 3's deployments.
resource "astro_workspace" "team_3_workspace" {
  name                  = "team-3-workspace"
  description           = "Team 3 workspace"
  cicd_enforced_default = true
}

# Dedicated AWS cluster for team 3 — mirrors the team 1/2 clusters.
resource "astro_cluster" "team_3_cluster" {
  type             = "DEDICATED"
  name             = "team-3-aws-cluster"
  region           = "us-east-1"
  cloud_provider   = "AWS"
  db_instance_type = "db.m6g.large"
  vpc_subnet_range = "172.20.0.0/20"
  workspace_ids    = []
  timeouts = {
    create = "3h"
    update = "2h"
    # Cluster teardown takes far longer than 1m; 30m matches the
    # delete timeout used in the workaround example later in this thread.
    delete = "30m"
  }
}

# Standard dev deployment for team 3 — mirrors team 1's dev deployment.
resource "astro_deployment" "team_3_dev_deployment" {
  name                    = "team-3-dev-deployment"
  description             = "Team 3 Dev Deployment"
  type                    = "STANDARD"
  cloud_provider          = "AWS"
  region                  = "us-east-1"
  contact_emails          = []
  default_task_pod_cpu    = "0.25"
  default_task_pod_memory = "0.5Gi"
  executor                = "CELERY"
  is_cicd_enforced        = true
  is_dag_deploy_enabled   = true
  is_development_mode     = false
  is_high_availability    = false
  resource_quota_cpu      = "10"
  resource_quota_memory   = "20Gi"
  scheduler_size          = "SMALL"
  workspace_id            = astro_workspace.team_3_workspace.id
  environment_variables   = []
  worker_queues = [{
    name               = "default"
    is_default         = true
    astro_machine      = "A5"
    max_worker_count   = 10
    min_worker_count   = 0
    worker_concurrency = 1
  }]
  scaling_spec = {
    hibernation_spec = {
      override = {
        is_hibernating = true
        override_until = "2025-03-01T13:00:00Z"
      }
    }
  }
}

# Standard stage deployment for team 3 — mirrors team 1's stage deployment.
resource "astro_deployment" "team_3_stage_deployment" {
  name                    = "team-3-stage-deployment"
  description             = "Team 3 Stage Deployment"
  type                    = "STANDARD"
  cloud_provider          = "AWS"
  region                  = "us-east-1"
  contact_emails          = []
  default_task_pod_cpu    = "0.25"
  default_task_pod_memory = "0.5Gi"
  executor                = "CELERY"
  is_cicd_enforced        = true
  is_dag_deploy_enabled   = true
  is_development_mode     = false
  is_high_availability    = false
  resource_quota_cpu      = "10"
  resource_quota_memory   = "20Gi"
  scheduler_size          = "SMALL"
  workspace_id            = astro_workspace.team_3_workspace.id
  environment_variables   = []
  worker_queues = [{
    name               = "default"
    is_default         = true
    astro_machine      = "A5"
    max_worker_count   = 10
    min_worker_count   = 0
    worker_concurrency = 1
  }]
  scaling_spec = {
    hibernation_spec = {
      override = {
        is_hibernating = true
        override_until = "2025-03-01T13:00:00Z"
      }
    }
  }
}

# Dedicated prod deployment pinned to team 3's dedicated cluster.
resource "astro_deployment" "team_3_prod_deployment" {
  name                    = "team-3-prod-deployment"
  description             = "Team 3 Prod Deployment"
  type                    = "DEDICATED"
  cluster_id              = astro_cluster.team_3_cluster.id
  contact_emails          = ["preview@astronomer.test"]
  default_task_pod_cpu    = "0.25"
  default_task_pod_memory = "0.5Gi"
  executor                = "KUBERNETES"
  is_cicd_enforced        = true
  is_dag_deploy_enabled   = true
  is_development_mode     = false
  is_high_availability    = true
  resource_quota_cpu      = "10"
  resource_quota_memory   = "20Gi"
  scheduler_size          = "SMALL"
  workspace_id            = astro_workspace.team_3_workspace.id
  environment_variables = [{
    key       = "key1"
    value     = "value1"
    is_secret = false
  }]
}

📸 Screenshots

image

vandyliu commented 6 months ago

So this is a limitation of the API and the cloud providers: creating multiple clusters with the same cloud provider at close to the same time may run into errors, especially on AWS. One workaround is to delay the second cluster's create API call by adding a time_sleep resource, like so:

# Provider requirements for the workaround: the hashicorp/time provider
# supplies the time_sleep resource used below to stagger cluster creation.
terraform {
  required_providers {
    astro = {
      source = "astronomer/astro"
      version = "0.1.0-alpha"
    }
    time = {
      source = "hashicorp/time"
      version = "0.11.1"
    }
  }
}

# Provider configuration — replace the placeholder with your Astro organization ID.
provider "astro" {
  organization_id = "XXX"
}

# First dedicated GCP cluster — created immediately when apply starts.
# (Reformatted with consistent indentation and `=` alignment, per terraform fmt;
# the original had two attributes indented at a different depth.)
resource "astro_cluster" "team_1_cluster" {
  type                  = "DEDICATED"
  name                  = "team-1-gcp-cluster-test"
  region                = "us-east4"
  cloud_provider        = "GCP"
  db_instance_type      = "Small General Purpose"
  vpc_subnet_range      = "172.20.0.0/20"
  pod_subnet_range      = "172.21.0.0/19"
  service_peering_range = "172.23.0.0/20"
  service_subnet_range  = "172.22.0.0/22"
  workspace_ids         = []
  timeouts = {
    create = "3h"
    update = "2h"
    delete = "30m"
  }
}

# Delays anything that depends_on this resource by 3 minutes.
# NOTE(review): the timer starts when `terraform apply` begins and runs
# concurrently with team_1_cluster's creation — it staggers the create API
# calls; it does not wait for the first cluster to finish provisioning.
resource "time_sleep" "wait_3_minutes" {
  create_duration = "3m"
}

# Second dedicated GCP cluster — depends_on the time_sleep resource so its
# create API call is issued ~3 minutes after apply starts, avoiding the
# concurrent-creation errors described above.
# (Reformatted with consistent indentation and `=` alignment, per terraform fmt;
# the original had two attributes indented at a different depth.)
resource "astro_cluster" "team_2_cluster" {
  type                  = "DEDICATED"
  name                  = "team-2-gcp-cluster-test"
  region                = "us-east4"
  cloud_provider        = "GCP"
  db_instance_type      = "Small General Purpose"
  vpc_subnet_range      = "172.20.0.0/20"
  pod_subnet_range      = "172.21.0.0/19"
  service_peering_range = "172.23.0.0/20"
  service_subnet_range  = "172.22.0.0/22"
  workspace_ids         = []
  timeouts = {
    create = "3h"
    update = "2h"
    delete = "30m"
  }
  depends_on = [time_sleep.wait_3_minutes]
}

Here, Terraform will wait a few minutes before creating the second cluster. Once the second cluster has been created, you can safely remove the time_sleep resource from your Terraform file and use the file as normal again.