databricks / terraform-provider-databricks

Databricks Terraform Provider
https://registry.terraform.io/providers/databricks/databricks/latest

[ISSUE] AWS Destroy databricks_cluster & databricks_instance_pool do not terminate instances #1306

Closed: tnk-dev closed this issue 2 years ago

tnk-dev commented 2 years ago

Hi there,

Setting up my cluster with an instance pool works great, but when I destroy it, it leaves workerenv-* instances running, so my Databricks VPC won't destroy either.

Configuration

main.tf

module "vpc" {
  source                   = "../../../modules/databricks_vpc"
  databricks_account_id    = var.databricks_account_id
  databricks_user_name     = var.databricks_user_name
  databricks_user_password = var.databricks_user_password
  sigil                    = local.sigil
  tags                     = local.tags
  vpc_cidr_block           = var.vpc_cidr_block
}

module "workspace" {
  source                   = "../../../modules/databricks_workspace"
  databricks_account_id    = var.databricks_account_id
  databricks_user_name     = var.databricks_user_name
  databricks_user_password = var.databricks_user_password
  region                   = var.region
  sigil                    = local.sigil
  tags                     = local.tags
  vpc_cidr_block           = var.vpc_cidr_block
  network_id               = module.vpc.network_id
  data_role                = data.terraform_remote_state.datalake.outputs.data_role
}

module "cluster" {
  source             = "../../../modules/databricks_cluster"
  number_of_clusters = 1
  sigil              = local.sigil
  tags               = local.tags
  region             = var.region

  workspace       = module.workspace.workspace
  workspace_url   = module.workspace.workspace_url
  workspace_token = module.workspace.workspace_token

  cluster_driver_memory                      = 1
  cluster_executor_memory                    = 1
  cluster_gb_per_core                        = 1
  cluster_instance_spot_pid_price_percentage = 50
  cluster_max_dbus_per_hour                  = 5
  cluster_max_workers                        = 5
  cluster_min_cores                          = 4
  data_role                                  = data.terraform_remote_state.datalake.outputs.data_role
}

databricks_vpc

data "aws_availability_zones" "this" {}

module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "3.2.0"

  name = var.sigil
  cidr = var.vpc_cidr_block
  azs  = data.aws_availability_zones.this.names
  tags = var.tags

  enable_dns_hostnames = true
  enable_nat_gateway   = true
  single_nat_gateway   = true
  create_igw           = true

  public_subnets = [cidrsubnet(var.vpc_cidr_block, 3, 0)]
  private_subnets = [
    cidrsubnet(var.vpc_cidr_block, 3, 1),
    cidrsubnet(var.vpc_cidr_block, 3, 2)
  ]

  manage_default_security_group = true
  default_security_group_name   = "${var.sigil}-sg"

  default_security_group_egress = [
    {
      cidr_blocks = "0.0.0.0/0"
    }
  ]

  default_security_group_ingress = [
    {
      description = "Allow all internal TCP and UDP"
      self        = true
    }
  ]
}

module "vpc_endpoints" {
  source  = "terraform-aws-modules/vpc/aws//modules/vpc-endpoints"
  version = "3.2.0"

  vpc_id             = module.vpc.vpc_id
  security_group_ids = [module.vpc.default_security_group_id]
  tags               = var.tags
  endpoints = {
    s3 = {
      service      = "s3"
      service_type = "Gateway"
      route_table_ids = flatten([
        module.vpc.private_route_table_ids,
        module.vpc.public_route_table_ids
      ])
      tags = merge(var.tags, {
        Name = "${var.sigil}-s3-vpc-endpoint"
      })
    },
    sts = {
      service             = "sts"
      private_dns_enabled = true
      subnet_ids          = module.vpc.private_subnets
      tags = merge(var.tags, {
        Name = "${var.sigil}-sts-vpc-endpoint"
      })
    },
  }
}

resource "databricks_mws_networks" "this" {
  provider           = databricks.mws
  account_id         = var.databricks_account_id
  network_name       = var.sigil
  security_group_ids = [module.vpc.default_security_group_id]
  subnet_ids         = module.vpc.private_subnets
  vpc_id             = module.vpc.vpc_id
}

databricks_workspace

data "databricks_aws_assume_role_policy" "this" {
  external_id = var.databricks_account_id
}

/* DB Janitor */

data "databricks_aws_crossaccount_policy" "s3_data" {
  depends_on = [var.data_role]
  pass_roles = [var.data_role.arn]
}

resource "aws_iam_policy" "janitor" {
  name   = "${var.sigil}-workspace-janitor"
  policy = data.databricks_aws_crossaccount_policy.s3_data.json
}

resource "aws_iam_role" "janitor" {
  name               = "${var.sigil}-workspace-janitor"
  assume_role_policy = data.databricks_aws_assume_role_policy.this.json
}

resource "aws_iam_role_policy_attachment" "janitor" {
  policy_arn = aws_iam_policy.janitor.arn
  role       = aws_iam_role.janitor.name
}

resource "databricks_mws_credentials" "this" {
  provider         = databricks.mws
  account_id       = var.databricks_account_id
  credentials_name = var.sigil
  role_arn         = aws_iam_role.janitor.arn
}

# S3 Root Storage

module "s3_storage_bucket" {
  source      = "../s3-bucket"
  sigil       = var.sigil
  bucket_name = "root"
  tags        = var.tags
}

data "databricks_aws_bucket_policy" "this" {
  bucket = module.s3_storage_bucket.bucket.bucket
}

resource "aws_s3_bucket_policy" "this" {
  depends_on = [module.s3_storage_bucket]
  bucket     = module.s3_storage_bucket.bucket.id
  policy     = data.databricks_aws_bucket_policy.this.json
}

# S3 Root Storage Config

resource "databricks_mws_storage_configurations" "this" {
  provider                   = databricks.mws
  account_id                 = var.databricks_account_id
  bucket_name                = module.s3_storage_bucket.bucket.bucket
  storage_configuration_name = var.sigil
}

# Workspace

resource "databricks_mws_workspaces" "this" {
  provider       = databricks.mws
  account_id     = var.databricks_account_id
  aws_region     = var.region
  workspace_name = var.sigil

  credentials_id           = databricks_mws_credentials.this.credentials_id
  storage_configuration_id = databricks_mws_storage_configurations.this.storage_configuration_id
  network_id               = var.network_id

  token {
    comment = "${var.sigil}-terraform"
  }
}

databricks_cluster

data "databricks_current_user" "this" {
  depends_on = [var.workspace]
}

data "databricks_spark_version" "this" {
  depends_on        = [var.workspace]
  spark_version     = var.spark_version
  long_term_support = true
}

data "databricks_node_type" "this" {
  depends_on            = [var.workspace]
  min_cores             = var.cluster_min_cores
  gb_per_core           = var.cluster_gb_per_core
  photon_worker_capable = true
  photon_driver_capable = true
  local_disk            = true
}

resource "databricks_secret_scope" "this" {
  name = "${var.sigil}-secret_scope"
}

resource "databricks_token" "this" {
  comment          = "Created from ${abspath(path.module)}"
  lifetime_seconds = 3600
}

resource "databricks_secret" "this" {
  string_value = databricks_token.this.token_value
  scope        = databricks_secret_scope.this.name
  key          = "token"
}

resource "databricks_notebook" "this" {
  path           = "/${var.sigil}-shared/Template"
  language       = "PYTHON"
  content_base64 = base64encode(<<-EOT
    from pyspark.sql import SparkSession, DataFrame, Row
    EOT
  )
}

resource "databricks_instance_pool" "this" {
  instance_pool_name       = "${var.sigil}-instance-pool"
  min_idle_instances       = 0
  max_capacity             = 30
  node_type_id             = data.databricks_node_type.this.id
  #  If ever needed: enable_elastic_disk = true
  preloaded_spark_versions = [
    data.databricks_spark_version.this.id
  ]

  aws_attributes {
    availability           = "SPOT"
    zone_id                = var.region
    spot_bid_price_percent = var.cluster_instance_spot_pid_price_percentage
  }

  idle_instance_autotermination_minutes = 20

}

resource "aws_iam_instance_profile" "this" {
  name = "${var.sigil}-s3-data"
  role = var.data_role.name
}

resource "databricks_instance_profile" "this" {
  depends_on           = [aws_iam_instance_profile.this]
  instance_profile_arn = aws_iam_instance_profile.this.arn
}

resource "databricks_cluster" "this" {
  count                   = var.number_of_clusters
  depends_on              = [databricks_instance_pool.this]
  cluster_name            = "${var.sigil}-cluster-${count.index}"
  spark_version           = data.databricks_spark_version.this.id
  instance_pool_id        = databricks_instance_pool.this.id
  autotermination_minutes = 20
  autoscale {
    min_workers = 1
    max_workers = var.cluster_max_workers
  }
  aws_attributes {
    availability           = "SPOT"
    zone_id                = var.region
    first_on_demand        = 1
    spot_bid_price_percent = var.cluster_instance_spot_pid_price_percentage
    instance_profile_arn   = databricks_instance_profile.this.id
  }

  custom_tags = var.tags

}

Expected Behavior

The EC2 instances created by databricks_cluster and databricks_instance_pool are terminated on destroy.

Actual Behavior

All instances that the cluster spun up stay running even though the workspace and the Databricks cluster/instance pool no longer exist. As a result, I can't destroy the VPC, since those instances still depend on it.

(Screenshot from the original issue showing the workerenv-* instances still running.)
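For reference, a minimal sketch (resource and output names are placeholders) of a data source that lists the instances still attached to the VPC and blocking the subnet deletion:

data "aws_instances" "leftover_workers" {
  # EC2 instances still running inside the Databricks VPC
  filter {
    name   = "vpc-id"
    values = [module.vpc.vpc_id]
  }

  instance_state_names = ["pending", "running"]
}

output "leftover_worker_ids" {
  value = data.aws_instances.leftover_workers.ids
}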

By the way:

How do you control the naming of those instances?

How do you control the instance type?

Terraform and provider versions

    databricks = {
      source  = "databrickslabs/databricks"
      version = "0.5.6"
    }

Debug Output

2022-05-10T12:05:31.119+0200 [DEBUG] provider.terraform-provider-aws_v4.12.1_x5: [aws-sdk-go] DEBUG: Request ec2/DeleteSubnet Details:
---[ REQUEST POST-SIGN ]-----------------------------
POST / HTTP/1.1
Host: ec2.eu-central-1.amazonaws.com
User-Agent: APN/1.0 HashiCorp/1.0 Terraform/1.1.9 (+https://www.terraform.io) terraform-provider-aws/dev (+https://registry.terraform.io/providers/hashicorp/aws) aws-sdk-go/1.44.0 (go1.17.6; darwin; arm64)
Content-Length: 72
Authorization: AWS4-HMAC-SHA256 Credential=***/20220510/eu-central-1/ec2/aws4_request, SignedHeaders=content-length;content-type;host;x-amz-date, Signature=***
Content-Type: application/x-www-form-urlencoded; charset=utf-8
X-Amz-Date: 20220510T100531Z
Accept-Encoding: gzip

Action=DeleteSubnet&SubnetId=subnet-07f50f311f1045692&Version=2016-11-15
-----------------------------------------------------: timestamp=2022-05-10T12:05:31.119+0200
2022-05-10T12:05:31.537+0200 [DEBUG] provider.terraform-provider-aws_v4.12.1_x5: [aws-sdk-go] DEBUG: Response ec2/DeleteSubnet Details:
---[ RESPONSE ]--------------------------------------
HTTP/1.1 400 Bad Request
Connection: close
Transfer-Encoding: chunked
Cache-Control: no-cache, no-store
Content-Type: text/xml;charset=UTF-8
Date: Tue, 10 May 2022 10:05:31 GMT
Server: AmazonEC2
Strict-Transport-Security: max-age=31536000; includeSubDomains
Vary: accept-encoding
X-Amzn-Requestid: c511f097-cb7b-4e17-a03d-5a06ea9c0d8b

-----------------------------------------------------: timestamp=2022-05-10T12:05:31.536+0200
2022-05-10T12:05:31.537+0200 [DEBUG] provider.terraform-provider-aws_v4.12.1_x5: [aws-sdk-go] <?xml version="1.0" encoding="UTF-8"?>
<Response><Errors><Error><Code>DependencyViolation</Code><Message>The subnet 'subnet-07f50f311f1045692' has dependencies and cannot be deleted.</Message></Error></Errors><RequestID>c511f097-cb7b-4e17-a03d-5a06ea9c0d8b</RequestID></Response>: timestamp=2022-05-10T12:05:31.537+0200
2022-05-10T12:05:31.537+0200 [DEBUG] provider.terraform-provider-aws_v4.12.1_x5: [aws-sdk-go] DEBUG: Validate Response ec2/DeleteSubnet failed, attempt 0/25, error DependencyViolation: The subnet 'subnet-07f50f311f1045692' has dependencies and cannot be deleted.
    status code: 400, request id: c511f097-cb7b-4e17-a03d-5a06ea9c0d8b: timestamp=2022-05-10T12:05:31.537+0200
module.vpc.module.vpc.aws_subnet.private[1]: Still destroying... [id=subnet-07f50f311f1045692, 20s elapsed]
nfx commented 2 years ago

@tnk-dev I think it's either a known issue or a platform bug, not related to the provider. Please create a support ticket for this.

nfx commented 2 years ago

Instance type is controlled via node_type_id, though I see you're using databricks_node_type to dynamically pick an instance type.

As far as I know, there's no way to control the naming of instances, but you can propagate tags via clusters and pools.
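
For illustration, a minimal sketch of both points, assuming a fixed node type and an example tag (the i3.xlarge value and the tag name are placeholders; data.databricks_spark_version.this refers to the data source from the configuration above):

resource "databricks_instance_pool" "example" {
  instance_pool_name = "example-instance-pool"
  min_idle_instances = 0
  max_capacity       = 10

  # Pin the instance type explicitly instead of resolving it via databricks_node_type
  node_type_id = "i3.xlarge"

  preloaded_spark_versions = [
    data.databricks_spark_version.this.id
  ]

  # Tags set on the pool are applied to the EC2 instances it launches,
  # which makes the workerenv-* instances easier to identify in the AWS console.
  custom_tags = {
    "Team" = "data-platform"
  }

  idle_instance_autotermination_minutes = 20
}

resource "databricks_cluster" "example" {
  cluster_name            = "example-cluster"
  spark_version           = data.databricks_spark_version.this.id
  instance_pool_id        = databricks_instance_pool.example.id
  autotermination_minutes = 20

  autoscale {
    min_workers = 1
    max_workers = 3
  }

  custom_tags = {
    "Team" = "data-platform"
  }
}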