
[ISSUE] `databricks_group_member` is eventually consistent #2421

Open erniebilling opened 1 year ago

erniebilling commented 1 year ago

Configuration

// variables

variable "databricks_account_id" {
  type        = string
  description = "databricks account id"
}

variable "credentials_id" {
  type        = string
  description = "Databricks credentials id"
}

variable "vpc_name" {
  type        = string
  description = "Workspace VPC name"
}

variable "subnet_names" {
  type        = list(string)
  description = "List of workspace subnets name"
}

variable "aws_region" {
  type        = string
  description = "AWS region"
}

// default variables

variable "metastore_admin_groups" {
  type        = list(string)
  description = "Groups given metastore admin rights"
  default     = []
}

variable "metastore_admin_users" {
  type        = list(string)
  description = "Users that given metastore admin rights"
  default     = []
}

variable "app_domain" {
  type        = string
  description = "Appication domain this workspace represents"
  default     = "test"
}

variable "environment" {
  type        = string
  description = "The environment type: dev|test|stage|prod. You can also use values like dev1|temp|john-dev-2|smith"
  default     = "dev"
}

variable "tier" {
  type        = string
  description = "Used by cloud admins, used in tags, valid values: non-prod|prod"
  validation {
    condition     = contains(["non-prod", "prod"], var.tier)
    error_message = "The value must be a valid (non-prod|prod) tier!"
  }
  default = "non-prod"
}

variable "aws_resource_creation_wait_time" {
  type        = string
  description = "Some aws resources take time to be actually configured, using it right away causes error"
  default     = "10s"
}

# module "workspace" {
#   source = "git::ssh://git@github.anaplan.com/platform-data-services/terraform-databricks-pds-workspace.git?ref=v0.2.0"
#   providers = {
#     databricks.mws = databricks.mws
#   }
#   app_domain            = var.app_domain
#   aws_region            = var.aws_region
#   environment           = var.environment
#   databricks_account_id = var.databricks_account_id
#   credentials_id        = var.credentials_id
#   vpc_name              = var.pds_workspace_vpc_name
#   subnet_names          = var.pds_workspace_subnet_names
# }

// A module to create a databricks workspace

locals {
  workspace_name        = "${var.app_domain}-${var.aws_region}"
  aws_discriminator     = "ap-${local.workspace_name}"
  sg_egress_ports       = [443, 3306, 6666]
  sg_ingress_protocol   = ["tcp", "udp"]
  sg_egress_protocol    = ["tcp", "udp"]
  workspace_bucket_name = replace("${local.aws_discriminator}-${var.environment}-workspace", "_", "-")
  sg_name               = replace("${local.aws_discriminator}-${var.environment}-workspace-sg", "_", "-")
  metastore_name        = local.workspace_name // make it unique per region per environment
  metastore_admins      = "ms-admins-${local.workspace_name}"
  metastore_bucket_name = replace("${local.aws_discriminator}-${var.environment}-metastore", "_", "-")
}

// 1. Workspace Credential Configuration
// provided by var.credentials_id

// 2. Workspace Storage Configuration

# module "s3_bucket" {
#   source      = "git::ssh://git@github.anaplan.com/platform-data-services/terraform-aws-pds-private-s3-bucket.git?ref=v1.1.0"
#   bucket_name = local.table_bucket_name
#   // the following values are set to maintain the status quo, as per s3 module v0.0.5
#   // further discussion is needed to set the proper values
#   force_destroy     = true
#   enable_versioning = false
# }

resource "aws_s3_bucket" "workspace_bucket" {
  bucket        = local.workspace_bucket_name
  force_destroy = true
  tags = {
    Name = local.workspace_bucket_name
  }
}

// block public access
resource "aws_s3_bucket_public_access_block" "workspace_bucket_public_access" {
  bucket                  = aws_s3_bucket.workspace_bucket.id
  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
  depends_on              = [aws_s3_bucket.workspace_bucket]
}

resource "aws_s3_bucket_versioning" "workspace_bucket_versioning" {
  bucket = aws_s3_bucket.workspace_bucket.id
  versioning_configuration {
    status = "Disabled"
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "workspace_bucket_bucket_encryption" {
  bucket = aws_s3_bucket.workspace_bucket.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}

###

data "databricks_aws_bucket_policy" "workspace_bucket_policy" {
  provider = databricks.mws
  bucket   = aws_s3_bucket.workspace_bucket.bucket
}

resource "aws_s3_bucket_policy" "workspace_bucket_root_bucket_policy" {
  bucket = aws_s3_bucket.workspace_bucket.id
  policy = data.databricks_aws_bucket_policy.workspace_bucket_policy.json
}

resource "databricks_mws_storage_configurations" "workspace_storage_config" {
  provider                   = databricks.mws
  account_id                 = var.databricks_account_id
  storage_configuration_name = local.workspace_name
  bucket_name                = aws_s3_bucket.workspace_bucket.bucket
  depends_on                 = [aws_s3_bucket_policy.workspace_bucket_root_bucket_policy]
}

// 3. Networking
// The VPC, its subnets, and all gateways and firewalls were set up by the cloud admins
data "aws_vpc" "workspace_vpc" {
  filter {
    name   = "tag:Name"
    values = [var.vpc_name]
  }
}

data "aws_subnet" "workspace_subnets" {
  for_each = toset(var.subnet_names)

  vpc_id = data.aws_vpc.workspace_vpc.id

  filter {
    name   = "tag:Name"
    values = [each.key]
  }
}

resource "aws_security_group" "workspace_sg" {
  name = local.sg_name
  # Valid descriptions are strings less than 256 characters from the following set:  a-zA-Z0-9. _-:/()#,@[]+=&;{}!$*
  description = "Databricks workspace ${local.workspace_name} security group to allow inbound/outbound from the VPC subnet"
  vpc_id      = data.aws_vpc.workspace_vpc.id

  dynamic "ingress" {
    for_each = local.sg_ingress_protocol
    content {
      from_port = 0
      to_port   = 65535
      protocol  = ingress.value
      self      = true
    }
  }

  dynamic "egress" {
    for_each = local.sg_egress_protocol
    content {
      from_port = 0
      to_port   = 65535
      protocol  = egress.value
      self      = true
    }
  }

  dynamic "egress" {
    for_each = local.sg_egress_ports
    content {
      from_port   = egress.value
      to_port     = egress.value
      protocol    = "tcp"
      cidr_blocks = ["0.0.0.0/0"]
    }
  }

  tags = {
    Name = local.sg_name
  }
}

resource "databricks_mws_networks" "this" {
  provider           = databricks.mws
  account_id         = var.databricks_account_id
  network_name       = local.workspace_name
  vpc_id             = data.aws_vpc.workspace_vpc.id
  subnet_ids         = [for name, subnet in data.aws_subnet.workspace_subnets : subnet.id]
  security_group_ids = [aws_security_group.workspace_sg.id]
}

// 4. last step, the workspace
resource "databricks_mws_workspaces" "workspace" {
  provider                 = databricks.mws
  account_id               = var.databricks_account_id
  workspace_name           = local.workspace_name
  deployment_name          = local.workspace_name
  aws_region               = var.aws_region
  credentials_id           = var.credentials_id
  storage_configuration_id = databricks_mws_storage_configurations.workspace_storage_config.storage_configuration_id
  network_id               = databricks_mws_networks.this.network_id
}

# module "metastore" {
#   source = "git::ssh://git@github.anaplan.com/platform-data-services/terraform-databricks-pds-metastore.git?ref=v2.1.0"
#   providers = {
#     databricks.workspace = databricks.workspace
#     databricks.mws       = databricks.mws
#   }
#   workspace_discriminator         = module.workspace.workspace_name
#   environment                     = var.environment
#   databricks_account_id           = var.databricks_account_id
#   databricks_workspace_ids        = { primary = module.workspace.workspace_id }
#   aws_resource_creation_wait_time = var.aws_resource_creation_wait_time
#   metastore_admins                = var.metastore_admins
# }

// Create a databricks metastore

# module "s3_bucket" {
#   source      = "git::ssh://git@github.anaplan.com/platform-data-services/terraform-aws-pds-private-s3-bucket.git?ref=v1.1.0"
#   bucket_name = local.table_bucket_name
#   // the following values are set to maintain the status quo, as per s3 module v0.0.5
#   // further discussion is needed to set the proper values
#   force_destroy     = true
#   enable_versioning = false
# }

resource "aws_s3_bucket" "metastore_bucket" {
  bucket        = local.metastore_bucket_name
  force_destroy = true
  tags = {
    Name = local.metastore_bucket_name
  }
}

// block public access
resource "aws_s3_bucket_public_access_block" "metastore_bucket_public_access" {
  bucket                  = aws_s3_bucket.metastore_bucket.id
  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
  depends_on              = [aws_s3_bucket.metastore_bucket]
}

resource "aws_s3_bucket_versioning" "metastore_bucket_versioning" {
  bucket = aws_s3_bucket.metastore_bucket.id
  versioning_configuration {
    status = "Disabled"
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "metastore_bucket_bucket_encryption" {
  bucket = aws_s3_bucket.metastore_bucket.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}

###
# module "metastore_iam_role" {
#   source                = "./modules/aws/iam-role"
#   name                  = "metastore"
#   aws_discriminator     = local.aws_discriminator
#   environment           = var.environment
#   metastore_bucket_arn  = module.s3_bucket.bucket_arn
#   databricks_account_id = var.databricks_account_id
# }

// generate a policy document to give databricks role the ability to assume our metastore role
data "aws_iam_policy_document" "passrole_for_unity_catalog" {
  statement {
    effect  = "Allow"
    actions = ["sts:AssumeRole"]
    principals {
      // hard coded iam role as per https://docs.databricks.com/data-governance/unity-catalog/get-started.html#configure-a-storage-bucket-and-iam-role-in-aws
      identifiers = ["arn:aws:iam::414351767826:role/unity-catalog-prod-UCMasterRole-14S5ZJVKOTYTL"]
      type        = "AWS"
    }
    condition {
      test     = "StringEquals"
      variable = "sts:ExternalId"
      values   = [var.databricks_account_id]
    }
  }
}

locals {
  sample_data_policy_name    = "${local.aws_discriminator}-${var.environment}-metastore-sample-data"
  metastore_data_policy_name = "${local.aws_discriminator}-${var.environment}-metastore-data"
  role_name                  = "${local.aws_discriminator}-${var.environment}-metastore-data-role"
}

// Nice to have: only required if the sample datasets at https://docs.databricks.com/data/databricks-datasets.html are needed (e.g. for databricks tutorials).
resource "aws_iam_policy" "sample_data" {
  name = local.sample_data_policy_name
  policy = jsonencode({
    Version = "2012-10-17"
    Id      = "sample-data"
    Statement = [
      {
        "Action" : [
          "s3:GetObject",
          "s3:GetObjectVersion",
          "s3:ListBucket",
          "s3:GetBucketLocation"
        ],
        "Resource" : [
          "arn:aws:s3:::databricks-datasets-oregon/*",
          "arn:aws:s3:::databricks-datasets-oregon"

        ],
        "Effect" : "Allow"
      }
    ]
  })
  tags = {
    Name = local.sample_data_policy_name
  }
  // TODO: tags
}

// grant the metastore iam role access to the s3 bucket
resource "aws_iam_policy" "metastore_data" {
  name = local.metastore_data_policy_name
  policy = jsonencode({
    Version = "2012-10-17"
    Id      = "metastore-data"
    Statement = [
      {
        "Action" : [
          "s3:GetObject",
          "s3:GetObjectVersion",
          "s3:PutObject",
          "s3:PutObjectAcl",
          "s3:DeleteObject",
          "s3:ListBucket",
          "s3:GetBucketLocation"
        ],
        "Resource" : [
          aws_s3_bucket.metastore_bucket.arn,
          "${aws_s3_bucket.metastore_bucket.arn}/*"
        ],
        "Effect" : "Allow"
      }
    ]
  })
  tags = {
    Name = local.metastore_data_policy_name
  }
}

resource "aws_iam_role" "this" {
  name                = local.role_name
  assume_role_policy  = data.aws_iam_policy_document.passrole_for_unity_catalog.json
  managed_policy_arns = [aws_iam_policy.sample_data.arn, aws_iam_policy.metastore_data.arn] // Every time we create a metastore, a new policy is attached
  tags = {
    Name = local.role_name
  }
}

###
// Recommended by databricks:
// Give aws more time to finish the iam role creation and policy attachment 
// I saw this error once: "the AWS IAM role in the storage credential is not configured correctly."
// Ref: https://registry.terraform.io/providers/databricks/databricks/latest/docs/guides/aws-workspace
resource "time_sleep" "wait" {
  depends_on      = [aws_iam_role.this]
  create_duration = var.aws_resource_creation_wait_time
}

// metastore admin group
# module "metastore_admins" {
#   source = "git::ssh://git@github.anaplan.com/platform-data-services/terraform-databricks-pds-user-group.git?ref=v2.0.0"
#   providers = {
#     databricks.mws       = databricks.mws
#     databricks.workspace = databricks.workspace
#   }
#   group_name             = local.metastore_admins
#   users                  = var.metastore_admins.users
#   groups                 = concat(["global metastore admins"], var.metastore_admins.groups)
#   add_group_to_workspace = false
# }

resource "databricks_group" "metastore_admins" {
  provider     = databricks.mws
  display_name = local.metastore_admins
}

data "databricks_user" "group_users" {
  provider  = databricks.mws
  for_each  = toset(var.metastore_admin_users)
  user_name = each.key
}

resource "databricks_group_member" "user_group_member" {
  provider  = databricks.mws
  group_id  = databricks_group.metastore_admins.id
  for_each  = toset(var.metastore_admin_users)
  member_id = data.databricks_user.group_users[each.key].id
}

data "databricks_group" "group_users" {
  provider     = databricks.mws
  for_each     = toset(var.metastore_admin_groups)
  display_name = each.key
}

resource "databricks_group_member" "group_group_member" {
  provider  = databricks.mws
  group_id  = databricks_group.metastore_admins.id
  for_each  = toset(var.metastore_admin_groups)
  member_id = data.databricks_group.group_users[each.key].id
}

###

resource "databricks_metastore" "this" {
  provider      = databricks.workspace
  name          = local.metastore_name
  storage_root  = "s3://${aws_s3_bucket.metastore_bucket.id}/metastore"
  force_destroy = true
  owner         = local.metastore_admins
  depends_on    = [databricks_group.metastore_admins]
}

resource "databricks_metastore_assignment" "default_metastore" {
  provider             = databricks.workspace
  workspace_id         = databricks_mws_workspaces.workspace.workspace_id
  metastore_id         = databricks_metastore.this.id
  default_catalog_name = "main"
}

resource "databricks_metastore_data_access" "this" {
  provider = databricks.workspace
  // wait for the iam role to be actually created and for the metastore to be assigned
  depends_on   = [time_sleep.wait, databricks_metastore_assignment.default_metastore]
  metastore_id = databricks_metastore.this.id
  name         = "${local.workspace_name}-data-access"
  aws_iam_role {
    role_arn = aws_iam_role.this.arn
  }
  is_default = true
}

### external tables

locals {
  db_name           = "test-db"
  table_bucket_name = replace("${local.aws_discriminator}-${var.environment}-delta-${local.db_name}", "_", "-")
  medallions        = ["gold", "silver", "bronze"]
}

# module "bucket" {
#   source      = "git::ssh://git@github.anaplan.com/platform-data-services/terraform-aws-pds-private-s3-bucket.git?ref=v1.1.0"
#   bucket_name = local.table_bucket_name
#   // the following values are set to maintain the status quo, as per s3 module v0.0.5
#   // further discussion is needed to set the proper values
#   force_destroy     = true
#   enable_versioning = false
# }

resource "aws_s3_bucket" "table_bucket" {
  bucket        = local.table_bucket_name
  force_destroy = true
  tags = {
    Name = local.table_bucket_name
  }
}

// block public access
resource "aws_s3_bucket_public_access_block" "public_access" {
  bucket                  = aws_s3_bucket.table_bucket.id
  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
  depends_on              = [aws_s3_bucket.table_bucket]
}

resource "aws_s3_bucket_versioning" "versioning" {
  bucket = aws_s3_bucket.table_bucket.id
  versioning_configuration {
    status = "Disabled"
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "bucket_encryption" {
  bucket = aws_s3_bucket.table_bucket.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}

###
# module "iam_policy" {
#   source                = "./modules/aws/iam/metastore-external-table-role"
#   databricks_account_id = var.databricks_account_id
#   bucket_name           = local.table_bucket_name
#   prefix                = replace("${local.aws_discriminator}-${var.environment}-${local.db_name}", "_", "-")
# }

locals {
  prefix       = replace("${local.aws_discriminator}-${var.environment}-${local.db_name}", "_", "-")
  catalog_name = "test-cat"
}

// generate a policy document to give databricks role the ability to assume our metastore role
data "aws_iam_policy_document" "passrole_for_external_data_bucket" {
  statement {
    effect  = "Allow"
    actions = ["sts:AssumeRole"]
    principals {
      // hard coded iam role as per https://docs.databricks.com/data-governance/unity-catalog/get-started.html#configure-a-storage-bucket-and-iam-role-in-aws
      identifiers = ["arn:aws:iam::414351767826:role/unity-catalog-prod-UCMasterRole-14S5ZJVKOTYTL"]
      type        = "AWS"
    }
    condition {
      test     = "StringEquals"
      variable = "sts:ExternalId"
      values   = [var.databricks_account_id]
    }
  }
}

locals {
  external_bucket_policy_name = "${local.prefix}-data"
}

// grant the metastore iam role access to the s3 bucket
resource "aws_iam_policy" "bucket_policy" {
  name = local.external_bucket_policy_name
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        "Action" : [
          "s3:GetObject",
          "s3:GetObjectVersion",
          "s3:PutObject",
          "s3:PutObjectAcl",
          "s3:DeleteObject",
          "s3:ListBucket",
          "s3:GetBucketLocation"
        ],
        "Resource" : [
          "arn:aws:s3:::${local.table_bucket_name}",
          "arn:aws:s3:::${local.table_bucket_name}/*"
        ],
        "Effect" : "Allow"
      }
    ]
  })
  tags = {
    Name = local.external_bucket_policy_name
  }
}

resource "aws_iam_role" "table_bucket" {
  name                = local.prefix
  assume_role_policy  = data.aws_iam_policy_document.passrole_for_external_data_bucket.json
  managed_policy_arns = [aws_iam_policy.bucket_policy.arn] // Every time we create a metastore, a new policy is attached
  tags = {
    Name = local.prefix
  }
}

###

// Recommended by databricks:
// Give aws more time to finish the iam role creation and policy attachment 
// I saw this error once: "the AWS IAM role in the storage credential is not configured correctly."
// Ref: https://registry.terraform.io/providers/databricks/databricks/latest/docs/guides/aws-workspace
resource "time_sleep" "wait_table_bucket" {
  depends_on      = [aws_iam_role.table_bucket]
  create_duration = var.aws_resource_creation_wait_time
}

// create unity catalog resources
resource "databricks_storage_credential" "cred" {
  provider   = databricks.workspace
  depends_on = [time_sleep.wait_table_bucket]
  name       = replace(local.db_name, "_", "-")
  aws_iam_role {
    role_arn = aws_iam_role.table_bucket.arn
  }
  comment = "For ${local.table_bucket_name}"
}

resource "databricks_external_location" "location" {
  provider        = databricks.workspace
  name            = replace(local.db_name, "_", "-")
  url             = "s3://${local.table_bucket_name}"
  credential_name = databricks_storage_credential.cred.id
  comment         = "For ${local.table_bucket_name}"
}

resource "databricks_catalog" "catalog" {
  provider = databricks.workspace
  name     = local.catalog_name
  comment  = "this catalog is managed by terraform"
}

resource "databricks_schema" "schema" {
  provider     = databricks.workspace
  for_each     = toset(local.medallions)
  catalog_name = local.catalog_name
  name         = each.key
  comment      = "this database is managed by terraform"
  properties = {
    environment = var.environment
    s3_bucket   = local.table_bucket_name
    medallion   = each.key
    database    = local.db_name
  }
  depends_on = [databricks_catalog.catalog]
}

Expected Behavior

terraform apply succeeds; the workspace, metastore, and external tables are created successfully

Actual Behavior

Table creation often fails with permission failures like:

Error: cannot create catalog: User does not have CREATE CATALOG on Metastore 'pds-us-west-2'. Using basic auth: host=https://anaplan-dev-pds-us-west-2.cloud.databricks.com/, username=mohamed.elgemaiey@anaplan.com, password=***REDACTED***, account_id=990c3a09-3de8-48c1-95a9-021a6ef83066

  with module.apps.module.glue[0].module.glue_dbs.databricks_catalog.catalog
  on .terraform/modules/apps.glue.glue_dbs/main.tf line 48, in resource "databricks_catalog" "catalog":
  resource "databricks_catalog" "catalog" {

Steps to Reproduce

In an existing Databricks account:

Configure DATABRICKS_USERNAME and DATABRICKS_PASSWORD for an account with admin rights, and configure AWS credentials (AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, etc.).

Run terraform plan -out tf.plan and enter values for aws_region, databricks_account_id, credentials_id, vpc_name and subnet_names when prompted.

Run terraform apply tf.plan.
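
Note that the configuration above references provider aliases databricks.mws and databricks.workspace that are not included in the issue. A minimal sketch of how they might be declared, assuming account-level basic auth via the environment variables above; the host values are assumptions based on the provider's documented conventions, not taken from the original configuration:

// Account-level provider, used for databricks_mws_* resources and account-level users/groups
provider "databricks" {
  alias      = "mws"
  host       = "https://accounts.cloud.databricks.com"
  account_id = var.databricks_account_id
  // DATABRICKS_USERNAME / DATABRICKS_PASSWORD are read from the environment
}

// Workspace-level provider, used for the metastore, storage credential, catalog and schema resources
provider "databricks" {
  alias = "workspace"
  host  = databricks_mws_workspaces.workspace.workspace_url
}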

Terraform and provider versions

Terraform v1.3.9 on linux_amd64

Debug Output

tf-debug.log

Important Factoids

This seems to be timing related: retrying the plan/apply eventually works. We really need it to work the first time.
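
If the account-level group membership is indeed eventually consistent, one possible workaround (until the provider handles the retry itself) is to reuse the time_sleep pattern already used for the IAM roles and wait between the group membership resources and the first resource that relies on the group's metastore-owner rights. A rough sketch against the resource names in the configuration above; the wiring and duration are illustrative, not a confirmed fix:

// Illustrative: let the account-level group membership propagate before it is relied upon
resource "time_sleep" "wait_group_membership" {
  depends_on = [
    databricks_group_member.user_group_member,
    databricks_group_member.group_group_member,
  ]
  create_duration = var.aws_resource_creation_wait_time
}

// The catalog then waits on the sleep instead of racing the membership propagation
resource "databricks_catalog" "catalog" {
  provider   = databricks.workspace
  name       = local.catalog_name
  comment    = "this catalog is managed by terraform"
  depends_on = [time_sleep.wait_group_membership]
}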

aravishdatabricks commented 1 year ago

was able to reproduce this behavior as well

Initial failure:

databricks_catalog.catalog: Creating...
databricks_storage_credential.cred: Creating...
databricks_storage_credential.cred: Creation complete after 1s [id=test-db]
databricks_external_location.location: Creating...
databricks_external_location.location: Creation complete after 1s [id=test-db]
╷
│ Error: cannot create catalog: User does not have CREATE CATALOG on Metastore 'test-us-west-2'.
│
│   with databricks_catalog.catalog,
│   on main.tf line 546, in resource "databricks_catalog" "catalog":
│  546: resource "databricks_catalog" "catalog" {


Apply again after a few seconds

databricks_catalog.catalog: Creating...
databricks_catalog.catalog: Creation complete after 2s [id=test-cat]
databricks_schema.schema["gold"]: Creating...
databricks_schema.schema["silver"]: Creating...
databricks_schema.schema["bronze"]: Creating...
databricks_schema.schema["bronze"]: Creation complete after 0s [id=test-cat.bronze]
databricks_schema.schema["silver"]: Creation complete after 0s [id=test-cat.silver]
databricks_schema.schema["gold"]: Creation complete after 0s [id=test-cat.gold]

Apply complete! Resources: 4 added, 0 changed, 0 destroyed.

erniebilling commented 1 year ago

Removing the metastore owner (a group is created and assigned as owner) fixes the problem. Instead, we let the principal running terraform be the owner, and the group just gets full permissions on the metastore.
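
In other words, the metastore is created without an owner attribute (so the principal running terraform owns it) and the admin group is granted its privileges explicitly. A rough sketch of that shape using databricks_grants; the exact privilege list is illustrative:

// Metastore owned by the principal running terraform; CREATE CATALOG no longer
// depends on the group membership having propagated
resource "databricks_metastore" "this" {
  provider      = databricks.workspace
  name          = local.metastore_name
  storage_root  = "s3://${aws_s3_bucket.metastore_bucket.id}/metastore"
  force_destroy = true
}

// The admin group gets its permissions through an explicit grant instead of ownership
resource "databricks_grants" "metastore_admins" {
  provider  = databricks.workspace
  metastore = databricks_metastore.this.id
  grant {
    principal  = databricks_group.metastore_admins.display_name
    privileges = ["CREATE_CATALOG", "CREATE_EXTERNAL_LOCATION", "CREATE_STORAGE_CREDENTIAL"]
  }
}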