databricks / terraform-provider-databricks

Databricks Terraform Provider
https://registry.terraform.io/providers/databricks/databricks/latest
Other
444 stars 384 forks source link

[ISSUE] Issue with `databricks_metastore_assignment` resource - No metastore assigned for the current workspace #1832

Closed alias-santi closed 1 year ago

alias-santi commented 1 year ago

Configuration

providers.tf

# Provider requirements for this configuration (listed alphabetically).
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 3.34"
    }
    databricks = {
      source = "databricks/databricks"
    }
  }
}

# Account-level (MWS) provider: authenticates against the Databricks account
# console and is used for account-scoped lookups (e.g. the SCIM groups below).
provider "databricks" {
  alias      = "mws"
  host       = "https://accounts.cloud.databricks.com"
  account_id = "<account_id>"
  username   = "<username>"
  password   = "<password>"
}

# https://discuss.hashicorp.com/t/databricks-unity-catalog-account-vs-workspace-level-understanding/42570#:~:text=Unity%20Catalog%20API,as%20the%20host.
# Per the above link, a workspace-level provider is currently needed for
# creating Unity Catalog resources; this will most likely be fixed in API 2.1,
# at which point refactors will be required (to use the mws account-level
# provider only).
provider "databricks" {
  alias    = "workspace"
  host     = "<workspace_url>"
  username = "<username>"
  password = "<password>"
}

data.tf

#-------------------------------------------
# data.tf
# Retrieve common data sources from the current AWS account.
#--------------------------------------------

# Account ID of the calling AWS identity.
data "aws_caller_identity" "current" {}

# Current AWS account alias (unused; kept for reference).
#data "aws_iam_account_alias" "current" {}

# Availability zones available in the configured region.
data "aws_availability_zones" "available" {}

# Region of the configured AWS provider (used to build local.region_short).
data "aws_region" "current" {}

# Groups are automatically added by SCIM at an account level, so we read them
# here as data sources (via the account-level mws provider) rather than
# creating them.
data "databricks_group" "data_engineer" {
  provider = databricks.mws
  display_name = "data-engineer"
}

data "databricks_group" "devops_admin" {
  provider = databricks.mws
  display_name = "devops-admin"
}

data "databricks_group" "data_scientist" {
  provider = databricks.mws
  display_name = "data-scientist"
}

locals.tf

#-----------------------------------
# locals.tf
#-----------------------------------

locals {
  # Toggle to enable/disable creation of the IAM role and policy layer.
  create_layer = var.create_layer

  # Tags applied to every taggable resource in this configuration.
  common_tags = {
    terraform = "true"
  }

  # Short region code: first letter of each hyphenated part,
  # e.g. "us-west-2" -> "uw2".
  region_short         = join("", [for r in split("-", data.aws_region.current.name) : substr(r, 0, 1)])
  account_alias_number = "sso001"

  # S3 bucket that backs the Unity Catalog metastore.
  bucket_name           = "<bucket_name>"
  bucket_acl            = "private"
  bucket_grant          = []
  bucket_lifecycle_rule = []

  # IAM role assumed by Unity Catalog; the trusted ARN is the Databricks-owned
  # Unity Catalog master role (fixed Databricks AWS account 414351767826).
  uc_role_name         = "<aws_role_name>"
  uc_trusted_role_arns = "arn:aws:iam::414351767826:role/unity-catalog-prod-UCMasterRole-14S5ZJVKOTYTL"

  # Metastore name, e.g. "uw2-poc".
  uc_metastore_name = "${local.region_short}-poc"
}

main.tf

#-----------------------------------
# main.tf
#-----------------------------------
#-------------------------------------------
# S3 Bucket - Managed table storage for Unity Catalog
#--------------------------------------------
module "unity_catalog_metastore" {
  source = "terraform-aws-modules/s3-bucket/aws"

  create_bucket = true
  bucket        = local.bucket_name
  # Default server-side encryption with S3-managed keys (SSE-S3 / AES256).
  server_side_encryption_configuration = {
    rule = {
      apply_server_side_encryption_by_default = {
        sse_algorithm = "AES256"
      }
    }
  }

  # Keep object versions so metastore data can be recovered.
  versioning = {
    enabled = true
  }

  # Block all forms of public access to the bucket.
  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true

  # bucket ACL grants, e.g. cross-account access using a canonical id
  acl   = local.bucket_acl
  grant = local.bucket_grant

  # bucket lifecycle rules (none by default; see locals)
  lifecycle_rule = local.bucket_lifecycle_rule

  # bucket policy (currently disabled)
  # attach_policy = true
  # policy        = data.aws_iam_policy_document.bucket_policy.json

  tags = local.common_tags
}

#-----------------------------------------------------
# Assumable IAM role for Databricks Unity Catalog
#-----------------------------------------------------

# IAM role that Unity Catalog assumes to access the metastore bucket. The
# trusted principal is the Databricks-owned Unity Catalog master role
# (see local.uc_trusted_role_arns).
module "uc_assumable_role" {
  source                  = "terraform-aws-modules/iam/aws//modules/iam-assumable-role"
  version                 = "~> 4.3"
  create_role             = local.create_layer
  role_name               = "${local.uc_role_name}-role"
  role_path               = "/"
  role_description        = "${local.uc_role_name} role"
  role_requires_mfa       = false
  # NOTE(review): presumably the external ID here is the Databricks account ID
  # used when registering the storage credential — confirm against the Unity
  # Catalog setup documentation.
  role_sts_externalid     = "<account_id>"
  trusted_role_arns       = [local.uc_trusted_role_arns]
  custom_role_policy_arns = [module.uc_assumable_role_policy.arn]
}

# Managed IAM policy granting Unity Catalog access to the metastore bucket;
# attached to the assumable role above via custom_role_policy_arns.
module "uc_assumable_role_policy" {
  source        = "terraform-aws-modules/iam/aws//modules/iam-policy"
  version       = "~> 4.3"
  create_policy = local.create_layer
  name          = "${local.uc_role_name}-policy"
  path          = "/"
  description   = "${local.uc_role_name} policy"
  policy        = data.aws_iam_policy_document.uc_policy_document.json
  tags          = local.common_tags
}

# Policy document for the Unity Catalog role: read/write on the metastore
# bucket, read-only on the public Databricks sample datasets bucket.
data "aws_iam_policy_document" "uc_policy_document" {
  # Read/write/list on the metastore bucket and all objects within it.
  statement {
    sid = "AllowUCS3MetastoreAccess1"
    actions = [
      "s3:GetObject",
      "s3:GetObjectVersion",
      "s3:PutObject",
      "s3:PutObjectAcl",
      "s3:DeleteObject",
      "s3:ListBucket",
      "s3:GetBucketLocation"
    ]
    resources = [
      module.unity_catalog_metastore.s3_bucket_arn,
      "${module.unity_catalog_metastore.s3_bucket_arn}/*",
    ]
  }
  # Read-only access to the Databricks-hosted sample datasets bucket.
  statement {
    sid = "AllowUCS3SampleDataAccess1"
    actions = [
      "s3:GetObject",
      "s3:GetObjectVersion",
      "s3:ListBucket",
      "s3:GetBucketLocation"
    ]
    resources = [
      "arn:aws:s3:::databricks-datasets-oregon",
      "arn:aws:s3:::databricks-datasets-oregon/*",
    ]
  }
}

#-------------------------------------------
# Databricks Metastore
#--------------------------------------------
# Due to a bug in the Terraform AWS provider (spotted in v3.28) the Databricks
# AWS crossaccount policy creation and attachment to the IAM role takes longer
# than the AWS request confirmation to Terraform. As Terraform continues
# creating the Workspace, validation checks for the credentials are failing, as
# the policy doesn't get applied quick enough. Showing the error:
# Error: MALFORMED_REQUEST: Failed credentials validation check
#
# As a workaround give the aws_iam_role more time to be created with a
# time_sleep resource, which you need to add as a dependency to the
# databricks_mws_workspaces resource.
#
# Ref: https://registry.terraform.io/providers/databrickslabs/databricks/latest/docs/guides/aws-workspace#credentials-validation-checks-errors
#
# NOTE(review): nothing in this configuration lists this time_sleep in a
# depends_on, so the delay currently serializes nothing — the resource that
# needs the role (per the comment above) should reference it explicitly.
resource "time_sleep" "wait_for_cross_account_role" {
  depends_on = [
    module.uc_assumable_role
  ]
  create_duration = "20s"
}

# Unity Catalog metastore backed by the S3 bucket created above. Created via
# the workspace-level provider (see the providers.tf comment on UC API limits).
resource "databricks_metastore" "this" {
  depends_on = [
    module.unity_catalog_metastore,
  ]
  provider      = databricks.workspace
  name          = local.uc_metastore_name
  storage_root  = "s3://${module.unity_catalog_metastore.s3_bucket_id}/metastore"
  owner         = data.databricks_group.devops_admin.display_name
  # Allow `terraform destroy` to delete the metastore even if it holds objects.
  force_destroy = true
}

# Pause after metastore creation so the metastore_summary API has time to
# reflect the new metastore before the workspace assignments read it back.
resource "time_sleep" "wait_for_metastore" {
  create_duration = "20s"

  depends_on = [
    databricks_metastore.this
  ]
}

# Assign the metastore to each workspace ID supplied via var. compact() drops
# empty-string entries so a blank/unset element cannot produce an assignment
# keyed "" (the error output above shows default_metastore[""], i.e. an empty
# ID reached for_each).
resource "databricks_metastore_assignment" "default_metastore" {
  depends_on = [
    time_sleep.wait_for_metastore
  ]
  provider             = databricks.workspace
  for_each             = toset(compact(var.databricks_workspace_ids))
  workspace_id         = each.key
  metastore_id         = databricks_metastore.this.id
  default_catalog_name = "hive_metastore"
}

Expected Behavior

Metastore assignments to have been applied successfully

Actual Behavior

╷ │ Error: cannot read metastore assignment: No metastore assigned for the current workspace. │ │ with databricks_metastore_assignment.default_metastore[""], │ on main.tf line 185, in resource "databricks_metastore_assignment" "default_metastore": │ 185: resource "databricks_metastore_assignment" "default_metastore" { │ ╵

Steps to Reproduce

Terragrunt apply (we're using terragrunt as a wrapper, although the issue persists with a usual terraform apply when supplying tfvars)

Terraform and provider versions

Terraform v1.0.9 on darwin_arm64

Debug Output

2022-12-07T14:23:36.115Z [DEBUG] databricks_metastore_assignment.default_metastore["<workspace_id_1>"]: applying the planned Create change
2022-12-07T14:23:36.115Z [DEBUG] databricks_metastore_assignment.default_metastore["<workspace_id_2>"]: applying the planned Create change
2022-12-07T14:23:36.115Z [DEBUG] provider.terraform-provider-databricks_v1.6.5: PUT /api/2.1/unity-catalog/workspaces/<workspace_id_1>/metastore {
  "default_catalog_name": "hive_metastore",
  "metastore_id": "d17f4181-5296-42d3-8329-61b8d6bcf354",
  "workspace_id": <workspace_id_1>
}: timestamp=2022-12-07T14:23:36.115Z
2022-12-07T14:23:36.182Z [DEBUG] provider.terraform-provider-databricks_v1.6.5: PUT /api/2.1/unity-catalog/workspaces/<workspace_id_2>/metastore {
  "default_catalog_name": "hive_metastore",
  "metastore_id": "d17f4181-5296-42d3-8329-61b8d6bcf354",
  "workspace_id": <workspace_id_2>
}: timestamp=2022-12-07T14:23:36.182Z
2022-12-07T14:23:36.569Z [DEBUG] provider.terraform-provider-databricks_v1.6.5: 200 OK  {} <- PUT /api/2.1/unity-catalog/workspaces/<workspace_id_1>/metastore: timestamp=2022-12-07T14:23:36.569Z
2022-12-07T14:23:36.570Z [DEBUG] provider.terraform-provider-databricks_v1.6.5: GET /api/2.1/unity-catalog/metastore_summary: timestamp=2022-12-07T14:23:36.569Z
2022-12-07T14:23:36.790Z [DEBUG] provider.terraform-provider-databricks_v1.6.5: 200 OK  {} <- PUT /api/2.1/unity-catalog/workspaces/<workspace_id_2>/metastore: timestamp=2022-12-07T14:23:36.788Z
2022-12-07T14:23:36.790Z [DEBUG] provider.terraform-provider-databricks_v1.6.5: GET /api/2.1/unity-catalog/metastore_summary: timestamp=2022-12-07T14:23:36.788Z
2022-12-07T14:23:36.877Z [DEBUG] provider.terraform-provider-databricks_v1.6.5: 404 Not Found {
  "details": [
    {
      "@type": "type.googleapis.com/google.rpc.RequestInfo",
      "request_id": "a55537c0-fcc4-42d7-92cb-a74a37375897",
      "serving_data": ""
    }
  ],
  "error_code": "METASTORE_DOES_NOT_EXIST",
  "message": "No metastore assigned for the current workspace."
}: timestamp=2022-12-07T14:23:36.877Z

Important Factoids

This seems to be an intermittent issue from testing when destroying and re-applying resources. In this POC, we're supplying two workspace IDs as a var to do the metastore assignment. It would appear as though the PUT request to do the assignment works fine, but the read operation that the provider does afterwards to validate successful creation periodically comes back with a 404, as if the metastore_summary API hasn't quite updated in time to show the newly assigned metastore.

We're using a workspace provider that points to one of the workspace URLs, per the comments we found in online digging - https://discuss.hashicorp.com/t/databricks-unity-catalog-account-vs-workspace-level-understanding/42570#:~:text=Unity%20Catalog%20API,as%20the%20host. The issue persists even when using dedicated providers per workspace and not provisioning the resource with a for_each block.

Is this an expected behaviour? This feels like there needs to be a slight delay between the create and read operations to give the metastore_summary endpoint enough time to reflect new changes or the metastore_summary api itself maybe is the issue?

For now, we just run a re-apply and all is well but looking for any advice/comments on this suspected bug!

Thanks

nkvuong commented 1 year ago

@alias-santi the metastore_assignment resource is unintuitive right now (since the metastore_summary API does not give all the necessary information)

Platform team is building an account-level API that should resolve this, but we are still waiting on an ETA for that

alias-santi commented 1 year ago

@nkvuong thanks for coming back. We assumed it would be something like that. For now we can live with the issue and add a retry on a pipeline etc. for the apply, as it usually comes back fine on the second apply.

nkvuong commented 1 year ago

This should be fixed in 1.25.0