hashicorp / terraform-provider-google

Terraform Provider for Google Cloud Platform
https://registry.terraform.io/providers/hashicorp/google/latest/docs
Mozilla Public License 2.0

Unable to cleanly delete resources with google_gke_hub_feature_membership resource #10258

Open satata-clgx opened 3 years ago

satata-clgx commented 3 years ago


Terraform Version

terraform --version
Terraform v0.13.6

Affected Resource(s)

google_gke_hub_feature_membership

Terraform Configuration Files

resource "google_gke_hub_membership" "gke_hub" {
  count = 1
  provider = google-beta
  membership_id = module.gke.name
  endpoint {
    gke_cluster {
      resource_link = "//container.googleapis.com/projects/${var.project_id}/locations/${module.gke.location}/clusters/${module.gke.name}"
    }
  }
}

resource "google_gke_hub_feature_membership" "feature_member" {
  count = 1
  location = "global"
  feature = "configmanagement"
  membership = google_gke_hub_membership.gke_hub[0].membership_id
  configmanagement {
    version = "1.9.0"
    config_sync {
      git {
        sync_repo = var.sync_repo
        sync_branch = var.sync_branch
        secret_type = var.secret_type
        policy_dir = var.sync_dir
      }
    }
    policy_controller {
      enabled = true
      exemptable_namespaces = ["asm-system", "asm-gateways", "config-management-system", "default", "gatekeeper-system", "gke-connect", "istio-system", "kube-node-lease", "kube-public", "kube-system", "kf"]
      template_library_installed = true
      referential_rules_enabled = true
    }
  }
  provider = google-beta
}

Expected Behavior

When the google_gke_hub_feature_membership resource is destroyed, Terraform should remove the config-management-system and gatekeeper-system namespaces (and their workloads) from the Kubernetes cluster.

Actual Behavior

Terraform is unable to remove the config-management-system and gatekeeper-system namespaces in Kubernetes; their workloads are still running after the destroy completes.

This is the output of the terraform plan stage:

# module.acme-anthos-cluster.google_gke_hub_feature_membership.feature_member[0] will be created
+ resource "google_gke_hub_feature_membership" "feature_member" {
    + feature    = "configmanagement"
    + id         = (known after apply)
    + location   = "global"
    + membership = "acme-gke-us-w1-sbx"
    + project    = (known after apply)
    + configmanagement {
        + version = "1.9.0"
        + config_sync {
            + git {
                + policy_dir  = "clusters/acm/sbx"
                + secret_type = "ssh"
                + sync_branch = "master"
                + sync_repo   = "REDACTED"
              }
          }
        + policy_controller {
            + enabled                    = true
            + exemptable_namespaces      = [
                + "asm-gateways",
                + "asm-system",
                + "config-management-system",
                + "default",
                + "gatekeeper-system",
                + "gke-connect",
                + "istio-system",
                + "kube-node-lease",
                + "kube-public",
                + "kube-system",
              ]
            + referential_rules_enabled  = true
            + template_library_installed = true
          }
      }
  }

Plan: 1 to add, 0 to change, 0 to destroy.

This is the output of the terraform destroy stage:

  # module.acme-anthos-cluster.google_gke_hub_feature_membership.feature_member[0] will be destroyed
 - resource "google_gke_hub_feature_membership" "feature_member" {
     - feature    = "configmanagement" -> null
     - id         = "projects/acme-anthos/locations/global/features/configmanagement/membershipId/acme-gke-us-w1-sbx" -> null
     - location   = "global" -> null
     - membership = "acme-gke-us-w1-sbx" -> null
     - project    = "acme-anthos" -> null

     - configmanagement {
         - version = "1.9.0" -> null

         - config_sync {
             - git {
                 - policy_dir  = "clusters/acm/sbx" -> null
                 - secret_type = "ssh" -> null
                 - sync_branch = "master" -> null
                 - sync_repo   = "REDACTED" -> null
               }
           }

         - policy_controller {
             - enabled                    = true -> null
             - exemptable_namespaces      = [
                 - "asm-gateways",
                 - "asm-system",
                 - "config-management-system",
                 - "default",
                 - "gatekeeper-system",
                 - "gke-connect",
                 - "istio-system",
                 - "kube-node-lease",
                 - "kube-public",
                 - "kube-system",
               ] -> null
             - log_denies_enabled         = false -> null
             - referential_rules_enabled  = true -> null
             - template_library_installed = true -> null
           }
       }
   }

Plan: 0 to add, 0 to change, 1 to destroy.

Kubernetes resources still present after the deletion:

 kubectl get all -n gatekeeper-system                                                               
 NAME                                                READY   STATUS    RESTARTS   AGE
 pod/gatekeeper-audit-db4ff4877-9fzl8                1/1     Running   0          16h
 pod/gatekeeper-controller-manager-966986cb6-4bcnf   1/1     Running   0          16h

 NAME                                 TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)   AGE
 service/gatekeeper-webhook-service   ClusterIP   100.110.128.20   <none>        443/TCP   16h

 NAME                                            READY   UP-TO-DATE   AVAILABLE   AGE
 deployment.apps/gatekeeper-audit                1/1     1            1           16h
 deployment.apps/gatekeeper-controller-manager   1/1     1            1           16h

 NAME                                                      DESIRED   CURRENT   READY   AGE
 replicaset.apps/gatekeeper-audit-db4ff4877                1         1         1       16h
 replicaset.apps/gatekeeper-controller-manager-966986cb6   1         1         1       16h

 kubectl get all -n config-management-system                                                            
 NAME                                              READY   STATUS    RESTARTS   AGE
 pod/admission-webhook-7c78f64c98-4bqtx            1/1     Running   81         16h
 pod/admission-webhook-7c78f64c98-8kdgt            1/1     Running   79         16h
 pod/config-management-operator-58ccfd5fc4-mv6w5   1/1     Running   0          27h
 pod/reconciler-manager-6c6976b8bd-7v4fh           2/2     Running   0          16h
 pod/root-reconciler-6f477fb65f-sssx5              4/4     Running   0          16h

 NAME                        TYPE        CLUSTER-IP        EXTERNAL-IP   PORT(S)   AGE
 service/admission-webhook   ClusterIP   100.110.128.201   <none>        443/TCP   16h

 NAME                                         READY   UP-TO-DATE   AVAILABLE   AGE
 deployment.apps/admission-webhook            2/2     2            1           16h
 deployment.apps/config-management-operator   1/1     1            1           27h
 deployment.apps/reconciler-manager           1/1     1            1           16h
 deployment.apps/root-reconciler              1/1     1            1           16h

 NAME                                                    DESIRED   CURRENT   READY   AGE
 replicaset.apps/admission-webhook-7c78f64c98            2         2         2       16h
 replicaset.apps/config-management-operator-58ccfd5fc4   1         1         1       27h
 replicaset.apps/reconciler-manager-6c6976b8bd           1         1         1       16h
 replicaset.apps/root-reconciler-6f477fb65f              1         1         1       16h

Steps to Reproduce

terraform plan
terraform apply
terraform destroy


nat-henderson commented 3 years ago

My initial suspicion is that we are deleting the GCP resource and the GCP resource is failing to do cleanup - Terraform is usually pretty good at noticing if we've accidentally left a resource dangling. I'll look into it and make sure that's right, and if it is, I'll file a bug internally.

nat-henderson commented 2 years ago

I've noticed this in the documentation, which could plausibly explain the issue depending on where your cluster is - but it does seem like your cluster is a GKE cluster on GCP, so probably not.

This is currently only supported for GKE clusters on Google Cloud. To unregister other clusters, follow the instructions at https://cloud.google.com/anthos/multicluster-management/connect/unregistering-a-cluster.

Given that, I've filed a bug internally.

Red-Five commented 2 years ago

I see the same behavior with GKE on GCP.

terraform -v
Terraform v1.2.7
on linux_amd64

After Terraform destroys the google_gke_hub_feature_membership for ACM, all of the ACM workloads on GKE remain running.

karlkfi commented 1 year ago

The ACM Hub Feature controller does not have any implemented uninstall behavior today. It just abandons the resources it previously applied. So it is not possible for Terraform to trigger uninstall. This will need to be implemented in GCP first.
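In the meantime, any cleanup has to be driven from the Terraform configuration itself. A minimal sketch of a destroy-time workaround, assuming kubectl is already authenticated against the affected cluster, that the abandoned namespaces are the ones shown earlier in this issue, and that the hashicorp/null provider is available (the acm_cleanup resource name is purely illustrative, and this is not provider behavior):

# Illustrative workaround only: remove the namespaces that ACM abandons on feature deletion.
# Assumes kubectl is already configured for the target cluster.
resource "null_resource" "acm_cleanup" {
  provisioner "local-exec" {
    when    = destroy
    command = "kubectl delete namespace config-management-system gatekeeper-system --ignore-not-found"
  }
}

Depending on the ordering you want, the google_gke_hub_feature_membership resource may need a depends_on pointing at this resource so the namespace cleanup only runs after the feature membership itself has been destroyed.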

philipsabri commented 2 weeks ago

> The ACM Hub Feature controller does not have any implemented uninstall behavior today. It just abandons the resources it previously applied. So it is not possible for Terraform to trigger uninstall. This will need to be implemented in GCP first.

Would it be possible to do a cleanup similar to gcloud container fleet policycontroller disable --all-memberships before completely removing the Policy Controller fleet feature? At least for this feature; I'm not sure about the rest.

It's basically just sending this to each cluster.

POST gkehub.googleapis.com/v1/projects/PROJECT/locations/LOCATION/features/policycontroller?alt=json&updateMask=membership_specs

Body:

{
  "membershipSpecs": {
    "projects/PROJECT/locations/LOCATION/memberships/CLUSTER": {
      "policycontroller": {
        "policyControllerHubConfig": {
          "installSpec": "INSTALL_SPEC_NOT_INSTALLED"
        }
      }
    }
  }
}

Could we have that as a step before removing the feature?

Update: I played around with adding some pre_delete code and it cleaned things up, if we want to go in this direction (with some code improvements to speed it up, etc.).

    // Check if the feature is Policy Controller
    if d.Get("name") == "policycontroller" {
        res, err := transport_tpg.SendRequest(transport_tpg.SendRequestOptions{
            Config:    config,
            Method:    "GET",
            Project:   billingProject,
            RawURL:    url,
            UserAgent: userAgent,
            Headers:   headers,
        })
        if err != nil {
            return transport_tpg.HandleNotFoundError(err, d, "Feature")
        }
        membershipSpecs, ok := res["membershipSpecs"].(map[string]interface{})
        if ok {
            for cluster := range membershipSpecs {
                policycontrollerUrl, err := tpgresource.ReplaceVarsForId(d, config, "{{GKEHub2BasePath}}projects/{{project}}/locations/{{location}}/features/policycontroller?alt=json&updateMask=membership_specs")
                if err != nil {
                    return err
                }

                // body to trigger uninstall
                policyControllerBody := map[string]interface{}{
                    "membershipSpecs": map[string]interface{}{
                        cluster: map[string]interface{}{
                            "policycontroller": map[string]interface{}{
                                "policyControllerHubConfig": map[string]interface{}{
                                    "installSpec": "INSTALL_SPEC_NOT_INSTALLED",
                                },
                            },
                        },
                    },
                }

                _, err = transport_tpg.SendRequest(transport_tpg.SendRequestOptions{
                    Config:    config,
                    Method:    "PATCH",
                    Project:   billingProject,
                    RawURL:    policycontrollerUrl,
                    UserAgent: userAgent,
                    Body:      policyControllerBody,
                    Timeout:   d.Timeout(schema.TimeoutDelete),
                    Headers:   headers,
                })
                if err != nil {
                    return transport_tpg.HandleNotFoundError(err, d, "Feature")
                }

                // Poll until this cluster's Policy Controller state reports NOT_INSTALLED (no overall timeout in this sketch).
                for {
                    time.Sleep(10 * time.Second)
                    res, err := transport_tpg.SendRequest(transport_tpg.SendRequestOptions{
                        Config:    config,
                        Method:    "GET",
                        Project:   billingProject,
                        RawURL:    url,
                        UserAgent: userAgent,
                        Headers:   headers,
                    })
                    if err != nil {
                        return transport_tpg.HandleNotFoundError(err, d, "Feature")
                    }

                    // NOTE: these chained type assertions panic if any intermediate field is missing or has an unexpected type.
                    if state, ok := res["membershipStates"].(map[string]interface{})[cluster].(map[string]interface{})["policycontroller"].(map[string]interface{})["state"].(string); ok && state == "NOT_INSTALLED" {
                        break
                    }

                    log.Printf("[DEBUG] Waiting for Policy Controller to be NOT_INSTALLED for cluster %s", cluster)
                }

                log.Printf("[DEBUG] Cleaned up Policy Controller for cluster %s", cluster)

            }
        } else {
            log.Printf("[DEBUG] No clusters found to clean up")
        }
    }