vmware / terraform-provider-vcd

Terraform VMware Cloud Director provider
https://www.terraform.io/docs/providers/vcd/
Mozilla Public License 2.0
148 stars 112 forks source link

terraform provider fails to acknowledge catalog item upload #281

Open dataclouder opened 5 years ago

dataclouder commented 5 years ago

This bug is difficult to reproduce and seems to be a race condition. When using a script generated by full-env.tf, the catalog item upload, which usually completes in less than 3 minutes, sometimes continues indefinitely and needs to be stopped manually.

The script creates the following:

  1. org
  2. org user (depends on 1)
  3. external network
  4. VDC (depends on 1)
  5. edge gateway (depends on 4)
  6. catalog (depends on 1 and 4, as the org storage is added with the VDC)
  7. catalog item (depends on 6)
  8. media item (depends on 6)
  9. isolated network (depends on 4)
  10. direct network (depends on 4)
  11. routed network (depends on 5)

There are two operations that take most of the time and go on in parallel: the edge gateway generation and the upload of the catalog item.

Example script:

# vCD provider settings: credentials, API endpoint and connection options.
# `org` and `vdc` carry the same names as the vcd_org and vcd_org_vdc
# resources declared further down in this script.
# NOTE(review): the url host is written "vcd/mycompany.com" -- possibly a typo
# for "vcd.mycompany.com"; confirm before reusing this script.
# NOTE(review): "MYPASSWORD" is a placeholder; replace with real credentials.
provider "vcd" {
  user                 = "administrator"
  password             = "MYPASSWORD"
  url                  = "https://vcd/mycompany.com/api"
  sysorg               = "System"
  org                  = "datacloud"
  vdc                  = "vdc-datacloud"
  allow_unverified_ssl = "true"
  max_retry_timeout    = 600
  version              = "~> 2.4"
  logging              = true
}

# External network (item 3 in the issue's list). It references no other
# resource in this script, so it has no dependencies.
# Backed by the vSphere network "VM Network" on vCenter "vc1".
resource "vcd_external_network" "extnet-datacloud" {
  name        = "extnet-datacloud"
  description = "external network"

  vsphere_network {
    vcenter = "vc1"
    name    = "VM Network"
    type    = "NETWORK"
  }

  # Addressing for the network; the static pool holds just two addresses
  # (.23 and .24).
  ip_scope {
    gateway    = "10.150.159.253"
    netmask    = "255.255.224.0"
    dns1       = "10.21.82.101"
    dns2       = "8.8.4.4"
    dns_suffix = "datacloud.org"

    static_ip_pool {
      start_address = "10.150.138.23"
      end_address   = "10.150.138.24"
    }
  }
  retain_net_info_across_deployments = "false"
}

# Organization (item 1 in the issue's list). The user, VDC, edge gateway,
# catalog, catalog items and networks below all reference its name.
# delete_force / delete_recursive are both enabled, which matters for the
# repeated apply/destroy cycles of the reproduction script.
# NOTE(review): exact semantics of the two delete flags are defined by the
# provider, not visible here -- confirm against the provider docs.
resource "vcd_org" "datacloud" {
  name              = "datacloud"
  full_name         = "datacloud"
  is_enabled        = "true"
  stored_vm_quota   = 50
  deployed_vm_quota = 50
  delete_force      = "true"
  delete_recursive  = "true"
}

# Organization administrator user (item 2 in the issue's list).
# Depends on the org implicitly through the interpolated `org` attribute.
# NOTE(review): the password is a plain-text throwaway value; do not reuse.
resource "vcd_org_user" "datacloud-admin" {
  org               = "${vcd_org.datacloud.name}"
  name              = "datacloud-admin"
  password          = "datacloud"
  role              = "Organization Administrator"
  enabled           = true
  take_ownership    = true
  provider_type     = "INTEGRATED"
  stored_vm_quota   = 50
  deployed_vm_quota = 50
}

# Organization VDC (item 4 in the issue's list).
# Depends on the org implicitly through the interpolated `org` attribute.
# The edge gateway and the three org networks reference this VDC by name; the
# catalog depends on it explicitly via depends_on.
resource "vcd_org_vdc" "vdc-datacloud" {
  name = "vdc-datacloud"
  org  = "${vcd_org.datacloud.name}"

  allocation_model  = "AllocationVApp"
  provider_vdc_name = "vc1-TestbedCluster-06:58:14"
  network_pool_name = "vc1-TestbedCluster-06:58:14-VXLAN-NP"
  network_quota     = 50

  # NOTE(review): limit = 0 presumably means "unlimited" -- confirm against
  # the provider docs.
  compute_capacity {
    cpu {
      limit = 0
    }

    memory {
      limit = 0
    }
  }

  # Two storage profiles are attached; "*" is the default one.
  storage_profile {
    name    = "*"
    enabled = true
    limit   = 0
    default = true
  }

  storage_profile {
    name    = "Development"
    enabled = true
    limit   = 0
    default = false
  }

  enabled                  = true
  enable_thin_provisioning = true
  enable_fast_provisioning = true
  delete_force             = true
  delete_recursive         = true
}

# Edge gateway (item 5 in the issue's list).
# Implicit dependencies via interpolation: the org, the VDC and the external
# network. The issue identifies its creation as one of the two long-running
# operations that proceed in parallel with the catalog item upload.
resource "vcd_edgegateway" "gw-datacloud" {
  org                     = "${vcd_org.datacloud.name}"
  vdc                     = "${vcd_org_vdc.vdc-datacloud.name}"
  name                    = "gw-datacloud"
  description             = "datacloud edge gateway"
  configuration           = "compact"
  default_gateway_network = "${vcd_external_network.extnet-datacloud.name}"
  advanced                = true

  external_networks = ["${vcd_external_network.extnet-datacloud.name}"]
}

# Catalog (item 6 in the issue's list).
# Depends on the org through interpolation. No attribute references the VDC,
# so that dependency is declared explicitly with depends_on; per the issue
# text, the org storage is added with the VDC.
resource "vcd_catalog" "cat-datacloud" {
  org         = "${vcd_org.datacloud.name}"
  name        = "cat-datacloud"
  description = "datacloud catalog"

  delete_force     = "true"
  delete_recursive = "true"
  depends_on       = ["vcd_org_vdc.vdc-datacloud"]
}

# Catalog item (item 7 in the issue's list) -- the resource whose upload
# sometimes never completes, which is the subject of this issue.
# Depends on the org and the catalog through interpolation.
# ova_path points at a file on the reporter's machine; adjust locally.
# NOTE(review): upload_piece_size is presumably expressed in MB -- confirm
# against the provider docs.
resource "vcd_catalog_item" "photon-hw11" {
  org     = "${vcd_org.datacloud.name}"
  catalog = "${vcd_catalog.cat-datacloud.name}"

  name                 = "photon-hw11"
  description          = "photon-hw11"
  ova_path             = "/Users/gmax/vm/photon-hw11-3.0-26156e2.ova"
  upload_piece_size    = 5
  show_upload_progress = "true"
}

# Media item (item 8 in the issue's list).
# Depends on the org and the catalog through interpolation.
# media_path points at a file on the reporter's machine; adjust locally.
# NOTE(review): upload_piece_size is presumably expressed in MB -- confirm
# against the provider docs.
resource "vcd_catalog_media" "test_media" {
  org     = "${vcd_org.datacloud.name}"
  catalog = "${vcd_catalog.cat-datacloud.name}"

  name                 = "test_media"
  description          = "test_media"
  media_path           = "/Users/gmax/workdir/git/dataclouder/terraform-provider-vcd/test-resources/test.iso"
  upload_piece_size    = 5
  show_upload_progress = "true"
}

# Optional networks will be added only if the
# corresponding names are set in the configuration file

# Routed network (item 11 in the issue's list).
# Depends on the org, the VDC and the edge gateway through interpolation, so
# it can only start once the edge gateway has been created.
resource "vcd_network_routed" "net-datacloud-r" {
  name         = "net-datacloud-r"
  org          = "${vcd_org.datacloud.name}"
  vdc          = "${vcd_org_vdc.vdc-datacloud.name}"
  edge_gateway = "${vcd_edgegateway.gw-datacloud.name}"
  gateway      = "192.168.2.1"

  static_ip_pool {
    start_address = "192.168.2.2"
    end_address   = "192.168.2.100"
  }
}

# Isolated network (item 9 in the issue's list).
# Depends on the org and the VDC through interpolation; it references neither
# the edge gateway nor the external network.
resource "vcd_network_isolated" "net-datacloud-d" {
  name    = "net-datacloud-d"
  org     = "${vcd_org.datacloud.name}"
  vdc     = "${vcd_org_vdc.vdc-datacloud.name}"
  gateway = "192.168.3.1"

  static_ip_pool {
    start_address = "192.168.3.2"
    end_address   = "192.168.3.100"
  }
}

# Direct network (item 10 in the issue's list).
# Depends on the org and the VDC through interpolation.
# NOTE(review): external_network is a literal name rather than an
# interpolated reference to vcd_external_network.extnet-datacloud (as the
# edge gateway uses), so no dependency on the external network is visible
# here -- confirm whether that is intentional.
resource "vcd_network_direct" "net-datacloud-i" {
  name             = "net-datacloud-i"
  org              = "${vcd_org.datacloud.name}"
  vdc              = "${vcd_org_vdc.vdc-datacloud.name}"
  external_network = "extnet-datacloud"
}

A successful operation looks like this:

vcd_org.datacloud: Creating...
vcd_external_network.extnet-datacloud: Creating...
vcd_org.datacloud: Creation complete after 1s [id=1674abee-9b90-46d9-9105-a3f7173bf40b]
vcd_org_user.datacloud-admin: Creating...
vcd_org_vdc.vdc-datacloud: Creating...
vcd_org_user.datacloud-admin: Creation complete after 4s [id=urn:vcloud:user:d6df99bd-2956-43cc-9904-35221ab8b0c2]
vcd_external_network.extnet-datacloud: Still creating... [10s elapsed]
vcd_org_vdc.vdc-datacloud: Still creating... [10s elapsed]
vcd_external_network.extnet-datacloud: Creation complete after 13s [id=extnet-datacloud]
vcd_org_vdc.vdc-datacloud: Still creating... [20s elapsed]
vcd_org_vdc.vdc-datacloud: Creation complete after 23s [id=urn:vcloud:vdc:1aa83c3e-5bab-40da-964d-6c656b45def4]
vcd_catalog.cat-datacloud: Creating...
vcd_network_direct.net-datacloud-i: Creating...
vcd_edgegateway.gw-datacloud: Creating...
vcd_network_isolated.net-datacloud-d: Creating...
vcd_catalog.cat-datacloud: Creation complete after 2s [id=cat-datacloud]
vcd_catalog_media.test_media: Creating...
vcd_catalog_item.photon-hw11: Creating...
vcd_catalog_media.test_media: Upload progress 0.00%
vcd_network_direct.net-datacloud-i: Creation complete after 7s [id=net-datacloud-i]
vcd_edgegateway.gw-datacloud: Still creating... [10s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [10s elapsed]
vcd_catalog_item.photon-hw11: Upload progress 0.00%
vcd_catalog_item.photon-hw11: Still creating... [10s elapsed]
vcd_catalog_media.test_media: Still creating... [10s elapsed]
vcd_catalog_media.test_media: Upload progress 100.00%
vcd_catalog_media.test_media: vCD import catalog item progress 100%
vcd_catalog_media.test_media: Creation complete after 15s [id=cat-datacloud:test_media]
vcd_edgegateway.gw-datacloud: Still creating... [20s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [20s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [20s elapsed]
vcd_edgegateway.gw-datacloud: Still creating... [30s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [30s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [30s elapsed]
vcd_edgegateway.gw-datacloud: Still creating... [40s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [40s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [40s elapsed]
vcd_edgegateway.gw-datacloud: Still creating... [50s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [50s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [50s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [1m0s elapsed]
vcd_edgegateway.gw-datacloud: Still creating... [1m0s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [1m0s elapsed]
vcd_edgegateway.gw-datacloud: Still creating... [1m10s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [1m10s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [1m10s elapsed]
vcd_edgegateway.gw-datacloud: Still creating... [1m20s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [1m20s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [1m20s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [1m30s elapsed]
vcd_edgegateway.gw-datacloud: Still creating... [1m30s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [1m30s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [1m40s elapsed]
vcd_edgegateway.gw-datacloud: Still creating... [1m40s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [1m40s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [1m50s elapsed]
vcd_edgegateway.gw-datacloud: Still creating... [1m50s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [1m50s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [2m0s elapsed]
vcd_edgegateway.gw-datacloud: Still creating... [2m0s elapsed]
vcd_edgegateway.gw-datacloud: Creation complete after 2m1s [id=urn:vcloud:gateway:0324f344-59a1-4b0c-aa17-f37afc928820]
vcd_network_routed.net-datacloud-r: Creating...
vcd_catalog_item.photon-hw11: Still creating... [2m0s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [2m10s elapsed]
vcd_network_routed.net-datacloud-r: Still creating... [10s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [2m10s elapsed]
vcd_network_routed.net-datacloud-r: Creation complete after 15s [id=net-datacloud-r]
vcd_network_isolated.net-datacloud-d: Still creating... [2m20s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [2m20s elapsed]
vcd_network_isolated.net-datacloud-d: Still creating... [2m30s elapsed]
vcd_network_isolated.net-datacloud-d: Creation complete after 2m32s [id=net-datacloud-d]
vcd_catalog_item.photon-hw11: Still creating... [2m30s elapsed]
vcd_catalog_item.photon-hw11: Creation complete after 2m33s [id=cat-datacloud:photon-hw11]
Apply complete! Resources: 11 added, 0 changed, 0 destroyed.

When it fails, there is no conclusion. You will see lots of lines like

vcd_catalog_item.photon-hw11: Still creating... [3m10s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [3m20s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [3m30s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [3m40s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [3m50s elapsed]
vcd_catalog_item.photon-hw11: Still creating... [4m0s elapsed]
[...]
vcd_catalog_item.photon-hw11: Still creating... [8m50s elapsed]

and if you check in the GUI, you'll see that the catalog item has been uploaded and can be used successfully to create vApps.

dataclouder commented 5 years ago

How to reproduce: Using an empty vCD, and having the above script filled with the right credentials, run this script:

#!/bin/bash
# Abort the whole script if the command that ran immediately before this call
# failed. Must be invoked directly after the command to check, because it
# reads $? on entry. On failure it prints "ERROR" and exits with that
# command's status; on success it returns 0 and the caller carries on.
# The captured status is left in the global variable exit_code.
check_exit_code() {
    exit_code=$?
    # Success: nothing to report, hand control back to the caller.
    [ "$exit_code" = "0" ] && return
    echo "ERROR"
    exit $exit_code
}

# Print the "# <iteration> - <step>" banner, run the matching terraform
# sub-command with any extra arguments, and stop the script if it failed.
# Reads the loop counter N from the enclosing scope.
run_step() {
     echo "# $N - $1"
     terraform "$@"
     check_exit_code
}

# Ten full init/plan/apply/destroy cycles; the script stops at the first
# failing terraform command.
for N in {1..10}
do
     run_step init
     run_step plan
     run_step apply -auto-approve
     run_step destroy -auto-approve
     # Remove leftover state files before the next cycle.
     rm -f terraform.tfstate*
done

Repeat until you see the error

vbauzys commented 5 years ago

We figured it out and made a change so that the task fails when the response is lost: https://github.com/vmware/go-vcloud-director/pull/231

vbauzys commented 5 years ago

@dataclouder I think it is safe to close?

dataclouder commented 5 years ago

No. We made a change that makes the task fail explicitly where previously it just hung, but the core issue behind this problem is still bothering us. Until we find and fix the root cause, this issue should stay open.