hashicorp / terraform-provider-azurerm

Terraform provider for Azure Resource Manager
https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs
Mozilla Public License 2.0
4.61k stars 4.65k forks source link

azurerm_virtual_machine_extension.vmext not honoring retry, failing #20005

Open mwardbopp opened 1 year ago

mwardbopp commented 1 year ago

Is there an existing issue for this?

Community Note

Terraform Version

1.3.5

AzureRM Provider Version

3.38.0

Affected Resource(s)/Data Source(s)

azurerm_virtual_machine_extension.vmext

Terraform Configuration Files

# BIG-IP Cluster

############################ Locals ############################

# Address bookkeeping for the two BIG-IP instances (module.bigip and
# module.bigip2), plus the tag map shared by both module calls.
locals {
  # Retrieve all BIG-IP secondary IPs
  # Entries 0 and 1 are the first two elements of the flattened
  # "public_private" -> "private_ips"[0] output of each module.
  vm01_ext_ips = {
    0 = {
      ip = element(flatten(module.bigip.private_addresses["public_private"]["private_ips"][0]), 0)
    }
    1 = {
      ip = element(flatten(module.bigip.private_addresses["public_private"]["private_ips"][0]), 1)
    }
  }
  vm02_ext_ips = {
    0 = {
      ip = element(flatten(module.bigip2.private_addresses["public_private"]["private_ips"][0]), 0)
    }
    1 = {
      ip = element(flatten(module.bigip2.private_addresses["public_private"]["private_ips"][0]), 1)
    }
  }
  # Determine BIG-IP secondary IPs to be used for VIP
  # Use entry 0 unless it equals the "private_ip"[0] address reported by the
  # module; in that case fall back to entry 1.
  vm01_vip_ips = {
    app1 = {
      ip = module.bigip.private_addresses["public_private"]["private_ip"][0] != local.vm01_ext_ips.0.ip ? local.vm01_ext_ips.0.ip : local.vm01_ext_ips.1.ip
    }
  }
  vm02_vip_ips = {
    app1 = {
      ip = module.bigip2.private_addresses["public_private"]["private_ip"][0] != local.vm02_ext_ips.0.ip ? local.vm02_ext_ips.0.ip : local.vm02_ext_ips.1.ip
    }
  }
  # Custom tags
  tags = {
    Owner = var.resourceOwner
  }
}

############################ Onboard Scripts ############################

# Setup Onboarding scripts
# Renders f5_onboard.tmpl (from this module's directory) once per BIG-IP.
# The two renderings receive the same variables except for:
#   - regKey:            var.license1 vs. var.license2
#   - remote_selfip_ext: the "public_private" private_ip[0] of the *other* BIG-IP
locals {
  f5_onboard1 = templatefile("${path.module}/f5_onboard.tmpl", {
    regKey                     = var.license1
    f5_username                = var.f5_username
    # Password is passed as an empty string when Key Vault authentication is enabled.
    f5_password                = var.az_keyvault_authentication ? "" : var.f5_password
    az_keyvault_authentication = var.az_keyvault_authentication
    # Key Vault values are only populated when Key Vault authentication is enabled.
    vault_url                  = var.az_keyvault_authentication ? data.azurerm_key_vault.main[0].vault_uri : ""
    keyvault_secret            = var.az_keyvault_authentication ? var.keyvault_secret : ""
    ssh_keypair                = file(var.ssh_key)
    INIT_URL                   = var.INIT_URL
    DO_URL                     = var.DO_URL
    AS3_URL                    = var.AS3_URL
    TS_URL                     = var.TS_URL
    FAST_URL                   = var.FAST_URL
    # *_VER is the "/"-separated segment at index 7 of the matching URL.
    # NOTE(review): presumably the release/version path segment — confirm
    # against the URL format these variables are expected to hold.
    DO_VER                     = split("/", var.DO_URL)[7]
    AS3_VER                    = split("/", var.AS3_URL)[7]
    TS_VER                     = split("/", var.TS_URL)[7]
    FAST_VER                   = split("/", var.FAST_URL)[7]
    dns_server                 = var.dns_server
    ntp_server                 = var.ntp_server
    timezone                   = var.timezone
    law_id                     = azurerm_log_analytics_workspace.main.workspace_id
    law_primkey                = azurerm_log_analytics_workspace.main.primary_shared_key
    bigIqLicenseType           = var.bigIqLicenseType
    bigIqHost                  = var.bigIqHost
    bigIqPassword              = var.bigIqPassword
    bigIqUsername              = var.bigIqUsername
    bigIqLicensePool           = var.bigIqLicensePool
    bigIqSkuKeyword1           = var.bigIqSkuKeyword1
    bigIqSkuKeyword2           = var.bigIqSkuKeyword2
    bigIqUnitOfMeasure         = var.bigIqUnitOfMeasure
    bigIqHypervisor            = var.bigIqHypervisor
    # cluster info
    # host1/host2 are the management private IPs of bigip and bigip2.
    host1             = module.bigip.private_addresses["mgmt_private"]["private_ip"][0]
    host2             = module.bigip2.private_addresses["mgmt_private"]["private_ip"][0]
    # Peer address for this rendering: the second BIG-IP (bigip2).
    remote_selfip_ext = module.bigip2.private_addresses["public_private"]["private_ip"][0]
  })
  f5_onboard2 = templatefile("${path.module}/f5_onboard.tmpl", {
    regKey                     = var.license2
    f5_username                = var.f5_username
    # Password is passed as an empty string when Key Vault authentication is enabled.
    f5_password                = var.az_keyvault_authentication ? "" : var.f5_password
    az_keyvault_authentication = var.az_keyvault_authentication
    # Key Vault values are only populated when Key Vault authentication is enabled.
    vault_url                  = var.az_keyvault_authentication ? data.azurerm_key_vault.main[0].vault_uri : ""
    keyvault_secret            = var.az_keyvault_authentication ? var.keyvault_secret : ""
    ssh_keypair                = file(var.ssh_key)
    INIT_URL                   = var.INIT_URL
    DO_URL                     = var.DO_URL
    AS3_URL                    = var.AS3_URL
    TS_URL                     = var.TS_URL
    FAST_URL                   = var.FAST_URL
    # *_VER is the "/"-separated segment at index 7 of the matching URL.
    DO_VER                     = split("/", var.DO_URL)[7]
    AS3_VER                    = split("/", var.AS3_URL)[7]
    TS_VER                     = split("/", var.TS_URL)[7]
    FAST_VER                   = split("/", var.FAST_URL)[7]
    dns_server                 = var.dns_server
    ntp_server                 = var.ntp_server
    timezone                   = var.timezone
    law_id                     = azurerm_log_analytics_workspace.main.workspace_id
    law_primkey                = azurerm_log_analytics_workspace.main.primary_shared_key
    bigIqLicenseType           = var.bigIqLicenseType
    bigIqHost                  = var.bigIqHost
    bigIqPassword              = var.bigIqPassword
    bigIqUsername              = var.bigIqUsername
    bigIqLicensePool           = var.bigIqLicensePool
    bigIqSkuKeyword1           = var.bigIqSkuKeyword1
    bigIqSkuKeyword2           = var.bigIqSkuKeyword2
    bigIqUnitOfMeasure         = var.bigIqUnitOfMeasure
    bigIqHypervisor            = var.bigIqHypervisor
    # cluster info
    # host1/host2 are the management private IPs of bigip and bigip2.
    host1             = module.bigip.private_addresses["mgmt_private"]["private_ip"][0]
    host2             = module.bigip2.private_addresses["mgmt_private"]["private_ip"][0]
    # Peer address for this rendering: the first BIG-IP (bigip).
    remote_selfip_ext = module.bigip.private_addresses["public_private"]["private_ip"][0]
  })
}

############################ Compute ############################

# Create F5 BIG-IP VMs
# First BIG-IP instance: F5Networks/bigip-module/azure pinned to 1.2.6, placed
# in var.availability_zone and onboarded with local.f5_onboard1.
# NOTE(review): local.f5_onboard1 is itself built from module.bigip and
# module.bigip2 outputs (private_addresses), so this configuration relies on
# those outputs not depending on custom_user_data — confirm against the module.
module "bigip" {
  source                     = "F5Networks/bigip-module/azure"
  version                    = "1.2.6"
  prefix                     = var.projectPrefix
  resource_group_name        = azurerm_resource_group.main.name
  f5_instance_type           = var.instance_type
  f5_image_name              = var.image_name
  f5_product_name            = var.product
  f5_version                 = var.bigip_version
  f5_username                = var.f5_username
  f5_ssh_publickey           = file(var.ssh_key)
  # One entry per NIC: mgmt and external request a public IP, internal does not.
  # Private IP fields are passed as empty strings.
  mgmt_subnet_ids            = [{ "subnet_id" = data.azurerm_subnet.mgmt.id, "public_ip" = true, "private_ip_primary" = "" }]
  mgmt_securitygroup_ids     = [data.azurerm_network_security_group.mgmt.id]
  external_subnet_ids        = [{ "subnet_id" = data.azurerm_subnet.external.id, "public_ip" = true, "private_ip_primary" = "", "private_ip_secondary" = "" }]
  external_securitygroup_ids = [data.azurerm_network_security_group.external.id]
  internal_subnet_ids        = [{ "subnet_id" = data.azurerm_subnet.internal.id, "public_ip" = false, "private_ip_primary" = "" }]
  internal_securitygroup_ids = [data.azurerm_network_security_group.internal.id]
  availability_zone          = var.availability_zone
  custom_user_data           = local.f5_onboard1
  sleep_time                 = "30s"
  tags                       = local.tags
  # Key Vault inputs are only populated when Key Vault authentication is enabled;
  # otherwise empty strings (and a null identity) are passed.
  az_keyvault_authentication = var.az_keyvault_authentication
  azure_secret_rg            = var.az_keyvault_authentication ? var.keyvault_rg : ""
  azure_keyvault_name        = var.az_keyvault_authentication ? var.keyvault_name : ""
  azure_keyvault_secret_name = var.az_keyvault_authentication ? var.keyvault_secret : ""
  user_identity              = var.az_keyvault_authentication ? data.azurerm_user_assigned_identity.main[0].id : null
}

# Second BIG-IP instance. Same inputs as module "bigip" except that it is
# placed in var.availability_zone2 and onboarded with local.f5_onboard2.
module "bigip2" {
  source                     = "F5Networks/bigip-module/azure"
  version                    = "1.2.6"
  prefix                     = var.projectPrefix
  resource_group_name        = azurerm_resource_group.main.name
  f5_instance_type           = var.instance_type
  f5_image_name              = var.image_name
  f5_product_name            = var.product
  f5_version                 = var.bigip_version
  f5_username                = var.f5_username
  f5_ssh_publickey           = file(var.ssh_key)
  # One entry per NIC: mgmt and external request a public IP, internal does not.
  mgmt_subnet_ids            = [{ "subnet_id" = data.azurerm_subnet.mgmt.id, "public_ip" = true, "private_ip_primary" = "" }]
  mgmt_securitygroup_ids     = [data.azurerm_network_security_group.mgmt.id]
  external_subnet_ids        = [{ "subnet_id" = data.azurerm_subnet.external.id, "public_ip" = true, "private_ip_primary" = "", "private_ip_secondary" = "" }]
  external_securitygroup_ids = [data.azurerm_network_security_group.external.id]
  internal_subnet_ids        = [{ "subnet_id" = data.azurerm_subnet.internal.id, "public_ip" = false, "private_ip_primary" = "" }]
  internal_securitygroup_ids = [data.azurerm_network_security_group.internal.id]
  availability_zone          = var.availability_zone2
  custom_user_data           = local.f5_onboard2
  sleep_time                 = "30s"
  tags                       = local.tags
  # Key Vault inputs are only populated when Key Vault authentication is enabled.
  az_keyvault_authentication = var.az_keyvault_authentication
  azure_secret_rg            = var.az_keyvault_authentication ? var.keyvault_rg : ""
  azure_keyvault_name        = var.az_keyvault_authentication ? var.keyvault_name : ""
  azure_keyvault_secret_name = var.az_keyvault_authentication ? var.keyvault_secret : ""
  user_identity              = var.az_keyvault_authentication ? data.azurerm_user_assigned_identity.main[0].id : null
}

############################ ALB Backend Pool ############################

# Note: JeffGiroux (REMOVE LATER)
#       https://github.com/F5Networks/terraform-azure-bigip-module/issues/29
#
#       BIG-IP module currently does NOT export network interface ID.
#       As a workaround, use the BIG-IP device ID to parse the name and
#       use that to query the data azurerm_network_interface.
#       Once output in module is fixed, data resource can be deleted.

# Retrieve NIC Info
# Looks up the external NIC of the first BIG-IP by name. The name is derived
# from the module's instance ID: split the ID on "/", take the element at
# index 8, strip everything from "-f5vm01" onward, then append
# "-ext-nic-public-0".
data "azurerm_network_interface" "bigip-ext" {
  name                = format("%s-ext-nic-public-0", element(split("-f5vm01", element(split("/", module.bigip.bigip_instance_ids), 8)), 0))
  resource_group_name = azurerm_resource_group.main.name
}
# Same lookup for the second BIG-IP, using module.bigip2's instance ID.
# NOTE(review): the "-f5vm01" separator is used here as well — presumably the
# module names its VM with that suffix in every instantiation; confirm.
data "azurerm_network_interface" "bigip2-ext" {
  name                = format("%s-ext-nic-public-0", element(split("-f5vm01", element(split("/", module.bigip2.bigip_instance_ids), 8)), 0))
  resource_group_name = azurerm_resource_group.main.name
}

# Associate the BIG-IP NIC to the ALB backend pool
# Attaches the "<prefix>-secondary-ext-public-ip-0" IP configuration of the
# first BIG-IP's external NIC to the load balancer backend pool. The prefix is
# derived from the instance ID the same way as the NIC name lookup.
resource "azurerm_network_interface_backend_address_pool_association" "f5vm01" {
  network_interface_id    = data.azurerm_network_interface.bigip-ext.id
  ip_configuration_name   = format("%s-secondary-ext-public-ip-0", element(split("-f5vm01", element(split("/", module.bigip.bigip_instance_ids), 8)), 0))
  backend_address_pool_id = azurerm_lb_backend_address_pool.backend_pool.id
}

# Same association for the second BIG-IP's external NIC, into the same
# backend pool.
resource "azurerm_network_interface_backend_address_pool_association" "f5vm02" {
  network_interface_id    = data.azurerm_network_interface.bigip2-ext.id
  ip_configuration_name   = format("%s-secondary-ext-public-ip-0", element(split("-f5vm01", element(split("/", module.bigip2.bigip_instance_ids), 8)), 0))
  backend_address_pool_id = azurerm_lb_backend_address_pool.backend_pool.id
}

Debug Output/Panic Output

https://gist.github.com/mwardbopp/c8bf379ed7da173507509cc3015da744

Expected Behaviour

A retryable error should be handled by the provider.

Actual Behaviour

It fails to either delete the resources or report the creation successfully.

Steps to Reproduce

terraform apply or delete, on a busy region like UKSouth. It doesn't happen nearly as much with WestEurope

Important Factoids

No response

References

No response

myc2h6o commented 1 year ago

Hi @mwardbopp, thanks for opening the issue! The provider usually relies on the retry logic in the SDK itself and does not add additional retries. The debug log does not seem to contain the error details; would you mind sharing the error code and error detail you see when it fails? We may be able to add some retry logic based on the use cases.

mwardbopp commented 1 year ago

azurerm_network_interface_backend_address_pool_association.f5vm01: Creating... module.bigip2.azurerm_virtual_machine_extension.vmext: Still creating... [30s elapsed] azurerm_network_interface_backend_address_pool_association.f5vm01: Creation complete after 5s [id=/subscriptions/c8bd4483-a1a2-47c4-acc8-4a49fbf180f3/resourceGroups/mydemo456-rg-6d99/providers/Microsoft.Network/networkInterfaces/mydemo456-6c9e-ext-nic-public-0/ipConfigurations/mydemo456-6c9e-secondary-ext-public-ip-0|/subscriptions/c8bd4483-a1a2-47c4-acc8-4a49fbf180f3/resourceGroups/mydemo456-rg-6d99/providers/Microsoft.Network/loadBalancers/mydemo456-lb-6d99/backendAddressPools/BackendPool1] module.bigip.azurerm_virtual_machine_extension.vmext: Still creating... [10s elapsed] module.bigip.azurerm_virtual_machine_extension.vmext: Still creating... [20s elapsed] module.bigip.azurerm_virtual_machine_extension.vmext: Still creating... [30s elapsed] ╷ │ Error: Code="RetryableError" Message="A retryable error occurred." │ │ with module.bigip.azurerm_virtual_machine_extension.vmext, │ on .terraform/modules/bigip/main.tf line 490, in resource "azurerm_virtual_machine_extension" "vmext": │ 490: resource "azurerm_virtual_machine_extension" "vmext" { │ ╵ ╷ │ Error: Code="RetryableError" Message="A retryable error occurred." │ │ with module.bigip2.azurerm_virtual_machine_extension.vmext, │ on .terraform/modules/bigip2/main.tf line 490, in resource "azurerm_virtual_machine_extension" "vmext": │ 490: resource "azurerm_virtual_machine_extension" "vmext" { │

rmcolbert commented 7 months ago

The challenge is that the "extension resource" is created successfully, but Terraform only creates the state object if the extension's provisioning status is successful.

The resource should be created as soon as a 200/201 response is received from the REST API. If, during the wait cycle, provisioning ends in a "Provisioning Failed" status, the TF resource should be flagged as tainted and stored in the state file as such. Then, on a subsequent apply, the failed extension would be removed and re-deployed.

q-yusufmahtab commented 2 months ago

We are still seeing this behaviour on provider version 3.116.0. Our module creates 3 VM extension resources, 1 of them passes while the other 2 throw similar error logs:

[2024-08-30T04:54:16.715Z] │ Error: creating/updating Extension (Subscription: "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
[2024-08-30T04:54:16.715Z] │ Resource Group Name: "xxxxx"
[2024-08-30T04:54:16.715Z] │ Virtual Machine Name: "az-release-2-7-x-jumpbox"
[2024-08-30T04:54:16.715Z] │ Extension Name: "AzureMonitorLinuxAgent"): polling after CreateOrUpdate: polling failed: the Azure API returned the following error:
[2024-08-30T04:54:16.715Z] │ 
[2024-08-30T04:54:16.715Z] │ Status: "RetryableError"
[2024-08-30T04:54:16.715Z] │ Code: ""
[2024-08-30T04:54:16.715Z] │ Message: "A retryable error occurred."
[2024-08-30T04:54:16.715Z] │ Activity Id: ""
[2024-08-30T04:54:16.715Z] │ 
[2024-08-30T04:54:16.716Z] │ ---
[2024-08-30T04:54:16.716Z] │ 
[2024-08-30T04:54:16.716Z] │ API Response:
[2024-08-30T04:54:16.716Z] │ 
[2024-08-30T04:54:16.716Z] │ ----[start]----
[2024-08-30T04:54:16.716Z] │ {
[2024-08-30T04:54:16.716Z] │   "startTime": "2024-08-30T04:38:32.1052565+00:00",
[2024-08-30T04:54:16.716Z] │   "endTime": "2024-08-30T04:38:33.1989923+00:00",
[2024-08-30T04:54:16.716Z] │   "status": "Failed",
[2024-08-30T04:54:16.716Z] │   "error": {
[2024-08-30T04:54:16.716Z] │     "code": "RetryableError",
[2024-08-30T04:54:16.716Z] │     "message": "A retryable error occurred."
[2024-08-30T04:54:16.716Z] │   },
[2024-08-30T04:54:16.716Z] │   "name": "c2876690-c24d-4aa1-85f2-fed7f3387c76"
[2024-08-30T04:54:16.716Z] │ }
[2024-08-30T04:54:16.716Z] │ -----[end]-----
[2024-08-30T04:54:16.716Z] │ 
[2024-08-30T04:54:16.716Z] │ 
[2024-08-30T04:54:16.716Z] │   with module.jumpbox[0].azurerm_virtual_machine_extension.azure_monitor_linux_agent,
[2024-08-30T04:54:16.716Z] │   on .terraform/modules/jumpbox/jumpbox/monitoring.tf line 3, in resource "azurerm_virtual_machine_extension" "azure_monitor_linux_agent":

[2024-08-30T04:54:16.716Z] │    3: resource "azurerm_virtual_machine_extension" "azure_monitor_linux_agent" {