hashicorp / terraform-provider-azurerm

Terraform provider for Azure Resource Manager
https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs
Mozilla Public License 2.0
4.54k stars 4.61k forks source link

Deadlock in azurerm_network_interface #16933

Closed jacksondaw closed 2 years ago

jacksondaw commented 2 years ago

Is there an existing issue for this?

Community Note

Terraform Version

1.2.1

AzureRM Provider Version

3.7.0

Affected Resource(s)/Data Source(s)

azurerm_network_interface, azurerm_subnet_network_security_group_association

Terraform Configuration Files

terraform {
  required_providers {
    azurerm = {
      source = "hashicorp/azurerm"
      version = "=3.7.0"
      #version = "3.4.0"
    }
  }
}

provider "azurerm" {
  features {}
}

resource "azurerm_resource_group" "example" {
  name     = "deadlock-resources"
  location = "Central US"
}

resource "azurerm_virtual_network" "example" {
  name                = "example-network"
  address_space       = ["10.0.0.0/16"]
  location            = azurerm_resource_group.example.location
  resource_group_name = azurerm_resource_group.example.name
}

resource "azurerm_subnet" "gateway" {
  name                 = "GatewaySubnet"
  resource_group_name  = azurerm_resource_group.example.name
  virtual_network_name = azurerm_virtual_network.example.name
  address_prefixes     = ["10.0.1.0/24"]
}

resource "azurerm_subnet" "example" {
  name                 = "frontend"
  resource_group_name  = azurerm_resource_group.example.name
  virtual_network_name = azurerm_virtual_network.example.name
  address_prefixes     = ["10.0.2.0/24"]
}

resource "azurerm_public_ip" "example" {
  name                = "test"
  location            = azurerm_resource_group.example.location
  resource_group_name = azurerm_resource_group.example.name

  allocation_method = "Dynamic"
}

resource "azurerm_virtual_network_gateway" "example" {
  name                = "test"
  location            = azurerm_resource_group.example.location
  resource_group_name = azurerm_resource_group.example.name

  type     = "Vpn"
  vpn_type = "RouteBased"

  active_active = false
  enable_bgp    = false
  sku           = "Basic"

  ip_configuration {
    name                          = "vnetGatewayConfig"
    public_ip_address_id          = azurerm_public_ip.example.id
    private_ip_address_allocation = "Dynamic"
    subnet_id                     = azurerm_subnet.gateway.id
  }

  vpn_client_configuration {
    address_space = ["10.2.0.0/24"]

    root_certificate {
      name = "DigiCert-Federated-ID-Root-CA"

      public_cert_data = <<EOF
MIIC5zCCAc+gAwIBAgIQYO1rZEYH35BF6lcB2wMGsTANBgkqhkiG9w0BAQsFADAW
MRQwEgYDVQQDDAtQMlNSb290Q2VydDAeFw0yMjAyMTExNzQ5MzJaFw0yMzAyMTEx
ODA5MzJaMBYxFDASBgNVBAMMC1AyU1Jvb3RDZXJ0MIIBIjANBgkqhkiG9w0BAQEF
AAOCAQ8AMIIBCgKCAQEAwxxgwHrji7GgS6zZPh/vfDn/qppvK0CKRnY7ybQa43L8
ppDZxKqF0AFKUCYdBNz+qTVsqS48J+X/kJUASpUFSq1xrzlXtAxgihfwuPlCEOEp
cwldEX6jbV7tCMI13uRZCpnQTD6k6kcKdcWuD3IQ9Z+LA4ia5yd3K3BtBqBbPKAn
MUsFAsFeQ0Y/sxaiuoDNw5Ru06tH2vO87kaV44HrnjUqKjG+9kBstLcu+hdqchI0
fNdu4tiQDlGaWN7TMM/umuBTB2l+Q39Bzc8k+RQEaYupNntUphlMejfmWHZvyVNf
ibeFRc2KbXW/6p08ktKjAW09secMTVG3++8LRvTGdQIDAQABozEwLzAOBgNVHQ8B
Af8EBAMCAgQwHQYDVR0OBBYEFGoPVsug2wAdztUuLbQcZhgdTZlPMA0GCSqGSIb3
DQEBCwUAA4IBAQB3mkyC5IypVxoswdqVbb71cTMK4T2WWBgD+wmY+8gcoy7L6ax3
pJ7CYGlEIFD8sjmf76Kdyhy2N4Basl88VHy/jzr9C1AV3SWtRUE1eqLuJzuBHHgB
VgLKHN52uxICjybTQhy2r4Pg4gNLcUJIBG915vGw9cVhxLmiAYbBknAfTBCSyfKT
ddlYKnkjkoWkLnfxjRtu1LpQa2yPoDgrldlSQpDQ6iUBNqZqKk0m3nM96xWLmUo1
+AMLHkS9yTRbLbCtGi33MMY69nAVSpJdr/8fi7+dDsjbQKLnS73/gaxgoN6I8k3f
SQLg/N2vWOGq4OJYtyo5VrVhIvdXzgy91q/3
EOF
    }

    revoked_certificate {
      name       = "Verizon-Global-Root-CA"
      thumbprint = "912198EEF23DCAC40939312FEE97DD560BAE49B1"
    }
  }
}

resource "azurerm_network_security_group" "example" {
  name                = "example-nsg"
  location            = azurerm_resource_group.example.location
  resource_group_name = azurerm_resource_group.example.name

  security_rule {
    name                       = "test123"
    priority                   = 100
    direction                  = "Inbound"
    access                     = "Allow"
    protocol                   = "Tcp"
    source_port_range          = "*"
    destination_port_range     = "*"
    source_address_prefix      = "*"
    destination_address_prefix = "*"
  }
}

resource "azurerm_route_table" "example" {
  name                = "example-routetable"
  location            = azurerm_resource_group.example.location
  resource_group_name = azurerm_resource_group.example.name

  route {
    name                   = "example"
    address_prefix         = "10.100.0.0/14"
    next_hop_type          = "VirtualAppliance"
    next_hop_in_ip_address = "10.10.1.1"
  }
}

resource "azurerm_subnet_route_table_association" "example" {
  subnet_id      = azurerm_subnet.example.id
  route_table_id = azurerm_route_table.example.id

  timeouts {
    create = "1h30m"
    update ="2h"
    delete = "20m"
  }
}

resource "azurerm_subnet_network_security_group_association" "example" {
  subnet_id                 = azurerm_subnet.example.id
  network_security_group_id = azurerm_network_security_group.example.id

  timeouts {
    create = "1h30m"
    update ="2h"
    delete = "20m"
  }
}

resource "azurerm_network_interface" "example" {
  name                = "example-nic"
  location            = azurerm_resource_group.example.location
  resource_group_name = azurerm_resource_group.example.name

  timeouts {
    create = "1h30m"
    update ="2h"
    delete = "20m"
  }

  ip_configuration {
    name                          = "internal"
    subnet_id                     = azurerm_subnet.example.id
    private_ip_address_allocation = "Dynamic"
  }
}

Debug Output/Panic Output

After the gateway is created the creation of the azurerm_subnet_network_security_group_association, azurerm_network_interface and azurerm_subnet_route_table_association continue until the apply times out. 

azurerm_subnet_route_table_association.example: Still creating... [39m22s elapsed]
azurerm_virtual_network_gateway.example: Still creating... [39m32s elapsed]
azurerm_subnet_route_table_association.example: Still creating... [39m32s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [39m32s elapsed]
azurerm_network_interface.example: Still creating... [39m32s elapsed]
azurerm_virtual_network_gateway.example: Still creating... [39m42s elapsed]
azurerm_virtual_network_gateway.example: Creation complete after 39m44s [id=/subscriptions/29595f1f-cd48-4f1b-81e3-5c78e48ca974/resourceGroups/deadlock-resources/providers/Microsoft.Network/virtualNetworkGateways/test]
azurerm_network_interface.example: Still creating... [39m42s elapsed]
azurerm_subnet_route_table_association.example: Still creating... [39m42s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [39m42s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [39m52s elapsed]
azurerm_subnet_route_table_association.example: Still creating... [39m52s elapsed]
azurerm_network_interface.example: Still creating... [39m52s elapsed]
azurerm_network_interface.example: Still creating... [40m2s elapsed]
azurerm_subnet_route_table_association.example: Still creating... [40m2s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [40m2s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [40m12s elapsed]
azurerm_network_interface.example: Still creating... [40m12s elapsed]
azurerm_subnet_route_table_association.example: Still creating... [40m12s elapsed]
azurerm_subnet_route_table_association.example: Still creating... [40m22s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [40m22s elapsed]
azurerm_network_interface.example: Still creating... [40m22s elapsed]
azurerm_network_interface.example: Still creating... [40m32s elapsed]
azurerm_subnet_route_table_association.example: Still creating... [40m32s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [40m32s elapsed]
azurerm_subnet_route_table_association.example: Still creating... [40m42s elapsed]
azurerm_network_interface.example: Still creating... [40m42s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [40m42s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [40m52s elapsed]
azurerm_network_interface.example: Still creating... [40m52s elapsed]
azurerm_subnet_route_table_association.example: Still creating... [40m52s elapsed]
azurerm_network_interface.example: Still creating... [41m2s elapsed]

Expected Behaviour

All resources are created succcessfully.

Actual Behaviour

A deadlock is formed, and there is contention for the vnet and subnet locks in the azurerm_subnet_network_security_group_association and azurerm_network_interface modules.

I believe the subnet and vNet locks needs to be switched here: https://github.com/hashicorp/terraform-provider-azurerm/blob/697470d993d05167681d2a0f1ec68b9f27f82740/internal/services/network/network_interface_locking.go#L59

Steps to Reproduce

No response

Important Factoids

No response

References

No response

magodo commented 2 years ago

@jacksondaw Thank you for submitting this 👍

Looking through the locking order for the affected three resources, they are as below:

  1. azurerm_network_interface: NIC -> VNET -> SUBNET (link)
  2. azurerm_subnet_network_security_group_association: NSG -> VNET -> SUBNET (link)
  3. azurerm_subnet_route_table_association: ROUTE_TABLE -> VNET (link)

From above, the locking order seems consistent. The only suspicion I can tell is the order below might be inconsistent for multiple resources to lock: https://github.com/hashicorp/terraform-provider-azurerm/blob/697470d993d05167681d2a0f1ec68b9f27f82740/internal/locks/lock.go#L19-L21

But this is not your case.

The potential reason for your case might be the resource who firstly holds the lock of VNET got blocked somehow (probably in the API, e.g. waiting for some state). Could you please enable the debug log and re-produce this issue, and check whether there is API calls at the very end of the provision?

jacksondaw commented 2 years ago

Hi @magodo,

Thanks for the reply, been trying all day to get a gateway built but it keeps timing out before it is successful. I'll try a few more times and upload the logs when I get them. What I can't figure out is why the azurerm_virtual_network_gateway is blocking the other operations? In the graph it is on a different branch and there are no locks in the code.

In my repro I believe the locking order goes

azurerm_virtual_network_gateway-> blocks operations on the VNet

azurerm_subnet_network_security_group_association -> first inline for VNet lock

azurerm_network_interface -> obtains subnet lock, second in line for VNet lock

azurerm_virtual_network_gateway-> releases operations on the Vnet

azurerm_subnet_network_security_group_association -> obtains lock for VNet, waiting on lock for subnet.

Dead locked between azurerm_network_interface waiting on VNet lock and azurerm_subnet_network_security_group_association waiting for subnet lock.

jacksondaw commented 2 years ago

@magodo,

Do you want the whole TF log uploaded? Below is the last call in the TF_LOG no further calls are made to the management.azure.com api.

Terminal Ouput

azurerm_subnet_network_security_group_association.example: Still creating... [34m42s elapsed]
azurerm_virtual_network_gateway.example: Still creating... [34m41s elapsed]
azurerm_virtual_network_gateway.example: Creation complete after 34m45s [id=/subscriptions/29595f1f-cd48-4f1b-81e3-5c78e48ca974/resourceGroups/deadlock-resources/providers/Microsoft.Network/virtualNetworkGateways/test]
azurerm_subnet_network_security_group_association.example: Still creating... [34m52s elapsed]
azurerm_network_interface.example: Still creating... [34m52s elapsed]
azurerm_network_interface.example: Still creating... [35m2s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [35m2s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [35m12s elapsed]
azurerm_network_interface.example: Still creating... [35m12s elapsed]
azurerm_network_interface.example: Still creating... [35m22s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [35m22s elapsed]
azurerm_network_interface.example: Still creating... [35m32s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [35m32s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [35m42s elapsed]
azurerm_network_interface.example: Still creating... [35m42s elapsed]
azurerm_network_interface.example: Still creating... [35m52s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [35m52s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [36m2s elapsed]
azurerm_network_interface.example: Still creating... [36m2s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [36m12s elapsed]
azurerm_network_interface.example: Still creating... [36m12s elapsed]
azurerm_network_interface.example: Still creating... [36m22s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [36m22s elapsed]
azurerm_subnet_network_security_group_association.example: Still creating... [36m32s elapsed]
azurerm_network_interface.example: Still creating... [36m32s elapsed]
azurerm_network_interface.example: Still creating... [36m42s elapsed]
...

TF_LOG_PATH output

GET /subscriptions/29595f1f-cd48-4f1b-81e3-5c78e48ca974/providers/Microsoft.Network/locations/centralus/operations/59cf0c8c-49b7-4f41-bb54-85606833087a?api-version=2021-08-01 HTTP/1.1
Host: management.azure.com
User-Agent: Go/go1.18.1 (amd64-windows) go-autorest/v14.2.1 Azure-SDK-For-Go/v64.0.0 network/2021-08-01 HashiCorp Terraform/1.1.8 (+https://www.terraform.io) Terraform Plugin SDK/2.10.1 terraform-provider-azurerm/dev pid-222c6c49-1b0a-5959-a213-6608f9eb8820
X-Ms-Correlation-Request-Id: 0451808e-edf8-a3cc-92b5-1bd458cf62cd
Accept-Encoding: gzip: timestamp=2022-05-26T17:04:07.260-0500
2022-05-26T17:04:07.323-0500 [DEBUG] provider.terraform-provider-azurerm_v3.7.0_x5.exe: AzureRM Response for https://management.azure.com/subscriptions/29595f1f-cd48-4f1b-81e3-5c78e48ca974/providers/Microsoft.Network/locations/centralus/operations/59cf0c8c-49b7-4f41-bb54-85606833087a?api-version=2021-08-01: 
HTTP/2.0 200 OK
Cache-Control: no-cache
Content-Type: application/json; charset=utf-8
Date: Thu, 26 May 2022 22:04:08 GMT
Expires: -1
Pragma: no-cache
Server: Microsoft-HTTPAPI/2.0
Server: Microsoft-HTTPAPI/2.0
Strict-Transport-Security: max-age=31536000; includeSubDomains
Vary: Accept-Encoding
X-Content-Type-Options: nosniff
X-Ms-Arm-Service-Request-Id: a3755228-94c2-4cef-ba75-d091c145569c
X-Ms-Correlation-Request-Id: 0451808e-edf8-a3cc-92b5-1bd458cf62cd
X-Ms-Ratelimit-Remaining-Subscription-Reads: 11982
X-Ms-Request-Id: 62fafdfd-54e2-4dbf-8ff3-621c8a95e384
X-Ms-Routing-Request-Id: CENTRALUS:20220526T220408Z:94e0a16c-3066-467d-a7e1-ed188af2f753

{
  "status": "Succeeded"
}: timestamp=2022-05-26T17:04:07.323-0500
2022-05-26T17:04:07.324-0500 [DEBUG] provider.terraform-provider-azurerm_v3.7.0_x5.exe: AzureRM Request: 
GET /subscriptions/29595f1f-cd48-4f1b-81e3-5c78e48ca974/resourceGroups/deadlock-resources/providers/Microsoft.Network/virtualNetworkGateways/test?api-version=2021-08-01 HTTP/1.1
Host: management.azure.com
User-Agent: Go/go1.18.1 (amd64-windows) go-autorest/v14.2.1 Azure-SDK-For-Go/v64.0.0 network/2021-08-01 HashiCorp Terraform/1.1.8 (+https://www.terraform.io) Terraform Plugin SDK/2.10.1 terraform-provider-azurerm/dev pid-222c6c49-1b0a-5959-a213-6608f9eb8820
X-Ms-Correlation-Request-Id: 0451808e-edf8-a3cc-92b5-1bd458cf62cd
Accept-Encoding: gzip: timestamp=2022-05-26T17:04:07.323-0500
2022-05-26T17:04:08.105-0500 [DEBUG] provider.terraform-provider-azurerm_v3.7.0_x5.exe: AzureRM Response for https://management.azure.com/subscriptions/29595f1f-cd48-4f1b-81e3-5c78e48ca974/resourceGroups/deadlock-resources/providers/Microsoft.Network/virtualNetworkGateways/test?api-version=2021-08-01: 
HTTP/2.0 200 OK
Cache-Control: no-cache
...
magodo commented 2 years ago

@jacksondaw

I've tested locally with above config, and it succeeded after 1h4m55s. The reason for this long time is not due to deadlock (as it won't end), but the API behavior.

The point when vnet gateway is provisioning, it seems changes the state of the vnet which contains the gateway subnet (as well as the frontend subnet) from Succeeded to Updating, due to the PutVirtualNetworkGatewayOperation. This state keeps until the vnet gateway is successfully created. During this duration, the association resource (either the subnet-nsg or subnet-routetable) is also trying to create and invoke a PUT API against the frontend subnet, that PUT fails due to the unexpected state of its containing vnet (the vnet gw is provisioning):

{
  "error": {
    "code": "RetryableError",
    "message": "A retryable error occurred.",
    "details": [
      {
        "code": "ReferencedResourceNotProvisioned",
        "message": "Cannot proceed with operation because resource /subscriptions/****/resourceGroups/deadlock-resources-mgd/providers/Microsoft.Network/virtualNetworkGateways/test used by resource /subscriptions/****/resourceGroups/deadlock-resources-mgd/providers/Microsoft.Network/virtualNetworks/example-network/subnets/frontend is not in Succeeded state. Resource is in Updating state and the last operation that updated/is updating the resource is PutVirtualNetworkGatewayOperation."
      }
    ]
  }
}

The response has status code 429 and has no retry-after in the header. The Go SDK will then keep retry, in a exponetially backoff mananer. The last retry at my side takes 32 min (doubles in last retry interval: 16 min).

jacksondaw commented 2 years ago

I see my repro also ran successfully after a similar retry timeout of 32 minutes. I have a more complex CI/CD pipeline that appears to behave similarly only it never completes creating multiple NICs and Security Group Associations. The timeout there is >2 hours and 30 minutes attempting to create the NICs and NSGAs. I'll try to capture the Logs there bring those findings back here.

github-actions[bot] commented 2 years ago

I'm going to lock this issue because it has been closed for 30 days ⏳. This helps our maintainers find and focus on the active issues. If you have found a problem that seems similar to this, please open a new issue and complete the issue template so we can capture all the details necessary to investigate further.