microsoft / service-fabric

Service Fabric is a distributed systems platform for packaging, deploying, and managing stateless and stateful distributed applications and containers at large scale.
https://docs.microsoft.com/en-us/azure/service-fabric/
MIT License
3.03k stars 399 forks source link

[BUG] -Linux VmScaletSet node type with NSG not available to cluster #1313

Open fuocor opened 2 years ago

fuocor commented 2 years ago

Describe the bug Defining a linux cluster that includes generic network security group seems to prevent the scaleset nodes from registering with the cluster with no apparent errors.

The logs on the node indicate that ServiceFabric extension started fine along with everything else. Removing the nsg from the subet for the nodes resolves the issue

here is the bicep for the nodetype

param adminPassword string
param adminUserName string
param certificateStoreLocation string
param clusterName string
param httpsCertificateThumbprint string
param monitoredCertificateList array
param storageAccountType string

param nodeType object = {
  applicationEndPort: 30000
  applicationStartPort: 20000
  durabilityLevel: ''
  ephemeralEndPort: 65534
  ephemeralStartPort: 49152
  fabricHttpGatewayPort: 19080
  fabricTcpGatewayPort: 19000
  instanceCount: 5
  multipleAvailabilityZones: false
  reverseProxyEndpointPort:  19081
  typeName: ''
  vmImageOffer: ''
  vmImagePublisher: ''
  vmImageSku: ''
  vmImageVersion: ''
  vmNodeTypeSize: ''
}

var addressPrefix = '10.10.0.0/16'
var backendAddressPoolName = 'LoadBalancerBEAddressPool'
var certificateStoreName = 'My'
var fabricHttpGatewayProbeName = 'FabricHttpGatewayProbe'
var fabricHttpsPort =  443
var fabricHttpsProbeName = 'FabricHttpsProbe'
var fabricStorageAccountName = '${take(clusterName, 17)}sf'
var fabricTcpGatewayProbeName = 'FabricGatewayProbe'
var frontendIPConfigurationName = 'LoadBalancerIPConfig'
var loadBalancerName = '${clusterName}-lb-${nodeType.typeName}'
var networkSecurityGroupName = toLower('${clusterName}-nsg-${nodeType.typeName}')
var nicName = 'nic'
var overProvision = false
var subnetName = 'subnet-${nodeType.typeName}'
var subnetPrefix = '10.10.0.0/24'
var virtualNetworkName = toLower('${clusterName}-vNet-${nodeType.typeName}')
var wadcfgxend = '"><MetricAggregation scheduledTransferPeriod="PT1H"/><MetricAggregation scheduledTransferPeriod="PT1M"/></Metrics></DiagnosticMonitorConfiguration></WadCfg>'
var wadcfgxstart = '${wadlogs}${wadperfcounters1}${wadperfcounters2}<Metrics resourceId="'
var wadlogs = '<WadCfg><DiagnosticMonitorConfiguration>'
var wadmetricsresourceid = '/subscriptions/${subscription().subscriptionId}/resourceGroups/${resourceGroup().name}/providers/Microsoft.Compute/virtualMachineScaleSets/${nodeType.typeName}'
var wadperfcounters1 = '<PerformanceCounters scheduledTransferPeriod="PT1M"><PerformanceCounterConfiguration counterSpecifier="\\Memory\\AvailableMemory" sampleRate="PT15S" unit="Bytes"><annotation displayName="Memory available" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\Memory\\PercentAvailableMemory" sampleRate="PT15S" unit="Percent"><annotation displayName="Mem. percent available" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\Memory\\UsedMemory" sampleRate="PT15S" unit="Bytes"><annotation displayName="Memory used" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\Memory\\PercentUsedMemory" sampleRate="PT15S" unit="Percent"><annotation displayName="Memory percentage" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\Memory\\PercentUsedByCache" sampleRate="PT15S" unit="Percent"><annotation displayName="Mem. used by cache" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\Processor\\PercentIdleTime" sampleRate="PT15S" unit="Percent"><annotation displayName="CPU idle time" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\Processor\\PercentUserTime" sampleRate="PT15S" unit="Percent"><annotation displayName="CPU user time" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\Processor\\PercentProcessorTime" sampleRate="PT15S" unit="Percent"><annotation displayName="CPU percentage guest OS" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\Processor\\PercentIOWaitTime" sampleRate="PT15S" unit="Percent"><annotation displayName="CPU IO wait time" locale="en-us"/></PerformanceCounterConfiguration>'
var wadperfcounters2 = '<PerformanceCounterConfiguration counterSpecifier="\\PhysicalDisk\\BytesPerSecond" sampleRate="PT15S" unit="BytesPerSecond"><annotation displayName="Disk total bytes" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\PhysicalDisk\\ReadBytesPerSecond" sampleRate="PT15S" unit="BytesPerSecond"><annotation displayName="Disk read guest OS" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\PhysicalDisk\\WriteBytesPerSecond" sampleRate="PT15S" unit="BytesPerSecond"><annotation displayName="Disk write guest OS" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\PhysicalDisk\\TransfersPerSecond" sampleRate="PT15S" unit="CountPerSecond"><annotation displayName="Disk transfers" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\PhysicalDisk\\ReadsPerSecond" sampleRate="PT15S" unit="CountPerSecond"><annotation displayName="Disk reads" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\PhysicalDisk\\WritesPerSecond" sampleRate="PT15S" unit="CountPerSecond"><annotation displayName="Disk writes" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\PhysicalDisk\\AverageReadTime" sampleRate="PT15S" unit="Seconds"><annotation displayName="Disk read time" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\PhysicalDisk\\AverageWriteTime" sampleRate="PT15S" unit="Seconds"><annotation displayName="Disk write time" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\PhysicalDisk\\AverageTransferTime" sampleRate="PT15S" unit="Seconds"><annotation displayName="Disk transfer time" locale="en-us"/></PerformanceCounterConfiguration><PerformanceCounterConfiguration counterSpecifier="\\PhysicalDisk\\AverageDiskQueueLength" sampleRate="PT15S" unit="Count"><annotation displayName="Disk queue length" locale="en-us"/></PerformanceCounterConfiguration></PerformanceCounters>'

resource keyVault 'Microsoft.KeyVault/vaults@2021-06-01-preview' existing = { 
  name: clusterName
}

resource clusterHttpsCertificate 'Microsoft.KeyVault/vaults/secrets@2021-06-01-preview' existing = {
  parent: keyVault
  name: 'ChorusHttpsCertificate'
}

resource serviceFabricCluster 'Microsoft.ServiceFabric/clusters@2021-06-01' existing = {
  name: clusterName
}

resource sshPublicKey 'Microsoft.Compute/sshPublicKeys@2021-07-01' existing = {
  name: '${clusterName}-ssh'
}

resource fabricStorageAccount 'Microsoft.Storage/storageAccounts@2021-06-01' existing = {
  name: fabricStorageAccountName
}

resource userAssignedIdenity 'Microsoft.ManagedIdentity/userAssignedIdentities@2018-11-30' existing = {
  name: clusterName
}

resource virtualNetwork 'Microsoft.Network/virtualNetworks@2021-05-01' = {
  name: virtualNetworkName
  location: resourceGroup().location
  tags: {
    resourceType: 'Service Fabric'
    clusterName: clusterName
  }
  properties: {
    addressSpace: {
      addressPrefixes: [
        addressPrefix
      ]
    }
    subnets: [
      {
        name: subnetName
        properties: {
          addressPrefix: subnetPrefix
          networkSecurityGroup: {
            id: networkSecurityGroup.id
          }
        }
      }
    ]
  }
}

resource publicIPAddresses 'Microsoft.Network/publicIPAddresses@2021-05-01' = {
  name: '${clusterName}-publicIP'
  location: resourceGroup().location
  tags: {
    resourceType: 'Service Fabric'
    clusterName: clusterName
  }
  properties: {
    // ddosSettings:{
    //   protectedIP: true
    //   protectionCoverage: 'Standard'
    // }
    dnsSettings: {
      domainNameLabel: clusterName
    }
    publicIPAllocationMethod: 'Static'
  }
  sku:{
    name: 'Standard'
    tier: 'Regional'
  }
}

resource networkSecurityGroup 'Microsoft.Network/networkSecurityGroups@2021-05-01' = {
  name: networkSecurityGroupName
  location: resourceGroup().location
  properties: {
    securityRules: [
      {
        name: 'allowSvcFabSMB'
        properties: {
          access: 'Allow'
          destinationAddressPrefix: '*'
          destinationPortRange: '445'
          direction: 'Inbound'
          priority: 3950
          protocol: '*'
          sourceAddressPrefix: 'VirtualNetwork'
          sourcePortRange: '*'
          description: 'allow SMB traffic within the net, used by fabric to move packages around'
        }
      }
      {
        name: 'allowSvcFabCluser'
        properties: {
          access: 'Allow'
          destinationAddressPrefix: '*'
          destinationPortRange: '1025-1027'
          direction: 'Inbound'
          priority: 3920
          protocol: '*'
          sourceAddressPrefix: 'VirtualNetwork'
          sourcePortRange: '*'
          description: 'allow ports within vnet that are used by the fabric to talk between nodes'
        }
      }
      {
        name: 'allow-${nodeType.typeName}-ephemeral'
        properties: {
          access: 'Allow'
          destinationAddressPrefix: '*'
          destinationPortRange: '${nodeType.ephemeralStartPort}-${nodeType.ephemeralEndPort}'
          direction: 'Inbound'
          priority: 3930
          protocol: '*'
          sourceAddressPrefix: 'VirtualNetwork'
          sourcePortRange: '*'
          description: 'allow fabric ephemeral ports within the vnet'
        }
      }
      {
        name: 'allow-${nodeType.typeName}-portal'
        properties: {
          access: 'Allow'
          destinationAddressPrefix: '*'
          destinationPortRange: '${nodeType.fabricHttpGatewayPort}'
          direction: 'Inbound'
          priority: 3900
          protocol: '*'
          sourceAddressPrefix: '*'
          sourcePortRange: '*'
          description: 'allow port used to access the fabric cluster web portal'
        }
      }
      {
        name: 'allow-${nodeType.typeName}-client'
        properties: {
          access: 'Allow'
          destinationAddressPrefix: '*'
          destinationPortRange: '${nodeType.fabricTcpGatewayPort}'
          direction: 'Inbound'
          priority: 3910
          protocol: '*'
          sourceAddressPrefix: '*'
          sourcePortRange: '*'
          description: 'allow port used by the fabric client (includes powershell)'
        }
      }
      {
        name: 'allow-${nodeType.typeName}-application'
        properties: {
          access: 'Allow'
          destinationAddressPrefix: '*'
          destinationPortRange: '${nodeType.applicationStartPort}-${nodeType.applicationEndPort}'
          direction: 'Inbound'
          priority: 3940
          protocol: '*'
          sourceAddressPrefix: '*'
          sourcePortRange: '*'
          description: 'allow fabric application ports within the vnet'
        }
      }
      {
        name: 'allow-${nodeType.typeName}-Https'
        properties: {
          access: 'Allow'
          destinationAddressPrefix: '*'
          destinationPortRange: '${fabricHttpsPort}'
          direction: 'Inbound'
          priority: 2002
          protocol: '*'
          sourceAddressPrefix: '*'
          sourcePortRange: '*'
          description: 'allow port used to access https/wss'
        }
      }
      {
        name: 'blockAll'
        properties: {
          access: 'Deny'
          destinationAddressPrefix: '*'
          destinationPortRange: '*'
          direction: 'Inbound'
          priority: 4095
          protocol: '*'
          sourceAddressPrefix: '*'
          sourcePortRange: '*'
          description: 'block all traffic except what we\'ve explicitly allowed'
        }
      }
      {
        name: 'allow-${nodeType.typeName}-ssh'
        properties: {
          access: 'Allow'
          destinationAddressPrefix: '*'
          destinationPortRange: '${loadBalancer.properties.inboundNatPools[0].properties.frontendPortRangeStart}'
          direction: 'Inbound'
          priority: 3001
          protocol: '*'
          sourceAddressPrefix: '*'
          sourcePortRange: '*'
          description: 'allow SSH within the VNet'
        }
      }
      {
        name: 'allow-${nodeType.typeName}-reverseProxy'
        properties: {
          access: 'Allow'
          destinationAddressPrefix: '*'
          destinationPortRange: '${nodeType.reverseProxyEndpointPort}'
          direction: 'Inbound'
          priority: 3980
          protocol: '*'
          sourceAddressPrefix: '*'
          sourcePortRange: '*'
          description: 'allow port used to access the fabric cluster using reverse proxy'
        }
      }
    ]
  }
  tags: {
    resourceType: 'Service Fabric'
    clusterName: clusterName
    displayName: 'Network Security Group'
  }
}

resource loadBalancer 'Microsoft.Network/loadBalancers@2021-05-01' = {
  name: loadBalancerName
  location: resourceGroup().location
  tags: {
    resourceType: 'Service Fabric'
    clusterName: clusterName
  }
  properties: {
    frontendIPConfigurations: [
      {
        name: frontendIPConfigurationName
        properties: {
          publicIPAddress: {
            id: publicIPAddresses.id
          }
        }
      }
    ]
    backendAddressPools: [
      {
        name: backendAddressPoolName
        properties: {}
      }
    ]
    loadBalancingRules: [
      {
        name: 'FabricTcpGateway'
        properties: {
          backendAddressPool: {
            id: resourceId('Microsoft.Network/loadBalancers/backendAddressPools', loadBalancerName, backendAddressPoolName)
          }
          backendPort: nodeType.fabricTcpGatewayPort
          enableFloatingIP: false
          frontendIPConfiguration: {
            id: resourceId('Microsoft.Network/loadBalancers/frontendIPConfigurations', loadBalancerName, frontendIPConfigurationName)
          }
          frontendPort: nodeType.fabricTcpGatewayPort
          idleTimeoutInMinutes: 5
          probe: {
            id: resourceId('Microsoft.Network/loadBalancers/probes', loadBalancerName, fabricTcpGatewayProbeName)
          }
          protocol: 'Tcp'
        }
      }
      {
        name: 'FabricHttpGateway'
        properties: {
          backendAddressPool: {
            id: resourceId('Microsoft.Network/loadBalancers/backendAddressPools', loadBalancerName, backendAddressPoolName)
          }
          backendPort: nodeType.fabricHttpGatewayPort
          enableFloatingIP: false
          frontendIPConfiguration: {
            id: resourceId('Microsoft.Network/loadBalancers/frontendIPConfigurations', loadBalancerName, frontendIPConfigurationName)
          }
          frontendPort: nodeType.fabricHttpGatewayPort
          idleTimeoutInMinutes: 5
          probe: {
            id: resourceId('Microsoft.Network/loadBalancers/probes', loadBalancerName, fabricHttpGatewayProbeName)
          }
          protocol: 'Tcp'
        }
      }
      {
        name: 'fabricHttpsRule'
        properties: {
          backendAddressPool: {
            id: resourceId('Microsoft.Network/loadBalancers/backendAddressPools', loadBalancerName, backendAddressPoolName)
          }
          backendPort: fabricHttpsPort
          enableFloatingIP: false
          frontendIPConfiguration: {
            id: resourceId('Microsoft.Network/loadBalancers/frontendIPConfigurations', loadBalancerName, frontendIPConfigurationName)
          }
          frontendPort: fabricHttpsPort
          idleTimeoutInMinutes: 5
          probe: {
            id: resourceId('Microsoft.Network/loadBalancers/probes', loadBalancerName, fabricHttpsProbeName)
          }
          protocol: 'Tcp'
        }
      }        
    ]
    probes: [
      {
        name: fabricTcpGatewayProbeName
        properties: {
          intervalInSeconds: 5
          numberOfProbes: 2
          port: nodeType.fabricTcpGatewayPort
          protocol: 'Tcp'
        }
      }
      {
        name: fabricHttpGatewayProbeName
        properties: {
          intervalInSeconds: 5
          numberOfProbes: 2
          port: nodeType.fabricHttpGatewayPort
          protocol: 'Tcp'
        }
      }
      {
        name: fabricHttpsProbeName
        properties: {
          intervalInSeconds: 5
          numberOfProbes: 2
          port: fabricHttpsPort
          protocol: 'Tcp'
        }
      }
    ]
    inboundNatPools: [
      {
        name: 'SSH0'
        properties: {
          backendPort: 22
          frontendIPConfiguration: {
            id: resourceId('Microsoft.Network/loadBalancers/frontendIPConfigurations', loadBalancerName, frontendIPConfigurationName)
          }
          frontendPortRangeEnd: 50999
          frontendPortRangeStart: 50000
          protocol: 'Tcp'
        }
      }
      {
        name: 'SSH1'
        properties: {
          backendPort: 22
          frontendIPConfiguration: {
            id: resourceId('Microsoft.Network/loadBalancers/frontendIPConfigurations', loadBalancerName, frontendIPConfigurationName)
          }
          frontendPortRangeEnd: 60999
          frontendPortRangeStart: 60000
          protocol: 'Tcp'
        }
      }
    ]
  }
  sku: {
    name: 'Standard'
    tier: 'Regional'
  }
}

resource virtualMachineScaleSet 'Microsoft.Compute/virtualMachineScaleSets@2021-07-01' = {
  name: nodeType.typeName
  location: resourceGroup().location
  identity: {
    type: 'UserAssigned'
    userAssignedIdentities: {
      '${userAssignedIdenity.id}': {}
    }
  }
  tags: {
    resourceType: 'Service Fabric'
    clusterName: clusterName
  }
  sku: {
    name: nodeType.vmNodeTypeSize
    capacity: nodeType.instanceCount
    tier: 'Standard'
  }
  properties: {
    overprovision: overProvision
    upgradePolicy: {
      mode: 'Automatic'
      automaticOSUpgradePolicy:{
        enableAutomaticOSUpgrade: true
      }
    }
    singlePlacementGroup: true
    // zoneBalance: true
    platformFaultDomainCount: 5
    virtualMachineProfile: {
      extensionProfile: {
        extensions: [
          {
            name: 'ServiceFabricLinuxNode'
            properties: {
              type: 'ServiceFabricLinuxNode'
              autoUpgradeMinorVersion: true
              protectedSettings: {
                StorageAccountKey1: fabricStorageAccount.listKeys().keys[0].value
                StorageAccountKey2: fabricStorageAccount.listKeys().keys[1].value
              }
              publisher: 'Microsoft.Azure.ServiceFabric'
              settings: {
                clusterEndpoint: serviceFabricCluster.properties.clusterEndpoint
                nodeTypeRef: nodeType.typeName
                durabilityLevel: nodeType.durabilityLevel
                enableParallelJobs: true
                nicPrefixOverride: subnetPrefix
                certificate: {
                  thumbprint: httpsCertificateThumbprint
                  x509StoreName: certificateStoreName
                }
              }
              typeHandlerVersion: '1.1'
            }
          }
          {
            name: 'KeyVaultForLinux'
            properties: {
              enableAutomaticUpgrade: true
              autoUpgradeMinorVersion: true
              publisher: 'Microsoft.Azure.KeyVault'
              settings: {
                  authenticationSettings: {
                    msiClientId: userAssignedIdenity.properties.clientId
                  }
                  secretsManagementSettings: {
                  pollingIntervalInS: '1800'
                  certificateStoreName: certificateStoreName
                  certificateStoreLocation: certificateStoreLocation
                  observedCertificates: monitoredCertificateList
                  requireInitialSync: true
                  linkOnRenewal: true
                }
              }
              type: 'KeyVaultForLinux'
              typeHandlerVersion: '2.0'
            }
          }
          {
            name: 'LinuxDiagnostic'
            properties: {
              autoUpgradeMinorVersion: true
              type: 'LinuxDiagnostic'
              typeHandlerVersion: '2.3'
              protectedSettings: {
                storageAccountName: fabricStorageAccountName
                storageAccountKey: fabricStorageAccount.listKeys().keys[0].value
                storageAccountEndPoint: 'https://${environment().suffixes.storage}'
              }
              publisher: 'Microsoft.OSTCExtensions'
              settings: {
                xmlCfg: base64('${wadcfgxstart}${wadmetricsresourceid}${wadcfgxend}')
                StorageAccount: fabricStorageAccountName
              }
            }
          }
        ]
      }
      networkProfile: {
        networkInterfaceConfigurations: [
          {
            name: '${nicName}-0'
            properties: {
              ipConfigurations: [
                {
                  name: '${nicName}-0'
                  properties: {
                    loadBalancerBackendAddressPools: [
                      {
                        id: loadBalancer.properties.backendAddressPools[0].id
                      }
                    ]
                    loadBalancerInboundNatPools: [
                      {
                        id: loadBalancer.properties.inboundNatPools[0].id
                      }
                    ]
                    subnet: {
                      id: virtualNetwork.properties.subnets[0].id
                    }
                  }
                }
              ]
              primary: true
            }
          }
        ]
      }
      osProfile: {
        adminPassword: adminPassword
        adminUsername: adminUserName
        computerNamePrefix: nodeType.typeName
        secrets: [
          {
            sourceVault: {
              id: keyVault.id
            }
            vaultCertificates: [
              {
                certificateUrl: clusterHttpsCertificate.properties.secretUriWithVersion
              }
            ]
          }
        ]
      }
      storageProfile: {
        imageReference: {
          publisher: nodeType.vmImagePublisher
          offer: nodeType.vmImageOffer
          sku: nodeType.vmImageSku
          version: nodeType.vmImageVersion
        }
        osDisk: {
          caching: 'ReadOnly'
          createOption: 'FromImage'
          managedDisk: {
            storageAccountType: storageAccountType
          }
        }
      }
    }
  }
}

resource serviceFabricClusterNodeType 'Microsoft.ServiceFabric/managedClusters/nodeTypes@2021-11-01-preview' existing = {
  name: clusterName
}
output virtualMachineScaleSet object = virtualMachineScaleSet

Area/Component: Cluster creation

To Reproduce Steps to reproduce the behavior:

  1. Create a linux cluster with a nsg

Expected behavior See the nodes

Observed behavior:

Screenshots image image

Service Fabric Runtime Version: 8.2.1124.1

Environment:


Assignees: /cc @microsoft/service-fabric-triage

fuocor commented 2 years ago

configuring the NSG not to block traffic on the VirtualNetwork works around the issue.

      {
        name: toLower('deny-${nodeType.typeName}-all-internet')
        properties: {
          access: 'Deny'
          destinationAddressPrefix: '*'
          destinationPortRange: '*'
          direction: 'Inbound'
          priority: 4095
          protocol: '*'
          sourceAddressPrefix: 'Internet'
          sourcePortRange: '*'
          description: 'Block all traffic except what we\'ve explicitly allowed'
        }
      }