MicrosoftDocs / azure-docs

Open source documentation of Microsoft Azure
https://docs.microsoft.com/azure
Creative Commons Attribution 4.0 International
10.01k stars 21k forks source link

Clarification for Legacy Container insights metric alerts #122190

Closed vimalkumarkada closed 1 week ago

vimalkumarkada commented 2 weeks ago

Hi,

We recently created a few metric-based alerts for our AKS cluster and would like some clarification about the retirement. All of these were first created from the UI, to make sure they are supported, and then moved to our deployment module. Can you please confirm that they will not break on 31 May 2024?

  1. {
            "type": "microsoft.insights/metricAlerts",
            "apiVersion": "2018-03-01",
            "name": "DIsk space usage",
            "location": "global",
            "dependsOn": [
                "xxxxx"
            ],
            "properties": {
                "severity": 4,
                "enabled": true,
                "scopes": [
                    "xxxx"
                ],
                "evaluationFrequency": "PT1M",
                "windowSize": "PT5M",
                "criteria": {
                    "allOf": [
                        {
                            "threshold": 80,
                            "name": "Metric1",
                            "metricNamespace": "Microsoft.ContainerService/managedclusters",
                            "metricName": "node_disk_usage_percentage",
                            "operator": "GreaterThan",
                            "timeAggregation": "Average",
                            "skipMetricValidation": false,
                            "criterionType": "StaticThresholdCriterion"
                        }
                    ],
                    "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
                },
                "autoMitigate": true,
                "targetResourceType": "Microsoft.ContainerService/managedclusters",
                "targetResourceRegion": "australiaeast"
            }
        }
  2. {
            "type": "microsoft.insights/metricAlerts",
            "apiVersion": "2018-03-01",
            "name": "Average PV usage is greater than 80 percent",
            "location": "global",
            "dependsOn": [
    
            ],
            "properties": {
                "severity": 4,
                "enabled": true,
                "scopes": [
                    "xxxx"
                ],
                "evaluationFrequency": "PT1M",
                "windowSize": "PT5M",
                "criteria": {
                    "allOf": [
                        {
                            "threshold": 80,
                            "name": "Metric1",
                            "metricNamespace": "insights.container/persistentvolumes",
                            "metricName": "pvUsageExceededPercentage",
                            "operator": "GreaterThan",
                            "timeAggregation": "Average",
                            "skipMetricValidation": false,
                            "criterionType": "StaticThresholdCriterion"
                        }
                    ],
                    "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
                },
                "autoMitigate": true,
                "targetResourceType": "Microsoft.ContainerService/managedclusters",
                "targetResourceRegion": "australiaeast"
            }
        }
  3. {
            "type": "microsoft.insights/metricAlerts",
            "apiVersion": "2018-03-01",
            "name": "Memory Working Set Percentage - yyyy",
            "location": "global",
            "properties": {
                "severity": 4,
                "enabled": true,
                "scopes": [
                    "xxxx"
                ],
                "evaluationFrequency": "PT5M",
                "windowSize": "PT5M",
                "criteria": {
                    "allOf": [
                        {
                            "threshold": 95,
                            "name": "Metric1",
                            "metricName": "node_memory_working_set_percentage",
                            "operator": "GreaterThan",
                            "timeAggregation": "Average",
                            "criterionType": "StaticThresholdCriterion"
                        }
                    ],
                    "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
                },
                "autoMitigate": true,
                "targetResourceType": "Microsoft.ContainerService/managedclusters"
            }
        }
  4. {
            "type": "microsoft.insights/metricAlerts",
            "apiVersion": "2018-03-01",
            "name": "Number of OOM killed containers is greater than 0",
            "location": "global",
            "properties": {
                "severity": 4,
                "enabled": true,
                "scopes": [
                    "xxxx"
                ],
                "evaluationFrequency": "PT5M",
                "windowSize": "PT5M",
                "criteria": {
                    "allOf": [
                        {
                            "threshold": 0,
                            "name": "Metric1",
                            "metricNamespace": "insights.container/pods",
                            "metricName": "oomKilledContainerCount",
                            "operator": "GreaterThan",
                            "timeAggregation": "Maximum",
                            "skipMetricValidation": false,
                            "criterionType": "StaticThresholdCriterion"
                        }
                    ],
                    "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
                },
                "autoMitigate": true,
                "targetResourceType": "Microsoft.ContainerService/managedclusters",
                "targetResourceRegion": "australiaeast"
            }
        }
  5. {
            "type": "microsoft.insights/metricAlerts",
            "apiVersion": "2018-03-01",
            "name": "Pod container restarted",
            "location": "global",
            "properties": {
                "severity": 4,
                "enabled": true,
                "scopes": [
                    "xxxx"
                ],
                "evaluationFrequency": "PT1M",
                "windowSize": "PT5M",
                "criteria": {
                    "allOf": [
                        {
                            "threshold": 0,
                            "name": "Metric1",
                            "metricNamespace": "insights.container/pods",
                            "metricName": "restartingContainerCount",
                            "operator": "GreaterThan",
                            "timeAggregation": "Maximum",
                            "skipMetricValidation": false,
                            "criterionType": "StaticThresholdCriterion"
                        }
                    ],
                    "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
                },
                "autoMitigate": true,
                "targetResourceType": "Microsoft.ContainerService/managedclusters",
                "targetResourceRegion": "australiaeast"
            }
        }
  6. {
            "type": "microsoft.insights/metricAlerts",
            "apiVersion": "2018-03-01",
            "name": "Ready state of pods is less than 80 percent",
            "location": "global",
            "properties": {
                "severity": 4,
                "enabled": true,
                "scopes": [
                    "xxxx"
                ],
                "evaluationFrequency": "PT1M",
                "windowSize": "PT5M",
                "criteria": {
                    "allOf": [
                        {
                            "threshold": 80,
                            "name": "Metric1",
                            "metricNamespace": "insights.container/pods",
                            "metricName": "podReadyPercentage",
                            "operator": "LessThan",
                            "timeAggregation": "Average",
                            "skipMetricValidation": false,
                            "criterionType": "StaticThresholdCriterion"
                        }
                    ],
                    "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
                },
                "autoMitigate": true,
                "targetResourceType": "Microsoft.ContainerService/managedclusters",
                "targetResourceRegion": "australiaeast"
            }
        }
  7. {
            "type": "Microsoft.Insights/metricAlerts",
            "apiVersion": "2018-03-01",
            "name": "CPU Usage Percentage - yyyy",
            "location": "global",
            "properties": {
                "severity": 4,
                "enabled": true,
                "scopes": [
                    "xxxx"
                ],
                "evaluationFrequency": "PT5M",
                "windowSize": "PT5M",
                "criteria": {
                    "allOf": [
                        {
                            "threshold": 95,
                            "name": "Metric1",
                            "metricName": "node_cpu_usage_percentage",
                            "operator": "GreaterThan",
                            "timeAggregation": "Average",
                            "criterionType": "StaticThresholdCriterion"
                        }
                    ],
                    "odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria"
                }
            }
        }

    Document Details

Do not edit this section. It is required for learn.microsoft.com ➟ GitHub issue linking.

PesalaPavan commented 2 weeks ago

@vimalkumarkada Thanks for your feedback! We will investigate and update as appropriate.

bwren commented 1 week ago

The metrics listed in this article will be removed at the end of this month, so any alert rules based on them will stop working. You can enable the Prometheus alert rules using the process in https://learn.microsoft.com/azure/azure-monitor/containers/kubernetes-metric-alerts?tabs=portal

bwren commented 1 week ago

please-close

bwren commented 1 week ago

@vimalkumarkada Just to add a bit more context to this. Some of your alert rules use platform metrics. Those will continue to work since platform metrics aren't changing. Only the custom metrics in the article I referenced are being removed.