kruize / autotune

Autonomous Performance Tuning for Kubernetes!
Apache License 2.0
155 stars 53 forks source link

Support Multiple OpenShift Versions in Aggregation Functions #1233

Open shreyabiradar07 opened 1 month ago

shreyabiradar07 commented 1 month ago

Describe the feature

In the current Performance/Metric Profile implementation different aggregation function queries for multiple OpenShift versions are not supported. Internally each aggregation_function object is stored in a HashMap with key as aggregation function type and multiple instances with different versions cannot be stored.

Examples or references

/createMetricProfile API input payload - corresponding YAML file resource_optimization_openshift.yaml

{
    "apiVersion": "recommender.com/v1",
    "kind": "KruizePerformanceProfile",
    "metadata": {
        "name": "resource-optimization-openshift"
    },
    "profile_version": 1,
    "k8s_type": "openshift",
    "slo": {
        "slo_class": "resource_usage",
        "direction": "minimize",
        "objective_function": {
            "function_type": "source"
        },
        "function_variables": [
            {
                "name": "cpuRequest",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"cpu\", unit=\"core\"})"
                    },
                    {
                        "function": "sum",
                        "query": "sum(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"cpu\", unit=\"core\"})"
                    }
                ]
            },
            {
                "name": "cpuLimit",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"cpu\", unit=\"core\"})"
                    },
                    {
                        "function": "sum",
                        "query": "sum(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE$\", resource=\"cpu\", unit=\"core\"})"
                    }
                ]
            },
            {
                "name": "cpuUsage",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))",
                        "version": "<=4.8"
                    },
                    {
                        "function": "avg",
                        "query": "avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))",
                        "version": ">4.9"
                    },
                    {
                        "function": "min",
                        "query": "min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                        "version": "<=4.8"
                    },
                    {
                        "function": "min",
                        "query": "min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                        "version": ">4.9"
                    },
                    {
                        "function": "max",
                        "query": "max(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                        "version": "<=4.8"
                    },
                    {
                        "function": "max",
                        "query": "max(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                        "version": ">4.9"
                    },
                    {
                        "function": "sum",
                        "query": "sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                        "version": "<=4.8"
                    },
                    {
                        "function": "sum",
                        "query": "sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                        "version": ">4.9"
                    }
                ]
            },
            {
                "name": "cpuThrottle",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(rate(container_cpu_cfs_throttled_seconds_total{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))"
                    },
                    {
                        "function": "max",
                        "query": "max(rate(container_cpu_cfs_throttled_seconds_total{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))"
                    },
                    {
                        "function": "sum",
                        "query": "sum(rate(container_cpu_cfs_throttled_seconds_total{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))"
                    }
                ]
            },
            {
                "name": "memoryRequest",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource=\"memory\", unit=\"byte\"})"
                    },
                    {
                        "function": "sum",
                        "query": "sum(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource=\"memory\", unit=\"byte\"})"
                    }
                ]
            },
            {
                "name": "memoryLimit",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"memory\", unit=\"byte\"})"
                    },
                    {
                        "function": "sum",
                        "query": "sum(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource=\"memory\", unit=\"byte\"})"
                    }
                ]
            },
            {
                "name": "memoryUsage",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(avg_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))"
                    },
                    {
                        "function": "min",
                        "query": "min(min_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                    },
                    {
                        "function": "max",
                        "query": "max(max_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                    },
                    {
                        "function": "sum",
                        "query": "sum(avg_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                    }
                ]
            },
            {
                "name": "memoryRSS",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(avg_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))"
                    },
                    {
                        "function": "min",
                        "query": "min(min_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                    },
                    {
                        "function": "max",
                        "query": "max(max_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                    },
                    {
                        "function": "sum",
                        "query": "sum(avg_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))"
                    }
                ]
            }
        ]
    }
}

/listMetricProfiles API response JSON - for CpuUsage Metric aggregation functions with only version > 4.9 are stored in the database as seen in output below

[
    {
        "apiVersion": "recommender.com/v1",
        "kind": "KruizePerformanceProfile",
        "metadata": {
            "name": "resource-optimization-openshift"
        },
        "profile_version": 1.0,
        "k8s_type": "openshift",
        "slo": {
            "sloClass": "resource_usage",
            "objective_function": {
                "function_type": "source"
            },
            "direction": "minimize",
            "function_variables": [
                {
                    "name": "cpuRequest",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "avg": {
                            "function": "avg",
                            "query": "avg(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"cpu\", unit=\"core\"})"
                        },
                        "sum": {
                            "function": "sum",
                            "query": "sum(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"cpu\", unit=\"core\"})"
                        }
                    }
                },
                {
                    "name": "cpuLimit",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "avg": {
                            "function": "avg",
                            "query": "avg(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"cpu\", unit=\"core\"})"
                        },
                        "sum": {
                            "function": "sum",
                            "query": "sum(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE$\", resource=\"cpu\", unit=\"core\"})"
                        }
                    }
                },
                {
                    "name": "cpuUsage",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "avg": {
                            "function": "avg",
                            "query": "avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))",
                            "version": ">4.9"
                        },
                        "min": {
                            "function": "min",
                            "query": "min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                            "version": ">4.9"
                        },
                        "max": {
                            "function": "max",
                            "query": "max(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                            "version": ">4.9"
                        },
                        "sum": {
                            "function": "sum",
                            "query": "sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                            "version": ">4.9"
                        }
                    }
                },
                {
                    "name": "cpuThrottle",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "avg": {
                            "function": "avg",
                            "query": "avg(rate(container_cpu_cfs_throttled_seconds_total{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))"
                        },
                        "max": {
                            "function": "max",
                            "query": "max(rate(container_cpu_cfs_throttled_seconds_total{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))"
                        },
                        "sum": {
                            "function": "sum",
                            "query": "sum(rate(container_cpu_cfs_throttled_seconds_total{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))"
                        }
                    }
                },
                {
                    "name": "memoryRequest",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "avg": {
                            "function": "avg",
                            "query": "avg(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource=\"memory\", unit=\"byte\"})"
                        },
                        "sum": {
                            "function": "sum",
                            "query": "sum(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource=\"memory\", unit=\"byte\"})"
                        }
                    }
                },
                {
                    "name": "memoryLimit",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "avg": {
                            "function": "avg",
                            "query": "avg(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"memory\", unit=\"byte\"})"
                        },
                        "sum": {
                            "function": "sum",
                            "query": "sum(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource=\"memory\", unit=\"byte\"})"
                        }
                    }
                },
                {
                    "name": "memoryUsage",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "avg": {
                            "function": "avg",
                            "query": "avg(avg_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))"
                        },
                        "min": {
                            "function": "min",
                            "query": "min(min_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                        },
                        "max": {
                            "function": "max",
                            "query": "max(max_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                        },
                        "sum": {
                            "function": "sum",
                            "query": "sum(avg_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                        }
                    }
                },
                {
                    "name": "memoryRSS",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "avg": {
                            "function": "avg",
                            "query": "avg(avg_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))"
                        },
                        "min": {
                            "function": "min",
                            "query": "min(min_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                        },
                        "max": {
                            "function": "max",
                            "query": "max(max_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                        },
                        "sum": {
                            "function": "sum",
                            "query": "sum(avg_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))"
                        }
                    }
                }
            ]
        }
    }
]

Suggest a solution

Needs design changes and abstractions implementation which will pick the right queries based on the OpenShift cluster version. Enhancements are required to handle these version-specific queries effectively.

Additional Context

.