kruize / autotune

Autonomous Performance Tuning for Kubernetes!
Apache License 2.0
155 stars 53 forks source link

Missing aggregation functions in /listPerformanceProfiles API response #1229

Open shreyabiradar07 opened 2 months ago

shreyabiradar07 commented 2 months ago

Describe the bug

After creating a performance profile by making a POST request to the /createPerformanceProfile API, the aggregation functions avg, min and max are not included in the /listPerformanceProfiles API response, despite being present in the input JSON payload.

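How to reproduce it

Send the following JSON payload in a POST request to the /createPerformanceProfile API, then query the /listPerformanceProfiles API: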

{
    "name": "resource-optimization-openshift1",
    "profile_version": 1,
    "k8s_type": "openshift",
    "slo": {
        "slo_class": "resource_usage",
        "direction": "minimize",
        "objective_function": {
            "function_type": "source"
        },
        "function_variables": [
            {
                "name": "cpuRequest",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"cpu\", unit=\"core\"})"
                    },
                    {
                        "function": "sum",
                        "query": "sum(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"cpu\", unit=\"core\"})"
                    }
                ]
            },
            {
                "name": "cpuLimit",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"cpu\", unit=\"core\"})"
                    },
                    {
                        "function": "sum",
                        "query": "sum(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE$\", resource=\"cpu\", unit=\"core\"})"
                    }
                ]
            },
            {
                "name": "cpuUsage",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))",
                        "versions": "<=4.8"
                    },
                    {
                        "function": "avg",
                        "query": "avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))",
                        "versions": ">4.9"
                    },
                    {
                        "function": "min",
                        "query": "min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                        "versions": "<=4.8"
                    },
                    {
                        "function": "min",
                        "query": "min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                        "versions": ">4.9"
                    },
                    {
                        "function": "max",
                        "query": "max(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                        "versions": "<=4.8"
                    },
                    {
                        "function": "max",
                        "query": "max(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                        "versions": ">4.9"
                    },
                    {
                        "function": "sum",
                        "query": "sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                        "versions": "<=4.8"
                    },
                    {
                        "function": "sum",
                        "query": "sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))",
                        "versions": ">4.9"
                    }
                ]
            },
            {
                "name": "cpuThrottle",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(rate(container_cpu_cfs_throttled_seconds_total{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))"
                    },
                    {
                        "function": "max",
                        "query": "max(rate(container_cpu_cfs_throttled_seconds_total{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))"
                    },
                    {
                        "function": "sum",
                        "query": "sum(rate(container_cpu_cfs_throttled_seconds_total{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))"
                    }
                ]
            },
            {
                "name": "memoryRequest",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource=\"memory\", unit=\"byte\"})"
                    },
                    {
                        "function": "sum",
                        "query": "sum(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource=\"memory\", unit=\"byte\"})"
                    }
                ]
            },
            {
                "name": "memoryLimit",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"memory\", unit=\"byte\"})"
                    },
                    {
                        "function": "sum",
                        "query": "sum(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource=\"memory\", unit=\"byte\"})"
                    }
                ]
            },
            {
                "name": "memoryUsage",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(avg_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))"
                    },
                    {
                        "function": "min",
                        "query": "min(min_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                    },
                    {
                        "function": "max",
                        "query": "max(max_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                    },
                    {
                        "function": "sum",
                        "query": "sum(avg_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                    }
                ]
            },
            {
                "name": "memoryRSS",
                "datasource": "prometheus",
                "value_type": "double",
                "kubernetes_object": "container",
                "aggregation_functions": [
                    {
                        "function": "avg",
                        "query": "avg(avg_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))"
                    },
                    {
                        "function": "min",
                        "query": "min(min_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                    },
                    {
                        "function": "max",
                        "query": "max(max_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                    },
                    {
                        "function": "sum",
                        "query": "sum(avg_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))"
                    }
                ]
            }
        ]
    }
}

Expected behavior

The /listPerformanceProfiles API response should include all the aggregation functions present in the input JSON payload: sum, avg, min and max.

Relevant logs

The avg, min and max functions are missing from the /listPerformanceProfiles output JSON:

[
    {
        "name": "resource-optimization-openshift",
        "profile_version": 1.0,
        "k8s_type": "openshift",
        "slo": {
            "sloClass": "resource_usage",
            "objective_function": {
                "function_type": "source"
            },
            "direction": "minimize",
            "function_variables": [
                {
                    "name": "cpuRequest",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "sum": {
                            "function": "sum",
                            "query": "sum(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"cpu\", unit=\"core\"})"
                        }
                    }
                },
                {
                    "name": "cpuLimit",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "sum": {
                            "function": "sum",
                            "query": "sum(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE$\", resource=\"cpu\", unit=\"core\"})"
                        }
                    }
                },
                {
                    "name": "cpuUsage",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "sum": {
                            "function": "sum",
                            "query": "sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))"
                        }
                    }
                },
                {
                    "name": "cpuThrottle",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "sum": {
                            "function": "sum",
                            "query": "sum(rate(container_cpu_cfs_throttled_seconds_total{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))"
                        }
                    }
                },
                {
                    "name": "memoryRequest",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "sum": {
                            "function": "sum",
                            "query": "sum(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource=\"memory\", unit=\"byte\"})"
                        }
                    }
                },
                {
                    "name": "memoryLimit",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "sum": {
                            "function": "sum",
                            "query": "sum(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource=\"memory\", unit=\"byte\"})"
                        }
                    }
                },
                {
                    "name": "memoryUsage",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "sum": {
                            "function": "sum",
                            "query": "sum(avg_over_time(container_memory_working_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))"
                        }
                    }
                },
                {
                    "name": "memoryRSS",
                    "datasource": "prometheus",
                    "value_type": "double",
                    "kubernetes_object": "container",
                    "aggregation_functions": {
                        "sum": {
                            "function": "sum",
                            "query": "sum(avg_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))"
                        }
                    }
                }
            ]
        }
    }
]

Environment:

Additional context

On checking the records stored in the KruizePerformanceProfileEntry database table, it looks like the avg, min and max functions are also missing from the slo column:

 resource-optimization-openshift | openshift |               1 | {"sloClass": "resource_usage", "direction": "minimize", "function_variables": [{"name": "cpuRequest", "data
source": "prometheus", "value_type": "double", "cycleDataMap": {}, "kubernetes_object": "container", "trialSummaryResult": {}, "aggregation_functions": {"sum": {"query": "su
m(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"cpu\", unit=\"core\"})",
 "function": "sum"}}}, {"name": "cpuLimit", "datasource": "prometheus", "value_type": "double", "cycleDataMap": {}, "kubernetes_object": "container", "trialSummaryResult": {
}, "aggregation_functions": {"sum": {"query": "sum(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$N
AMESPACE$\", resource=\"cpu\", unit=\"core\"})", "function": "sum"}}}, {"name": "cpuUsage", "datasource": "prometheus", "value_type": "double", "cycleDataMap": {}, "kubernet
es_object": "container", "trialSummaryResult": {}, "aggregation_functions": {"sum": {"query": "sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_tot
al:sum_irate{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=\"$CONTAINER_NAME$\"}[15m]))", "function": "sum"}}}, {"name": "cpuThrottle", "data
source": "prometheus", "value_type": "double", "cycleDataMap": {}, "kubernetes_object": "container", "trialSummaryResult": {}, "aggregation_functions": {"sum": {"query": "su
m(rate(container_cpu_cfs_throttled_seconds_total{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=\"$NAMESPACE$\", container=”$CONTAINER_NAME$”}[15m]))", "function": "sum"
}}}, {"name": "memoryRequest", "datasource": "prometheus", "value_type": "double", "cycleDataMap": {}, "kubernetes_object": "container", "trialSummaryResult": {}, "aggregati
on_functions": {"sum": {"query": "sum(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, res
ource=\"memory\", unit=\"byte\"})", "function": "sum"}}}, {"name": "memoryLimit", "datasource": "prometheus", "value_type": "double", "cycleDataMap": {}, "kubernetes_object"
: "container", "trialSummaryResult": {}, "aggregation_functions": {"sum": {"query": "sum(kube_pod_container_resource_limits{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", containe
r=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource=\"memory\", unit=\"byte\"})", "function": "sum"}}}, {"name": "memoryUsage", "datasource": "prometheus", "value_type": 
"double", "cycleDataMap": {}, "kubernetes_object": "container", "trialSummaryResult": {}, "aggregation_functions": {"sum": {"query": "sum(avg_over_time(container_memory_work
ing_set_bytes{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=\"$CONTAINER_NAME$\"}[15m]))", "function": "sum"}}}, {"name": "memoryRSS", "datasourc
e": "prometheus", "value_type": "double", "cycleDataMap": {}, "kubernetes_object": "container", "trialSummaryResult": {}, "aggregation_functions": {"sum": {"query": "sum(avg
_over_time(container_memory_rss{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))", "function": "sum"}}}], "objective_func
tion": {"function_type": "source"}}

Proposed Solution

Debug the processing logic that converts the input JSON into a PerformanceProfile object to ensure that all aggregation functions are included and stored in the database.