kruize / autotune

Autonomous Performance Tuning for Kubernetes!
Apache License 2.0
165 stars 54 forks source link

Intermittently no recommendations are observed for a job even though data is available #1339

Open kusumachalasani opened 1 month ago

kusumachalasani commented 1 month ago

Describe the bug: Intermittently, no recommendations are observed for a "job", even though data is available and recommendations for the same experiment were generated earlier.

How to reproduce it: On NERC, create an experiment using the following JSON:

[{
  "version": "v2.0",
  "experiment_name": "monitor_gpu_ttm",
  "cluster_name": "default",
  "performance_profile": "resource-optimization-local-monitoring",
  "mode": "monitor",
  "target_cluster": "local",
  "datasource": "prometheus-1",
  "kubernetes_objects": [
    {
      "type": "statefulset",
      "name": "training-ttm",
      "namespace": "unpartitioned-namespace",
      "containers": [
        {
          "container_image_name": "kruizehub/ttm:v1",
          "container_name": "training-container"
        }
      ]
    }
  ],
  "trial_settings": {
    "measurement_duration": "15min"
  },
  "recommendation_settings": {
    "threshold": "0.1"
  }
}]

curl -X POST http://kruize-openshift-tuning.apps.nerc-ocp-test-2.nerc.mghpcc.org/generateRecommendations?experiment_name=monitor_gpu_ttm [ { "cluster_name": "default", "experiment_type": "container", "kubernetes_objects": [ { "type": "job", "name": "training-ttm", "namespace": "unpartitioned-namespace", "containers": [ { "container_image_name": "kruizehub/ttm:v1", "container_name": "training-container", "recommendations": { "version": "1.0", "notifications": { "120001": { "type": "info", "message": "There is not enough data available to generate a recommendation.", "code": 120001 } }, "data": {} } } ] } ], "version": "v2.0", "experiment_name": "monitor_gpu_ttm" } ]

Expected behavior: A few minutes earlier, recommendations for the same job were observed:

curl -X POST http://kruize-openshift-tuning.apps.nerc-ocp-test-2.nerc.mghpcc.org/generateRecommendations?experiment_name=monitor_gpu_ttm
[
  {
    "cluster_name": "default",
    "experiment_type": "container",
    "kubernetes_objects": [
      {
        "type": "job",
        "name": "training-ttm",
        "namespace": "unpartitioned-namespace",
        "containers": [
          {
            "container_image_name": "kruizehub/ttm:v1",
            "container_name": "training-container",
            "recommendations": {
              "version": "1.0",
              "notifications": {
                "111000": {
                  "type": "info",
                  "message": "Recommendations Are Available",
                  "code": 111000
                }
              },
              "data": {
                "2024-10-01T20:11:00.000Z": {
                  "notifications": {
                    "224001": {
                      "type": "error",
                      "message": "Amount field is missing in the Memory Section",
                      "code": 224001
                    },
                    "524002": {
                      "type": "critical",
                      "message": "Memory Limit Not Set",
                      "code": 524002
                    },
                    "524001": {
                      "type": "critical",
                      "message": "Memory Request Not Set",
                      "code": 524001
                    },
                    "223001": {
                      "type": "error",
                      "message": "Amount field is missing in the CPU Section",
                      "code": 223001
                    },
                    "111101": {
                      "type": "info",
                      "message": "Short Term Recommendations Available",
                      "code": 111101
                    },
                    "523001": {
                      "type": "critical",
                      "message": "CPU Request Not Set",
                      "code": 523001
                    },
                    "423001": {
                      "type": "warning",
                      "message": "CPU Limit Not Set",
                      "code": 423001
                    }
                  },
                  "monitoring_end_time": "2024-10-01T20:11:00.000Z",
                  "current": {},
                  "recommendation_terms": {
                    "short_term": {
                      "duration_in_hours": 24.0,
                      "notifications": {
                        "112101": {
                          "type": "info",
                          "message": "Cost Recommendations Available",
                          "code": 112101
                        },
                        "112102": {
                          "type": "info",
                          "message": "Performance Recommendations Available",
                          "code": 112102
                        }
                      },
                      "monitoring_start_time": "2024-09-30T20:11:00.000Z",
                      "recommendation_engines": {
                        "cost": {
                          "pods_count": 1,
                          "confidence_level": 0.0,
                          "config": {
                            "requests": {
                              "memory": {
                                "amount": 7.6173901824E9,
                                "format": "bytes"
                              },
                              "cpu": {
                                "amount": 10.487843658239633,
                                "format": "cores"
                              }
                            },
                            "limits": {
                              "memory": {
                                "amount": 7.6173901824E9,
                                "format": "bytes"
                              },
                              "nvidia.com/mig-7g.40gb": {
                                "amount": 1.0,
                                "format": "cores"
                              },
                              "cpu": {
                                "amount": 10.487843658239633,
                                "format": "cores"
                              }
                            }
                          },
                          "variation": {
                            "requests": {
                              "memory": {
                                "amount": 7.6173901824E9,
                                "format": "bytes"
                              },
                              "cpu": {
                                "amount": 10.487843658239633,
                                "format": "cores"
                              }
                            },
                            "limits": {
                              "memory": {
                                "amount": 7.6173901824E9,
                                "format": "bytes"
                              },
                              "cpu": {
                                "amount": 10.487843658239633,
                                "format": "cores"
                              }
                            }
                          },
                          "notifications": {}
                        },
                        "performance": {
                          "pods_count": 1,
                          "confidence_level": 0.0,
                          "config": {
                            "requests": {
                              "memory": {
                                "amount": 7.6173901824E9,
                                "format": "bytes"
                              },
                              "cpu": {
                                "amount": 10.487843658239633,
                                "format": "cores"
                              }
                            },
                            "limits": {
                              "memory": {
                                "amount": 7.6173901824E9,
                                "format": "bytes"
                              },
                              "nvidia.com/mig-7g.40gb": {
                                "amount": 1.0,
                                "format": "cores"
                              },
                              "cpu": {
                                "amount": 10.487843658239633,
                                "format": "cores"
                              }
                            }
                          },
                          "variation": {
                            "requests": {
                              "memory": {
                                "amount": 7.6173901824E9,
                                "format": "bytes"
                              },
                              "cpu": {
                                "amount": 10.487843658239633,
                                "format": "cores"
                              }
                            },
                            "limits": {
                              "memory": {
                                "amount": 7.6173901824E9,
                                "format": "bytes"
                              },
                              "cpu": {
                                "amount": 10.487843658239633,
                                "format": "cores"
                              }
                            }
                          },
                          "notifications": {}
                        }
                      },
                      "plots": {
                        "datapoints": 4,
                        "plots_data": {
                          "2024-10-01T08:11:00.000Z": {},
                          "2024-10-01T20:11:00.000Z": {
                            "cpuUsage": {
                              "min": 0.0,
                              "q1": 0.0,
                              "median": 10.487843658239633,
                              "q3": 10.487843658239633,
                              "max": 10.487843658239633,
                              "format": "cores"
                            },
                            "memoryUsage": {
                              "min": 1.30064384E9,
                              "q1": 1.49932032E9,
                              "median": 6.347825152E9,
                              "q3": 6.347825152E9,
                              "max": 6.347825152E9,
                              "format": "bytes"
                            }
                          },
                          "2024-10-01T14:11:00.000Z": {},
                          "2024-10-01T02:11:00.000Z": {}
                        }
                      }
                    },
                    "medium_term": {
                      "duration_in_hours": 168.0,
                      "notifications": {
                        "120001": {
                          "type": "info",
                          "message": "There is not enough data available to generate a recommendation.",
                          "code": 120001
                        }
                      }
                    },
                    "long_term": {
                      "duration_in_hours": 360.0,
                      "notifications": {
                        "120001": {
                          "type": "info",
                          "message": "There is not enough data available to generate a recommendation.",
                          "code": 120001
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        ]
      }
    ],
    "version": "v2.0",
    "experiment_name": "monitor_gpu_ttm"
  }
]

Relevant logs: No errors were observed.

Environment: