Open kusumachalasani opened 1 month ago
Describe the bug No recommendations are observed for a "job" intermittently, even though data is available and recommendations for the same experiment were generated earlier.
How to reproduce it On NERC, create an experiment using the following JSON:
[{ "version": "v2.0", "experiment_name": "monitor_gpu_ttm", "cluster_name": "default", "performance_profile": "resource-optimization-local-monitoring", "mode": "monitor", "target_cluster": "local", "datasource": "prometheus-1", "kubernetes_objects": [ { "type": "statefulset", "name": "training-ttm", "namespace": "unpartitioned-namespace", "containers": [ { "container_image_name": "kruizehub/ttm:v1", "container_name": "training-container" } ] } ], "trial_settings": { "measurement_duration": "15min" }, "recommendation_settings": { "threshold": "0.1" } }]
curl -X POST http://kruize-openshift-tuning.apps.nerc-ocp-test-2.nerc.mghpcc.org/generateRecommendations?experiment_name=monitor_gpu_ttm [ { "cluster_name": "default", "experiment_type": "container", "kubernetes_objects": [ { "type": "job", "name": "training-ttm", "namespace": "unpartitioned-namespace", "containers": [ { "container_image_name": "kruizehub/ttm:v1", "container_name": "training-container", "recommendations": { "version": "1.0", "notifications": { "120001": { "type": "info", "message": "There is not enough data available to generate a recommendation.", "code": 120001 } }, "data": {} } } ] } ], "version": "v2.0", "experiment_name": "monitor_gpu_ttm" } ]
Expected behavior A few minutes earlier, recommendations for the same job were observed.
curl -X POST http://kruize-openshift-tuning.apps.nerc-ocp-test-2.nerc.mghpcc.org/generateRecommendations?experiment_name=monitor_gpu_ttm [ { "cluster_name": "default", "experiment_type": "container", "kubernetes_objects": [ { "type": "job", "name": "training-ttm", "namespace": "unpartitioned-namespace", "containers": [ { "container_image_name": "kruizehub/ttm:v1", "container_name": "training-container", "recommendations": { "version": "1.0", "notifications": { "111000": { "type": "info", "message": "Recommendations Are Available", "code": 111000 } }, "data": { "2024-10-01T20:11:00.000Z": { "notifications": { "224001": { "type": "error", "message": "Amount field is missing in the Memory Section", "code": 224001 }, "524002": { "type": "critical", "message": "Memory Limit Not Set", "code": 524002 }, "524001": { "type": "critical", "message": "Memory Request Not Set", "code": 524001 }, "223001": { "type": "error", "message": "Amount field is missing in the CPU Section", "code": 223001 }, "111101": { "type": "info", "message": "Short Term Recommendations Available", "code": 111101 }, "523001": { "type": "critical", "message": "CPU Request Not Set", "code": 523001 }, "423001": { "type": "warning", "message": "CPU Limit Not Set", "code": 423001 } }, "monitoring_end_time": "2024-10-01T20:11:00.000Z", "current": {}, "recommendation_terms": { "short_term": { "duration_in_hours": 24.0, "notifications": { "112101": { "type": "info", "message": "Cost Recommendations Available", "code": 112101 }, "112102": { "type": "info", "message": "Performance Recommendations Available", "code": 112102 } }, "monitoring_start_time": "2024-09-30T20:11:00.000Z", "recommendation_engines": { "cost": { "pods_count": 1, "confidence_level": 0.0, "config": { "requests": { "memory": { "amount": 7.6173901824E9, "format": "bytes" }, "cpu": { "amount": 10.487843658239633, "format": "cores" } }, "limits": { "memory": { "amount": 7.6173901824E9, "format": "bytes" }, "nvidia.com/mig-7g.40gb": { "amount": 
1.0, "format": "cores" }, "cpu": { "amount": 10.487843658239633, "format": "cores" } } }, "variation": { "requests": { "memory": { "amount": 7.6173901824E9, "format": "bytes" }, "cpu": { "amount": 10.487843658239633, "format": "cores" } }, "limits": { "memory": { "amount": 7.6173901824E9, "format": "bytes" }, "cpu": { "amount": 10.487843658239633, "format": "cores" } } }, "notifications": {} }, "performance": { "pods_count": 1, "confidence_level": 0.0, "config": { "requests": { "memory": { "amount": 7.6173901824E9, "format": "bytes" }, "cpu": { "amount": 10.487843658239633, "format": "cores" } }, "limits": { "memory": { "amount": 7.6173901824E9, "format": "bytes" }, "nvidia.com/mig-7g.40gb": { "amount": 1.0, "format": "cores" }, "cpu": { "amount": 10.487843658239633, "format": "cores" } } }, "variation": { "requests": { "memory": { "amount": 7.6173901824E9, "format": "bytes" }, "cpu": { "amount": 10.487843658239633, "format": "cores" } }, "limits": { "memory": { "amount": 7.6173901824E9, "format": "bytes" }, "cpu": { "amount": 10.487843658239633, "format": "cores" } } }, "notifications": {} } }, "plots": { "datapoints": 4, "plots_data": { "2024-10-01T08:11:00.000Z": {}, "2024-10-01T20:11:00.000Z": { "cpuUsage": { "min": 0.0, "q1": 0.0, "median": 10.487843658239633, "q3": 10.487843658239633, "max": 10.487843658239633, "format": "cores" }, "memoryUsage": { "min": 1.30064384E9, "q1": 1.49932032E9, "median": 6.347825152E9, "q3": 6.347825152E9, "max": 6.347825152E9, "format": "bytes" } }, "2024-10-01T14:11:00.000Z": {}, "2024-10-01T02:11:00.000Z": {} } } }, "medium_term": { "duration_in_hours": 168.0, "notifications": { "120001": { "type": "info", "message": "There is not enough data available to generate a recommendation.", "code": 120001 } } }, "long_term": { "duration_in_hours": 360.0, "notifications": { "120001": { "type": "info", "message": "There is not enough data available to generate a recommendation.", "code": 120001 } } } } } } } } ] } ], "version": "v2.0", 
"experiment_name": "monitor_gpu_ttm" } ]
Relevant logs No errors were observed.
Environment:
Describe the bug No recommendations are observed for a "job" intermittently, even though data is available and recommendations for the same experiment were generated earlier.
How to reproduce it On NERC, create an experiment using
curl -X POST http://kruize-openshift-tuning.apps.nerc-ocp-test-2.nerc.mghpcc.org/generateRecommendations?experiment_name=monitor_gpu_ttm [ { "cluster_name": "default", "experiment_type": "container", "kubernetes_objects": [ { "type": "job", "name": "training-ttm", "namespace": "unpartitioned-namespace", "containers": [ { "container_image_name": "kruizehub/ttm:v1", "container_name": "training-container", "recommendations": { "version": "1.0", "notifications": { "120001": { "type": "info", "message": "There is not enough data available to generate a recommendation.", "code": 120001 } }, "data": {} } } ] } ], "version": "v2.0", "experiment_name": "monitor_gpu_ttm" } ]
Expected behavior A few minutes earlier, recommendations for the same job were observed.
Relevant logs No errors were observed.
Environment: