Open tmacam opened 1 year ago
Regarding item 3 (missing Prometheus metrics), there seems to be a major difference between how Prometheus is configured out of the box (be it the Azure managed one or a fresh Helm setup) and how it is currently configured in the release clusters. This distinction is also encoded in the Grafana dashboards we saved in dapr/dapr, which refer to metrics by names that only exist in the release longhaul Prometheus setup.
As an example, I am pasting a diff between the configuration one would find in a Helm-installed Prometheus and the one we have in release longhaul:
--- fresh-from-helm-prometheus.yaml 2023-10-01 14:33:45.782910959 -0700
+++ release-prometheus.yaml 2023-10-01 14:33:45.793744284 -0700
@@ -1,4 +1,4 @@
-issue6946-prometheus.yml
+release-prometheus.yml
global:
evaluation_interval: 1m
scrape_interval: 1m
@@ -64,8 +64,7 @@
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
-- honor_labels: true
- job_name: kubernetes-service-endpoints
+- job_name: kubernetes-service-endpoints
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
@@ -73,10 +72,6 @@
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
- - action: drop
- regex: true
- source_labels:
- - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
@@ -88,7 +83,7 @@
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
- regex: (.+?)(?::\d+)?;(\d+)
+ regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
@@ -102,17 +97,16 @@
- action: replace
source_labels:
- __meta_kubernetes_namespace
- target_label: namespace
+ target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_service_name
- target_label: service
+ target_label: kubernetes_name
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
- target_label: node
-- honor_labels: true
- job_name: kubernetes-service-endpoints-slow
+ target_label: kubernetes_node
+- job_name: kubernetes-service-endpoints-slow
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
@@ -131,7 +125,7 @@
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
- regex: (.+?)(?::\d+)?;(\d+)
+ regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
@@ -145,15 +139,15 @@
- action: replace
source_labels:
- __meta_kubernetes_namespace
- target_label: namespace
+ target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_service_name
- target_label: service
+ target_label: kubernetes_name
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
- target_label: node
+ target_label: kubernetes_node
scrape_interval: 5m
scrape_timeout: 30s
- honor_labels: true
@@ -165,8 +159,7 @@
regex: pushgateway
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_probe
-- honor_labels: true
- job_name: kubernetes-services
+- job_name: kubernetes-services
kubernetes_sd_configs:
- role: service
metrics_path: /probe
@@ -190,12 +183,11 @@
regex: __meta_kubernetes_service_label_(.+)
- source_labels:
- __meta_kubernetes_namespace
- target_label: namespace
+ target_label: kubernetes_namespace
- source_labels:
- __meta_kubernetes_service_name
- target_label: service
-- honor_labels: true
- job_name: kubernetes-pods
+ target_label: kubernetes_name
+- job_name: kubernetes-pods
kubernetes_sd_configs:
- role: pod
relabel_configs:
@@ -203,10 +195,6 @@
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
- - action: drop
- regex: true
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
@@ -218,18 +206,11 @@
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
- regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
- replacement: '[$2]:$1'
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_port
- - __meta_kubernetes_pod_ip
- target_label: __address__
- - action: replace
- regex: (\d+);((([0-9]+?)(\.|$)){4})
- replacement: $2:$1
+ regex: ([^:]+)(?::\d+)?;(\d+)
+ replacement: $1:$2
source_labels:
+ - __address__
- __meta_kubernetes_pod_annotation_prometheus_io_port
- - __meta_kubernetes_pod_ip
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
@@ -239,21 +220,16 @@
- action: replace
source_labels:
- __meta_kubernetes_namespace
- target_label: namespace
+ target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
- target_label: pod
+ target_label: kubernetes_pod_name
- action: drop
regex: Pending|Succeeded|Failed|Completed
source_labels:
- __meta_kubernetes_pod_phase
- - action: replace
- source_labels:
- - __meta_kubernetes_pod_node_name
- target_label: node
-- honor_labels: true
- job_name: kubernetes-pods-slow
+- job_name: kubernetes-pods-slow
kubernetes_sd_configs:
- role: pod
relabel_configs:
@@ -272,18 +248,11 @@
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
- regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
- replacement: '[$2]:$1'
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_port
- - __meta_kubernetes_pod_ip
- target_label: __address__
- - action: replace
- regex: (\d+);((([0-9]+?)(\.|$)){4})
- replacement: $2:$1
+ regex: ([^:]+)(?::\d+)?;(\d+)
+ replacement: $1:$2
source_labels:
+ - __address__
- __meta_kubernetes_pod_annotation_prometheus_io_port
- - __meta_kubernetes_pod_ip
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
@@ -293,19 +262,15 @@
- action: replace
source_labels:
- __meta_kubernetes_namespace
- target_label: namespace
+ target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
- target_label: pod
+ target_label: kubernetes_pod_name
- action: drop
regex: Pending|Succeeded|Failed|Completed
source_labels:
- __meta_kubernetes_pod_phase
- - action: replace
- source_labels:
- - __meta_kubernetes_pod_node_name
- target_label: node
scrape_interval: 5m
scrape_timeout: 30s
alerting:
@@ -319,12 +284,15 @@
- source_labels: [__meta_kubernetes_namespace]
regex: dapr-monitoring
action: keep
- - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
- regex: dapr-prom
+ - source_labels: [__meta_kubernetes_pod_label_app]
+ regex: prometheus
action: keep
- - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
+ - source_labels: [__meta_kubernetes_pod_label_component]
regex: alertmanager
action: keep
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_probe]
+ regex: .*
+ action: keep
- source_labels: [__meta_kubernetes_pod_container_port_number]
regex: "9093"
action: keep
Steps to Reproduce the Problem
Install a new AKS cluster as described by README.md
Expected Behavior
The managed grafana should present a dashboard similar to the one existing on the current release longhaul and similar to what is shown on https://docs.dapr.io/operations/observability/metrics/grafana/
Actual Behavior
Dapr
prometheus datasource installed in managed Grafana by default