dapr / test-infra

Test apps and tools for Dapr
Apache License 2.0
14 stars 24 forks source link

[AKS Templates] Fix Prometheus and Grafana on templates #204

Open tmacam opened 1 year ago

tmacam commented 1 year ago

Steps to Reproduce the Problem

Install a new AKS cluster as described by README.md

Expected Behavior

The managed Grafana should present a dashboard similar to the one on the current release longhaul cluster and to what is shown on https://docs.dapr.io/operations/observability/metrics/grafana/

Actual Behavior

  1. There is no default dashboard installed
  2. There is no Dapr prometheus datasource installed in managed Grafana by default
  3. Installing the dashboards available on https://github.com/dapr/dapr/tree/master/grafana doesn't produce the expected result, as most of the dependent metrics are not available on the managed Prometheus
tmacam commented 1 year ago

Regarding item 3 (missing Prometheus metrics), there seems to be a major difference between how Prometheus is configured out of the box (be it the Azure managed one or a fresh Helm setup) and how it is configured right now in the release clusters. This distinction is also encoded in the Grafana dashboards we saved in dapr/dapr, which refer to metrics by names that only exist in the release longhaul Prometheus setup.

As an example, I am pasting a diff between the Prometheus configuration one would find in a fresh Helm install and the one we have in release longhaul:

--- fresh-from-helm-prometheus.yaml 2023-10-01 14:33:45.782910959 -0700
+++ release-prometheus.yaml 2023-10-01 14:33:45.793744284 -0700
@@ -1,4 +1,4 @@
-issue6946-prometheus.yml
+release-prometheus.yml
 global:
   evaluation_interval: 1m
   scrape_interval: 1m
@@ -64,8 +64,7 @@
   tls_config:
     ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
     insecure_skip_verify: true
-- honor_labels: true
-  job_name: kubernetes-service-endpoints
+- job_name: kubernetes-service-endpoints
   kubernetes_sd_configs:
   - role: endpoints
   relabel_configs:
@@ -73,10 +72,6 @@
     regex: true
     source_labels:
     - __meta_kubernetes_service_annotation_prometheus_io_scrape
-  - action: drop
-    regex: true
-    source_labels:
-    - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
   - action: replace
     regex: (https?)
     source_labels:
@@ -88,7 +83,7 @@
     - __meta_kubernetes_service_annotation_prometheus_io_path
     target_label: __metrics_path__
   - action: replace
-    regex: (.+?)(?::\d+)?;(\d+)
+    regex: ([^:]+)(?::\d+)?;(\d+)
     replacement: $1:$2
     source_labels:
     - __address__
@@ -102,17 +97,16 @@
   - action: replace
     source_labels:
     - __meta_kubernetes_namespace
-    target_label: namespace
+    target_label: kubernetes_namespace
   - action: replace
     source_labels:
     - __meta_kubernetes_service_name
-    target_label: service
+    target_label: kubernetes_name
   - action: replace
     source_labels:
     - __meta_kubernetes_pod_node_name
-    target_label: node
-- honor_labels: true
-  job_name: kubernetes-service-endpoints-slow
+    target_label: kubernetes_node
+- job_name: kubernetes-service-endpoints-slow
   kubernetes_sd_configs:
   - role: endpoints
   relabel_configs:
@@ -131,7 +125,7 @@
     - __meta_kubernetes_service_annotation_prometheus_io_path
     target_label: __metrics_path__
   - action: replace
-    regex: (.+?)(?::\d+)?;(\d+)
+    regex: ([^:]+)(?::\d+)?;(\d+)
     replacement: $1:$2
     source_labels:
     - __address__
@@ -145,15 +139,15 @@
   - action: replace
     source_labels:
     - __meta_kubernetes_namespace
-    target_label: namespace
+    target_label: kubernetes_namespace
   - action: replace
     source_labels:
     - __meta_kubernetes_service_name
-    target_label: service
+    target_label: kubernetes_name
   - action: replace
     source_labels:
     - __meta_kubernetes_pod_node_name
-    target_label: node
+    target_label: kubernetes_node
   scrape_interval: 5m
   scrape_timeout: 30s
 - honor_labels: true
@@ -165,8 +159,7 @@
     regex: pushgateway
     source_labels:
     - __meta_kubernetes_service_annotation_prometheus_io_probe
-- honor_labels: true
-  job_name: kubernetes-services
+- job_name: kubernetes-services
   kubernetes_sd_configs:
   - role: service
   metrics_path: /probe
@@ -190,12 +183,11 @@
     regex: __meta_kubernetes_service_label_(.+)
   - source_labels:
     - __meta_kubernetes_namespace
-    target_label: namespace
+    target_label: kubernetes_namespace
   - source_labels:
     - __meta_kubernetes_service_name
-    target_label: service
-- honor_labels: true
-  job_name: kubernetes-pods
+    target_label: kubernetes_name
+- job_name: kubernetes-pods
   kubernetes_sd_configs:
   - role: pod
   relabel_configs:
@@ -203,10 +195,6 @@
     regex: true
     source_labels:
     - __meta_kubernetes_pod_annotation_prometheus_io_scrape
-  - action: drop
-    regex: true
-    source_labels:
-    - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
   - action: replace
     regex: (https?)
     source_labels:
@@ -218,18 +206,11 @@
     - __meta_kubernetes_pod_annotation_prometheus_io_path
     target_label: __metrics_path__
   - action: replace
-    regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
-    replacement: '[$2]:$1'
-    source_labels:
-    - __meta_kubernetes_pod_annotation_prometheus_io_port
-    - __meta_kubernetes_pod_ip
-    target_label: __address__
-  - action: replace
-    regex: (\d+);((([0-9]+?)(\.|$)){4})
-    replacement: $2:$1
+    regex: ([^:]+)(?::\d+)?;(\d+)
+    replacement: $1:$2
     source_labels:
+    - __address__
     - __meta_kubernetes_pod_annotation_prometheus_io_port
-    - __meta_kubernetes_pod_ip
     target_label: __address__
   - action: labelmap
     regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
@@ -239,21 +220,16 @@
   - action: replace
     source_labels:
     - __meta_kubernetes_namespace
-    target_label: namespace
+    target_label: kubernetes_namespace
   - action: replace
     source_labels:
     - __meta_kubernetes_pod_name
-    target_label: pod
+    target_label: kubernetes_pod_name
   - action: drop
     regex: Pending|Succeeded|Failed|Completed
     source_labels:
     - __meta_kubernetes_pod_phase
-  - action: replace
-    source_labels:
-    - __meta_kubernetes_pod_node_name
-    target_label: node
-- honor_labels: true
-  job_name: kubernetes-pods-slow
+- job_name: kubernetes-pods-slow
   kubernetes_sd_configs:
   - role: pod
   relabel_configs:
@@ -272,18 +248,11 @@
     - __meta_kubernetes_pod_annotation_prometheus_io_path
     target_label: __metrics_path__
   - action: replace
-    regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
-    replacement: '[$2]:$1'
-    source_labels:
-    - __meta_kubernetes_pod_annotation_prometheus_io_port
-    - __meta_kubernetes_pod_ip
-    target_label: __address__
-  - action: replace
-    regex: (\d+);((([0-9]+?)(\.|$)){4})
-    replacement: $2:$1
+    regex: ([^:]+)(?::\d+)?;(\d+)
+    replacement: $1:$2
     source_labels:
+    - __address__
     - __meta_kubernetes_pod_annotation_prometheus_io_port
-    - __meta_kubernetes_pod_ip
     target_label: __address__
   - action: labelmap
     regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
@@ -293,19 +262,15 @@
   - action: replace
     source_labels:
     - __meta_kubernetes_namespace
-    target_label: namespace
+    target_label: kubernetes_namespace
   - action: replace
     source_labels:
     - __meta_kubernetes_pod_name
-    target_label: pod
+    target_label: kubernetes_pod_name
   - action: drop
     regex: Pending|Succeeded|Failed|Completed
     source_labels:
     - __meta_kubernetes_pod_phase
-  - action: replace
-    source_labels:
-    - __meta_kubernetes_pod_node_name
-    target_label: node
   scrape_interval: 5m
   scrape_timeout: 30s
 alerting:
@@ -319,12 +284,15 @@
     - source_labels: [__meta_kubernetes_namespace]
       regex: dapr-monitoring
       action: keep
-    - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
-      regex: dapr-prom
+    - source_labels: [__meta_kubernetes_pod_label_app]
+      regex: prometheus
       action: keep
-    - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
+    - source_labels: [__meta_kubernetes_pod_label_component]
       regex: alertmanager
       action: keep
+    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_probe]
+      regex: .*
+      action: keep
     - source_labels: [__meta_kubernetes_pod_container_port_number]
       regex: "9093"
       action: keep