tarantool / grafana-dashboard

Dashboard for Tarantool application and database server monitoring with Grafana
MIT License
34 stars 11 forks source link

Make a separate dashboard for Tarantool Data Grid metrics #134

Closed opomuc closed 2 years ago

opomuc commented 2 years ago

I have some panels for TDG, but they might be outdated:

"title": "GraphQL -- query rps",

"expr": "rate(tdg_graphql_query_time_count{job=~\"$job\", alias=~\"$instance\"}[$__rate_interval])",
"expr": "rate(tdg_graphql_query_time_sum{job=~\"$job\", alias=~\"$instance\"}[$__rate_interval])/rate(tdg_graphql_query_time_count{job=~\"$job\", alias=~\"$instance\"}[$__rate_interval])",
"title": "GraphQL -- avg. query time",
"expr": "histogram_quantile(0.95, sum(rate(tdg_graphql_query_time_bucket{job=~\"$job\", alias=~\"$instance\"}[$__rate_interval])) by (le))",
"title": "GraphQL -- 95th percentile",
"expr": "sum by (method, type) (rate(tdg_rest_result_200{job=~\"$job\", alias=~\"$instance\"}[$__rate_interval]))",
"title": "Success requests (code 2xx)",
"expr": "sum by (method, type) (rate(tdg_rest_exec_time_count{job=~\"$job\", alias=~\"$instance\"}[$__rate_interval]) - rate(tdg_rest_result_200{job=~\"$job\", alias=~\"$instance\"}[$__rate_interval]))",
"title": "Error requests (code 4xx)",

Queries should be validated against current TDG master.

opomuc commented 2 years ago

What new issues have been fixed since I last checked:

So at least the HTTP queries from above must be rewritten.

opomuc commented 2 years ago
{
          "datasource": "Prometheus",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "auto",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              }
            },
            "overrides": []
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 0,
            "y": 1
          },
          "id": 72,
          "options": {
            "legend": {
              "calcs": [],
              "displayMode": "table",
              "placement": "right"
            },
            "tooltip": {
              "mode": "single"
            }
          },
          "targets": [
            {
              "exemplar": true,
              "expr": "rate(tdg_graphql_query_time_count{job=~\"$job\", ris_sub=~\"$zone\", alias=~\"$instance\"}[$__rate_interval])",
              "interval": "",
              "legendFormat": "{{alias}} - {{operation_name}}",
              "queryType": "randomWalk",
              "refId": "A"
            }
          ],
          "title": "GraphQL -- query rps",
          "type": "timeseries"
        },
opomuc commented 2 years ago
{
          "datasource": "Prometheus",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "auto",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "ms"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 12,
            "y": 1
          },
          "id": 74,
          "options": {
            "legend": {
              "calcs": [],
              "displayMode": "table",
              "placement": "right"
            },
            "tooltip": {
              "mode": "single"
            }
          },
          "targets": [
            {
              "exemplar": true,
              "expr": "rate(tdg_graphql_query_time_sum{job=~\"$job\", ris_sub=~\"$zone\", alias=~\"$instance\"}[$__rate_interval])/rate(tdg_graphql_query_time_count{job=~\"$job\", ris_sub=~\"$zone\", alias=~\"$instance\"}[$__rate_interval])",
              "interval": "",
              "legendFormat": "{{alias}} - {{operation_name}}",
              "queryType": "randomWalk",
              "refId": "A"
            }
          ],
          "title": "GraphQL -- avg. query time",
          "type": "timeseries"
        },
opomuc commented 2 years ago
{
          "datasource": "Prometheus",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "auto",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "ms"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 0,
            "y": 9
          },
          "id": 76,
          "options": {
            "legend": {
              "calcs": [],
              "displayMode": "list",
              "placement": "bottom"
            },
            "tooltip": {
              "mode": "single"
            }
          },
          "targets": [
            {
              "exemplar": true,
              "expr": "histogram_quantile(0.95, sum(rate(tdg_graphql_query_time_bucket{job=~\"$job\", ris_sub=~\"$zone\", alias=~\"$instance\"}[$__rate_interval])) by (le))",
              "interval": "",
              "legendFormat": "95th percentile",
              "queryType": "randomWalk",
              "refId": "A"
            }
          ],
          "title": "GraphQL -- 95th percentile",
          "type": "timeseries"
        },
opomuc commented 2 years ago
{
          "aliasColors": {},
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": "Prometheus",
          "decimals": 3,
          "description": "Requests, processed with success (code 2xx) on Tarantool's side.\nGraph shows mean count per second.\nIf `No data` displayed for Prometheus panel,\ncheck up your 'rate_time_range' variable.\n",
          "fill": 0,
          "fillGradient": 0,
          "gridPos": {
            "h": 9,
            "w": 24,
            "x": 0,
            "y": 17
          },
          "hiddenSeries": false,
          "id": 77,
          "legend": {
            "alignAsTable": true,
            "avg": true,
            "current": true,
            "max": true,
            "min": false,
            "rightSide": true,
            "show": true,
            "sideWidth": null,
            "sort": "max",
            "sortDesc": true,
            "total": false,
            "values": true
          },
          "lines": true,
          "linewidth": 1,
          "links": [],
          "nullPointMode": "null",
          "options": {
            "alertThreshold": true
          },
          "percentage": false,
          "pluginVersion": "8.0.3",
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [],
          "spaceLength": 10,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "exemplar": true,
              "expr": "sum by (method, type) (rate(tdg_rest_result_200{job=~\"$job\", ris_sub=~\"$zone\", alias=~\"$instance\"}[$__rate_interval]))",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 2,
              "legendFormat": " {{method}} {{type}} ",
              "refId": "A"
            }
          ],
          "thresholds": [],
          "timeFrom": null,
          "timeRegions": [],
          "timeShift": null,
          "title": "Success requests (code 2xx)",
          "tooltip": {
            "shared": true,
            "sort": 2,
            "value_type": "individual"
          },
          "type": "graph",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": []
          },
          "yaxes": [
            {
              "decimals": 0,
              "format": "none",
              "label": "requests per second",
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            },
            {
              "decimals": 3,
              "format": "none",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            }
          ],
          "yaxis": {
            "align": false,
            "alignLevel": null
          }
        },
opomuc commented 2 years ago
{
          "aliasColors": {},
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": "Prometheus",
          "decimals": 3,
          "description": "Requests, processed with 4xx error on Tarantool's side.\nGraph shows mean count per second.\nIf `No data` displayed for Prometheus panel,\ncheck up your 'rate_time_range' variable.\n",
          "fill": 0,
          "fillGradient": 0,
          "gridPos": {
            "h": 9,
            "w": 24,
            "x": 0,
            "y": 26
          },
          "hiddenSeries": false,
          "id": 78,
          "legend": {
            "alignAsTable": true,
            "avg": true,
            "current": true,
            "max": true,
            "min": false,
            "rightSide": true,
            "show": true,
            "sideWidth": null,
            "sort": "current",
            "sortDesc": true,
            "total": false,
            "values": true
          },
          "lines": true,
          "linewidth": 1,
          "links": [],
          "nullPointMode": "null",
          "options": {
            "alertThreshold": true
          },
          "percentage": false,
          "pluginVersion": "8.0.3",
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [],
          "spaceLength": 10,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "exemplar": true,
              "expr": "sum by (method, type) (rate(tdg_rest_exec_time_count{job=~\"$job\", ris_sub=~\"$zone\", alias=~\"$instance\"}[$__rate_interval]) - rate(tdg_rest_result_200{job=~\"$job\", ris_sub=~\"$zone\", alias=~\"$instance\"}[$__rate_interval]))",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 2,
              "legendFormat": "{{method}} -- {{type}}",
              "refId": "A"
            }
          ],
          "thresholds": [],
          "timeFrom": null,
          "timeRegions": [],
          "timeShift": null,
          "title": "Error requests (code 4xx)",
          "tooltip": {
            "shared": true,
            "sort": 2,
            "value_type": "individual"
          },
          "type": "graph",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": []
          },
          "yaxes": [
            {
              "decimals": 0,
              "format": "none",
              "label": "requests per second",
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            },
            {
              "decimals": 3,
              "format": "none",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            }
          ],
          "yaxis": {
            "align": false,
            "alignLevel": null
          }
        }
no1seman commented 2 years ago

It seems that this issue must be taken into account: https://github.com/tarantool/tdg2/issues/1623