etcd-io / etcd

Distributed reliable key-value store for the most critical data of a distributed system
https://etcd.io
Apache License 2.0
47.89k stars 9.78k forks source link

Analyse QPS data for robustness tests #18931

Open serathius opened 1 day ago

serathius commented 1 day ago

What would you like to be added?

The Prow page for a test includes an "Artifacts" link, which contains a file with the test log: https://gcsweb.k8s.io/gcs/kubernetes-ci-logs/logs/ci-etcd-robustness-main-amd64/1858942059486908416/artifacts/

What if we could parse the file and track the progress over time? No more guessing what QPS is good. What if we could do analysis across different dimensions? We could find test scenarios that might need improvement. What if we could visualize it? Like https://perf-dash.k8s.io/#/?jobname=gce-5000Nodes&metriccategoryname=APIServer&metricname=LoadResponsiveness_Prometheus&Resource=pods&Scope=cluster&Subresource=&Verb=LIST

Script to parse it

import json

# Log message that tags the lines carrying pre-failure-injection traffic stats.
MARKER = 'Reporting traffic before failure injection'

with open("test.out") as log_file:
  for raw_line in log_file.readlines():
    if MARKER in raw_line:
      # A matching line is a JSON record; the last tab-separated field of its
      # "Output" value is itself a JSON document holding the metrics.
      record = json.loads(raw_line)
      *_, metrics_json = record["Output"].split('\t')
      metrics = json.loads(metrics_json)
      print(record["Test"], float(metrics["qps"]))

Why is this needed?

Track impact of QPS for tests over time.

serathius commented 16 hours ago

Rewrote in golang

package main

import (
    "bufio"
    "encoding/json"
    "fmt"
    "io"
    "net/http"
    "strings"
)

// objects presumably models the top-level GCS object-listing response
// (TODO confirm against the API schema).
//
// NOTE(review): every field is unexported, so encoding/json will not populate
// this type when unmarshalling; the fields would need to be exported (with
// `json:"..."` tags) before it can be used for decoding.
type objects struct {
    kind string
    // The list of entries is currently disabled.
    //items []object
}

// object presumably models a single entry of the GCS object listing
// (TODO confirm against the API schema).
//
// NOTE(review): both fields are unexported, so encoding/json will not populate
// them when unmarshalling.
type object struct {
    mediaLink   string
    timeCreated string
}

const (
    // gcsListURL queries the GCS JSON API for the *.stdout artifacts stored
    // under the logs/ci-etcd-robustness- prefix of the kubernetes-ci-logs bucket.
    gcsListURL = "https://storage.googleapis.com/storage/v1/b/kubernetes-ci-logs/o/?prefix=logs/ci-etcd-robustness-&matchGlob=**/artifacts/*.stdout"
    // trafficMarker is the log message that is followed by the JSON traffic stats.
    trafficMarker = "Reporting traffic before failure injection"
    // maxLogs caps how many of the listed logs are downloaded and analysed.
    maxLogs = 1
    // maxLineBytes raises bufio.Scanner's default 64 KiB token limit so that a
    // long log line does not silently end the scan.
    maxLineBytes = 16 << 20
)

// main lists the robustness-test stdout artifacts, prints how many were found,
// and then prints "<test name> <qps>" for every traffic report found in the
// first maxLogs logs.
//
// A failure to obtain the listing panics; a failure on an individual log is
// printed and the log is skipped.
func main() {
    links, err := listMediaLinks(gcsListURL)
    if err != nil {
        panic(fmt.Sprintf("Failed to list GCS items: %v", err))
    }
    fmt.Printf("Collected %d\n", len(links))
    // Slicing with a guard keeps an empty listing from panicking.
    if len(links) > maxLogs {
        links = links[:maxLogs]
    }
    for _, link := range links {
        if err := reportLog(link); err != nil {
            fmt.Printf("Failed to read build-log.txt file: %v\n", err)
        }
    }
}

// listMediaLinks fetches the JSON listing at url and returns the "mediaLink"
// of every entry in its "items" array. A listing without "items" yields an
// empty slice.
func listMediaLinks(url string) ([]string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, fmt.Errorf("fetching GCS items: %w", err)
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("fetching GCS items: unexpected status %s", resp.Status)
    }
    data, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("reading GCS items: %w", err)
    }
    var listing struct {
        Items []struct {
            MediaLink string `json:"mediaLink"`
        } `json:"items"`
    }
    if err := json.Unmarshal(data, &listing); err != nil {
        return nil, fmt.Errorf("parsing GCS items: %w", err)
    }
    links := make([]string, 0, len(listing.Items))
    for _, item := range listing.Items {
        links = append(links, item.MediaLink)
    }
    return links, nil
}

// reportLog downloads the log at url and reports the traffic stats it contains.
// Living in its own function lets the deferred Close run once per log.
func reportLog(url string) error {
    resp, err := http.Get(url)
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("unexpected status %s", resp.Status)
    }
    return reportTraffic(resp.Body)
}

// reportTraffic scans r line by line. Each line that decodes as a JSON object
// whose "Output" contains trafficMarker has the text after the marker parsed
// as JSON, and its "qps" value is printed next to the line's "Test" name.
// Lines that are not JSON are skipped. The scanner's error, if any, is returned.
func reportTraffic(r io.Reader) error {
    scan := bufio.NewScanner(r)
    scan.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineBytes)
    for scan.Scan() {
        // A fresh value per line: json.Unmarshal leaves fields that are absent
        // from the input untouched, so reusing one struct would carry a
        // previous line's Output over to lines that have none.
        var entry struct {
            Output string
            Test   string
        }
        if err := json.Unmarshal(scan.Bytes(), &entry); err != nil {
            continue
        }
        at := strings.Index(entry.Output, trafficMarker)
        if at < 0 {
            continue
        }
        rawStats := entry.Output[at+len(trafficMarker):]
        // A pointer distinguishes a missing "qps" key from a real zero.
        var stats struct {
            QPS *float64 `json:"qps"`
        }
        if err := json.Unmarshal([]byte(rawStats), &stats); err != nil {
            fmt.Printf("Failed to parse traffic stats for %s: %v\n", entry.Test, err)
            continue
        }
        if stats.QPS == nil {
            fmt.Printf("Failed to parse traffic stats for %s: no qps value\n", entry.Test)
            continue
        }
        fmt.Printf("%s %f\n", entry.Test, *stats.QPS)
    }
    return scan.Err()
}