Description

Every release involves the analysis of plots generated automatically to discover potential issues in the product. This isn't an easy task, and the lack of a proper tool to compare the graphs makes it harder. These are just some of the issues the development team finds during this process:

- The colors of the traces are hard to distinguish.
- The scales of one release sometimes don't match those of the new one, which creates confusion.
- The plot is static: we can't zoom in/out, add traces for other components/releases, etc.
- The time axis isn't right in some cases.
- It isn't possible to determine the exact timestamp of an interesting peak; we have to guess.

We understand that v5.0 will bring many changes, but currently we lose too much time fighting with the plots instead of looking for issues. We need a small improvement until the major release is ready.
Proposal / PoC

I propose a small script for the cluster benchmark metrics. I developed it to analyze this issue: https://github.com/wazuh/wazuh/issues/27104#issuecomment-2511862102. Using the plotly package, we can easily compare different traces, releases, daemons, etc.

Demo: https://github.com/user-attachments/assets/591cbaf5-6550-4b21-9e2a-fbebbd0687c3
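For reference, the script (shown below) can be invoked like this; the CSV file names and revision tags here are invented for the example:

```
# Pass the files directly, with optional revision tags for the trace names
python data_compare.py -f metrics_old.csv metrics_new.csv -r _rev1 _rev2

# Or give only the number of files and pick each one in a file dialog
python data_compare.py -i 2
```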
requirements.txt

```
numpy==2.1.3
packaging==24.2
pandas==2.2.3
patsy==1.0.1
plotly==5.24.1
python-dateutil==2.9.0.post0
pytz==2024.2
scipy==1.14.1
six==1.16.0
statsmodels==0.14.4
tenacity==9.0.0
tzdata==2024.2
```
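Assuming a plain virtual environment, the pinned dependencies install in the usual way:

```
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```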
data_compare.py

```python
# data_compare.py
import argparse
import sys
import tkinter as tk
from datetime import datetime, timedelta
from tkinter import filedialog

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Read the command line; the data files are requested interactively if not supplied
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--inputs', type=int, help='Number of files to read', default=-1, required=False)
parser.add_argument('-f', '--files', type=str, help='Files to read, separated by spaces', default=[], nargs="*", required=False)
parser.add_argument('-r', '--revisions', type=str, help='Revision tags to add to the trace names', default=[], nargs="*", required=False)
args = parser.parse_args()

DATA_FILES_AMOUNT = args.inputs
DATA_FILES_ARRAY = args.files
if DATA_FILES_AMOUNT > 0 and len(DATA_FILES_ARRAY) == 0:
    # Only the number of files was given: pick each one in a file dialog
    for i in range(DATA_FILES_AMOUNT):
        root = tk.Tk()
        root.withdraw()
        DATA_FILES_ARRAY.append(filedialog.askopenfilename(title="Select a file", filetypes=[("CSV files", "*.csv")]))
elif DATA_FILES_AMOUNT <= 0 and len(DATA_FILES_ARRAY) != 0:
    DATA_FILES_AMOUNT = len(DATA_FILES_ARRAY)
else:
    print('Please provide either the number of files to read or the files themselves')
    sys.exit(1)

# Read the data from the files
df_array = [pd.read_csv(file) for file in DATA_FILES_ARRAY]

# Build a label for each file from the version and daemon name in the data
df_name_array = []
if len(args.revisions) == 0:
    args.revisions = [''] * DATA_FILES_AMOUNT
for i in range(DATA_FILES_AMOUNT):
    df_name_array.append('_' + df_array[i]['Version'][0] + args.revisions[i] + '_' + df_array[i]['Daemon'][0])

# Construct the time x-axis relative to the test duration. All files are assumed
# to share the same duration and sampling period, so only the first one is used.
DATA_POINTS = len(df_array[0]['Timestamp'])
fmt = '%Y/%m/%d %H:%M:%S'
tstamp1 = datetime.strptime(df_array[0]['Timestamp'][0], fmt)
tstamp2 = datetime.strptime(df_array[0]['Timestamp'][df_array[0]['Timestamp'].last_valid_index()], fmt)
td = tstamp2 - tstamp1
step = int(td.total_seconds() / DATA_POINTS)
tstamp_initial = datetime.strptime('00:00:00', '%H:%M:%S')
time_col = []
for i in range(DATA_POINTS):  # one relative time value per data point
    tmp_time = tstamp_initial + timedelta(seconds=i * step)
    time_col.append(tmp_time.time())

# Group the columns by type
y_cols_CPU = ['CPU(%)']
y_cols_memory = ['VMS(KB)', 'RSS(KB)', 'USS(KB)', 'PSS(KB)', 'SWAP(KB)']
y_cols_FD = ['FD']
y_cols_IO = ['Disk_Read(KB)', 'Disk_Written(KB)']
y_cols_IO_ops = ['Read_Ops', 'Write_Ops']
y_cols_IO_speed = ['Disk_Read_Speed(KB/s)', 'Disk_Write_Speed(KB/s)']


def plotter_multiple_y_axis(y_cols, name):
    """Plot two metrics from every file against primary and secondary y-axes."""
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    for i in range(DATA_FILES_AMOUNT):
        fig.add_trace(go.Scatter(
            x=time_col,
            y=df_array[i][y_cols[0]],
            mode="lines",
            name=y_cols[0] + df_name_array[i],
            text=df_array[i]['Timestamp']), secondary_y=False)
        fig.add_trace(go.Scatter(
            x=time_col,
            y=df_array[i][y_cols[1]],
            mode="lines",
            name=y_cols[1] + df_name_array[i],
            text=df_array[i]['Timestamp']), secondary_y=True)
    fig.update_layout(title=name)
    fig.update_traces(hovertemplate='Time: %{text}<br>Test time: %{x}<br>Value: %{y}')
    fig.update_xaxes(title_text="Test time (s)")
    fig.update_yaxes(title_text=y_cols[0], secondary_y=False)
    fig.update_yaxes(title_text=y_cols[1], secondary_y=True)
    fig.show()


def plotter(y_cols, name):
    """Plot a group of metrics for every file and print comparison statistics."""
    fig = go.Figure()
    for y_col in y_cols:
        for i in range(DATA_FILES_AMOUNT):
            fig.add_trace(go.Scatter(
                x=time_col,
                y=df_array[i][y_col],
                mode="lines",
                name=y_col + df_name_array[i],
                text=df_array[i]['Timestamp']))
    fig.update_traces(hovertemplate='Time: %{text}<br>Test time: %{x}<br>Value: %{y}')
    fig.update_layout(
        title=name,
        xaxis_title="Test time (s)",
        yaxis_title="Value")
    fig.show()
    # Analyze the data and print the results, marking the extreme value with (*)
    print('Data compare results:')
    for y_col in y_cols:
        print("-" * 50)
        print(f"Values for {y_col}")
        df_mean_array = []
        df_max_array = []
        df_min_array = []
        df_std_array = []
        for i in range(DATA_FILES_AMOUNT):
            df_mean_array.append(int(df_array[i][y_col].mean()))
            df_max_array.append(int(df_array[i][y_col].max()))
            df_min_array.append(int(df_array[i][y_col].min()))
            df_std_array.append(int(df_array[i][y_col].std()))
        print('Mean:')
        for i in range(DATA_FILES_AMOUNT):
            df_highlight = '(*)' + df_name_array[i] if max(df_mean_array) == df_mean_array[i] else df_name_array[i]
            print(f"{df_highlight:>35}{df_mean_array[i]:>35}")
        print('Max:')
        for i in range(DATA_FILES_AMOUNT):
            df_highlight = '(*)' + df_name_array[i] if max(df_max_array) == df_max_array[i] else df_name_array[i]
            print(f"{df_highlight:>35}{df_max_array[i]:>35}")
        print('Min:')
        for i in range(DATA_FILES_AMOUNT):
            df_highlight = '(*)' + df_name_array[i] if min(df_min_array) == df_min_array[i] else df_name_array[i]
            print(f"{df_highlight:>35}{df_min_array[i]:>35}")
        print('Std:')
        for i in range(DATA_FILES_AMOUNT):
            df_highlight = '(*)' + df_name_array[i] if max(df_std_array) == df_std_array[i] else df_name_array[i]
            print(f"{df_highlight:>35}{df_std_array[i]:>35}")
    print("-" * 50)


plotter(y_cols_CPU, 'Cluster workload benchmarks metrics compare: CPU(%)')
plotter(y_cols_memory, 'Cluster workload benchmarks metrics compare: Memory(KB)')
plotter(y_cols_FD, 'Cluster workload benchmarks metrics compare: FD')
plotter(y_cols_IO, 'Cluster workload benchmarks metrics compare: Disk IO(KB)')
plotter(y_cols_IO_ops, 'Cluster workload benchmarks metrics compare: Disk IO Ops')
plotter(y_cols_IO_speed, 'Cluster workload benchmarks metrics compare: Disk IO Speed(KB/s)')
plotter_multiple_y_axis(['CPU(%)', 'RSS(KB)'], 'Cluster workload benchmarks metrics compare: CPU(%) and RSS(KB)')
```
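The script expects each input CSV to carry the metadata columns it reads (Timestamp in `%Y/%m/%d %H:%M:%S` format, Version, Daemon) plus the metric columns grouped above. As a minimal sketch for a dry run, a compatible dummy file could be generated like this; the daemon name, version, and all metric values are invented:

```python
# make_dummy_csv.py - generate a CSV with the columns data_compare.py reads.
# The daemon name, version, and metric values below are invented for testing.
import pandas as pd

rows = 10
df = pd.DataFrame({
    'Timestamp': [f'2024/11/28 10:{i:02d}:00' for i in range(rows)],  # '%Y/%m/%d %H:%M:%S'
    'Daemon': ['wazuh-clusterd'] * rows,
    'Version': ['v4.9.2'] * rows,
    'CPU(%)': [5 + i % 3 for i in range(rows)],
    'VMS(KB)': [500000] * rows,
    'RSS(KB)': [80000 + 100 * i for i in range(rows)],
    'USS(KB)': [70000] * rows,
    'PSS(KB)': [75000] * rows,
    'SWAP(KB)': [0] * rows,
    'FD': [120] * rows,
    'Disk_Read(KB)': [10 * i for i in range(rows)],
    'Disk_Written(KB)': [20 * i for i in range(rows)],
    'Read_Ops': [100 * i for i in range(rows)],
    'Write_Ops': [150 * i for i in range(rows)],
    'Disk_Read_Speed(KB/s)': [10] * rows,
    'Disk_Write_Speed(KB/s)': [20] * rows,
})
df.to_csv('dummy_metrics.csv', index=False)
```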
DoD

- [ ] Determine what small changes could produce the biggest improvements in the plots/process.
- [ ] Implement the first iteration and discuss with the team whether the improvement reduces the burden of the analysis process.