gm3dmo / syslog-to-csv

Turn log files into data.
GNU General Public License v3.0
15 stars 7 forks source link

DuckDB experiments on gitrpcd logfmt files #37

Open gm3dmo opened 5 months ago

gm3dmo commented 5 months ago

Convert a bundle to jsonl and upload to DuckDB for superfast querying

The main benefit of this approach is that it the ingestion doesn't get wrecked by log format changes. It simply ingests everything.

It seems that the log files in a bundle cover only a very short timespan now.

Setup:

git clone https://github.com/gm3dmo/syslog-to-csv
curl -L -O https://raw.githubusercontent.com/dloss/klp/main/klp.py
curl -L -O https://github.com/duckdb/duckdb/releases/download/v1.0.0/duckdb_cli-linux-amd64.zip
unzip duckdb_cli-linux-amd64.zip
python -m venv venv
source venv/bin/activate
pip install csvkit

# split syslog files
pypy3 syslog-to-csv/syslog-daemon-splitter.py system-logs/syslog --sankey
pypy3 syslog-to-csv/syslog-daemon-splitter.py system-logs/syslog.1 --sankey

# Convert the split `system-logs/split-logs-syslog/gitrpcd.log` to jsonl format:

# gitrpcd
pypy3 klp.py --stats system-logs/split-logs-syslog/gitrpcd.log --output-format jsonl > system-logs/split-logs-syslog/gitrpcd.json
pypy3 klp.py --stats system-logs/split-logs-syslog.1/gitrpcd.log --output-format jsonl > system-logs/split-logs-syslog.1/gitrpcd.json

# authzd
pypy3 klp.py --stats system-logs/split-logs-syslog/authzd.log --output-format jsonl > system-logs/split-logs-syslog/authzd.json
pypy3 klp.py --stats system-logs/split-logs-syslog.1/authzd.log --output-format jsonl > system-logs/split-logs-syslog.1/authzd.json

#hookshot
pypy3 klp.py --stats system-logs/split-logs-syslog/hookshot-go.log --output-format jsonl > system-logs/split-logs-syslog/hookshot-go.json
pypy3 klp.py --stats system-logs/split-logs-syslog.1/hookshot-go.log --output-format jsonl > system-logs/split-logs-syslog.1/hookshot-go.json

# babeld
pypy3 klp.py --stats babeld-logs/babeld.log  --output-format jsonl > babeld-logs/babeld.json
pypy3 klp.py --stats babeld-logs/babeld.log.1  --output-format jsonl > babeld-logs/babeld.1.json

# unicorn
pypy3 klp.py --stats docker/container-logs/github-unicorn.log  --output-format jsonl > docker/container-logs/github-unicorn.json

# spokesd 
pypy3 klp.py --stats system-logs/split-logs-syslog/spokesd.log --output-format jsonl > system-logs/split-logs-syslog/spokesd.json
pypy3 klp.py --stats system-logs/split-logs-syslog.1/spokesd.log --output-format jsonl > system-logs/split-logs-syslog.1/spokesd.json

# Create duckdb tables using ddl files:

./duckdb test.db < syslog-to-csv/duckdb/exceptions.ddl
./duckdb test.db < syslog-to-csv/duckdb/authzd.ddl
./duckdb test.db < syslog-to-csv/duckdb/unicorn.ddl
./duckdb test.db < syslog-to-csv/duckdb/gitrpcd.ddl
./duckdb test.db < syslog-to-csv/duckdb/babeld.ddl
./duckdb test.db < syslog-to-csv/duckdb/hookshot.ddl

./duckdb test.db < syslog-to-csv/duckdb/spokesd.ddl

Queries

3.12

select SeverityText, count(msg) as count_of_msg, msg from spokesd group by SeverityText, msg order by count_of_msg desc, SeverityText, msg;

3.11

select level, count(msg) as count_of_msg, msg from spokesd group by level, msg order by count_of_msg desc, level, msg;

gitrpcd

gitrpcd queries

select count(repository_id), repository_id from gitrpcd group by repository_id;
select count(user_agent), user_agent from gitrpcd group by user_agent;
select count(msg), msg from gitrpcd group by msg;
select level, count(msg), msg from gitrpcd group by level, msg order by level, msg;
select level, count(msg) as count_of_msg, msg from gitrpcd group by level, msg order by count_of_msg desc, level, msg;
.mode csv
.headers on
SELECT 
  DATE_TRUNC('hour', CAST("time" AS TIMESTAMP)) AS time_hour,
  "msg",
  COUNT(*) AS msg_count
FROM gitrpcd
GROUP BY time_hour, "msg"
ORDER BY time_hour;

authzd

.mode csv
.headers on
SELECT 
  DATE_TRUNC('hour', CAST("Timestamp" AS TIMESTAMP)) AS time_hour, Body, COUNT(Body) AS body_count,
FROM authzd
GROUP BY time_hour, "Body"
ORDER BY time_hour;
select SeverityText, count(Body) as count_of_body, Body from authzd group by SeverityText, Body order by count_of_body desc, SeverityText;
select SeverityLevel, count(Body) as count_of_body, Body from authzd group by SeverityLevel, Body order by count_of_body desc, SeverityLevel;

babeld

babeld queries

./duckdb test.db < syslog-to-csv/duckdb/babeld/babeld-timespan.sql
./duckdb test.db < syslog-to-csv/duckdb/babeld/babeld-message-by-hour.sql
./duckdb test.db < syslog-to-csv/duckdb/babeld/babeld-top-20-msg.sql

Unicorn

./duckdb test.db < syslog-to-csv/duckdb/unicorn/3.12/unicorn-span.sql
./duckdb test.db < syslog-to-csv/duckdb/unicorn/3.12/unicorn-status-by-hour.sql

Hookshot

./duckdb test.db < syslog-to-csv/duckdb/hookshot/hookshot-status-5-minutes.sql
./duckdb test.db < syslog-to-csv/duckdb/hookshot/hookshot-status-columns.sql

Exceptions

.mode csv
.headers on
SELECT 
  DATE_TRUNC('hour', CAST(created_at AS TIMESTAMP)) AS hour,
  message,
  COUNT(*) AS message_count
FROM exceptions
GROUP BY hour, message
ORDER BY hour, message_count DESC;
gm3dmo commented 5 months ago

Does conversion to json work:

[Bundle #135189] 135189 $ wc -l system-logs/split-logs-syslog.1/gitrpcd.*
   290161 system-logs/split-logs-syslog.1/gitrpcd.json
   290161 system-logs/split-logs-syslog.1/gitrpcd.log
   580322 total

[Bundle #135189] 135189 $ head -1 system-logs/split-logs-syslog.1/gitrpcd.log ; tail -1 system-logs/split-logs-syslog.1/gitrpcd.log


Jun  6 15:59:23 github-circle-electron-com-primary gitrpcd[4289]: time=2024-06-06T15:59:23.796887689Z level=INFO msg=request app=gitrpcd sha=19857c54c7a0c0d822992c234066f7df356db7ed host=github-circle-electron-com-primary method=CONNECT url=//git-upload-pack:80

Jun  6 16:59:51 github-circle-electron-com-primary gitrpcd[4289]: time=2024-06-06T16:59:51.562751566Z level=INFO msg="command exited" app=gitrpcd sha=19857c54c7a0c0d822992c234066f7df356db7ed host=github-circle-electron-com-primary request_id=f83c9a875f8aaee1a9277ec8a322a06c component=githttpdaemon method=CONNECT request_url=//git-upload-pack:80 user_agent=babeld/f3ea3d34 command=git-upload-pack path=/0/nw/05/ed/e5/46448/49702.git at=finish elapsed=2.290181 exit=0
 select min(time), max(time) from gitrpcd;
┌────────────────────────────────┬────────────────────────────────┐
│          min("time")           │          max("time")           │
│            varchar             │            varchar             │
├────────────────────────────────┼────────────────────────────────┤
│ 2024-06-06T15:59:23.796887689Z │ 2024-06-06T17:05:59.523893746Z │
└────────────────────────────────┴────────────────────────────────┘
$ grep "2024-06-06T17:05:59.523893746Z" system-logs/split-logs-syslog/gitrpcd.log 
Jun  6 17:05:59 github-schneider-electric-com-primary gitrpcd[4289]: time=2024-06-06T17:05:59.523893746Z level=INFO msg=request app=gitrpcd sha=19857c54c7a0c0d822992c234066f7df356db7ed host=github-schneider-electric-com-primary service=trees component=twirp twirp_status=200 twirp_method=CompareTrees twirp_service=TreesAPI twirp_package=github.spokes.trees.v1 request_id=f99be8e8e7188ace9d9877e858fca920 user_agent="github-enterpriseworker/6995b8978891dac1dc583f8baae2753f7ccd93ce spokesd/88fbad0cdc52afbac03a3edb70a913cf07bd22f6" remote_addr=127.0.0.1:56394 http_version=HTTP/1.1 spec=33396/36060 request_duration=3.953501
 select count(*) from gitrpcd
  ;
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       310521 │
└──────────────┘
 wc -l system-logs/split-logs-syslog/gitrpcd.log system-logs/split-logs-syslog.1/gitrpcd.log 
    20360 system-logs/split-logs-syslog/gitrpcd.log
   290161 system-logs/split-logs-syslog.1/gitrpcd.log
   310521 total