flashcatcloud / categraf

one-stop telemetry collector for nightingale
https://flashcat.cloud/docs/
MIT License
839 stars 254 forks source link

PromQL execute kubelet_running_pods, nwebapi reports panic recovered #360

Closed cdy668 closed 1 year ago

cdy668 commented 1 year ago

Relevant config.toml

# debug, release
RunMode = "release"

# # custom i18n dict config
# I18N = "./etc/i18n.json"

# # custom i18n request header key
# I18NHeaderKey = "X-Language"

# metrics descriptions
MetricsYamlFile = "./etc/metrics.yaml"

BuiltinAlertsDir = "./etc/alerts"
BuiltinDashboardsDir = "./etc/dashboards"

# config | api
ClustersFrom = "config"

# using when ClustersFrom = "api"
ClustersFromAPIs = []

[[NotifyChannels]]
Label = "邮箱"
# do not change Key
Key = "email"

[[NotifyChannels]]
Label = "钉钉机器人"
# do not change Key
Key = "dingtalk"

[[NotifyChannels]]
Label = "企微机器人"
# do not change Key
Key = "wecom"

[[NotifyChannels]]
Label = "飞书机器人"
# do not change Key
Key = "feishu"

[[NotifyChannels]]
Label = "mm bot"
# do not change Key
Key = "mm"

[[NotifyChannels]]
Label = "telegram机器人"
# do not change Key
Key = "telegram"

[[ContactKeys]]
Label = "Wecom Robot Token"
# do not change Key
Key = "wecom_robot_token"

[[ContactKeys]]
Label = "Dingtalk Robot Token"
# do not change Key
Key = "dingtalk_robot_token"

[[ContactKeys]]
Label = "Feishu Robot Token"
# do not change Key
Key = "feishu_robot_token"

[[ContactKeys]]
Label = "MatterMost Webhook URL"
# do not change Key
Key = "mm_webhook_url"

[[ContactKeys]]
Label = "Telegram Robot Token"
# do not change Key
Key = "telegram_robot_token"

[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "DEBUG"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours = 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256

[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 18000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = true
# whether enable pprof
PProf = false
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120

[JWTAuth]
# signing key
SigningKey = "5b94a0fd640fe2765af826acfe42d151"
# unit: min
AccessExpired = 1500
# unit: min
RefreshExpired = 10080
RedisKeyPrefix = "/jwt/"

[ProxyAuth]
# if proxy auth enabled, jwt auth is disabled
Enable = false
# username key in http proxy header
HeaderUserNameKey = "X-User-Name"
DefaultRoles = ["Standard"]

[BasicAuth]
user001 = "ccc26da7b9axxxxxxxxa36c07dcc5"

[AnonymousAccess]
PromQuerier = false
AlertDetail = false

[LDAP]
Enable = false
Host = "ldap.example.org"
Port = 389
BaseDn = "dc=example,dc=org"
# AD: manager@example.org
BindUser = "cn=manager,dc=example,dc=org"
BindPass = "*******"
# openldap format e.g. (&(uid=%s))
# AD format e.g. (&(sAMAccountName=%s))
AuthFilter = "(&(uid=%s))"
CoverAttributes = true
TLS = false
StartTLS = true
# ldap user default roles
DefaultRoles = ["Standard"]

[LDAP.Attributes]
Nickname = "cn"
Phone = "mobile"
Email = "mail"

[OIDC]
Enable = false
RedirectURL = "http://n9e.com/callback"
SsoAddr = "http://sso.example.org"
ClientId = ""
ClientSecret = ""
CoverAttributes = true
DefaultRoles = ["Standard"]

[OIDC.Attributes]
Nickname = "nickname"
Phone = "phone_number"
Email = "email"

[Redis]
# address, ip:port
Address = "192.168.100.xx:6379"
# requirepass
Password = "xxxxxxx"
# # db
DB = 10

[DB]
# mysql: user:password@tcp(host:port)/dbname?param=value
# postgres: host=%s port=%s user=%s dbname=%s password=%s sslmode=%s
DSN="devops:xxxx@tcp(192.168.100.xx:3306)/n9e_v5?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
# enable debug mode or not
Debug = true
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
# table prefix
TablePrefix = ""
# enable auto migrate or not
# EnableAutoMigrate = false

[[Clusters]]
# Prometheus cluster name
Name = "Default"
# Prometheus APIs base url
Prom = "http://192.168.100.xx:9090"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Timeout = 30000
DialTimeout = 3000
MaxIdleConnsPerHost = 100

[Ibex]
Address = "http://192.168.100.xx:10090"
# basic auth
BasicAuthUser = "ibex"
BasicAuthPass = "ibex"
# unit: ms
Timeout = 3000

[TargetMetrics]
TargetUp = '''max(max_over_time(target_up{ident=~"(%s)"}[%dm])) by (ident)'''
LoadPerCore = '''max(max_over_time(system_load_norm_1{ident=~"(%s)"}[%dm])) by (ident)'''
MemUtil = '''100-max(max_over_time(mem_available_percent{ident=~"(%s)"}[%dm])) by (ident)'''
DiskUtil = '''max(max_over_time(disk_used_percent{ident=~"(%s)", path="/"}[%dm])) by (ident)'''

Logs from categraf

[0.926ms] [rows:1] SELECT * FROM `board` WHERE id = 8
2023-02-17 09:11:46.361076 INFO aop/logger.go:279 [GIN] | 200 |     999.595µs |    100.10.10.66 | GET      /api/n9e/board/8/pure

2023/02/17 09:11:48 [Recovery] 2023/02/17 - 09:11:48 panic recovered:
write tcp 192.168.100.xx:18000->100.10.10.66:28986: i/o timeout
/home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/render/json.go:56 (0x980f3e)
/home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:913 (0x98be37)
/home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:956 (0xf4268f)
/home/runner/go/pkg/mod/github.com/toolkits/pkg@v1.3.1-0.20220824084030-9f9f830a05d5/ginx/render.go:53 (0xf4259b)
/home/runner/work/nightingale/nightingale/src/webapi/router/router_prometheus.go:61 (0xf8ef06)
/home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:165 (0xf8c43e)
/home/runner/work/nightingale/nightingale/src/webapi/router/router_mw.go:93 (0xf8c425)
/home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:165 (0xf42f30)
/home/runner/work/nightingale/nightingale/src/pkg/aop/logger.go:251 (0xf42f11)
/home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:165 (0xf434fa)
/home/runner/work/nightingale/nightingale/src/pkg/aop/recovery.go:98 (0xf434e6)
/home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:165 (0xf71952)
/home/runner/work/nightingale/nightingale/src/webapi/router/router.go:49 (0xf7193e)
/home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:165 (0xf71348)
/home/runner/work/nightingale/nightingale/src/webapi/router/router.go:23 (0xf71325)
/home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:165 (0x99249d)
/home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/gin.go:489 (0x992125)
/home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/gin.go:445 (0x991c84)
/opt/hostedtoolcache/go/1.18.10/x64/src/net/http/server.go:2916 (0x6f541a)
/opt/hostedtoolcache/go/1.18.10/x64/src/net/http/server.go:1966 (0x6f0416)
/opt/hostedtoolcache/go/1.18.10/x64/src/runtime/asm_amd64.s:1571 (0x46b500)

2023/02/17 09:36:13 write tcp 192.168.100.xx:18000->100.10.10.66:29064: write: broken pipe
POST /api/n9e/query-range-batch HTTP/1.1
Host: 192.168.100.xx:18000
Accept: application/json
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NfdXVpZCI6ImI3ZmRmZGQ3LTllZGItNDQ1Yi1hMjAyLWYyMmViOGNmY2UwNSIsImF1dGhvcml6ZWQiOnRydWUsImV4cCI6MTY3NjYyOTkyMiwidXNlcl9pZGVudGl0eSI6IjEtcm9vdCJ9.MLap3uCIKY_SkLtBZR4yghgUH7xBJdZY9L2VShgISSs
Connection: keep-alive
Content-Length: 171
Content-Type: application/json;charset=UTF-8
Cookie: username=xxxxxx; password=xxxxx
Origin: http://192.168.100.xx:18000
Referer: http://192.168.100.xx:18000/dashboards/8
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36
X-Cluster: Default
X-Language: zh

System info

flashcatcloud/categraf:v0.1.5 docker Version: 20.10.21 CentOS Linux release 7.9.2009 (Core)

Docker

Client: Context: default Debug Mode: false Plugins: app: Docker App (Docker Inc., v0.9.1-beta3) buildx: Docker Buildx (Docker Inc., v0.9.1-docker) scan: Docker Scan (Docker Inc., v0.21.0)

Server: Containers: 12 Running: 12 Paused: 0 Stopped: 0 Images: 22 Server Version: 20.10.21 Storage Driver: overlay2 Backing Filesystem: xfs Supports d_type: true Native Overlay Diff: true userxattr: false Logging Driver: json-file Cgroup Driver: cgroupfs Cgroup Version: 1 Plugins: Volume: local Network: bridge host ipvlan macvlan null overlay Log: awslogs fluentd gcplogs gelf journald json-file local logentries splunk syslog Swarm: inactive Runtimes: io.containerd.runc.v2 io.containerd.runtime.v1.linux runc Default Runtime: runc Init Binary: docker-init containerd version: 770bd0108c32f3fb5c73ae1264f7e503fe7b2661 runc version: v1.1.4-0-g5fd4c4d init version: de40ad0 Security Options: seccomp Profile: default Kernel Version: 5.4.195-1.el7.elrepo.x86_64 Operating System: CentOS Linux 7 (Core) OSType: linux Architecture: x86_64 CPUs: 16 Total Memory: 31.34GiB

Steps to reproduce

1. kubectl apply -n monitoring -f k8s/daemonset.yaml
2. Edit k8s/daemonset.yaml, replace NSERVER_SERVICE_WITH_PORT with the service ip:port of nserver in your cluster, replace CATEGRAF_NAMESPACE with the namespace value, then run:
3. Import https://github.com/flashcatcloud/categraf/blob/main/inputs/kubernetes/kubelet-metrics-dash.json ...

Expected behavior

When I use the Kubelet metrics dashboard, the web page shows "Failed to fetch". It seems that the query takes too long to return, resulting in a timeout.

Actual behavior

sum(up{source="kubelet", cluster=~"$cluster"}): there is no data for this metric.

Additional info

No response

kongfei605 commented 1 year ago

config.toml is not the configuration of categraf; it is the configuration of nightingale. Reading this link may help you.