Closed cdy668 closed 1 year ago
# debug, release
RunMode = "release"

# # custom i18n dict config
# I18N = "./etc/i18n.json"

# # custom i18n request header key
# I18NHeaderKey = "X-Language"

# metrics descriptions
MetricsYamlFile = "./etc/metrics.yaml"

BuiltinAlertsDir = "./etc/alerts"
BuiltinDashboardsDir = "./etc/dashboards"

# config | api
ClustersFrom = "config"

# using when ClustersFrom = "api"
ClustersFromAPIs = []

[[NotifyChannels]]
Label = "邮箱"
# do not change Key
Key = "email"

[[NotifyChannels]]
Label = "钉钉机器人"
# do not change Key
Key = "dingtalk"

[[NotifyChannels]]
Label = "企微机器人"
# do not change Key
Key = "wecom"

[[NotifyChannels]]
Label = "飞书机器人"
# do not change Key
Key = "feishu"

[[NotifyChannels]]
Label = "mm bot"
# do not change Key
Key = "mm"

[[NotifyChannels]]
Label = "telegram机器人"
# do not change Key
Key = "telegram"

[[ContactKeys]]
Label = "Wecom Robot Token"
# do not change Key
Key = "wecom_robot_token"

[[ContactKeys]]
Label = "Dingtalk Robot Token"
# do not change Key
Key = "dingtalk_robot_token"

[[ContactKeys]]
Label = "Feishu Robot Token"
# do not change Key
Key = "feishu_robot_token"

[[ContactKeys]]
Label = "MatterMost Webhook URL"
# do not change Key
Key = "mm_webhook_url"

[[ContactKeys]]
Label = "Telegram Robot Token"
# do not change Key
Key = "telegram_robot_token"

[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "DEBUG"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours: 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256

[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 18000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = true
# whether enable pprof
PProf = false
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120

[JWTAuth]
# signing key
SigningKey = "5b94a0fd640fe2765af826acfe42d151"
# unit: min
AccessExpired = 1500
# unit: min
RefreshExpired = 10080
RedisKeyPrefix = "/jwt/"

[ProxyAuth]
# if proxy auth enabled, jwt auth is disabled
Enable = false
# username key in http proxy header
HeaderUserNameKey = "X-User-Name"
DefaultRoles = ["Standard"]

[BasicAuth]
user001 = "ccc26da7b9axxxxxxxxa36c07dcc5"

[AnonymousAccess]
PromQuerier = false
AlertDetail = false

[LDAP]
Enable = false
Host = "ldap.example.org"
Port = 389
BaseDn = "dc=example,dc=org"
# AD: manange@example.org
BindUser = "cn=manager,dc=example,dc=org"
BindPass = "*******"
# openldap format e.g. (&(uid=%s))
# AD format e.g. (&(sAMAccountName=%s))
AuthFilter = "(&(uid=%s))"
CoverAttributes = true
TLS = false
StartTLS = true
# ldap user default roles
DefaultRoles = ["Standard"]

[LDAP.Attributes]
Nickname = "cn"
Phone = "mobile"
Email = "mail"

[OIDC]
Enable = false
RedirectURL = "http://n9e.com/callback"
SsoAddr = "http://sso.example.org"
ClientId = ""
ClientSecret = ""
CoverAttributes = true
DefaultRoles = ["Standard"]

[OIDC.Attributes]
Nickname = "nickname"
Phone = "phone_number"
Email = "email"

[Redis]
# address, ip:port
Address = "192.168.100.xx:6379"
# requirepass
Password = "xxxxxxx"
# # db
DB = 10

[DB]
# postgres: host=%s port=%s user=%s dbname=%s password=%s sslmode=%s
DSN = "devops:xxxx@tcp(192.168.100.xx:3306)/n9e_v5?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
# enable debug mode or not
Debug = true
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
# table prefix
TablePrefix = ""
# enable auto migrate or not
# EnableAutoMigrate = false

[[Clusters]]
# Prometheus cluster name
Name = "Default"
# Prometheus APIs base url
Prom = "http://192.168.100.xx:9090"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Timeout = 30000
DialTimeout = 3000
MaxIdleConnsPerHost = 100

[Ibex]
Address = "http://192.168.100.xx:10090"
# basic auth
BasicAuthUser = "ibex"
BasicAuthPass = "ibex"
# unit: ms
Timeout = 3000

[TargetMetrics]
TargetUp = '''max(max_over_time(target_up{ident=~"(%s)"}[%dm])) by (ident)'''
LoadPerCore = '''max(max_over_time(system_load_norm_1{ident=~"(%s)"}[%dm])) by (ident)'''
MemUtil = '''100-max(max_over_time(mem_available_percent{ident=~"(%s)"}[%dm])) by (ident)'''
DiskUtil = '''max(max_over_time(disk_used_percent{ident=~"(%s)", path="/"}[%dm])) by (ident)'''
[0.926ms] [rows:1] SELECT * FROM `board` WHERE id = 8 2023-02-17 09:11:46.361076 INFO aop/logger.go:279 [GIN] | 200 | 999.595µs | 100.10.10.66 | GET /api/n9e/board/8/pure 2023/02/17 09:11:48 [Recovery] 2023/02/17 - 09:11:48 panic recovered: write tcp 192.168.100.xx:18000->100.10.10.66:28986: i/o timeout /home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/render/json.go:56 (0x980f3e) /home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:913 (0x98be37) /home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:956 (0xf4268f) /home/runner/go/pkg/mod/github.com/toolkits/pkg@v1.3.1-0.20220824084030-9f9f830a05d5/ginx/render.go:53 (0xf4259b) /home/runner/work/nightingale/nightingale/src/webapi/router/router_prometheus.go:61 (0xf8ef06) /home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:165 (0xf8c43e) /home/runner/work/nightingale/nightingale/src/webapi/router/router_mw.go:93 (0xf8c425) /home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:165 (0xf42f30) /home/runner/work/nightingale/nightingale/src/pkg/aop/logger.go:251 (0xf42f11) /home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:165 (0xf434fa) /home/runner/work/nightingale/nightingale/src/pkg/aop/recovery.go:98 (0xf434e6) /home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:165 (0xf71952) /home/runner/work/nightingale/nightingale/src/webapi/router/router.go:49 (0xf7193e) /home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:165 (0xf71348) /home/runner/work/nightingale/nightingale/src/webapi/router/router.go:23 (0xf71325) /home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/context.go:165 (0x99249d) /home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/gin.go:489 (0x992125) /home/runner/go/pkg/mod/github.com/gin-gonic/gin@v1.7.4/gin.go:445 (0x991c84) /opt/hostedtoolcache/go/1.18.10/x64/src/net/http/server.go:2916 (0x6f541a) /opt/hostedtoolcache/go/1.18.10/x64/src/net/http/server.go:1966 (0x6f0416) 
/opt/hostedtoolcache/go/1.18.10/x64/src/runtime/asm_amd64.s:1571 (0x46b500) 2023/02/17 09:36:13 write tcp 192.168.100.xx:18000->100.10.10.66:29064: write: broken pipe POST /api/n9e/query-range-batch HTTP/1.1 Host: 192.168.100.xx:18000 Accept: application/json Accept-Encoding: gzip, deflate Accept-Language: zh-CN,zh;q=0.9 Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3NfdXVpZCI6ImI3ZmRmZGQ3LTllZGItNDQ1Yi1hMjAyLWYyMmViOGNmY2UwNSIsImF1dGhvcml6ZWQiOnRydWUsImV4cCI6MTY3NjYyOTkyMiwidXNlcl9pZGVudGl0eSI6IjEtcm9vdCJ9.MLap3uCIKY_SkLtBZR4yghgUH7xBJdZY9L2VShgISSs Connection: keep-alive Content-Length: 171 Content-Type: application/json;charset=UTF-8 Cookie: username=xxxxxx; password=xxxxx Origin: http://192.168.100.xx:18000 Referer: http://192.168.100.xx:18000/dashboards/8 User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36 X-Cluster: Default X-Language: zh
flashcatcloud/categraf:v0.1.5 docker Version: 20.10.21 CentOS Linux release 7.9.2009 (Core)
Client: Context: default Debug Mode: false Plugins: app: Docker App (Docker Inc., v0.9.1-beta3) buildx: Docker Buildx (Docker Inc., v0.9.1-docker) scan: Docker Scan (Docker Inc., v0.21.0)
Server: Containers: 12 Running: 12 Paused: 0 Stopped: 0 Images: 22 Server Version: 20.10.21 Storage Driver: overlay2 Backing Filesystem: xfs Supports d_type: true Native Overlay Diff: true userxattr: false Logging Driver: json-file Cgroup Driver: cgroupfs Cgroup Version: 1 Plugins: Volume: local Network: bridge host ipvlan macvlan null overlay Log: awslogs fluentd gcplogs gelf journald json-file local logentries splunk syslog Swarm: inactive Runtimes: io.containerd.runc.v2 io.containerd.runtime.v1.linux runc Default Runtime: runc Init Binary: docker-init containerd version: 770bd0108c32f3fb5c73ae1264f7e503fe7b2661 runc version: v1.1.4-0-g5fd4c4d init version: de40ad0 Security Options: seccomp Profile: default Kernel Version: 5.4.195-1.el7.elrepo.x86_64 Operating System: CentOS Linux 7 (Core) OSType: linux Architecture: x86_64 CPUs: 16 Total Memory: 31.34GiB
1.kubectl apply -n monitoring -f k8s/daemonset.yaml 2.edit k8s/daemonset.yaml, replace NSERVER_SERVICE_WITH_PORT with service ip:port of nserver in your cluster, replace CATEGRAF_NAMESPACE with namespace value, then run: 3.Import https://github.com/flashcatcloud/categraf/blob/main/inputs/kubernetes/kubelet-metrics-dash.json ...
When I use the Kubelet metrics dashboard, the web page shows "Failed to fetch". It seems that the query takes too long to return, resulting in a timeout.
The query sum(up{source="kubelet", cluster=~"$cluster"}) returns no data for this metric.
No response
config.toml is not the configuration of categraf; it is the configuration of nightingale. Reading this link may help you.
Relevant config.toml
Logs from categraf
System info
flashcatcloud/categraf:v0.1.5 docker Version: 20.10.21 CentOS Linux release 7.9.2009 (Core)
Docker
Client: Context: default Debug Mode: false Plugins: app: Docker App (Docker Inc., v0.9.1-beta3) buildx: Docker Buildx (Docker Inc., v0.9.1-docker) scan: Docker Scan (Docker Inc., v0.21.0)
Server: Containers: 12 Running: 12 Paused: 0 Stopped: 0 Images: 22 Server Version: 20.10.21 Storage Driver: overlay2 Backing Filesystem: xfs Supports d_type: true Native Overlay Diff: true userxattr: false Logging Driver: json-file Cgroup Driver: cgroupfs Cgroup Version: 1 Plugins: Volume: local Network: bridge host ipvlan macvlan null overlay Log: awslogs fluentd gcplogs gelf journald json-file local logentries splunk syslog Swarm: inactive Runtimes: io.containerd.runc.v2 io.containerd.runtime.v1.linux runc Default Runtime: runc Init Binary: docker-init containerd version: 770bd0108c32f3fb5c73ae1264f7e503fe7b2661 runc version: v1.1.4-0-g5fd4c4d init version: de40ad0 Security Options: seccomp Profile: default Kernel Version: 5.4.195-1.el7.elrepo.x86_64 Operating System: CentOS Linux 7 (Core) OSType: linux Architecture: x86_64 CPUs: 16 Total Memory: 31.34GiB
Steps to reproduce
1.kubectl apply -n monitoring -f k8s/daemonset.yaml 2.edit k8s/daemonset.yaml, replace NSERVER_SERVICE_WITH_PORT with service ip:port of nserver in your cluster, replace CATEGRAF_NAMESPACE with namespace value, then run: 3.Import https://github.com/flashcatcloud/categraf/blob/main/inputs/kubernetes/kubelet-metrics-dash.json ...
Expected behavior
When I use the Kubelet metrics dashboard, the web page shows "Failed to fetch". It seems that the query takes too long to return, resulting in a timeout.
Actual behavior
The query sum(up{source="kubelet", cluster=~"$cluster"}) returns no data for this metric.
Additional info
No response