flashcatcloud / categraf

one-stop telemetry collector for nightingale
https://flashcat.cloud/docs/
MIT License
818 stars 251 forks source link

http_server_requests_seconds指标丢失 #763

Closed xzhaoxz closed 7 months ago

xzhaoxz commented 8 months ago

Relevant config.toml

[global]
# whether print configs
print_configs = false

# add label(agent_hostname) to series
# "" -> auto detect hostname
# "xx" -> use specified string xx
# "$hostname" -> auto detect hostname
# "$ip" -> auto detect ip
# "$hostname-$ip" -> auto detect hostname and ip to replace the vars
hostname = "$ip"

# will not add label(agent_hostname) if true
omit_hostname = false

# global collect interval, unit: second
interval = 15

# input provider settings; optional: local / http
providers = ["local"]

# The concurrency setting controls the number of concurrent tasks spawned for each input. 
# By default, it is set to runtime.NumCPU() * 10. This setting is particularly useful when dealing
# with configurations that involve extensive instances of input like ping, net_response, or http_response.
# As multiple goroutines run simultaneously, the "ResponseTime" metric might appear larger than expected. 
# However, utilizing the concurrency setting can help mitigate this issue and optimize the response time.
concurrency = -1

[global.labels]
#region = "shanghai"
env = "dev"
host= "$ip"

[log]
# file_name is the file to write logs to
file_name = "stdout"

# options below will not work when file_name is stdout or stderr
# max_size is the maximum size in megabytes of the log file before it gets rotated. It defaults to 100 megabytes.
max_size = 100
# max_age is the maximum number of days to retain old log files based on the timestamp encoded in their filename.  
max_age = 1
# max_backups is the maximum number of old log files to retain.  
max_backups = 1
# local_time determines if the time used for formatting the timestamps in backup files is the computer's local time.  
local_time = true
# Compress determines if the rotated log files should be compressed using gzip. 
compress = false

[writer_opt]
batch = 1000
chan_size = 1000000

[[writers]]
url = "http://192.168.240.180:9090/api/v1/write"

## Optional TLS Config
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true

# Basic auth username
basic_auth_user = ""

# Basic auth password
basic_auth_pass = ""

## Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]

# timeout settings, unit: ms
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100

[http]
enable = false
address = ":9100"
print_access = false
run_mode = "release"

[ibex]
enable = false
## ibex flush interval
interval = "1000ms"
## n9e ibex server rpc address
#servers = ["127.0.0.1:20090"]
## temp script dir
meta_dir = "./meta"

[heartbeat]
enable = false

# report os version cpu.util mem.util metadata
#url = "http://127.0.0.1:17000/v1/n9e/heartbeat"

# interval, unit: s
interval = 10

# Basic auth username
basic_auth_user = ""

# Basic auth password
basic_auth_pass = ""

## Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]

# timeout settings, unit: ms
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100

[prometheus]
enable = false
scrape_config_file = "D:/opt/in_cluster_scrape.yaml"
## log level, debug warn info error
log_level = "info"
## wal file storage path ,default ./data-agent
# wal_storage_path = "/path/to/storage"
## wal reserve time duration, default value is 2 hour
# wal_min_duration = 2

prometheus.toml
# # collect interval
 interval = 15

[[instances]]
urls = [
     "http://192.168.240.180:8001/metrics"
]

url_label_key = "instance"
url_label_value = "{{.Host}}"

## metrics duplication allowed, default false
#  duplication_allowed=true

## Scrape Services available in Consul Catalog
# [instances.consul]
#   enabled = false
#   agent = "http://localhost:8500"
#   query_interval = "5m"

#   [[instances.consul.query]]
#     name = "a service name"
#     tag = "a service tag"
#     url = 'http://{{if ne .ServiceAddress ""}}{{.ServiceAddress}}{{else}}{{.Address}}{{end}}:{{.ServicePort}}/{{with .ServiceMeta.metrics_path}}{{.}}{{else}}metrics{{end}}'
#     [instances.consul.query.tags]
#       host = "{{.Node}}"

# bearer_token_string = ""

# e.g. /run/secrets/kubernetes.io/serviceaccount/token
# bearer_token_file = ""

# # basic auth
# username = ""
# password = ""

# headers = ["X-From", "categraf"]

# # interval = global.interval * interval_times
# interval_times = 1

# labels = {}

# support glob
ignore_metrics = [ "go_*" ]

# support glob
# ignore_label_keys = []

# timeout for every url
# timeout = "3s"

## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true

Logs from categraf

System info

0.3.44

Docker

No response

Steps to reproduce

监控接口请求情况得到指标如下 http_server_requests_seconds{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",quantile="0.5",} 0.0 http_server_requests_seconds{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",quantile="0.75",} 0.0 http_server_requests_seconds{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",quantile="0.9",} 0.0 http_server_requests_seconds{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",quantile="0.95",} 0.0 http_server_requests_seconds{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",quantile="0.99",} 0.0 http_server_requests_seconds_bucket{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",le="0.3",} 28.0 http_server_requests_seconds_bucket{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",le="0.5",} 29.0 http_server_requests_seconds_bucket{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",le="1.0",} 29.0 http_server_requests_seconds_bucket{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",le="2.0",} 30.0 http_server_requests_seconds_bucket{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",le="3.0",} 30.0 
http_server_requests_seconds_bucket{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",le="5.0",} 30.0 http_server_requests_seconds_bucket{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",le="10.0",} 30.0 http_server_requests_seconds_bucket{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",le="15.0",} 30.0 http_server_requests_seconds_bucket{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",le="+Inf",} 30.0 http_server_requests_seconds_count{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",} 30.0 http_server_requests_seconds_sum{applications="gateway-service",exception="None",method="GET",outcome="SUCCESS",service="pay",status="200",uri="/actuator/prometheus",} 4.5247628

使用categraf拉取指标上报到监控系统如prometheus,没有了http_server_requests_seconds指标

Expected behavior

上报的数据和原始数据一致

Actual behavior

上报数据丢失，缺失指标，且丢失指标的 tag=quantile 出现在别的指标中

Additional info

No response

thinwonton commented 8 months ago

需要在springboot中配置,即可有tomcat的全局指标:

收集tomcat的指标

server.tomcat.mbeanregistry.enabled=true

kongfei605 commented 8 months ago

1 检查下 ignore_metrics = [ "go_*" ]，所谓丢失的指标是否匹配 go_* 2 检查 relabel 的配置 3 完整的指标，和 categraf 采集的指标都发一下。

这个插件第一次听到丢指标的反馈。

kongfei605 commented 7 months ago

还有后续吗?先close了,如果有后续可以随时reopen