Closed gopherunner closed 7 years ago
My input conf:
[[inputs.ceph]] interval = '1m' ceph_binary = "/usr/bin/ceph" socket_dir = "/var/run/ceph" mon_prefix = "ceph-mon" osd_prefix = "ceph-osd" socket_suffix = "asok" ceph_user = "client.admin" ceph_config = "/etc/ceph/cephCLUS.conf" gather_admin_socket_stats = true gather_cluster_stats = true
I think the cluster name might be selected through the socket files. Can you add the output of ls /var/run/ceph
?
ls /var/run/ceph cephCLUS-mon.cibm02.asok
And the output log file is: ... 2017-05-16T11:59:40Z E! error reading from socket '/var/run/ceph/cephCLUS-mon.cibm02.asok': error running ceph dump: exit status 22 2017-05-16T11:59:40Z E! ERROR in input [inputs.ceph]: error executing command: error running ceph status: exit status 1
Could it be a permission error? Does it work if you run sudo -u telegraf ceph --cluster cephNN status
?
yeah it works fine: root@cibm02:~# sudo -u telegraf ceph --cluster cephCLUS status cluster xxxxx-xxxxx-xxxxx-axxxxxx-xxxxxxxxx health HEALTH_WARN noout,sortbitwise flag(s) set monmap e1: 3 mons at {cibm01=xx.xx.xx.x:6789/0,cibm02=xx.xx.xx.x:6789/0,cibm03=xx.xx.xx.x:6789/0} election epoch 152, quorum 0,1,2 cibm01,cibm02,cibm03 osdmap e1362: 72 osds: 72 up, 72 in flags noout,sortbitwise pgmap v20669560: 8192 pgs, 2 pools, 15186 GB data, 3807 kobjects 30344 GB used, 22924 GB / 53269 GB avail 8192 active+clean client io 158 MB/s rd, 146 MB/s wr, 3864 op/s rd, 5928 op/s wr
any other idea?
Can you try:
sudo -u telegraf /usr/bin/ceph --admin-daemon /var/run/ceph/cephCLUS-mon.cibm02.asok perfcounters_dump
@catovermoon Any luck with this?
Please reopen if you need more help.
I have the same issue: if the cluster name is not 'ceph', very few metrics are collected.
For debug attempt:
sudo -u telegraf /usr/bin/ceph --admin-daemon /var/run/ceph/foobar-mon.mon21.asok perfcounters_dump
{
"AsyncMessenger::Worker-0": {
"msgr_recv_messages": 403993,
"msgr_send_messages": 301044,
"msgr_recv_bytes": 477136016,
"msgr_send_bytes": 392683308,
"msgr_created_connections": 6579,
"msgr_active_connections": 6519,
"msgr_running_total_time": 81.806679135,
"msgr_running_send_time": 29.319598349,
"msgr_running_recv_time": 96.771484181,
"msgr_running_fast_dispatch_time": 0.000000000
},
"AsyncMessenger::Worker-1": {
"msgr_recv_messages": 26888,
"msgr_send_messages": 30495,
"msgr_recv_bytes": 3874897,
"msgr_send_bytes": 196702692,
"msgr_created_connections": 6363,
"msgr_active_connections": 6306,
"msgr_running_total_time": 5.170767434,
"msgr_running_send_time": 1.473356964,
"msgr_running_recv_time": 3.951634696,
"msgr_running_fast_dispatch_time": 0.000000000
},
"AsyncMessenger::Worker-2": {
"msgr_recv_messages": 103700,
"msgr_send_messages": 49847,
"msgr_recv_bytes": 57939104,
"msgr_send_bytes": 232874934,
"msgr_created_connections": 7075,
"msgr_active_connections": 7013,
"msgr_running_total_time": 18.917342645,
"msgr_running_send_time": 4.361112854,
"msgr_running_recv_time": 22.489082102,
"msgr_running_fast_dispatch_time": 0.000000000
},
"cluster": {
"num_mon": 3,
"num_mon_quorum": 3,
"num_osd": 5,
"num_osd_up": 5,
"num_osd_in": 5,
"osd_epoch": 27,
"osd_bytes": 9970275102720,
"osd_bytes_used": 3047665664,
"osd_bytes_avail": 9967227437056,
"num_pool": 0,
"num_pg": 0,
"num_pg_active_clean": 0,
"num_pg_active": 0,
"num_pg_peering": 0,
"num_object": 0,
"num_object_degraded": 0,
"num_object_misplaced": 0,
"num_object_unfound": 0,
"num_bytes": 0,
"num_mds_up": 0,
"num_mds_in": 0,
"num_mds_failed": 0,
"mds_epoch": 1
},
"finisher-mon_finisher": {
"queue_len": 0,
"complete_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
},
"finisher-monstore": {
"queue_len": 0,
"complete_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
},
"mon": {
"num_sessions": 9,
"session_add": 18370,
"session_rm": 18361,
"session_trim": 1,
"num_elections": 6,
"election_call": 3,
"election_win": 0,
"election_lose": 2
},
"paxos": {
"start_leader": 0,
"start_peon": 2,
"restart": 10,
"refresh": 48733,
"refresh_latency": {
"avgcount": 48733,
"sum": 22.534339836,
"avgtime": 0.000462404
},
"begin": 48733,
"begin_keys": {
"avgcount": 0,
"sum": 0
},
"begin_bytes": {
"avgcount": 48733,
"sum": 183199201
},
"begin_latency": {
"avgcount": 48733,
"sum": 114.443853855,
"avgtime": 0.002348385
},
"commit": 48733,
"commit_keys": {
"avgcount": 0,
"sum": 0
},
"commit_bytes": {
"avgcount": 0,
"sum": 0
},
"commit_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"collect": 2,
"collect_keys": {
"avgcount": 2,
"sum": 2
},
"collect_bytes": {
"avgcount": 2,
"sum": 48
},
"collect_latency": {
"avgcount": 2,
"sum": 0.001191435,
"avgtime": 0.000595717
},
"collect_uncommitted": 0,
"collect_timeout": 0,
"accept_timeout": 0,
"lease_ack_timeout": 0,
"lease_timeout": 0,
"store_state": 48733,
"store_state_keys": {
"avgcount": 48733,
"sum": 362473
},
"store_state_bytes": {
"avgcount": 48733,
"sum": 355466100
},
"store_state_latency": {
"avgcount": 48733,
"sum": 131.170374020,
"avgtime": 0.002691612
},
"share_state": 0,
"share_state_keys": {
"avgcount": 0,
"sum": 0
},
"share_state_bytes": {
"avgcount": 0,
"sum": 0
},
"new_pn": 0,
"new_pn_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
},
"rocksdb": {
"get": 1790796,
"submit_transaction": 0,
"submit_transaction_sync": 97505,
"get_latency": {
"avgcount": 1790796,
"sum": 14.452057257,
"avgtime": 0.000008070
},
"submit_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"submit_sync_latency": {
"avgcount": 97505,
"sum": 244.005146725,
"avgtime": 0.002502488
},
"compact": 0,
"compact_range": 572,
"compact_queue_merge": 0,
"compact_queue_len": 0,
"rocksdb_write_wal_time": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"rocksdb_write_memtable_time": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"rocksdb_write_delay_time": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"rocksdb_write_pre_and_post_time": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
},
"throttle-mon_client_bytes": {
"val": 0,
"max": 104857600,
"get_started": 0,
"get": 231439,
"get_sum": 112841137,
"get_or_fail_fail": 0,
"get_or_fail_success": 231439,
"take": 0,
"take_sum": 0,
"put": 231439,
"put_sum": 112841137,
"wait": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
},
"throttle-mon_daemon_bytes": {
"val": 0,
"max": 419430400,
"get_started": 0,
"get": 6,
"get_sum": 626,
"get_or_fail_fail": 0,
"get_or_fail_success": 6,
"take": 0,
"take_sum": 0,
"put": 6,
"put_sum": 626,
"wait": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
},
"throttle-msgr_dispatch_throttler-mon": {
"val": 0,
"max": 104857600,
"get_started": 0,
"get": 534579,
"get_sum": 499391007,
"get_or_fail_fail": 0,
"get_or_fail_success": 534579,
"take": 0,
"take_sum": 0,
"put": 534579,
"put_sum": 499391007,
"wait": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
},
"throttle-msgr_dispatch_throttler-mon-mgrc": {
"val": 0,
"max": 104857600,
"get_started": 0,
"get": 2,
"get_sum": 16,
"get_or_fail_fail": 0,
"get_or_fail_success": 2,
"take": 0,
"take_sum": 0,
"put": 2,
"put_sum": 16,
"wait": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
}
}
The exact same configuration with cluster name 'ceph' works fine, but it does not work with 'foobar'.
(I can't reopen the issue).
@amarao Can you open a new issue?
Yes, but of course.
My apologies. I started to reproduce a clean installation for a new issue and realized I hadn't changed the telegraf configuration. There is no bug.
For those who come here with the same problem:
Options mon_prefix
and osd_prefix
should reflect the cluster name. For a cluster named foobar
configuration file should look like this:
[[inputs.ceph]]
ceph_binary = "/usr/bin/ceph"
ceph_config = "/etc/ceph/foobar.conf"
...
mon_prefix = "foobar-mon"
osd_prefix = "foobar-osd"
(instead of ceph.conf, ceph-mon and ceph-osd respectively).
Bug report
Is there a way to specify in the telegraf.conf file that a ceph cluster with a custom name should be used? For example, to run the dump command I need to run it like this: $ ceph --cluster cephNN status, so when I start the telegraf service, it says: ERROR in input [inputs.ceph]: error executing command: error running ceph status: exit status 1
Relevant telegraf.conf:
System info:
[Include Telegraf version, operating system name, and other relevant details]
Steps to reproduce:
Expected behavior:
Actual behavior:
Additional info:
[Include gist of relevant config, logs, etc.]