Open nik-johnson-net opened 2 years ago
Is there any chance of getting this fixed? datadog-agent-7.54.1-1.x86_64 with Ceph Pacific doesn't produce any mon-related metrics, even though the monitors are the most crucial component to monitor in the cluster.
Some basic numbers (`ceph.num_mons`) could be extracted from `ceph status`; the more detailed metrics are available from `ceph tell mon.2 mon_status -fjson`. The command proposed by @nik-johnson-net doesn't work for me.
Error:
2024-07-04 11:13:21 UTC | CORE | WARN | (pkg/collector/python/datadog_agent.go:131 in LogMessage) | ceph:b57df6890fa03820 | (ceph.py:68) | Unable to parse data from cmd=mon_status: Expecting value: line 2 column 1 (char 1)
Sample output:
[root@ceph-mon-2 ~]# ceph status -fjson | jq .
{
"fsid": "3b9bd3ca-ff4d-11eb-ada4-566fa99c0024",
"health": {
"status": "HEALTH_OK",
"checks": {},
"mutes": []
},
"election_epoch": 106590,
"quorum": [
0,
1,
2
],
"quorum_names": [
"ceph-mon-1",
"ceph-mon-0",
"ceph-mon-2"
],
"quorum_age": 2652,
"monmap": {
"epoch": 57,
"min_mon_release_name": "pacific",
"num_mons": 3
},
...
[root@ceph-mon-2 ~]# ceph tell mon.2 mon_status -fjson | jq
{
"name": "ceph-mon-2",
"rank": 2,
"state": "peon",
"election_epoch": 106590,
"quorum": [
0,
1,
2
],
"quorum_age": 2705,
"features": {
"required_con": "2449958747317026820",
"required_mon": [
"kraken",
"luminous",
"mimic",
"osdmap-prune",
"nautilus",
"octopus",
"pacific",
"elector-pinging"
],
"quorum_con": "4540138297136906239",
"quorum_mon": [
"kraken",
"luminous",
"mimic",
"osdmap-prune",
"nautilus",
"octopus",
"pacific",
"elector-pinging"
]
},
"outside_quorum": [],
"extra_probe_peers": [],
"sync_provider": [],
"monmap": {
"epoch": 57,
"fsid": "3b9bd3ca-ff4d-11eb-ada4-566fa99c0024",
"modified": "2024-07-04T10:44:56.699904Z",
"created": "2021-08-17T11:21:40.985975Z",
"min_mon_release": 16,
"min_mon_release_name": "pacific",
"election_strategy": 1,
"disallowed_leaders: ": "",
"stretch_mode": false,
"features": {
"persistent": [
"kraken",
"luminous",
"mimic",
"osdmap-prune",
"nautilus",
"octopus",
"pacific",
"elector-pinging"
],
"optional": []
},
"mons": [
{
"rank": 0,
"name": "ceph-mon-1",
"public_addrs": {
"addrvec": [
{
"type": "v2",
"addr": "10.20.1.51:3300",
"nonce": 0
},
{
"type": "v1",
"addr": "10.20.1.51:6789",
"nonce": 0
}
]
},
"addr": "10.20.1.51:6789/0",
"public_addr": "10.20.1.51:6789/0",
"priority": 0,
"weight": 0,
"crush_location": "{}"
},
{
"rank": 1,
"name": "ceph-mon-0",
"public_addrs": {
"addrvec": [
{
"type": "v2",
"addr": "10.20.1.50:3300",
"nonce": 0
},
{
"type": "v1",
"addr": "10.20.1.50:6789",
"nonce": 0
}
]
},
"addr": "10.20.1.50:6789/0",
"public_addr": "10.20.1.50:6789/0",
"priority": 0,
"weight": 0,
"crush_location": "{}"
},
{
"rank": 2,
"name": "ceph-mon-2",
"public_addrs": {
"addrvec": [
{
"type": "v2",
"addr": "10.20.1.52:3300",
"nonce": 0
},
{
"type": "v1",
"addr": "10.20.1.52:6789",
"nonce": 0
}
]
},
"addr": "10.20.1.52:6789/0",
"public_addr": "10.20.1.52:6789/0",
"priority": 0,
"weight": 0,
"crush_location": "{}"
}
]
},
"feature_map": {
"mon": [
{
"features": "0x3f01cfb9fffdffff",
"release": "luminous",
"num": 1
}
],
"client": [
{
"features": "0x3f01cfb9fffdffff",
"release": "luminous",
"num": 3
}
]
},
"stretch_mode": false
}
https://github.com/DataDog/integrations-core/blob/2206d5030ee3949d8be07bad477894f22eb3d52d/ceph/datadog_checks/ceph/ceph.py#L62
"mon_status" has been moved to a daemon-specific command, so the command must instead be issued like this:
/usr/bin/ceph --cluster ceph daemon mon.node-a mon_status -fjson
A patch would need to locate the specific daemon to query.