Failure text:
________________________ test_monitoring_shows_osd_down ________________________
measure_stop_ceph_osd = {'first_run': True, 'metadata': None, 'prometheus_alerts': [{'activeAt': '2023-03-14T18:05:30Z', 'annotations': {'desc...mespace': 'openshift-monitoring', 'severity': 'none'}, 'state': 'firing', ...}, ...], 'result': 'rook-ceph-osd-2', ...}
@tier3
@pytest.mark.polarion_id("OCS-1307")
@skipif_managed_service
def test_monitoring_shows_osd_down(measure_stop_ceph_osd):
    """
    Make sure simple problems with OSD daemons are reported via OCP Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60
    affected_osd = measure_stop_ceph_osd["result"]
    # translate this into ceph daemon name
    ceph_daemon = "osd.{}".format(int(affected_osd[len("rook-ceph-osd-") :]))
    logger.info(f"affected osd was {affected_osd}, aka {ceph_daemon} ceph daemon")
    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query="ceph_health_status",
        start=measure_stop_ceph_osd["start"],
        end=measure_stop_ceph_osd["stop"],
        step=15,
    )
    health_validation = check_query_range_result_enum(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_delay=expected_delay,
    )
    health_msg = "health status should be affected by missing osd"
    logger.info("let's check that osd up value was affected")
    osd_up_result = prometheus.query_range(
        query='ceph_osd_up{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_osd["start"],
        end=measure_stop_ceph_osd["stop"],
        step=15,
    )
    osd_up_validation = check_query_range_result_enum(
        result=osd_up_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_delay=expected_delay,
    )
    osd_up_msg = "ceph_osd_up value should be affected by missing osd"
    logger.info("let's check that osd in value was not affected")
    # osd in value is not affected because we just stopped the osd, we
    # haven't removed it from the cluster
    osd_in_result = prometheus.query_range(
        query='ceph_osd_in{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_osd["start"],
        end=measure_stop_ceph_osd["stop"],
        step=15,
    )
    osd_in_validation = check_query_range_result_enum(
        result=osd_in_result, good_values=[1], bad_values=[0], exp_metric_num=1
    )
    osd_in_msg = "ceph_osd_in value should not be affected by missing osd"
    # checking validation results when all queries are performed makes sure
    # that there is evidence for all queries in the test case logs in case of
    # an assert failure
    assert health_validation, health_msg
    assert osd_up_validation, osd_up_msg
>   assert osd_in_validation, osd_in_msg
E   AssertionError: ceph_osd_in value should not be affected by missing osd
E   assert False

tests/manage/monitoring/prometheusmetrics/test_monitoring_negative.py:157: AssertionError
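When debugging this kind of failure, it can help to re-run the failing ceph_osd_in range query over the same measurement window and dump the raw samples, so the exact timestamps at which the value dropped to 0 are visible. Below is a minimal sketch, not part of the test: it reuses the PrometheusAPI.query_range() call shown in the traceback above, while the import path and the assumption that query_range() returns the usual Prometheus list of metrics (each with [timestamp, value] pairs under "values") are my own assumptions about the ocs-ci layout.

```python
# Hypothetical debugging helper; assumes the ocs-ci test environment.
# Assumptions: the import path for PrometheusAPI, and that query_range()
# returns the standard Prometheus range-query "result" list.
from ocs_ci.utility.prometheus import PrometheusAPI


def dump_osd_in_samples(ceph_daemon, start, stop, step=15):
    """Print raw ceph_osd_in samples for one OSD over a time window."""
    prometheus = PrometheusAPI()
    result = prometheus.query_range(
        query='ceph_osd_in{ceph_daemon="%s"}' % ceph_daemon,
        start=start,
        end=stop,
        step=step,
    )
    for metric in result:
        for timestamp, value in metric["values"]:
            print(timestamp, value)
```

Calling this with the window recorded by the measure_stop_ceph_osd fixture (its "start" and "stop" keys) should show whether the 0 samples fall inside or outside the expected_delay grace period.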
Cluster version: 4.13.0-0.nightly-ppc64le-2023-03-13-152806
ODF version: 4.13.0-98
[root@rdr-kms-sao01-bastion-0 ~]# oc get csv odf-operator.v4.13.0-98.stable -n openshift-storage -o yaml | grep full_version
  full_version: 4.13.0-98
This issue has been automatically marked as stale because it has not had recent activity. It will be closed in 30 days if no further activity occurs.
This issue has been automatically closed due to inactivity. Please re-open if this still requires investigation.
The tests/manage/monitoring/prometheusmetrics/test_monitoring_negative.py::test_monitoring_shows_osd_down test case is failing with the following error:
ERROR - ceph_osd_in has bad value 0 at 2023-03-16 11:34:21.432000
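If the test environment is no longer running, the same metric can also be inspected directly against the cluster's Prometheus over its HTTP query_range API around the reported timestamp. A rough sketch follows; the route URL and bearer token are placeholders, and osd.2 corresponds to the affected rook-ceph-osd-2 from this run:

```python
# Stand-alone check against the openshift-monitoring Prometheus route.
# PROM_URL and TOKEN are placeholders (route host and a token with
# monitoring access); adjust the time window as needed.
import requests

PROM_URL = "https://<prometheus-route>"
TOKEN = "<bearer-token>"

resp = requests.get(
    f"{PROM_URL}/api/v1/query_range",
    headers={"Authorization": f"Bearer {TOKEN}"},
    params={
        "query": 'ceph_osd_in{ceph_daemon="osd.2"}',
        # window around the reported bad sample (2023-03-16 11:34:21)
        "start": "2023-03-16T11:30:00Z",
        "end": "2023-03-16T11:40:00Z",
        "step": "15",
    },
    verify=False,  # many test clusters use a self-signed router cert
)
resp.raise_for_status()
for metric in resp.json()["data"]["result"]:
    for timestamp, value in metric["values"]:
        print(timestamp, value)
```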