Multi-storage cluster does not report metrics, failing test_monitoring_reporting_ok_when_idle

DanielOsypenko commented 1 month ago

def test_monitoring_reporting_ok_when_idle(workload_idle, threading_lock):
"""
When nothing is happening, OCP Prometheus reports OCS status as OK.

If this test case fails, the status is either reported wrong or the
cluster is in a broken state. Either way, a failure here is not good.
"""
prometheus = PrometheusAPI(threading_lock=threading_lock)

health_result = prometheus.query_range(
    query="ceph_health_status",
    start=workload_idle["start"],
    end=workload_idle["stop"],
    step=15,
)
health_validation = check_query_range_result_enum(
    result=health_result, good_values=[0], bad_values=[1], exp_metric_num=1
)
health_msg = "ceph_health_status {} report 0 (health ok) as expected"
if health_validation:
    health_msg = health_msg.format("does")
    logger.info(health_msg)
else:
    health_msg = health_msg.format("should")
    logger.error(health_msg)

mon_result = prometheus.query_range(
    query="ceph_mon_quorum_status",
    start=workload_idle["start"],
    end=workload_idle["stop"],
    step=15,
)
mon_validation = check_query_range_result_enum(
    result=mon_result,
    good_values=[1],
    bad_values=[0],
    exp_metric_num=workload_idle["result"]["mon_num"],
)
mon_msg = "ceph_mon_quorum_status {} indicate no problems with quorum"
if mon_validation:
    mon_msg = mon_msg.format("does")
    logger.info(mon_msg)
else:
    mon_msg = mon_msg.format("should")
    logger.error(mon_msg)

osd_validations = []
for metric in ("ceph_osd_up", "ceph_osd_in"):
    osd_result = prometheus.query_range(
        query=metric,
        start=workload_idle["start"],
        end=workload_idle["stop"],
        step=15,
    )
    osd_validation = check_query_range_result_enum(
        result=osd_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=workload_idle["result"]["osd_num"],
    )
    osd_validations.append(osd_validation)
    osd_msg = "{} metric {} indicate no problems with OSDs"
    if osd_validation:
        osd_msg = osd_msg.format(metric, "does")
        logger.info(osd_msg)
    else:
        osd_msg = osd_msg.format(metric, "should")
        logger.error(osd_msg)

# after logging everything properly, make the test fail if necessary
# see ERRORs reported in the test log for details

assert health_validation, health_msg
E AssertionError: ceph_health_status should report 0 (health ok) as expected
E assert False

https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/678/22684/1087608/1087648/log?logParams=history%3D1087648%26page.page%3D1

DanielOsypenko commented 1 month ago

Collect more Ceph health data during the metrics time range: https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/678/22998/1105668/1105709/log

DanielOsypenko commented 22 hours ago

4.15 issue confirmed -> https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/632/22655/1085850/1085890/log

red-hat-storage / ocs-ci

Multi-storage cluster does not report metrics, failing test_monitoring_reporting_ok_when_idle #10058