red-hat-storage / ocs-ci


test_monitoring_after_rebooting_node_where_mgr_is_running failed to verify ceph health after node reboot #10712

Open nagendra202 opened 2 hours ago

nagendra202 commented 2 hours ago

self = <tests.functional.workloads.ocp.monitoring.test_monitoring_on_negative_scenarios.TestMonitoringBackedByOCS object at 0x7fdcb926a5b0> nodes = <ocs_ci.ocs.platform_nodes.VMWareUPINodes object at 0x7fdce666b5e0> pods = [<ocs_ci.ocs.resources.pod.Pod object at 0x7fdcd5b3e8e0>, <ocs_ci.ocs.resources.pod.Pod object at 0x7fdceb242130>, ..., <ocs_ci.ocs.resources.pod.Pod object at 0x7fdcb6277fa0>, <ocs_ci.ocs.resources.pod.Pod object at 0x7fdca2894e50>] threading_lock = <unlocked _thread.RLock object owner=0 count=0 at 0x7fdd463923c0>

```python
@pytest.mark.polarion_id("OCS-710")
def test_monitoring_after_rebooting_node_where_mgr_is_running(
    self, nodes, pods, threading_lock
):
    """
    Test case to validate rebooting a node where mgr is running
    should not delete the data collected on prometheus pod

    """
    # Get the mgr pod obj
    mgr_pod_obj = pod.get_mgr_pods()

    # Get the node where the mgr pod is hosted
    mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

    # Reboot the node where the mgr pod is hosted
    nodes.restart_nodes([mgr_node_obj])

    # Validate all nodes are in READY state
    retry((CommandFailed, ResourceWrongStatusException), tries=20, delay=15)(
        wait_for_nodes_status()
    )

    # Check for Ceph pods
    pod_obj = ocp.OCP(
        kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"]
    )
    assert pod_obj.wait_for_resource(
        condition="Running", selector="app=rook-ceph-mgr", timeout=600
    )
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-mon",
        resource_count=3,
        timeout=600,
    )
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-osd",
        resource_count=3,
        timeout=600,
    )

    # Check the node are Ready state and check cluster is health ok
    self.sanity_helpers.health_check(tries=40)

    # Check for ceph health check metrics is updated with new mgr pod
    wait_to_update_mgrpod_info_prometheus_pod(threading_lock)
```

tests/functional/workloads/ocp/monitoring/test_monitoring_on_negative_scenarios.py:503:
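
A side note on the readiness wait quoted above: `retry(...)` is applied to the return value of `wait_for_nodes_status()`, so the wait runs exactly once and the retry wrapper is never re-invoked. If the intent is to retry the wait itself, the callable would be passed to `retry()` and the resulting wrapper called, roughly as in this sketch (the import paths are assumptions inferred from the helper names used in the test):

```python
# Sketch only, not the repository's code: retry the node readiness wait itself.
# Import locations are assumptions based on the names used in the test above.
from ocs_ci.ocs.exceptions import CommandFailed, ResourceWrongStatusException
from ocs_ci.ocs.node import wait_for_nodes_status
from ocs_ci.utility.retry import retry

# Pass the callable to retry() and invoke the returned wrapper, so failed
# readiness checks are actually re-attempted (20 tries, 15 s apart).
retry((CommandFailed, ResourceWrongStatusException), tries=20, delay=15)(
    wait_for_nodes_status
)()
```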

ocs_ci/utility/retry.py:49: in f_retry
    return f(*args, **kwargs)

threading_lock = <unlocked _thread.RLock object owner=0 count=0 at 0x7fdd463923c0>

```
@retry(AssertionError, tries=30, delay=3, backoff=1)
def wait_to_update_mgrpod_info_prometheus_pod(threading_lock):
    """
    Validates the ceph health metrics is updated on prometheus pod

    Args:
        threading_lock (threading.RLock): A lock to ensure only one thread is making the 'oc' calls

    """

    log.info("Verifying ceph health status metrics is updated after rebooting the node")
    ocp_obj = ocp.OCP(
        kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"]
    )
    mgr_pod = (
        ocp_obj.get(selector=constants.MGR_APP_LABEL)
        .get("items")[0]
        .get("metadata")
        .get("name")
    )

    assert check_ceph_health_status_metrics_on_prometheus(
        mgr_pod=mgr_pod, threading_lock=threading_lock
    ), "Ceph health status metrics are not updated after the rebooting node where the mgr running"
E   AssertionError: Ceph health status metrics are not updated after the rebooting node where the mgr running
E   assert False
E    +  where False = check_ceph_health_status_metrics_on_prometheus(mgr_pod='rook-ceph-mgr-a-5d54947b9c-scwpp', threading_lock=<unlocked _thread.RLock object owner=0 count=0 at 0x7fdd463923c0>)
```

tests/functional/workloads/ocp/monitoring/test_monitoring_on_negative_scenarios.py:65: AssertionError
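
Per the `@retry(AssertionError, tries=30, delay=3, backoff=1)` decorator above, the metric check was re-attempted for roughly 90 seconds before this AssertionError surfaced. For manual triage, the same condition can be approximated by querying Prometheus directly for `ceph_health_status` and comparing the reported pod label with the mgr pod that came up after the reboot (`rook-ceph-mgr-a-5d54947b9c-scwpp` in this run). The snippet below is a hypothetical sketch, not ocs-ci code; the route, token, and exact label name depend on the cluster's monitoring configuration:

```python
# Hypothetical manual check (not ocs-ci code): query Prometheus for
# ceph_health_status and print the pod label of each series so it can be
# compared with the mgr pod that came up after the reboot.
import requests

PROM_ROUTE = "https://<prometheus-k8s-route>"  # assumption: cluster's Prometheus route
TOKEN = "<bearer-token>"                       # assumption: e.g. output of `oc whoami -t`

resp = requests.get(
    f"{PROM_ROUTE}/api/v1/query",
    params={"query": "ceph_health_status"},
    headers={"Authorization": f"Bearer {TOKEN}"},
    verify=False,  # lab clusters commonly use self-signed certificates
)
resp.raise_for_status()
for series in resp.json()["data"]["result"]:
    # Label names can differ with scrape config; "pod" is an assumption here.
    print(series["metric"].get("pod"), series["value"])
```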

nagendra202 commented 2 hours ago

RP: https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/738/26072/1273832/1273877/log?logParams=history%3D1251288%26page.page%3D1
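
For quick triage on a live cluster, the current mgr pod and its hosting node can also be listed with the same helpers the test imports; a minimal sketch, assuming `ocs_ci.ocs.resources.pod.get_mgr_pods` and `get_pod_node` behave as used in the test:

```python
# Triage sketch using the helpers referenced in the failing test. It only
# reports the current mgr pod(s) and the node each one runs on, so the names
# can be compared with what the Prometheus metric reports.
from ocs_ci.ocs.resources import pod

for mgr in pod.get_mgr_pods():
    node = pod.get_pod_node(mgr)
    print(f"mgr pod: {mgr.name} on node: {node.name}")
```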