red-hat-storage / ocs-ci


test_nodereplacement_proactive_with_io_running is failing with cluster health going to bad state #10355

Open pintojoy opened 2 months ago

pintojoy commented 2 months ago

self = <tests.functional.z_cluster.nodes.test_node_replacement_proactive.TestNodeReplacementWithIO object at 0x7f50dcace7f0>
pvc_factory = <function pvc_factory_fixture.<locals>.factory at 0x7f50b2b10160>
pod_factory = <function pod_factory_fixture.<locals>.factory at 0x7f50b2b108b0>
dc_pod_factory = <function dc_pod_factory.<locals>.factory at 0x7f50bc404040>
bucket_factory = <function bucket_factory_fixture.<locals>._create_buckets at 0x7f50b1190550>
rgw_bucket_factory = <function bucket_factory_fixture.<locals>._create_buckets at 0x7f50b1190dc0>

def test_nodereplacement_proactive_with_io_running(
    self, pvc_factory, pod_factory, dc_pod_factory, bucket_factory, rgw_bucket_factory,
):
    """
    Knip-894 Node Replacement proactive when IO running in the background
    """

# Get worker nodes
worker_node_list = node.get_worker_nodes()
log.info(f"Current available worker nodes are {worker_node_list}")

osd_node_name = select_osd_node_name()

log.info("Creating dc pod backed with rbd pvc and running io in bg")
for worker_node in worker_node_list:
    if worker_node != osd_node_name:
        rbd_dc_pod = dc_pod_factory(
            interface=constants.CEPHBLOCKPOOL, node_name=worker_node, size=20
        )
        pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True)

log.info("Creating dc pod backed with cephfs pvc and running io in bg")
for worker_node in worker_node_list:
    if worker_node != osd_node_name:
        cephfs_dc_pod = dc_pod_factory(
            interface=constants.CEPHFILESYSTEM, node_name=worker_node, size=20
        )
        pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True)

delete_and_create_osd_node(osd_node_name)

# Creating Resources
log.info("Creating Resources using sanity helpers")
self.sanity_helpers.create_resources(
    pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
)
# Deleting Resources
self.sanity_helpers.delete_resources()

# Verify everything running fine
log.info("Verifying All resources are Running and matches expected result")

self.sanity_helpers.health_check(tries=120)

tests/functional/z_cluster/nodes/test_node_replacement_proactive.py:247:

ocs_ci/helpers/sanity_helpers.py:51: in health_check
    ceph_health_check(
ocs_ci/utility/utils.py:2396: in ceph_health_check
    return retry(
ocs_ci/utility/retry.py:49: in f_retry
    return f(*args, **kwargs)

namespace = 'openshift-storage'

def ceph_health_check_base(namespace=None):
    """
    Exec ceph health cmd on tools pod to determine health of cluster.

Args:
    namespace (str): Namespace of OCS
        (default: config.ENV_DATA['cluster_namespace'])

Raises:
    CephHealthException: If the ceph health returned is not HEALTH_OK
    CommandFailed: If the command to retrieve the tools pod name or the
        command to get ceph health returns a non-zero exit code
Returns:
    boolean: True if HEALTH_OK

"""
namespace = namespace or config.ENV_DATA["cluster_namespace"]
health = run_ceph_health_cmd(namespace)

if health.strip() == "HEALTH_OK":
    log.info("Ceph cluster health is HEALTH_OK.")
    return True
else:
    raise CephHealthException(f"Ceph cluster health is not OK. Health: {health}")

E ocs_ci.ocs.exceptions.CephHealthException: Ceph cluster health is not OK. Health: HEALTH_WARN Degraded data redundancy: 1201875/5833671 objects degraded (20.602%), 20 pgs degraded, 20 pgs undersized
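For context, the failing call chain above ends in a retry-wrapped Ceph health poll: `self.sanity_helpers.health_check(tries=120)` keeps re-running the `ceph_health_check` helper (via the `retry` decorator in `ocs_ci/utility/retry.py`) until `ceph health` reports `HEALTH_OK` or the attempts are exhausted. Below is a minimal standalone sketch of that polling pattern, not the actual ocs-ci helpers; the `oc rsh deploy/rook-ceph-tools` invocation, the 15-second delay, and the `run_ceph_health` / `wait_for_health_ok` names are illustrative assumptions and may differ from the real `run_ceph_health_cmd` implementation and from the delay `ceph_health_check` actually uses.

```python
import subprocess
import time


def run_ceph_health(namespace="openshift-storage"):
    """Hypothetical stand-in for run_ceph_health_cmd(): run `ceph health`
    inside the toolbox (deployment name assumed to be rook-ceph-tools)."""
    cmd = [
        "oc", "-n", namespace, "rsh", "deploy/rook-ceph-tools",
        "ceph", "health",
    ]
    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    return result.stdout.strip()


def wait_for_health_ok(tries=120, delay=15, namespace="openshift-storage"):
    """Poll until Ceph reports HEALTH_OK or the retries are exhausted,
    mirroring the retry()-wrapped ceph_health_check() shown in the traceback."""
    health = None
    for attempt in range(1, tries + 1):
        health = run_ceph_health(namespace)
        if health == "HEALTH_OK":
            print(f"Ceph is HEALTH_OK after {attempt} attempt(s)")
            return True
        print(f"[{attempt}/{tries}] Ceph health is still: {health}")
        time.sleep(delay)
    raise RuntimeError(f"Ceph did not reach HEALTH_OK; last status: {health}")
```

In the failing run the status never leaves `HEALTH_WARN` (20 PGs degraded/undersized), which suggests that data recovery after the OSD node replacement does not finish within the retry window on this cluster.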

yitzhak12 commented 1 month ago

I see this issue here as well: https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/738/24928/1208693/1208719/log. The problem seems to occur only on vSphere; on AWS the test passed, as you can see here: https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/738/24837/1203821/1203848/log.