red-hat-storage / ocs-ci

https://ocs-ci.readthedocs.io/en/latest/

test_nodereplacement_proactive_with_io_running fails on vSphere compact mode cluster #10055

pintojoy opened this issue 4 months ago (status: Open)

self = <tests.functional.z_cluster.nodes.test_node_replacement_proactive.TestNodeReplacementWithIO object at 0x7f0664779d60>
pvc_factory = <function pvc_factory_fixture.<locals>.factory at 0x7f0644f4d8b0>
pod_factory = <function pod_factory_fixture.<locals>.factory at 0x7f0644f4d9d0>
dc_pod_factory = <function dc_pod_factory.<locals>.factory at 0x7f0644f4dc10>
bucket_factory = <function bucket_factory_fixture.<locals>._create_buckets at 0x7f0644a0af70>
rgw_bucket_factory = <function bucket_factory_fixture.<locals>._create_buckets at 0x7f063f670a60>

def test_nodereplacement_proactive_with_io_running(
    self,
    pvc_factory,
    pod_factory,
    dc_pod_factory,
    bucket_factory,
    rgw_bucket_factory,
):
    """
    Knip-894 Node Replacement proactive when IO running in the background

    """
    # Get worker nodes
    worker_node_list = node.get_worker_nodes()
    log.info(f"Current available worker nodes are {worker_node_list}")

    osd_node_name = select_osd_node_name()

    log.info("Creating dc pod backed with rbd pvc and running io in bg")
    for worker_node in worker_node_list:
        if worker_node != osd_node_name:
            rbd_dc_pod = dc_pod_factory(
                interface=constants.CEPHBLOCKPOOL, node_name=worker_node, size=20
            )
            pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True)

    log.info("Creating dc pod backed with cephfs pvc and running io in bg")
    for worker_node in worker_node_list:
        if worker_node != osd_node_name:
            cephfs_dc_pod = dc_pod_factory(
                interface=constants.CEPHFILESYSTEM, node_name=worker_node, size=20
            )
            pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True)

    delete_and_create_osd_node(osd_node_name)

tests/functional/z_cluster/nodes/test_node_replacement_proactive.py:235:

tests/functional/z_cluster/nodes/test_node_replacement_proactive.py:154: in delete_and_create_osd_node
    new_node_name = node.delete_and_create_osd_node_ipi(osd_node_name)
ocs_ci/ocs/node.py:929: in delete_and_create_osd_node_ipi
    new_machine_name = machine.delete_machine_and_check_state_of_new_spinned_machine(
ocs_ci/ocs/machine.py:170: in delete_machine_and_check_state_of_new_spinned_machine
    delete_machine(machine_name)
ocs_ci/ocs/machine.py:110: in delete_machine
    machine_obj.delete(resource_name=machine_name)
ocs_ci/ocs/ocp.py:461: in delete
    return self.exec_oc_cmd(command, timeout=timeout)
ocs_ci/ocs/ocp.py:189: in exec_oc_cmd
    out = run_cmd(
ocs_ci/utility/utils.py:487: in run_cmd
    completed_process = exec_cmd(
ocs_ci/utility/utils.py:674: in exec_cmd
    completed_process = subprocess.run(
/usr/lib64/python3.9/subprocess.py:507: in run
    stdout, stderr = process.communicate(input, timeout=timeout)
/usr/lib64/python3.9/subprocess.py:1134: in communicate
    stdout, stderr = self._communicate(input, endtime, timeout)
/usr/lib64/python3.9/subprocess.py:1996: in _communicate
    self._check_timeout(endtime, orig_timeout, stdout, stderr)

self = <Popen: returncode: -9 args: ['oc', '--kubeconfig', '/home/jenkins/current-c...>
endtime = 75675.009184865, orig_timeout = 600
stdout_seq = [b'machine.machine.openshift.io "j-001vif1cs36-uba-xx6vn-worker-0-sdvm7" deleted\n']
stderr_seq = [], skip_check_and_raise = False

def _check_timeout(self, endtime, orig_timeout, stdout_seq, stderr_seq,
                   skip_check_and_raise=False):
    """Convenience for checking if a timeout has expired."""
    if endtime is None:
        return
    if skip_check_and_raise or _time() > endtime:
        raise TimeoutExpired(
                self.args, orig_timeout,
                output=b''.join(stdout_seq) if stdout_seq else None,
                stderr=b''.join(stderr_seq) if stderr_seq else None)

E subprocess.TimeoutExpired: Command '['oc', '--kubeconfig', '/home/jenkins/current-cluster-dir/openshift-cluster-dir/auth/kubeconfig', '-n', 'openshift-machine-api', 'delete', 'machine', 'j-001vif1cs36-uba-xx6vn-worker-0-sdvm7']' timed out after 600 seconds
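
For context on the error itself, here is a minimal standalone sketch (not ocs-ci code) of the failure mode: subprocess.run() kills the child process once the timeout elapses and then raises TimeoutExpired, which is why the Popen object above shows returncode -9 (SIGKILL) even though oc had already printed the "machine ... deleted" line to stdout.

    import subprocess

    # Illustration only: a child that outlives the timeout is killed (SIGKILL on
    # POSIX, hence returncode -9) and TimeoutExpired is raised with whatever
    # output was captured before the deadline. Small numbers are used so the
    # sketch runs quickly; the real command above used timeout=600.
    try:
        subprocess.run(["sleep", "10"], capture_output=True, timeout=2)
    except subprocess.TimeoutExpired as exc:
        print(f"Command {exc.cmd} timed out after {exc.timeout} seconds")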

github-actions[bot] commented 1 month ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed in 30 days if no further activity occurs.

yitzhak12 commented 2 weeks ago

The failure still exists: https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/738/26431/1299763/1299789/log.

yitzhak12 commented 2 weeks ago

I tested it locally, and I successfully deleted the machine. A new machine with a new associated node has come up. I labeled the node with an OCS label, and Ceph health is back to being OK. We may need to wait longer for the machine to be deleted. I will test it again by increasing the timeout.
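
A minimal sketch of that direction, assuming OCP.delete() accepts a timeout argument that it forwards to exec_oc_cmd() (which ocs_ci/ocs/ocp.py:461 in the traceback above suggests); the helper name and the 1800-second value are illustrative, not the actual ocs-ci change:

    from ocs_ci.ocs.ocp import OCP

    # Illustrative only: delete the machine with a longer oc timeout than the
    # 600-second default that expired above. Assumes OCP.delete() forwards its
    # timeout to exec_oc_cmd(), as the traceback indicates.
    def delete_machine_with_longer_timeout(machine_name, timeout=1800):
        """Delete a machine object, waiting up to `timeout` seconds for oc to return."""
        machine_obj = OCP(
            kind="machine",
            namespace="openshift-machine-api",  # namespace used by the failing command
        )
        machine_obj.delete(resource_name=machine_name, timeout=timeout)

If the delete is merely slow on vSphere compact mode, a larger timeout should be enough; if it actually hangs, the bump only delays the same TimeoutExpired, so the deletion time observed in the successful local run would help pick a sensible value.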