tests/functional/disaster-recovery/regional-dr/test_failover_and_relocate.py::TestFailoverAndRelocate::test_failover_and_relocate[primary_down-rbd] is failing with the error below.
This test case failed on the ppc64le architecture.
Version details:
OCP: 4.17.3
ODF: 4.17.0-126
ACM: 2.12
GitOps: 1.14.1
OADP: 1.4.1
Submariner: 0.19.0
VolSync: 0.11
23:54:19 - MainThread - ocs_ci.ocs.resources.drpc - INFO - C[rdr-hub1-417] - Current lastGroupSyncTime is 2024-11-15T04:42:05Z.
23:54:19 - MainThread - ocs_ci.helpers.dr_helpers - INFO - C[rdr-hub1-417] - Verified: Current lastGroupSyncTime 2024-11-15T04:42:05Z is different from previous value 2024-11-15T04:28:02Z
23:54:19 - MainThread - ocs_ci.helpers.dr_helpers - INFO - C[rdr-hub1-417] - Time in minutes since the last sync 12.233333333333333
23:54:19 - MainThread - ocs_ci.framework.pytest_customization.reports - INFO - C[rdr-hub1-417] - duration reported by tests/functional/disaster-recovery/regional-dr/test_failover_and_relocate.py::TestFailoverAndRelocate::test_failover_and_relocate[primary_down-rbd] immediately after test execution: 1797.95
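For context, the "Time in minutes since the last sync" figure in the log above is derived from the DRPC's .status.lastGroupSyncTime field. A minimal sketch of that computation (not ocs-ci code; the function name is illustrative, and the actual check with its tolerance lives in ocs_ci.helpers.dr_helpers.verify_last_group_sync_time):

from datetime import datetime, timezone

def minutes_since_last_sync(last_group_sync_time: str) -> float:
    # last_group_sync_time is the ISO 8601 value from .status.lastGroupSyncTime,
    # e.g. "2024-11-15T04:42:05Z" as seen in the log above.
    last_sync = datetime.strptime(
        last_group_sync_time, "%Y-%m-%dT%H:%M:%SZ"
    ).replace(tzinfo=timezone.utc)
    return (datetime.now(timezone.utc) - last_sync).total_seconds() / 60

# The test compares this figure against a multiple of the DRPolicy scheduling
# interval; the exact tolerance is defined in verify_last_group_sync_time.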
FAILED
_____ TestFailoverAndRelocate.test_failover_and_relocate[primary_down-rbd] _____
self = <test_failover_and_relocate.TestFailoverAndRelocate object at 0x7ffe8e85acd0>
primary_cluster_down = True, pvc_interface = 'CephBlockPool'
setup_acm_ui = None
dr_workload = <function dr_workload.<locals>.factory at 0x7ffe8f0c1ca0>
nodes_multicluster = [<ocs_ci.ocs.platform_nodes.IBMPowerNodes object at 0x7ffe8f156ee0>, <ocs_ci.ocs.platform_nodes.IBMPowerNodes object at 0x7ffe8f156760>, <ocs_ci.ocs.platform_nodes.IBMPowerNodes object at 0x7ffe8d916fa0>]
node_restart_teardown = None
    @pytest.mark.parametrize(
        argnames=["primary_cluster_down", "pvc_interface"],
        argvalues=[
            pytest.param(
                False,
                constants.CEPHBLOCKPOOL,
                marks=pytest.mark.polarion_id(polarion_id_primary_up),
                id="primary_up-rbd",
            ),
            pytest.param(
                True,
                constants.CEPHBLOCKPOOL,
                marks=pytest.mark.polarion_id(polarion_id_primary_down),
                id="primary_down-rbd",
            ),
            pytest.param(
                False,
                constants.CEPHFILESYSTEM,
                marks=pytest.mark.polarion_id(polarion_id_primary_up_cephfs),
                id="primary_up-cephfs",
            ),
            pytest.param(
                True,
                constants.CEPHFILESYSTEM,
                marks=pytest.mark.polarion_id(polarion_id_primary_down_cephfs),
                id="primary_down-cephfs",
            ),
        ],
    )
    def test_failover_and_relocate(
        self,
        primary_cluster_down,
        pvc_interface,
        setup_acm_ui,
        dr_workload,
        nodes_multicluster,
        node_restart_teardown,
    ):
        """
        Tests to verify application failover when the primary cluster is either UP or DOWN
        and relocate between managed clusters.
        This test is also compatible to be run from ACM UI,
        pass the yaml conf/ocsci/dr_ui.yaml to trigger it.
        """
        if config.RUN.get("rdr_failover_via_ui"):
            acm_obj = AcmAddClusters()

        workloads = dr_workload(
            num_of_subscription=1, num_of_appset=1, pvc_interface=pvc_interface
        )
        drpc_subscription = DRPC(namespace=workloads[0].workload_namespace)
        drpc_appset = DRPC(
            namespace=constants.GITOPS_CLUSTER_NAMESPACE,
            resource_name=f"{workloads[1].appset_placement_name}-drpc",
        )
        drpc_objs = [drpc_subscription, drpc_appset]

        primary_cluster_name = dr_helpers.get_current_primary_cluster_name(
            workloads[0].workload_namespace
        )
        config.switch_to_cluster_by_name(primary_cluster_name)
        primary_cluster_index = config.cur_index
        primary_cluster_nodes = get_node_objs()
        secondary_cluster_name = dr_helpers.get_current_secondary_cluster_name(
            workloads[0].workload_namespace
        )

        if pvc_interface == constants.CEPHFILESYSTEM:
            # Verify the creation of ReplicationDestination resources on secondary cluster
            config.switch_to_cluster_by_name(secondary_cluster_name)
            for wl in workloads:
                dr_helpers.wait_for_replication_destinations_creation(
                    wl.workload_pvc_count, wl.workload_namespace
                )

        scheduling_interval = dr_helpers.get_scheduling_interval(
            workloads[0].workload_namespace
        )
        wait_time = 2 * scheduling_interval  # Time in minutes
        logger.info(f"Waiting for {wait_time} minutes to run IOs")
        sleep(wait_time * 60)

        for obj in drpc_objs:
            before_failover_last_group_sync_time = (
                dr_helpers.verify_last_group_sync_time(obj, scheduling_interval)
            )
        logger.info("Verified lastGroupSyncTime before failover.")

        if config.RUN.get("rdr_failover_via_ui"):
            logger.info("Start the process of Failover from ACM UI")
            config.switch_acm_ctx()
            dr_submariner_validation_from_ui(acm_obj)

        # Stop primary cluster nodes
        if primary_cluster_down:
            config.switch_to_cluster_by_name(primary_cluster_name)
            logger.info(f"Stopping nodes of primary cluster: {primary_cluster_name}")
            nodes_multicluster[primary_cluster_index].stop_nodes(primary_cluster_nodes)

            # Verify if cluster is marked unavailable on ACM console
            if config.RUN.get("rdr_failover_via_ui"):
                config.switch_acm_ctx()
                check_cluster_status_on_acm_console(
                    acm_obj,
                    down_cluster_name=primary_cluster_name,
                    expected_text="Unknown",
                )
        elif config.RUN.get("rdr_failover_via_ui"):
            check_cluster_status_on_acm_console(acm_obj)

        for wl in workloads:
            if config.RUN.get("rdr_failover_via_ui"):
                # Failover via ACM UI
                failover_relocate_ui(
                    acm_obj,
                    scheduling_interval=scheduling_interval,
                    workload_to_move=f"{wl.workload_name}-1",
                    policy_name=wl.dr_policy_name,
                    failover_or_preferred_cluster=secondary_cluster_name,
                )
            else:
                # Failover action via CLI
                dr_helpers.failover(
                    secondary_cluster_name,
                    wl.workload_namespace,
                    wl.workload_type,
                    wl.appset_placement_name
                    if wl.workload_type == constants.APPLICATION_SET
                    else None,
                )

        # Verify resources creation on secondary cluster (failoverCluster)
        config.switch_to_cluster_by_name(secondary_cluster_name)
        for wl in workloads:
            dr_helpers.wait_for_all_resources_creation(
                wl.workload_pvc_count,
                wl.workload_pod_count,
                wl.workload_namespace,
            )

        # Verify resources deletion from primary cluster
        config.switch_to_cluster_by_name(primary_cluster_name)

        # Start nodes if cluster is down
        if primary_cluster_down:
            logger.info(
                f"Waiting for {wait_time} minutes before starting nodes of primary cluster: {primary_cluster_name}"
            )
            sleep(wait_time * 60)
            nodes_multicluster[primary_cluster_index].start_nodes(primary_cluster_nodes)
            wait_for_nodes_status([node.name for node in primary_cluster_nodes])
            logger.info("Wait for 180 seconds for pods to stabilize")
            sleep(180)
            logger.info(
                "Wait for all the pods in openshift-storage to be in running state"
            )
            assert wait_for_pods_to_be_running(
                timeout=720
            ), "Not all the pods reached running state"
            logger.info("Checking for Ceph Health OK")
            ceph_health_check()

        for wl in workloads:
            dr_helpers.wait_for_all_resources_deletion(wl.workload_namespace)

        if pvc_interface == constants.CEPHFILESYSTEM:
            for wl in workloads:
                # Verify the deletion of ReplicationDestination resources on secondary cluster
                config.switch_to_cluster_by_name(secondary_cluster_name)
                dr_helpers.wait_for_replication_destinations_deletion(
                    wl.workload_namespace
                )
                # Verify the creation of ReplicationDestination resources on primary cluster
                config.switch_to_cluster_by_name(primary_cluster_name)
                dr_helpers.wait_for_replication_destinations_creation(
                    wl.workload_pvc_count, wl.workload_namespace
                )

        if pvc_interface == constants.CEPHBLOCKPOOL:
            dr_helpers.wait_for_mirroring_status_ok(
                replaying_images=sum([wl.workload_pvc_count for wl in workloads])
            )

        after_failover_last_group_sync_time = []
        for obj in drpc_objs:
            after_failover_last_group_sync_time.append(
>               dr_helpers.verify_last_group_sync_time(
                    obj, scheduling_interval, before_failover_last_group_sync_time
                )
            )
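The failure occurs in the post-failover lastGroupSyncTime verification for the two DRPC objects (the subscription workload and the ApplicationSet workload). To check manually on the hub whether group sync resumed after failover, the DRPC status can be queried directly. A hedged sketch, where the DRPC name and namespace are placeholders for the actual workload resources:

import subprocess

def get_last_group_sync_time(drpc_name: str, namespace: str) -> str:
    # Query .status.lastGroupSyncTime of a DRPlacementControl on the hub cluster.
    # An empty result means no group sync has been recorded since the failover.
    cmd = [
        "oc", "get", "drpc", drpc_name,
        "-n", namespace,
        "-o", "jsonpath={.status.lastGroupSyncTime}",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    return result.stdout.strip()

# Example with placeholder names:
# get_last_group_sync_time("busybox-drpc", "busybox-workloads-1")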
Full test log: test_failover_and_relocate[primary_down-rbd].log