=================================== FAILURES ===================================
_______________________ test_recover_stuck_raft_recovery _______________________
request = <FixtureRequest for <Function test_recover_stuck_raft_recovery>>
manager = <test.pylib.manager_client.ManagerClient object at 0x7fb3b9375dd0>
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
@log_run_time
async def test_recover_stuck_raft_recovery(request, manager: ManagerClient):
"""
After creating a cluster, we enter RECOVERY state on every server. Then, we delete the Raft data
and the upgrade state on all servers. We restart them and the upgrade procedure starts. One of the
servers fails, the rest enter 'synchronize' state. We assume the failed server cannot be recovered.
We cannot just remove it at this point; it's already part of group 0, `remove_from_group0` will wait
until upgrade procedure finishes - but the procedure is stuck. To proceed we enter RECOVERY state on
the other servers, remove the failed one, and clear existing Raft data. After leaving RECOVERY the
remaining nodes will restart the procedure, establish a new group 0 and finish upgrade.
"""
cfg = {'enable_user_defined_functions': False,
'force_gossip_topology_changes': True}
servers = [await manager.server_add(config=cfg) for _ in range(3)]
srv1, *others = servers
logging.info("Waiting until driver connects to every server")
cql = manager.get_cql()
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
logging.info(f"Setting recovery state on {hosts}")
await asyncio.gather(*(enter_recovery_state(cql, h) for h in hosts))
await asyncio.gather(*(manager.server_restart(srv.server_id) for srv in servers))
cql = await reconnect_driver(manager)
logging.info(f"Cluster restarted, waiting until driver reconnects to {others}")
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
logging.info(f"Driver reconnected, hosts: {hosts}")
logging.info(f"Deleting Raft data and upgrade state on {hosts}")
await asyncio.gather(*(delete_raft_data_and_upgrade_state(cql, h) for h in hosts))
logging.info(f"Stopping {servers}")
await asyncio.gather(*(manager.server_stop_gracefully(srv.server_id) for srv in servers))
logging.info(f"Starting {srv1} with injected group 0 upgrade error")
await manager.server_update_config(srv1.server_id, 'error_injections_at_startup', ['group0_upgrade_before_synchronize'])
await manager.server_start(srv1.server_id)
logging.info(f"Starting {others}")
await asyncio.gather(*(manager.server_start(srv.server_id) for srv in others))
cql = await reconnect_driver(manager)
logging.info(f"Cluster restarted, waiting until driver reconnects to {others}")
hosts = await wait_for_cql_and_get_hosts(cql, others, time.time() + 60)
logging.info(f"Driver reconnected, hosts: {hosts}")
logging.info(f"Waiting until {hosts} enter 'synchronize' state")
await asyncio.gather(*(wait_for_upgrade_state('synchronize', cql, h, time.time() + 60) for h in hosts))
logging.info(f"{hosts} entered synchronize")
log_file1 = await manager.server_open_log(srv1.server_id)
logging.info(f"Checking if Raft upgrade procedure failed on {srv1}")
await log_file1.wait_for("error injection before group 0 upgrade enters synchronize")
logging.info(f"Setting recovery state on {hosts}")
await asyncio.gather(*(enter_recovery_state(cql, h) for h in hosts))
logging.info(f"Restarting {others}")
await asyncio.gather(*(manager.server_restart(srv.server_id) for srv in others))
cql = await reconnect_driver(manager)
logging.info(f"{others} restarted, waiting until driver reconnects to them")
hosts = await wait_for_cql_and_get_hosts(cql, others, time.time() + 60)
logging.info(f"Checking if {hosts} are in recovery state")
for host in hosts:
> rs = await cql.run_async(
"select value from system.scylla_local where key = 'group0_upgrade_state'",
host=host)
E cassandra.cluster.NoHostAvailable: ('Unable to complete the operation against any hosts', {<Host: 127.232.37.35:9042 datacenter1>: ConnectionException('Host has been marked down or removed')})
test/topology_custom/test_raft_recovery_stuck.py:88: NoHostAvailable
------------------------------ Captured log setup ------------------------------
https://jenkins.scylladb.com/job/scylla-master/job/next/7785/ failed with the following error:
Attached log file