yaronkaikov / scylla

NoSQL data store using the seastar framework, compatible with Apache Cassandra
http://scylladb.com
GNU Affero General Public License v3.0
0 stars · 0 forks · source link

[x86_64, debug] topology_experimental_raft/test_tablets failed with ReadTimeout #20

Status: Closed (yaronkaikov closed this issue 2 months ago)

yaronkaikov commented 2 months ago

https://jenkins.scylladb.com/job/scylla-master/job/next/7591/ failed with the following error:

______________________________ test_tablet_split _______________________________

manager = <test.pylib.manager_client.ManagerClient object at 0x7fbf74285650>

    @pytest.mark.asyncio
    @skip_mode('release', 'error injections are not supported in release mode')
    async def test_tablet_split(manager: ManagerClient):
        logger.info("Bootstrapping cluster")
        cmdline = [
            '--logger-log-level', 'storage_service=debug',
            '--logger-log-level', 'table=debug',
            '--target-tablet-size-in-bytes', '1024',
        ]
        servers = [await manager.server_add(cmdline=cmdline)]

        await manager.api.disable_tablet_balancing(servers[0].ip_addr)

        cql = manager.get_cql()
        await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};")
        await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);")

        # enough to trigger multiple splits with max size of 1024 bytes.
        keys = range(256)
        await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys])

        async def check():
            logger.info("Checking table")
            cql = manager.get_cql()
            rows = await cql.run_async("SELECT * FROM test.test;")
            assert len(rows) == len(keys)
            for r in rows:
                assert r.c == r.pk

        await check()

        await manager.api.flush_keyspace(servers[0].ip_addr, "test")

        tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test')
        assert tablet_count == 1

        logger.info("Adding new server")
        servers.append(await manager.server_add(cmdline=cmdline))

        # Increases the chance of tablet migration concurrent with split
        await inject_error_one_shot_on(manager, "tablet_allocator_shuffle", servers)
        await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers)

        s1_log = await manager.server_open_log(servers[0].server_id)
        s1_mark = await s1_log.mark()

        # Now there's a split and migration need, so they'll potentially run concurrently.
        await manager.api.enable_tablet_balancing(servers[0].ip_addr)

        await check()
        time.sleep(5) # Give load balancer some time to do work

        await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark)

>       await check()

test/topology_experimental_raft/test_tablets.py:723: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    async def check():
        logger.info("Checking table")
        cql = manager.get_cql()
>       rows = await cql.run_async("SELECT * FROM test.test;")
E       cassandra.ReadTimeout: Error from server: code=1200 [Coordinator node timed out waiting for replica nodes' responses] message="Operation timed out for test.test - received only 0 responses from 1 CL=LOCAL_QUORUM." info={'consistency': 'LOCAL_QUORUM', 'required_responses': 1, 'received_responses': 0}

test/topology_experimental_raft/test_tablets.py:693: ReadTimeout
------------------------------ Captured log setup ------------------------------