yaronkaikov / scylla

NoSQL data store using the seastar framework, compatible with Apache Cassandra
http://scylladb.com
GNU Affero General Public License v3.0
0 stars · 0 forks · source link

[x86_64, debug] topology_experimental_raft/test_tablets failed with ReadTimeout #18

Status: Closed (yaronkaikov closed this issue 2 months ago)

yaronkaikov commented 2 months ago

The Jenkins run https://jenkins.scylladb.com/job/scylla-master/job/next/7591/ failed with the following error:


manager = <test.pylib.manager_client.ManagerClient object at 0x7fbf74285650>

    @pytest.mark.asyncio

    @skip_mode('release', 'error injections are not supported in release mode')

    async def test_tablet_split(manager: ManagerClient):

        logger.info("Bootstrapping cluster")

        cmdline = [

            '--logger-log-level', 'storage_service=debug',

            '--logger-log-level', 'table=debug',

            '--target-tablet-size-in-bytes', '1024',

        ]

        servers = [await manager.server_add(cmdline=cmdline)]

        await manager.api.disable_tablet_balancing(servers[0].ip_addr)

        cql = manager.get_cql()

        await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};")

        await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);")

        # enough to trigger multiple splits with max size of 1024 bytes.

        keys = range(256)

        await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys])

        async def check():

            logger.info("Checking table")

            cql = manager.get_cql()

            rows = await cql.run_async("SELECT * FROM test.test;")

            assert len(rows) == len(keys)

            for r in rows:

                assert r.c == r.pk

        await check()

        await manager.api.flush_keyspace(servers[0].ip_addr, "test")

        tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test')

        assert tablet_count == 1

        logger.info("Adding new server")

        servers.append(await manager.server_add(cmdline=cmdline))

        # Increases the chance of tablet migration concurrent with split

        await inject_error_one_shot_on(manager, "tablet_allocator_shuffle", servers)

        await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers)

        s1_log = await manager.server_open_log(servers[0].server_id)

        s1_mark = await s1_log.mark()

        # Now there's a split and migration need, so they'll potentially run concurrently.

        await manager.api.enable_tablet_balancing(servers[0].ip_addr)

        await check()

        time.sleep(5) # Give load balancer some time to do work

        await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark)

>       await check()

test/topology_experimental_raft/test_tablets.py:723: 

_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    async def check():

        logger.info("Checking table")

        cql = manager.get_cql()

>       rows = await cql.run_async("SELECT * FROM test.test;")

E       cassandra.ReadTimeout: Error from server: code=1200 [Coordinator node timed out waiting for replica nodes' responses] message="Operation timed out for test.test - received only 0 responses from 1 CL=LOCAL_QUORUM." info={'consistency': 'LOCAL_QUORUM', 'required_responses': 1, 'received_responses': 0}

test/topology_experimental_raft/test_tablets.py:693: ReadTimeout

------------------------------ Captured log setup ------------------------------