vegaprotocol / vega

A Go implementation of the Vega Protocol, a protocol for creating and trading derivatives on a fully decentralised network.
https://vega.xyz
GNU Affero General Public License v3.0
37 stars 19 forks source link

[Bug]: Consensus failure during protocol upgrade test #9304

Closed jgsbennett closed 10 months ago

jgsbennett commented 10 months ago

Problem encountered

There is a consensus failure during one of the protocol upgrade tests. The consensus failure takes place during the test's "wait_for_upgrade" step. The relevant test is checking governance transfers during protocol upgrade.

context = {'AssetID': '8165572b7550e88f836fb2d8121427915025c165044af6bfe7145b01cb6ce4e9', 'app_state.network_parameters': {'bloc..., ...}, 'current_test_name': 'test_governance_transfers_during_protocol_upgrade', 'current_test_start_block': 124, ...}
proposer = '10A93F18C4'

    @pytest.mark.gov_transfers
    @pytest.mark.network_infra
    def test_governance_transfers_during_protocol_upgrade(context, proposer):
        """
        Transfer proposals in either a pre or post enactment state are restored after a protocol upgrade (0028-GOVE-102).

        Recurring transfers proposed before an upgrade which start before, during or after an upgrade should complete on the
         proposed end epoch (0028-GOVE-130)

        One off delivery transfers proposed before an upgrade which are due to start during or after an upgrade should
        complete either when the network is available again or at the proposed delivery date/time (0028-GOVE-131)
        """
        tag = FUTURE_VERSION_TAG

        # build selection of transfers, active and dormant to test survivability across upgrade
        destination_accounts = rewardMod.setup_treasury_data(count=4)

        _transfer_one_off_delivered = governMod.GovernanceTransfer(
            transfer_type="All_Or_Nothing",
            amount=0.5,
            source_type="Reward",
            destination_type="General",
            destination=destination_accounts[0][0],
            asset_id=settings.context["gov_transfer_asset_0"],
        )
        _transfer_one_off_delivered.propose(proposer)

        _transfer_enacted_dormant = governMod.GovernanceTransfer(
            transfer_type="All_Or_Nothing",
            amount=0.5,
            source_type="Reward",
            destination_type="General",
            destination=destination_accounts[0][1],
            asset_id=settings.context["gov_transfer_asset_1"],
            transfer_deliver_on=time.time() + 40,  # expected to complete sometime after full restart
        )
        _transfer_enacted_dormant.propose(proposer)

        _transfer_recurr_dormant = governMod.GovernanceTransfer(
            transfer_type="All_Or_Nothing",
            amount=0.5,
            source_type="Reward",
            destination_type="General",
            destination=destination_accounts[0][2],
            asset_id=settings.context["gov_transfer_asset_2"],
            start_offset=8,
            end_offset=8,  # expected to complete immediately on full restart
        )
        _transfer_recurr_dormant.propose(proposer)

        _transfer_recurr_active = governMod.GovernanceTransfer(
            transfer_type="All_Or_Nothing",
            amount=0.5,
            source_type="Reward",
            destination_type="General",
            destination=destination_accounts[0][3],
            asset_id=settings.context["gov_transfer_asset_3"],
            start_offset=2,
            end_offset=12,  # expected to resume after full restart and complete
        )
        _transfer_recurr_active.propose(proposer)

        # Ask VC to prep the directories for upgrade
        network.Network.protocol_upgrade(run_config_path=DEFAULT_VISOR_RUN_CONFIG, release_tag=tag)

        # Send propose transaction for all nodes for upgrade
        nodes = network.Network.list_nodes(mode="validator")
        upgrade_block_height = networkMod.get_api_statistics().statistics.block_height + 40
        networkMod.submit_upgrade_proposal_for_nodes(nodes, upgrade_block_height, tag)
        check_proposal_in_datanode(
            upgrade_block_height, tag, "PROTOCOL_UPGRADE_PROPOSAL_STATUS_PENDING", [n.get_vega_pub_key() for n in nodes]
        )

>       networkMod.wait_for_upgrade(upgrade_block_height=upgrade_block_height, release_tag=tag)

tests/protocol_upgrade/protocol_upgrade_test.py:2224: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
decorators.py:16: in time_closure
    result = func(*args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

upgrade_block_height = 378, release_tag = 'v99.99.0+dev.806a8ae6.modified'
timeout = 90
nodes_to_check = [<modules.vegacapsule.network.Node object at 0x7f6f1b331f90>, <modules.vegacapsule.network.Node object at 0x7f6f1b31a6...odules.vegacapsule.network.Node object at 0x7f6f1b31af50>, <modules.vegacapsule.network.Node object at 0x7f6f1b319650>]

    @timefunc
    def wait_for_upgrade(upgrade_block_height, release_tag, timeout=90, nodes_to_check=None):
        """Waits until upgrade has taken places."""

        if nodes_to_check is None:
            nodes_to_check = Network.list_nodes(mode="validator")

        def upgrade_done():
            try:
                for node in nodes_to_check:
                    stats = node.statistics()
                    if stats.block_height <= upgrade_block_height:
                        logging.info(
                            f"Upgrade not complete. Node {node.get_name()} reports block height {stats.block_height} <= {upgrade_block_height}"
                        )
                        return False
                    if stats.app_version != release_tag:
                        logging.info(
                            f"Upgrade not complete. Node {node.get_name()} reports version {stats.app_version} <= {release_tag}"
                        )
                        return False
            except grpc.RpcError as e:
                if e.code() == grpc.StatusCode.UNAVAILABLE:
                    logging.info(f"Node {node.get_name()} currently unavailable.")
                    return False
                else:
                    raise
            except grpc.FutureTimeoutError as e:
                logging.info(f"Node {node.get_name()} currently unavailable.")
                return False
            Network.wait_for_data_node_ready((now + timeout) - time.time())
            logging.info(f"All nodes report correct version and block height > upgrade block.")
            return True

        now = time.time()
        while time.time() < now + timeout:
            if upgrade_done():
                # we have to restart the scanners because they will have become disconnected when the nodes stopped and started
                conftest.restart_scanners(clean=False, reason="protocol upgrade")
                return
            time.sleep(0.2)
>       raise AssertionError(f"Upgrade did not complete in {timeout} seconds.")
E       AssertionError: Upgrade did not complete in 90 seconds.

Observed behaviour

Consensus failure

Expected behaviour

Never get consensus failures

Steps to reproduce

Unknown. Only seen one occurrence so far.

Software version

806a8ae61fd34d97f8242ca417c9db134e28480b

Failing test

No response

Jenkins run

https://jenkins.vega.rocks/job/common/job/system-tests-wrapper/3966/testReport/tests.protocol_upgrade/protocol_upgrade_test/test_governance_transfers_during_protocol_upgrade/

Configuration used

No response

Relevant log output

2023-09-05T15:57:04.506Z    ERROR   tendermint  consensus/state.go:732  CONSENSUS FAILURE!!!    {"err": "precommit step; +2/3 prevoted for an invalid block: wrong Block.Header.AppHash.  Expected F2B9BC2E343B09980CB7D5A419527BBC8DCF296FC976F256C190AF02E6597DCD, got B5CDA7833598B7A341D6831081EDA5152E91372012B4D5092916AD667283D830", "stack": "goroutine 160 [running]:\nruntime/debug.Stack()\n\t/usr/local/go/src/runtime/debug/stack.go:24 +0x65\ngithub.com/tendermint/tendermint/consensus.(*State).receiveRoutine.func2()\n\t/jenkins/GOPATH/pkg/mod/github.com/vegaprotocol/cometbft@v0.34.28-0.20230322133204-3d8588de736e/consensus/state.go:732 +0x4c\npanic({0x486b1c0, 0xc000ef3f00})\n\t/usr/local/go/src/runtime/panic.go:884 +0x213\ngithub.com/tendermint/tendermint/consensus.(*State).enterPrecommit(0xc000ed4e00, 0x53, 0x8)\n\t/jenkins/GOPATH/pkg/mod/github.com/vegaprotocol/cometbft@v0.34.28-0.20230322133204-3d8588de736e/consensus/state.go:1414 +0x1699\ngithub.com/tendermint/tendermint/consensus.(*State).addVote(0xc000ed4e00, 0xc004127f40, {0xc000673590, 0x28})\n\t/jenkins/GOPATH/pkg/mod/github.com/vegaprotocol/cometbft@v0.34.28-0.20230322133204-3d8588de736e/consensus/state.go:2137 +0x13d8\ngithub.com/tendermint/tendermint/consensus.(*State).tryAddVote(0xc000ed4e00, 0xc004127f40, {0xc000673590?, 0xc00110dc08?})\n\t/jenkins/GOPATH/pkg/mod/github.com/vegaprotocol/cometbft@v0.34.28-0.20230322133204-3d8588de736e/consensus/state.go:1963 +0x2c\ngithub.com/tendermint/tendermint/consensus.(*State).handleMsg(0xc000ed4e00, {{0x590c1c0, 0xc0044ee740}, {0xc000673590, 0x28}})\n\t/jenkins/GOPATH/pkg/mod/github.com/vegaprotocol/cometbft@v0.34.28-0.20230322133204-3d8588de736e/consensus/state.go:861 +0x3ff\ngithub.com/tendermint/tendermint/consensus.(*State).receiveRoutine(0xc000ed4e00, 0x0)\n\t/jenkins/GOPATH/pkg/mod/github.com/vegaprotocol/cometbft@v0.34.28-0.20230322133204-3d8588de736e/consensus/state.go:768 +0x3f0\ncreated by github.com/tendermint/tendermint/consensus.(*State).OnStart\n\t/jenkins/GOPATH/pkg/mod/github.com/vegaprotocol/cometbft@v0.34.28-0.20230322133204-3d8588de736e/consensus/state.go:379 +0x12d\n"}
jgsbennett commented 10 months ago

The node it comes from seems to be this one: https://jenkins.vega.rocks/job/common/job/system-tests-wrapper/3966/artifact/testnet/logs/testnet-nodeset-validators-2-validator/vega-validator-2.stdout-2023-09-05T15%3A56%3A25Z.log

wwestgarth commented 10 months ago

Not a core issue. The test that is falling out of consensus after a protocol upgrade is still using the old binary version which is why its app-hash differs.