lni / dragonboat

A feature complete and high performance multi-group Raft library in Go.
Apache License 2.0

Graceful rolling update #270

Closed hulucc closed 1 year ago

hulucc commented 1 year ago

I want to do a graceful rolling update, and I noticed that a leader election is triggered when restarting a stopped node, which causes about 1 second of interruption. Is this by design, and is there any way I can avoid it? I expect the graceful rolling update to work like this:

  1. the leader node gives away leadership before shutdown (can be done with RequestLeaderTransfer)
  2. the former leader node restarts and rejoins the cluster without triggering a new leader election
  3. repeat this for all nodes
lni commented 1 year ago

Please enable the CheckQuorum option and see whether that solves your problem. https://github.com/lni/dragonboat/blob/023bafb8e648934a8f16b98ff58d91b743358687/config/config.go#L74
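
For reference, a minimal sketch of a config with that option enabled (newRaftConfig is a hypothetical helper; the RTT values here are illustrative, not prescriptive):

package main

import "github.com/lni/dragonboat/v3/config"

// newRaftConfig returns a Raft config with CheckQuorum enabled; with it on,
// a leader steps down once it can no longer hear from a quorum of the cluster.
func newRaftConfig(clusterID, nodeID uint64) config.Config {
    return config.Config{
        NodeID:       nodeID,
        ClusterID:    clusterID,
        CheckQuorum:  true,
        ElectionRTT:  10,
        HeartbeatRTT: 1,
    }
}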

I don't quite understand why such a restarted node would trigger an election. Once started, it waits for the election timeout first; during that window it is expected to receive a heartbeat from the leader node and stay a follower.

If the above CheckQuorum option doesn't help, please provide more details, including your configuration and more logs of what happened on the restarted node.

hulucc commented 1 year ago

Minimal reproduction code, dragonboat version: v3.6.6

package main

import (
    "context"
    "encoding/binary"
    "fmt"
    "io"
    "log"
    "time"
    "errors"

    "github.com/lni/dragonboat/v3"
    "github.com/lni/dragonboat/v3/config"
    sm "github.com/lni/dragonboat/v3/statemachine"
    "github.com/lni/goutils/syncutil"
)

type exampleStateMachine struct {
    ClusterID uint64
    NodeID    uint64
    Tick      uint64
}

func NewExampleStateMachine(clusterID uint64, nodeID uint64) sm.IStateMachine {
    return &exampleStateMachine{ClusterID: clusterID, NodeID: nodeID}
}
func (it *exampleStateMachine) Update(req []byte) (sm.Result, error) {
    it.Tick++
    return sm.Result{}, nil
}
func (it *exampleStateMachine) Lookup(req interface{}) (interface{}, error) {
    return it.Tick, nil
}
func (it *exampleStateMachine) SaveSnapshot(w io.Writer, _ sm.ISnapshotFileCollection, _ <-chan struct{}) error {
    data := make([]byte, 8)
    binary.LittleEndian.PutUint64(data, it.Tick)
    // propagate write errors instead of silently dropping them
    _, err := w.Write(data)
    return err
}
func (it *exampleStateMachine) RecoverFromSnapshot(r io.Reader, _ []sm.SnapshotFile, _ <-chan struct{}) error {
    // propagate read errors instead of silently dropping them
    data, err := io.ReadAll(r)
    if err != nil {
        return err
    }
    it.Tick = binary.LittleEndian.Uint64(data)
    return nil
}
func (it *exampleStateMachine) Close() error {
    return nil
}

func newNodeHost(nhid uint64, addr string) *dragonboat.NodeHost {
    conf := config.NodeHostConfig{
        NodeHostDir:         fmt.Sprintf("single_nodehost_test_dir_safe_to_delete/%d", nhid),
        AddressByNodeHostID: false,
        RTTMillisecond:      100,
        RaftAddress:         addr,
        Expert: config.ExpertConfig{
            TestNodeHostID: nhid,
        },
    }
    nh, err := dragonboat.NewNodeHost(conf)
    if err != nil {
        panic(err)
    }
    return nh
}

func startShard(nh *dragonboat.NodeHost, clusterID uint64, nodeID uint64, initials map[uint64]string, fn interface{}) {
    conf := config.Config{
        NodeID:              nodeID,
        ClusterID:           clusterID,
        CheckQuorum:         true,
        ElectionRTT:         10,
        HeartbeatRTT:        1,
        SnapshotEntries:     0,
        CompactionOverhead:  0,
        OrderedConfigChange: false,
    }
    var err error
    switch f := fn.(type) {
    case sm.CreateStateMachineFunc:
        if len(initials) > 0 {
            err = nh.StartCluster(initials, false, f, conf)
        } else {
            err = nh.StartCluster(nil, false, f, conf)
        }
    case sm.CreateConcurrentStateMachineFunc:
        if len(initials) > 0 {
            err = nh.StartConcurrentCluster(initials, false, f, conf)
        } else {
            err = nh.StartConcurrentCluster(nil, false, f, conf)
        }
    default:
        panic(fmt.Errorf("unknown fn type: %T", f))
    }
    if err != nil {
        panic(err)
    }
}

func tick(nh *dragonboat.NodeHost, clusterID uint64) *syncutil.Stopper {
    stopper := syncutil.NewStopper()
    stopper.RunWorker(func() {
        t := time.NewTicker(time.Millisecond * 30)
        defer t.Stop()
        nop := nh.GetNoOPSession(clusterID)
        ctx := context.Background()
        for {
            select {
            case <-t.C:
                ctx1, cancel := context.WithTimeout(ctx, time.Second)
                _, err := nh.SyncPropose(ctx1, nop, []byte("tick"))
                cancel()
                if err != nil {
                    log.Println("[WARN]", fmt.Errorf("node %d shard %d tick err: %w", nh.NodeHostConfig().Expert.TestNodeHostID, clusterID, err))
                }
            case <-stopper.ShouldStop():
                return
            }
        }
    })
    return stopper
}

func waitReady(nh *dragonboat.NodeHost, clusterID uint64) {
    for {
        ctx, cancel := context.WithTimeout(context.TODO(), time.Second)
        _, err := nh.SyncRead(ctx, clusterID, nil)
        cancel() // cancel per iteration; a defer inside the loop would leak contexts
        if errors.Is(err, dragonboat.ErrClusterNotReady) {
            time.Sleep(time.Second)
            continue
        }
        return
    }
}

func main() {
    clusterID := uint64(0)
    fn := sm.CreateStateMachineFunc(NewExampleStateMachine)
    initials := map[uint64]string{
        1: "127.0.0.1:3001",
        2: "127.0.0.1:3002",
        3: "127.0.0.1:3003",
    }
    nh1 := newNodeHost(1, initials[1])
    nh2 := newNodeHost(2, initials[2])
    nh3 := newNodeHost(3, initials[3])
    startShard(nh1, clusterID, 1, initials, fn)
    startShard(nh2, clusterID, 2, initials, fn)
    startShard(nh3, clusterID, 3, initials, fn)
    if err := nh1.RequestLeaderTransfer(clusterID, 1); err != nil {
        log.Fatalln("[FATAL]", fmt.Errorf("RequestLeaderTransfer err: %w", err))
    }
    waitReady(nh1, clusterID)
    stopper := tick(nh1, clusterID)
    defer stopper.Stop()
    println("phase1")

    nh3.Stop()
    time.Sleep(time.Second*3)
    println("phase2")

    nh3 = newNodeHost(3, initials[3])
    startShard(nh3, clusterID, 3, nil, fn)
    waitReady(nh3, clusterID)
    println("phase3")

    nh2.Stop()
    time.Sleep(time.Second*3)
    println("phase4")

    nh2 = newNodeHost(2, initials[2])
    startShard(nh2, clusterID, 2, nil, fn)
    waitReady(nh2, clusterID)
    println("phase5")

    stopper.Stop()
    stopper = tick(nh3, clusterID)

    if err := nh1.RequestLeaderTransfer(clusterID, 3); err != nil {
        log.Fatalln("[FATAL]", fmt.Errorf("RequestLeaderTransfer err: %w", err))
    }
    println("phase6")

    nh1.Stop()
    time.Sleep(time.Second*3)
    println("phase7")

    nh1 = newNodeHost(1, initials[1])
    startShard(nh1, clusterID, 1, nil, fn)
    waitReady(nh1, clusterID)
    println("phase8")

    time.Sleep(time.Second*5)
}

log

|| [go run .]
|| 2023-02-14 17:17:41.958516 I | dragonboat: go version: go1.19, windows/amd64
|| 2023-02-14 17:17:41.958516 I | dragonboat: dragonboat version: 3.3.5 (Rel)
|| 2023-02-14 17:17:41.958516 W | dragonboat: unsupported OS/ARCH windows/amd64, don't use for production
|| 2023-02-14 17:17:41.959032 I | config: using default EngineConfig
|| 2023-02-14 17:17:41.959032 I | config: using default LogDBConfig
|| 2023-02-14 17:17:41.959032 I | dragonboat: DeploymentID set to 1
|| 2023-02-14 17:17:41.964515 I | dragonboat: LogDB info received, shard 0, busy false
|| 2023-02-14 17:17:41.967515 I | dragonboat: LogDB info received, shard 1, busy false
|| 2023-02-14 17:17:41.970516 I | dragonboat: LogDB info received, shard 2, busy false
|| 2023-02-14 17:17:41.973515 I | dragonboat: LogDB info received, shard 3, busy false
|| 2023-02-14 17:17:41.976535 I | dragonboat: LogDB info received, shard 4, busy false
|| 2023-02-14 17:17:41.980042 I | dragonboat: LogDB info received, shard 5, busy false
|| 2023-02-14 17:17:41.983041 I | dragonboat: LogDB info received, shard 6, busy false
|| 2023-02-14 17:17:41.986134 I | dragonboat: LogDB info received, shard 7, busy false
|| 2023-02-14 17:17:41.989649 I | dragonboat: LogDB info received, shard 8, busy false
|| 2023-02-14 17:17:41.992649 I | dragonboat: LogDB info received, shard 9, busy false
|| 2023-02-14 17:17:41.996373 I | dragonboat: LogDB info received, shard 10, busy false
|| 2023-02-14 17:17:41.998869 I | dragonboat: LogDB info received, shard 11, busy false
|| 2023-02-14 17:17:42.002190 I | dragonboat: LogDB info received, shard 12, busy false
|| 2023-02-14 17:17:42.005686 I | dragonboat: LogDB info received, shard 13, busy false
|| 2023-02-14 17:17:42.008186 I | dragonboat: LogDB info received, shard 14, busy false
|| 2023-02-14 17:17:42.011185 I | dragonboat: LogDB info received, shard 15, busy false
|| 2023-02-14 17:17:42.011185 I | logdb: using plain logdb
|| 2023-02-14 17:17:42.011185 I | dragonboat: logdb memory limit: 8192 MBytes
|| 2023-02-14 17:17:42.011185 I | dragonboat: NodeHost ID: nhid-1
|| 2023-02-14 17:17:42.011185 I | dragonboat: using regular node registry
|| 2023-02-14 17:17:42.011185 I | dragonboat: filesystem error injection mode enabled: false
|| 2023-02-14 17:17:42.012849 I | transport: transport type: go-tcp-transport
|| 2023-02-14 17:17:42.015470 I | dragonboat: transport type: go-tcp-transport
|| 2023-02-14 17:17:42.015470 I | dragonboat: logdb type: sharded-pebble
|| 2023-02-14 17:17:42.015470 I | dragonboat: nodehost address: 127.0.0.1:3001
|| 2023-02-14 17:17:42.015470 I | dragonboat: go version: go1.19, windows/amd64
|| 2023-02-14 17:17:42.015470 I | dragonboat: dragonboat version: 3.3.5 (Rel)
|| 2023-02-14 17:17:42.015470 W | dragonboat: unsupported OS/ARCH windows/amd64, don't use for production
|| 2023-02-14 17:17:42.015470 I | config: using default EngineConfig
|| 2023-02-14 17:17:42.015470 I | config: using default LogDBConfig
|| 2023-02-14 17:17:42.015470 I | dragonboat: DeploymentID set to 1
|| 2023-02-14 17:17:42.020646 I | dragonboat: LogDB info received, shard 0, busy false
|| 2023-02-14 17:17:42.023233 I | dragonboat: LogDB info received, shard 1, busy false
|| 2023-02-14 17:17:42.026861 I | dragonboat: LogDB info received, shard 2, busy false
|| 2023-02-14 17:17:42.029408 I | dragonboat: LogDB info received, shard 3, busy false
|| 2023-02-14 17:17:42.032512 I | dragonboat: LogDB info received, shard 4, busy false
|| 2023-02-14 17:17:42.036604 I | dragonboat: LogDB info received, shard 5, busy false
|| 2023-02-14 17:17:42.039054 I | dragonboat: LogDB info received, shard 6, busy false
|| 2023-02-14 17:17:42.042054 I | dragonboat: LogDB info received, shard 7, busy false
|| 2023-02-14 17:17:42.045745 I | dragonboat: LogDB info received, shard 8, busy false
|| 2023-02-14 17:17:42.048933 I | dragonboat: LogDB info received, shard 9, busy false
|| 2023-02-14 17:17:42.052546 I | dragonboat: LogDB info received, shard 10, busy false
|| 2023-02-14 17:17:42.055937 I | dragonboat: LogDB info received, shard 11, busy false
|| 2023-02-14 17:17:42.059433 I | dragonboat: LogDB info received, shard 12, busy false
|| 2023-02-14 17:17:42.062565 I | dragonboat: LogDB info received, shard 13, busy false
|| 2023-02-14 17:17:42.066266 I | dragonboat: LogDB info received, shard 14, busy false
|| 2023-02-14 17:17:42.069384 I | logdb: using plain logdb
|| 2023-02-14 17:17:42.069384 I | dragonboat: LogDB info received, shard 15, busy false
|| 2023-02-14 17:17:42.069896 I | dragonboat: logdb memory limit: 8192 MBytes
|| 2023-02-14 17:17:42.069896 I | dragonboat: NodeHost ID: nhid-2
|| 2023-02-14 17:17:42.069896 I | dragonboat: using regular node registry
|| 2023-02-14 17:17:42.069896 I | dragonboat: filesystem error injection mode enabled: false
|| 2023-02-14 17:17:42.070973 I | transport: transport type: go-tcp-transport
|| 2023-02-14 17:17:42.071479 I | dragonboat: transport type: go-tcp-transport
|| 2023-02-14 17:17:42.071479 I | dragonboat: logdb type: sharded-pebble
|| 2023-02-14 17:17:42.071479 I | dragonboat: nodehost address: 127.0.0.1:3002
|| 2023-02-14 17:17:42.071479 I | dragonboat: go version: go1.19, windows/amd64
|| 2023-02-14 17:17:42.071479 I | dragonboat: dragonboat version: 3.3.5 (Rel)
|| 2023-02-14 17:17:42.071479 W | dragonboat: unsupported OS/ARCH windows/amd64, don't use for production
|| 2023-02-14 17:17:42.071479 I | config: using default EngineConfig
|| 2023-02-14 17:17:42.071479 I | config: using default LogDBConfig
|| 2023-02-14 17:17:42.071558 I | dragonboat: DeploymentID set to 1
|| 2023-02-14 17:17:42.076060 I | dragonboat: LogDB info received, shard 0, busy false
|| 2023-02-14 17:17:42.079203 I | dragonboat: LogDB info received, shard 1, busy false
|| 2023-02-14 17:17:42.082219 I | dragonboat: LogDB info received, shard 2, busy false
|| 2023-02-14 17:17:42.085220 I | dragonboat: LogDB info received, shard 3, busy false
|| 2023-02-14 17:17:42.087897 I | dragonboat: LogDB info received, shard 4, busy false
|| 2023-02-14 17:17:42.090896 I | dragonboat: LogDB info received, shard 5, busy false
|| 2023-02-14 17:17:42.093894 I | dragonboat: LogDB info received, shard 6, busy false
|| 2023-02-14 17:17:42.097009 I | dragonboat: LogDB info received, shard 7, busy false
|| 2023-02-14 17:17:42.100196 I | dragonboat: LogDB info received, shard 8, busy false
|| 2023-02-14 17:17:42.103345 I | dragonboat: LogDB info received, shard 9, busy false
|| 2023-02-14 17:17:42.105987 I | dragonboat: LogDB info received, shard 10, busy false
|| 2023-02-14 17:17:42.108622 I | dragonboat: LogDB info received, shard 11, busy false
|| 2023-02-14 17:17:42.111842 I | dragonboat: LogDB info received, shard 12, busy false
|| 2023-02-14 17:17:42.114451 I | dragonboat: LogDB info received, shard 13, busy false
|| 2023-02-14 17:17:42.117568 I | dragonboat: LogDB info received, shard 14, busy false
|| 2023-02-14 17:17:42.120759 I | dragonboat: LogDB info received, shard 15, busy false
|| 2023-02-14 17:17:42.120759 I | logdb: using plain logdb
|| 2023-02-14 17:17:42.121272 I | dragonboat: logdb memory limit: 8192 MBytes
|| 2023-02-14 17:17:42.121272 I | dragonboat: NodeHost ID: nhid-3
|| 2023-02-14 17:17:42.121272 I | dragonboat: using regular node registry
|| 2023-02-14 17:17:42.121272 I | dragonboat: filesystem error injection mode enabled: false
|| 2023-02-14 17:17:42.122064 I | transport: transport type: go-tcp-transport
|| 2023-02-14 17:17:42.122580 I | dragonboat: transport type: go-tcp-transport
|| 2023-02-14 17:17:42.122580 I | dragonboat: logdb type: sharded-pebble
|| 2023-02-14 17:17:42.122580 I | dragonboat: nodehost address: 127.0.0.1:3003
|| 2023-02-14 17:17:42.123139 I | dragonboat: [00000:00001] replaying raft logs
|| 2023-02-14 17:17:42.123139 I | dragonboat: [00000:00001] has logdb entries size 1602 commit 1602 term 12
|| 2023-02-14 17:17:42.123139 I | raft: [00000:00001] created, initial: true, new: false
|| 2023-02-14 17:17:42.123139 I | raft: [00000:00001] raft log rate limit enabled: false, 0
|| 2023-02-14 17:17:42.123139 I | raft: [f:1,l:1602,t:12,c:1602,a:0] [00000:00001] t12 became follower
|| 2023-02-14 17:17:42.123688 I | rsm: [00000:00001] no snapshot available during launch
|| 2023-02-14 17:17:42.123688 I | dragonboat: [00000:00001] initialized using <00000:00001:0>
|| 2023-02-14 17:17:42.123688 I | dragonboat: [00000:00001] initial index set to 0
|| 2023-02-14 17:17:42.124208 I | dragonboat: [00000:00002] replaying raft logs
|| 2023-02-14 17:17:42.124208 I | dragonboat: [00000:00002] has logdb entries size 1602 commit 1602 term 12
|| 2023-02-14 17:17:42.124208 I | raft: [00000:00002] created, initial: true, new: false
|| 2023-02-14 17:17:42.124208 I | raft: [00000:00002] raft log rate limit enabled: false, 0
|| 2023-02-14 17:17:42.124208 I | raft: [f:1,l:1602,t:12,c:1602,a:0] [00000:00002] t12 became follower
|| 2023-02-14 17:17:42.124208 I | rsm: [00000:00002] no snapshot available during launch
|| 2023-02-14 17:17:42.124208 I | dragonboat: [00000:00002] initialized using <00000:00002:0>
|| 2023-02-14 17:17:42.124208 I | dragonboat: [00000:00002] initial index set to 0
|| 2023-02-14 17:17:42.124720 I | dragonboat: [00000:00003] replaying raft logs
|| 2023-02-14 17:17:42.124720 I | dragonboat: [00000:00003] has logdb entries size 1602 commit 1602 term 12
|| 2023-02-14 17:17:42.124720 I | raft: [00000:00003] created, initial: true, new: false
|| 2023-02-14 17:17:42.124720 I | raft: [00000:00003] raft log rate limit enabled: false, 0
|| 2023-02-14 17:17:42.124720 I | raft: [f:1,l:1602,t:12,c:1602,a:0] [00000:00003] t12 became follower
|| 2023-02-14 17:17:42.124720 I | rsm: [00000:00003] no snapshot available during launch
|| 2023-02-14 17:17:42.124720 I | dragonboat: [00000:00003] initialized using <00000:00003:0>
|| 2023-02-14 17:17:42.124720 I | dragonboat: [00000:00003] initial index set to 0
|| 2023-02-14 17:17:42.124720 W | raft: [f:1,l:1602,t:12,c:1602,a:0] [00000:00001] t12 dropped ReadIndex, no leader
|| 2023-02-14 17:17:42.124720 W | raft: [f:1,l:1602,t:12,c:1602,a:0] [00000:00001] t12 dropped LeaderTransfer, no leader
|| 2023-02-14 17:17:42.125233 I | rsm: [00000:00001] applied ADD ccid 0 (1), n00001 (127.0.0.1:3001)
|| 2023-02-14 17:17:42.125233 I | rsm: [00000:00001] applied ADD ccid 0 (2), n00002 (127.0.0.1:3002)
|| 2023-02-14 17:17:42.125233 I | rsm: [00000:00001] applied ADD ccid 0 (3), n00003 (127.0.0.1:3003)
|| 2023-02-14 17:17:42.173643 I | rsm: [00000:00002] applied ADD ccid 0 (1), n00001 (127.0.0.1:3001)
|| 2023-02-14 17:17:42.173643 I | rsm: [00000:00002] applied ADD ccid 0 (2), n00002 (127.0.0.1:3002)
|| 2023-02-14 17:17:42.173909 I | rsm: [00000:00002] applied ADD ccid 0 (3), n00003 (127.0.0.1:3003)
|| 2023-02-14 17:17:42.233179 I | rsm: [00000:00003] applied ADD ccid 0 (1), n00001 (127.0.0.1:3001)
|| 2023-02-14 17:17:42.233179 I | rsm: [00000:00003] applied ADD ccid 0 (2), n00002 (127.0.0.1:3002)
|| 2023-02-14 17:17:42.233179 I | rsm: [00000:00003] applied ADD ccid 0 (3), n00003 (127.0.0.1:3003)
|| 2023-02-14 17:17:43.135162 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00001] t12 dropped ReadIndex, no leader
|| 2023-02-14 17:17:43.181232 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00002] t13 became candidate
|| 2023-02-14 17:17:43.181232 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00002] t13 received RequestVoteResp from n00002
|| 2023-02-14 17:17:43.181232 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00002] t13 sent RequestVote to n00003
|| 2023-02-14 17:17:43.181758 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00002] t13 sent RequestVote to n00001
|| 2023-02-14 17:17:43.189452 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00001] t12 received RequestVote with higher term (13) from n00002
|| 2023-02-14 17:17:43.189452 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00001] t12 become followerKE after receiving higher term from n00002
|| 2023-02-14 17:17:43.189452 I | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00001] t13 became follower
|| 2023-02-14 17:17:43.190450 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00001] t13 cast vote from n00002 index 1602 term 13, log term: 12
|| 2023-02-14 17:17:43.191450 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00003] t12 received RequestVote with higher term (13) from n00002
|| 2023-02-14 17:17:43.191450 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00003] t12 become followerKE after receiving higher term from n00002
|| 2023-02-14 17:17:43.192449 I | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00003] t13 became follower
|| 2023-02-14 17:17:43.192449 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00003] t13 cast vote from n00002 index 1602 term 13, log term: 12
|| 2023-02-14 17:17:43.196047 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00002] t13 received RequestVoteResp from n00001
|| 2023-02-14 17:17:43.196047 W | raft: [f:1,l:1602,t:12,c:1602,a:1602] [00000:00002] t13 received 2 votes and 0 rejections, quorum is 2
|| 2023-02-14 17:17:43.196047 I | raft: [f:1,l:1603,t:13,c:1602,a:1602] [00000:00002] t13 became leader
|| phase1
|| 2023-02-14 17:17:44.177114 E | transport: send batch failed, target 127.0.0.1:3003 (write tcp 127.0.0.1:52308->127.0.0.1:3003: wsasend: An existing connection was forcibly closed by the remote host.), 1
|| 2023-02-14 17:17:44.177114 W | transport: breaker 127.0.0.1:3002 to 127.0.0.1:3003 failed, connect and process failed: write tcp 127.0.0.1:52308->127.0.0.1:3003: wsasend: An existing connection was forcibly closed by the remote host.
|| 2023-02-14 17:17:44.177114 W | transport: 127.0.0.1:3003 became unreachable, affected 1 nodes
|| 2023-02-14 17:17:46.826915 E | transport: Nodehost 127.0.0.1:3002 failed to get a connection to 127.0.0.1:3003, dial tcp 127.0.0.1:3003: connectex: No connection could be made because the target machine actively refused it.
|| 2023-02-14 17:17:46.826915 W | transport: breaker 127.0.0.1:3002 to 127.0.0.1:3003 failed, connect and process failed: dial tcp 127.0.0.1:3003: connectex: No connection could be made because the target machine actively refused it.
|| 2023-02-14 17:17:46.826915 W | transport: 127.0.0.1:3003 became unreachable, affected 0 nodes
|| phase2
|| 2023-02-14 17:17:47.169227 I | dragonboat: go version: go1.19, windows/amd64
|| 2023-02-14 17:17:47.169227 I | dragonboat: dragonboat version: 3.3.5 (Rel)
|| 2023-02-14 17:17:47.169227 W | dragonboat: unsupported OS/ARCH windows/amd64, don't use for production
|| 2023-02-14 17:17:47.169227 I | config: using default EngineConfig
|| 2023-02-14 17:17:47.169227 I | config: using default LogDBConfig
|| 2023-02-14 17:17:47.169472 I | dragonboat: DeploymentID set to 1
|| 2023-02-14 17:17:47.178334 I | dragonboat: LogDB info received, shard 0, busy false
|| 2023-02-14 17:17:47.181350 I | dragonboat: LogDB info received, shard 1, busy false
|| 2023-02-14 17:17:47.185958 I | dragonboat: LogDB info received, shard 2, busy false
|| 2023-02-14 17:17:47.189458 I | dragonboat: LogDB info received, shard 3, busy false
|| 2023-02-14 17:17:47.193457 I | dragonboat: LogDB info received, shard 4, busy false
|| 2023-02-14 17:17:47.196957 I | dragonboat: LogDB info received, shard 5, busy false
|| 2023-02-14 17:17:47.200404 I | dragonboat: LogDB info received, shard 6, busy false
|| 2023-02-14 17:17:47.204289 I | dragonboat: LogDB info received, shard 7, busy false
|| 2023-02-14 17:17:47.207289 I | dragonboat: LogDB info received, shard 8, busy false
|| 2023-02-14 17:17:47.210808 I | dragonboat: LogDB info received, shard 9, busy false
|| 2023-02-14 17:17:47.213791 I | dragonboat: LogDB info received, shard 10, busy false
|| 2023-02-14 17:17:47.216723 I | dragonboat: LogDB info received, shard 11, busy false
|| 2023-02-14 17:17:47.219718 I | dragonboat: LogDB info received, shard 12, busy false
|| 2023-02-14 17:17:47.223395 I | dragonboat: LogDB info received, shard 13, busy false
|| 2023-02-14 17:17:47.226391 I | dragonboat: LogDB info received, shard 14, busy false
|| 2023-02-14 17:17:47.229406 I | dragonboat: LogDB info received, shard 15, busy false
|| 2023-02-14 17:17:47.229406 I | logdb: using plain logdb
|| 2023-02-14 17:17:47.229406 I | dragonboat: logdb memory limit: 8192 MBytes
|| 2023-02-14 17:17:47.229406 I | dragonboat: NodeHost ID: nhid-3
|| 2023-02-14 17:17:47.229406 I | dragonboat: using regular node registry
|| 2023-02-14 17:17:47.229406 I | dragonboat: filesystem error injection mode enabled: false
|| 2023-02-14 17:17:47.229406 I | transport: transport type: go-tcp-transport
|| 2023-02-14 17:17:47.229406 I | dragonboat: transport type: go-tcp-transport
|| 2023-02-14 17:17:47.229406 I | dragonboat: logdb type: sharded-pebble
|| 2023-02-14 17:17:47.229406 I | dragonboat: nodehost address: 127.0.0.1:3003
|| 2023-02-14 17:17:47.230376 I | dragonboat: [00000:00003] replaying raft logs
|| 2023-02-14 17:17:47.230376 I | dragonboat: [00000:00003] has logdb entries size 1603 commit 1603 term 13
|| 2023-02-14 17:17:47.230376 I | raft: [00000:00003] created, initial: true, new: false
|| 2023-02-14 17:17:47.230376 I | raft: [00000:00003] raft log rate limit enabled: false, 0
|| 2023-02-14 17:17:47.230376 I | raft: [f:1,l:1603,t:13,c:1603,a:0] [00000:00003] t13 became follower
|| 2023-02-14 17:17:47.231377 I | rsm: [00000:00003] no snapshot available during launch
|| 2023-02-14 17:17:47.231377 I | dragonboat: [00000:00003] initialized using <00000:00003:0>
|| 2023-02-14 17:17:47.231459 I | dragonboat: [00000:00003] initial index set to 0
|| 2023-02-14 17:17:47.340019 I | rsm: [00000:00003] applied ADD ccid 0 (1), n00001 (127.0.0.1:3001)
|| 2023-02-14 17:17:47.340019 I | rsm: [00000:00003] applied ADD ccid 0 (2), n00002 (127.0.0.1:3002)
|| 2023-02-14 17:17:47.340019 I | rsm: [00000:00003] applied ADD ccid 0 (3), n00003 (127.0.0.1:3003)
|| 2023-02-14 17:17:48.241145 W | raft: [f:1,l:1603,t:13,c:1603,a:1603] [00000:00003] t13 dropped ReadIndex, no leader
|| phase3
|| 2023-02-14 17:17:49.278337 E | transport: send batch failed, target 127.0.0.1:3002 (write tcp 127.0.0.1:52310->127.0.0.1:3002: wsasend: An existing connection was forcibly closed by the remote host.), 1
|| 2023-02-14 17:17:49.278337 W | transport: breaker 127.0.0.1:3001 to 127.0.0.1:3002 failed, connect and process failed: write tcp 127.0.0.1:52310->127.0.0.1:3002: wsasend: An existing connection was forcibly closed by the remote host.
|| 2023-02-14 17:17:49.278337 W | transport: 127.0.0.1:3002 became unreachable, affected 1 nodes
|| 2023-02-14 17:17:50.289025 I | [WARN] node 1 shard 0 tick err: timeout
|| 2023-02-14 17:17:51.129979 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00003] t14 became candidate
|| 2023-02-14 17:17:51.129979 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00003] t14 received RequestVoteResp from n00003
|| 2023-02-14 17:17:51.129979 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00003] t14 sent RequestVote to n00002
|| 2023-02-14 17:17:51.129979 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00003] t14 sent RequestVote to n00001
|| 2023-02-14 17:17:51.130111 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 became candidate
|| 2023-02-14 17:17:51.130111 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 received RequestVoteResp from n00001
|| 2023-02-14 17:17:51.130111 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 sent RequestVote to n00002
|| 2023-02-14 17:17:51.130111 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 sent RequestVote to n00003
|| 2023-02-14 17:17:51.131138 E | transport: send batch failed, target 127.0.0.1:3002 (write tcp 127.0.0.1:52319->127.0.0.1:3002: wsasend: An existing connection was forcibly closed by the remote host.), 1
|| 2023-02-14 17:17:51.131138 W | transport: breaker 127.0.0.1:3003 to 127.0.0.1:3002 failed, connect and process failed: write tcp 127.0.0.1:52319->127.0.0.1:3002: wsasend: An existing connection was forcibly closed by the remote host.
|| 2023-02-14 17:17:51.131138 W | transport: 127.0.0.1:3002 became unreachable, affected 1 nodes
|| 2023-02-14 17:17:51.132179 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 rejected vote n00003 index1773 term14,logterm13,grantfalse,utdtrue
|| 2023-02-14 17:17:51.132678 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00003] t14 rejected vote n00001 index1773 term14,logterm13,grantfalse,utdtrue
|| 2023-02-14 17:17:51.132678 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00003] t14 received RequestVoteResp rejection from n00001
|| 2023-02-14 17:17:51.132678 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00003] t14 received 1 votes and 1 rejections, quorum is 2
|| 2023-02-14 17:17:51.132678 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 received RequestVoteResp rejection from n00003
|| 2023-02-14 17:17:51.132678 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 received 1 votes and 1 rejections, quorum is 2
|| 2023-02-14 17:17:51.301186 I | [WARN] node 1 shard 0 tick err: timeout
|| 2023-02-14 17:17:51.301186 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.301186 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.316982 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.316982 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.347969 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.347969 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.379060 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.379060 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.410740 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.410740 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.442249 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.442249 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.473248 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.473248 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.504398 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.504398 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.534938 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.535329 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.566274 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.566274 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.597141 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.597141 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.627855 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.627855 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.658238 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.658238 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.689227 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.689227 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.720035 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.720035 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.736070 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.736070 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.781256 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.781256 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.796270 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.796270 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.827174 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.827174 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.858269 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.858269 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.889278 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.889278 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.920060 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.920060 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.951175 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.951175 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:51.982435 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:51.982435 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:52.013788 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:52.013788 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:52.044462 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:52.044462 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:52.075132 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:52.075132 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:52.106424 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:52.106424 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:52.136909 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:52.136909 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:52.168862 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:52.169017 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:52.200652 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:52.200652 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:52.231472 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:52.231515 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:52.247104 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:52.247104 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| phase4
|| 2023-02-14 17:17:52.262667 I | dragonboat: go version: go1.19, windows/amd64
|| 2023-02-14 17:17:52.262667 I | dragonboat: dragonboat version: 3.3.5 (Rel)
|| 2023-02-14 17:17:52.262667 W | dragonboat: unsupported OS/ARCH windows/amd64, don't use for production
|| 2023-02-14 17:17:52.262667 I | config: using default EngineConfig
|| 2023-02-14 17:17:52.262667 I | config: using default LogDBConfig
|| 2023-02-14 17:17:52.262667 I | dragonboat: DeploymentID set to 1
|| 2023-02-14 17:17:52.274229 I | dragonboat: LogDB info received, shard 0, busy false
|| 2023-02-14 17:17:52.276866 I | dragonboat: LogDB info received, shard 1, busy false
|| 2023-02-14 17:17:52.276866 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:52.277376 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:52.279739 I | dragonboat: LogDB info received, shard 2, busy false
|| 2023-02-14 17:17:52.282500 I | dragonboat: LogDB info received, shard 3, busy false
|| 2023-02-14 17:17:52.285111 I | dragonboat: LogDB info received, shard 4, busy false
|| 2023-02-14 17:17:52.288676 I | dragonboat: LogDB info received, shard 5, busy false
|| 2023-02-14 17:17:52.291677 I | dragonboat: LogDB info received, shard 6, busy false
|| 2023-02-14 17:17:52.295431 I | dragonboat: LogDB info received, shard 7, busy false
|| 2023-02-14 17:17:52.299127 I | dragonboat: LogDB info received, shard 8, busy false
|| 2023-02-14 17:17:52.302487 I | dragonboat: LogDB info received, shard 9, busy false
|| 2023-02-14 17:17:52.306194 I | dragonboat: LogDB info received, shard 10, busy false
|| 2023-02-14 17:17:52.306194 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:52.306194 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:52.309415 I | dragonboat: LogDB info received, shard 11, busy false
|| 2023-02-14 17:17:52.313151 I | dragonboat: LogDB info received, shard 12, busy false
|| 2023-02-14 17:17:52.316489 I | dragonboat: LogDB info received, shard 13, busy false
|| 2023-02-14 17:17:52.319708 I | dragonboat: LogDB info received, shard 14, busy false
|| 2023-02-14 17:17:52.322982 I | logdb: using plain logdb
|| 2023-02-14 17:17:52.322982 I | dragonboat: LogDB info received, shard 15, busy false
|| 2023-02-14 17:17:52.323501 I | dragonboat: logdb memory limit: 8192 MBytes
|| 2023-02-14 17:17:52.323501 I | dragonboat: NodeHost ID: nhid-2
|| 2023-02-14 17:17:52.323552 I | dragonboat: using regular node registry
|| 2023-02-14 17:17:52.323552 I | dragonboat: filesystem error injection mode enabled: false
|| 2023-02-14 17:17:52.323752 I | transport: transport type: go-tcp-transport
|| 2023-02-14 17:17:52.323752 I | dragonboat: transport type: go-tcp-transport
|| 2023-02-14 17:17:52.323752 I | dragonboat: logdb type: sharded-pebble
|| 2023-02-14 17:17:52.323752 I | dragonboat: nodehost address: 127.0.0.1:3002
|| 2023-02-14 17:17:52.324280 I | dragonboat: [00000:00002] replaying raft logs
|| 2023-02-14 17:17:52.324794 I | dragonboat: [00000:00002] has logdb entries size 1773 commit 1773 term 13
|| 2023-02-14 17:17:52.324794 I | raft: [00000:00002] created, initial: true, new: false
|| 2023-02-14 17:17:52.324794 I | raft: [00000:00002] raft log rate limit enabled: false, 0
|| 2023-02-14 17:17:52.324794 I | raft: [f:1,l:1773,t:13,c:1773,a:0] [00000:00002] t13 became follower
|| 2023-02-14 17:17:52.324794 I | rsm: [00000:00002] no snapshot available during launch
|| 2023-02-14 17:17:52.324794 I | dragonboat: [00000:00002] initialized using <00000:00002:0>
|| 2023-02-14 17:17:52.324794 I | dragonboat: [00000:00002] initial index set to 0
|| 2023-02-14 17:17:52.339541 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 dropped proposal, no leader
|| 2023-02-14 17:17:52.339541 I | [WARN] node 1 shard 0 tick err: request dropped as the cluster is not ready
|| 2023-02-14 17:17:52.340739 W | raft: [f:1,l:1773,t:13,c:1773,a:0] [00000:00002] t13 dropped proposal, no leader
|| 2023-02-14 17:17:52.340739 W | raft: [f:1,l:1773,t:13,c:1773,a:0] [00000:00002] t13 received RequestVote with higher term (14) from n00001
|| 2023-02-14 17:17:52.340739 W | raft: [f:1,l:1773,t:13,c:1773,a:0] [00000:00002] t13 become followerKE after receiving higher term from n00001
|| 2023-02-14 17:17:52.340739 I | raft: [f:1,l:1773,t:13,c:1773,a:0] [00000:00002] t14 became follower
|| 2023-02-14 17:17:52.340739 W | raft: [f:1,l:1773,t:13,c:1773,a:0] [00000:00002] t14 cast vote from n00001 index 1773 term 14, log term: 13
|| 2023-02-14 17:17:52.341333 I | rsm: [00000:00002] applied ADD ccid 0 (1), n00001 (127.0.0.1:3001)
|| 2023-02-14 17:17:52.341333 I | rsm: [00000:00002] applied ADD ccid 0 (2), n00002 (127.0.0.1:3002)
|| 2023-02-14 17:17:52.341333 I | rsm: [00000:00002] applied ADD ccid 0 (3), n00003 (127.0.0.1:3003)
|| 2023-02-14 17:17:52.344656 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 received RequestVoteResp from n00002
|| 2023-02-14 17:17:52.344656 W | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00001] t14 received 2 votes and 1 rejections, quorum is 2
|| 2023-02-14 17:17:52.344656 I | raft: [f:1,l:1774,t:14,c:1773,a:1773] [00000:00001] t14 became leader
|| 2023-02-14 17:17:52.344760 I | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00003] t14 became follower
|| phase5
|| phase6
|| 2023-02-14 17:17:53.358028 E | transport: send batch failed, target 127.0.0.1:3001 (write tcp 127.0.0.1:52330->127.0.0.1:3001: wsasend: An existing connection was forcibly closed by the remote host.), 1
|| 2023-02-14 17:17:53.358028 W | transport: breaker 127.0.0.1:3003 to 127.0.0.1:3001 failed, connect and process failed: write tcp 127.0.0.1:52330->127.0.0.1:3001: wsasend: An existing connection was forcibly closed by the remote host.
|| 2023-02-14 17:17:53.358028 W | transport: 127.0.0.1:3001 became unreachable, affected 1 nodes
|| 2023-02-14 17:17:54.368471 I | [WARN] node 3 shard 0 tick err: timeout
|| 2023-02-14 17:17:54.429976 W | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00002] t15 became candidate
|| 2023-02-14 17:17:54.429976 W | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00002] t15 received RequestVoteResp from n00002
|| 2023-02-14 17:17:54.429976 W | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00002] t15 sent RequestVote to n00003
|| 2023-02-14 17:17:54.429976 W | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00002] t15 sent RequestVote to n00001
|| 2023-02-14 17:17:54.436908 E | transport: send batch failed, target 127.0.0.1:3001 (write tcp 127.0.0.1:52332->127.0.0.1:3001: wsasend: An existing connection was forcibly closed by the remote host.), 1
|| 2023-02-14 17:17:54.436908 W | transport: breaker 127.0.0.1:3002 to 127.0.0.1:3001 failed, connect and process failed: write tcp 127.0.0.1:52332->127.0.0.1:3001: wsasend: An existing connection was forcibly closed by the remote host.
|| 2023-02-14 17:17:54.436908 W | transport: 127.0.0.1:3001 became unreachable, affected 1 nodes
|| 2023-02-14 17:17:54.438487 W | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00003] t14 received RequestVote with higher term (15) from n00002
|| 2023-02-14 17:17:54.438487 W | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00003] t14 become followerKE after receiving higher term from n00002
|| 2023-02-14 17:17:54.438487 I | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00003] t15 became follower
|| 2023-02-14 17:17:54.438487 W | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00003] t15 cast vote from n00002 index 1807 term 15, log term: 14
|| 2023-02-14 17:17:54.441201 W | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00002] t15 received RequestVoteResp from n00003
|| 2023-02-14 17:17:54.441201 W | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00002] t15 received 2 votes and 0 rejections, quorum is 2
|| 2023-02-14 17:17:54.441201 I | raft: [f:1,l:1808,t:15,c:1807,a:1807] [00000:00002] t15 became leader
|| 2023-02-14 17:17:55.374277 I | [WARN] node 3 shard 0 tick err: timeout
|| phase7
|| 2023-02-14 17:17:56.350925 I | dragonboat: go version: go1.19, windows/amd64
|| 2023-02-14 17:17:56.350925 I | dragonboat: dragonboat version: 3.3.5 (Rel)
|| 2023-02-14 17:17:56.350925 W | dragonboat: unsupported OS/ARCH windows/amd64, don't use for production
|| 2023-02-14 17:17:56.351246 I | config: using default EngineConfig
|| 2023-02-14 17:17:56.351246 I | config: using default LogDBConfig
|| 2023-02-14 17:17:56.351352 I | dragonboat: DeploymentID set to 1
|| 2023-02-14 17:17:56.362717 I | dragonboat: LogDB info received, shard 0, busy false
|| 2023-02-14 17:17:56.367712 I | dragonboat: LogDB info received, shard 0, busy false
|| 2023-02-14 17:17:56.369271 I | dragonboat: LogDB info received, shard 1, busy false
|| 2023-02-14 17:17:56.372607 I | dragonboat: LogDB info received, shard 2, busy false
|| 2023-02-14 17:17:56.376247 I | dragonboat: LogDB info received, shard 3, busy false
|| 2023-02-14 17:17:56.379276 I | dragonboat: LogDB info received, shard 4, busy false
|| 2023-02-14 17:17:56.382486 I | dragonboat: LogDB info received, shard 5, busy false
|| 2023-02-14 17:17:56.385099 I | dragonboat: LogDB info received, shard 6, busy false
|| 2023-02-14 17:17:56.388554 I | dragonboat: LogDB info received, shard 7, busy false
|| 2023-02-14 17:17:56.391690 I | dragonboat: LogDB info received, shard 8, busy false
|| 2023-02-14 17:17:56.394827 I | dragonboat: LogDB info received, shard 9, busy false
|| 2023-02-14 17:17:56.398190 I | dragonboat: LogDB info received, shard 10, busy false
|| 2023-02-14 17:17:56.400910 I | dragonboat: LogDB info received, shard 11, busy false
|| 2023-02-14 17:17:56.403893 I | dragonboat: LogDB info received, shard 12, busy false
|| 2023-02-14 17:17:56.406893 I | dragonboat: LogDB info received, shard 13, busy false
|| 2023-02-14 17:17:56.409894 I | dragonboat: LogDB info received, shard 14, busy false
|| 2023-02-14 17:17:56.412893 I | logdb: using plain logdb
|| 2023-02-14 17:17:56.412893 I | dragonboat: LogDB info received, shard 15, busy false
|| 2023-02-14 17:17:56.412893 I | dragonboat: logdb memory limit: 8192 MBytes
|| 2023-02-14 17:17:56.412893 I | dragonboat: NodeHost ID: nhid-1
|| 2023-02-14 17:17:56.412893 I | dragonboat: using regular node registry
|| 2023-02-14 17:17:56.412893 I | dragonboat: filesystem error injection mode enabled: false
|| 2023-02-14 17:17:56.414042 E | transport: Nodehost 127.0.0.1:3003 failed to get a connection to 127.0.0.1:3001, dial tcp 127.0.0.1:3001: connectex: No connection could be made because the target machine actively refused it.
|| 2023-02-14 17:17:56.414042 W | transport: breaker 127.0.0.1:3003 to 127.0.0.1:3001 failed, connect and process failed: dial tcp 127.0.0.1:3001: connectex: No connection could be made because the target machine actively refused it.
|| 2023-02-14 17:17:56.414120 W | transport: 127.0.0.1:3001 became unreachable, affected 0 nodes
|| 2023-02-14 17:17:56.414120 I | transport: transport type: go-tcp-transport
|| 2023-02-14 17:17:56.414120 I | dragonboat: transport type: go-tcp-transport
|| 2023-02-14 17:17:56.414120 I | dragonboat: logdb type: sharded-pebble
|| 2023-02-14 17:17:56.414120 I | dragonboat: nodehost address: 127.0.0.1:3001
|| 2023-02-14 17:17:56.414120 I | dragonboat: [00000:00001] replaying raft logs
|| 2023-02-14 17:17:56.415156 I | dragonboat: [00000:00001] has logdb entries size 1807 commit 1807 term 14
|| 2023-02-14 17:17:56.415156 I | raft: [00000:00001] created, initial: true, new: false
|| 2023-02-14 17:17:56.415156 I | raft: [00000:00001] raft log rate limit enabled: false, 0
|| 2023-02-14 17:17:56.415156 I | raft: [f:1,l:1807,t:14,c:1807,a:0] [00000:00001] t14 became follower
|| 2023-02-14 17:17:56.415156 I | rsm: [00000:00001] no snapshot available during launch
|| 2023-02-14 17:17:56.415156 I | dragonboat: [00000:00001] initialized using <00000:00001:0>
|| 2023-02-14 17:17:56.415156 I | dragonboat: [00000:00001] initial index set to 0
|| 2023-02-14 17:17:56.523627 I | rsm: [00000:00001] applied ADD ccid 0 (1), n00001 (127.0.0.1:3001)
|| 2023-02-14 17:17:56.523627 I | rsm: [00000:00001] applied ADD ccid 0 (2), n00002 (127.0.0.1:3002)
|| 2023-02-14 17:17:56.523627 I | rsm: [00000:00001] applied ADD ccid 0 (3), n00003 (127.0.0.1:3003)
|| 2023-02-14 17:17:56.881657 W | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00001] t14 received Heartbeat with higher term (15) from n00002
|| 2023-02-14 17:17:56.881657 W | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00001] t14 become follower after receiving higher term from n00002
|| 2023-02-14 17:17:56.882685 I | raft: [f:1,l:1807,t:14,c:1807,a:1807] [00000:00001] t15 became follower
|| phase8
|| [Finished in 22 seconds with code 1]

You can see multiple tick timeouts and "no leader" periods after phase1, which is not expected.

lni commented 1 year ago

Please provide more description of what your test program is trying to do, what is expected, and what actually happened.

kevburnsjr commented 1 year ago

@hulucc Your logs indicate that the first leader change request was dropped because the cluster was not yet stable.

|| 2023-02-14 16:51:15.378999 W | raft: [f:1,l:1330,t:13,c:1330,a:0] [00000:00001] t13 dropped LeaderTransfer, no leader

... so when your code calls Stop on nh3 in phase1, it appears to intend to stop a follower, but it is actually stopping the leader.

|| 2023-02-14 16:51:16.595686 I | raft: [f:1,l:1331,t:14,c:1330,a:1330] [00000:00003] t14 became leader
|| phase1

In the logs, [00000:00003] t14 became leader means clusterID 0, nodeID 3. This can help when debugging such issues.

Similarly, at the beginning of phase6 your code appears to expect node 3 to be the leader, but the leader is actually node 1.

|| 2023-02-14 17:17:52.344656 I | raft: [f:1,l:1774,t:14,c:1773,a:1773] [00000:00001] t14 became leader
|| 2023-02-14 17:17:52.344760 I | raft: [f:1,l:1773,t:13,c:1773,a:1773] [00000:00003] t14 became follower
|| phase5
|| phase6

RequestLeaderTransfer makes a request to transfer the leadership of the specified Raft cluster to the target node identified by targetNodeID. It returns an error if the request fails to be started. There is no guarantee that such request can be fulfilled, i.e. the leadership transfer can still fail after a successful return of the RequestLeaderTransfer method.

Were you expecting RequestLeaderTransfer to be synchronous? Or were you expecting the nodehost to automatically request a synchronous leader change for all active clusters of which the host is a leader prior to returning from Stop? Or both?
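
If you need the transfer to behave synchronously, one option is to poll for the outcome after issuing the request. A minimal sketch against the v3 API (transferLeaderSync is a hypothetical helper; the polling interval and deadline are illustrative):

package main

import (
    "fmt"
    "time"

    "github.com/lni/dragonboat/v3"
)

// transferLeaderSync issues an asynchronous RequestLeaderTransfer, then polls
// GetLeaderID until the target node is observed as leader or the deadline
// passes. Note this still gives no hard guarantee: the transfer can fail or a
// different leader can be elected, which surfaces here as a timeout.
func transferLeaderSync(nh *dragonboat.NodeHost, clusterID, targetNodeID uint64, timeout time.Duration) error {
    if err := nh.RequestLeaderTransfer(clusterID, targetNodeID); err != nil {
        return err
    }
    deadline := time.Now().Add(timeout)
    for time.Now().Before(deadline) {
        leaderID, valid, err := nh.GetLeaderID(clusterID)
        if err == nil && valid && leaderID == targetNodeID {
            return nil
        }
        time.Sleep(100 * time.Millisecond)
    }
    return fmt.Errorf("leader transfer to node %d timed out", targetNodeID)
}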

hulucc commented 1 year ago

@kevburnsjr Yes, I hadn't realized that RequestLeaderTransfer is async. After making the leader transfer synchronous, there is no problem with the graceful rolling update. Thanks for the help. Here is the fixed code for anyone who may need it.

package main

import (
    "context"
    "encoding/binary"
    "fmt"
    "io"
    "log"
    "time"
    "errors"

    "github.com/lni/dragonboat/v3"
    "github.com/lni/dragonboat/v3/config"
    sm "github.com/lni/dragonboat/v3/statemachine"
    "github.com/lni/goutils/syncutil"
)

type exampleStateMachine struct {
    ClusterID uint64
    NodeID    uint64
    Tick      uint64
}

func NewExampleStateMachine(clusterID uint64, nodeID uint64) sm.IStateMachine {
    return &exampleStateMachine{ClusterID: clusterID, NodeID: nodeID}
}
func (it *exampleStateMachine) Update(req []byte) (sm.Result, error) {
    it.Tick++
    return sm.Result{}, nil
}
func (it *exampleStateMachine) Lookup(req interface{}) (interface{}, error) {
    return it.Tick, nil
}
func (it *exampleStateMachine) SaveSnapshot(w io.Writer, _ sm.ISnapshotFileCollection, _ <-chan struct{}) error {
    data := make([]byte, 8)
    binary.LittleEndian.PutUint64(data, it.Tick)
    // propagate write errors instead of silently dropping them
    _, err := w.Write(data)
    return err
}
func (it *exampleStateMachine) RecoverFromSnapshot(r io.Reader, _ []sm.SnapshotFile, _ <-chan struct{}) error {
    // propagate read errors instead of silently dropping them
    data, err := io.ReadAll(r)
    if err != nil {
        return err
    }
    it.Tick = binary.LittleEndian.Uint64(data)
    return nil
}
func (it *exampleStateMachine) Close() error {
    return nil
}

func newNodeHost(nhid uint64, addr string) *dragonboat.NodeHost {
    conf := config.NodeHostConfig{
        NodeHostDir:         fmt.Sprintf("single_nodehost_test_dir_safe_to_delete/%d", nhid),
        AddressByNodeHostID: false,
        RTTMillisecond:      100,
        RaftAddress:         addr,
        Expert: config.ExpertConfig{
            TestNodeHostID: nhid,
        },
    }
    nh, err := dragonboat.NewNodeHost(conf)
    if err != nil {
        panic(err)
    }
    return nh
}

func startShard(nh *dragonboat.NodeHost, clusterID uint64, nodeID uint64, initials map[uint64]string, fn interface{}) {
    conf := config.Config{
        NodeID:              nodeID,
        ClusterID:           clusterID,
        CheckQuorum:         true,
        ElectionRTT:         10,
        HeartbeatRTT:        1,
        SnapshotEntries:     0,
        CompactionOverhead:  0,
        OrderedConfigChange: false,
    }
    var err error
    switch f := fn.(type) {
    case sm.CreateStateMachineFunc:
        if len(initials) > 0 {
            err = nh.StartCluster(initials, false, f, conf)
        } else {
            err = nh.StartCluster(nil, false, f, conf)
        }
    case sm.CreateConcurrentStateMachineFunc:
        if len(initials) > 0 {
            err = nh.StartConcurrentCluster(initials, false, f, conf)
        } else {
            err = nh.StartConcurrentCluster(nil, false, f, conf)
        }
    default:
        panic(fmt.Errorf("unknown fn type: %T", f))
    }
    if err != nil {
        panic(err)
    }
}

func tick(nh *dragonboat.NodeHost, clusterID uint64) *syncutil.Stopper {
    stopper := syncutil.NewStopper()
    stopper.RunWorker(func() {
        t := time.NewTicker(time.Millisecond * 30)
        defer t.Stop()
        nop := nh.GetNoOPSession(clusterID)
        ctx := context.Background()
        for {
            select {
            case <-t.C:
                ctx1, cancel := context.WithTimeout(ctx, time.Second)
                _, err := nh.SyncPropose(ctx1, nop, []byte("tick"))
                cancel()
                if err != nil {
                    log.Println("[WARN]", fmt.Errorf("node %d shard %d tick err: %w", nh.NodeHostConfig().Expert.TestNodeHostID, clusterID, err))
                }
            case <-stopper.ShouldStop():
                return
            }
        }
    })
    return stopper
}

func waitReady(nh *dragonboat.NodeHost, clusterID uint64) {
    for {
        ctx, cancel := context.WithTimeout(context.TODO(), time.Second)
        _, err := nh.SyncRead(ctx, clusterID, nil)
        cancel() // cancel per iteration; a defer inside the loop would leak contexts
        if errors.Is(err, dragonboat.ErrClusterNotReady) {
            time.Sleep(time.Second)
            continue
        }
        return
    }
}

func waitLeader(nh *dragonboat.NodeHost, clusterID uint64) {
    for {
        leaderNodeID, valid, err := nh.GetLeaderID(clusterID)
        if !valid || err != nil {
            time.Sleep(time.Second)
            continue
        }
        // HasNodeInfo reports whether this NodeHost hosts the given node,
        // i.e. whether the observed leader is running locally.
        if nh.HasNodeInfo(clusterID, leaderNodeID) {
            return
        }
        time.Sleep(time.Second) // avoid busy-waiting while another node is leader
    }
}

func main() {
    clusterID := uint64(0)
    fn := sm.CreateStateMachineFunc(NewExampleStateMachine)
    initials := map[uint64]string{
        1: "127.0.0.1:3001",
        2: "127.0.0.1:3002",
        3: "127.0.0.1:3003",
    }
    nh1 := newNodeHost(1, initials[1])
    nh2 := newNodeHost(2, initials[2])
    nh3 := newNodeHost(3, initials[3])
    startShard(nh1, clusterID, 1, initials, fn)
    startShard(nh2, clusterID, 2, initials, fn)
    startShard(nh3, clusterID, 3, initials, fn)
    waitReady(nh1, clusterID)
    if err := nh1.RequestLeaderTransfer(clusterID, 1); err != nil {
        log.Fatalln("[FATAL]", fmt.Errorf("RequestLeaderTransfer err: %w", err))
    }
    waitLeader(nh1, clusterID)
    stopper := tick(nh1, clusterID)
    defer stopper.Stop()
    println("phase1")

    nh3.Stop()
    time.Sleep(time.Second*3)
    println("phase2")

    nh3 = newNodeHost(3, initials[3])
    startShard(nh3, clusterID, 3, nil, fn)
    waitReady(nh3, clusterID)
    println("phase3")

    nh2.Stop()
    time.Sleep(time.Second*3)
    println("phase4")

    nh2 = newNodeHost(2, initials[2])
    startShard(nh2, clusterID, 2, nil, fn)
    waitReady(nh2, clusterID)
    println("phase5")

    stopper.Stop()
    stopper = tick(nh3, clusterID)

    if err := nh3.RequestLeaderTransfer(clusterID, 3); err != nil {
        log.Fatalln("[FATAL]", fmt.Errorf("RequestLeaderTransfer err: %w", err))
    }
    waitLeader(nh3, clusterID)
    println("phase6")

    nh1.Stop()
    time.Sleep(time.Second*3)
    println("phase7")

    nh1 = newNodeHost(1, initials[1])
    startShard(nh1, clusterID, 1, nil, fn)
    waitReady(nh1, clusterID)
    println("phase8")

    time.Sleep(time.Second*5)
}