A client that requests a VStream from a replica tablet can block that tablet from being transitioned from a replica to a primary. I think this is a bug.
Reproduction
S/o to @ryanpbrewster who shared this repro with me!
Create cluster with an unsharded keyspace, with a primary and replica
Create a data table used purely for inserts to generate VStream events:
CREATE TABLE data (id INT NOT NULL AUTO_INCREMENT, text1 VARCHAR(36), PRIMARY KEY(id))
Open a VStream on the replica
Generate a backlog of VStream events by INSERTing records on the primary
Begin sending ApplyVSchema events to the primary
While that is happening, try to transition the replica to primary
Removing either steps 3 or 5 will allow the replica to transition to primary.
The script below carries out steps 3-6. It can be executed with ./main [topo-addr] [keyspace] [cell].
package main
import (
"context"
"log"
"time"
"google.golang.org/grpc"
"vitess.io/vitess/go/netutil"
"vitess.io/vitess/go/vt/grpcclient"
"vitess.io/vitess/go/vt/proto/binlogdata"
"vitess.io/vitess/go/vt/proto/query"
"vitess.io/vitess/go/vt/proto/queryservice"
"vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/proto/vschema"
"vitess.io/vitess/go/vt/proto/vtctldata"
"vitess.io/vitess/go/vt/vtctl/grpcvtctldclient"
"vitess.io/vitess/go/vt/vttablet/grpctmclient"
)
func main() {
ctx := context.Background()
// Connect to topo indirectly via vtctld.
vc, err := grpcvtctldclient.NewWithDialOpts(
"vitess-cluster-dev-vtctld-372e9986:15999", /* address */
true, /* fail fast */
grpc.WithInsecure(),
)
if err != nil {
log.Fatalf("failed to get new grpc client: %s", err.Error())
}
// Get a replica tablet and primary tablet
resp, err := vc.GetTablets(
ctx,
&vtctldata.GetTabletsRequest{
Cells: []string{"local"},
Keyspace: "src",
},
grpc.FailFast(true),
)
if err != nil {
log.Fatalf("failed to get replica tablets: %v", err)
}
tablets := resp.GetTablets()
if len(tablets) == 0 {
log.Fatalf("no tablets found")
}
var primary *topodata.Tablet
var replica *topodata.Tablet
for _, tablet := range tablets {
if tablet.Type == topodata.TabletType_PRIMARY {
primary = tablet
}
if tablet.Type == topodata.TabletType_REPLICA {
replica = tablet
}
}
if primary == nil {
log.Fatalf("failed to get primary tablet")
}
if replica == nil {
log.Fatalf("failed to get replica tablet")
}
// Create queryservice to replica.
addr := ""
if grpcPort, ok := replica.PortMap["grpc"]; ok {
addr = netutil.JoinHostPort(replica.Hostname, grpcPort)
} else {
addr = replica.Hostname
}
rcc, err := grpcclient.Dial(
addr,
true, /* fail fast */
grpc.WithInsecure(), /* not recommended for production use! */
)
if err != nil {
log.Fatalf("failed to dial tablet: %v", err)
}
rqc := queryservice.NewQueryClient(rcc)
// Create queryservice to primary.
addr = ""
if grpcPort, ok := primary.PortMap["grpc"]; ok {
addr = netutil.JoinHostPort(primary.Hostname, grpcPort)
} else {
addr = primary.Hostname
}
pcc, err := grpcclient.Dial(
addr,
true, /* fail fast */
grpc.WithInsecure(), /* not recommended for production use! */
)
if err != nil {
log.Fatalf("failed to dial tablet: %v", err)
}
pqc := queryservice.NewQueryClient(pcc)
// Create VStream from replica.
stream, err := rqc.VStream(context.TODO(), &binlogdata.VStreamRequest{
Target: &query.Target{
Keyspace: replica.Keyspace,
Shard: replica.Shard,
TabletType: replica.Type,
},
Position: "current",
Filter: &binlogdata.Filter{},
})
if err != nil {
log.Fatalf("client.VStream: %v", err)
}
log.Printf("set up vstream: %v", stream)
log.Printf("wait 60 seconds")
wait := time.Duration(60 * time.Second)
var left = wait
for start := time.Now(); time.Since(start) < wait; left = wait - time.Since(start) {
log.Printf("T-%v", left)
time.Sleep(1 * time.Second)
}
log.Printf("waiting for an initial vstream event")
for {
resp, err := stream.Recv()
if err != nil {
log.Printf("failed to recv vstream resp: %v", err)
}
log.Printf("got an initial vstream event: %v", resp)
break
}
// Trigger a big backlog of vstream events on the primary.
// This was done ahead of time: create table data(id int not null auto_increment, text1 varchar(36), primary key(id));
log.Printf("inserting a bunch of data on primary to generate a backlog of vstream events")
for i := 0; i < 5000; i++ {
_, err := pqc.Execute(
ctx,
&query.ExecuteRequest{
Target: &query.Target{
Keyspace: primary.Keyspace,
Shard: primary.Shard,
TabletType: primary.Type,
},
Query: &query.BoundQuery{
BindVariables: make(map[string]*query.BindVariable),
Sql: "INSERT INTO data(text1) VALUES(uuid())",
},
},
grpc.FailFast(true),
)
if err != nil {
log.Fatalf("failed to insert data: %v", err)
}
}
ch1 := make(chan struct{})
ch2 := make(chan struct{})
// SetVSchema. VStreamer will be blocked from sending events to us.
// This should block VStreamer from sending events to stream, because the
// event buffer is small.
log.Printf("sending a bunch of ApplyVSchema requests to the tablet's cell/keyspace")
go func(ch1 chan<- struct{}, ch2 <-chan struct{}) {
count := 0
for {
resp, err := vc.ApplyVSchema(
ctx,
&vtctldata.ApplyVSchemaRequest{
Cells: []string{primary.Alias.GetCell()},
Keyspace: primary.Keyspace,
VSchema: &vschema.Keyspace{},
},
grpc.FailFast(true),
)
if err != nil {
log.Fatalf("failed to ApplyVSchema: %v", err)
}
log.Printf("vschema after ApplyVSchema: %v", resp.VSchema)
count++
if count == 10 {
close(ch1)
}
time.Sleep(1 * time.Second)
}
}(ch1, ch2)
<-ch1
// Change tablet type.
tmc := grpctmclient.NewClient()
log.Printf("changing tablet type to primary")
if err := tmc.ChangeType(context.TODO(), replica, topodata.TabletType_PRIMARY, false /*semi-sync*/); err != nil {
log.Fatalf("failed to change tablet to primary: %v", err)
}
log.Printf("changed tablet type to primary")
close(ch2)
}
Overview
A client that requests a VStream from a replica tablet can block that tablet from being transitioned from a replica to a primary. I think this is a bug.
Reproduction
S/o to @ryanpbrewster who shared this repro with me!
data
table used purely for inserts to generate VStream events:Removing either steps 3 or 5 will allow the replica to transition to primary.
The script below carries out steps 3-6. It can be executed with
./main [topo-addr] [keyspace] [cell]
.Environment
This repro was tested against Vitess main:
In addition to the HEAD commit, I have some local log statements in Vitess to get a better understanding of where things are deadlocking.
Logs
Here are logs from the repro program, as well as from the replica tablet.
Repro logs
Replica logs