Closed twthorn closed 6 months ago
I'm able to reproduce the problem using the local examples on main, as follows:
git checkout main && make build
pushd examples/local
# Apply this diff
diff --git a/examples/common/scripts/vttablet-up.sh b/examples/common/scripts/vttablet-up.sh
index daa40aee89..b572018188 100755
--- a/examples/common/scripts/vttablet-up.sh
+++ b/examples/common/scripts/vttablet-up.sh
@@ -54,6 +54,7 @@ vttablet \
--service_map 'grpc-queryservice,grpc-tabletmanager,grpc-updatestream' \
--pid_file $VTDATAROOT/$tablet_dir/vttablet.pid \
--heartbeat_on_demand_duration=5s \
+ --enable_replication_reporter \
> $VTDATAROOT/$tablet_dir/vttablet.out 2>&1 &
# Block waiting for the tablet to be listening
diff --git a/examples/local/101_initial_cluster.sh b/examples/local/101_initial_cluster.sh
index 95b51f168c..d0a1f3a507 100755
--- a/examples/local/101_initial_cluster.sh
+++ b/examples/local/101_initial_cluster.sh
@@ -48,7 +48,7 @@ else
# correct durability policy. Please see the comment above for
# more context on using a custom sidecar database name in your
# Vitess clusters.
- vtctldclient CreateKeyspace --sidecar-db-name="${SIDECAR_DB_NAME}" --durability-policy=semi_sync commerce || fail "Failed to create and configure the commerce keyspace"
+ vtctldclient CreateKeyspace --sidecar-db-name="${SIDECAR_DB_NAME}" --durability-policy=none commerce || fail "Failed to create and configure the commerce keyspace"
fi
# start mysqlctls for keyspace commerce
diff --git a/examples/local/vstream_client.go b/examples/local/vstream_client.go
index 98d2129f89..30a611acb4 100644
--- a/examples/local/vstream_client.go
+++ b/examples/local/vstream_client.go
@@ -38,7 +38,7 @@ import (
*/
func main() {
ctx := context.Background()
- streamCustomer := true
+ streamCustomer := false
var vgtid *binlogdatapb.VGtid
if streamCustomer {
vgtid = &binlogdatapb.VGtid{
@@ -76,7 +76,7 @@ func main() {
//MinimizeSkew: false,
//HeartbeatInterval: 60, //seconds
}
- reader, err := conn.VStream(ctx, topodatapb.TabletType_PRIMARY, vgtid, filter, flags)
+ reader, err := conn.VStream(ctx, topodatapb.TabletType_REPLICA, vgtid, filter, flags)
for {
e, err := reader.Recv()
switch err {
# Set up the cluster
./101_initial_cluster.sh; mysql < ../common/insert_commerce_data.sql
# Stop vtorc so that it doesn't repair replication
../common/scripts/vtorc-down.sh
# Start the vstream
go run vstream_client.go
# In another Terminal window
mysql -e "insert into customer values (100, 'mlord1@planetscale.com')"
# Stop replication on the 1 replica tablet
replicauid=$(vtctldclient GetTablets --keyspace commerce --tablet-type replica | awk '{print $1}' | cut -d- -f2 | bc)
command mysql -u root --socket=${VTDATAROOT}/vt_0000000${replicauid}/mysql.sock --binary-as-hex=false vt_commerce -e "stop replica"
command mysql -u root --socket=${VTDATAROOT}/vt_0000000${replicauid}/mysql.sock --binary-as-hex=false vt_commerce -e "show replica status\G" | grep Running
# Wait beyond the default healthy replica threshold
sleep 40
curl -s localhost:15${replicauid}/debug/status_details
mysql -e "insert into customer values (200, 'mlord2@planetscale.com')"
grep vstream /opt/vtdataroot/tmp/vtgate.WARNING
The VStream never tries to reinitialize and select a new tablet, and it never receives the second INSERT because the replica it is connected to (where replication was stopped) never receives that row:
[type:BEGIN keyspace:"commerce" shard:"0" type:FIELD field_event:{table_name:"commerce.customer" fields:{name:"customer_id" type:INT64 table:"customer" org_table:"customer" database:"vt_commerce" org_name:"customer_id" column_length:20 charset:63 flags:49667 column_type:"bigint"} fields:{name:"email" type:VARBINARY table:"customer" org_table:"customer" database:"vt_commerce" org_name:"email" column_length:128 charset:63 flags:128 column_type:"varbinary(128)"} keyspace:"commerce" shard:"0"} keyspace:"commerce" shard:"0"]
[type:VGTID vgtid:{shard_gtids:{keyspace:"commerce" shard:"0" gtid:"MySQL56/71551112-fdcd-11ee-9fe9-8fcc543c344e:1-43"}} keyspace:"commerce" shard:"0"]
[type:ROW row_event:{table_name:"commerce.customer" row_changes:{after:{lengths:1 lengths:16 values:"1alice@domain.com"}} keyspace:"commerce" shard:"0"} keyspace:"commerce" shard:"0" type:ROW row_event:{table_name:"commerce.customer" row_changes:{after:{lengths:1 lengths:14 values:"2bob@domain.com"}} keyspace:"commerce" shard:"0"} keyspace:"commerce" shard:"0" type:ROW row_event:{table_name:"commerce.customer" row_changes:{after:{lengths:1 lengths:18 values:"3charlie@domain.com"}} keyspace:"commerce" shard:"0"} keyspace:"commerce" shard:"0" type:ROW row_event:{table_name:"commerce.customer" row_changes:{after:{lengths:1 lengths:14 values:"4dan@domain.com"}} keyspace:"commerce" shard:"0"} keyspace:"commerce" shard:"0" type:ROW row_event:{table_name:"commerce.customer" row_changes:{after:{lengths:1 lengths:14 values:"5eve@domain.com"}} keyspace:"commerce" shard:"0"} keyspace:"commerce" shard:"0" type:VGTID vgtid:{shard_gtids:{keyspace:"commerce" shard:"0" gtid:"MySQL56/71551112-fdcd-11ee-9fe9-8fcc543c344e:1-43" table_p_ks:{table_name:"customer" lastpk:{fields:{name:"customer_id" type:INT64 charset:63 flags:49667} rows:{lengths:1 values:"5"}}}}} keyspace:"commerce" shard:"0" type:COMMIT keyspace:"commerce" shard:"0"]
[type:BEGIN keyspace:"commerce" shard:"0" type:VGTID vgtid:{shard_gtids:{keyspace:"commerce" shard:"0" gtid:"MySQL56/71551112-fdcd-11ee-9fe9-8fcc543c344e:1-43"}} keyspace:"commerce" shard:"0" type:COMMIT keyspace:"commerce" shard:"0"]
[type:COPY_COMPLETED keyspace:"commerce" shard:"0" type:COPY_COMPLETED]
[type:BEGIN timestamp:1713476997 current_time:1713476997575921000 keyspace:"commerce" shard:"0" type:FIELD timestamp:1713476997 field_event:{table_name:"commerce.customer" fields:{name:"customer_id" type:INT64 table:"customer" org_table:"customer" database:"vt_commerce" org_name:"customer_id" column_length:20 charset:63 flags:49667 column_type:"bigint"} fields:{name:"email" type:VARBINARY table:"customer" org_table:"customer" database:"vt_commerce" org_name:"email" column_length:128 charset:63 flags:128 column_type:"varbinary(128)"} keyspace:"commerce" shard:"0"} current_time:1713476997577073000 keyspace:"commerce" shard:"0" type:ROW timestamp:1713476997 row_event:{table_name:"commerce.customer" row_changes:{after:{lengths:3 lengths:22 values:"100mlord1@planetscale.com"}} keyspace:"commerce" shard:"0" flags:1} current_time:1713476997577249000 keyspace:"commerce" shard:"0" type:VGTID vgtid:{shard_gtids:{keyspace:"commerce" shard:"0" gtid:"MySQL56/71551112-fdcd-11ee-9fe9-8fcc543c344e:1-44"}} keyspace:"commerce" shard:"0" type:COMMIT timestamp:1713476997 current_time:1713476997577257000 keyspace:"commerce" shard:"0"]
Overview of the Issue
VStreams are not terminated and restarted to pick a new tablet when the replication lag of the tablet in use grows too large (higher than the `discovery_low_replication_lag` flag value). This can lead to high lag for the client and degraded performance.
When the lag grows high enough to exceed `discovery_high_replication_lag_minimum_serving`, the tablet stops serving and the VStream doesn't send any data. Once the lag drops back below that threshold, the VStream starts sending data again, but that data is out of date (high lag).
So the fix may have two parts:
See the logs: we can see the tablet stop serving, but there are no log entries about a new VStream being created to rerun the tablet picker logic and select a healthy tablet.
Reproduction Steps
Binary Version
Operating System and Environment details
Log Fragments