tarantool / vshard

The new generation of sharding based on virtual buckets
Other
99 stars 30 forks source link

router: failover says, that `All replicas are ok` even when they're not #500

Open Serpentian opened 1 week ago

Serpentian commented 1 week ago

Currently failover fiber pings replicas, counts the number of failed requests. After closing https://github.com/tarantool/vshard/issues/453 it'll also check the state of replication from master. But currently failover says All replicas are ok, when there was no exception during fiber step (and it's difficult to trigger one, everything is pcalled there). So, it just spams, that everything is ok after every change of the prioritized replica.

We should properly return the state of replicas from the failover_step. Here's the draft patch:

Details

```diff diff --git a/vshard/router/init.lua b/vshard/router/init.lua index e68550b..3a4474b 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -1268,6 +1268,7 @@ local function failover_ping(replica, opts) end local function failover_ping_round(router, curr_ts) + local all_replicas_ok = true local opts = {timeout = router.failover_ping_timeout} for _, replicaset in pairs(router.replicasets) do local replica = replicaset.replica @@ -1300,9 +1301,14 @@ local function failover_ping_round(router, curr_ts) -- and closed. replica:detach_conn() replicaset:connect_replica(replica) + all_replicas_ok = false + end + if replica.health_status ~= consts.STATUS.GREEN then + all_replicas_ok = false end end end + return all_replicas_ok end @@ -1314,7 +1320,7 @@ end -- local function failover_step(router) local curr_ts = fiber_clock() - failover_ping_round(router, curr_ts) + local all_replicas_ok = failover_ping_round(router, curr_ts) local id_to_update = failover_collect_to_update(router) if #id_to_update == 0 then return false @@ -1347,7 +1353,7 @@ local function failover_step(router) end ::continue:: end - return replica_is_changed + return replica_is_changed, all_replicas_ok end -- @@ -1384,19 +1390,19 @@ local function failover_service_f(router, service) end service:next_iter() service:set_activity('updating replicas') - local ok, replica_is_changed = pcall(failover_step, router) + local ok, replica_is_changed, all_ok = pcall(failover_step, router) if not ok then log.error(service:set_status_error( 'Error during failovering: %s', lerror.make(replica_is_changed))) - replica_is_changed = true - elseif not prev_was_ok then + all_ok = false + elseif not prev_was_ok and all_ok then log.info('All replicas are ok') service:set_status_ok() end - prev_was_ok = not replica_is_changed + prev_was_ok = all_ok local logf - if replica_is_changed then + if replica_is_changed or not all_ok then logf = log.info else -- In any case it is necessary to periodically log ```