Currently failover fiber pings replicas, counts the number of failed requests. After closing https://github.com/tarantool/vshard/issues/453 it'll also check the state of replication from master. But currently failover says All replicas are ok, when there was no exception during fiber step (and it's difficult to trigger one, everything is pcalled there). So, it just spams, that everything is ok after every change of the prioritized replica.
We should properly return the state of replicas from the failover_step. Here's the draft patch:
Details
```diff
diff --git a/vshard/router/init.lua b/vshard/router/init.lua
index e68550b..3a4474b 100644
--- a/vshard/router/init.lua
+++ b/vshard/router/init.lua
@@ -1268,6 +1268,7 @@ local function failover_ping(replica, opts)
end
local function failover_ping_round(router, curr_ts)
+ local all_replicas_ok = true
local opts = {timeout = router.failover_ping_timeout}
for _, replicaset in pairs(router.replicasets) do
local replica = replicaset.replica
@@ -1300,9 +1301,14 @@ local function failover_ping_round(router, curr_ts)
-- and closed.
replica:detach_conn()
replicaset:connect_replica(replica)
+ all_replicas_ok = false
+ end
+ if replica.health_status ~= consts.STATUS.GREEN then
+ all_replicas_ok = false
end
end
end
+ return all_replicas_ok
end
@@ -1314,7 +1320,7 @@ end
--
local function failover_step(router)
local curr_ts = fiber_clock()
- failover_ping_round(router, curr_ts)
+ local all_replicas_ok = failover_ping_round(router, curr_ts)
local id_to_update = failover_collect_to_update(router)
if #id_to_update == 0 then
return false
@@ -1347,7 +1353,7 @@ local function failover_step(router)
end
::continue::
end
- return replica_is_changed
+ return replica_is_changed, all_replicas_ok
end
--
@@ -1384,19 +1390,19 @@ local function failover_service_f(router, service)
end
service:next_iter()
service:set_activity('updating replicas')
- local ok, replica_is_changed = pcall(failover_step, router)
+ local ok, replica_is_changed, all_ok = pcall(failover_step, router)
if not ok then
log.error(service:set_status_error(
'Error during failovering: %s',
lerror.make(replica_is_changed)))
- replica_is_changed = true
- elseif not prev_was_ok then
+ all_ok = false
+ elseif not prev_was_ok and all_ok then
log.info('All replicas are ok')
service:set_status_ok()
end
- prev_was_ok = not replica_is_changed
+ prev_was_ok = all_ok
local logf
- if replica_is_changed then
+ if replica_is_changed or not all_ok then
logf = log.info
else
-- In any case it is necessary to periodically log
```
Currently failover fiber pings replicas, counts the number of failed requests. After closing https://github.com/tarantool/vshard/issues/453 it'll also check the state of replication from master. But currently failover says
All replicas are ok
, when there was no exception during fiber step (and it's difficult to trigger one, everything is pcalled there). So, it just spams, that everything is ok after every change of the prioritized replica.We should properly return the state of replicas from the
failover_step
. Here's the draft patch:Details
```diff diff --git a/vshard/router/init.lua b/vshard/router/init.lua index e68550b..3a4474b 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -1268,6 +1268,7 @@ local function failover_ping(replica, opts) end local function failover_ping_round(router, curr_ts) + local all_replicas_ok = true local opts = {timeout = router.failover_ping_timeout} for _, replicaset in pairs(router.replicasets) do local replica = replicaset.replica @@ -1300,9 +1301,14 @@ local function failover_ping_round(router, curr_ts) -- and closed. replica:detach_conn() replicaset:connect_replica(replica) + all_replicas_ok = false + end + if replica.health_status ~= consts.STATUS.GREEN then + all_replicas_ok = false end end end + return all_replicas_ok end @@ -1314,7 +1320,7 @@ end -- local function failover_step(router) local curr_ts = fiber_clock() - failover_ping_round(router, curr_ts) + local all_replicas_ok = failover_ping_round(router, curr_ts) local id_to_update = failover_collect_to_update(router) if #id_to_update == 0 then return false @@ -1347,7 +1353,7 @@ local function failover_step(router) end ::continue:: end - return replica_is_changed + return replica_is_changed, all_replicas_ok end -- @@ -1384,19 +1390,19 @@ local function failover_service_f(router, service) end service:next_iter() service:set_activity('updating replicas') - local ok, replica_is_changed = pcall(failover_step, router) + local ok, replica_is_changed, all_ok = pcall(failover_step, router) if not ok then log.error(service:set_status_error( 'Error during failovering: %s', lerror.make(replica_is_changed))) - replica_is_changed = true - elseif not prev_was_ok then + all_ok = false + elseif not prev_was_ok and all_ok then log.info('All replicas are ok') service:set_status_ok() end - prev_was_ok = not replica_is_changed + prev_was_ok = all_ok local logf - if replica_is_changed then + if replica_is_changed or not all_ok then logf = log.info else -- In any case it is necessary to periodically log ```