From e7f1e2fe75afba9bdbc9c9cc760bd5287844059c Mon Sep 17 00:00:00 2001
From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
Date: Wed, 8 Dec 2021 01:48:27 +0100
Subject: [PATCH] test: fix flaky read_only_reason test

This is a second attempt to stabilize the #5568 test since 5105c2d794b43d6733b3cbcbbbe418fe02d56d79 ("test: fix flaky read_only_reason test"). It had several failures with varying frequency.

* test_read_only_reason_synchro() could see ro_reason as nil even after box.error.READONLY was raised (and some yields passed);
* test_read_only_reason_orphan() could do the same;
* `box.ctl.demote()` could raise the error "box.ctl.demote does not support simultaneous invocations".

The orphan failure couldn't be reproduced. It was caught only locally, so maybe it was just some unnoticed diff breaking the test.

The failure of test_read_only_reason_synchro() could happen when demote() was called in a previous test case, then the current test case called promote(), then got box.error.READONLY on the replica, then that old demote() was delivered to the replica, and the attempt to get ro_reason returned nil. This is addressed with replication_synchro_quorum = 2, so that master promote()/demote() implicitly pushes the previous operation to the replica via a term bump and a quorum wait. Additionally, a huge replication_synchro_timeout is added for manual promotions. Automatic promotion is retried, so there the timeout is not as important.

The `box.ctl.demote` failure due to `simultaneous invocations` seems to happen because the original auto-election win hadn't finished the limbo transition yet. Hence the instance calling demote() now would think it is called 'simultaneously' with another promote()/demote(). This is addressed from two sides:

- Add waiting for `not box.info.ro` on the leader node after auto-promotion, to ensure the limbo is taken by the leader;
- The first option didn't help much, so `box.ctl.demote()` is simply called in a loop until it succeeds.
Closes #6670 --- test/luatest_helpers/server.lua | 6 ++- .../gh_5568_read_only_reason_test.lua | 49 +++++++++++++++---- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/test/luatest_helpers/server.lua b/test/luatest_helpers/server.lua index 900db22438..714c53727b 100644 --- a/test/luatest_helpers/server.lua +++ b/test/luatest_helpers/server.lua @@ -104,8 +104,12 @@ function Server:wait_for_readiness() end function Server:wait_election_leader() + -- Include read-only property too because if an instance is a leader, it + -- does not mean it finished the synchro queue ownership transition. It is + -- read-only until that happens. But in tests usually the leader is needed + -- as a writable node. return wait_cond('election leader', self, self.exec, self, function() - return box.info.election.state == 'leader' + return box.info.election.state == 'leader' and not box.info.ro end) end diff --git a/test/replication-luatest/gh_5568_read_only_reason_test.lua b/test/replication-luatest/gh_5568_read_only_reason_test.lua index 775a83e2e3..80a897e744 100644 --- a/test/replication-luatest/gh_5568_read_only_reason_test.lua +++ b/test/replication-luatest/gh_5568_read_only_reason_test.lua @@ -1,6 +1,7 @@ local t = require('luatest') local cluster = require('test.luatest_helpers.cluster') local helpers = require('test.luatest_helpers') +local wait_timeout = 120 -- -- gh-5568: ER_READONLY needs to carry additional info explaining the reason @@ -38,6 +39,17 @@ local function make_destroy_cluster(g) return function() g.cluster:drop() end end +-- XXX: retry demote in a loop. Otherwise it might fail due to a false-positive +-- 'can not be called simultaneously'. +local function force_demote(instance) + t.helpers.retrying({}, function() + assert(instance:exec(function() + box.ctl.demote() + return true + end)) + end) +end + -- -- This group's test cases leave instances in a valid state after which they can -- be reused. 
@@ -155,12 +167,18 @@ end -- g.test_read_only_reason_election_has_leader = function(g) g.master:exec(function() - box.cfg{election_mode = 'candidate'} + box.cfg{ + election_mode = 'candidate', + replication_synchro_quorum = 2, + } + end) + g.replica:exec(function() + box.cfg{election_mode = 'voter'} end) g.master:wait_election_leader() g.replica:wait_election_leader_found() + local ok, err = g.replica:exec(function() - box.cfg{election_mode = 'voter'} local ok, err = pcall(box.schema.create_space, 'test') return ok, err:unpack() end) @@ -184,11 +202,12 @@ g.test_read_only_reason_election_has_leader = function(g) -- Cleanup. g.master:exec(function() box.cfg{election_mode = 'off'} - box.ctl.demote() end) - g.replica:exec(function() + force_demote(g.master) + g.replica:exec(function(wait_timeout) box.cfg{election_mode = 'off'} - end) + box.ctl.wait_rw(wait_timeout) + end, {wait_timeout}) end -- @@ -196,6 +215,10 @@ end -- g.test_read_only_reason_synchro = function(g) g.master:exec(function() + box.cfg{ + replication_synchro_quorum = 2, + replication_synchro_timeout = 1000000, + } box.ctl.promote() end) @@ -226,9 +249,10 @@ g.test_read_only_reason_synchro = function(g) }, 'reason is synchro, has owner') -- Cleanup. 
- g.master:exec(function() - box.ctl.demote() - end) + force_demote(g.master) + g.replica:exec(function(wait_timeout) + box.ctl.wait_rw(wait_timeout) + end, {wait_timeout}) end -- @@ -248,7 +272,10 @@ g.test_read_only_reason_election_has_leader_no_uuid = function(g) box.cfg{election_mode = 'voter'} end) g.master:exec(function() - box.cfg{election_mode = 'candidate'} + box.cfg{ + election_mode = 'candidate', + replication_synchro_quorum = 2 + } end) g.master:wait_election_leader() g.replica:wait_election_leader_found() @@ -292,6 +319,10 @@ end -- g.test_read_only_reason_synchro_no_uuid = function(g) g.master:exec(function() + box.cfg{ + replication_synchro_quorum = 2, + replication_synchro_timeout = 1000000, + } box.ctl.promote() box.space._cluster:run_triggers(false) box.space._cluster:delete{box.info.id} -- GitLab