diff --git a/src/box/raft.c b/src/box/raft.c index 24f65ada7d6cce007fa1f5e4d62c03097ca69f9c..b70f47006e90542f8fb2369af63c6330b4be5db2 100644 --- a/src/box/raft.c +++ b/src/box/raft.c @@ -449,7 +449,14 @@ raft_process_msg(const struct raft_request *req, uint32_t source) if (source == raft.leader) { say_info("RAFT: the node %u has resigned from the " "leader role", raft.leader); - raft_sm_schedule_new_election(); + /* + * A candidate node clears the leader implicitly when it + * starts a new term, but a non-candidate won't do that, + * so clear it manually. + */ + raft.leader = 0; + if (raft.is_candidate) + raft_sm_schedule_new_election(); } return 0; } diff --git a/test/replication/gh-5426-election-on-off.result b/test/replication/gh-5426-election-on-off.result new file mode 100644 index 0000000000000000000000000000000000000000..1abfb9154102be8f92e6f46e1d98d22bce847bb3 --- /dev/null +++ b/test/replication/gh-5426-election-on-off.result @@ -0,0 +1,134 @@ +-- test-run result file version 2 +test_run = require('test_run').new() + | --- + | ... +box.schema.user.grant('guest', 'super') + | --- + | ... + +old_election_mode = box.cfg.election_mode + | --- + | ... +old_replication_timeout = box.cfg.replication_timeout + | --- + | ... + +test_run:cmd('create server replica with rpl_master=default,\ + script="replication/replica.lua"') + | --- + | - true + | ... +test_run:cmd('start server replica with wait=True, wait_load=True') + | --- + | - true + | ... + +-- +-- gh-5426: leader resignation could crash non-candidate nodes. +-- +-- Small timeout to speed up the election. +box.cfg{ \ + replication_timeout = 0.1, \ + election_mode = 'candidate', \ +} + | --- + | ... + +-- First crash could happen when the election was disabled on the non-leader +-- node. +test_run:wait_cond(function() return box.info.election.state == 'leader' end) + | --- + | - true + | ... + +test_run:switch('replica') + | --- + | - true + | ... 
+test_run:wait_cond(function() return box.info.election.leader ~= 0 end) + | --- + | - true + | ... + +test_run:switch('default') + | --- + | - true + | ... +box.cfg{election_mode = 'off'} + | --- + | ... + +test_run:switch('replica') + | --- + | - true + | ... +test_run:wait_cond(function() return box.info.election.leader == 0 end) + | --- + | - true + | ... + +-- Another crash could happen if election mode was 'voter' on the non-leader +-- node. +box.cfg{election_mode = 'voter'} + | --- + | ... + +test_run:switch('default') + | --- + | - true + | ... +box.cfg{election_mode = 'candidate'} + | --- + | ... +test_run:wait_cond(function() return box.info.election.state == 'leader' end) + | --- + | - true + | ... + +test_run:switch('replica') + | --- + | - true + | ... +test_run:wait_cond(function() return box.info.election.leader ~= 0 end) + | --- + | - true + | ... + +test_run:switch('default') + | --- + | - true + | ... +box.cfg{election_mode = 'off'} + | --- + | ... + +test_run:switch('replica') + | --- + | - true + | ... +test_run:wait_cond(function() return box.info.election.leader == 0 end) + | --- + | - true + | ... + +test_run:switch('default') + | --- + | - true + | ... +test_run:cmd('stop server replica') + | --- + | - true + | ... +test_run:cmd('delete server replica') + | --- + | - true + | ... +box.cfg{ \ + election_mode = old_election_mode, \ + replication_timeout = old_replication_timeout, \ +} + | --- + | ... +box.schema.user.revoke('guest', 'super') + | --- + | ... 
diff --git a/test/replication/gh-5426-election-on-off.test.lua b/test/replication/gh-5426-election-on-off.test.lua new file mode 100644 index 0000000000000000000000000000000000000000..d6b980d0a290cd08b9a0202a56e7045a98133cb9 --- /dev/null +++ b/test/replication/gh-5426-election-on-off.test.lua @@ -0,0 +1,57 @@ +test_run = require('test_run').new() +box.schema.user.grant('guest', 'super') + +old_election_mode = box.cfg.election_mode +old_replication_timeout = box.cfg.replication_timeout + +test_run:cmd('create server replica with rpl_master=default,\ + script="replication/replica.lua"') +test_run:cmd('start server replica with wait=True, wait_load=True') + +-- +-- gh-5426: leader resignation could crash non-candidate nodes. +-- +-- Small timeout to speed up the election. +box.cfg{ \ + replication_timeout = 0.1, \ + election_mode = 'candidate', \ +} + +-- First crash could happen when the election was disabled on the non-leader +-- node. +test_run:wait_cond(function() return box.info.election.state == 'leader' end) + +test_run:switch('replica') +test_run:wait_cond(function() return box.info.election.leader ~= 0 end) + +test_run:switch('default') +box.cfg{election_mode = 'off'} + +test_run:switch('replica') +test_run:wait_cond(function() return box.info.election.leader == 0 end) + +-- Another crash could happen if election mode was 'voter' on the non-leader +-- node. 
+box.cfg{election_mode = 'voter'} + +test_run:switch('default') +box.cfg{election_mode = 'candidate'} +test_run:wait_cond(function() return box.info.election.state == 'leader' end) + +test_run:switch('replica') +test_run:wait_cond(function() return box.info.election.leader ~= 0 end) + +test_run:switch('default') +box.cfg{election_mode = 'off'} + +test_run:switch('replica') +test_run:wait_cond(function() return box.info.election.leader == 0 end) + +test_run:switch('default') +test_run:cmd('stop server replica') +test_run:cmd('delete server replica') +box.cfg{ \ + election_mode = old_election_mode, \ + replication_timeout = old_replication_timeout, \ +} +box.schema.user.revoke('guest', 'super') diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg index a862f5a97d0809403cc15c9665cdd5f1962f1a36..766f276a292b9dfca6c7014324c65ef9fbf48437 100644 --- a/test/replication/suite.cfg +++ b/test/replication/suite.cfg @@ -14,6 +14,7 @@ "gh-3760-misc-return-on-quorum-0.test.lua": {}, "gh-4399-misc-no-failure-on-error-reading-wal.test.lua": {}, "gh-4424-misc-orphan-on-reconfiguration-error.test.lua": {}, + "gh-5426-election-on-off.test.lua": {}, "once.test.lua": {}, "on_replace.test.lua": {}, "status.test.lua": {},