Skip to content
Snippets Groups Projects
Commit f85e886e authored by Vladislav Shpilevoy's avatar Vladislav Shpilevoy Committed by Kirill Yukhin
Browse files

raft: fix crash when leader resigned from its role

Nodes with disabled Raft keep listening for Raft events and
persist them, so that they can quickly enroll into the process if
they are later configured to be candidates.

The same applies to voter nodes - they can't become a leader, but
they watch and persist everything that happens.

However when a leader resigned from its role, the voter and
disabled nodes tried to start a new election round, even though
they were not supposed to. That led to a crash, and is fixed in
this patch.

Closes #5426
parent eb989bc7
No related branches found
No related tags found
No related merge requests found
......@@ -449,7 +449,14 @@ raft_process_msg(const struct raft_request *req, uint32_t source)
if (source == raft.leader) {
say_info("RAFT: the node %u has resigned from the "
"leader role", raft.leader);
raft_sm_schedule_new_election();
/*
* A candidate node clears the leader implicitly when it starts
* a new term, but a non-candidate won't do that, so clear it
* manually.
*/
raft.leader = 0;
if (raft.is_candidate)
raft_sm_schedule_new_election();
}
return 0;
}
......
-- test-run result file version 2
test_run = require('test_run').new()
| ---
| ...
box.schema.user.grant('guest', 'super')
| ---
| ...
-- Save the current settings; they are restored at the end of the test.
old_election_mode = box.cfg.election_mode
| ---
| ...
old_replication_timeout = box.cfg.replication_timeout
| ---
| ...
test_run:cmd('create server replica with rpl_master=default,\
script="replication/replica.lua"')
| ---
| - true
| ...
test_run:cmd('start server replica with wait=True, wait_load=True')
| ---
| - true
| ...
--
-- gh-5426: leader resignation could crash non-candidate nodes.
--
-- Small timeout to speed up the election.
box.cfg{ \
replication_timeout = 0.1, \
election_mode = 'candidate', \
}
| ---
| ...
-- First crash could happen when the election was disabled on the non-leader
-- node.
test_run:wait_cond(function() return box.info.election.state == 'leader' end)
| ---
| - true
| ...
test_run:switch('replica')
| ---
| - true
| ...
test_run:wait_cond(function() return box.info.election.leader ~= 0 end)
| ---
| - true
| ...
-- Disable the election on the leader and wait until the replica reports no
-- leader (election.leader == 0).
test_run:switch('default')
| ---
| - true
| ...
box.cfg{election_mode = 'off'}
| ---
| ...
test_run:switch('replica')
| ---
| - true
| ...
test_run:wait_cond(function() return box.info.election.leader == 0 end)
| ---
| - true
| ...
-- Another crash could happen if election mode was 'voter' on the non-leader
-- node.
box.cfg{election_mode = 'voter'}
| ---
| ...
test_run:switch('default')
| ---
| - true
| ...
box.cfg{election_mode = 'candidate'}
| ---
| ...
test_run:wait_cond(function() return box.info.election.state == 'leader' end)
| ---
| - true
| ...
test_run:switch('replica')
| ---
| - true
| ...
test_run:wait_cond(function() return box.info.election.leader ~= 0 end)
| ---
| - true
| ...
-- Disable the election on the leader again; the voter replica must report no
-- leader as well.
test_run:switch('default')
| ---
| - true
| ...
box.cfg{election_mode = 'off'}
| ---
| ...
test_run:switch('replica')
| ---
| - true
| ...
test_run:wait_cond(function() return box.info.election.leader == 0 end)
| ---
| - true
| ...
-- Cleanup: drop the replica and restore the original settings and grants.
test_run:switch('default')
| ---
| - true
| ...
test_run:cmd('stop server replica')
| ---
| - true
| ...
test_run:cmd('delete server replica')
| ---
| - true
| ...
box.cfg{ \
election_mode = old_election_mode, \
replication_timeout = old_replication_timeout, \
}
| ---
| ...
box.schema.user.revoke('guest', 'super')
| ---
| ...
test_run = require('test_run').new()
box.schema.user.grant('guest', 'super')
-- Save the current settings; they are restored at the end of the test.
old_election_mode = box.cfg.election_mode
old_replication_timeout = box.cfg.replication_timeout
test_run:cmd('create server replica with rpl_master=default,\
script="replication/replica.lua"')
test_run:cmd('start server replica with wait=True, wait_load=True')
--
-- gh-5426: leader resignation could crash non-candidate nodes.
--
-- Small timeout to speed up the election.
box.cfg{ \
replication_timeout = 0.1, \
election_mode = 'candidate', \
}
-- First crash could happen when the election was disabled on the non-leader
-- node.
test_run:wait_cond(function() return box.info.election.state == 'leader' end)
test_run:switch('replica')
test_run:wait_cond(function() return box.info.election.leader ~= 0 end)
-- Disable the election on the leader and wait until the replica reports no
-- leader (election.leader == 0).
test_run:switch('default')
box.cfg{election_mode = 'off'}
test_run:switch('replica')
test_run:wait_cond(function() return box.info.election.leader == 0 end)
-- Another crash could happen if election mode was 'voter' on the non-leader
-- node.
box.cfg{election_mode = 'voter'}
test_run:switch('default')
box.cfg{election_mode = 'candidate'}
test_run:wait_cond(function() return box.info.election.state == 'leader' end)
test_run:switch('replica')
test_run:wait_cond(function() return box.info.election.leader ~= 0 end)
-- Disable the election on the leader again; the voter replica must report no
-- leader as well.
test_run:switch('default')
box.cfg{election_mode = 'off'}
test_run:switch('replica')
test_run:wait_cond(function() return box.info.election.leader == 0 end)
-- Cleanup: drop the replica and restore the original settings and grants.
test_run:switch('default')
test_run:cmd('stop server replica')
test_run:cmd('delete server replica')
box.cfg{ \
election_mode = old_election_mode, \
replication_timeout = old_replication_timeout, \
}
box.schema.user.revoke('guest', 'super')
......@@ -14,6 +14,7 @@
"gh-3760-misc-return-on-quorum-0.test.lua": {},
"gh-4399-misc-no-failure-on-error-reading-wal.test.lua": {},
"gh-4424-misc-orphan-on-reconfiguration-error.test.lua": {},
"gh-5426-election-on-off.test.lua": {},
"once.test.lua": {},
"on_replace.test.lua": {},
"status.test.lua": {},
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment