Skip to content
Snippets Groups Projects
  • Serge Petrenko's avatar
    fc3e6986
    replication: fix flaky election_basic test · fc3e6986
    Serge Petrenko authored
    Found the following error in our CI:
    
     Test failed! Result content mismatch:
     --- replication/election_basic.result	Fri Aug 13 13:50:26 2021
     +++ /build/usr/src/debug/tarantool-2.9.0.276/test/var/rejects/replication/election_basic.reject	Sat Aug 14 08:14:17 2021
     @@ -116,6 +116,7 @@
       | ...
      box.ctl.demote()
       | ---
     + | - error: box.ctl.demote does not support simultaneous invocations
       | ...
      --
    
    Even though neither box.ctl.demote() nor box.ctl.promote() is called
    explicitly above the failing line, promote() is issued internally once
    the instance becomes the leader, and it may still be in progress when
    the test calls box.ctl.demote().
    
    Wait until the previous promote is finished (i.e. until
    box.info.synchro.queue.owner is set to the instance's own id) before
    calling box.ctl.demote().
    fc3e6986
    History
    replication: fix flaky election_basic test
    Serge Petrenko authored
    Found the following error in our CI:
    
     Test failed! Result content mismatch:
     --- replication/election_basic.result	Fri Aug 13 13:50:26 2021
     +++ /build/usr/src/debug/tarantool-2.9.0.276/test/var/rejects/replication/election_basic.reject	Sat Aug 14 08:14:17 2021
     @@ -116,6 +116,7 @@
       | ...
      box.ctl.demote()
       | ---
     + | - error: box.ctl.demote does not support simultaneous invocations
       | ...
      --
    
    Even though neither box.ctl.demote() nor box.ctl.promote() is called
    explicitly above the failing line, promote() is issued internally once
    the instance becomes the leader, and it may still be in progress when
    the test calls box.ctl.demote().
    
    Wait until the previous promote is finished (i.e. until
    box.info.synchro.queue.owner is set to the instance's own id) before
    calling box.ctl.demote().
election_basic.result 8.97 KiB
-- test-run result file version 2
test_run = require('test_run').new()
 | ---
 | ...
--
-- gh-1146: Raft protocol for automated leader election.
--

old_election_timeout = box.cfg.election_timeout
 | ---
 | ...

-- Election is turned off by default.
assert(box.cfg.election_mode == 'off')
 | ---
 | - true
 | ...
-- Ensure election options are validated.
box.cfg{election_mode = 100}
 | ---
 | - error: 'Incorrect value for option ''election_mode'': should be of type string'
 | ...
box.cfg{election_mode = '100'}
 | ---
 | - error: 'Incorrect value for option ''election_mode'': the value must be one of the
 |     following strings: ''off'', ''voter'', ''candidate'', ''manual'''
 | ...
box.cfg{election_timeout = -1}
 | ---
 | - error: 'Incorrect value for option ''election_timeout'': the value must be a positive
 |     number'
 | ...
box.cfg{election_timeout = 0}
 | ---
 | - error: 'Incorrect value for option ''election_timeout'': the value must be a positive
 |     number'
 | ...

-- When election is disabled, the instance is a follower. Does not try to become
-- a leader, and does not block write operations.
term = box.info.election.term
 | ---
 | ...
vote = box.info.election.vote
 | ---
 | ...
assert(box.info.election.state == 'follower')
 | ---
 | - true
 | ...
assert(box.info.election.leader == 0)
 | ---
 | - true
 | ...
assert(not box.info.ro)
 | ---
 | - true
 | ...

-- Turned on election blocks writes until the instance becomes a leader.
box.cfg{election_mode = 'voter'}
 | ---
 | ...
assert(box.info.election.state == 'follower')
 | ---
 | - true
 | ...
assert(box.info.ro)
 | ---
 | - true
 | ...
-- Term is not changed, because the instance can't be a candidate,
-- and therefore didn't try to vote nor to bump the term.
assert(box.info.election.term == term)
 | ---
 | - true
 | ...
assert(box.info.election.vote == vote)
 | ---
 | - true
 | ...
assert(box.info.election.leader == 0)
 | ---
 | - true
 | ...

-- Candidate instance votes immediately, if sees no leader.
box.cfg{election_timeout = 1000}
 | ---
 | ...
box.cfg{election_mode = 'candidate'}
 | ---
 | ...
test_run:wait_cond(function() return box.info.election.state == 'leader' end)
 | ---
 | - true
 | ...
test_run:wait_cond(function()\
    return box.info.synchro.queue.owner == box.info.id\
end)
 | ---
 | - true
 | ...
assert(box.info.election.term > term)
 | ---
 | - true
 | ...
assert(box.info.election.vote == box.info.id)
 | ---
 | - true
 | ...
assert(box.info.election.leader == box.info.id)
 | ---
 | - true
 | ...

box.cfg{                                                                        \
    election_mode = 'off',                                                      \
    election_timeout = old_election_timeout                                     \
}
 | ---
 | ...
box.ctl.demote()
 | ---
 | ...

--
-- See if bootstrap with election enabled works.
--
SERVERS = {'election_replica1', 'election_replica2', 'election_replica3'}
 | ---
 | ...
test_run:create_cluster(SERVERS, "replication")
 | ---
 | ...
test_run:wait_fullmesh(SERVERS)
 | ---
 | ...
is_r1_leader = nil
 | ---
 | ...
is_r2_leader = nil
 | ---
 | ...
is_r3_leader = nil
 | ---
 | ...
is_leader_cmd = 'return box.info.election.state == \'leader\''
 | ---
 | ...
leader_id_cmd = 'return box.info.election.leader'
 | ---
 | ...
test_run:wait_cond(function()                                                   \
    is_r1_leader = test_run:eval('election_replica1', is_leader_cmd)[1]         \
    is_r2_leader = test_run:eval('election_replica2', is_leader_cmd)[1]         \
    is_r3_leader = test_run:eval('election_replica3', is_leader_cmd)[1]         \
    return is_r1_leader or is_r2_leader or is_r3_leader                         \
end)
 | ---
 | - true
 | ...
leader_count = is_r1_leader and 1 or 0
 | ---
 | ...
leader_count = leader_count + (is_r2_leader and 1 or 0)
 | ---
 | ...
leader_count = leader_count + (is_r3_leader and 1 or 0)
 | ---
 | ...
assert(leader_count == 1)
 | ---
 | - true
 | ...
-- All nodes have the same leader.
r1_leader = test_run:eval('election_replica1', leader_id_cmd)[1]
 | ---
 | ...
r2_leader = test_run:eval('election_replica2', leader_id_cmd)[1]
 | ---
 | ...
r3_leader = test_run:eval('election_replica3', leader_id_cmd)[1]
 | ---
 | ...
assert(r1_leader ~= 0)
 | ---
 | - true
 | ...
assert(r1_leader == r2_leader)
 | ---
 | - true
 | ...
assert(r1_leader == r3_leader)
 | ---
 | - true
 | ...

--
-- Leader death starts a new election.
--
leader_name = nil
 | ---
 | ...
nonleader1_name = nil
 | ---
 | ...
nonleader2_name = nil
 | ---
 | ...
if is_r1_leader then                                                            \
    leader_name = 'election_replica1'                                           \
    nonleader1_name = 'election_replica2'                                       \
    nonleader2_name = 'election_replica3'                                       \
elseif is_r2_leader then                                                        \
    leader_name = 'election_replica2'                                           \
    nonleader1_name = 'election_replica1'                                       \
    nonleader2_name = 'election_replica3'                                       \
elseif is_r3_leader then                                                        \
    leader_name = 'election_replica3'                                           \
    nonleader1_name = 'election_replica1'                                       \
    nonleader2_name = 'election_replica2'                                       \
end
 | ---
 | ...
-- Lower the quorum so the 2 alive nodes could elect a new leader when the third
-- node dies.
test_run:switch(nonleader1_name)
 | ---
 | - true
 | ...
box.cfg{replication_synchro_quorum = 2}
 | ---
 | ...
-- Switch via default where the names are defined.
test_run:switch('default')
 | ---
 | - true
 | ...
test_run:switch(nonleader2_name)
 | ---
 | - true
 | ...
box.cfg{replication_synchro_quorum = 2}
 | ---
 | ...

test_run:switch('default')
 | ---
 | - true
 | ...
test_run:cmd(string.format('stop server %s', leader_name))
 | ---
 | - true
 | ...
test_run:wait_cond(function()                                                   \
    is_r1_leader = test_run:eval(nonleader1_name, is_leader_cmd)[1]             \
    is_r2_leader = test_run:eval(nonleader2_name, is_leader_cmd)[1]             \
    return is_r1_leader or is_r2_leader                                         \
end)
 | ---
 | - true
 | ...
r1_leader = test_run:eval(nonleader1_name, leader_id_cmd)[1]
 | ---
 | ...
r2_leader = test_run:eval(nonleader2_name, leader_id_cmd)[1]
 | ---
 | ...
assert(r1_leader ~= 0)
 | ---
 | - true
 | ...
assert(r1_leader == r2_leader)
 | ---
 | - true
 | ...

test_run:cmd(string.format('start server %s', leader_name))
 | ---
 | - true
 | ...

test_run:drop_cluster(SERVERS)
 | ---
 | ...

-- gh-5819: on_election triggers, that are filed on every visible state change.
box.schema.user.grant('guest', 'replication')
 | ---
 | ...
test_run:cmd('create server replica with rpl_master=default,\
	      script="replication/replica.lua"')
 | ---
 | - true
 | ...
test_run:cmd('start server replica')
 | ---
 | - true
 | ...

repl = test_run:eval('replica', 'return box.cfg.listen')[1]
 | ---
 | ...
box.cfg{replication = repl}
 | ---
 | ...
test_run:switch('replica')
 | ---
 | - true
 | ...
box.cfg{election_mode='voter'}
 | ---
 | ...
test_run:switch('default')
 | ---
 | - true
 | ...

election_tbl = {}
 | ---
 | ...

function trig()\
    election_tbl[#election_tbl+1] = box.info.election\
end
 | ---
 | ...

_ = box.ctl.on_election(trig)
 | ---
 | ...

box.cfg{replication_synchro_quorum=2}
 | ---
 | ...
box.cfg{election_mode='candidate'}
 | ---
 | ...

test_run:wait_cond(function() return #election_tbl == 3 end)
 | ---
 | - true
 | ...
assert(election_tbl[1].state == 'follower')
 | ---
 | - true
 | ...
assert(election_tbl[2].state == 'candidate')
 | ---
 | - true
 | ...
assert(election_tbl[2].vote == 1)
 | ---
 | - true
 | ...
assert(election_tbl[3].state == 'leader')
 | ---
 | - true
 | ...

box.cfg{election_mode='voter'}
 | ---
 | ...
test_run:wait_cond(function() return #election_tbl == 4 end)
 | ---
 | - true
 | ...
assert(election_tbl[4].state == 'follower')
 | ---
 | - true
 | ...

box.cfg{election_mode='off'}
 | ---
 | ...
test_run:wait_cond(function() return #election_tbl == 5 end)
 | ---
 | - true
 | ...

box.cfg{election_mode='manual'}
 | ---
 | ...
test_run:wait_cond(function() return #election_tbl == 6 end)
 | ---
 | - true
 | ...
assert(election_tbl[6].state == 'follower')
 | ---
 | - true
 | ...

box.ctl.promote()
 | ---
 | ...

test_run:wait_cond(function() return #election_tbl == 9 end)
 | ---
 | - true
 | ...
assert(election_tbl[7].state == 'follower')
 | ---
 | - true
 | ...
assert(election_tbl[7].term == election_tbl[6].term + 1)
 | ---
 | - true
 | ...
assert(election_tbl[8].state == 'candidate')
 | ---
 | - true
 | ...
assert(election_tbl[9].state == 'leader')
 | ---
 | - true
 | ...

test_run:cmd('stop server replica')
 | ---
 | - true
 | ...
test_run:cmd('delete server replica')
 | ---
 | - true
 | ...

box.schema.user.revoke('guest', 'replication')
 | ---
 | ...