From 4fd7f7f62a1cc6a2e8856ce1646257445d292722 Mon Sep 17 00:00:00 2001 From: Yaroslav Dynnikov <yaroslav.dynnikov@gmail.com> Date: Tue, 22 Feb 2022 14:32:33 +0300 Subject: [PATCH] test: raft leader election This patch introduces two tests: 1. `couple.test_failover` reproduces a heartbeat timeout on a follower that leads to a new election. 2. `threesome.test_leader_disruption` simulates a disconnected follower. It shouldn't disrupt the leader in this case. Leader death is already tested in `threesome.test_log_rollback`. Close https://gitlab.com/picodata/picodata/picodata/-/issues/25 --- src/main.rs | 10 ++++++++++ src/traft/node.rs | 9 +++++++++ test/couple_test.lua | 21 +++++++++++++++++++- test/threesome_test.lua | 44 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 82 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index a6272a6c36..c4640d65a1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -76,6 +76,7 @@ fn picolib_setup(args: args::Run) { // Export public API luamod.set("run", tlua::function0(move || start(&args))); luamod.set("raft_status", tlua::function0(raft_status)); + luamod.set("raft_tick", tlua::function1(raft_tick)); luamod.set( "raft_propose_info", tlua::function1(|x: String| raft_propose(Message::Info { msg: x })), @@ -185,6 +186,15 @@ fn start(args: &args::Run) { ); } +fn raft_tick(n_times: u32) { + let stash = Stash::access(); + let raft_ref = stash.raft_node(); + let raft_node = raft_ref.as_ref().expect("Picodata not running yet"); + for _ in 0..n_times { + raft_node.tick(); + } +} + fn raft_status() -> traft::Status { let stash = Stash::access(); let raft_ref = stash.raft_node(); diff --git a/src/traft/node.rs b/src/traft/node.rs index 09751a771c..b7add2eb34 100644 --- a/src/traft/node.rs +++ b/src/traft/node.rs @@ -37,6 +37,7 @@ enum Request { Propose { data: Vec<u8> }, ProposeWaitApplied { data: Vec<u8>, notify: Notify }, Step(raft::Message), + Tick, } impl Node { @@ -100,6 +101,11 @@ impl Node { let req = 
Request::Step(msg); self.inbox.send(req).unwrap(); } + + pub fn tick(&self) { + let req = Request::Tick; + self.inbox.send(req).unwrap(); + } } fn raft_main( @@ -152,6 +158,9 @@ fn raft_main( tlog!(Error, "{e}"); } } + Ok(Request::Tick) => { + raw_node.tick(); + } Err(fiber::RecvError::Timeout) => (), Err(fiber::RecvError::Disconnected) => unreachable!(), } diff --git a/test/couple_test.lua b/test/couple_test.lua index 5ac01d63a9..6f344ec8d1 100644 --- a/test/couple_test.lua +++ b/test/couple_test.lua @@ -37,7 +37,7 @@ g.after_all(function() fio.rmtree(g.data_dir) end) -g.test = function() +g.test_follower_proposal = function() -- Speed up node election g.cluster.i1:try_promote() @@ -54,3 +54,22 @@ g.test = function() '127.0.0.1:13302' ) end + +g.test_failover = function() + g.cluster.i1:try_promote() + h.retrying({}, function() + g.cluster.i2:assert_raft_status("Follower", 1) + end) + + -- Speed up election timeout + g.cluster.i2:connect():eval([[ + while picolib.raft_status().raft_state == 'Follower' do + picolib.raft_tick(1) + end + ]]) + + h.retrying({}, function() + g.cluster.i1:assert_raft_status("Follower", 2) + g.cluster.i2:assert_raft_status("Leader") + end) +end diff --git a/test/threesome_test.lua b/test/threesome_test.lua index 50cfb0ea89..faf6c443e7 100644 --- a/test/threesome_test.lua +++ b/test/threesome_test.lua @@ -57,7 +57,7 @@ local function propose_state_change(srv, value) return srv:raft_propose_eval(0.1, code) end -g.test = function() +g.test_log_rollback = function() -- Speed up node election g.cluster.i1:try_promote() h.retrying({}, function() @@ -106,3 +106,45 @@ g.test = function() true ) end + +g.test_leader_disruption = function() + g.cluster.i1:try_promote() + h.retrying({}, function() + g.cluster.i2:assert_raft_status("Follower", 1) + g.cluster.i3:assert_raft_status("Follower", 1) + end) + + -- Simulate asymmetric network failure. + -- Node i3 doesn't receive any messages, + -- including the heartbeat from the leader. 
+ -- Then it starts a new election. + g.cluster.i3:connect():call( + 'box.schema.func.drop', + {'.raft_interact'} + ) + + -- Speed up election timeout + g.cluster.i3:connect():eval([[ + while picolib.raft_status().raft_state == 'Follower' do + picolib.raft_tick(1) + end + ]]) + g.cluster.i3:assert_raft_status("PreCandidate", 0) + + -- Advance the raft log. This makes i1 and i2 reject the RequestPreVote. + g.cluster.i1:raft_propose_eval(1, 'return') + + -- Restore normal network operation + g.cluster.i3:connect():call( + 'box.schema.func.create', + {'.raft_interact', { + language = "C", + if_not_exists = true + }} + ) + + -- i3 should become the follower again without disrupting i1 + h.retrying({}, function() + g.cluster.i3:assert_raft_status("Follower", 1) + end) +end -- GitLab