From 4fd7f7f62a1cc6a2e8856ce1646257445d292722 Mon Sep 17 00:00:00 2001
From: Yaroslav Dynnikov <yaroslav.dynnikov@gmail.com>
Date: Tue, 22 Feb 2022 14:32:33 +0300
Subject: [PATCH] test: raft leader election

This patch introduces two tests:

1. `couple.test_failover` reproduces a heartbeat timeout on a follower,
   which leads to a new election.

2. `threesome.test_leader_disruption` simulates a disconnected follower.
   It shouldn't disrupt the leader in this case.

Leader death is already tested in `threesome.test_log_rollback`.

Close https://gitlab.com/picodata/picodata/picodata/-/issues/25
---
 src/main.rs             | 10 ++++++++++
 src/traft/node.rs       |  9 +++++++++
 test/couple_test.lua    | 21 +++++++++++++++++++-
 test/threesome_test.lua | 44 ++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index a6272a6c36..c4640d65a1 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -76,6 +76,7 @@ fn picolib_setup(args: args::Run) {
     // Export public API
     luamod.set("run", tlua::function0(move || start(&args)));
     luamod.set("raft_status", tlua::function0(raft_status));
+    luamod.set("raft_tick", tlua::function1(raft_tick));
     luamod.set(
         "raft_propose_info",
         tlua::function1(|x: String| raft_propose(Message::Info { msg: x })),
@@ -185,6 +186,15 @@ fn start(args: &args::Run) {
     );
 }
 
+fn raft_tick(n_times: u32) {
+    let stash = Stash::access();
+    let raft_ref = stash.raft_node();
+    let raft_node = raft_ref.as_ref().expect("Picodata not running yet");
+    for _ in 0..n_times {
+        raft_node.tick();
+    }
+}
+
 fn raft_status() -> traft::Status {
     let stash = Stash::access();
     let raft_ref = stash.raft_node();
diff --git a/src/traft/node.rs b/src/traft/node.rs
index 09751a771c..b7add2eb34 100644
--- a/src/traft/node.rs
+++ b/src/traft/node.rs
@@ -37,6 +37,7 @@ enum Request {
     Propose { data: Vec<u8> },
     ProposeWaitApplied { data: Vec<u8>, notify: Notify },
     Step(raft::Message),
+    Tick,
 }
 
 impl Node {
@@ -100,6 +101,11 @@ impl Node {
         let req = Request::Step(msg);
         self.inbox.send(req).unwrap();
     }
+
+    pub fn tick(&self) {
+        let req = Request::Tick;
+        self.inbox.send(req).unwrap();
+    }
 }
 
 fn raft_main(
@@ -152,6 +158,9 @@ fn raft_main(
                     tlog!(Error, "{e}");
                 }
             }
+            Ok(Request::Tick) => {
+                raw_node.tick();
+            }
             Err(fiber::RecvError::Timeout) => (),
             Err(fiber::RecvError::Disconnected) => unreachable!(),
         }
diff --git a/test/couple_test.lua b/test/couple_test.lua
index 5ac01d63a9..6f344ec8d1 100644
--- a/test/couple_test.lua
+++ b/test/couple_test.lua
@@ -37,7 +37,7 @@ g.after_all(function()
     fio.rmtree(g.data_dir)
 end)
 
-g.test = function()
+g.test_follower_proposal = function()
     -- Speed up node election
     g.cluster.i1:try_promote()
 
@@ -54,3 +54,22 @@ g.test = function()
         '127.0.0.1:13302'
     )
 end
+
+g.test_failover = function()
+    g.cluster.i1:try_promote()
+    h.retrying({}, function()
+        g.cluster.i2:assert_raft_status("Follower", 1)
+    end)
+
+    -- Speed up election timeout
+    g.cluster.i2:connect():eval([[
+        while picolib.raft_status().raft_state == 'Follower' do
+            picolib.raft_tick(1)
+        end
+    ]])
+
+    h.retrying({}, function()
+        g.cluster.i1:assert_raft_status("Follower", 2)
+        g.cluster.i2:assert_raft_status("Leader")
+    end)
+end
diff --git a/test/threesome_test.lua b/test/threesome_test.lua
index 50cfb0ea89..faf6c443e7 100644
--- a/test/threesome_test.lua
+++ b/test/threesome_test.lua
@@ -57,7 +57,7 @@ local function propose_state_change(srv, value)
     return srv:raft_propose_eval(0.1, code)
 end
 
-g.test = function()
+g.test_log_rollback = function()
     -- Speed up node election
     g.cluster.i1:try_promote()
     h.retrying({}, function()
@@ -106,3 +106,45 @@ g.test = function()
         true
     )
 end
+
+g.test_leader_disruption = function()
+    g.cluster.i1:try_promote()
+    h.retrying({}, function()
+        g.cluster.i2:assert_raft_status("Follower", 1)
+        g.cluster.i3:assert_raft_status("Follower", 1)
+    end)
+
+    -- Simulate asymmetric network failure.
+    -- Node i3 doesn't receive any messages,
+    -- including the heartbeat from the leader.
+    -- Then it starts a new election.
+    g.cluster.i3:connect():call(
+        'box.schema.func.drop',
+        {'.raft_interact'}
+    )
+
+    -- Speed up election timeout
+    g.cluster.i3:connect():eval([[
+        while picolib.raft_status().raft_state == 'Follower' do
+            picolib.raft_tick(1)
+        end
+    ]])
+    g.cluster.i3:assert_raft_status("PreCandidate", 0)
+
+    -- Advance the raft log. It makes i1 and i2 reject the RequestPreVote.
+    g.cluster.i1:raft_propose_eval(1, 'return')
+
+    -- Restore normal network operation
+    g.cluster.i3:connect():call(
+        'box.schema.func.create',
+        {'.raft_interact', {
+            language = "C",
+            if_not_exists = true
+        }}
+    )
+
+    -- i3 should become the follower again without disrupting i1
+    h.retrying({}, function()
+        g.cluster.i3:assert_raft_status("Follower", 1)
+    end)
+end
-- 
GitLab