From f077ebf60fde411e1d49eea7c8cb08c71d6e14b3 Mon Sep 17 00:00:00 2001
From: Nikita Zheleztsov <n.zheleztsov@proton.me>
Date: Tue, 18 Apr 2023 02:15:47 +0300
Subject: [PATCH] replication: fix updating is_candidate in raft

Currently, on applier death, `is_candidate` is updated after trying to
start an election. So Raft assumes it has a healthy quorum and bumps
the term even when there are not enough healthy nodes to do that.

The trigger that updates the above-mentioned flag is run from
`replicaset_on_health_change()`. So let's move that call before
`raft_notify_is_leader_seen()`, which tries to start an election.
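
In other words, the intended order in replica_update_applier_health()
becomes roughly the following (a sketch of the reordering, see the
diff below for the exact code):

    /* Fire the health-change trigger first: it updates is_candidate. */
    replicaset_on_health_change();
    /* Only then let Raft react to the lost connection to the leader. */
    if (!is_healthy && replica->id != REPLICA_ID_NIL)
        raft_notify_is_leader_seen(box_raft(), false, replica->id);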

Closes #8433

NO_DOC=bugfix
---
 .../unreleased/gh-8433-raft-is-candidate.md   |   4 +
 src/box/replication.cc                        |  22 ++--
 .../gh_8433_raft_is_candidate_test.lua        | 115 ++++++++++++++++++
 3 files changed, 134 insertions(+), 7 deletions(-)
 create mode 100644 changelogs/unreleased/gh-8433-raft-is-candidate.md
 create mode 100644 test/replication-luatest/gh_8433_raft_is_candidate_test.lua

diff --git a/changelogs/unreleased/gh-8433-raft-is-candidate.md b/changelogs/unreleased/gh-8433-raft-is-candidate.md
new file mode 100644
index 0000000000..a7bb9bc257
--- /dev/null
+++ b/changelogs/unreleased/gh-8433-raft-is-candidate.md
@@ -0,0 +1,4 @@
+## bugfix/replication
+
+* Fixed a bug that occurred on applier failure: a node could start an election
+  without having a quorum to do so (gh-8433).
diff --git a/src/box/replication.cc b/src/box/replication.cc
index eee635ceef..a94d8b4e37 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -471,16 +471,24 @@ replica_update_applier_health(struct replica *replica)
 	replica->is_applier_healthy = is_healthy;
 	if (!replica->is_relay_healthy || replica->anon)
 		return;
-	if (is_healthy) {
+	if (is_healthy)
 		replicaset.healthy_count++;
-	} else {
+	else
 		replicaset.healthy_count--;
-		if (replica->id != REPLICA_ID_NIL) {
-			raft_notify_is_leader_seen(box_raft(), false,
-						   replica->id);
-		}
-	}
+
+	/*
+	 * It is important for Raft that replicaset_on_health_change() is run
+	 * before raft_notify_is_leader_seen(). replicaset_on_health_change()
+	 * executes a trigger updating Raft's is_candidate flag, which shows
+	 * whether the node has a quorum to start an election. It must be done
+	 * before running raft_notify_is_leader_seen(), which clears the bit
+	 * corresponding to replica->id in leader_witness_map and tries to
+	 * start an election. Otherwise an election can be started even though
+	 * the node doesn't have a quorum to do so.
+	 */
 	replicaset_on_health_change();
+	if (!is_healthy && replica->id != REPLICA_ID_NIL)
+		raft_notify_is_leader_seen(box_raft(), false, replica->id);
 }
 
 static void
diff --git a/test/replication-luatest/gh_8433_raft_is_candidate_test.lua b/test/replication-luatest/gh_8433_raft_is_candidate_test.lua
new file mode 100644
index 0000000000..340d2f7caa
--- /dev/null
+++ b/test/replication-luatest/gh_8433_raft_is_candidate_test.lua
@@ -0,0 +1,115 @@
+local luatest = require('luatest')
+local server = require('luatest.server')
+local replica_set = require('luatest.replica_set')
+local proxy = require('luatest.replica_proxy')
+
+local g = luatest.group('is_candidate_fail')
+
+local function wait_for_upstream_death_with_id(id)
+    luatest.helpers.retrying({}, function()
+        luatest.assert_equals(box.info.replication[id].upstream.status,
+                              'disconnected')
+    end)
+end
+
+g.before_all(function(g)
+    g.replica_set = replica_set:new({})
+    local rs_id = g.replica_set.id
+    g.box_cfg = {
+        election_mode = 'candidate',
+        replication_timeout = 0.1,
+        replication = {
+            server.build_listen_uri('server1', rs_id),
+            server.build_listen_uri('server2', rs_id),
+            server.build_listen_uri('server3', rs_id),
+        },
+    }
+
+    g.server1 = g.replica_set:build_and_add_server{
+        alias = 'server1',
+        box_cfg = g.box_cfg,
+    }
+
+    -- Force server1 to be the leader for reliability and reproducibility.
+    g.box_cfg.election_mode = 'voter'
+    g.server2 = g.replica_set:build_and_add_server{
+        alias = 'server2',
+        box_cfg = g.box_cfg,
+    }
+
+    g.proxy1 = proxy:new{
+        client_socket_path = server.build_listen_uri('server1_proxy'),
+        server_socket_path = server.build_listen_uri('server1', rs_id),
+    }
+
+    g.proxy2 = proxy:new{
+        client_socket_path = server.build_listen_uri('server2_proxy'),
+        server_socket_path = server.build_listen_uri('server2', rs_id),
+    }
+
+    luatest.assert(g.proxy1:start{force = true}, 'Proxy from 3 to 1 started')
+    luatest.assert(g.proxy2:start{force = true}, 'Proxy from 3 to 2 started')
+    g.box_cfg.replication[1] = server.build_listen_uri('server1_proxy')
+    g.box_cfg.replication[2] = server.build_listen_uri('server2_proxy')
+
+    g.server3 = g.replica_set:build_and_add_server{
+        alias = 'server3',
+        box_cfg = g.box_cfg,
+    }
+
+    g.replica_set:start()
+    g.replica_set:wait_for_fullmesh()
+end)
+
+g.after_all(function(g)
+    g.replica_set:drop()
+end)
+
+g.test_prevote_fail = function(g)
+    --
+    -- Test that an applier failure doesn't start an election if the node
+    -- doesn't have a quorum of healthy nodes:
+    --     1. Fill leader_witness_map so that election_update_cb cannot
+    --        start an election on timeout.
+    --     2. Break the applier to the leader and wait until it dies and
+    --        the corresponding bit in leader_witness_map is cleared.
+    --     3. Break the last applier and make sure no election is started.
+    --
+    luatest.assert_equals(g.replica_set:get_leader(), g.server1)
+    local old_term = g.server1:get_election_term()
+    g.server3:exec(function()
+        box.cfg({election_mode = 'candidate'})
+    end)
+
+    -- Make sure that server3 cannot start elections on timeout: its
+    -- witness map should not be empty when election_update_cb is executed.
+    luatest.helpers.retrying({}, function()
+        if not g.server3:grep_log('leader is seen: true, state: follower') then
+            error("Witness map may still be empty")
+        end
+    end)
+
+    g.proxy1:pause()
+
+    -- Wait for server3 to notice the leader's death and clear the
+    -- corresponding bit in leader_witness_map. Elections are not
+    -- supposed to start, as server2 says that it can still see the leader.
+    local id1 = g.server1:get_instance_id()
+    g.server3:exec(wait_for_upstream_death_with_id, {id1})
+
+    g.proxy2:pause()
+
+    -- server3 should not start elections, as it doesn't
+    -- have a quorum of healthy nodes to do that.
+    local id2 = g.server2:get_instance_id()
+    g.server3:exec(wait_for_upstream_death_with_id, {id2})
+    luatest.assert_equals(g.server3:get_election_term(), old_term)
+
+    -- Restore appliers
+    g.proxy1:resume()
+    g.proxy2:resume()
+    -- Make sure that the replica set is stable.
+    luatest.assert_equals(g.server3:get_election_term(), old_term)
+    luatest.assert_equals(g.server1:get_election_term(),
+                          g.server3:get_election_term())
+end
-- 
GitLab