From 32ae02545ab3dd741cc2a7c475eeb787e4882d30 Mon Sep 17 00:00:00 2001
From: Serge Petrenko <sergepetrenko@tarantool.org>
Date: Wed, 18 Jan 2023 11:36:53 +0300
Subject: [PATCH] raft: fix 'manual' nodes bumping the term excessively

A node configured in 'manual' mode and promoted by `box.ctl.promote()`
stays in is_candidate state for the whole term, even though it is not
is_cfg_candidate.

If such a node is the first one to notice leader death or to hit the
election timeout, it bumps the term excessively, then immediately
becomes a mere follower, because its is_candidate is reset with
is_cfg_candidate.

This extra term bump (one term after the node was actually promoted) is
unnecessary and might lead to strange errors:

tarantool> box.ctl.promote()
---
- error: 'The term is outdated: old - 3, new - 4'
...

Fix this by checking if a node is configured as a candidate before
trying to start new elections.

Closes #8168

NO_DOC=bugfix

(cherry picked from commit 5765fdc4e4ecf2c6cdd0845a24bea8d39fc9d2bb)
---
 .../gh-8168-double-term-bump-on-promote.md    |  4 ++
 src/lib/raft/raft.c                           | 13 ++++--
 .../gh_8168_extra_term_bump_test.lua          | 46 +++++++++++++++++++
 test/unit/raft.c                              | 37 ++++++++++++++-
 test/unit/raft.result                         | 11 ++++-
 5 files changed, 106 insertions(+), 5 deletions(-)
 create mode 100644 changelogs/unreleased/gh-8168-double-term-bump-on-promote.md
 create mode 100644 test/replication-luatest/gh_8168_extra_term_bump_test.lua

diff --git a/changelogs/unreleased/gh-8168-double-term-bump-on-promote.md b/changelogs/unreleased/gh-8168-double-term-bump-on-promote.md
new file mode 100644
index 0000000000..f8370313bc
--- /dev/null
+++ b/changelogs/unreleased/gh-8168-double-term-bump-on-promote.md
@@ -0,0 +1,4 @@
+## bugfix/raft
+
+* Fixed nodes configured with `election_mode` = "manual" sometimes bumping the
+  election term excessively after their promotion (gh-8168).
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index 1e5ff2a9d4..75ae79207f 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -333,7 +333,12 @@ raft_sm_schedule_new_election(struct raft *raft);
 static inline void
 raft_sm_election_update(struct raft *raft)
 {
-	if (!raft->is_candidate)
+	/*
+	 * The node might be promoted for current term, in which case
+	 * is_candidate would be true. But it's not enough. If is_cfg_candidate
+	 * is false, the node would give up as soon as new term starts.
+	 */
+	if (!raft->is_cfg_candidate)
 		return;
 	/*
 	 * Pre-vote protection. Every node must agree that the leader is gone.
@@ -615,8 +620,10 @@ raft_process_msg(struct raft *raft, const struct raft_msg *req, uint32_t source)
 			/*
 			 * No need for pre-vote checks when the leader
 			 * deliberately told us it's resigning.
+			 * Note, the only case when automatic elections are
+			 * allowed is when the node is configured candidate.
 			 */
-			if (raft->is_candidate)
+			if (raft->is_cfg_candidate)
 				raft_sm_schedule_new_election(raft);
 		}
 		return 0;
@@ -951,7 +958,7 @@ static void
 raft_sm_schedule_new_election(struct raft *raft)
 {
 	say_info("RAFT: begin new election round");
-	assert(raft->is_candidate);
+	assert(raft->is_cfg_candidate);
 	/* Everyone is a follower until its vote for self is persisted. */
 	raft_sm_schedule_new_term(raft, raft->volatile_term + 1);
 	raft_sm_schedule_new_vote(raft, raft->self, raft->vclock);
diff --git a/test/replication-luatest/gh_8168_extra_term_bump_test.lua b/test/replication-luatest/gh_8168_extra_term_bump_test.lua
new file mode 100644
index 0000000000..f607b01508
--- /dev/null
+++ b/test/replication-luatest/gh_8168_extra_term_bump_test.lua
@@ -0,0 +1,46 @@
+local t = require('luatest')
+local replica_set = require('luatest.replica_set')
+local server = require('luatest.server')
+
+local g = t.group('gh-8168')
+
+g.before_each(function(cg)
+    cg.replica_set = replica_set:new{}
+    local box_cfg = {
+        replication = {
+            server.build_listen_uri('server1'),
+            server.build_listen_uri('server2'),
+        },
+        election_mode = 'manual',
+        replication_timeout = 0.1,
+    }
+    cg.server1 = cg.replica_set:build_and_add_server{
+        alias = 'server1',
+        box_cfg = box_cfg,
+    }
+    box_cfg.election_timeout = 1e-9
+    box_cfg.read_only = true
+    cg.server2 = cg.replica_set:build_and_add_server{
+        alias = 'server2',
+        box_cfg = box_cfg,
+    }
+    cg.replica_set:start()
+end)
+
+g.after_each(function(cg)
+    cg.replica_set:drop()
+end)
+
+g.test_double_term_bump_on_promote = function(cg)
+    cg.server1:wait_for_election_leader()
+    local term = cg.server1:get_election_term()
+    local ok, _ = cg.server2:exec(function()
+        return pcall(box.ctl.promote)
+    end)
+    t.assert(ok, 'Promote succeeded')
+    t.assert_equals(cg.server2:get_election_term(), term + 1,
+                    'Promote bumped term once')
+    t.assert_equals(cg.server2:exec(function()
+        return box.info.election.state
+    end), 'leader', 'Server 2 is the leader')
+end
diff --git a/test/unit/raft.c b/test/unit/raft.c
index 923eac192c..c2934f262a 100644
--- a/test/unit/raft.c
+++ b/test/unit/raft.c
@@ -1569,7 +1569,7 @@ raft_test_too_long_wal_write(void)
 static void
 raft_test_promote_restore(void)
 {
-	raft_start_test(12);
+	raft_start_test(21);
 	struct raft_node node;
 	raft_node_create(&node);
 
@@ -1617,6 +1617,41 @@ raft_test_promote_restore(void)
 	raft_node_restore(&node);
 	ok(!node.raft.is_candidate, "not a candidate");
 
+	/*
+	 * The node doesn't schedule new elections in the next round after
+	 * promotion.
+	 */
+	raft_node_cfg_is_candidate(&node, false);
+	raft_node_cfg_is_enabled(&node, true);
+	raft_node_cfg_election_quorum(&node, 2);
+	raft_node_promote(&node);
+
+	is(node.raft.state, RAFT_STATE_CANDIDATE, "became candidate");
+	is(node.raft.term, 5, "new term");
+
+	/* Wait for the election timeout. */
+	double ts = raft_time();
+	raft_run_next_event();
+	ok(raft_time() - ts >= node.raft.election_timeout,
+	   "election timeout passed");
+	is(node.raft.state, RAFT_STATE_CANDIDATE, "still a candidate");
+	is(node.raft.term, 5, "do not bump term");
+
+	is(raft_node_send_leader(
+		&node,
+		5 /* Term. */,
+		2 /* Source. */
+	), 0, "another leader is accepted");
+
+	is(raft_node_send_follower(
+		&node,
+		5 /* Term. */,
+		2 /* Source. */
+	), 0, "leader resign is accepted");
+
+	is(node.raft.state, RAFT_STATE_FOLLOWER, "stay follower");
+	is(node.raft.term, 5, "do not bump term");
+
 	raft_node_destroy(&node);
 	raft_finish_test();
 }
diff --git a/test/unit/raft.result b/test/unit/raft.result
index d32947a469..2fdca67ef3 100644
--- a/test/unit/raft.result
+++ b/test/unit/raft.result
@@ -265,7 +265,7 @@ ok 13 - subtests
 ok 14 - subtests
 	*** raft_test_too_long_wal_write: done ***
 	*** raft_test_promote_restore ***
-    1..12
+    1..21
     ok 1 - became leader after promotion
     ok 2 - restore drops a non-candidate leader to a follower
     ok 3 - became leader after promotion
@@ -278,6 +278,15 @@ ok 14 - subtests
     ok 10 - still old term
     ok 11 - not a candidate
     ok 12 - not a candidate
+    ok 13 - became candidate
+    ok 14 - new term
+    ok 15 - election timeout passed
+    ok 16 - still a candidate
+    ok 17 - do not bump term
+    ok 18 - another leader is accepted
+    ok 19 - leader resign is accepted
+    ok 20 - stay follower
+    ok 21 - do not bump term
 ok 15 - subtests
 	*** raft_test_promote_restore: done ***
 	*** raft_test_bump_term_before_cfg ***
-- 
GitLab