From 29d1c0fa6c76ff06d748d8916c5ffbbc9e4a55e9 Mon Sep 17 00:00:00 2001
From: Georgiy Lebedev <g.lebedev@tarantool.org>
Date: Mon, 3 Jun 2024 12:06:24 +0300
Subject: [PATCH] box: update synchro quorum in on_commit trigger instead of
 on_replace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, we update the synchronous replication quorum from the
`on_replace` trigger of the `_cluster` space when registering a new
replica. However, during the join process, the replica cannot ack its own
insertion into the `_cluster` space. In the scope of #9723, we are going to
enable synchronous replication for most of the system spaces, including the
`_cluster` space. There are several problems with this:

1. Joining a replica to a 1-member cluster without manually changing the
quorum won't work: it is impossible to commit the insertion into the
`_cluster` space with only 1 node, since the quorum will equal 2 right
after the insertion.

2. Joining a replica to a 3-member cluster may fail: the quorum will become
equal to 3 right after the insertion, but the newly joined replica cannot ACK
its own insertion into the `_cluster` space — if one of the original 3
nodes fails, then reconfiguration will fail.

Generally speaking, it will be impossible to join a new replica to the
cluster, if a quorum, which includes the newly added replica (which cannot
ACK), cannot be gathered.

To solve these problems, let's update the quorum in the `on_commit`
trigger. This way we’ll be able to insert a node regardless of the current
configuration. This somewhat contradicts the Raft specification, which
requires application of all configuration changes in the `on_replace`
trigger (i.e., as soon as they are persisted in the WAL, without quorum
confirmation), but still forbids several reconfigurations at the same time.

Closes #10087

NO_DOC=<no special documentation page devoted to cluster reconfiguration>
---
 ...0087-update-quorum-in-on_commit-trigger.md |  5 +
 src/box/alter.cc                              | 18 ++++
 src/box/replication.cc                        |  1 -
 ...pdate_quorum_in_on_commit_trigger_test.lua | 93 +++++++++++++++++++
 4 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 changelogs/unreleased/gh-10087-update-quorum-in-on_commit-trigger.md
 create mode 100644 test/replication-luatest/gh_10087_update_quorum_in_on_commit_trigger_test.lua

diff --git a/changelogs/unreleased/gh-10087-update-quorum-in-on_commit-trigger.md b/changelogs/unreleased/gh-10087-update-quorum-in-on_commit-trigger.md
new file mode 100644
index 0000000000..da8673a8b1
--- /dev/null
+++ b/changelogs/unreleased/gh-10087-update-quorum-in-on_commit-trigger.md
@@ -0,0 +1,5 @@
+## feature/replication
+
+* Now the synchronous replication quorum is updated after the cluster
+  reconfiguration change is confirmed by a quorum rather than immediately after
+  persisting the configuration change in the WAL (gh-10087).
diff --git a/src/box/alter.cc b/src/box/alter.cc
index 0259139c30..2d1565b26c 100644
--- a/src/box/alter.cc
+++ b/src/box/alter.cc
@@ -4424,6 +4424,15 @@ on_replace_cluster_clear_id(struct trigger *trigger, void * /* event */)
 	return 0;
 }
 
+/** Update the synchronous replication quorum. */
+static int
+on_replace_cluster_update_quorum(struct trigger * /* trigger */,
+				 void * /* event */)
+{
+	box_update_replication_synchro_quorum();
+	return 0;
+}
+
 /** Replica definition. */
 struct replica_def {
 	/** Instance ID. */
@@ -4639,6 +4648,15 @@ on_replace_dd_cluster_insert(const struct replica_def *new_def)
 			 tt_uuid_str(&replica->uuid));
 		return -1;
 	}
+	/*
+	 * Update the quorum only after commit. Otherwise the replica would have
+	 * to ack its own insertion.
+	 */
+	struct trigger *on_commit = txn_alter_trigger_new(
+		on_replace_cluster_update_quorum, NULL);
+	if (on_commit == NULL)
+		return -1;
+	txn_stmt_on_commit(stmt, on_commit);
 	struct trigger *on_rollback = txn_alter_trigger_new(
 		on_replace_cluster_clear_id, NULL);
 	if (on_rollback == NULL)
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 50cbe77ef0..2a26a446ee 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -376,7 +376,6 @@ replica_set_id(struct replica *replica, uint32_t replica_id)
 	say_info("assigned id %d to replica %s",
 		 replica->id, tt_uuid_str(&replica->uuid));
 	replica->anon = false;
-	box_update_replication_synchro_quorum();
 	box_broadcast_ballot();
 }
 
diff --git a/test/replication-luatest/gh_10087_update_quorum_in_on_commit_trigger_test.lua b/test/replication-luatest/gh_10087_update_quorum_in_on_commit_trigger_test.lua
new file mode 100644
index 0000000000..9f8ee32cf6
--- /dev/null
+++ b/test/replication-luatest/gh_10087_update_quorum_in_on_commit_trigger_test.lua
@@ -0,0 +1,93 @@
+local t = require('luatest')
+local replica_set = require('luatest.replica_set')
+local server = require('luatest.server')
+
+local g_one_member_cluster = t.group('one_member_cluster')
+local g_three_member_cluster = t.group('three_member_cluster')
+
+g_one_member_cluster.before_all(function(cg)
+    cg.master = server:new{alias = 'master'}
+    cg.master:start()
+    -- Make `_cluster` space synchronous.
+    cg.master:exec(function()
+        box.ctl.promote()
+        box.space._cluster:alter{is_sync = true}
+    end)
+end)
+
+-- Test that synchronous insertion into 1-member cluster works properly.
+g_one_member_cluster.test_insertion = function(cg)
+    cg.replica = server:new{alias = 'replica', box_cfg = {
+        replication = cg.master.net_box_uri,
+    }}
+    cg.replica:start()
+    cg.master:wait_for_downstream_to(cg.replica)
+    cg.replica:exec(function()
+        t.assert_not_equals(box.space._cluster:get{box.info.id}, nil)
+    end)
+end
+
+g_one_member_cluster.after_all(function(cg)
+    cg.master:drop()
+    if cg.replica ~= nil then
+        cg.replica:drop()
+    end
+end)
+
+g_three_member_cluster.before_all(function(cg)
+    cg.replica_set = replica_set:new{}
+    cg.master = cg.replica_set:build_and_add_server{alias = 'master'}
+    cg.master:start()
+    cg.replica_to_be_disabled =
+        cg.replica_set:build_and_add_server{alias = 'to_be_disabled',
+                                            box_cfg = {
+        replication = {
+            cg.master.net_box_uri,
+            server.build_listen_uri('replica', cg.replica_set.id),
+        },
+    }}
+    cg.replica = cg.replica_set:build_and_add_server{alias = 'replica',
+                                                     box_cfg = {
+        replication = {
+            cg.master.net_box_uri,
+            server.build_listen_uri('to_be_disabled', cg.replica_set.id),
+        },
+    }}
+    cg.replica_set:start()
+
+    -- Make `_cluster` space synchronous.
+    cg.master:exec(function()
+        box.ctl.promote()
+        box.space._cluster:alter{is_sync = true}
+    end)
+
+    cg.master:wait_for_downstream_to(cg.replica_to_be_disabled)
+    cg.master:wait_for_downstream_to(cg.replica)
+end)
+
+-- Test that synchronous insertion into 3-member cluster with 1 disabled node
+-- works properly.
+g_three_member_cluster.test_insertion = function(cg)
+    cg.replica_to_be_disabled:exec(function()
+        box.cfg{replication = ''}
+    end)
+    cg.replica_to_be_added =
+        cg.replica_set:build_and_add_server{alias = 'to_be_added',
+                                            box_cfg = {
+        replication = {
+            cg.master.net_box_uri,
+            server.build_listen_uri('to_be_disabled', cg.replica_set.id),
+            server.build_listen_uri('replica', cg.replica_set.id),
+        },
+    }}
+    cg.replica_to_be_added:start()
+    cg.master:wait_for_downstream_to(cg.replica)
+    cg.master:wait_for_downstream_to(cg.replica_to_be_added)
+    cg.replica_to_be_added:exec(function()
+        t.assert_not_equals(box.space._cluster:get{box.info.id}, nil)
+    end)
+end
+
+g_three_member_cluster.after_all(function(cg)
+    cg.replica_set:drop()
+end)
-- 
GitLab