From 737dc12a0bf5bc1d47b058b34128f0fc98bf77ca Mon Sep 17 00:00:00 2001
From: Georgiy Lebedev <g.lebedev@tarantool.org>
Date: Mon, 3 Jun 2024 12:06:24 +0300
Subject: [PATCH] box: update synchro quorum in on_commit trigger instead of
 on_replace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, we update the synchronous replication quorum from the
`on_replace` trigger of the `_cluster` space when registering a new
replica. However, during the join process, the replica cannot ack its own
insertion into the `_cluster` space. In the scope of #9723, we are going to
enable synchronous replication for most of the system spaces, including the
`_cluster` space. There are several problems with this:

1. Joining a replica to a 1-member cluster without manually changing the
quorum won't work: it is impossible to commit the insertion into the
`_cluster` space with only 1 node, since the quorum will be equal to 2 right
after the insertion.

2. Joining a replica to a 3-member cluster may fail: the quorum will become
equal to 3 right after the insertion, and the newly joined replica cannot ACK
its own insertion into the `_cluster` space — if one of the original 3
nodes fails, then reconfiguration will fail.

Generally speaking, it will be impossible to join a new replica to the
cluster, if a quorum, which includes the newly added replica (which cannot
ACK), cannot be gathered.

To solve these problems, let's update the quorum in the `on_commit`
trigger. This way we’ll be able to insert a node regardless of the current
configuration. This somewhat contradicts the Raft specification, which
requires application of all configuration changes in the `on_replace`
trigger (i.e., as soon as they are persisted in the WAL, without quorum
confirmation), but still forbids several reconfigurations at the same time.

Closes #10087

NO_DOC=<no special documentation page devoted to cluster reconfiguration>

(cherry picked from commit 29d1c0fa6c76ff06d748d8916c5ffbbc9e4a55e9)
---
 ...0087-update-quorum-in-on_commit-trigger.md |  7 ++
 src/box/alter.cc                              | 18 ++++
 src/box/replication.cc                        |  1 -
 ...pdate_quorum_in_on_commit_trigger_test.lua | 94 +++++++++++++++++++
 4 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 changelogs/unreleased/gh-10087-update-quorum-in-on_commit-trigger.md
 create mode 100644 test/replication-luatest/gh_10087_update_quorum_in_on_commit_trigger_test.lua

diff --git a/changelogs/unreleased/gh-10087-update-quorum-in-on_commit-trigger.md b/changelogs/unreleased/gh-10087-update-quorum-in-on_commit-trigger.md
new file mode 100644
index 0000000000..615f973866
--- /dev/null
+++ b/changelogs/unreleased/gh-10087-update-quorum-in-on_commit-trigger.md
@@ -0,0 +1,7 @@
+## bugfix/replication
+
+* Fixed the inability to add a new replica to the replicaset if the user has
+  manually made space `_cluster` synchronous. Now the synchronous replication
+  quorum is updated after the `_cluster` change is confirmed by a quorum rather
+  than immediately after persisting the configuration change in the WAL
+  (gh-10087).
diff --git a/src/box/alter.cc b/src/box/alter.cc
index e135ea1476..36f97e7202 100644
--- a/src/box/alter.cc
+++ b/src/box/alter.cc
@@ -4128,6 +4128,15 @@ on_replace_cluster_clear_id(struct trigger *trigger, void * /* event */)
 	return 0;
 }
 
+/** Update the synchronous replication quorum. */
+static int
+on_replace_cluster_update_quorum(struct trigger * /* trigger */,
+				 void * /* event */)
+{
+	box_update_replication_synchro_quorum();
+	return 0;
+}
+
 /**
  * A trigger invoked on replace in the space _cluster,
  * which contains cluster configuration.
@@ -4203,6 +4212,15 @@ on_replace_dd_cluster(struct trigger *trigger, void *event)
 			diag_set(ClientError, ER_UNSUPPORTED, "Tarantool", msg);
 			return -1;
 		}
+		/*
+		 * Update the quorum only after commit. Otherwise the replica
+		 * would have to ack its own insertion.
+		 */
+		struct trigger *on_commit = txn_alter_trigger_new(
+			on_replace_cluster_update_quorum, NULL);
+		if (on_commit == NULL)
+			return -1;
+		txn_stmt_on_commit(stmt, on_commit);
 		struct trigger *on_rollback = txn_alter_trigger_new(
 			on_replace_cluster_clear_id, NULL);
 		if (on_rollback == NULL)
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 5730569a4c..5b7c76afd4 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -360,7 +360,6 @@ replica_set_id(struct replica *replica, uint32_t replica_id)
 	say_info("assigned id %d to replica %s",
 		 replica->id, tt_uuid_str(&replica->uuid));
 	replica->anon = false;
-	box_update_replication_synchro_quorum();
 	box_broadcast_ballot();
 }
 
diff --git a/test/replication-luatest/gh_10087_update_quorum_in_on_commit_trigger_test.lua b/test/replication-luatest/gh_10087_update_quorum_in_on_commit_trigger_test.lua
new file mode 100644
index 0000000000..7fc1ed70b3
--- /dev/null
+++ b/test/replication-luatest/gh_10087_update_quorum_in_on_commit_trigger_test.lua
@@ -0,0 +1,94 @@
+local t = require('luatest')
+local replica_set = require('luatest.replica_set')
+local server = require('luatest.server')
+
+local g_one_member_cluster = t.group('one_member_cluster')
+local g_three_member_cluster = t.group('three_member_cluster')
+
+g_one_member_cluster.before_all(function(cg)
+    cg.master = server:new{alias = 'master'}
+    cg.master:start()
+    -- Make `_cluster` space synchronous.
+    cg.master:exec(function()
+        box.ctl.promote()
+        box.space._cluster:alter{is_sync = true}
+    end)
+end)
+
+-- Test that synchronous insertion into 1-member cluster works properly.
+g_one_member_cluster.test_insertion = function(cg)
+    cg.replica = server:new{alias = 'replica', box_cfg = {
+        replication = cg.master.net_box_uri,
+    }}
+    cg.replica:start()
+    cg.master:wait_for_downstream_to(cg.replica)
+    cg.replica:exec(function()
+        t.assert_not_equals(box.space._cluster:get{box.info.id}, nil)
+    end)
+end
+
+g_one_member_cluster.after_all(function(cg)
+    cg.master:drop()
+    if cg.replica ~= nil then
+        cg.replica:drop()
+    end
+end)
+
+g_three_member_cluster.before_all(function(cg)
+    cg.replica_set = replica_set:new{}
+    cg.master = cg.replica_set:build_and_add_server{alias = 'master'}
+    cg.master:start()
+    cg.replica_to_be_disabled =
+        cg.replica_set:build_and_add_server{alias = 'to_be_disabled',
+                                            box_cfg = {
+        replication = {
+            cg.master.net_box_uri,
+            server.build_listen_uri('replica', cg.replica_set.id),
+        },
+    }}
+    cg.replica = cg.replica_set:build_and_add_server{alias = 'replica',
+                                                     box_cfg = {
+        replication = {
+            cg.master.net_box_uri,
+            server.build_listen_uri('to_be_disabled', cg.replica_set.id),
+        },
+    }}
+    cg.replica_set:start()
+
+    -- Make `_cluster` space synchronous.
+    cg.master:exec(function()
+        box.ctl.promote()
+        box.space._cluster:alter{is_sync = true}
+    end)
+
+    cg.master:wait_for_downstream_to(cg.replica_to_be_disabled)
+    cg.master:wait_for_downstream_to(cg.replica)
+end)
+
+-- Test that synchronous insertion into 3-member cluster with 1 disabled node
+-- works properly.
+g_three_member_cluster.test_insertion = function(cg)
+    cg.replica_to_be_disabled:exec(function()
+        box.cfg{replication = ''}
+    end)
+    cg.replica_to_be_added =
+        cg.replica_set:build_and_add_server{alias = 'to_be_added',
+                                            box_cfg = {
+        replication = {
+            cg.master.net_box_uri,
+            server.build_listen_uri('to_be_disabled', cg.replica_set.id),
+            server.build_listen_uri('replica', cg.replica_set.id),
+        },
+        replication_sync_timeout = 0,
+    }}
+    cg.replica_to_be_added:start()
+    cg.master:wait_for_downstream_to(cg.replica)
+    cg.master:wait_for_downstream_to(cg.replica_to_be_added)
+    cg.replica_to_be_added:exec(function()
+        t.assert_not_equals(box.space._cluster:get{box.info.id}, nil)
+    end)
+end
+
+g_three_member_cluster.after_all(function(cg)
+    cg.replica_set:drop()
+end)
-- 
GitLab