From cb4d0fcdf08707a30750763bae104d92208bbded Mon Sep 17 00:00:00 2001
From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
Date: Sun, 29 Sep 2019 18:06:22 +0200
Subject: [PATCH] replication: recfg with 0 quorum returns immediately

Replication quorum 0 not only affects orphan status, but also,
according to documentation, makes box.cfg() return immediately
regardless of whether connections to upstreams are established.

It was not so before the patch. What is worse, even with a non-zero
quorum the instance was blocked on reconfiguration for
replication_connect_timeout seconds if at least one node was not
connected.

Now the quorum is respected on reconfiguration. On bootstrap it is
still impossible to return earlier than
replication_connect_timeout, because the nodes need to choose the
cluster settings. Starting too early would make that impossible -
the cluster's participants would just start and choose different
cluster UUIDs.

Closes #3760

(cherry picked from commit c6bea65f8ef5f6c737cf70c0127189d0ebcbc36e)
---
 src/box/replication.cc         | 14 ++++++++++++++
 test/replication/misc.result   | 27 ++++++++++++++++++++++++---
 test/replication/misc.test.lua | 24 +++++++++++++++++++-----
 3 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/src/box/replication.cc b/src/box/replication.cc
index 6fcc56fe37..ee102a5978 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -659,6 +659,20 @@ replicaset_connect(struct applier **appliers, int count,
 	}
 
 	while (state.connected < count) {
+		/*
+		 * After a quorum is reached, it is considered
+		 * enough to proceed. Except if a connection is
+		 * critical.
+		 * Connection *is* critical even with 0 quorum when
+		 * the instance starts for the first time and needs
+		 * to choose replicaset UUID, fill _cluster, etc.
+		 * If 0 quorum were allowed to return immediately
+		 * even at first start, it would be impossible to
+		 * bootstrap a replicaset - all nodes would start
+		 * immediately and choose different cluster UUIDs.
+		 */
+		if (state.connected >= quorum && !connect_quorum)
+			break;
 		double wait_start = ev_monotonic_now(loop());
 		if (fiber_cond_wait_timeout(&state.wakeup, timeout) != 0)
 			break;
diff --git a/test/replication/misc.result b/test/replication/misc.result
index ae72ce3e44..3905cc49ed 100644
--- a/test/replication/misc.result
+++ b/test/replication/misc.result
@@ -818,12 +818,33 @@ box.info.ro
 ---
 - false
 ...
-box.cfg{replication=""}
+--
+-- gh-3760: replication quorum 0 on reconfiguration should return
+-- from box.cfg immediately.
+--
+replication = box.cfg.replication
+---
+...
+box.cfg{                                                        \
+    replication = {},                                           \
+    replication_connect_quorum = 0,                             \
+    replication_connect_timeout = 1000000                       \
+}
 ---
 ...
-box.cfg{replication_connect_timeout=replication_connect_timeout}
+-- The call below would hang, if quorum 0 is ignored, or checked
+-- too late.
+box.cfg{replication = {'localhost:12345'}}
 ---
 ...
-box.cfg{replication_connect_quorum=replication_connect_quorum}
+box.info.status
+---
+- running
+...
+box.cfg{                                                        \
+    replication = replication,                                  \
+    replication_connect_quorum = replication_connect_quorum,    \
+    replication_connect_timeout = replication_connect_timeout   \
+}
 ---
 ...
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
index 16e7e9e42e..696564f94a 100644
--- a/test/replication/misc.test.lua
+++ b/test/replication/misc.test.lua
@@ -327,8 +327,22 @@ box.cfg{replication_connect_quorum=1}
 box.info.status
 box.info.ro
 
-box.cfg{replication=""}
-
-
-box.cfg{replication_connect_timeout=replication_connect_timeout}
-box.cfg{replication_connect_quorum=replication_connect_quorum}
+--
+-- gh-3760: replication quorum 0 on reconfiguration should return
+-- from box.cfg immediately.
+--
+replication = box.cfg.replication
+box.cfg{                                                        \
+    replication = {},                                           \
+    replication_connect_quorum = 0,                             \
+    replication_connect_timeout = 1000000                       \
+}
+-- The call below would hang, if quorum 0 is ignored, or checked
+-- too late.
+box.cfg{replication = {'localhost:12345'}}
+box.info.status
+box.cfg{                                                        \
+    replication = replication,                                  \
+    replication_connect_quorum = replication_connect_quorum,    \
+    replication_connect_timeout = replication_connect_timeout   \
+}
-- 
GitLab