From cd627f26526d7da8facda264912e2a0f5338a79a Mon Sep 17 00:00:00 2001
From: Serge Petrenko <sergepetrenko@tarantool.org>
Date: Wed, 28 Aug 2019 11:25:00 +0300
Subject: [PATCH] replication: enter orphan mode on manual replication
 configuration chage

Currently we only enter orphan mode when instance fails to connect to
replication_connect_quorum remote instances during local recovery.
On bootstrap and manual replication configuration change an error is
thrown. We better enter orphan mode on manual config change, and leave
it only in case we managed to sync with replication_connect_quorum
instances.

Closes #4424

@TarantoolBot document
Title: document reaction on error in replication configuration change.

Now when issuing `box.cfg{replication={uri1, uri2, ...}}` and failing to
sync with replication_connect_quorum remote instances, the server will
throw an error, if it is bootstrapping, and just set its state to orphan
in all other cases (recovering from existing xlog/snap files or manually
changing box.cfg.replication on the fly). To leave orphan mode, you may
wait until the server manages to sync with replication_connect_quorum
instances.
In order to leave orphan mode you need to make the server sync with
enough instances. To do so, you may either:
1) set replication_connect_quorum to a lower value
2) reset box.cfg.replication to exclude instances that cannot
   be reached or synced with
3) just set box.cfg.replication to "" (empty string)

(cherry picked from commit 5a0cfe025735dbb12486971c299803ae9a3ff46b)
---
 src/box/box.cc                 |  14 +++--
 src/box/box.h                  |   7 +++
 src/box/replication.cc         |  11 ++++
 test/replication/misc.result   | 104 ++++++++++++++++++++++++++++++++-
 test/replication/misc.test.lua |  41 ++++++++++++-
 5 files changed, 168 insertions(+), 9 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index 80249919ef..13d9c41eb0 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -257,7 +257,7 @@ box_wait_ro(bool ro, double timeout)
 }
 
 void
-box_set_orphan(bool orphan)
+box_do_set_orphan(bool orphan)
 {
 	if (is_orphan == orphan)
 		return; /* nothing to do */
@@ -266,7 +266,12 @@ box_set_orphan(bool orphan)
 
 	is_orphan = orphan;
 	fiber_cond_broadcast(&ro_cond);
+}
 
+void
+box_set_orphan(bool orphan)
+{
+	box_do_set_orphan(orphan);
 	/* Update the title to reflect the new status. */
 	if (is_orphan) {
 		say_crit("entering orphan mode");
@@ -699,11 +704,10 @@ box_set_replication(void)
 	box_check_replication();
 	/*
 	 * Try to connect to all replicas within the timeout period.
-	 * The configuration will succeed as long as we've managed
-	 * to connect to at least replication_connect_quorum
-	 * masters.
+	 * Stay in orphan mode in case we fail to connect to at least
+	 * 'replication_connect_quorum' remote instances.
 	 */
-	box_sync_replication(true);
+	box_sync_replication(false);
 	/* Follow replica */
 	replicaset_follow();
 	/* Wait until appliers are in sync */
diff --git a/src/box/box.h b/src/box/box.h
index ddcfbe2e59..ccd527bd54 100644
--- a/src/box/box.h
+++ b/src/box/box.h
@@ -127,6 +127,13 @@ box_wait_ro(bool ro, double timeout);
 void
 box_set_orphan(bool orphan);
 
+/**
+ * Set orphan mode but don't update instance title.
+ * \sa box_set_orphan
+ */
+void
+box_do_set_orphan(bool orphan);
+
 /**
  * Iterate over all spaces and save them to the
  * snapshot file.
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 28f7acedce..d691ce4876 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -610,6 +610,17 @@ replicaset_connect(struct applier **appliers, int count,
 
 	say_info("connecting to %d replicas", count);
 
+	if (!connect_quorum) {
+		/*
+		 * Enter orphan mode on configuration change and
+		 * only leave it when we manage to sync with
+		 * replicaset_quorum instances. Don't change
+		 * title though, it should be 'loading' during
+		 * local recovery.
+		 */
+		box_do_set_orphan(true);
+	}
+
 	/*
 	 * Simultaneously connect to remote peers to receive their UUIDs
 	 * and fill the resulting set:
diff --git a/test/replication/misc.result b/test/replication/misc.result
index 0a57edda59..ae72ce3e44 100644
--- a/test/replication/misc.result
+++ b/test/replication/misc.result
@@ -18,10 +18,19 @@ replication_connect_timeout = box.cfg.replication_connect_timeout
 box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication={}}
 ---
 ...
+box.cfg{replication_connect_quorum=2}
+---
+...
 box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}}
 ---
-- error: 'Incorrect value for option ''replication'': failed to connect to one or
-    more replicas'
+...
+box.info.status
+---
+- orphan
+...
+box.info.ro
+---
+- true
 ...
 -- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently
 fiber = require('fiber')
@@ -47,8 +56,16 @@ c:get()
 ---
 - true
 ...
-box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout}
+box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout}
+---
+...
+box.info.status
+---
+- running
+...
+box.info.ro
 ---
+- false
 ...
 -- gh-3111 - Allow to rebootstrap a replica from a read-only master
 replica_uuid = uuid.new()
@@ -729,3 +746,84 @@ test_run:cleanup_cluster()
 box.schema.user.revoke('guest', 'replication')
 ---
 ...
+--
+-- gh-4424 Always enter orphan mode on error in replication
+-- configuration change.
+--
+replication_connect_timeout = box.cfg.replication_connect_timeout
+---
+...
+replication_connect_quorum = box.cfg.replication_connect_quorum
+---
+...
+box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1}
+---
+...
+box.info.status
+---
+- orphan
+...
+box.info.ro
+---
+- true
+...
+-- reset replication => leave orphan mode
+box.cfg{replication=""}
+---
+...
+box.info.status
+---
+- running
+...
+box.info.ro
+---
+- false
+...
+-- no switch to orphan when quorum == 0
+box.cfg{replication="12345", replication_connect_quorum=0}
+---
+...
+box.info.status
+---
+- running
+...
+box.info.ro
+---
+- false
+...
+-- we could connect to one out of two replicas. Set orphan.
+box.cfg{replication_connect_quorum=2}
+---
+...
+box.cfg{replication={box.cfg.listen, "12345"}}
+---
+...
+box.info.status
+---
+- orphan
+...
+box.info.ro
+---
+- true
+...
+-- lower quorum => leave orphan mode
+box.cfg{replication_connect_quorum=1}
+---
+...
+box.info.status
+---
+- running
+...
+box.info.ro
+---
+- false
+...
+box.cfg{replication=""}
+---
+...
+box.cfg{replication_connect_timeout=replication_connect_timeout}
+---
+...
+box.cfg{replication_connect_quorum=replication_connect_quorum}
+---
+...
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
index 99e9955093..16e7e9e42e 100644
--- a/test/replication/misc.test.lua
+++ b/test/replication/misc.test.lua
@@ -8,7 +8,10 @@ box.schema.user.grant('guest', 'replication')
 replication_timeout = box.cfg.replication_timeout
 replication_connect_timeout = box.cfg.replication_connect_timeout
 box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication={}}
+box.cfg{replication_connect_quorum=2}
 box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}}
+box.info.status
+box.info.ro
 
 -- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently
 fiber = require('fiber')
@@ -19,7 +22,9 @@ f()
 c:get()
 c:get()
 
-box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout}
+box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout}
+box.info.status
+box.info.ro
 
 -- gh-3111 - Allow to rebootstrap a replica from a read-only master
 replica_uuid = uuid.new()
@@ -293,3 +298,37 @@ test_run:cmd("cleanup server replica")
 test_run:cmd("delete server replica")
 test_run:cleanup_cluster()
 box.schema.user.revoke('guest', 'replication')
+
+--
+-- gh-4424 Always enter orphan mode on error in replication
+-- configuration change.
+--
+replication_connect_timeout = box.cfg.replication_connect_timeout
+replication_connect_quorum = box.cfg.replication_connect_quorum
+box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1}
+box.info.status
+box.info.ro
+-- reset replication => leave orphan mode
+box.cfg{replication=""}
+box.info.status
+box.info.ro
+-- no switch to orphan when quorum == 0
+box.cfg{replication="12345", replication_connect_quorum=0}
+box.info.status
+box.info.ro
+
+-- we could connect to one out of two replicas. Set orphan.
+box.cfg{replication_connect_quorum=2}
+box.cfg{replication={box.cfg.listen, "12345"}}
+box.info.status
+box.info.ro
+-- lower quorum => leave orphan mode
+box.cfg{replication_connect_quorum=1}
+box.info.status
+box.info.ro
+
+box.cfg{replication=""}
+
+
+box.cfg{replication_connect_timeout=replication_connect_timeout}
+box.cfg{replication_connect_quorum=replication_connect_quorum}
-- 
GitLab