From 64c463e0366233eb3bad82e094831daa058b5888 Mon Sep 17 00:00:00 2001
From: Nick Zavaritsky <mejedi@gmail.com>
Date: Mon, 11 Jan 2016 20:50:45 +0300
Subject: [PATCH] Fix #1075: box.cfg blocks until rw mode is activated, 1.6

---
 src/box/box.cc                           | 22 ++++++++++++++++++++++
 test/replication-py/init_storage.result  |  4 ++++
 test/replication-py/init_storage.test.py | 15 ++++++++++++++-
 test/replication-py/readonly.result      |  4 ++++
 test/replication-py/readonly.test.py     | 16 +++++++++++++++-
 test/replication-py/replica.lua          |  5 ++++-
 6 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index fb6e93c984..3c25883d71 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -31,6 +31,7 @@
 #include "box/box.h"
 
 #include <say.h>
+#include "ipc.h"
 #include "iproto.h"
 #include "iproto_constants.h"
 #include "recovery.h"
@@ -86,6 +87,7 @@ static struct recover_row_ctx {
 bool snapshot_in_progress = false;
 static bool box_init_done = false;
 bool is_ro = true;
+struct ipc_channel *wait_rw;
 
 void
 recover_row_ctx_init(struct recover_row_ctx *ctx, size_t rows_per_wal)
@@ -175,6 +177,18 @@ void
 box_set_ro(bool ro)
 {
 	is_ro = ro;
+	if (ro == false && !ipc_channel_is_full(wait_rw))
+		ipc_channel_put(wait_rw, NULL); /* consumed by box_wait_rw() */
+}
+
+static void
+box_wait_rw() /* returns only once is_ro is false */
+{
+	void *msg; /* receives the NULL token put by box_set_ro() */
+	while (is_ro) { /* re-check the flag after every message */
+		ipc_channel_get(wait_rw, &msg);
+		assert(msg == NULL);
+	}
 }
 
 bool
@@ -939,6 +953,8 @@ box_free(void)
 #endif
 		engine_shutdown();
 	}
+	if (wait_rw)
+		ipc_channel_delete(wait_rw);
 }
 
 static void
@@ -985,6 +1001,10 @@ box_init(void)
 {
 	error_init();
 
+	wait_rw = ipc_channel_new(1);
+	if (wait_rw == NULL)
+		diag_raise();
+
 	tuple_init(cfg_getd("slab_alloc_arena"),
 		   cfg_geti("slab_alloc_minimal"),
 		   cfg_geti("slab_alloc_maximal"),
@@ -1086,6 +1106,8 @@ box_init(void)
 	/* Enter read-write mode. */
 	if (recovery->server_id > 0)
 		box_set_ro(false);
+	else
+		box_wait_rw();
 	title("running");
 	say_info("ready to accept requests");
 
diff --git a/test/replication-py/init_storage.result b/test/replication-py/init_storage.result
index 0e27d903be..759a1b86bc 100644
--- a/test/replication-py/init_storage.result
+++ b/test/replication-py/init_storage.result
@@ -17,6 +17,10 @@ box.space.test
 ---
 - null
 ...
+box_cfg_done
+---
+- false
+...
 -------------------------------------------------------------
 replica JOIN
 -------------------------------------------------------------
diff --git a/test/replication-py/init_storage.test.py b/test/replication-py/init_storage.test.py
index d94d4414f7..7083ea09da 100644
--- a/test/replication-py/init_storage.test.py
+++ b/test/replication-py/init_storage.test.py
@@ -23,10 +23,23 @@ replica = TarantoolServer(server.ini)
 replica.script = 'replication-py/replica.lua'
 replica.vardir = server.vardir #os.path.join(server.vardir, 'replica')
 replica.rpl_master = master
-replica.deploy()
+
+# #1075: box.once should wait before the server enters RW mode
+#
+# We expect the replica to get blocked in box.cfg{}, hence wait = False.
+# Since xlog files on master were deleted, they aren't delivered,
+# and replica waits indefinitely.
+#
+# Note: the replica waits for a log entry indicating that this very
+# replica has joined the cluster. Once the entry is fetched, we assume
+# the replica is reasonably up to date and switch it to RW mode. This
+# never happens in this particular test case.
+replica.deploy(wait = False)
 
 replica.admin('box.space.test')
 
+replica.admin('box_cfg_done') # box.cfg should still be blocked here
+
 replica.stop()
 replica.cleanup(True)
 
diff --git a/test/replication-py/readonly.result b/test/replication-py/readonly.result
index a7848717d0..954221155d 100644
--- a/test/replication-py/readonly.result
+++ b/test/replication-py/readonly.result
@@ -39,3 +39,7 @@ box.info.vclock[2]
 ---
 - null
 ...
+box_cfg_done
+---
+- false
+...
diff --git a/test/replication-py/readonly.test.py b/test/replication-py/readonly.test.py
index 0ecf684dd3..1b12cabe50 100644
--- a/test/replication-py/readonly.test.py
+++ b/test/replication-py/readonly.test.py
@@ -31,7 +31,18 @@ os.remove(wal)
 
 # Start replica without master
 server.stop()
-replica.start()
+
+# #1075: box.once should wait before the server enters RW mode
+#
+# We expect the replica to get blocked in box.cfg{}, hence wait = False.
+# Since neither xlog files nor master are available, the replica waits
+# indefinitely.
+#
+# Note: the replica monitors the _cluster table, which is synchronized
+# via replication. It enters RW mode once it discovers that, according
+# to the _cluster table, it has joined the cluster. This never happens
+# in this particular test case.
+replica.start(wait = False)
 replica.admin('box.cfg{replication_source = ""}')
 
 # Check that replica in read-only mode
@@ -41,6 +52,9 @@ replica.admin('box.info.server.lsn')
 replica.admin('space = box.schema.space.create("ro")')
 replica.admin('box.info.vclock[%d]' % replica_id)
 
+# Check that box.cfg hasn't returned yet
+replica.admin('box_cfg_done')
+
 replica.stop()
 replica.cleanup(True)
 server.deploy()
diff --git a/test/replication-py/replica.lua b/test/replication-py/replica.lua
index 3a08208e06..00ecd005ec 100644
--- a/test/replication-py/replica.lua
+++ b/test/replication-py/replica.lua
@@ -1,4 +1,7 @@
 #!/usr/bin/env tarantool
+box_cfg_done = false -- becomes true once box.cfg{} below has returned
+-- Listen first: the console must answer while box.cfg{} is still blocked.
+require('console').listen(os.getenv('ADMIN'))
 
 box.cfg({
     listen              = os.getenv("LISTEN"),
@@ -6,4 +9,4 @@ box.cfg({
     slab_alloc_arena    = 0.1,
 })
 
-require('console').listen(os.getenv('ADMIN'))
+box_cfg_done = true
-- 
GitLab