From a008e681d534a546ffdc8df83ad18d80644be0fd Mon Sep 17 00:00:00 2001
From: Georgy Moshkin <gmoshkin@picodata.io>
Date: Mon, 21 Nov 2022 14:09:55 +0300
Subject: [PATCH] fix: remove read barrier from postjoin

When an instance runs the `postjoin` function, it's supposed to have both
target and current grade Offline. Taking a read barrier in that state
makes no sense. Instead, an instance should set its target grade Online
as soon as possible. Ensuring it syncs is the responsibility of the governor.

The excessive read barrier affects the `restart_both` test. One of the
instances (i1, which becomes a follower) sends a `read_index` request to i2
(the leader). But i2 finds that the `self.commit_to_current_term()`
condition isn't met and
drops the request. As a result, i1 is obliged to wait for the 10s timeout
before retrying the `read_index` request, but the test times out even
sooner.
---
 src/main.rs             | 27 ++++-----------------------
 test/int/test_basics.py | 14 +++++++-------
 2 files changed, 11 insertions(+), 30 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 40b59e92cd..64a84b19f9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -958,28 +958,6 @@ fn postjoin(args: &args::Run, storage: Storage) {
     box_cfg.listen = Some(args.listen.clone());
     tarantool::set_cfg(&box_cfg);
 
-    tlog!(Debug, "Getting a read barrier...");
-    loop {
-        if node.status().leader_id.is_none() {
-            // This check doesn't guarantee anything. It only eliminates
-            // unnecesary requests that will fail for sure. For example,
-            // re-election still may be abrupt while `node.read_index()`
-            // implicitly yields.
-            node.wait_status();
-            continue;
-        }
-
-        let timeout = Duration::from_secs(10);
-        if let Err(e) = node.wait_for_read_state(timeout) {
-            tlog!(Debug, "unable to get a read barrier: {e}");
-            fiber::sleep(Duration::from_millis(100));
-            continue;
-        } else {
-            break;
-        }
-    }
-    tlog!(Info, "Read barrier aquired, raft is ready");
-
     if let Err(e) = tarantool::on_shutdown(traft::failover::on_shutdown) {
         tlog!(Error, "failed setting on_shutdown trigger: {e}");
     }
@@ -994,13 +972,16 @@ fn postjoin(args: &args::Run, storage: Storage) {
             .cluster_id()
             .unwrap()
             .expect("cluster_id must be persisted at the time of postjoin");
+        let Some(leader_id) = node.status().leader_id else {
+            fiber::sleep(Duration::from_millis(100));
+            continue
+        };
 
         tlog!(Info, "initiating self-activation of {}", peer.instance_id);
         let req = UpdatePeerRequest::new(peer.instance_id, cluster_id)
             .with_target_grade(TargetGradeVariant::Online)
             .with_failure_domain(args.failure_domain());
 
-        let Some(leader_id) = node.status().leader_id else { continue };
         let leader = storage.peers.get(&leader_id).unwrap();
 
         // It's necessary to call `proc_update_peer` remotely on a
diff --git a/test/int/test_basics.py b/test/int/test_basics.py
index 048f3ac66a..2e8afff420 100644
--- a/test/int/test_basics.py
+++ b/test/int/test_basics.py
@@ -222,13 +222,13 @@ def test_raft_log(instance: Instance):
 |  3  | 1  |1.0.3|Insert(cluster_state, ["desired_schema_version",0])|
 |  4  | 1  |     |AddNode(1)|
 |  5  | 2  |     |-|
-|  6  | 2  |1.1.2|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, Offline(0) -> Online(1), 6, {b})|
-|  7  | 2  |1.1.3|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, RaftSynced(1) -> Online(1), 7, {b})|
-|  8  | 2  |1.1.4|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, Replicated(1) -> Online(1), 8, {b})|
-|  9  | 2  |1.1.5|Insert(replicasets, ["r1","e0df68c5-e7f9-395f-86b3-30ad9e1b7b07","i1",1.0,0])|
-| 10  | 2  |1.1.6|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, ShardingInitialized(1) -> Online(1), 10, {b})|
-| 11  | 2  |1.1.7|Replace(cluster_state, ["vshard_bootstrapped",true])|
-| 12  | 2  |1.1.8|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, Online(1), 12, {b})|
+|  6  | 2  |1.1.1|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, Offline(0) -> Online(1), 6, {b})|
+|  7  | 2  |1.1.2|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, RaftSynced(1) -> Online(1), 7, {b})|
+|  8  | 2  |1.1.3|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, Replicated(1) -> Online(1), 8, {b})|
+|  9  | 2  |1.1.4|Insert(replicasets, ["r1","e0df68c5-e7f9-395f-86b3-30ad9e1b7b07","i1",1.0,0])|
+| 10  | 2  |1.1.5|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, ShardingInitialized(1) -> Online(1), 10, {b})|
+| 11  | 2  |1.1.6|Replace(cluster_state, ["vshard_bootstrapped",true])|
+| 12  | 2  |1.1.7|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, Online(1), 12, {b})|
 +-----+----+-----+--------+
 """.format(  # noqa: E501
         p=instance.port, b="{}"
-- 
GitLab