From a008e681d534a546ffdc8df83ad18d80644be0fd Mon Sep 17 00:00:00 2001 From: Georgy Moshkin <gmoshkin@picodata.io> Date: Mon, 21 Nov 2022 14:09:55 +0300 Subject: [PATCH] fix: remove read barrier from postjoin When instance runs `postjoin` function, it's supposed to have both target and current grade Offline. Taking a read barrier in that state makes no sense. Instead, an instance should set target grade online asap. Ensuring it syncs is the responsibility of the governor. Excessive read barrier affects `restart_both` test. One of the instances (i1 who becomes a follower) sends a `read_index` request to i2 (leader). But i2 reveals `self.commit_to_current_term()` condition isn't met and drops the request. As a result, i1 is obliged to wait for 10s timeout before retrying `read_index` request again, but the test times out even sooner. --- src/main.rs | 27 ++++----------------------- test/int/test_basics.py | 14 +++++++------- 2 files changed, 11 insertions(+), 30 deletions(-) diff --git a/src/main.rs b/src/main.rs index 40b59e92cd..64a84b19f9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -958,28 +958,6 @@ fn postjoin(args: &args::Run, storage: Storage) { box_cfg.listen = Some(args.listen.clone()); tarantool::set_cfg(&box_cfg); - tlog!(Debug, "Getting a read barrier..."); - loop { - if node.status().leader_id.is_none() { - // This check doesn't guarantee anything. It only eliminates - // unnecesary requests that will fail for sure. For example, - // re-election still may be abrupt while `node.read_index()` - // implicitly yields. 
- node.wait_status(); - continue; - } - - let timeout = Duration::from_secs(10); - if let Err(e) = node.wait_for_read_state(timeout) { - tlog!(Debug, "unable to get a read barrier: {e}"); - fiber::sleep(Duration::from_millis(100)); - continue; - } else { - break; - } - } - tlog!(Info, "Read barrier aquired, raft is ready"); - if let Err(e) = tarantool::on_shutdown(traft::failover::on_shutdown) { tlog!(Error, "failed setting on_shutdown trigger: {e}"); } @@ -994,13 +972,16 @@ fn postjoin(args: &args::Run, storage: Storage) { .cluster_id() .unwrap() .expect("cluster_id must be persisted at the time of postjoin"); + let Some(leader_id) = node.status().leader_id else { + fiber::sleep(Duration::from_millis(100)); + continue + }; tlog!(Info, "initiating self-activation of {}", peer.instance_id); let req = UpdatePeerRequest::new(peer.instance_id, cluster_id) .with_target_grade(TargetGradeVariant::Online) .with_failure_domain(args.failure_domain()); - let Some(leader_id) = node.status().leader_id else { continue }; let leader = storage.peers.get(&leader_id).unwrap(); // It's necessary to call `proc_update_peer` remotely on a diff --git a/test/int/test_basics.py b/test/int/test_basics.py index 048f3ac66a..2e8afff420 100644 --- a/test/int/test_basics.py +++ b/test/int/test_basics.py @@ -222,13 +222,13 @@ def test_raft_log(instance: Instance): | 3 | 1 |1.0.3|Insert(cluster_state, ["desired_schema_version",0])| | 4 | 1 | |AddNode(1)| | 5 | 2 | |-| -| 6 | 2 |1.1.2|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, Offline(0) -> Online(1), 6, {b})| -| 7 | 2 |1.1.3|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, RaftSynced(1) -> Online(1), 7, {b})| -| 8 | 2 |1.1.4|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, Replicated(1) -> Online(1), 8, {b})| -| 9 | 2 |1.1.5|Insert(replicasets, ["r1","e0df68c5-e7f9-395f-86b3-30ad9e1b7b07","i1",1.0,0])| -| 10 | 2 |1.1.6|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, ShardingInitialized(1) -> Online(1), 10, {b})| -| 11 | 2 |1.1.7|Replace(cluster_state, ["vshard_bootstrapped",true])| 
-| 12 | 2 |1.1.8|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, Online(1), 12, {b})| +| 6 | 2 |1.1.1|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, Offline(0) -> Online(1), 6, {b})| +| 7 | 2 |1.1.2|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, RaftSynced(1) -> Online(1), 7, {b})| +| 8 | 2 |1.1.3|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, Replicated(1) -> Online(1), 8, {b})| +| 9 | 2 |1.1.4|Insert(replicasets, ["r1","e0df68c5-e7f9-395f-86b3-30ad9e1b7b07","i1",1.0,0])| +| 10 | 2 |1.1.5|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, ShardingInitialized(1) -> Online(1), 10, {b})| +| 11 | 2 |1.1.6|Replace(cluster_state, ["vshard_bootstrapped",true])| +| 12 | 2 |1.1.7|PersistPeer(i1, 1, r1, 127.0.0.1:{p}, Online(1), 12, {b})| +-----+----+-----+--------+ """.format( # noqa: E501 p=instance.port, b="{}" -- GitLab