diff --git a/src/main.rs b/src/main.rs index de9b76fb4c2430ce0a3e0a67e8272607ba297337..979fe6d60a16a1f7f9f342e34884203a9b9edbc3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -720,10 +720,6 @@ fn postjoin(args: &args::Run) { box_cfg.listen = Some(args.listen.clone()); tarantool::set_cfg(&box_cfg); - if let Err(e) = tarantool::on_shutdown(traft::failover::on_shutdown) { - tlog!(Error, "failed setting on_shutdown trigger: {e}"); - } - tlog!(Debug, "Getting a read barrier..."); loop { if node.status().leader_id == None { @@ -746,6 +742,10 @@ fn postjoin(args: &args::Run) { } tlog!(Info, "Read barrier aquired, raft is ready"); + if let Err(e) = tarantool::on_shutdown(traft::failover::on_shutdown) { + tlog!(Error, "failed setting on_shutdown trigger: {e}"); + } + let peer = traft::Storage::peer_by_raft_id(raft_id).unwrap().unwrap(); box_cfg.replication = traft::Storage::box_replication(&peer.replicaset_id, None).unwrap(); tarantool::set_cfg(&box_cfg); diff --git a/src/traft/failover.rs b/src/traft/failover.rs index e3c982b8624f2fcb9ebfbb5817c0da697c3442ac..a1b9fb5c23b4d7b7d55ed00c4d27c683b10cb175 100644 --- a/src/traft/failover.rs +++ b/src/traft/failover.rs @@ -2,6 +2,7 @@ use std::time::{Duration, Instant}; use ::tarantool::fiber::sleep; use ::tarantool::proc; +use ::tarantool::unwrap_or; use crate::{stringify_cfunc, tarantool, tlog}; @@ -33,8 +34,11 @@ pub fn on_shutdown() { // will run until we get successfully deactivate or tarantool shuts down // the on_shutdown fiber (after 3 secs) loop { - let status = node::global().unwrap().status(); - let leader_id = status.leader_id.expect("leader_id deinitialized"); + let node = node::global().unwrap(); + let leader_id = unwrap_or!(node.status().leader_id, { + node.wait_status(); + continue; + }); let leader = Storage::peer_by_raft_id(leader_id).unwrap().unwrap(); let wait_before_retry = Duration::from_millis(300); let now = Instant::now(); diff --git a/test/int/test_couple.py b/test/int/test_couple.py index 3d0b900f7f33d5b02a648fae35a1a5e933e07e75..04d94e92f4e05ed6b170bae6b332538c80bc6134 100644 --- a/test/int/test_couple.py +++ b/test/int/test_couple.py @@ -1,6 +1,7 @@ import funcy # type: ignore import pytest from conftest import Cluster, Instance +from time import sleep @funcy.retry(tries=20, timeout=0.1) @@ -179,3 +180,20 @@ def test_deactivation(cluster2: Cluster): assert raft_update_peer(i2, target=i2, is_active=True) == [{}] assert raft_update_peer(i2, target=i2, is_active=True) == [{}] + + +def test_gl119_panic_in_on_shutdown(cluster2: Cluster): + i1, i2 = cluster2.instances + + i2.call("picolib.raft_timeout_now", timeout=0.01) + assert i2.terminate() == 0 + + # second instance terminates first, so it becomes a follower + i2.terminate() + # terminate the leader, so the follower can't acquire the read barrier + i1.terminate() + + i2.start() + # wait for the follower to start acquiring the read barrier + sleep(1) + assert i2.terminate() == 0