From 298d5d404f0adccc3a84d0df9c7a4952935f7206 Mon Sep 17 00:00:00 2001 From: Georgy Moshkin <gmoshkin@picodata.io> Date: Wed, 27 Jul 2022 12:05:47 +0300 Subject: [PATCH] fix: used to panic during shut down when leader id is None --- src/main.rs | 8 ++++---- src/traft/failover.rs | 8 ++++++-- test/int/test_couple.py | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/main.rs b/src/main.rs index de9b76fb4c..979fe6d60a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -720,10 +720,6 @@ fn postjoin(args: &args::Run) { box_cfg.listen = Some(args.listen.clone()); tarantool::set_cfg(&box_cfg); - if let Err(e) = tarantool::on_shutdown(traft::failover::on_shutdown) { - tlog!(Error, "failed setting on_shutdown trigger: {e}"); - } - tlog!(Debug, "Getting a read barrier..."); loop { if node.status().leader_id == None { @@ -746,6 +742,10 @@ fn postjoin(args: &args::Run) { } tlog!(Info, "Read barrier aquired, raft is ready"); + if let Err(e) = tarantool::on_shutdown(traft::failover::on_shutdown) { + tlog!(Error, "failed setting on_shutdown trigger: {e}"); + } + let peer = traft::Storage::peer_by_raft_id(raft_id).unwrap().unwrap(); box_cfg.replication = traft::Storage::box_replication(&peer.replicaset_id, None).unwrap(); tarantool::set_cfg(&box_cfg); diff --git a/src/traft/failover.rs b/src/traft/failover.rs index e3c982b862..a1b9fb5c23 100644 --- a/src/traft/failover.rs +++ b/src/traft/failover.rs @@ -2,6 +2,7 @@ use std::time::{Duration, Instant}; use ::tarantool::fiber::sleep; use ::tarantool::proc; +use ::tarantool::unwrap_or; use crate::{stringify_cfunc, tarantool, tlog}; @@ -33,8 +34,11 @@ pub fn on_shutdown() { // will run until we get successfully deactivate or tarantool shuts down // the on_shutdown fiber (after 3 secs) loop { - let status = node::global().unwrap().status(); - let leader_id = status.leader_id.expect("leader_id deinitialized"); + let node = node::global().unwrap(); + let leader_id = unwrap_or!(node.status().leader_id, { + node.wait_status(); + continue; + }); let leader = Storage::peer_by_raft_id(leader_id).unwrap().unwrap(); let wait_before_retry = Duration::from_millis(300); let now = Instant::now(); diff --git a/test/int/test_couple.py b/test/int/test_couple.py index 3d0b900f7f..04d94e92f4 100644 --- a/test/int/test_couple.py +++ b/test/int/test_couple.py @@ -1,6 +1,7 @@ import funcy # type: ignore import pytest from conftest import Cluster, Instance +from time import sleep @funcy.retry(tries=20, timeout=0.1) @@ -179,3 +180,20 @@ def test_deactivation(cluster2: Cluster): assert raft_update_peer(i2, target=i2, is_active=True) == [{}] assert raft_update_peer(i2, target=i2, is_active=True) == [{}] + + +def test_gl119_panic_in_on_shutdown(cluster2: Cluster): + i1, i2 = cluster2.instances + + i2.call("picolib.raft_timeout_now", timeout=0.01) + assert i2.terminate() == 0 + + # second instance terminates first, so it becomes a follower + i2.terminate() + # terminate the leader, so the follower can't acquire the read barrier + i1.terminate() + + i2.start() + # wait for the follower to start acquiring the read barrier + sleep(1) + assert i2.terminate() == 0 -- GitLab