From 298d5d404f0adccc3a84d0df9c7a4952935f7206 Mon Sep 17 00:00:00 2001
From: Georgy Moshkin <gmoshkin@picodata.io>
Date: Wed, 27 Jul 2022 12:05:47 +0300
Subject: [PATCH] fix: used to panic during shutdown when leader id is None

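The on_shutdown trigger used to be installed in postjoin() before the
read barrier was acquired. If the instance was terminated at that point,
the trigger ran while status.leader_id was still None, and the
.expect("leader_id deinitialized") in on_shutdown() panicked.

Install the trigger only after the read barrier is acquired and, when the
leader is not known, make on_shutdown() wait for the raft status to
change and retry instead of panicking. Also add a regression test
(test_gl119_panic_in_on_shutdown).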
---
 src/main.rs             |  8 ++++----
 src/traft/failover.rs   | 10 ++++++++--
 test/int/test_couple.py | 15 +++++++++++++++
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index de9b76fb4c..979fe6d60a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -720,10 +720,6 @@ fn postjoin(args: &args::Run) {
     box_cfg.listen = Some(args.listen.clone());
     tarantool::set_cfg(&box_cfg);
 
-    if let Err(e) = tarantool::on_shutdown(traft::failover::on_shutdown) {
-        tlog!(Error, "failed setting on_shutdown trigger: {e}");
-    }
-
     tlog!(Debug, "Getting a read barrier...");
     loop {
         if node.status().leader_id == None {
@@ -746,6 +742,10 @@ fn postjoin(args: &args::Run) {
     }
     tlog!(Info, "Read barrier aquired, raft is ready");
 
+    if let Err(e) = tarantool::on_shutdown(traft::failover::on_shutdown) {
+        tlog!(Error, "failed setting on_shutdown trigger: {e}");
+    }
+
     let peer = traft::Storage::peer_by_raft_id(raft_id).unwrap().unwrap();
     box_cfg.replication = traft::Storage::box_replication(&peer.replicaset_id, None).unwrap();
     tarantool::set_cfg(&box_cfg);
diff --git a/src/traft/failover.rs b/src/traft/failover.rs
index e3c982b862..a1b9fb5c23 100644
--- a/src/traft/failover.rs
+++ b/src/traft/failover.rs
@@ -2,6 +2,7 @@ use std::time::{Duration, Instant};
 
 use ::tarantool::fiber::sleep;
 use ::tarantool::proc;
+use ::tarantool::unwrap_or;
 
 use crate::{stringify_cfunc, tarantool, tlog};
 
@@ -33,8 +34,13 @@ pub fn on_shutdown() {
     // will run until we get successfully deactivate or tarantool shuts down
     // the on_shutdown fiber (after 3 secs)
     loop {
-        let status = node::global().unwrap().status();
-        let leader_id = status.leader_id.expect("leader_id deinitialized");
+        let node = node::global().unwrap();
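+        // Leader may be unknown (e.g. shutting down before the read barrier
+        // was acquired): wait for a raft status change and retry.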
+        let leader_id = unwrap_or!(node.status().leader_id, {
+            node.wait_status();
+            continue;
+        });
         let leader = Storage::peer_by_raft_id(leader_id).unwrap().unwrap();
         let wait_before_retry = Duration::from_millis(300);
         let now = Instant::now();
diff --git a/test/int/test_couple.py b/test/int/test_couple.py
index 3d0b900f7f..04d94e92f4 100644
--- a/test/int/test_couple.py
+++ b/test/int/test_couple.py
@@ -1,6 +1,7 @@
 import funcy  # type: ignore
 import pytest
 from conftest import Cluster, Instance
+from time import sleep
 
 
 @funcy.retry(tries=20, timeout=0.1)
@@ -179,3 +180,17 @@ def test_deactivation(cluster2: Cluster):
 
     assert raft_update_peer(i2, target=i2, is_active=True) == [{}]
     assert raft_update_peer(i2, target=i2, is_active=True) == [{}]
+
+
+def test_gl119_panic_in_on_shutdown(cluster2: Cluster):
+    i1, i2 = cluster2.instances
+
+    i2.call("picolib.raft_timeout_now", timeout=0.01)
+    assert i2.terminate() == 0
+
+    # second instance terminates first, so it becomes a follower
+    i2.terminate()
+    # terminate the leader, so the follower can't acquire the read barrier
+    i1.terminate()
+
+    i2.start()
+    # wait for the follower to start acquiring the read barrier
+    sleep(1)
+    assert i2.terminate() == 0
-- 
GitLab