From 31bf1bc25297e0b4da6e75797e91a6f4a711f5f5 Mon Sep 17 00:00:00 2001
From: Georgy Moshkin <gmoshkin@picodata.io>
Date: Fri, 3 Jun 2022 18:37:33 +0300
Subject: [PATCH] fix(discovery): don't fail if raft node is ready but
 leader_id is not

If proc_discover is invoked after raft node was initialized but before
raft leader was elected, it would return an error before this commit.
Because of that it was impossible to restart the whole cluster at once.

This commit change proc_discover such that in case leader_id is not
ready, the normal discovery algorithm takes place.

Closes #93
---
 src/discovery.rs        | 11 +++++------
 src/traft/node.rs       |  9 ---------
 test/int/test_couple.py |  5 ++---
 3 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/src/discovery.rs b/src/discovery.rs
index c919659d6f..36a55b3bb0 100644
--- a/src/discovery.rs
+++ b/src/discovery.rs
@@ -199,18 +199,17 @@ fn proc_discover<'a>(
     request: Request,
     request_to: Address,
 ) -> Result<Cow<'a, Response>, Box<dyn StdError>> {
-    if let Ok(node) = traft::node::global() {
+    let ready_ids = traft::node::global().ok().and_then(|node| {
         let status = node.status();
-        let id = status.id;
-        let leader_id = status
-            .leader_id
-            .ok_or_else(|| format!("leader_id is unknown for node {id}"))?;
+        status.leader_id.map(|leader_id| (leader_id, status.id))
+    });
+    if let Some((leader_id, id)) = ready_ids {
         let leader = traft::Storage::peer_by_raft_id(leader_id)?.ok_or_else(|| {
             format!("leader_id is present ({leader_id}) but it's address is unknown for node {id}")
         })?;
         Ok(Cow::Owned(Response::Done(Role::new(
             leader.peer_address,
-            status.am_leader(),
+            leader_id == id,
         ))))
     } else {
         let discovery = discovery.as_mut().ok_or("discovery uninitialized")?;
diff --git a/src/traft/node.rs b/src/traft/node.rs
index b33906cc16..883cf98d95 100644
--- a/src/traft/node.rs
+++ b/src/traft/node.rs
@@ -66,15 +66,6 @@ pub struct Status {
     pub is_ready: bool,
 }
 
-impl Status {
-    pub fn am_leader(&self) -> bool {
-        match self.leader_id {
-            Some(id) => self.id == id,
-            None => false,
-        }
-    }
-}
-
 /// The heart of `traft` module - the Node.
 #[derive(Debug)]
 pub struct Node {
diff --git a/test/int/test_couple.py b/test/int/test_couple.py
index eefaac1136..e9ac3e62c1 100644
--- a/test/int/test_couple.py
+++ b/test/int/test_couple.py
@@ -76,7 +76,6 @@ def test_restart_leader(cluster2: Cluster):
     assert i1.raft_propose_eval("return")
 
 
-@pytest.mark.xfail  # FIXME
 def test_restart_both(cluster2: Cluster):
     # Given a cluster of 2 instances - i1, i2
     # When both instances are stopped and then started again
@@ -91,9 +90,9 @@ def test_restart_both(cluster2: Cluster):
         assert instance._raft_status().is_ready is False
 
     i1.start()
-    # This synchronization makes sure the test fails.
+    # This synchronization is necessary for proper test case reproducing.
     # i1 has already initialized raft node but can't win election yet
-    # i2 starts discovery and fails: leader_id is unknown for node 1
+    # i2 starts discovery and should be able to advance further
     wait_alive(i1)
     i2.start()
 
-- 
GitLab