From 4c63d09e088d83ff9625097c8ef5f2e83e8b52ec Mon Sep 17 00:00:00 2001
From: Georgy Moshkin <gmoshkin@picodata.io>
Date: Fri, 29 Mar 2024 17:53:34 +0300
Subject: [PATCH] fix: use batch-dml when joining instances for robustness

---
 src/rpc/join.rs           | 12 ++----------
 src/traft/node.rs         | 20 ++++++++++++--------
 test/int/test_snapshot.py |  4 ++--
 3 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/src/rpc/join.rs b/src/rpc/join.rs
index 446f710356..aba562abbf 100644
--- a/src/rpc/join.rs
+++ b/src/rpc/join.rs
@@ -126,17 +126,9 @@ pub fn handle_join_request_and_wait(req: Request, timeout: Duration) -> Result<R
         }
         // Only in this order - so that when instance exists - address will always be there.
         handle_result!(cas::compare_and_swap(
-            Op::Dml(op_addr),
-            cas::Predicate {
-                index: raft_storage.applied()?,
-                term: raft_storage.term()?,
-                ranges: ranges.clone(),
+            Op::BatchDml {
+                ops: vec![op_addr, op_instance],
             },
-            ADMIN_ID,
-            deadline.duration_since(fiber::clock()),
-        ));
-        handle_result!(cas::compare_and_swap(
-            Op::Dml(op_instance),
             cas::Predicate {
                 index: raft_storage.applied()?,
                 term: raft_storage.term()?,
diff --git a/src/traft/node.rs b/src/traft/node.rs
index 176cd3a7dc..f58dce4f70 100644
--- a/src/traft/node.rs
+++ b/src/traft/node.rs
@@ -727,18 +727,22 @@ impl NodeImpl {
 
     fn wake_governor_if_needed(&self, op: &Op) {
         let wake_governor = match &op {
-            Op::Dml(op) => {
-                matches!(
-                    op.space().try_into(),
-                    Ok(ClusterwideTable::Property
-                        | ClusterwideTable::Replicaset
-                        | ClusterwideTable::Instance)
-                )
-            }
+            Op::Dml(op) => dml_is_governor_wakeup_worthy(op),
+            Op::BatchDml { ops } => ops.iter().any(dml_is_governor_wakeup_worthy),
             Op::DdlPrepare { .. } => true,
             _ => false,
         };
 
+        #[inline(always)]
+        fn dml_is_governor_wakeup_worthy(op: &Dml) -> bool {
+            matches!(
+                op.space().try_into(),
+                Ok(ClusterwideTable::Property
+                    | ClusterwideTable::Replicaset
+                    | ClusterwideTable::Instance)
+            )
+        }
+
         // NOTE: this may be premature, because the dml may fail to apply and/or
         // the transaction may be rolled back, but we ignore this for the sake
         // of simplicity, as nothing bad can happen if governor makes another
diff --git a/test/int/test_snapshot.py b/test/int/test_snapshot.py
index d04e3359e1..26b042b4dc 100644
--- a/test/int/test_snapshot.py
+++ b/test/int/test_snapshot.py
@@ -18,9 +18,9 @@ def test_bootstrap_from_snapshot(cluster: Cluster):
     # Ensure i2 bootstraps from a snapshot
     i2 = cluster.add_instance(wait_online=True)
     # Whenever a snapshot is applied, all preceeding raft log is
-    # implicitly compacted. Adding an instance implies appending 3
+    # implicitly compacted. Adding an instance implies appending 2
     # entries to the raft log. i2 catches them via a snapshot.
-    assert i2.raft_first_index() == i1.raft_first_index() + 3
+    assert i2.raft_first_index() == i1.raft_first_index() + 2
 
     # Ensure new instance replicates the property
     assert i2.call("box.space._pico_property:get", "animal") == ["animal", "horse"]
-- 
GitLab