From bcd63a5603aabeebd0fca0908b8d815fbdcfb6d0 Mon Sep 17 00:00:00 2001
From: Georgy Moshkin <gmoshkin@picodata.io>
Date: Mon, 12 Aug 2024 20:51:54 +0300
Subject: [PATCH] fix: remove long erroneous waits when leader changes during
 cas operation

---
 src/plugin/mod.rs          | 6 ++----
 src/rpc/join.rs            | 3 +--
 src/rpc/update_instance.rs | 3 +--
 src/schema.rs              | 1 +
 src/traft/error.rs         | 4 +++-
 5 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/plugin/mod.rs b/src/plugin/mod.rs
index fc056c8352..70c354c8e5 100644
--- a/src/plugin/mod.rs
+++ b/src/plugin/mod.rs
@@ -397,12 +397,11 @@ fn do_routing_table_cas(
                 node.wait_index(index, deadline.duration_since(fiber::clock()))?;
                 if term != raft::Storage::term(raft_storage, index)? {
                     // leader switched - retry
-                    node.wait_status();
                     continue;
                 }
             }
             Err(err) => {
-                if err.is_cas_err() || err.is_term_mismatch_err() {
+                if err.is_retriable() {
                     // cas error - retry
                     fiber::sleep(Duration::from_millis(500));
                     continue;
@@ -514,12 +513,11 @@ fn do_plugin_cas(
                 node.wait_index(index, deadline.duration_since(Instant::now_fiber()))?;
                 if term != raft::Storage::term(raft_storage, index)? {
                     // leader switched - retry
-                    node.wait_status();
                     continue;
                 }
             }
             Err(err) => {
-                if err.is_cas_err() | err.is_term_mismatch_err() {
+                if err.is_retriable() {
                     // cas error - retry
                     fiber::sleep(Duration::from_millis(500));
                     continue;
diff --git a/src/rpc/join.rs b/src/rpc/join.rs
index 6fa9f89ba0..ae090f3033 100644
--- a/src/rpc/join.rs
+++ b/src/rpc/join.rs
@@ -135,12 +135,11 @@ pub fn handle_join_request_and_wait(req: Request, timeout: Duration) -> Result<R
                 node.wait_index(index, deadline.duration_since(fiber::clock()))?;
                 if term != raft::Storage::term(raft_storage, index)? {
                     // leader switched - retry
-                    node.wait_status();
                     continue;
                 }
             }
             Err(err) => {
-                if err.is_cas_err() | err.is_term_mismatch_err() {
+                if err.is_retriable() {
                     // cas error - retry
                     fiber::sleep(Duration::from_millis(500));
                     continue;
diff --git a/src/rpc/update_instance.rs b/src/rpc/update_instance.rs
index 5f3b5e9bf8..4c970a56ba 100644
--- a/src/rpc/update_instance.rs
+++ b/src/rpc/update_instance.rs
@@ -181,7 +181,6 @@ pub fn handle_update_instance_request_in_governor_and_also_wait_too(
                 node.wait_index(index, deadline.duration_since(fiber::clock()))?;
                 if term != raft::Storage::term(raft_storage, index)? {
                     // leader switched - retry
-                    node.wait_status();
                     continue;
                 }
             }
@@ -189,7 +188,7 @@ pub fn handle_update_instance_request_in_governor_and_also_wait_too(
                 if req.dont_retry {
                     return Err(err);
                 }
-                if err.is_cas_err() || err.is_term_mismatch_err() {
+                if err.is_retriable() {
                     // cas error - retry
                     fiber::sleep(Duration::from_millis(500));
                     continue;
diff --git a/src/schema.rs b/src/schema.rs
index 1e88fb92b2..e7c56108da 100644
--- a/src/schema.rs
+++ b/src/schema.rs
@@ -2469,6 +2469,7 @@ pub fn abort_ddl(timeout: Duration) -> traft::Result<RaftIndex> {
         };
 
         let req = Request::new(Op::DdlAbort, predicate, effective_user_id())?;
+        // FIXME: this error handling is wrong, must retry if e.is_retriable()
         let (index, term) = compare_and_swap(&req, timeout)?;
         node.wait_index(index, timeout)?;
         if raft::Storage::term(&node.raft_storage, index)? != term {
diff --git a/src/traft/error.rs b/src/traft/error.rs
index 02e7028a8c..860bbe2514 100644
--- a/src/traft/error.rs
+++ b/src/traft/error.rs
@@ -150,7 +150,9 @@ pub fn is_retriable_error_message(msg: &str) -> bool {
     }
 
     if msg.contains("compare-and-swap") {
-        return msg.contains("Compacted") || msg.contains("ConflictFound");
+        return msg.contains("Compacted")
+            || msg.contains("ConflictFound")
+            || msg.contains("EntryTermMismatch");
     }
 
     return false;
-- 
GitLab