From bcd63a5603aabeebd0fca0908b8d815fbdcfb6d0 Mon Sep 17 00:00:00 2001 From: Georgy Moshkin <gmoshkin@picodata.io> Date: Mon, 12 Aug 2024 20:51:54 +0300 Subject: [PATCH] fix: remove long erroneous waits when leader changes during cas operation --- src/plugin/mod.rs | 6 ++---- src/rpc/join.rs | 3 +-- src/rpc/update_instance.rs | 3 +-- src/schema.rs | 1 + src/traft/error.rs | 4 +++- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/plugin/mod.rs b/src/plugin/mod.rs index fc056c8352..70c354c8e5 100644 --- a/src/plugin/mod.rs +++ b/src/plugin/mod.rs @@ -397,12 +397,11 @@ fn do_routing_table_cas( node.wait_index(index, deadline.duration_since(fiber::clock()))?; if term != raft::Storage::term(raft_storage, index)? { // leader switched - retry - node.wait_status(); continue; } } Err(err) => { - if err.is_cas_err() || err.is_term_mismatch_err() { + if err.is_retriable() { // cas error - retry fiber::sleep(Duration::from_millis(500)); continue; @@ -514,12 +513,11 @@ fn do_plugin_cas( node.wait_index(index, deadline.duration_since(Instant::now_fiber()))?; if term != raft::Storage::term(raft_storage, index)? { // leader switched - retry - node.wait_status(); continue; } } Err(err) => { - if err.is_cas_err() | err.is_term_mismatch_err() { + if err.is_retriable() { // cas error - retry fiber::sleep(Duration::from_millis(500)); continue; diff --git a/src/rpc/join.rs b/src/rpc/join.rs index 6fa9f89ba0..ae090f3033 100644 --- a/src/rpc/join.rs +++ b/src/rpc/join.rs @@ -135,12 +135,11 @@ pub fn handle_join_request_and_wait(req: Request, timeout: Duration) -> Result<R node.wait_index(index, deadline.duration_since(fiber::clock()))?; if term != raft::Storage::term(raft_storage, index)? { // leader switched - retry - node.wait_status(); continue; } } Err(err) => { - if err.is_cas_err() | err.is_term_mismatch_err() { + if err.is_retriable() { // cas error - retry fiber::sleep(Duration::from_millis(500)); continue; diff --git a/src/rpc/update_instance.rs b/src/rpc/update_instance.rs index 5f3b5e9bf8..4c970a56ba 100644 --- a/src/rpc/update_instance.rs +++ b/src/rpc/update_instance.rs @@ -181,7 +181,6 @@ pub fn handle_update_instance_request_in_governor_and_also_wait_too( node.wait_index(index, deadline.duration_since(fiber::clock()))?; if term != raft::Storage::term(raft_storage, index)? { // leader switched - retry - node.wait_status(); continue; } } @@ -189,7 +188,7 @@ pub fn handle_update_instance_request_in_governor_and_also_wait_too( if req.dont_retry { return Err(err); } - if err.is_cas_err() || err.is_term_mismatch_err() { + if err.is_retriable() { // cas error - retry fiber::sleep(Duration::from_millis(500)); continue; diff --git a/src/schema.rs b/src/schema.rs index 1e88fb92b2..e7c56108da 100644 --- a/src/schema.rs +++ b/src/schema.rs @@ -2469,6 +2469,7 @@ pub fn abort_ddl(timeout: Duration) -> traft::Result<RaftIndex> { }; let req = Request::new(Op::DdlAbort, predicate, effective_user_id())?; + // FIXME: this error handling is wrong, must retry if e.is_retriable() let (index, term) = compare_and_swap(&req, timeout)?; node.wait_index(index, timeout)?; if raft::Storage::term(&node.raft_storage, index)? != term { diff --git a/src/traft/error.rs b/src/traft/error.rs index 02e7028a8c..860bbe2514 100644 --- a/src/traft/error.rs +++ b/src/traft/error.rs @@ -150,7 +150,9 @@ pub fn is_retriable_error_message(msg: &str) -> bool { } if msg.contains("compare-and-swap") { - return msg.contains("Compacted") || msg.contains("ConflictFound"); + return msg.contains("Compacted") + || msg.contains("ConflictFound") + || msg.contains("EntryTermMismatch"); } return false; -- GitLab