From 3c92409447b844c01ca9c5993eb90c713f8f4b60 Mon Sep 17 00:00:00 2001
From: Georgy Moshkin <gmoshkin@picodata.io>
Date: Tue, 24 Sep 2024 16:35:40 +0300
Subject: [PATCH] test: reduce test_log_rollback flakiness

---
 test/conftest.py                 | 19 +++++++++++++++++++
 test/int/test_network_effects.py | 32 ++++++++++++++++++++++++++------
 2 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/test/conftest.py b/test/conftest.py
index 9346dd5caf..56f28ce34a 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -58,6 +58,8 @@ PICO_SERVICE_ID = 32
 class ErrorCode:
     Loading = 116
     Other = 10000
+    NotALeader = 10001
+    StorageCorrupted = 10002
     TermMismatch = 10003
     RaftLogUnavailable = 10004
     RaftLogCompacted = 10005
@@ -71,6 +73,23 @@ class ErrorCode:
     LeaderUnknown = 10018
     PluginError = 10019
 
+    # Make sure this matches the list in
+    # picoplugin::error_code::ErrorCode::is_retriable_for_cas
+    retriable_for_cas = {
+        LeaderUnknown,
+        NotALeader,
+        TermMismatch,
+        RaftLogCompacted,
+        RaftLogUnavailable,
+        CasEntryTermMismatch,
+        CasConflictFound,
+    }

+    @classmethod
+    def is_retriable_for_cas(cls, code):
+        """Return True if `code` is one of the `retriable_for_cas` error codes."""
+        return code in cls.retriable_for_cas
+
 
 def eprint(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
diff --git a/test/int/test_network_effects.py b/test/int/test_network_effects.py
index a63eb8d78d..b4d622ec2b 100644
--- a/test/int/test_network_effects.py
+++ b/test/int/test_network_effects.py
@@ -1,7 +1,7 @@
 import pytest
 import time
 
-from conftest import Cluster, Instance, TarantoolError, Retriable
+from conftest import Cluster, Instance, TarantoolError, Retriable, ErrorCode
 
 
 @pytest.fixture
@@ -65,12 +65,32 @@ def test_log_rollback(cluster3: Cluster):
 
     key = 0
 
-    def propose_state_change(srv: Instance, value):
+    def propose_state_change(srv: Instance, value, timeout=10):
+        """Insert a check property via CaS on `srv`, retrying retriable errors."""
+        deadline = time.monotonic() + timeout  # immune to wall-clock adjustments
         nonlocal key
-        index = cluster3.cas(
-            "insert", "_pico_property", (f"check{key}", value), instance=srv
-        )
-        srv.raft_wait_index(index)
+        while True:
+            try:
+                index = cluster3.cas(
+                    "insert",
+                    "_pico_property",
+                    (f"check{key}", value),
+                    instance=srv,
+                )
+                srv.raft_wait_index(index)
+                break
+            except TarantoolError as e:
+                print(f"\x1b[33m### CaS error: {e}", end="")
+                if not ErrorCode.is_retriable_for_cas(e.args[0]):
+                    print(", failure!\x1b[0m")
+                    raise

+                if time.monotonic() > deadline:
+                    print(", timeout\x1b[0m")
+                    raise

+                print(", retrying...\x1b[0m")
+                time.sleep(0.1)  # brief back-off before the next attempt
         key += 1
 
     propose_state_change(i1, "i1 is a leader")
-- 
GitLab