From cd44ef071a4beff3dd29f99103f23f8543e5c05b Mon Sep 17 00:00:00 2001
From: Georgy Moshkin <gmoshkin@picodata.io>
Date: Wed, 23 Oct 2024 14:44:17 +0300
Subject: [PATCH] test: add test with fully offline replicaset

---
 test/int/test_replication.py | 68 ++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/test/int/test_replication.py b/test/int/test_replication.py
index 6c35c0c177..10a99f570c 100644
--- a/test/int/test_replication.py
+++ b/test/int/test_replication.py
@@ -362,3 +362,71 @@ def test_expel_blocked_by_replicaset_master_switchover_to_offline_replica(
     # i5 is also synchronized
     rows = i5.sql(""" SELECT * FROM mytable ORDER BY id """)
     assert rows == [[0, "foo"], [1, "bar"], [2, "baz"]]
+
+
+def test_offline_replicaset(cluster: Cluster):
+    cluster.set_config_file(
+        yaml="""
+cluster:
+    name: test
+    tier:
+        raft:
+            replication_factor: 1
+            can_vote: true
+        storage:
+            replication_factor: 2
+            can_vote: false
+"""
+    )
+    [raft1, raft2, raft3] = cluster.deploy(instance_count=3, tier="raft")
+    raft1.promote_or_fail()
+
+    storage1 = cluster.add_instance(wait_online=False, tier="storage")
+    storage2 = cluster.add_instance(wait_online=False, tier="storage")
+    cluster.wait_online()
+
+    counter = raft1.wait_governor_status("idle")
+
+    assert storage1.replicaset_name == storage2.replicaset_name
+
+    # Terminate the whole replicaset
+    # NOTE: the instances are terminated in reverse order just to save some
+    # time, because master switchover is skipped; the other order also works fine
+    storage2.terminate()
+    storage1.terminate()
+
+    # Make sure governor is not blocked by an offline replicaset
+    raft1.wait_governor_status("idle", old_step_counter=counter)
+
+    # Try adding a new instance
+    storage3 = cluster.add_instance(wait_online=False, tier="storage")
+    storage3.start()
+
+    # Governor is blocked on the sharding configuration because there's only
+    # one replicaset with non-zero weight and it's currently fully offline.
+    # This means that all buckets are currently on the offline instances.
+    # XXX There's probably something better we could do here, but for now this
+    # is what's happening
+    raft1.wait_governor_status("update current sharding configuration")
+
+    # The solution at the moment is to wake up one of the instances in that replicaset
+    storage1.start()
+
+    # Now adding an instance works fine
+    storage4 = cluster.add_instance(wait_online=True, tier="storage")
+    storage3.wait_online()
+    raft1.wait_governor_status("idle")
+
+    assert storage4.replicaset_name != storage1.replicaset_name
+    assert storage4.replicaset_name == storage3.replicaset_name
+
+    # Terminate another whole replicaset
+    storage4.terminate()
+    storage3.terminate()
+
+    # This time the instances are added and become online just fine
+    storage5 = cluster.add_instance(wait_online=True, tier="storage")
+    storage6 = cluster.add_instance(wait_online=True, tier="storage")
+
+    assert storage5.replicaset_name != storage1.replicaset_name
+    assert storage5.replicaset_name == storage6.replicaset_name
-- 
GitLab