diff --git a/src/traft/node.rs b/src/traft/node.rs index f1879a665a27d599d8105b21cec98f9d95ff6b2e..8827ce33c74071c41b695c14fb6f65ed747abc5e 100644 --- a/src/traft/node.rs +++ b/src/traft/node.rs @@ -2209,6 +2209,17 @@ impl NodeImpl { } } + #[cfg(feature = "error_injection")] + if crate::error_injection::is_enabled("BLOCK_WHEN_PERSISTING_DDL_COMMIT") { + for entry in entries_to_persist { + let row = traft::Entry::try_from(entry).unwrap(); + let op = row.into_op(); + if let Some(Op::DdlCommit) = op { + crate::error_injection!(block "BLOCK_WHEN_PERSISTING_DDL_COMMIT"); + } + } + } + // Apply committed entries. let committed_entries = ready.committed_entries(); if !committed_entries.is_empty() { diff --git a/test/conftest.py b/test/conftest.py index c0a414b609cdc3256fb22abf873e5df132a4a578..80bc8f118543182bf7d96abeb2291920e2b8cd9e 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -413,6 +413,10 @@ class ProcessDead(Exception): pass +class NotALeader(Exception): + pass + + class Retriable: """A utility class for handling retries. @@ -1400,6 +1404,19 @@ class Instance: timeout=timeout + 1, # this timeout is for network call ) + def wait_governor_status(self, expected_status: str, timeout: int | float = 5): + assert expected_status != "not a leader", "use another function" + + def impl(): + info = self.call(".proc_runtime_info")["internal"] + actual_status = info["governor_loop_status"] + if actual_status == "not a leader": + raise NotALeader("not a leader") + + assert actual_status == expected_status + + Retriable(timeout=timeout, rps=1, fatal=NotALeader).call(impl) + def promote_or_fail(self): attempt = 0 diff --git a/test/int/test_network_effects.py b/test/int/test_network_effects.py index acd85311e8b9ece0c5108bc695cbc5daab69b78a..a63eb8d78d15ab2eea5b15ef41580d8588aca486 100644 --- a/test/int/test_network_effects.py +++ b/test/int/test_network_effects.py @@ -180,3 +180,30 @@ def test_instance_automatic_offline_detection(cluster: Cluster): assert get_instance_states(peer, instance_id) == ("Online", "Online") Retriable(timeout=10, rps=5).call(lambda: assert_online(i1, i3.instance_id)) + + +def test_governor_timeout_when_proposing_raft_op(cluster: Cluster): + i1, i2, i3 = cluster.deploy(instance_count=3) + + i2.call("pico._inject_error", "BLOCK_WHEN_PERSISTING_DDL_COMMIT", True) + i3.call("pico._inject_error", "BLOCK_WHEN_PERSISTING_DDL_COMMIT", True) + + with pytest.raises(TimeoutError): + i1.sql( + """ + CREATE TABLE dining_table (id INTEGER NOT NULL PRIMARY KEY) DISTRIBUTED BY (id) + """ + ) + + # Wait until governor starts applying the DDL. + # This will block because both followers can't apply. + i1.wait_governor_status("apply clusterwide schema change") + + # FIXME: this is going to be flaky, need some way to make this stable + time.sleep(3) + + i2.call("pico._inject_error", "BLOCK_WHEN_PERSISTING_DDL_COMMIT", False) + i3.call("pico._inject_error", "BLOCK_WHEN_PERSISTING_DDL_COMMIT", False) + + # Wait until governor finishes with all the needed changes. + i1.wait_governor_status("idle") diff --git a/test/int/test_replication.py b/test/int/test_replication.py index 5b30202264bd14d3ea1e7bfcdbded60278050547..85634b27b4948be4c79aad0e4eacd69013886341 100644 --- a/test/int/test_replication.py +++ b/test/int/test_replication.py @@ -181,22 +181,6 @@ def test_master_auto_switchover(cluster: Cluster): assert not i5.eval("return box.info.ro") -def wait_governor_status(i: Instance, expected_status, timeout=5): - assert expected_status != "not a leader", "use another function" - - class NotALeader(Exception): - pass - - def impl(): - actual_status = i.call(".proc_runtime_info")["internal"]["governor_loop_status"] - if actual_status == "not a leader": - raise NotALeader("not a leader") - - assert actual_status == expected_status - - Retriable(timeout=timeout, rps=1, fatal=NotALeader).call(impl) - - def get_vclock_without_local(i: Instance): vclock = i.eval("return box.info.vclock") del vclock[0] @@ -250,7 +234,7 @@ def test_replication_sync_before_master_switchover(cluster: Cluster): # This will block until i5 synchronizes with old master, which it won't # until the injected error is disabled. time.sleep(1) # Just in case, nothing really relies on this sleep - wait_governor_status(i1, "configure replication") + i1.wait_governor_status("configure replication") # i5 does not become writable until it synchronizes assert i5.eval("return box.info.ro") is True @@ -261,7 +245,7 @@ def test_replication_sync_before_master_switchover(cluster: Cluster): ) # Wait until governor finishes with all the needed changes. - wait_governor_status(i1, "idle") + i1.wait_governor_status("idle") assert i5.eval("return box.space.mytable.id") is not None vclock = get_vclock_without_local(i5)