From b76c2fb14e8fa97cfac77d1e5586a336846a8ff4 Mon Sep 17 00:00:00 2001
From: Georgy Moshkin <gmoshkin@picodata.io>
Date: Wed, 28 Aug 2024 18:40:53 +0300
Subject: [PATCH] feat: add some parameters to configure timeouts in governor
 loop

---
 src/config.rs           | 21 +++++++++++++++++++++
 src/governor/mod.rs     | 25 ++++++++++++++++++-------
 test/int/test_basics.py |  5 ++++-
 test/int/test_sql.py    |  6 ++++++
 4 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/src/config.rs b/src/config.rs
index 568959d6d1..999916676c 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -1482,6 +1482,27 @@ pub struct AlterSystemParameters {
     #[introspection(sbroad_type = SbroadType::Double)]
     #[introspection(config_default = (24 * 3600))]
     pub snapshot_read_view_close_timeout: f64,
+
+    /// Governor proposes operations to raft log and waits for them to be
+    /// applied to the local raft machine.
+    /// Governor will only wait for this many seconds before going to the new loop iteration.
+    #[introspection(sbroad_type = SbroadType::Double)]
+    #[introspection(config_default = 3.0)]
+    pub governor_raft_op_timeout: f64,
+
+    /// Governor sets up replication and configuration in the cluster as well as
+    /// applying global schema changes. It is doing so by making RPC requests to
+    /// other instances.
+    /// Governor will only wait for the RPC responses for this many seconds before going to the new loop iteration.
+    #[introspection(sbroad_type = SbroadType::Double)]
+    #[introspection(config_default = 3.0)]
+    pub governor_common_rpc_timeout: f64,
+
+    /// Governor sets up the plugin system by making RPC requests to other instances.
+    /// Governor will only wait for the RPC responses for this many seconds before going to the new loop iteration.
+    #[introspection(sbroad_type = SbroadType::Double)]
+    #[introspection(config_default = 10.0)]
+    pub governor_plugin_rpc_timeout: f64,
 }
 
 /// A special macro helper for referring to alter system parameters thoroughout
diff --git a/src/governor/mod.rs b/src/governor/mod.rs
index bed84873c7..95fd79c22d 100644
--- a/src/governor/mod.rs
+++ b/src/governor/mod.rs
@@ -49,9 +49,6 @@ pub(crate) mod plan;
 
 impl Loop {
     const RETRY_TIMEOUT: Duration = Duration::from_millis(250);
-    const RAFT_OP_TIMEOUT: Duration = Duration::from_secs(3);
-    const COMMON_RPC_TIMEOUT: Duration = Duration::from_secs(3);
-    const PLUGIN_RPC_TIMEOUT: Duration = Duration::from_secs(10);
 
     async fn iter_fn(
         State {
@@ -69,9 +66,23 @@ impl Loop {
             return ControlFlow::Continue(());
         }
 
-        let raft_op_timeout = Loop::RAFT_OP_TIMEOUT;
-        let rpc_timeout = Loop::COMMON_RPC_TIMEOUT;
-        let plugin_rpc_timeout = Loop::PLUGIN_RPC_TIMEOUT;
+        let v: f64 = storage
+            .properties
+            .get_or_alter_system_default(crate::system_parameter_name!(governor_raft_op_timeout))
+            .expect("storage should never ever fail");
+        let raft_op_timeout = Duration::from_secs_f64(v);
+
+        let v: f64 = storage
+            .properties
+            .get_or_alter_system_default(crate::system_parameter_name!(governor_common_rpc_timeout))
+            .expect("storage should never ever fail");
+        let rpc_timeout = Duration::from_secs_f64(v);
+
+        let v: f64 = storage
+            .properties
+            .get_or_alter_system_default(crate::system_parameter_name!(governor_plugin_rpc_timeout))
+            .expect("storage should never ever fail");
+        let plugin_rpc_timeout = Duration::from_secs_f64(v);
 
         let instances = storage
             .instances
@@ -218,7 +229,7 @@ impl Loop {
                 tlog!(Info, "proposing conf_change"; "cc" => ?conf_change);
                 if let Err(e) = node.propose_conf_change_and_wait(term, conf_change) {
                     tlog!(Warning, "failed proposing conf_change: {e}");
-                    fiber::sleep(Duration::from_secs(1));
+                    fiber::sleep(Loop::RETRY_TIMEOUT);
                 }
             }
 
diff --git a/test/int/test_basics.py b/test/int/test_basics.py
index d8d0786273..e92ba59a62 100644
--- a/test/int/test_basics.py
+++ b/test/int/test_basics.py
@@ -396,7 +396,10 @@ Insert(_pico_property, ["max_heartbeat_period",5.0]),
 Insert(_pico_property, ["max_pg_statements",1024]),
 Insert(_pico_property, ["max_pg_portals",1024]),
 Insert(_pico_property, ["snapshot_chunk_max_size",16777216]),
-Insert(_pico_property, ["snapshot_read_view_close_timeout",86400]))|
+Insert(_pico_property, ["snapshot_read_view_close_timeout",86400]),
+Insert(_pico_property, ["governor_raft_op_timeout",3.0]),
+Insert(_pico_property, ["governor_common_rpc_timeout",3.0]),
+Insert(_pico_property, ["governor_plugin_rpc_timeout",10.0]))|
 |  0  | 1  |BatchDml(
 Insert(_pico_user, [0,"guest",0,["md5","md5084e0343a0486ff05530df6c705c8bb4"],1,"user"]),
 Insert(_pico_privilege, [1,0,"login","universe",0,0]),
diff --git a/test/int/test_sql.py b/test/int/test_sql.py
index d8fb43c3c3..dc8118159d 100644
--- a/test/int/test_sql.py
+++ b/test/int/test_sql.py
@@ -398,6 +398,9 @@ def test_read_from_system_tables(cluster: Cluster):
     assert keys == [
         "auto_offline_timeout",
         "global_schema_version",
+        "governor_common_rpc_timeout",
+        "governor_plugin_rpc_timeout",
+        "governor_raft_op_timeout",
         "max_heartbeat_period",
         "max_login_attempts",
         "max_pg_portals",
@@ -5065,6 +5068,9 @@ def test_alter_system_property(cluster: Cluster):
         ("max_login_attempts", 8),
         ("max_pg_statements", 4096),
         ("max_pg_portals", 2048),
+        ("governor_raft_op_timeout", 10),
+        ("governor_common_rpc_timeout", 10),
+        ("governor_plugin_rpc_timeout", 20),
     ]
 
     default_prop = []
-- 
GitLab