From 0e9b87c7d23ee9445bd8ad92a78542b4ec241adb Mon Sep 17 00:00:00 2001 From: Vladimir Davydov <vdavydov.dev@gmail.com> Date: Wed, 14 Feb 2018 16:57:33 +0300 Subject: [PATCH] Introduce replication_connect_timeout configuration option Currently, the maximum time box.cfg() may wait for connections to replicas to be established is hardcoded to box.cfg.replication_timeout times 4. As a result, users can't revert to the pre-replication_connect_quorum behavior, where box.cfg() blocks until it connects to all replicas. To fix that, let's introduce a new configuration option, replication_connect_timeout, which determines how long box.cfg() may wait for connections to the configured replicas to be established. By default the option is set to 4 seconds. Closes #3151 --- src/box/box.cc | 23 +++++++++++++++-- src/box/lua/load_cfg.lua | 2 ++ src/box/replication.cc | 1 + src/box/replication.h | 18 ++++++------- test/app-tap/init_script.result | 45 +++++++++++++++++---------------- test/box-tap/cfg.test.lua | 7 ++++- test/box/admin.result | 2 ++ test/box/cfg.result | 4 +++ test/replication/quorum.lua | 1 + 9 files changed, 68 insertions(+), 35 deletions(-) diff --git a/src/box/box.cc b/src/box/box.cc index f055788d90..fa1eb051db 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -386,6 +386,17 @@ box_check_replication_timeout(void) return timeout; } +static double +box_check_replication_connect_timeout(void) +{ + double timeout = cfg_getd("replication_connect_timeout"); + if (timeout <= 0) { + tnt_raise(ClientError, ER_CFG, "replication_connect_timeout", + "the value must be greather than 0"); + } + return timeout; +} + static int box_check_replication_connect_quorum(void) { @@ -490,6 +501,7 @@ box_check_config() box_check_replicaset_uuid(&uuid); box_check_replication(); box_check_replication_timeout(); + box_check_replication_connect_timeout(); box_check_replication_connect_quorum(); box_check_replication_sync_lag(); box_check_readahead(cfg_geti("readahead")); @@ -580,7 +592,7 @@ box_set_replication(void) box_check_replication(); /* Try 
to connect to all replicas within the timeout period */ - box_sync_replication(replication_connect_quorum_timeout(), true); + box_sync_replication(replication_connect_timeout, true); /* Follow replica */ replicaset_follow(); } @@ -591,6 +603,12 @@ box_set_replication_timeout(void) replication_timeout = box_check_replication_timeout(); } +void +box_set_replication_connect_timeout(void) +{ + replication_connect_timeout = box_check_replication_connect_timeout(); +} + void box_set_replication_connect_quorum(void) { @@ -1678,6 +1696,7 @@ box_cfg_xc(void) box_set_checkpoint_count(); box_set_too_long_threshold(); box_set_replication_timeout(); + box_set_replication_connect_timeout(); box_set_replication_connect_quorum(); replication_sync_lag = box_check_replication_sync_lag(); xstream_create(&join_stream, apply_initial_join_row); @@ -1803,7 +1822,7 @@ box_cfg_xc(void) title("orphan"); /* Wait for the cluster to start up */ - box_sync_replication(replication_connect_quorum_timeout(), false); + box_sync_replication(replication_connect_timeout, false); } else { if (!tt_uuid_is_nil(&instance_uuid)) INSTANCE_UUID = instance_uuid; diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua index 4ac0408321..891d819d3c 100644 --- a/src/box/lua/load_cfg.lua +++ b/src/box/lua/load_cfg.lua @@ -57,6 +57,7 @@ local default_cfg = { worker_pool_threads = 4, replication_timeout = 1, replication_sync_lag = 10, + replication_connect_timeout = 4, replication_connect_quorum = nil, -- connect all } @@ -112,6 +113,7 @@ local template_cfg = { worker_pool_threads = 'number', replication_timeout = 'number', replication_sync_lag = 'number', + replication_connect_timeout = 'number', replication_connect_quorum = 'number', } diff --git a/src/box/replication.cc b/src/box/replication.cc index 35efd8addf..319ea57e5b 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -47,6 +47,7 @@ struct tt_uuid INSTANCE_UUID; struct tt_uuid REPLICASET_UUID; double replication_timeout = 1.0; /* 
seconds */ +double replication_connect_timeout = 4.0; /* seconds */ int replication_connect_quorum = REPLICATION_CONNECT_QUORUM_ALL; double replication_sync_lag = 10.0; /* seconds */ diff --git a/src/box/replication.h b/src/box/replication.h index a7595f6180..f964eed086 100644 --- a/src/box/replication.h +++ b/src/box/replication.h @@ -103,6 +103,14 @@ static const int REPLICATION_CONNECT_QUORUM_ALL = INT_MAX; */ extern double replication_timeout; +/** + * Maximal time box.cfg() may wait for connections to all configured + * replicas to be established. If box.cfg() fails to connect to all + * replicas within the timeout, it will either leave the instance in + * the orphan mode (recovery) or fail (bootstrap, reconfiguration). + */ +extern double replication_connect_timeout; + /** * Minimal number of replicas to sync for this instance to switch * to the write mode. If set to REPLICATION_CONNECT_QUORUM_ALL, @@ -136,16 +144,6 @@ replication_disconnect_timeout(void) return replication_timeout * 4; } -/** - * Fail box.cfg() if the quorum hasn't been assembled within - * the given period. - */ -static inline double -replication_connect_quorum_timeout(void) -{ - return replication_reconnect_timeout() * 4; -} - void replication_init(void); diff --git a/test/app-tap/init_script.result b/test/app-tap/init_script.result index 53f87a54b3..80153e3861 100644 --- a/test/app-tap/init_script.result +++ b/test/app-tap/init_script.result @@ -21,28 +21,29 @@ box.cfg 16 pid_file:box.pid 17 read_only:false 18 readahead:16320 -19 replication_sync_lag:10 -20 replication_timeout:1 -21 rows_per_wal:500000 -22 slab_alloc_factor:1.05 -23 too_long_threshold:0.5 -24 vinyl_bloom_fpr:0.05 -25 vinyl_cache:134217728 -26 vinyl_dir:. -27 vinyl_max_tuple_size:1048576 -28 vinyl_memory:134217728 -29 vinyl_page_size:8192 -30 vinyl_range_size:1073741824 -31 vinyl_read_threads:1 -32 vinyl_run_count_per_level:2 -33 vinyl_run_size_ratio:3.5 -34 vinyl_timeout:60 -35 vinyl_write_threads:2 -36 wal_dir:. 
-37 wal_dir_rescan_delay:2 -38 wal_max_size:268435456 -39 wal_mode:write -40 worker_pool_threads:4 +19 replication_connect_timeout:4 +20 replication_sync_lag:10 +21 replication_timeout:1 +22 rows_per_wal:500000 +23 slab_alloc_factor:1.05 +24 too_long_threshold:0.5 +25 vinyl_bloom_fpr:0.05 +26 vinyl_cache:134217728 +27 vinyl_dir:. +28 vinyl_max_tuple_size:1048576 +29 vinyl_memory:134217728 +30 vinyl_page_size:8192 +31 vinyl_range_size:1073741824 +32 vinyl_read_threads:1 +33 vinyl_run_count_per_level:2 +34 vinyl_run_size_ratio:3.5 +35 vinyl_timeout:60 +36 vinyl_write_threads:2 +37 wal_dir:. +38 wal_dir_rescan_delay:2 +39 wal_max_size:268435456 +40 wal_mode:write +41 worker_pool_threads:4 -- -- Test insert from detached fiber -- diff --git a/test/box-tap/cfg.test.lua b/test/box-tap/cfg.test.lua index 67991ecfae..90dc04bd30 100755 --- a/test/box-tap/cfg.test.lua +++ b/test/box-tap/cfg.test.lua @@ -6,7 +6,7 @@ local socket = require('socket') local fio = require('fio') local uuid = require('uuid') local msgpack = require('msgpack') -test:plan(80) +test:plan(85) -------------------------------------------------------------------------------- -- Invalid values @@ -27,6 +27,11 @@ invalid('memtx_min_tuple_size', 1000000000) invalid('replication', '//guest@localhost:3301') invalid('replication_timeout', -1) invalid('replication_timeout', 0) +invalid('replication_sync_lag', -1) +invalid('replication_sync_lag', 0) +invalid('replication_connect_timeout', -1) +invalid('replication_connect_timeout', 0) +invalid('replication_connect_quorum', -1) invalid('wal_mode', 'invalid') invalid('rows_per_wal', -1) invalid('listen', '//!') diff --git a/test/box/admin.result b/test/box/admin.result index 13e599eb5f..7a3e937b1a 100644 --- a/test/box/admin.result +++ b/test/box/admin.result @@ -54,6 +54,8 @@ cfg_filter(box.cfg) - false - - readahead - 16320 + - - replication_connect_timeout + - 4 - - replication_sync_lag - 10 - - replication_timeout diff --git a/test/box/cfg.result 
b/test/box/cfg.result index 9f0ad59549..67539cd175 100644 --- a/test/box/cfg.result +++ b/test/box/cfg.result @@ -50,6 +50,8 @@ cfg_filter(box.cfg) - false - - readahead - 16320 + - - replication_connect_timeout + - 4 - - replication_sync_lag - 10 - - replication_timeout @@ -137,6 +139,8 @@ cfg_filter(box.cfg) - false - - readahead - 16320 + - - replication_connect_timeout + - 4 - - replication_sync_lag - 10 - - replication_timeout diff --git a/test/replication/quorum.lua b/test/replication/quorum.lua index 5138425a98..9c7bf5c930 100644 --- a/test/replication/quorum.lua +++ b/test/replication/quorum.lua @@ -16,6 +16,7 @@ box.cfg({ listen = instance_uri(INSTANCE_ID); replication_timeout = 0.05; replication_sync_lag = 0.01; + replication_connect_timeout = 0.1; replication_connect_quorum = 3; replication = { instance_uri(1); -- GitLab