From cd627f26526d7da8facda264912e2a0f5338a79a Mon Sep 17 00:00:00 2001 From: Serge Petrenko <sergepetrenko@tarantool.org> Date: Wed, 28 Aug 2019 11:25:00 +0300 Subject: [PATCH] replication: enter orphan mode on manual replication configuration chage Currently we only enter orphan mode when instance fails to connect to replication_connect_quorum remote instances during local recovery. On bootstrap and manual replication configuration change an error is thrown. We better enter orphan mode on manual config change, and leave it only in case we managed to sync with replication_connect_quorum instances. Closes #4424 @TarantoolBot document Title: document reaction on error in replication configuration change. Now when issuing `box.cfg{replication={uri1, uri2, ...}}` and failing to sync with replication_connect_quorum remote instances, the server will throw an error, if it is bootstrapping, and just set its state to orphan in all other cases (recovering from existing xlog/snap files or manually changing box.cfg.replication on the fly). To leave orphan mode, you may wait until the server manages to sync with replication_connect_quorum instances. In order to leave orphan mode you need to make the server sync with enough instances. To do so, you may either: 1) set replication_connect_quorum to a lower value 2) reset box.cfg.replication to exclude instances that cannot be reached or synced with 3) just set box.cfg.replication to "" (empty string) (cherry picked from commit 5a0cfe025735dbb12486971c299803ae9a3ff46b) --- src/box/box.cc | 14 +++-- src/box/box.h | 7 +++ src/box/replication.cc | 11 ++++ test/replication/misc.result | 104 ++++++++++++++++++++++++++++++++- test/replication/misc.test.lua | 41 ++++++++++++- 5 files changed, 168 insertions(+), 9 deletions(-) diff --git a/src/box/box.cc b/src/box/box.cc index 80249919ef..13d9c41eb0 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -257,7 +257,7 @@ box_wait_ro(bool ro, double timeout) } void -box_set_orphan(bool orphan) +box_do_set_orphan(bool orphan) { if (is_orphan == orphan) return; /* nothing to do */ @@ -266,7 +266,12 @@ box_set_orphan(bool orphan) is_orphan = orphan; fiber_cond_broadcast(&ro_cond); +} +void +box_set_orphan(bool orphan) +{ + box_do_set_orphan(orphan); /* Update the title to reflect the new status. */ if (is_orphan) { say_crit("entering orphan mode"); @@ -699,11 +704,10 @@ box_set_replication(void) box_check_replication(); /* * Try to connect to all replicas within the timeout period. - * The configuration will succeed as long as we've managed - * to connect to at least replication_connect_quorum - * masters. + * Stay in orphan mode in case we fail to connect to at least + * 'replication_connect_quorum' remote instances. */ - box_sync_replication(true); + box_sync_replication(false); /* Follow replica */ replicaset_follow(); /* Wait until appliers are in sync */ diff --git a/src/box/box.h b/src/box/box.h index ddcfbe2e59..ccd527bd54 100644 --- a/src/box/box.h +++ b/src/box/box.h @@ -127,6 +127,13 @@ box_wait_ro(bool ro, double timeout); void box_set_orphan(bool orphan); +/** + * Set orphan mode but don't update instance title. + * \sa box_set_orphan + */ +void +box_do_set_orphan(bool orphan); + /** * Iterate over all spaces and save them to the * snapshot file. diff --git a/src/box/replication.cc b/src/box/replication.cc index 28f7acedce..d691ce4876 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -610,6 +610,17 @@ replicaset_connect(struct applier **appliers, int count, say_info("connecting to %d replicas", count); + if (!connect_quorum) { + /* + * Enter orphan mode on configuration change and + * only leave it when we manage to sync with + * replicaset_quorum instances. Don't change + * title though, it should be 'loading' during + * local recovery. + */ + box_do_set_orphan(true); + } + /* * Simultaneously connect to remote peers to receive their UUIDs * and fill the resulting set: diff --git a/test/replication/misc.result b/test/replication/misc.result index 0a57edda59..ae72ce3e44 100644 --- a/test/replication/misc.result +++ b/test/replication/misc.result @@ -18,10 +18,19 @@ replication_connect_timeout = box.cfg.replication_connect_timeout box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication={}} --- ... +box.cfg{replication_connect_quorum=2} +--- +... box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}} --- -- error: 'Incorrect value for option ''replication'': failed to connect to one or - more replicas' +... +box.info.status +--- +- orphan +... +box.info.ro +--- +- true ... -- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently fiber = require('fiber') @@ -47,8 +56,16 @@ c:get() --- - true ... -box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} +box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} +--- +... +box.info.status +--- +- running +... +box.info.ro --- +- false ... -- gh-3111 - Allow to rebootstrap a replica from a read-only master replica_uuid = uuid.new() @@ -729,3 +746,84 @@ test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') --- ... +-- +-- gh-4424 Always enter orphan mode on error in replication +-- configuration change. +-- +replication_connect_timeout = box.cfg.replication_connect_timeout +--- +... +replication_connect_quorum = box.cfg.replication_connect_quorum +--- +... +box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1} +--- +... +box.info.status +--- +- orphan +... +box.info.ro +--- +- true +... +-- reset replication => leave orphan mode +box.cfg{replication=""} +--- +... +box.info.status +--- +- running +... +box.info.ro +--- +- false +... +-- no switch to orphan when quorum == 0 +box.cfg{replication="12345", replication_connect_quorum=0} +--- +... +box.info.status +--- +- running +... +box.info.ro +--- +- false +... +-- we could connect to one out of two replicas. Set orphan. +box.cfg{replication_connect_quorum=2} +--- +... +box.cfg{replication={box.cfg.listen, "12345"}} +--- +... +box.info.status +--- +- orphan +... +box.info.ro +--- +- true +... +-- lower quorum => leave orphan mode +box.cfg{replication_connect_quorum=1} +--- +... +box.info.status +--- +- running +... +box.info.ro +--- +- false +... +box.cfg{replication=""} +--- +... +box.cfg{replication_connect_timeout=replication_connect_timeout} +--- +... +box.cfg{replication_connect_quorum=replication_connect_quorum} +--- +... diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua index 99e9955093..16e7e9e42e 100644 --- a/test/replication/misc.test.lua +++ b/test/replication/misc.test.lua @@ -8,7 +8,10 @@ box.schema.user.grant('guest', 'replication') replication_timeout = box.cfg.replication_timeout replication_connect_timeout = box.cfg.replication_connect_timeout box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication={}} +box.cfg{replication_connect_quorum=2} box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}} +box.info.status +box.info.ro -- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently fiber = require('fiber') @@ -19,7 +22,9 @@ f() c:get() c:get() -box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} +box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} +box.info.status +box.info.ro -- gh-3111 - Allow to rebootstrap a replica from a read-only master replica_uuid = uuid.new() @@ -293,3 +298,37 @@ test_run:cmd("cleanup server replica") test_run:cmd("delete server replica") test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') + +-- +-- gh-4424 Always enter orphan mode on error in replication +-- configuration change. +-- +replication_connect_timeout = box.cfg.replication_connect_timeout +replication_connect_quorum = box.cfg.replication_connect_quorum +box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1} +box.info.status +box.info.ro +-- reset replication => leave orphan mode +box.cfg{replication=""} +box.info.status +box.info.ro +-- no switch to orphan when quorum == 0 +box.cfg{replication="12345", replication_connect_quorum=0} +box.info.status +box.info.ro + +-- we could connect to one out of two replicas. Set orphan. +box.cfg{replication_connect_quorum=2} +box.cfg{replication={box.cfg.listen, "12345"}} +box.info.status +box.info.ro +-- lower quorum => leave orphan mode +box.cfg{replication_connect_quorum=1} +box.info.status +box.info.ro + +box.cfg{replication=""} + + +box.cfg{replication_connect_timeout=replication_connect_timeout} +box.cfg{replication_connect_quorum=replication_connect_quorum} -- GitLab