From db72782c71e525e2d0d51d4522c6edf85e0557e9 Mon Sep 17 00:00:00 2001 From: Serge Petrenko <sergepetrenko@tarantool.org> Date: Fri, 28 Feb 2020 14:43:17 +0300 Subject: [PATCH] replication: fix rebootstrap in case the instance is listed in box.cfg.replication When checking wheter rejoin is needed, replica loops through all the instances in box.cfg.replication, which makes it believe that there is a master holding files, needed by it, since it accounts itself just like all other instances. So make replica skip itself when finding an instance which holds files needed by it, and determining whether rebootstrap is needed. We already have a working test for the issue, it missed the issue due to replica.lua replication settings. Fix replica.lua to optionally include itself in box.cfg.replication, so that the corresponding test works correctly. Closes #4759 (cherry picked from commit dbcfaf70d9312bbcb2189009cd63dbe75f955733) --- src/box/replication.cc | 7 ++++++- test/replication/replica.lua | 11 ++++++++++- test/replication/replica_rejoin.result | 12 ++++++------ test/replication/replica_rejoin.test.lua | 12 ++++++------ 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/box/replication.cc b/src/box/replication.cc index 5ba83930d5..c9f6c97671 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -730,7 +730,12 @@ replicaset_needs_rejoin(struct replica **master) struct replica *leader = NULL; replicaset_foreach(replica) { struct applier *applier = replica->applier; - if (applier == NULL) + /* + * Skip the local instance, we shouldn't perform a + * check against our own gc vclock. + */ + if (applier == NULL || tt_uuid_is_equal(&replica->uuid, + &INSTANCE_UUID)) continue; const struct ballot *ballot = &applier->ballot; diff --git a/test/replication/replica.lua b/test/replication/replica.lua index 20ac064e16..1cd60e0620 100644 --- a/test/replication/replica.lua +++ b/test/replication/replica.lua @@ -1,8 +1,17 @@ #!/usr/bin/env tarantool +repl_include_self = arg[1] and arg[1] == 'true' or false +repl_list = nil + +if repl_include_self then + repl_list = {os.getenv("MASTER"), os.getenv("LISTEN")} +else + repl_list = os.getenv("MASTER") +end + box.cfg({ listen = os.getenv("LISTEN"), - replication = os.getenv("MASTER"), + replication = repl_list, memtx_memory = 107374182, replication_timeout = 0.1, replication_connect_timeout = 0.5, diff --git a/test/replication/replica_rejoin.result b/test/replication/replica_rejoin.result index f71292da1b..b8ed79f140 100644 --- a/test/replication/replica_rejoin.result +++ b/test/replication/replica_rejoin.result @@ -37,7 +37,7 @@ test_run:cmd("create server replica with rpl_master=default, script='replication --- - true ... -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='true'") --- - true ... @@ -111,7 +111,7 @@ box.cfg{checkpoint_count = checkpoint_count} ... -- Restart the replica. Since xlogs have been removed, -- it is supposed to rejoin without changing id. -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='true'") --- - true ... @@ -158,7 +158,7 @@ box.space.test:select() - [30, 30] ... -- Check that restart works as usual. -test_run:cmd("restart server replica") +test_run:cmd("restart server replica with args='true'") box.info.replication[1].upstream.status == 'follow' or box.info --- - true @@ -210,7 +210,7 @@ test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*. box.cfg{checkpoint_count = checkpoint_count} --- ... -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='true'") --- - true ... @@ -252,7 +252,7 @@ test_run:cleanup_cluster() box.space.test:truncate() --- ... -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='true'") --- - true ... @@ -349,7 +349,7 @@ _ = test_run:wait_vclock('default', vclock) --- ... -- Restart the replica. It should successfully rebootstrap. -test_run:cmd("restart server replica") +test_run:cmd("restart server replica with args='true'") box.space.test:select() --- - - [1] diff --git a/test/replication/replica_rejoin.test.lua b/test/replication/replica_rejoin.test.lua index 22a91d8d7d..25c0849ec9 100644 --- a/test/replication/replica_rejoin.test.lua +++ b/test/replication/replica_rejoin.test.lua @@ -17,7 +17,7 @@ _ = box.space.test:insert{3} -- Join a replica, then stop it. test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='true'") test_run:cmd("switch replica") box.info.replication[1].upstream.status == 'follow' or box.info box.space.test:select() @@ -45,7 +45,7 @@ box.cfg{checkpoint_count = checkpoint_count} -- Restart the replica. Since xlogs have been removed, -- it is supposed to rejoin without changing id. -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='true'") box.info.replication[2].downstream.vclock ~= nil or box.info test_run:cmd("switch replica") box.info.replication[1].upstream.status == 'follow' or box.info @@ -60,7 +60,7 @@ test_run:cmd("switch replica") box.space.test:select() -- Check that restart works as usual. -test_run:cmd("restart server replica") +test_run:cmd("restart server replica with args='true'") box.info.replication[1].upstream.status == 'follow' or box.info box.space.test:select() @@ -78,7 +78,7 @@ for i = 1, 3 do box.space.test:insert{i * 100} end fio = require('fio') test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog') box.cfg{checkpoint_count = checkpoint_count} -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='true'") test_run:cmd("switch replica") box.info.status -- orphan box.space.test:select() @@ -94,7 +94,7 @@ test_run:cmd("stop server replica") test_run:cmd("cleanup server replica") test_run:cleanup_cluster() box.space.test:truncate() -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='true'") -- Subscribe the master to the replica. replica_listen = test_run:cmd("eval replica 'return box.cfg.listen'") replica_listen ~= nil @@ -128,7 +128,7 @@ for i = 1, 10 do box.space.test:replace{2} end vclock = test_run:get_vclock('replica') _ = test_run:wait_vclock('default', vclock) -- Restart the replica. It should successfully rebootstrap. -test_run:cmd("restart server replica") +test_run:cmd("restart server replica with args='true'") box.space.test:select() box.snapshot() box.space.test:replace{2} -- GitLab