From 1795832283eacc993c2c28d124dac4335a7957ad Mon Sep 17 00:00:00 2001 From: Serge Petrenko <sergepetrenko@tarantool.org> Date: Mon, 30 Sep 2019 16:43:37 +0300 Subject: [PATCH] replication: add is_orphan field to ballot A successfully fetched remote instance ballot isn't updated during bootstrap procedure. This leads to a case when different instances choose different masters as their bootstrap leaders. Imagine such a situation. You start instance A without replication set up. Instance A successfully bootstraps. You also have instances B and C both with replication set up to {A, B, C} and replication_connect_quorum set to 3 You first start instance B. It doesn't proceed to choosing a leader until one of the events happens: either the replication_connect_timeout runs out, or instance C is up and starts listening on its port. B has established connection to A and fetched its ballot, with some vclock, say, {1: 1}. B retries connection to C every replication_timeout seconds. Then you start instance C. Instance C succeeds in connecting to A and B right away and bootstraps from instance A. Instance A registers C in its _cluster table. This registration is replicated to instance C. Meanwhile, instance C is trying to sync with quorum instances (which is 3), and stays in orphan mode. Now replication_timeout on instance B finally runs out. It retries a previously unsuccessful connection to C and succeeds. C sends its ballot to B with vclock = {1: 2, 2:0} (in our example), since it has already incremented it after _cluster registration. B sees that C has a greater vclock than A, and chooses to bootstrap from C instead of A. C is orphan and rejects B's attempt to join. B dies. To fix such ungentlemanlike behaviour of C, we should at least include loading status in ballot and prefer fully bootstrapped instances to the ones still syncing with other replicas. We also need to use a separate flag instead of ballot's already existent is_ro, since we still want to prefer loading instances over the ones explicitly configured to be read-only. Closes #4527 (cherry picked from commit dc1e4009465174dc09d6026d89f3eb3bcbc9ed49) --- src/box/box.cc | 8 +++ src/box/iproto_constants.h | 1 + src/box/replication.cc | 10 +++- src/box/xrow.c | 13 ++++- src/box/xrow.h | 5 ++ test/replication/bootstrap_leader.result | 61 ++++++++++++++++++++++ test/replication/bootstrap_leader.test.lua | 27 ++++++++++ test/replication/replica_uuid_rw.lua | 26 +++++++++ test/replication/replica_uuid_rw1.lua | 1 + test/replication/replica_uuid_rw2.lua | 1 + test/replication/replica_uuid_rw3.lua | 1 + 11 files changed, 150 insertions(+), 4 deletions(-) create mode 100644 test/replication/bootstrap_leader.result create mode 100644 test/replication/bootstrap_leader.test.lua create mode 100644 test/replication/replica_uuid_rw.lua create mode 120000 test/replication/replica_uuid_rw1.lua create mode 120000 test/replication/replica_uuid_rw2.lua create mode 120000 test/replication/replica_uuid_rw3.lua diff --git a/src/box/box.cc b/src/box/box.cc index 939f12bf35..b4dc0e41fd 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -1652,6 +1652,14 @@ void box_process_vote(struct ballot *ballot) { ballot->is_ro = cfg_geti("read_only") != 0; + /* + * is_ro is true on initial load and is set to box.cfg.read_only + * after box_cfg() returns, during dynamic box.cfg parameters setting. + * We would like to prefer already bootstrapped instances to the ones + * still bootstrapping and the ones still bootstrapping, but writeable + * to the ones that have box.cfg.read_only = true. + */ + ballot->is_loading = is_ro; vclock_copy(&ballot->vclock, &replicaset.vclock); vclock_copy(&ballot->gc_vclock, &gc.vclock); } diff --git a/src/box/iproto_constants.h b/src/box/iproto_constants.h index 126d733529..65b634cd74 100644 --- a/src/box/iproto_constants.h +++ b/src/box/iproto_constants.h @@ -137,6 +137,7 @@ enum iproto_ballot_key { IPROTO_BALLOT_IS_RO = 0x01, IPROTO_BALLOT_VCLOCK = 0x02, IPROTO_BALLOT_GC_VCLOCK = 0x03, + IPROTO_BALLOT_IS_LOADING = 0x04, }; #define bit(c) (1ULL<<IPROTO_##c) diff --git a/src/box/replication.cc b/src/box/replication.cc index d691ce4876..6fcc56fe37 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -863,8 +863,8 @@ replicaset_next(struct replica *replica) } /** - * Compare vclock and read only mode of all connected - * replicas and elect a leader. + * Compare vclock, read only mode and orphan status + * of all connected replicas and elect a leader. * Initiallly, skip read-only replicas, since they * can not properly act as bootstrap masters (register * new nodes in _cluster table). If there are no read-write @@ -892,6 +892,12 @@ replicaset_round(bool skip_ro) leader = replica; continue; } + /* + * Try to find a replica which has already left + * orphan mode. + */ + if (applier->ballot.is_loading && ! leader->applier->ballot.is_loading) + continue; /* * Choose the replica with the most advanced * vclock. If there are two or more replicas diff --git a/src/box/xrow.c b/src/box/xrow.c index 0ae5271c1b..18bf089718 100644 --- a/src/box/xrow.c +++ b/src/box/xrow.c @@ -441,8 +441,9 @@ iproto_reply_vote(struct obuf *out, const struct ballot *ballot, uint64_t sync, uint32_t schema_version) { size_t max_size = IPROTO_HEADER_LEN + mp_sizeof_map(1) + - mp_sizeof_uint(UINT32_MAX) + mp_sizeof_map(3) + + mp_sizeof_uint(UINT32_MAX) + mp_sizeof_map(4) + mp_sizeof_uint(UINT32_MAX) + mp_sizeof_bool(ballot->is_ro) + + mp_sizeof_uint(UINT32_MAX) + mp_sizeof_bool(ballot->is_loading) + mp_sizeof_uint(UINT32_MAX) + mp_sizeof_vclock(&ballot->vclock) + mp_sizeof_uint(UINT32_MAX) + mp_sizeof_vclock(&ballot->gc_vclock); @@ -456,9 +457,11 @@ iproto_reply_vote(struct obuf *out, const struct ballot *ballot, char *data = buf + IPROTO_HEADER_LEN; data = mp_encode_map(data, 1); data = mp_encode_uint(data, IPROTO_BALLOT); - data = mp_encode_map(data, 3); + data = mp_encode_map(data, 4); data = mp_encode_uint(data, IPROTO_BALLOT_IS_RO); data = mp_encode_bool(data, ballot->is_ro); + data = mp_encode_uint(data, IPROTO_BALLOT_IS_LOADING); + data = mp_encode_bool(data, ballot->is_loading); data = mp_encode_uint(data, IPROTO_BALLOT_VCLOCK); data = mp_encode_vclock(data, &ballot->vclock); data = mp_encode_uint(data, IPROTO_BALLOT_GC_VCLOCK); @@ -1077,6 +1080,7 @@ int xrow_decode_ballot(struct xrow_header *row, struct ballot *ballot) { ballot->is_ro = false; + ballot->is_loading = false; vclock_create(&ballot->vclock); const char *start = NULL; @@ -1121,6 +1125,11 @@ xrow_decode_ballot(struct xrow_header *row, struct ballot *ballot) goto err; ballot->is_ro = mp_decode_bool(&data); break; + case IPROTO_BALLOT_IS_LOADING: + if (mp_typeof(*data) != MP_BOOL) + goto err; + ballot->is_loading = mp_decode_bool(&data); + break; case IPROTO_BALLOT_VCLOCK: if (mp_decode_vclock(&data, &ballot->vclock) != 0) goto err; diff --git a/src/box/xrow.h b/src/box/xrow.h index 35ec06dc0c..60def2d3cc 100644 --- a/src/box/xrow.h +++ b/src/box/xrow.h @@ -275,6 +275,11 @@ xrow_encode_auth(struct xrow_header *row, const char *salt, size_t salt_len, struct ballot { /** Set if the instance is running in read-only mode. */ bool is_ro; + /** + * Set if the instance hasn't finished bootstrap or recovery, or + * is syncing with other replicas in the replicaset. + */ + bool is_loading; /** Current instance vclock. */ struct vclock vclock; /** Oldest vclock available on the instance. */ diff --git a/test/replication/bootstrap_leader.result b/test/replication/bootstrap_leader.result new file mode 100644 index 0000000000..7d1a33d8e9 --- /dev/null +++ b/test/replication/bootstrap_leader.result @@ -0,0 +1,61 @@ +-- test-run result file version 2 +test_run = require('test_run').new() + | --- + | ... + +create_server_cmd = "create server replica%d with script='replication/replica_uuid_rw%d.lua'" + | --- + | ... + +test_run:cmd("setopt delimiter ';'") + | --- + | - true + | ... + +for i = 1,3 do + test_run:cmd(string.format(create_server_cmd, i, i)) +end; + | --- + | ... + +test_run:cmd("start server replica1 with wait_load=True, wait=True"); + | --- + | - true + | ... +test_run:cmd("start server replica2 with args='1,2,3 1.0 100500 0.1', wait_load=False, wait=False"); + | --- + | - true + | ... +test_run:cmd("start server replica3 with args='1,2,3 0.1 0.5 100500', wait_load=True, wait=True"); + | --- + | - true + | ... + +test_run:cmd("switch replica3"); + | --- + | - true + | ... +fiber = require('fiber'); + | --- + | ... +while (#box.info.replication < 3) do + fiber.sleep(0.05) +end; + | --- + | ... +test_run:cmd("switch default"); + | --- + | - true + | ... + +for i = 1,3 do + test_run:cmd("stop server replica"..i.." with cleanup=1") + test_run:cmd("delete server replica"..i) +end; + | --- + | ... + +test_run:cmd("setopt delimiter ''"); + | --- + | - true + | ... diff --git a/test/replication/bootstrap_leader.test.lua b/test/replication/bootstrap_leader.test.lua new file mode 100644 index 0000000000..984a82b8e1 --- /dev/null +++ b/test/replication/bootstrap_leader.test.lua @@ -0,0 +1,27 @@ +test_run = require('test_run').new() + +create_server_cmd = "create server replica%d with script='replication/replica_uuid_rw%d.lua'" + +test_run:cmd("setopt delimiter ';'") + +for i = 1,3 do + test_run:cmd(string.format(create_server_cmd, i, i)) +end; + +test_run:cmd("start server replica1 with wait_load=True, wait=True"); +test_run:cmd("start server replica2 with args='1,2,3 1.0 100500 0.1', wait_load=False, wait=False"); +test_run:cmd("start server replica3 with args='1,2,3 0.1 0.5 100500', wait_load=True, wait=True"); + +test_run:cmd("switch replica3"); +fiber = require('fiber'); +while (#box.info.replication < 3) do + fiber.sleep(0.05) +end; +test_run:cmd("switch default"); + +for i = 1,3 do + test_run:cmd("stop server replica"..i.." with cleanup=1") + test_run:cmd("delete server replica"..i) +end; + +test_run:cmd("setopt delimiter ''"); diff --git a/test/replication/replica_uuid_rw.lua b/test/replication/replica_uuid_rw.lua new file mode 100644 index 0000000000..60f2d63628 --- /dev/null +++ b/test/replication/replica_uuid_rw.lua @@ -0,0 +1,26 @@ +#!/usr/bin/env tarantool + +local INSTANCE_ID = string.match(arg[0], "%d") +local SOCKET_DIR = require('fio').cwd() + +local function instance_uri(instance_id) + return SOCKET_DIR..'/replica_uuid_rw'..instance_id..'.sock' +end + +local repl_tbl = {} +for num in string.gmatch(arg[1] or "", "%d") do + table.insert(repl_tbl, instance_uri(num)) +end + +require('console').listen(os.getenv('ADMIN')) + +box.cfg({ + instance_uuid = "aaaaaaaa-aaaa-0000-0000-00000000000"..INSTANCE_ID, + listen = instance_uri(INSTANCE_ID), + replication = repl_tbl, + replication_timeout = arg[2] and tonumber(arg[2]) or 0.1, + replication_connect_timeout = arg[3] and tonumber(arg[3]) or 0.5, + replication_sync_timeout = arg[4] and tonumber(arg[4]) or 1.0, +}) + +box.once("bootstrap", function() box.schema.user.grant('guest', 'replication') end) diff --git a/test/replication/replica_uuid_rw1.lua b/test/replication/replica_uuid_rw1.lua new file mode 120000 index 0000000000..328386f504 --- /dev/null +++ b/test/replication/replica_uuid_rw1.lua @@ -0,0 +1 @@ +replica_uuid_rw.lua \ No newline at end of file diff --git a/test/replication/replica_uuid_rw2.lua b/test/replication/replica_uuid_rw2.lua new file mode 120000 index 0000000000..328386f504 --- /dev/null +++ b/test/replication/replica_uuid_rw2.lua @@ -0,0 +1 @@ +replica_uuid_rw.lua \ No newline at end of file diff --git a/test/replication/replica_uuid_rw3.lua b/test/replication/replica_uuid_rw3.lua new file mode 120000 index 0000000000..328386f504 --- /dev/null +++ b/test/replication/replica_uuid_rw3.lua @@ -0,0 +1 @@ +replica_uuid_rw.lua \ No newline at end of file -- GitLab