d9d1deac
test: flaky replication/replica_rejoin.test.lua
Alexander V. Tikhonov authored
The following issue was observed on heavily loaded hosts:
    
      [151] --- replication/replica_rejoin.result     Tue Sep 29 10:57:26 2020
      [151] +++ replication/replica_rejoin.reject     Tue Sep 29 10:57:48 2020
      [151] @@ -230,7 +230,12 @@
      [151]      return box.info ~= nil and box.info.replication[1] ~= nil
      [151]  end)
      [151]  ---
      [151] -- true
      [151] +- error: "builtin/box/load_cfg.lua:601: Please call box.cfg{} first\nstack traceback:\n\tbuiltin/box/load_cfg.lua:601:
      [151] +    in function '__index'\n\t[string \"return test_run:wait_cond(function()         ...\"]:1:
      [151] +    in function 'cond'\n\t/tmp/tnt/151_replication/test_run.lua:411: in function </tmp/tnt/151_replication/test_run.lua:404>\n\t[C]:
      [151] +    in function 'pcall'\n\tbuiltin/box/console.lua:402: in function 'eval'\n\tbuiltin/box/console.lua:708:
      [151] +    in function 'repl'\n\tbuiltin/box/console.lua:842: in function <builtin/box/console.lua:828>\n\t[C]:
      [151] +    in function 'pcall'\n\tbuiltin/socket.lua:1081: in function <builtin/socket.lua:1079>"
      [151]  ...
      [151]  test_run:wait_upstream(1, {message_re = 'Missing %.xlog file', status = 'loading'})
      [151]  ---
      [151]
    
It happened because box.cfg was not yet ready to provide the
information. In fact, there is no need for a local check that the
replication information is available: the wait_upstream() function
used below performs that check itself.
    
    Part of #4985
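
In essence, the fix removes the hand-rolled readiness check and relies
on wait_upstream(), which polls box.info.replication itself. A minimal
sketch of the change (the before/after framing is illustrative,
reconstructed from the diff above):

  -- Before: raises "Please call box.cfg{} first" if the replica has
  -- not finished box.cfg{} by the time the condition runs.
  test_run:wait_cond(function()
      return box.info ~= nil and box.info.replication[1] ~= nil
  end)
  test_run:wait_upstream(1, {message_re = 'Missing %.xlog file', status = 'loading'})

  -- After: wait_upstream() alone is enough; it retries until the
  -- upstream entry appears and matches the given status and message.
  test_run:wait_upstream(1, {message_re = 'Missing %.xlog file', status = 'loading'})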
replica_rejoin.test.lua
env = require('test_run')
test_run = env.new()
log = require('log')
engine = test_run:get_cfg('engine')

test_run:cleanup_cluster()

--
-- gh-461: check that a replica refetches the last checkpoint
-- in case it fell behind the master.
--
box.schema.user.grant('guest', 'replication')
_ = box.schema.space.create('test', {engine = engine})
_ = box.space.test:create_index('pk')
_ = box.space.test:insert{1}
_ = box.space.test:insert{2}
_ = box.space.test:insert{3}

-- Join a replica, then stop it.
test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
test_run:cmd("start server replica with args='true'")
test_run:cmd("switch replica")
box.info.replication[1].upstream.status == 'follow' or log.error(box.info)
box.space.test:select()
test_run:cmd("switch default")
test_run:cmd("stop server replica")

-- Restart the server to purge the replica from
-- the garbage collection state.
test_run:cmd("restart server default")

-- Make some checkpoints to remove old xlogs.
checkpoint_count = box.cfg.checkpoint_count
box.cfg{checkpoint_count = 1}
_ = box.space.test:delete{1}
_ = box.space.test:insert{10}
box.snapshot()
_ = box.space.test:delete{2}
_ = box.space.test:insert{20}
box.snapshot()
_ = box.space.test:delete{3}
_ = box.space.test:insert{30}
fio = require('fio')
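-- Wait until garbage collection leaves a single xlog file. If the
-- condition times out, wait_cond() returns false and the 'or' branch
-- substitutes the glob pattern, so the failing path appears in the
-- test output.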
test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog')
box.cfg{checkpoint_count = checkpoint_count}

-- Restart the replica. Since xlogs have been removed,
-- it is supposed to rejoin without changing id.
test_run:cmd("start server replica with args='true'")
box.info.replication[2].downstream.vclock ~= nil or log.error(box.info)
test_run:cmd("switch replica")
box.info.replication[1].upstream.status == 'follow' or log.error(box.info)
box.space.test:select()
test_run:cmd("switch default")

-- Make sure the replica follows new changes.
for i = 10, 30, 10 do box.space.test:update(i, {{'!', 1, i}}) end
vclock = test_run:get_vclock('default')
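-- Component 0 of the vclock accounts for local (unreplicated) changes,
-- so drop it before comparing vclocks across instances.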
vclock[0] = nil
_ = test_run:wait_vclock('replica', vclock)
test_run:cmd("switch replica")
box.space.test:select()

-- Check that restart works as usual.
test_run:cmd("restart server replica with args='true'")
box.info.replication[1].upstream.status == 'follow' or log.error(box.info)
box.space.test:select()

-- Check that rebootstrap is NOT initiated unless the replica
-- is strictly behind the master.
box.space.test:replace{1, 2, 3} -- bumps LSN on the replica
test_run:cmd("switch default")
test_run:cmd("stop server replica")
test_run:cmd("restart server default")
checkpoint_count = box.cfg.checkpoint_count
box.cfg{checkpoint_count = 1}
for i = 1, 3 do box.space.test:delete{i * 10} end
box.snapshot()
for i = 1, 3 do box.space.test:insert{i * 100} end
fio = require('fio')
test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog')
box.cfg{checkpoint_count = checkpoint_count}
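-- Start the replica without waiting for it to come up: it is expected
-- to get stuck in 'loading', which is checked right below.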
test_run:cmd("start server replica with args='true', wait=False")
test_run:cmd("switch replica")
test_run:wait_upstream(1, {message_re = 'Missing %.xlog file', status = 'loading'})
box.space.test:select()

--
-- gh-3740: rebootstrap crashes if the master has rows originating
-- from the replica.
--

-- Bootstrap a new replica.
test_run:cmd("switch default")
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
test_run:cleanup_cluster()
box.space.test:truncate()
test_run:cmd("start server replica with args='true'")
-- Subscribe the master to the replica.
replica_listen = test_run:cmd("eval replica 'return box.cfg.listen'")
replica_listen ~= nil
box.cfg{replication = replica_listen}
-- Unsubscribe the replica from the master.
test_run:cmd("switch replica")
box.cfg{replication = ''}
-- Bump vclock on the master.
test_run:cmd("switch default")
box.space.test:replace{1}
-- Bump vclock on the replica.
test_run:cmd("switch replica")
for i = 1, 10 do box.space.test:replace{2} end
vclock = test_run:get_vclock('replica')
vclock[0] = nil
_ = test_run:wait_vclock('default', vclock)
-- Restart the master and force garbage collection.
test_run:cmd("switch default")
test_run:cmd("restart server default")
replica_listen = test_run:cmd("eval replica 'return box.cfg.listen'")
replica_listen ~= nil
box.cfg{replication = replica_listen}
default_checkpoint_count = box.cfg.checkpoint_count
box.cfg{checkpoint_count = 1}
box.snapshot()
box.cfg{checkpoint_count = default_checkpoint_count}
fio = require('fio')
test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog')
-- Bump vclock on the replica again.
test_run:cmd("switch replica")
for i = 1, 10 do box.space.test:replace{2} end
vclock = test_run:get_vclock('replica')
vclock[0] = nil
_ = test_run:wait_vclock('default', vclock)
-- Restart the replica. It should successfully rebootstrap.
test_run:cmd("restart server replica with args='true'")
box.space.test:select()
box.snapshot()
box.space.test:replace{2}

-- Cleanup.
test_run:cmd("switch default")
box.cfg{replication = ''}
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
test_run:cmd("delete server replica")
test_run:cleanup_cluster()
box.space.test:drop()
box.schema.user.revoke('guest', 'replication')

--
-- gh-4107: rebootstrap fails if the replica was deleted from
-- the cluster on the master.
--
box.schema.user.grant('guest', 'replication')
test_run:cmd("create server replica with rpl_master=default, script='replication/replica_uuid.lua'")
start_cmd = string.format("start server replica with args='%s'", require('uuid').new())
box.space._cluster:get(2) == nil
test_run:cmd(start_cmd)
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
box.space._cluster:delete(2) ~= nil
test_run:cmd(start_cmd)
box.space._cluster:get(2) ~= nil
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
test_run:cmd("delete server replica")
box.schema.user.revoke('guest', 'replication')
test_run:cleanup_cluster()