Skip to content
Snippets Groups Projects
Commit 99c7a971 authored by Konstantin Belyavskiy's avatar Konstantin Belyavskiy Committed by Vladimir Davydov
Browse files

replication: disconnect applier on timeout

In a replication setup, if one of the instances is powered off, the
others do not detect it and the connection hangs: the live machines keep
showing the 'follow' state. Add a read timeout to solve this issue. It is
safe since the applier and the relay both send messages every
replication_timeout, so we can assume that if we read nothing, there is a
problem with the connection. For now, use replication_disconnect_timeout,
which is replication_timeout * 4.

The test was fixed and the comments were improved by @locker.

Closes #3025
parent 5e75e2fa
No related branches found
No related tags found
No related merge requests found
......@@ -445,7 +445,18 @@ applier_subscribe(struct applier *applier)
applier_set_state(applier, APPLIER_FOLLOW);
}
coio_read_xrow(coio, ibuf, &row);
/*
* Tarantool < 1.7.7 does not send periodic heartbeat
* messages so we can't assume that if we haven't heard
* from the master for quite a while the connection is
* broken - the master might just be idle.
*/
if (applier->version_id < version_id(1, 7, 7)) {
coio_read_xrow(coio, ibuf, &row);
} else {
double timeout = replication_disconnect_timeout();
coio_read_xrow_timeout_xc(coio, ibuf, &row, timeout);
}
if (iproto_type_is_error(row.type))
xrow_decode_error_xc(&row); /* error */
......
......@@ -407,6 +407,71 @@ test_run:cmd("cleanup server replica")
---
- true
...
errinj.set("ERRINJ_RELAY_EXIT_DELAY", 0)
---
- ok
...
box.cfg{replication_timeout = 0.01}
---
...
test_run:cmd("create server replica_timeout with rpl_master=default, script='replication/replica_timeout.lua'")
---
- true
...
test_run:cmd("start server replica_timeout with args='0.01'")
---
- true
...
test_run:cmd("switch replica_timeout")
---
- true
...
fiber = require('fiber')
---
...
while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end
---
...
box.info.replication[1].upstream.status
---
- follow
...
test_run:cmd("switch default")
---
- true
...
errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 5)
---
- ok
...
test_run:cmd("switch replica_timeout")
---
- true
...
-- Check replica's disconnection on timeout (gh-3025).
-- If master stops send heartbeat messages to replica,
-- due to infinite read timeout connection never breaks,
-- replica shows state 'follow' so old behaviour hangs
-- here in infinite loop.
while box.info.replication[1].upstream.message ~= 'timed out' do fiber.sleep(0.0001) end
---
...
test_run:cmd("switch default")
---
- true
...
test_run:cmd("stop server replica_timeout")
---
- true
...
test_run:cmd("cleanup server replica_timeout")
---
- true
...
errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0)
---
- ok
...
box.snapshot()
---
- ok
......@@ -414,6 +479,10 @@ box.snapshot()
for i = 0, 9999 do box.space.test:replace({i, 4, 5, 'test'}) end
---
...
-- Check that replication_timeout is not taken into account
-- during the join stage, i.e. a replica with a minuscule
-- timeout successfully bootstraps and breaks connection only
-- after subscribe.
test_run:cmd("create server replica_ack with rpl_master=default, script='replication/replica_ack.lua'")
---
- true
......@@ -426,9 +495,11 @@ test_run:cmd("switch replica_ack")
---
- true
...
box.info.replication[1].upstream.status
fiber = require('fiber')
---
...
while box.info.replication[1].upstream.message ~= 'timed out' do fiber.sleep(0.0001) end
---
- follow
...
test_run:cmd("stop server default")
---
......
......@@ -168,14 +168,46 @@ while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001)
test_run:cmd("switch default")
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
errinj.set("ERRINJ_RELAY_EXIT_DELAY", 0)
box.cfg{replication_timeout = 0.01}
test_run:cmd("create server replica_timeout with rpl_master=default, script='replication/replica_timeout.lua'")
test_run:cmd("start server replica_timeout with args='0.01'")
test_run:cmd("switch replica_timeout")
fiber = require('fiber')
while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end
box.info.replication[1].upstream.status
test_run:cmd("switch default")
errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 5)
test_run:cmd("switch replica_timeout")
-- Check replica's disconnection on timeout (gh-3025).
-- If master stops send heartbeat messages to replica,
-- due to infinite read timeout connection never breaks,
-- replica shows state 'follow' so old behaviour hangs
-- here in infinite loop.
while box.info.replication[1].upstream.message ~= 'timed out' do fiber.sleep(0.0001) end
test_run:cmd("switch default")
test_run:cmd("stop server replica_timeout")
test_run:cmd("cleanup server replica_timeout")
errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0)
box.snapshot()
for i = 0, 9999 do box.space.test:replace({i, 4, 5, 'test'}) end
-- Check that replication_timeout is not taken into account
-- during the join stage, i.e. a replica with a minuscule
-- timeout successfully bootstraps and breaks connection only
-- after subscribe.
test_run:cmd("create server replica_ack with rpl_master=default, script='replication/replica_ack.lua'")
test_run:cmd("start server replica_ack")
test_run:cmd("switch replica_ack")
box.info.replication[1].upstream.status
fiber = require('fiber')
while box.info.replication[1].upstream.message ~= 'timed out' do fiber.sleep(0.0001) end
test_run:cmd("stop server default")
test_run:cmd("deploy server default")
......
......@@ -4,7 +4,8 @@ box.cfg({
listen = os.getenv("LISTEN"),
replication = os.getenv("MASTER"),
memtx_memory = 107374182,
replication_timeout = 0.00001
replication_timeout = 0.00001,
replication_connect_quorum = 0,
})
require('console').listen(os.getenv('ADMIN'))
#!/usr/bin/env tarantool

-- Replica instance script whose replication timeout is supplied by the
-- caller as the first command-line argument (presumably this is
-- replication/replica_timeout.lua, started by the test with args='0.01').
local replication_timeout = tonumber(arg[1])

-- Connection endpoints are provided through the environment.
local listen_uri = os.getenv("LISTEN")
local master_uri = os.getenv("MASTER")

local instance_config = {
    listen = listen_uri,
    replication = master_uri,
    memtx_memory = 107374182,
    replication_timeout = replication_timeout,
}
box.cfg(instance_config)

-- Open the admin console on the address given in ADMIN.
local console = require('console')
console.listen(os.getenv('ADMIN'))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment