diff --git a/src/box/applier.cc b/src/box/applier.cc index 91769ae00c55e59a8e636a2dc1907f235031efef..6bfe5a99a1ced9bbd4f6b0532f53315a02163188 100644 --- a/src/box/applier.cc +++ b/src/box/applier.cc @@ -98,6 +98,11 @@ applier_log_error(struct applier *applier, struct error *e) /* * Fiber function to write vclock to replication master. + * To track connection status, replica answers master + * with encoded vclock. In addition to DML requests, + * master also sends heartbeat messages every + * replication_timeout seconds (introduced in 1.7.7). + * On such requests replica also responds with vclock. */ static int applier_writer_f(va_list ap) @@ -106,10 +111,18 @@ applier_writer_f(va_list ap) struct ev_io io; coio_create(&io, applier->io.fd); - /* Re-connect loop */ while (!fiber_is_cancelled()) { - fiber_cond_wait_timeout(&applier->writer_cond, - replication_timeout); + /* + * Tarantool >= 1.7.7 sends periodic heartbeat + * messages so we don't need to send ACKs every + * replication_timeout seconds any more. + */ + if (applier->version_id >= version_id(1, 7, 7)) + fiber_cond_wait_timeout(&applier->writer_cond, + TIMEOUT_INFINITY); + else + fiber_cond_wait_timeout(&applier->writer_cond, + replication_timeout); /* Send ACKs only when in FOLLOW mode ,*/ if (applier->state != APPLIER_SYNC && applier->state != APPLIER_FOLLOW) diff --git a/test/replication/errinj.result b/test/replication/errinj.result index 2f4b2a84a2d9c8a46a4bd94a7dd2a20b22767f49..c31e7866a55bb4efe43f00486b169dc1676e3fd6 100644 --- a/test/replication/errinj.result +++ b/test/replication/errinj.result @@ -472,6 +472,45 @@ errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0) --- - ok ... +-- Check replica's ACKs don't prevent the master from sending +-- heartbeat messages (gh-3160). +test_run:cmd("start server replica_timeout with args='0.009'") +--- +- true +... +test_run:cmd("switch replica_timeout") +--- +- true +... +fiber = require('fiber') +--- +... +while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end +--- +... +box.info.replication[1].upstream.status -- follow +--- +- follow +... +for i = 0, 15 do fiber.sleep(0.01) if box.info.replication[1].upstream.status ~= 'follow' then break end end +--- +... +box.info.replication[1].upstream.status -- follow +--- +- follow +... +test_run:cmd("switch default") +--- +- true +... +test_run:cmd("stop server replica_timeout") +--- +- true +... +test_run:cmd("cleanup server replica_timeout") +--- +- true +... box.snapshot() --- - ok diff --git a/test/replication/errinj.test.lua b/test/replication/errinj.test.lua index 84a811610904f53b938a847bc628228fc1ef7831..017d9a07df1579a767742275ee4e4804ab38a886 100644 --- a/test/replication/errinj.test.lua +++ b/test/replication/errinj.test.lua @@ -196,6 +196,22 @@ test_run:cmd("stop server replica_timeout") test_run:cmd("cleanup server replica_timeout") errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0) +-- Check replica's ACKs don't prevent the master from sending +-- heartbeat messages (gh-3160). + +test_run:cmd("start server replica_timeout with args='0.009'") +test_run:cmd("switch replica_timeout") + +fiber = require('fiber') +while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end +box.info.replication[1].upstream.status -- follow +for i = 0, 15 do fiber.sleep(0.01) if box.info.replication[1].upstream.status ~= 'follow' then break end end +box.info.replication[1].upstream.status -- follow + +test_run:cmd("switch default") +test_run:cmd("stop server replica_timeout") +test_run:cmd("cleanup server replica_timeout") + box.snapshot() for i = 0, 9999 do box.space.test:replace({i, 4, 5, 'test'}) end