From c6485d8d3ef346b39b73a189e9358c1d76dca206 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov <vdavydov@tarantool.org> Date: Fri, 27 May 2022 14:48:45 +0300 Subject: [PATCH] applier: add timeout to greeting read A Tarantool server is supposed to send a greeting message right after accepting a new client, so the first thing an applier does after connecting to the master is read the greeting. It does this without timeouts. The problem is that if by mistake we connect to a wrong instance, which doesn't send anything to clients, the applier will hang forever (until the remote closes the socket), without logging any errors. This may happen even with a valid Tarantool instance - if SSL encryption is enabled on the master, but not on the client, because the SSL protocol assumes that the client initiates a connection by writing to the socket first (before the server). Let's add a timeout to the operation reading the greeting. The timeout is set to replication_disconnect_timeout(), the same period after which a connection is broken if the master doesn't send heartbeats. Note, we don't add a timeout to other read/write operations issued to initiate a replication connection, because if we received a greeting and it's valid, then the master is likely to be fine. 
Closes #7204 NO_DOC=bug --- .../gh-7204-replication-greeting-timeout.md | 4 ++ src/box/applier.cc | 10 ++++- .../gh_7204_greeting_timeout_test.lua | 37 +++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 changelogs/unreleased/gh-7204-replication-greeting-timeout.md create mode 100644 test/replication-luatest/gh_7204_greeting_timeout_test.lua diff --git a/changelogs/unreleased/gh-7204-replication-greeting-timeout.md b/changelogs/unreleased/gh-7204-replication-greeting-timeout.md new file mode 100644 index 0000000000..4b0ab6b18a --- /dev/null +++ b/changelogs/unreleased/gh-7204-replication-greeting-timeout.md @@ -0,0 +1,4 @@ +## bugfix/replication + +* Fixed the bug because of which the error reason was not logged on a replica + in case when the master didn't send a greeting message (gh-7204). diff --git a/src/box/applier.cc b/src/box/applier.cc index f30d9f2f77..1cd5b92fb3 100644 --- a/src/box/applier.cc +++ b/src/box/applier.cc @@ -357,7 +357,15 @@ applier_connect(struct applier *applier) close(fd); diag_raise(); } - if (coio_readn(io, greetingbuf, IPROTO_GREETING_SIZE) < 0) + /* + * Abort if the master doesn't send a greeting within the configured + * timeout so as not to block forever if we connect to a wrong + * instance, which doesn't send anything to accepted clients. + * No timeouts after this point, because if we receive a proper + * greeting, the server is likely to be fine. 
+ */ + if (coio_readn_timeout(io, greetingbuf, IPROTO_GREETING_SIZE, + replication_disconnect_timeout()) < 0) diag_raise(); applier->last_row_time = ev_monotonic_now(loop()); diff --git a/test/replication-luatest/gh_7204_greeting_timeout_test.lua b/test/replication-luatest/gh_7204_greeting_timeout_test.lua new file mode 100644 index 0000000000..2caca5dcd8 --- /dev/null +++ b/test/replication-luatest/gh_7204_greeting_timeout_test.lua @@ -0,0 +1,37 @@ +local fiber = require('fiber') +local server = require('test.luatest_helpers.server') +local socket = require('socket') +local t = require('luatest') + +local g = t.group() + +g.before_all(function(g) + g.server = server:new({ + alias = 'master', + box_cfg = { + replication_timeout = 0.1, + replication_connect_timeout = 0.5, + }, + }) + g.server:start() +end) + +g.after_all(function(g) + g.server:drop() +end) + +g.test_greeting_timeout = function(g) + local uri = server.build_instance_uri('server') + local s = socket.tcp_server('unix/', uri, { + handler = function() fiber.sleep(9000) end + }) + t.assert(s) + g.server:exec(function(uri) + box.cfg{replication = uri} + end, {uri}) + t.helpers.retrying({}, function() + t.assert(g.server:grep_log('timed out')) + t.assert(g.server:grep_log('will retry')) + end) + s:close() +end -- GitLab