diff --git a/changelogs/unreleased/gh-9235-assertion_in_box_wait_limbo_acked.md b/changelogs/unreleased/gh-9235-assertion_in_box_wait_limbo_acked.md new file mode 100644 index 0000000000000000000000000000000000000000..fd2db3bd3e0f3ba1daa57bb756a5a3e8ebc0c559 --- /dev/null +++ b/changelogs/unreleased/gh-9235-assertion_in_box_wait_limbo_acked.md @@ -0,0 +1,9 @@ +## bugfix/core + +* Fixed a bug where the assertion in `box_wait_limbo_acked` could fail. The + assertion checks that the LSN of the last limbo entry is always positive + after `wal_sync`. In release builds, before the patch, the outcome depended + on the replica's `replication_synchro_quorum`: if it was set too high, the + quorum was never reached and the user received a `TimedOut` error after the + timeout; if it was <= the number of instances in the replica set, the + program immediately crashed with a `Segmentation fault` (gh-9235). diff --git a/src/box/box.cc b/src/box/box.cc index 59912cd457100ab4d9dd850574991d007085efc0..5727f6d3831da0904ca39b89a978b5840c1b6d01 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -2475,6 +2475,7 @@ box_wait_limbo_acked(double timeout) if (last_entry->lsn < 0) { int64_t tid = last_entry->txn->id; + journal_queue_flush(); if (wal_sync(NULL) != 0) return -1; diff --git a/test/replication-luatest/gh_9235_assertion_in_box_wait_limbo_acked_test.lua b/test/replication-luatest/gh_9235_assertion_in_box_wait_limbo_acked_test.lua new file mode 100644 index 0000000000000000000000000000000000000000..d039e385e8daa756a4ae91b47a560a457ae0a5aa --- /dev/null +++ b/test/replication-luatest/gh_9235_assertion_in_box_wait_limbo_acked_test.lua @@ -0,0 +1,99 @@ +local t = require('luatest') +local cluster = require('luatest.replica_set') +local server = require('luatest.server') + +local g = t.group('assertion-in-box-wait-limbo-acked') +-- +-- gh-9235: +-- Assertion in box_wait_limbo_acked. 
+--
+-- Upper bound, in seconds, for every retrying wait in this file.
+local wait_timeout = 10
+
+-- Wait until both instances have reached each other's vclock and each one
+-- reports the other as a followed upstream.
+local function wait_pair_sync(server1, server2)
+    local function check_synced()
+        server1:wait_for_vclock_of(server2)
+        server2:wait_for_vclock_of(server1)
+        server1:assert_follows_upstream(server2:get_instance_id())
+        server2:assert_follows_upstream(server1:get_instance_id())
+    end
+    -- A single attempt sometimes fails: with empty vclocks both instances
+    -- may still be in the 'connect' state rather than 'follow', so retry.
+    t.helpers.retrying({timeout = wait_timeout}, check_synced)
+end
+
+-- Poll the given server until the ERRINJ_WAL_DELAY injection reads as set.
+local function server_wait_wal_is_blocked(server)
+    -- Runs on the remote instance; the timeout is passed as an argument.
+    local function poll_wal_delay(timeout)
+        t.helpers.retrying({timeout = timeout}, function()
+            t.assert(box.error.injection.get('ERRINJ_WAL_DELAY'))
+        end)
+    end
+    server:exec(poll_wal_delay, {wait_timeout})
+end
+
+-- Poll the given server until box.info.synchro.queue.len equals `expected`.
+local function server_wait_synchro_queue_len_is_equal(server, expected)
+    server:exec(function(want_len, timeout)
+        t.helpers.retrying({timeout = timeout}, function()
+            t.assert_equals(box.info.synchro.queue.len, want_len)
+        end)
+    end, {expected, wait_timeout})
+end
+
+-- Start a master/replica pair and create the synchronous space 'test'.
+-- The whole group is skipped on non-debug builds.
+g.before_each(function(cg)
+    t.tarantool.skip_if_not_debug()
+
+    cg.cluster = cluster:new({})
+    local cluster_id = cg.cluster.id
+    local cfg = {
+        replication = {
+            server.build_listen_uri('master', cluster_id),
+            server.build_listen_uri('replica', cluster_id),
+        },
+        election_mode = 'candidate',
+        replication_timeout = 0.1,
+        election_fencing_mode = 'off',
+        replication_synchro_quorum = 2,
+        replication_synchro_timeout = 100000,
+    }
+    local master_opts = {alias = 'master', box_cfg = cfg}
+    cg.master = cg.cluster:build_and_add_server(master_opts)
+    -- The replica is built from the same table with election_mode switched
+    -- to 'off', which leaves the master as the only 'candidate'.
+    cfg.election_mode = 'off'
+    local replica_opts = {alias = 'replica', box_cfg = cfg}
+    cg.replica = cg.cluster:build_and_add_server(replica_opts)
+    cg.cluster:start()
+    cg.master:wait_until_election_leader_found()
+    cg.replica:wait_until_election_leader_found()
+    cg.master:exec(function()
+        box.schema.space.create('test', {is_sync = true})
+        box.space.test:create_index('pk')
+    end)
+    wait_pair_sync(cg.replica, cg.master)
+end)
+
+-- Tear down the cluster created in before_each.
+g.after_each(function(cg)
+    cg.cluster:drop()
+end)
+
+-- gh-9235 scenario: run box.ctl.promote() on the replica while its WAL is
+-- delayed, let the master issue two synchronous inserts, then release the
+-- WAL and check that promote finishes and the synchro queue drains.
+g.test_assert_last_entry_lsn_is_positive = function(cg)
+    local promote_fiber_id = cg.replica:exec(function()
+        local fiber = require('fiber')
+        -- Presumably arms ERRINJ_WAL_DELAY on the next WAL write; the test
+        -- waits for that flag right after this call -- TODO confirm.
+        box.error.injection.set('ERRINJ_WAL_DELAY_COUNTDOWN', 0)
+        local promoter = fiber.create(function()
+            box.ctl.promote()
+        end)
+        box.cfg{wal_queue_max_size = 1}
+        promoter:set_joinable(true)
+        -- Only the id is returned; the fiber is looked up again later.
+        return promoter:id()
+    end)
+    server_wait_wal_is_blocked(cg.replica)
+    cg.master:exec(function()
+        local fiber = require('fiber')
+        -- Each insert runs in its own fiber, so neither blocks this call.
+        for key = 1, 2 do
+            fiber.create(function()
+                box.space.test:insert{key}
+            end)
+        end
+    end)
+    server_wait_synchro_queue_len_is_equal(cg.replica, 1)
+    cg.replica:exec(function(fiber_id)
+        -- Unblock the WAL and wait for the promote fiber to finish.
+        box.error.injection.set('ERRINJ_WAL_DELAY', false)
+        require('fiber').find(fiber_id):join()
+    end, {promote_fiber_id})
+    cg.master:exec(function()
+        box.cfg{replication_synchro_quorum = 1}
+    end)
+    server_wait_synchro_queue_len_is_equal(cg.replica, 0)
+end