From f9299f247f2638b7a41daa0ffe5d5c9898850f82 Mon Sep 17 00:00:00 2001
From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
Date: Wed, 6 Apr 2022 00:00:39 +0200
Subject: [PATCH] limbo: don't clear txn sync flag in case of fail

The limbo cleared the TXN_WAIT_SYNC and TXN_WAIT_ACK flags for all
removed entries - both succeeded and failed ones. For succeeded entries
that is fine. For failed ones it was not.

The reason is that a transaction could be rolled back after a
successful WAL write but before its waiting fiber wakes up. Then on
wakeup the fiber wouldn't see the TXN_WAIT_SYNC flag and would assert
that the transaction signature is >= 0. That wasn't true for txns
rolled back for synchro reasons, such as a foreign PROMOTE not
including this transaction.

The patch makes a failed transaction keep its TXN_WAIT_SYNC flag, so
that on wakeup its owner fiber reaches txn_limbo_wait_complete(),
notices the bad signature, and follows the rollback path.

TXN_WAIT_ACK is still dropped, because otherwise the transaction owner
would try to call txn_limbo_ack() for the transaction even though the
limbo doesn't belong to the instance anymore.

An alternative solution would be to check the signature value for all
transactions even when journal_entry->res is >= 0. But that would slow
down the common path even for non-synchro transactions.

Closes #6842

NO_DOC=Bugfix
---
 .../unreleased/gh-6842-qsync-assertions.md     |   5 +
 src/box/txn_limbo.c                            |   3 +-
 .../gh_6842_qsync_applier_order_test.lua       | 102 +++++++++++++++++-
 3 files changed, 105 insertions(+), 5 deletions(-)
 create mode 100644 changelogs/unreleased/gh-6842-qsync-assertions.md

diff --git a/changelogs/unreleased/gh-6842-qsync-assertions.md b/changelogs/unreleased/gh-6842-qsync-assertions.md
new file mode 100644
index 0000000000..084c528d6e
--- /dev/null
+++ b/changelogs/unreleased/gh-6842-qsync-assertions.md
@@ -0,0 +1,5 @@
+## bugfix/raft
+
+* Fixed several crashes and/or undefined behaviors (assertions in debug build)
+  which could appear when new synchronous transactions were made during ongoing
+  elections (gh-6842).
diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
index b1626b83c8..4d7f8b9815 100644
--- a/src/box/txn_limbo.c
+++ b/src/box/txn_limbo.c
@@ -489,7 +489,7 @@ txn_limbo_read_rollback(struct txn_limbo *limbo, int64_t lsn)
 		return;
 	rlist_foreach_entry_safe_reverse(e, &limbo->queue, in_queue, tmp) {
 		txn_limbo_abort(limbo, e);
-		txn_clear_flags(e->txn, TXN_WAIT_SYNC | TXN_WAIT_ACK);
+		txn_clear_flags(e->txn, TXN_WAIT_ACK);
 		/*
 		 * Should be written to WAL by now. Rollback is always written
 		 * after the affected transactions.
@@ -575,6 +575,7 @@ txn_limbo_read_demote(struct txn_limbo *limbo, int64_t lsn)
 void
 txn_limbo_ack(struct txn_limbo *limbo, uint32_t replica_id, int64_t lsn)
 {
+	assert(!txn_limbo_is_ro(limbo));
 	if (rlist_empty(&limbo->queue))
 		return;
 	/*
diff --git a/test/replication-luatest/gh_6842_qsync_applier_order_test.lua b/test/replication-luatest/gh_6842_qsync_applier_order_test.lua
index 1a2a37e7cd..de9214c087 100644
--- a/test/replication-luatest/gh_6842_qsync_applier_order_test.lua
+++ b/test/replication-luatest/gh_6842_qsync_applier_order_test.lua
@@ -6,7 +6,7 @@ local wait_timeout = 120
 
 local g = luatest.group('gh-6842')
 
-g.before_all(function(g)
+local function cluster_create(g)
     g.cluster = cluster:new({})
     local box_cfg = {
         replication_timeout = 0.1,
@@ -32,13 +32,16 @@ g.before_all(function(g)
             read_only = false
         }
     end)
-end)
+end
 
-g.after_all(function(g)
+local function cluster_drop(g)
     g.cluster:drop()
     g.server1 = nil
     g.server2 = nil
-end)
+end
+
+g.before_all(cluster_create)
+g.after_all(cluster_drop)
 
 g.after_each(function(g)
     -- Restore cluster state like it was on start.
@@ -119,6 +122,19 @@ local function wait_synchro_is_busy(server)
     end)
 end
 
+--
+-- Wait until the server sees the given upstream status for a specified
+-- replica ID.
+--
+local function wait_upstream_status(server, id_arg, status_arg)
+    luatest.helpers.retrying({timeout = wait_timeout}, server.exec, server,
+                             function(id, status)
+        if box.info.replication[id].upstream.status ~= status then
+            error('Waiting for upstream status')
+        end
+    end, {id_arg, status_arg})
+end
+
 --
 -- Server 1 was a synchro queue owner. Then it receives a foreign PROMOTE which
 -- goes to WAL but is not applied yet. Server 1 during that tries to make a
@@ -274,3 +290,81 @@ g.test_remote_promote_during_local_txn_including_it = function(g)
     end)
     luatest.assert_equals(rows, {{1}, {2}, {3}}, 'synchro transactions work')
 end
+
+--
+-- Server 1 was a synchro queue owner. It starts a synchro txn. The txn is
+-- not written to WAL yet. Server 2 makes a PROMOTE which doesn't include that
+-- txn.
+--
+-- Then the PROMOTE is delivered to server 1 and goes to WAL too. Then the
+-- txn and PROMOTE WAL writes are processed by the TX thread in a single
+-- batch without yields.
+--
+-- The bug was that the fiber which authored the txn wasn't ready for the
+-- WAL write succeeding, the txn no longer being marked as synchro (that
+-- happened on its rollback in the synchro queue), but its signature being bad.
+--
+g.test_remote_promote_during_local_txn_not_including_it = function(g)
+    -- Start a synchro txn on server 1.
+    local fids = g.server1:exec(function()
+        box.ctl.promote()
+        local s = box.schema.create_space('test', {is_sync = true})
+        s:create_index('pk')
+        box.cfg{
+            replication_synchro_quorum = 3,
+        }
+        box.error.injection.set('ERRINJ_WAL_DELAY', true)
+        local fiber = require('fiber')
+        -- More than one transaction to ensure that it works not just for one.
+        local f1 = fiber.new(s.replace, s, {1})
+        f1:set_joinable(true)
+        local f2 = fiber.new(s.replace, s, {2})
+        f2:set_joinable(true)
+        return {f1:id(), f2:id()}
+    end)
+    -- Deliver server 1's promotion to server 2. Otherwise server 2 might fail
+    -- trying to start its own promotion simultaneously.
+    g.server2:wait_vclock_of(g.server1)
+
+    -- Server 2 sends a PROMOTE not covering the txns to server 1.
+    g.server2:exec(function()
+        box.cfg{
+            replication_synchro_quorum = 1,
+        }
+        box.ctl.promote()
+    end)
+
+    -- Server 1 receives the PROMOTE.
+    wait_synchro_is_busy(g.server1)
+
+    local rows = g.server1:exec(function(fids)
+        -- Simulate the TX thread being slow to increase the likelihood of the
+        -- txn and PROMOTE WAL writes being processed by TX in a single batch.
+        box.error.injection.set('ERRINJ_TX_DELAY_PRIO_ENDPOINT', 0.1)
+        -- Let them finish WAL writes.
+        box.error.injection.set('ERRINJ_WAL_DELAY', false)
+
+        local fiber = require('fiber')
+        for _, fid in pairs(fids) do
+            fiber.find(fid):join()
+        end
+        box.error.injection.set('ERRINJ_TX_DELAY_PRIO_ENDPOINT', 0)
+        return box.space.test:select()
+    end, {fids})
+    luatest.assert_equals(rows, {}, 'txns were rolled back')
+
+    -- Replication is broken now because server 2 (the synchro queue owner)
+    -- got a txn from server 1. It was written before the ownership transfer,
+    -- but that doesn't matter. Maybe some day this could be detected and
+    -- handled more gracefully.
+    wait_upstream_status(g.server2, g.server1:instance_id(), 'stopped')
+
+    rows = g.server2:exec(function()
+        return box.space.test:select()
+    end)
+    luatest.assert_equals(rows, {}, 'server 2 received it but did not apply')
+
+    -- Recreate the cluster because it is broken now.
+    cluster_drop(g)
+    cluster_create(g)
+end
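
A minimal C sketch of the reasoning in the commit message, kept outside the
patch so the diff above stays applicable. This is not the actual Tarantool
code: every demo_* name below is invented for illustration, and only the
TXN_WAIT_SYNC/TXN_WAIT_ACK flag names, the txn_limbo_wait_complete()
reference, and the "negative signature means rollback" convention come from
the patch itself.

/*
 * Illustrative sketch only - simplified types, not the real txn/limbo API.
 * Only the flag names and the "signature < 0 means rollback" convention are
 * taken from the patch above.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

enum demo_txn_flag {
	DEMO_TXN_WAIT_SYNC = 1 << 0,	/* stands in for TXN_WAIT_SYNC */
	DEMO_TXN_WAIT_ACK = 1 << 1,	/* stands in for TXN_WAIT_ACK */
};

struct demo_txn {
	unsigned flags;
	int64_t signature;	/* < 0 means the txn was rolled back */
};

/* Roughly what the owner fiber decides after its WAL write succeeds. */
static void
demo_complete_after_wal_write(struct demo_txn *txn)
{
	if ((txn->flags & DEMO_TXN_WAIT_SYNC) == 0) {
		/*
		 * Fast path for plain committed txns. Before the patch a
		 * synchro txn rolled back by a foreign PROMOTE also ended up
		 * here, because rollback cleared TXN_WAIT_SYNC, and this
		 * assertion fired: its signature was negative.
		 */
		assert(txn->signature >= 0);
		printf("committed, signature=%lld\n",
		       (long long)txn->signature);
		return;
	}
	/*
	 * Slow path: the txn is still marked synchro, so the owner waits on
	 * the limbo (txn_limbo_wait_complete() in the real code) and notices
	 * the bad signature there, following the rollback path.
	 */
	if (txn->signature < 0) {
		printf("rolled back by the limbo\n");
		return;
	}
	/*
	 * In the real code TXN_WAIT_ACK decides whether the owner ACKs its
	 * own WAL write; rollback clears it so a queue taken over by another
	 * instance is never ACKed.
	 */
	printf("confirmed, signature=%lld\n", (long long)txn->signature);
}

int
main(void)
{
	/* A txn rolled back by a foreign PROMOTE: sync flag kept, ACK dropped. */
	struct demo_txn rolled_back = {
		.flags = DEMO_TXN_WAIT_SYNC,
		.signature = -1,
	};
	demo_complete_after_wal_write(&rolled_back);
	return 0;
}

Keeping the slow path only for txns that still carry TXN_WAIT_SYNC is what
lets the common non-synchro commit path avoid the extra signature check that
the commit message rejects as an alternative.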
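
A similarly hedged sketch of the invariant behind the new
assert(!txn_limbo_is_ro(limbo)) in txn_limbo_ack(). The demo_* types and the
ownership-by-replica-id model are assumptions made up here; the patch itself
only states that a limbo which no longer belongs to the instance must not be
ACKed, which is why rollback still clears TXN_WAIT_ACK.

/* Illustrative sketch only - not the real struct txn_limbo. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct demo_limbo {
	/* Replica id of the current synchro queue owner. */
	uint32_t owner_id;
	/* Id of this instance. */
	uint32_t self_id;
};

/* The limbo is read-only when another instance owns the synchro queue. */
static bool
demo_limbo_is_ro(const struct demo_limbo *limbo)
{
	return limbo->owner_id != limbo->self_id;
}

/*
 * Local ACKs only make sense for a locally owned queue. Clearing
 * TXN_WAIT_ACK on rollback is what keeps the owner fiber of a rolled-back
 * txn from ever reaching this point after a foreign PROMOTE took the queue,
 * so the assertion can document the invariant in debug builds.
 */
static void
demo_limbo_ack(struct demo_limbo *limbo, uint32_t replica_id, int64_t lsn)
{
	assert(!demo_limbo_is_ro(limbo));
	(void)replica_id;
	(void)lsn;
	/* ... collect the ACK, possibly confirm entries ... */
}

int
main(void)
{
	struct demo_limbo limbo = {.owner_id = 1, .self_id = 1};
	demo_limbo_ack(&limbo, 1, 42);
	return 0;
}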