From f9299f247f2638b7a41daa0ffe5d5c9898850f82 Mon Sep 17 00:00:00 2001
From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
Date: Wed, 6 Apr 2022 00:00:39 +0200
Subject: [PATCH] limbo: don't clear txn sync flag in case of failure

The limbo cleared the TXN_WAIT_SYNC and TXN_WAIT_ACK flags for all
removed entries - both succeeded and failed ones. For the succeeded
ones that was fine. For the failed ones it was not.

The reason is that a transaction could be rolled back after a
successful WAL write but before its waiting fiber woke up. Then on
wakeup the fiber wouldn't see the TXN_WAIT_SYNC flag and would
assert that the transaction signature is >= 0. That wasn't true for
txns rolled back for synchro reasons, such as a foreign PROMOTE not
including this transaction.

The patch makes a failed transaction keep its TXN_WAIT_SYNC flag so
that its owner fiber on wakeup reaches txn_limbo_wait_complete(),
notices the bad signature, and follows the rollback path.

TXN_WAIT_ACK is still dropped, because otherwise the transaction
owner would try to call txn_limbo_ack() for the transaction even
though the limbo doesn't belong to the instance anymore.
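
Roughly, the owner fiber's post-WAL-write path is then expected to
behave like the sketch below. This is a simplified illustration, not
the literal txn.c code: the entry and lsn variables and the exact
control flow are placeholders, while the flags, txn->signature, and
the limbo functions are the real ones.

    if (txn_has_flag(txn, TXN_WAIT_SYNC)) {
        /*
         * Still synchronous, even if a foreign PROMOTE already
         * rolled it back: ask the limbo for the verdict and check
         * the signature instead of assuming success.
         */
        if (txn_has_flag(txn, TXN_WAIT_ACK)) {
            /*
             * Only while the limbo is owned locally. This is why
             * rollback still clears TXN_WAIT_ACK: txn_limbo_ack()
             * must not be called once the limbo belongs to another
             * instance.
             */
            txn_limbo_ack(&txn_limbo, txn_limbo.owner_id, lsn);
        }
        if (txn_limbo_wait_complete(&txn_limbo, entry) < 0)
            goto rollback;
    } else {
        /*
         * A plain async txn: a successful WAL write is enough.
         * Before the fix a synchro txn rolled back by a foreign
         * PROMOTE ended up here with a negative signature and hit
         * this assertion.
         */
        assert(txn->signature >= 0);
    }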

An alternative solution would be to check the signature value for
all transactions even when journal_entry->res is >= 0. But that
would slow down the common path even for non-synchro transactions.
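
For illustration, that alternative would boil down to a check like
the following in the completion path of every transaction, synchro
or not (a sketch only, not proposed code):

    /* Rejected alternative: re-check the signature even on success. */
    if (journal_entry->res >= 0 && txn->signature < 0)
        goto rollback;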

Closes #6842

NO_DOC=Bugfix
---
 .../unreleased/gh-6842-qsync-assertions.md    |   5 +
 src/box/txn_limbo.c                           |   3 +-
 .../gh_6842_qsync_applier_order_test.lua      | 102 +++++++++++++++++-
 3 files changed, 105 insertions(+), 5 deletions(-)
 create mode 100644 changelogs/unreleased/gh-6842-qsync-assertions.md

diff --git a/changelogs/unreleased/gh-6842-qsync-assertions.md b/changelogs/unreleased/gh-6842-qsync-assertions.md
new file mode 100644
index 0000000000..084c528d6e
--- /dev/null
+++ b/changelogs/unreleased/gh-6842-qsync-assertions.md
@@ -0,0 +1,5 @@
+## bugfix/raft
+
+* Fixed several crashes and/or undefined behaviors (assertions in debug build)
+  which could appear when new synchronous transactions were made during ongoing
+  elections (gh-6842).
diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
index b1626b83c8..4d7f8b9815 100644
--- a/src/box/txn_limbo.c
+++ b/src/box/txn_limbo.c
@@ -489,7 +489,7 @@ txn_limbo_read_rollback(struct txn_limbo *limbo, int64_t lsn)
 		return;
 	rlist_foreach_entry_safe_reverse(e, &limbo->queue, in_queue, tmp) {
 		txn_limbo_abort(limbo, e);
-		txn_clear_flags(e->txn, TXN_WAIT_SYNC | TXN_WAIT_ACK);
+		txn_clear_flags(e->txn, TXN_WAIT_ACK);
 		/*
 		 * Should be written to WAL by now. Rollback is always written
 		 * after the affected transactions.
@@ -575,6 +575,7 @@ txn_limbo_read_demote(struct txn_limbo *limbo, int64_t lsn)
 void
 txn_limbo_ack(struct txn_limbo *limbo, uint32_t replica_id, int64_t lsn)
 {
+	assert(!txn_limbo_is_ro(limbo));
 	if (rlist_empty(&limbo->queue))
 		return;
 	/*
diff --git a/test/replication-luatest/gh_6842_qsync_applier_order_test.lua b/test/replication-luatest/gh_6842_qsync_applier_order_test.lua
index 1a2a37e7cd..de9214c087 100644
--- a/test/replication-luatest/gh_6842_qsync_applier_order_test.lua
+++ b/test/replication-luatest/gh_6842_qsync_applier_order_test.lua
@@ -6,7 +6,7 @@ local wait_timeout = 120
 
 local g = luatest.group('gh-6842')
 
-g.before_all(function(g)
+local function cluster_create(g)
     g.cluster = cluster:new({})
     local box_cfg = {
         replication_timeout = 0.1,
@@ -32,13 +32,16 @@ g.before_all(function(g)
             read_only = false
         }
     end)
-end)
+end
 
-g.after_all(function(g)
+local function cluster_drop(g)
     g.cluster:drop()
     g.server1 = nil
     g.server2 = nil
-end)
+end
+
+g.before_all(cluster_create)
+g.after_all(cluster_drop)
 
 g.after_each(function(g)
     -- Restore cluster state like it was on start.
@@ -119,6 +122,19 @@ local function wait_synchro_is_busy(server)
     end)
 end
 
+--
+-- Wait until the server sees the given upstream status for a specified
+-- replica ID.
+--
+local function wait_upstream_status(server, id_arg, status_arg)
+    luatest.helpers.retrying({timeout = wait_timeout}, server.exec, server,
+                             function(id, status)
+        if box.info.replication[id].upstream.status ~= status then
+            error('Waiting for upstream status')
+        end
+    end, {id_arg, status_arg})
+end
+
 --
 -- Server 1 was a synchro queue owner. Then it receives a foreign PROMOTE which
 -- goes to WAL but is not applied yet. Server 1 during that tries to make a
@@ -274,3 +290,81 @@ g.test_remote_promote_during_local_txn_including_it = function(g)
     end)
     luatest.assert_equals(rows, {{1}, {2}, {3}}, 'synchro transactions work')
 end
+
+--
+-- Server 1 was a synchro queue owner. It starts a synchro txn. The txn is
+-- not written to WAL yet. Server 2 makes a PROMOTE which doesn't include that
+-- txn.
+--
+-- Then the PROMOTE is delivered to server 1 and goes to WAL too. Then the txn
+-- and PROMOTE WAL writes are processed by TX thread in a single batch without
+-- yields.
+--
+-- The bug was that the fiber which authored the txn wasn't ready for a
+-- successful WAL write combined with the txn no longer being synchro (it
+-- was un-marked on rollback in the synchro queue) and a bad txn signature.
+--
+g.test_remote_promote_during_local_txn_not_including_it = function(g)
+    -- Start a synchro txn on server 1.
+    local fids = g.server1:exec(function()
+        box.ctl.promote()
+        local s = box.schema.create_space('test', {is_sync = true})
+        s:create_index('pk')
+        box.cfg{
+            replication_synchro_quorum = 3,
+        }
+        box.error.injection.set('ERRINJ_WAL_DELAY', true)
+        local fiber = require('fiber')
+        -- More than one transaction to ensure that it works not just for one.
+        local f1 = fiber.new(s.replace, s, {1})
+        f1:set_joinable(true)
+        local f2 = fiber.new(s.replace, s, {2})
+        f2:set_joinable(true)
+        return {f1:id(), f2:id()}
+    end)
+    -- Deliver server 1's promotion to server 2. Otherwise server 2 might fail
+    -- trying to start its own promotion at the same time.
+    g.server2:wait_vclock_of(g.server1)
+
+    -- Server 2 sends a PROMOTE not covering the txns to server 1.
+    g.server2:exec(function()
+        box.cfg{
+            replication_synchro_quorum = 1,
+        }
+        box.ctl.promote()
+    end)
+
+    -- Server 1 receives the PROMOTE.
+    wait_synchro_is_busy(g.server1)
+
+    local rows = g.server1:exec(function(fids)
+        -- Simulate the TX thread being slow to increase the likelihood of the
+        -- txn and PROMOTE WAL writes being processed by TX in a single batch.
+        box.error.injection.set('ERRINJ_TX_DELAY_PRIO_ENDPOINT', 0.1)
+        -- Let them finish WAL writes.
+        box.error.injection.set('ERRINJ_WAL_DELAY', false)
+
+        local fiber = require('fiber')
+        for _, fid in pairs(fids) do
+            fiber.find(fid):join()
+        end
+        box.error.injection.set('ERRINJ_TX_DELAY_PRIO_ENDPOINT', 0)
+        return box.space.test:select()
+    end, {fids})
+    luatest.assert_equals(rows, {}, 'txns were rolled back')
+
+    -- Replication is broken now because server 2 (the synchro queue owner)
+    -- got a txn from server 1. It was written before the ownership transfer,
+    -- but that doesn't matter. Maybe some day this could be detected and
+    -- handled more gracefully.
+    wait_upstream_status(g.server2, g.server1:instance_id(), 'stopped')
+
+    rows = g.server2:exec(function()
+        return box.space.test:select()
+    end)
+    luatest.assert_equals(rows, {}, 'server 2 received it but did not apply')
+
+    -- Recreate the cluster because it is broken now.
+    cluster_drop(g)
+    cluster_create(g)
+end
-- 
GitLab