From b5811f1560548338c061faed810bf426115fe241 Mon Sep 17 00:00:00 2001
From: Serge Petrenko <sergepetrenko@tarantool.org>
Date: Tue, 28 Jun 2022 14:53:42 +0300
Subject: [PATCH] replication: relax split-brain checks after DEMOTE

Our txn_limbo_is_replica_outdated check works correctly only when there
is a stream of PROMOTE requests: only the author of the latest PROMOTE
is writable and may issue transactions, whether synchronous or
asynchronous.

So txn_limbo_is_replica_outdated assumes that everyone but the node with
the greatest PROMOTE/DEMOTE term is outdated.
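
In code, the strict check boils down to a per-replica term comparison,
roughly as follows (a simplified sketch of the helper in
src/box/txn_limbo.h; the exact helper and field names may differ):

    /*
     * Sketch: a replica is outdated when the last PROMOTE/DEMOTE
     * term seen from it is below the greatest term known to the
     * limbo.
     */
    static inline bool
    txn_limbo_is_replica_outdated(const struct txn_limbo *limbo,
                                  uint32_t replica_id)
    {
            return txn_limbo_replica_term(limbo, replica_id) <
                   limbo->promote_greatest_term;
    }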

This isn't true for DEMOTE requests. There is only one server which
issues the DEMOTE request, but once it's written, it's fine to accept
asynchronous transactions from everyone.

Now the check is too strict. Every time there is an asynchronous
transaction from someone who isn't the author of the latest PROMOTE or
DEMOTE, replication is broken with ER_SPLIT_BRAIN.

Let's relax it: when the limbo owner is 0 (REPLICA_ID_NIL), it's fine
to accept asynchronous transactions from everyone, no matter the term
of their latest PROMOTE or DEMOTE.
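
In terms of the filter in applier.cc this means raising ER_SPLIT_BRAIN
only while the limbo is claimed, roughly as follows (a condensed sketch
of the hunk in the diff below):

    /*
     * Sketch: an async DML row from an obsolete term is a marker of
     * split-brain only while some instance owns the limbo.
     */
    if (!last_row->wait_sync && iproto_type_is_dml(last_row->type) &&
        txn_limbo.owner_id != REPLICA_ID_NIL) {
            tnt_raise(ClientError, ER_SPLIT_BRAIN,
                      "got an async transaction from an old term");
    }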

This means that after a DEMOTE we will now miss one case of true
split-brain: when the old leader continues writing data in an obsolete
term, and the new leader first issues a PROMOTE and then a DEMOTE.

This is a tradeoff for making async master-master work after DEMOTE.

The completely correct fix would be to store with each transaction the
term it was written in and replace txn_limbo_is_replica_outdated with
txn_limbo_is_request_outdated, so that we decide whether to filter a
request by the term it was applied in, not by the term we have seen in
some past PROMOTE from the node. That fix seems too costly, though,
given that we only miss one case of split-brain, and only at the moment
the user enables master-master replication (by writing a DEMOTE). And
in master-master there is no such thing as a split-brain.
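
For illustration only, such a txn_limbo_is_request_outdated could look
like this (purely hypothetical: neither the helper nor a per-row term
exists in the sources today):

    /*
     * Hypothetical: judge a request by the term it was applied in.
     * The row->term member is made up for illustration; xrow_header
     * carries no such field.
     */
    static inline bool
    txn_limbo_is_request_outdated(const struct txn_limbo *limbo,
                                  const struct xrow_header *row)
    {
            return row->term < limbo->promote_greatest_term;
    }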

Follow-up #5295
Closes #7286

NO_DOC=internal change
---
 .../gh-7286-false-positive-split-brain.md     |  3 ++
 src/box/applier.cc                            |  9 ++--
 ...h_7286_split_brain_false_positive_test.lua | 47 +++++++++++++++++++
 3 files changed, 55 insertions(+), 4 deletions(-)
 create mode 100644 changelogs/unreleased/gh-7286-false-positive-split-brain.md
 create mode 100644 test/replication-luatest/gh_7286_split_brain_false_positive_test.lua

diff --git a/changelogs/unreleased/gh-7286-false-positive-split-brain.md b/changelogs/unreleased/gh-7286-false-positive-split-brain.md
new file mode 100644
index 0000000000..54f2049fb3
--- /dev/null
+++ b/changelogs/unreleased/gh-7286-false-positive-split-brain.md
@@ -0,0 +1,3 @@
+## bugfix/replication
+
+* Fixed a false positive split-brain error after `box.ctl.demote()` (gh-7286).
diff --git a/src/box/applier.cc b/src/box/applier.cc
index 3d2bae2465..c842e44a84 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -1205,14 +1205,15 @@ applier_synchro_filter_tx(struct stailq *rows)
 	 * This means the only filtered out transactions are synchronous ones or
 	 * the ones depending on them.
 	 *
-	 * Any asynchronous transaction from an obsolete term is a marker of
-	 * split-brain by itself: consider it a synchronous transaction, which
-	 * is committed with quorum 1.
+	 * Any asynchronous transaction from an obsolete term when limbo is
+	 * claimed by someone is a marker of split-brain by itself: consider it
+	 * a synchronous transaction, which is committed with quorum 1.
 	 */
 	struct xrow_header *last_row =
 		&stailq_last_entry(rows, struct applier_tx_row, next)->row;
 	if (!last_row->wait_sync) {
-		if (iproto_type_is_dml(last_row->type)) {
+		if (iproto_type_is_dml(last_row->type) &&
+		    txn_limbo.owner_id != REPLICA_ID_NIL) {
 			tnt_raise(ClientError, ER_SPLIT_BRAIN,
 				  "got an async transaction from an old term");
 		}
diff --git a/test/replication-luatest/gh_7286_split_brain_false_positive_test.lua b/test/replication-luatest/gh_7286_split_brain_false_positive_test.lua
new file mode 100644
index 0000000000..5a8b96aa6f
--- /dev/null
+++ b/test/replication-luatest/gh_7286_split_brain_false_positive_test.lua
@@ -0,0 +1,47 @@
+local t = require('luatest')
+local cluster = require('test.luatest_helpers.cluster')
+local server = require('test.luatest_helpers.server')
+
+local g = t.group('gh-7286')
+
+-- gh-7286: false positive ER_SPLIT_BRAIN error after box.ctl.demote() with
+-- disabled elections.
+g.before_all(function(cg)
+    cg.cluster = cluster:new{}
+
+    cg.box_cfg = {
+        -- Just in case it's turned on by default sometime.
+        election_mode = 'off',
+        replication_timeout = 0.1,
+        replication = {
+            server.build_instance_uri('node1'),
+            server.build_instance_uri('node2'),
+        },
+    }
+    cg.node1 = cg.cluster:build_and_add_server{
+        alias = 'node1',
+        box_cfg = cg.box_cfg,
+    }
+    cg.node2 = cg.cluster:build_and_add_server{
+        alias = 'node2',
+        box_cfg = cg.box_cfg,
+    }
+    cg.cluster:start()
+end)
+
+g.after_all(function(cg)
+    cg.cluster:drop()
+end)
+
+g.test_false_positive_split_brain = function(cg)
+    cg.node1:exec(function()
+        box.ctl.promote()
+        box.ctl.demote()
+    end)
+    cg.node2:wait_vclock_of(cg.node1)
+    cg.node2:exec(function()
+        box.space._schema:replace{'smth'}
+    end)
+    cg.node1:wait_vclock_of(cg.node2)
+    cg.node1:assert_follows_upstream(cg.node2:instance_id())
+end
-- 
GitLab