From a532e375fe3703a210a58e1cc4b57fdc608d34c2 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@tarantool.org>
Date: Wed, 10 May 2023 18:29:17 +0300
Subject: [PATCH] box: exclude uncommitted alter records from snapshot

With MVCC off (box.cfg.memtx_use_mvcc_engine = false), a memtx space
read view may include a dirty (not committed to WAL) record. To prevent
such records from being written to a snapshot, we sync WAL after
creating a read view for a snapshot. The problem is that it doesn't work
for long (yielding) DDL operations, such as building a new index,
because such operations yield before waiting on WAL. As a result,
a dirty DDL record may make it to a snapshot even though it may fail
eventually. To fix that, let's keep track of all yielding DDL statements
and exclude them from a read view using the memtx snapshot cleaner.

Closes #8530

NO_DOC=bug fix
---
 .../gh-8530-alter-space-snapshot-fix.md       |  4 +
 src/box/alter.cc                              |  7 ++
 src/box/memtx_tx.c                            | 11 ++-
 src/box/space.c                               |  2 +
 src/box/space.h                               | 21 +++++
 .../gh_8530_alter_space_snapshot_test.lua     | 77 +++++++++++++++++++
 6 files changed, 120 insertions(+), 2 deletions(-)
 create mode 100644 changelogs/unreleased/gh-8530-alter-space-snapshot-fix.md
 create mode 100644 test/box-luatest/gh_8530_alter_space_snapshot_test.lua

diff --git a/changelogs/unreleased/gh-8530-alter-space-snapshot-fix.md b/changelogs/unreleased/gh-8530-alter-space-snapshot-fix.md
new file mode 100644
index 0000000000..e89a7110e9
--- /dev/null
+++ b/changelogs/unreleased/gh-8530-alter-space-snapshot-fix.md
@@ -0,0 +1,4 @@
+## bugfix/core
+
+* Fixed a bug because of which a dirty (not committed to WAL) DDL record could
+  be written to a snapshot and cause a recovery failure (gh-8530).
diff --git a/src/box/alter.cc b/src/box/alter.cc
index 678ab95853..068ac6ac46 100644
--- a/src/box/alter.cc
+++ b/src/box/alter.cc
@@ -903,6 +903,13 @@ alter_space_rollback(struct trigger *trigger, void * /* event */)
 static void
 alter_space_do(struct txn_stmt *stmt, struct alter_space *alter)
 {
+	struct space_alter_stmt alter_stmt;
+	alter_stmt.old_tuple = stmt->old_tuple;
+	alter_stmt.new_tuple = stmt->new_tuple;
+	rlist_add_entry(&stmt->space->alter_stmts, &alter_stmt, link);
+	auto alter_stmt_guard = make_scoped_guard([&] {
+		rlist_del_entry(&alter_stmt, link);
+	});
 	/**
 	 * AlterSpaceOp::prepare() may perform a potentially long
 	 * lasting operation that may yield, e.g. building of a new
diff --git a/src/box/memtx_tx.c b/src/box/memtx_tx.c
index 2a555faaf4..640464a1f4 100644
--- a/src/box/memtx_tx.c
+++ b/src/box/memtx_tx.c
@@ -3344,7 +3344,8 @@ memtx_tx_snapshot_cleaner_create(struct memtx_tx_snapshot_cleaner *cleaner,
 				 struct space *space)
 {
 	cleaner->ht = NULL;
-	if (space == NULL || rlist_empty(&space->memtx_stories))
+	if (rlist_empty(&space->memtx_stories) &&
+	    rlist_empty(&space->alter_stmts))
 		return;
 	struct mh_snapshot_cleaner_t *ht = mh_snapshot_cleaner_new();
 	struct memtx_story *story;
@@ -3361,7 +3362,13 @@ memtx_tx_snapshot_cleaner_create(struct memtx_tx_snapshot_cleaner *cleaner,
 		entry.to = clean;
 		mh_snapshot_cleaner_put(ht,  &entry, NULL, 0);
 	}
-
+	struct space_alter_stmt *alter_stmt;
+	rlist_foreach_entry(alter_stmt, &space->alter_stmts, link) {
+		struct memtx_tx_snapshot_cleaner_entry entry;
+		entry.from = alter_stmt->new_tuple;
+		entry.to = alter_stmt->old_tuple;
+		mh_snapshot_cleaner_put(ht, &entry, NULL, 0);
+	}
 	cleaner->ht = ht;
 }
 
diff --git a/src/box/space.c b/src/box/space.c
index 3b34f87a4b..e64186f913 100644
--- a/src/box/space.c
+++ b/src/box/space.c
@@ -330,6 +330,7 @@ space_create(struct space *space, struct engine *engine,
 	}
 	space->constraint_ids = mh_strnptr_new();
 	rlist_create(&space->memtx_stories);
+	rlist_create(&space->alter_stmts);
 	return 0;
 
 fail_free_indexes:
@@ -389,6 +390,7 @@ space_new_ephemeral(struct space_def *def, struct rlist *key_list)
 void
 space_delete(struct space *space)
 {
+	assert(rlist_empty(&space->alter_stmts));
 	memtx_tx_on_space_delete(space);
 	for (uint32_t j = 0; j <= space->index_id_max; j++) {
 		struct index *index = space->index_map[j];
diff --git a/src/box/space.h b/src/box/space.h
index d1d8a5fe70..afcddebc5f 100644
--- a/src/box/space.h
+++ b/src/box/space.h
@@ -230,6 +230,17 @@ struct space {
 	 * List of all tx stories in the space.
 	 */
 	struct rlist memtx_stories;
+	/**
+	 * List of currently running long (yielding) space alter operations
+	 * triggered by statements applied to this space (see alter_space_do),
+	 * linked by space_alter_stmt::link.
+	 *
+	 * We must exclude such statements from snapshot because they haven't
+	 * reached WAL yet and may actually fail. With MVCC off, such
+	 * statements would be visible from a read view so we have to keep
+	 * track of them separately.
+	 */
+	struct rlist alter_stmts;
 	/** Space upgrade state or NULL. */
 	struct space_upgrade *upgrade;
 	/**
@@ -239,6 +250,16 @@ struct space {
 	struct space_wal_ext *wal_ext;
 };
 
+/** Space alter statement. */
+struct space_alter_stmt {
+	/** Link in space::alter_stmts. */
+	struct rlist link;
+	/** Old tuple. */
+	struct tuple *old_tuple;
+	/** New tuple. */
+	struct tuple *new_tuple;
+};
+
 /**
  * Detach constraints from space. They can be reattached or deleted then.
  */
diff --git a/test/box-luatest/gh_8530_alter_space_snapshot_test.lua b/test/box-luatest/gh_8530_alter_space_snapshot_test.lua
new file mode 100644
index 0000000000..cddc401f03
--- /dev/null
+++ b/test/box-luatest/gh_8530_alter_space_snapshot_test.lua
@@ -0,0 +1,77 @@
+local server = require('luatest.server')
+local t = require('luatest')
+
+local g = t.group('gh_8530_alter_space_snapshot', {
+    {memtx_use_mvcc_engine = true},
+    {memtx_use_mvcc_engine = false},
+})
+
+g.before_all(function(cg)
+    t.tarantool.skip_if_not_debug()
+    cg.server = server:new({
+        box_cfg = {
+            memtx_use_mvcc_engine = cg.params.memtx_use_mvcc_engine,
+        },
+    })
+    cg.server:start()
+end)
+
+g.after_all(function(cg)
+    cg.server:drop()
+end)
+
+g.before_each(function(cg)
+    cg.server:exec(function()
+        local s = box.schema.create_space('test')
+        s:create_index('primary')
+        s:insert({1, 10})
+        s:insert({2, 20})
+        s:insert({3, 30})
+    end)
+end)
+
+g.after_each(function(cg)
+    cg.server:exec(function()
+        box.space.test:drop()
+    end)
+end)
+
+-- Check that a new index record that hasn't been written to WAL doesn't make
+-- it to a snapshot.
+g.test_build_index = function(cg)
+    cg.server:exec(function()
+        local fiber = require('fiber')
+        local s = box.space.test
+        box.error.injection.set('ERRINJ_BUILD_INDEX_DELAY', true)
+        local f = fiber.create(s.create_index, s, 'secondary',
+                               {parts = {2, 'unsigned'}})
+        fiber.sleep(0.01)
+        box.snapshot()
+        t.assert_equals(f:status(), 'suspended')
+    end)
+    cg.server:restart()
+    cg.server:exec(function()
+        local s = box.space.test
+        t.assert_equals(s.index.secondary, nil)
+    end)
+end
+
+-- Check that a space format record that hasn't been written to WAL doesn't
+-- make it to a snapshot.
+g.test_change_format = function(cg)
+    cg.server:exec(function()
+        local fiber = require('fiber')
+        local s = box.space.test
+        box.error.injection.set('ERRINJ_CHECK_FORMAT_DELAY', true)
+        local f = fiber.create(s.format, s,
+                               {{'a', 'unsigned'}, {'b', 'unsigned'}})
+        fiber.sleep(0.01)
+        box.snapshot()
+        t.assert_equals(f:status(), 'suspended')
+    end)
+    cg.server:restart()
+    cg.server:exec(function()
+        local s = box.space.test
+        t.assert_equals(s:format(), {})
+    end)
+end
-- 
GitLab