From ea1c829faa02df0cb0876f77ec0952e6c572d65f Mon Sep 17 00:00:00 2001 From: Andrey Saranchin <Andrey22102001@gmail.com> Date: Mon, 14 Oct 2024 16:25:55 +0300 Subject: [PATCH] alter: wait for previous alters to commit on DDL Yielding DDL operations acquire DDL lock so that the space cannot be modified under its feet. However, there is a case when it actually can: if a yielding DDL has started when there is another DDL is being committed and it gets rolled back due to WAL error, `struct space` created by rolled back DDL is deleted - and it's the space being altered by the yielding DDL. In order to fix this problem, let's simply wait for all previous alters to be committed. We could use `wal_sync` to wait for all previous transactions to be committed, but it is more complicated - we need to use `wal_sync` for single instance and `txn_limbo_wait_last_txn` when the limbo queue has an owner. Such approach has more pitfalls and requires more tests to cover all cases. When relying on `struct alter_space` directly, all situations are handled with the same logic. Alternative solutions that we have tried: 1. Throw an error in the case when user tries to alter space when there is another non-committed alter. Such approach breaks applier since it applies rows asynchronously. Trying applier to execute operations synchronously breaks it even harder. 2. Do not use space in `build_index` and `check_format` methods. In this case, there is another problem: rollback order. We have to rollback previous alters firstly, and the in-progress one can be rolled back only after it's over. It breaks fundamental memtx invariant: rollback order must be reverse of replace order. We could try to use `before_replace` triggers for alter, but the patch would be bulky. Closes #10235 NO_DOC=bugfix (cherry picked from commit fee8c5dd6b16471739ed8512ba4137ff2e7274aa) --- .../gh-10235-consequent-ddl-crash.md | 4 + src/box/alter.cc | 85 ++++++++++++++++--- .../gh_10235_crash_on_consequent_ddl_test.lua | 43 ++++++++++ 3 files changed, 120 insertions(+), 12 deletions(-) create mode 100644 changelogs/unreleased/gh-10235-consequent-ddl-crash.md create mode 100644 test/engine-luatest/gh_10235_crash_on_consequent_ddl_test.lua diff --git a/changelogs/unreleased/gh-10235-consequent-ddl-crash.md b/changelogs/unreleased/gh-10235-consequent-ddl-crash.md new file mode 100644 index 0000000000..b078f67c97 --- /dev/null +++ b/changelogs/unreleased/gh-10235-consequent-ddl-crash.md @@ -0,0 +1,4 @@ +## bugfix/box + +* Fixed a crash when the first of two consequent DDL operations gets + rolled back due to WAL failure (gh-10235). diff --git a/src/box/alter.cc b/src/box/alter.cc index 191e17e255..b8badc5ab7 100644 --- a/src/box/alter.cc +++ b/src/box/alter.cc @@ -678,7 +678,16 @@ txn_alter_trigger_new(trigger_f run, void *data) return trigger; } +/** + * List of all alive alter_space objects. + */ +static RLIST_HEAD(alter_space_list); + struct alter_space { + /** Link in alter_space_list. */ + struct rlist in_list; + /** Transaction doing this alter. */ + struct txn *txn; /** List of alter operations */ struct rlist ops; /** Definition of the new space - space_def. */ @@ -710,6 +719,11 @@ struct alter_space { int n_rows; }; +/** + * Fiber cond that wakes up all waiters on every deletion of alter_space. + */ +static FIBER_COND(alter_space_delete_cond); + static struct alter_space * alter_space_new(struct space *old_space) { @@ -724,6 +738,8 @@ alter_space_new(struct space *old_space) } alter = (struct alter_space *)memset(alter, 0, size); rlist_create(&alter->ops); + rlist_add_entry(&alter_space_list, alter, in_list); + alter->txn = in_txn(); alter->old_space = old_space; alter->space_def = space_def_dup(alter->old_space->def); if (old_space->format != NULL) @@ -738,6 +754,8 @@ alter_space_new(struct space *old_space) static void alter_space_delete(struct alter_space *alter) { + fiber_cond_broadcast(&alter_space_delete_cond); + rlist_del_entry(alter, in_list); /* Destroy the ops. */ while (! rlist_empty(&alter->ops)) { AlterSpaceOp *op = rlist_shift_entry(&alter->ops, @@ -1049,15 +1067,62 @@ class CheckSpaceFormat: public AlterSpaceOp virtual void prepare(struct alter_space *alter); }; +/** + * The object is used to grant ability to yield with RAII approach. + * Transaction is allowed to yield only on its first statement, so if the + * statement is not first, it simply does nothing. + * If it's the first statement, the guard blocks execution until all previous + * alters will be rolled back or committed so that the space object won't be + * deleted right from under our feet. In the case when the previous alters were + * rolled back and the space was removed from space cache, the constructor + * throws an error. + */ +class AlterYieldGuard +{ +public: + AlterYieldGuard(struct space *old_space) { + if (!txn_is_first_statement(in_txn())) + return; + txn_can_yield(in_txn(), true); + uint32_t space_id = old_space->def->id; + while (true) { + bool space_is_being_altered = false; + struct alter_space *alter; + rlist_foreach_entry(alter, &alter_space_list, in_list) { + if (alter->txn != in_txn() && + alter->old_space->def->id == space_id) { + space_is_being_altered = true; + break; + } + } + if (!space_is_being_altered) + break; + /* + * Wait for deletion of any alter to check if the + * space is being altered again. + */ + fiber_cond_wait(&alter_space_delete_cond); + } + /* Check if the space is still alive. */ + if (space_by_id(space_id) != old_space) { + txn_can_yield(in_txn(), false); + /* Cannot access the space name since it was deleted. */ + tnt_raise(ClientError, ER_ALTER_SPACE, + tt_sprintf("%u", space_id), + "the space was concurrently modified"); + } + } + + ~AlterYieldGuard() { + txn_can_yield(in_txn(), false); + } +}; + static inline void space_check_format_with_yield(struct space *space, struct tuple_format *format) { - struct txn *txn = in_txn(); - assert(txn != NULL); - (void) txn_can_yield(txn, true); - auto yield_guard = - make_scoped_guard([=] { txn_can_yield(txn, false); }); + AlterYieldGuard guard(space); space_check_format_xc(space, format); } @@ -1350,11 +1415,7 @@ static inline void space_build_index_with_yield(struct space *old_space, struct space *new_space, struct index *new_index) { - struct txn *txn = in_txn(); - assert(txn != NULL); - (void) txn_can_yield(txn, true); - auto yield_guard = - make_scoped_guard([=] { txn_can_yield(txn, false); }); + AlterYieldGuard guard(old_space); space_build_index_xc(old_space, new_space, new_index); } @@ -1549,10 +1610,10 @@ TruncateIndex::prepare(struct alter_space *alter) * space_build_index() to let the engine know that the * index was recreated. For example, Vinyl uses this * callback to load indexes during local recovery. + * No need to yield here since we build an empty index. */ assert(new_index != NULL); - space_build_index_with_yield(alter->new_space, alter->new_space, - new_index); + space_build_index_xc(alter->new_space, alter->new_space, new_index); } void diff --git a/test/engine-luatest/gh_10235_crash_on_consequent_ddl_test.lua b/test/engine-luatest/gh_10235_crash_on_consequent_ddl_test.lua new file mode 100644 index 0000000000..a4dc510fc0 --- /dev/null +++ b/test/engine-luatest/gh_10235_crash_on_consequent_ddl_test.lua @@ -0,0 +1,43 @@ +local t = require('luatest') +local server = require('luatest.server') + +local g = t.group(nil, {{engine = 'memtx'}, {engine = 'vinyl'}}) + +g.before_each(function(cg) + t.tarantool.skip_if_not_debug() + cg.server = server:new() + cg.server:start() +end) + +g.after_each(function(cg) + cg.server:drop() +end) + +-- Reproducer from the issue. +g.test_crash_on_consequent_alters = function(cg) + cg.server:exec(function(engine) + local fiber = require('fiber') + local space = box.schema.space.create('test', {engine = engine}) + space:create_index('pk') + for i = 1, 100 do + space:replace({i, 2}) + end + local ch = fiber.channel(1) + fiber.create(function() + box.begin() + space:create_index('sk') + -- Inject WAL error only after index is built in order + -- to allow vinyl use WAL during the index build + box.error.injection.set('ERRINJ_WAL_WRITE', true) + ch:put(true) + box.commit() + end) + t.assert(ch:get(10)) + local function alter_space() + space:format({{'one', 'unsigned'}, {'two', 'unsigned'}}) + end + t.assert_error_msg_content_equals( + "Can't modify space '512': the space was concurrently modified", + alter_space) + end, {cg.params.engine}) +end -- GitLab