From 3da31b83e2bbe2be3ac2580039fe7281aba769ee Mon Sep 17 00:00:00 2001 From: Nikita Zheleztsov <n.zheleztsov@proton.me> Date: Tue, 16 Jul 2024 18:55:15 +0300 Subject: [PATCH] engine: move raft and limbo states after system data in checkpoint Before this commit, raft and limbo states were written at the end of the checkpoint, which made it very costly to access them. Checkpoint join needs to access limbo and raft states in order to send them during the JOIN_META stage. We cannot use the latest states, as is done for read-view snapshot fetching: the states may be far ahead of the data written to the checkpoint that we're going to send. This commit moves raft and limbo states after the data from the system spaces but before user data. We cannot put them right at the beginning of the snapshot, because then we would have to patch the recovery process, which currently strongly relies on the fact that system spaces are at the beginning of the snapshot (this was done in order to apply force recovery only to user data). If we patched the recovery process, then old versions, where it's unpatched, wouldn't be able to recover from the snapshots made by the newer version, and compatibility of snapshots would be broken. The current change is not breaking: old Tarantool versions can restore from a snapshot made by a newer one. 
Needed for tarantool/tarantool-ee#741 NO_DOC=internal NO_CHANGELOG=internal --- src/box/memtx_engine.cc | 36 +++++++++++++++-- test/engine-luatest/checkpoint_test.lua | 53 +++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 test/engine-luatest/checkpoint_test.lua diff --git a/src/box/memtx_engine.cc b/src/box/memtx_engine.cc index 5ea6a247fb..043d871b2d 100644 --- a/src/box/memtx_engine.cc +++ b/src/box/memtx_engine.cc @@ -936,6 +936,17 @@ checkpoint_write_synchro(struct xlog *l, const struct synchro_request *req) return checkpoint_write_row(l, &row); } +static int +checkpoint_write_system_data(struct xlog *l, const struct raft_request *raft, + const struct synchro_request *synchro) +{ + if (checkpoint_write_raft(l, raft) != 0) + return -1; + if (checkpoint_write_synchro(l, synchro) != 0) + return -1; + return 0; +} + #ifndef NDEBUG /* * The functions defined below are used in tests to write a corrupted @@ -1039,6 +1050,7 @@ checkpoint_f(va_list ap) struct mh_i32_t *temp_space_ids; + bool is_synchro_written = false; say_info("saving snapshot `%s'", snap->filename); ERROR_INJECT_WHILE(ERRINJ_SNAP_WRITE_DELAY, { fiber_sleep(0.001); @@ -1058,6 +1070,19 @@ checkpoint_f(va_list ap) }); if (skip) continue; + /* + * Raft and limbo states are written right after system spaces + * but before user ones. This reduces the time needed to + * acquire the states from the checkpoint, which is done + * during checkpoint join. 
+ */ + if (!is_synchro_written && !space_id_is_system(space_rv->id)) { + rc = checkpoint_write_system_data(snap, &ckpt->raft, + &ckpt->synchro_state); + if (rc != 0) + break; + is_synchro_written = true; + } struct index_read_view *index_rv = space_read_view_index(space_rv, 0); assert(index_rv != NULL); @@ -1115,9 +1140,14 @@ checkpoint_f(va_list ap) if (checkpoint_write_invalid_system_row(snap) != 0) goto fail; }); - if (checkpoint_write_raft(snap, &ckpt->raft) != 0) - goto fail; - if (checkpoint_write_synchro(snap, &ckpt->synchro_state) != 0) + /* + * There may be no user data (e.g. when only RAFT_PROMOTE is written); + * in that case write limbo and raft states right after system spaces, + * at the end of the checkpoint. + */ + if (!is_synchro_written && + checkpoint_write_system_data(snap, &ckpt->raft, + &ckpt->synchro_state) != 0) goto fail; goto done; done: diff --git a/test/engine-luatest/checkpoint_test.lua b/test/engine-luatest/checkpoint_test.lua new file mode 100644 index 0000000000..6c0ba2d148 --- /dev/null +++ b/test/engine-luatest/checkpoint_test.lua @@ -0,0 +1,53 @@ +local t = require('luatest') +local server = require('luatest.server') +local xlog = require('xlog') + +local g = t.group() + +g.before_all(function(g) + g.server = server:new() + g.server:start() + g.server:exec(function() + box.schema.space.create('test'):create_index('pk') + end) +end) + +g.after_all(function(g) + g.server:drop() +end) + +-- +-- Test that the limbo and raft states are saved after +-- system spaces but before user data. +-- +g.test_synchro_states_checkpoint = function(g) + local lsn = g.server:exec(function() + box.space.test:insert{1, 'data'} + box.snapshot() + return box.info.vclock[1] + end) + + -- Read the whole snap into memory, as otherwise + -- it's impossible to access data by index. + local data = {} + local snap_template = '%020d.snap' + local snap = g.server.workdir .. '/' .. 
string.format(snap_template, lsn) + for _, v in xlog.pairs(snap) do + table.insert(data, v) + end + + -- Skip system space rows: find the first non-INSERT row. + local state_idx = 0 + for i, row in ipairs(data) do + if row.HEADER.type ~= 'INSERT' then + state_idx = i + break + end + end + -- The next rows are raft and limbo states. + t.assert_equals(data[state_idx].HEADER.type, 'RAFT') + t.assert_equals(data[state_idx + 1].HEADER.type, 'RAFT_PROMOTE') + -- And after the states there's some user data. + t.assert_equals(data[state_idx + 2].HEADER.type, 'INSERT') + t.assert_ge(data[state_idx + 2].BODY.space_id, 512) +end -- GitLab