From 8f7bae8cf92e77cf0decb150648d90196128104e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov <vdavydov@tarantool.org> Date: Fri, 22 Nov 2024 16:58:41 +0300 Subject: [PATCH] vinyl: skip invisible read sources A Vinyl read iterator scans all read sources (memory and disk levels) even if it's executed in a read view from which most of the sources are invisible. As a result, a long-running scanning request may spend most of the time skipping invisible statements. The situation is exacerbated if the instance is experiencing a heavy write load because it would pile up old statement versions in memory and force the iterator to skip over them after each disk read. Since the replica join procedure in Vinyl uses a read view iterator under the hood, the issue is responsible for a severe performance degradation of the master instance and an overall slowdown of the join procedure when a new replica is joined to an instance running under a heavy write load. Let's fix this issue by making a read iterator skip read sources that aren't visible from its read view. 
Closes #10846 NO_DOC=bug fix (cherry picked from commit 6a214e42e707b502022622866d898123a6f177f1) --- ...plica-join-degradation-under-write-load.md | 5 ++ src/box/vy_read_iterator.c | 34 +++++++++ .../gh_10846_skip_invisible_read_src_test.lua | 75 +++++++++++++++++++ 3 files changed, 114 insertions(+) create mode 100644 changelogs/unreleased/gh-10846-fix-vinyl-replica-join-degradation-under-write-load.md create mode 100644 test/vinyl-luatest/gh_10846_skip_invisible_read_src_test.lua diff --git a/changelogs/unreleased/gh-10846-fix-vinyl-replica-join-degradation-under-write-load.md b/changelogs/unreleased/gh-10846-fix-vinyl-replica-join-degradation-under-write-load.md new file mode 100644 index 0000000000..2a42f121d9 --- /dev/null +++ b/changelogs/unreleased/gh-10846-fix-vinyl-replica-join-degradation-under-write-load.md @@ -0,0 +1,5 @@ +## bugfix/vinyl + +* Fixed a bug when joining a new replica to a master instance that experiences + a heavy write load would severely degrade the master instance performance. + The fix should also speed up long-running scan requests (gh-10846). diff --git a/src/box/vy_read_iterator.c b/src/box/vy_read_iterator.c index 013d38ea28..b47730e1ea 100644 --- a/src/box/vy_read_iterator.c +++ b/src/box/vy_read_iterator.c @@ -59,6 +59,8 @@ struct vy_read_src { bool is_last; /** See vy_read_iterator->front_id. */ uint32_t front_id; + /** Max LSN that can be stored in this source. */ + int64_t max_lsn; /** History of the key the iterator is positioned at. 
*/ struct vy_history history; }; @@ -104,6 +106,7 @@ vy_read_iterator_add_src(struct vy_read_iterator *itr) } struct vy_read_src *src = &itr->src[itr->src_count++]; memset(src, 0, sizeof(*src)); + src->max_lsn = INT64_MAX; vy_history_create(&src->history, &itr->lsm->env->history_node_pool); return src; } @@ -193,6 +196,24 @@ vy_read_iterator_cmp_stmt(struct vy_read_iterator *itr, vy_entry_compare(a, b, itr->lsm->cmp_def); } +/** + * Returns true if the given source can store statements visible from + * the read view used by the iterator. + */ +static inline bool +vy_read_iterator_src_is_visible(struct vy_read_iterator *itr, + struct vy_read_src *src) +{ + uint32_t src_id = src - itr->src; + assert(src_id < itr->src_count); + /* The last source can store statements visible from any read view. */ + if (src_id == itr->src_count - 1) + return true; + /* Sources are sorted by LSN so we check the next source's max LSN. */ + struct vy_read_src *next_src = &itr->src[src_id + 1]; + return (**itr->read_view).vlsn > next_src->max_lsn; +} + /** * Check if the statement at which the given read source * is positioned precedes the current candidate for the @@ -205,6 +226,7 @@ vy_read_iterator_evaluate_src(struct vy_read_iterator *itr, struct vy_read_src *src, struct vy_entry *next, bool *stop) { + assert(src->is_started); uint32_t src_id = src - itr->src; struct vy_entry entry = vy_history_last_stmt(&src->history); int cmp = vy_read_iterator_cmp_stmt(itr, entry, *next); @@ -271,6 +293,7 @@ vy_read_iterator_reevaluate_srcs(struct vy_read_iterator *itr, if (i >= itr->skipped_src) break; struct vy_read_src *src = &itr->src[i]; + assert(src->is_started); struct vy_entry entry = vy_history_last_stmt(&src->history); int cmp = vy_read_iterator_cmp_stmt(itr, entry, *next); if (cmp < 0) { @@ -376,6 +399,9 @@ vy_read_iterator_scan_mem(struct vy_read_iterator *itr, uint32_t mem_src, assert(mem_src >= itr->mem_src && mem_src < itr->disk_src); + if (!vy_read_iterator_src_is_visible(itr, src)) 
+ return 0; + rc = vy_mem_iterator_restore(src_itr, itr->last, &src->history); if (rc == 0) { if (!src->is_started || mem_src >= itr->skipped_src) { @@ -414,6 +440,9 @@ vy_read_iterator_scan_disk(struct vy_read_iterator *itr, uint32_t disk_src, assert(disk_src >= itr->disk_src && disk_src < itr->src_count); + if (!vy_read_iterator_src_is_visible(itr, src)) + return 0; + if (!src->is_started || disk_src >= itr->skipped_src) rc = vy_run_iterator_skip(src_itr, itr->last, &src->history); @@ -442,6 +471,9 @@ vy_read_iterator_restore_mem(struct vy_read_iterator *itr, struct vy_read_src *src = &itr->src[itr->mem_src]; struct vy_mem_iterator *src_itr = &src->mem_iterator; + if (!vy_read_iterator_src_is_visible(itr, src)) + return 0; + /* * 'next' may refer to a statement in the memory source history, * which may be cleaned up by vy_mem_iterator_restore(), so we need @@ -686,6 +718,7 @@ vy_read_iterator_add_mem(struct vy_read_iterator *itr, bool is_prepared_ok) &lsm->stat.memory.iterator, mem, iterator_type, itr->key, itr->read_view, is_prepared_ok); + sub_src->max_lsn = mem->dump_lsn; } } @@ -710,6 +743,7 @@ vy_read_iterator_add_disk(struct vy_read_iterator *itr) iterator_type, itr->key, itr->read_view, lsm->cmp_def, lsm->key_def, lsm->disk_format); + sub_src->max_lsn = slice->run->dump_lsn; } } diff --git a/test/vinyl-luatest/gh_10846_skip_invisible_read_src_test.lua b/test/vinyl-luatest/gh_10846_skip_invisible_read_src_test.lua new file mode 100644 index 0000000000..6542a9f8d4 --- /dev/null +++ b/test/vinyl-luatest/gh_10846_skip_invisible_read_src_test.lua @@ -0,0 +1,75 @@ +local server = require('luatest.server') +local t = require('luatest') + +local g = t.group() + +g.before_all(function(cg) + t.tarantool.skip_if_not_debug() + cg.server = server:new() + cg.server:start() +end) + +g.after_all(function(cg) + cg.server:drop() +end) + +g.after_each(function(cg) + cg.server:exec(function() + box.error.injection.set('ERRINJ_VY_COMPACTION_DELAY', false) + if box.space.test 
~= nil then + box.space.test:drop() + end + end) +end) + +g.test_skip_invisible_read_src = function(cg) + cg.server:exec(function() + box.error.injection.set('ERRINJ_VY_COMPACTION_DELAY', true) + + local s = box.schema.space.create('test', {engine = 'vinyl'}) + local i = s:create_index('primary') + + local function write(c) + box.begin() + for i = 101, 200 do + s:replace{i, c} + end + box.commit() + end + + write(1) + box.snapshot() + write(2) + + local gen, param, state = i:pairs() + local _, tuple = gen(param, state) + t.assert_equals(tuple, {101, 2}) + + t.assert_covers(i:stat(), { + range_count = 1, + run_count = 1, + memory = {iterator = {lookup = 1, get = {rows = 1}}}, + disk = {iterator = {lookup = 1, get = {rows = 1}}}, + }) + + box.snapshot() + write(3) + box.snapshot() + write(4) + + box.stat.reset() + + -- The iterator must be sent to a read view. + local _, tuple = gen(param, state) + t.assert_equals(tuple, {102, 2}) + + -- The iterator must skip the memory level and the most recent run + -- because they were created after the read view. + t.assert_covers(i:stat(), { + range_count = 1, + run_count = 3, + memory = {iterator = {lookup = 0, get = {rows = 0}}}, + disk = {iterator = {lookup = 2, get = {rows = 2}}}, + }) + end) +end -- GitLab